diff --git a/.github/scripts/cmake.sh b/.github/scripts/cmake.sh index ef3e5f61dad..bc49c80a309 100755 --- a/.github/scripts/cmake.sh +++ b/.github/scripts/cmake.sh @@ -46,16 +46,10 @@ fi echo '::group::Prepare CMake builds' mkdir -p cpp_build -pushd test/tracing/frcnn -python trace_model.py +pushd examples/cpp +python script_model.py mkdir -p build -mv fasterrcnn_resnet50_fpn.pt build -popd - -pushd examples/cpp/hello_world -python trace_model.py -mkdir -p build -mv resnet18.pt build +mv resnet18.pt fasterrcnn_resnet50_fpn.pt build popd # This was only needed for the tracing above @@ -65,6 +59,7 @@ echo '::endgroup::' echo '::group::Build and install libtorchvision' pushd cpp_build + # On macOS, CMake is looking for the library (*.dylib) and the header (*.h) separately. By default, it prefers to load # the header from other packages that install the library. This easily leads to a mismatch if the library installed # from conda doesn't have the exact same version. Thus, we need to explicitly set CMAKE_FIND_FRAMEWORK=NEVER to force @@ -85,40 +80,24 @@ fi popd echo '::endgroup::' -echo '::group::Build and run project that uses Faster-RCNN' -pushd test/tracing/frcnn/build - -cmake .. -DTorch_DIR="${Torch_DIR}" -DWITH_CUDA="${WITH_CUDA}" \ - -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ - -DCMAKE_FIND_FRAMEWORK=NEVER -if [[ $OS_TYPE == windows ]]; then - "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_frcnn.bat" $JOBS - cd Release - cp ../fasterrcnn_resnet50_fpn.pt . -else - make -j$JOBS -fi - -./test_frcnn_tracing - -popd -echo '::endgroup::' - echo '::group::Build and run C++ example' -pushd examples/cpp/hello_world/build +pushd examples/cpp/build cmake .. -DTorch_DIR="${Torch_DIR}" \ -DCMAKE_PREFIX_PATH="${CONDA_PREFIX}" \ - -DCMAKE_FIND_FRAMEWORK=NEVER + -DCMAKE_FIND_FRAMEWORK=NEVER \ + -DUSE_TORCHVISION=ON # Needed for faster-rcnn since it's using torchvision ops like NMS. if [[ $OS_TYPE == windows ]]; then "${PACKAGING_DIR}/windows/internal/vc_env_helper.bat" "${PACKAGING_DIR}/windows/internal/build_cpp_example.bat" $JOBS cd Release cp ../resnet18.pt . + cp ../fasterrcnn_resnet50_fpn.pt . else make -j$JOBS fi -./hello-world +./run_model resnet18.pt +./run_model fasterrcnn_resnet50_fpn.pt popd echo '::endgroup::' diff --git a/.github/scripts/setup-env.sh b/.github/scripts/setup-env.sh index a4f113c367f..26a607558d3 100755 --- a/.github/scripts/setup-env.sh +++ b/.github/scripts/setup-env.sh @@ -22,17 +22,6 @@ case $(uname) in ;; esac -if [[ "${OS_TYPE}" == "macos" && $(uname -m) == x86_64 ]]; then - echo '::group::Uninstall system JPEG libraries on macOS' - # The x86 macOS runners, e.g. the GitHub Actions native "macos-12" runner, has some JPEG and PNG libraries - # installed by default that interfere with our build. We uninstall them here and use the one from conda below. 
- IMAGE_LIBS=$(brew list | grep -E "jpeg|png") - for lib in $IMAGE_LIBS; do - brew uninstall --ignore-dependencies --force "${lib}" - done - echo '::endgroup::' -fi - echo '::group::Create build environment' # See https://github.com/pytorch/vision/issues/7296 for ffmpeg conda create \ diff --git a/.github/scripts/unittest.sh b/.github/scripts/unittest.sh index fc4cb8f2796..da8a06928ea 100755 --- a/.github/scripts/unittest.sh +++ b/.github/scripts/unittest.sh @@ -9,7 +9,7 @@ eval "$($(which conda) shell.bash hook)" && conda deactivate && conda activate c echo '::group::Install testing utilities' # TODO: remove the <8 constraint on pytest when https://github.com/pytorch/vision/issues/8238 is closed -pip install --progress-bar=off "pytest<8" pytest-mock pytest-cov expecttest!=0.2.0 +pip install --progress-bar=off "pytest<8" pytest-mock pytest-cov expecttest!=0.2.0 requests echo '::endgroup::' python test/smoke_test.py diff --git a/.github/workflows/build-cmake.yml b/.github/workflows/build-cmake.yml index 107583235ed..1dce7b8446a 100644 --- a/.github/workflows/build-cmake.yml +++ b/.github/workflows/build-cmake.yml @@ -40,7 +40,6 @@ jobs: strategy: matrix: include: - - runner: macos-12 - runner: macos-m1-stable fail-fast: false uses: pytorch/test-infra/.github/workflows/macos_job.yml@main diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index b8dc5566cc7..5c2b3344247 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -63,7 +63,7 @@ jobs: echo '::group::Lint C source' set +e - ./.github/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable ./clang-format + ./.github/scripts/run-clang-format.py -r torchvision/csrc --clang-format-executable ./clang-format --exclude "torchvision/csrc/io/image/cpu/giflib/*" if [ $? -ne 0 ]; then git --no-pager diff diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 9cfc6be9d5e..ad327129912 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -53,16 +53,11 @@ jobs: - "3.10" - "3.11" - "3.12" - runner: ["macos-12"] - include: - - python-version: "3.8" - runner: macos-m1-stable + runner: ["macos-m1-stable"] fail-fast: false uses: pytorch/test-infra/.github/workflows/macos_job.yml@main with: repository: pytorch/vision - # We need an increased timeout here, since the macos-12 runner is the free one from GH - # and needs roughly 2 hours to just run the test suite timeout: 240 runner: ${{ matrix.runner }} test-infra-ref: main diff --git a/CMakeLists.txt b/CMakeLists.txt index 8798b64351d..2db9c1e274a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -7,7 +7,6 @@ option(WITH_CUDA "Enable CUDA support" OFF) option(WITH_MPS "Enable MPS support" OFF) option(WITH_PNG "Enable features requiring LibPNG." ON) option(WITH_JPEG "Enable features requiring LibJPEG." 
ON)
-option(USE_PYTHON "Link to Python when building" OFF)
 
 if(WITH_CUDA)
   enable_language(CUDA)
@@ -33,11 +32,6 @@ if (WITH_JPEG)
   find_package(JPEG REQUIRED)
 endif()
 
-if (USE_PYTHON)
-  add_definitions(-DUSE_PYTHON)
-  find_package(Python3 REQUIRED COMPONENTS Development)
-endif()
-
 function(CUDA_CONVERT_FLAGS EXISTING_TARGET)
   get_property(old_flags TARGET ${EXISTING_TARGET} PROPERTY INTERFACE_COMPILE_OPTIONS)
   if(NOT "${old_flags}" STREQUAL "")
@@ -80,7 +74,7 @@ include(GNUInstallDirs)
 include(CMakePackageConfigHelpers)
 
 set(TVCPP torchvision/csrc)
-list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCPP}/models ${TVCPP}/ops
+list(APPEND ALLOW_LISTED ${TVCPP} ${TVCPP}/io/image ${TVCPP}/io/image/cpu ${TVCPP}/io/image/cpu/giflib ${TVCPP}/models ${TVCPP}/ops
   ${TVCPP}/ops/autograd ${TVCPP}/ops/cpu ${TVCPP}/io/image/cuda)
 if(WITH_CUDA)
   list(APPEND ALLOW_LISTED ${TVCPP}/ops/cuda ${TVCPP}/ops/autocast)
@@ -110,10 +104,6 @@ if (WITH_JPEG)
   target_link_libraries(${PROJECT_NAME} PRIVATE ${JPEG_LIBRARIES})
 endif()
 
-if (USE_PYTHON)
-  target_link_libraries(${PROJECT_NAME} PRIVATE Python3::Python)
-endif()
-
 set_target_properties(${PROJECT_NAME} PROPERTIES
   EXPORT_NAME TorchVision
   INSTALL_RPATH ${TORCH_INSTALL_PREFIX}/lib)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 9f724b20f87..41ecd860055 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -74,7 +74,7 @@ We don't officially support building from source using `pip`, but _if_ you do, y
 #### Other development dependencies (some of these are needed to run tests):
 
 ```
-pip install expecttest flake8 typing mypy pytest pytest-mock scipy
+pip install expecttest flake8 typing mypy pytest pytest-mock scipy requests
 ```
 
 ## Development Process
diff --git a/README.md b/README.md
index 75588300551..52298e79049 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,8 @@ versions.
 
 | `torch`            | `torchvision`      | Python              |
 | ------------------ | ------------------ | ------------------- |
-| `main` / `nightly` | `main` / `nightly` | `>=3.8`, `<=3.11`   |
+| `main` / `nightly` | `main` / `nightly` | `>=3.8`, `<=3.12`   |
+| `2.3`              | `0.18`             | `>=3.8`, `<=3.12`   |
 | `2.2`              | `0.17`             | `>=3.8`, `<=3.11`   |
 | `2.1`              | `0.16`             | `>=3.8`, `<=3.11`   |
 | `2.0`              | `0.15`             | `>=3.8`, `<=3.11`   |
@@ -73,40 +74,14 @@ python setup.py install
 
 # Using the models on C++
 
-TorchVision provides an example project for how to use the models on C++ using JIT Script.
+Refer to [examples/cpp](https://github.com/pytorch/vision/tree/main/examples/cpp).
 
-Installation From source:
-
-```
-mkdir build
-cd build
-# Add -DWITH_CUDA=on support for the CUDA if needed
-cmake ..
-make
-make install
-```
-
-Once installed, the library can be accessed in cmake (after properly configuring `CMAKE_PREFIX_PATH`) via the
-`TorchVision::TorchVision` target:
-
-```
-find_package(TorchVision REQUIRED)
-target_link_libraries(my-target PUBLIC TorchVision::TorchVision)
-```
-
-The `TorchVision` package will also automatically look for the `Torch` package and add it as a dependency to
-`my-target`, so make sure that it is also available to cmake via the `CMAKE_PREFIX_PATH`.
-
-For an example setup, take a look at `examples/cpp/hello_world`.
-
-Python linking is disabled by default when compiling TorchVision with CMake, this allows you to run models without any
-Python dependency. In some special cases where TorchVision's operators are used from Python code, you may need to link
-to Python. This can be done by passing `-DUSE_PYTHON=on` to CMake.
-
-### TorchVision Operators
-
-In order to get the torchvision operators registered with torch (eg. for the JIT), all you need to do is to ensure that
-you `#include <torchvision/vision.h>` in your project.
+**DISCLAIMER**: the `libtorchvision` library includes the torchvision
+custom ops as well as most of the C++ torchvision APIs. Those APIs do not come
+with any backward-compatibility guarantees and may change from one version to
+the next. Only the Python APIs are stable and come with backward-compatibility
+guarantees. So, if you need stability within a C++ environment, your best bet is
+to export the Python APIs via torchscript.
 
 ## Documentation
 
diff --git a/cmake/TorchVisionConfig.cmake.in b/cmake/TorchVisionConfig.cmake.in
index 9e92bc3b512..7f7e78817fa 100644
--- a/cmake/TorchVisionConfig.cmake.in
+++ b/cmake/TorchVisionConfig.cmake.in
@@ -46,13 +46,5 @@ if(@WITH_JPEG@)
   target_compile_definitions(${PN}::${PN} INTERFACE JPEG_FOUND)
 endif()
 
-if (@USE_PYTHON@)
-  if(NOT TARGET Python3::Python)
-    find_package(Python3 COMPONENTS Development)
-  endif()
-  target_link_libraries(torch INTERFACE Python3::Python)
-  target_compile_definitions(${PN}::${PN} INTERFACE USE_PYTHON)
-endif()
-
 endif()
 endif()
diff --git a/docs/source/io.rst b/docs/source/io.rst
index 1da9bb6882a..f8258713163 100644
--- a/docs/source/io.rst
+++ b/docs/source/io.rst
@@ -19,6 +19,7 @@ Images
     encode_jpeg
     decode_jpeg
     write_jpeg
+    decode_gif
     encode_png
     decode_png
     write_png
diff --git a/docs/source/transforms.rst b/docs/source/transforms.rst
index 54ed18394cd..4bb18cf6b48 100644
--- a/docs/source/transforms.rst
+++ b/docs/source/transforms.rst
@@ -350,6 +350,7 @@ Color
     v2.RGB
     v2.RandomGrayscale
     v2.GaussianBlur
+    v2.GaussianNoise
     v2.RandomInvert
     v2.RandomPosterize
     v2.RandomSolarize
@@ -368,6 +369,7 @@ Functionals
     v2.functional.grayscale_to_rgb
     v2.functional.to_grayscale
     v2.functional.gaussian_blur
+    v2.functional.gaussian_noise
     v2.functional.invert
     v2.functional.posterize
     v2.functional.solarize
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
new file mode 100644
index 00000000000..a1329b0c968
--- /dev/null
+++ b/examples/cpp/CMakeLists.txt
@@ -0,0 +1,18 @@
+cmake_minimum_required(VERSION 3.10)
+project(run_model)
+
+option(USE_TORCHVISION "Whether to link to torchvision" OFF)
+
+find_package(Torch REQUIRED)
+if(USE_TORCHVISION)
+  find_package(TorchVision REQUIRED)
+endif()
+
+add_executable(run_model run_model.cpp)
+
+target_link_libraries(run_model "${TORCH_LIBRARIES}")
+if(USE_TORCHVISION)
+  target_link_libraries(run_model TorchVision::TorchVision)
+endif()
+
+set_property(TARGET run_model PROPERTY CXX_STANDARD 17)
diff --git a/examples/cpp/README.rst b/examples/cpp/README.rst
new file mode 100644
index 00000000000..b2a9174c8ba
--- /dev/null
+++ b/examples/cpp/README.rst
@@ -0,0 +1,101 @@
+Using torchvision models in C++
+===============================
+
+This is a minimal example of getting TorchVision models to work in C++ with
+Torchscript. The model is first scripted in Python and exported to a file, and
+then loaded in C++. For a similar walkthrough, see [this
+tutorial](https://pytorch.org/tutorials/advanced/cpp_export.html).
+
+In order to successfully compile this example, make sure you have ``LibTorch``
+installed. You can either:
+
+- Install PyTorch normally
+- Or download the LibTorch C++ distribution.
+
+In both cases, refer [here](https://pytorch.org/get-started/locally/) for the
+corresponding install or download instructions.
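+
+For reference, the Python-side export that produces the ``.pt`` files used
+below is only a few lines. This minimal sketch mirrors what `script_model.py`
+in this directory does for `resnet18`:
+
+```python
+import torch
+from torchvision import models
+
+model = models.resnet18(weights=None)
+model.eval()
+torch.jit.script(model).save("resnet18.pt")
+```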
+
+Some torchvision models only depend on PyTorch operators, and can be used in C++
+without depending on the torchvision lib. Other models rely on torchvision's C++
+operators like NMS, RoiAlign (typically the detection models) and those need to
+be linked against the torchvision lib.
+
+We'll first see the simpler case of running a model without the torchvision lib
+dependency.
+
+Running a model that doesn't need torchvision lib
+-------------------------------------------------
+
+Create a ``build`` directory inside the current one.
+
+```bash
+mkdir build
+cd build
+```
+
+Then run `python ../script_model.py` which should create `resnet18.pt` and
+`fasterrcnn_resnet50_fpn.pt` files in the build directory. These are the
+scripted models that will be used in the C++ code.
+
+We can now start building with CMake. We have to tell CMake where it can find
+the necessary PyTorch resources. If you installed PyTorch normally, you can do:
+
+```bash
+TORCH_PATH=$(python -c "import pathlib, torch; print(pathlib.Path(torch.__path__[0]))")
+Torch_DIR="${TORCH_PATH}/share/cmake/Torch"  # there should be .cmake files in there
+
+cmake .. -DTorch_DIR=$Torch_DIR
+```
+
+If instead you downloaded LibTorch somewhere, you can do:
+
+```bash
+cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch
+```
+
+Then run `cmake --build .` and you should now be able to run
+
+```bash
+./run_model resnet18.pt
+```
+
+If you try to run a model that depends on the torchvision lib, like
+`./run_model fasterrcnn_resnet50_fpn.pt`, you should get a runtime error. This is
+because the executable wasn't linked against the torchvision lib.
+
+
+Running a model that needs torchvision lib
+------------------------------------------
+
+First, we need to build the torchvision lib. To build the torchvision lib go to
+the root of the torchvision project and run:
+
+```bash
+mkdir build
+cd build
+cmake .. -DCMAKE_PREFIX_PATH=/path/to/libtorch  # or -DTorch_DIR= if you installed PyTorch normally, see above
+cmake --build .
+cmake --install .
+```
+
+You may want to pass `-DCMAKE_INSTALL_PREFIX=/path/to/libtorchvision` for
+cmake to copy/install the files to a specific location (e.g. `$CONDA_PREFIX`).
+
+**DISCLAIMER**: the `libtorchvision` library includes the torchvision
+custom ops as well as most of the C++ torchvision APIs. Those APIs do not come
+with any backward-compatibility guarantees and may change from one version to
+the next. Only the Python APIs are stable and come with backward-compatibility
+guarantees. So, if you need stability within a C++ environment, your best bet is
+to export the Python APIs via torchscript.
+
+Now that libtorchvision is built and installed, we can tell our project to use
+and link to it via the `-DUSE_TORCHVISION` flag. We also need to tell CMake
+where to find it, just like we did with LibTorch, e.g.:
+
+```bash
+cmake .. -DTorch_DIR=$Torch_DIR -DTorchVision_DIR=path/to/libtorchvision -DUSE_TORCHVISION=ON
+cmake --build .
+```
+
+Now the `run_model` executable should be able to run the
+`fasterrcnn_resnet50_fpn.pt` file.
diff --git a/examples/cpp/hello_world/CMakeLists.txt b/examples/cpp/hello_world/CMakeLists.txt
deleted file mode 100644
index 7d49178b8b3..00000000000
--- a/examples/cpp/hello_world/CMakeLists.txt
+++ /dev/null
@@ -1,20 +0,0 @@
-cmake_minimum_required(VERSION 3.10)
-project(hello-world)
-
-# The first thing do is to tell cmake to find the TorchVision library.
-# The package pulls in all the necessary torch libraries,
-# so there is no need to also add `find_package(Torch)` here.
-find_package(TorchVision REQUIRED) - -# This due to LibTorch's version is the one included in the Python -# package that links to Python. -find_package(Python3 COMPONENTS Development) - -add_executable(hello-world main.cpp) - -# We now need to link the TorchVision library to our executable. -# We can do that by using the TorchVision::TorchVision target, -# which also adds all the necessary torch dependencies. -target_compile_features(hello-world PUBLIC cxx_range_for) -target_link_libraries(hello-world TorchVision::TorchVision) -set_property(TARGET hello-world PROPERTY CXX_STANDARD 17) diff --git a/examples/cpp/hello_world/README.rst b/examples/cpp/hello_world/README.rst deleted file mode 100644 index 68b10a65b3c..00000000000 --- a/examples/cpp/hello_world/README.rst +++ /dev/null @@ -1,20 +0,0 @@ -Hello World! -============ - -This is a minimal example of getting TorchVision to work in C++ with CMake. - - -In order to successfully compile this example, make sure you have both ``LibTorch`` and -``TorchVision`` installed. -Once both dependencies are sorted, we can start the CMake fun: - -1) Create a ``build`` directory inside the current one. -2) from within the ``build`` directory, run the following commands: - - ``python ../trace_model.py`` To use a torchvision model in C++, you must first export it from the python version of torchvision. More information can be found on the corresponding `documentation page `_. - - | ``cmake -DCMAKE_PREFIX_PATH=";" ..`` - | where ```` and ```` are the paths to the libtorch and torchvision installations. - - ``cmake --build .`` - -| That's it! -| You should now have a ``hello-world`` executable in your ``build`` folder. - Running it will output a (fairly long) tensor of random values to your terminal. diff --git a/examples/cpp/hello_world/main.cpp b/examples/cpp/hello_world/main.cpp deleted file mode 100644 index bcbe68dd07d..00000000000 --- a/examples/cpp/hello_world/main.cpp +++ /dev/null @@ -1,44 +0,0 @@ -#include -#include -#include -#include - -int main() { - torch::DeviceType device_type; - device_type = torch::kCPU; - - torch::jit::script::Module model; - try { - std::cout << "Loading model\n"; - // Deserialize the ScriptModule from a file using torch::jit::load(). - model = torch::jit::load("resnet18.pt"); - std::cout << "Model loaded\n"; - } catch (const torch::Error& e) { - std::cout << "error loading the model\n"; - return -1; - } catch (const std::exception& e) { - std::cout << "Other error: " << e.what() << "\n"; - return -1; - } - - // TorchScript models require a List[IValue] as input - std::vector inputs; - - // Create a random input tensor and run it through the model. 
-  inputs.push_back(torch::rand({1, 3, 10, 10}));
-  auto out = model.forward(inputs);
-  std::cout << out << "\n";
-
-  if (torch::cuda::is_available()) {
-    // Move model and inputs to GPU
-    model.to(torch::kCUDA);
-
-    // Add GPU inputs
-    inputs.clear();
-    torch::TensorOptions options = torch::TensorOptions{torch::kCUDA};
-    inputs.push_back(torch::rand({1, 3, 10, 10}, options));
-
-    auto gpu_out = model.forward(inputs);
-    std::cout << gpu_out << "\n";
-  }
-}
diff --git a/examples/cpp/hello_world/trace_model.py b/examples/cpp/hello_world/trace_model.py
deleted file mode 100644
index 41bbaf8b6dd..00000000000
--- a/examples/cpp/hello_world/trace_model.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import os.path as osp
-
-import torch
-import torchvision
-
-HERE = osp.dirname(osp.abspath(__file__))
-ASSETS = osp.dirname(osp.dirname(HERE))
-
-model = torchvision.models.resnet18()
-model.eval()
-
-traced_model = torch.jit.script(model)
-traced_model.save("resnet18.pt")
diff --git a/examples/cpp/run_model.cpp b/examples/cpp/run_model.cpp
new file mode 100644
index 00000000000..36c9d93cfa4
--- /dev/null
+++ b/examples/cpp/run_model.cpp
@@ -0,0 +1,67 @@
+#include <cstring>
+#include <iostream>
+#include <torch/script.h>
+#include <torch/torch.h>
+
+#ifdef _WIN32
+#include
+#endif // _WIN32
+
+int main(int argc, const char* argv[]) {
+  if (argc != 2) {
+    std::cout << "Usage: run_model \n";
+    return -1;
+  }
+  torch::DeviceType device_type;
+  device_type = torch::kCPU;
+
+  torch::jit::script::Module model;
+  try {
+    std::cout << "Loading model\n";
+    // Deserialize the ScriptModule from a file using torch::jit::load().
+    model = torch::jit::load(argv[1]);
+    std::cout << "Model loaded\n";
+  } catch (const torch::Error& e) {
+    std::cout << "error loading the model.\n";
+    return -1;
+  } catch (const std::exception& e) {
+    std::cout << "Other error: " << e.what() << "\n";
+    return -1;
+  }
+
+  // TorchScript models require a List[IValue] as input
+  std::vector<torch::jit::IValue> inputs;
+
+  if (std::strstr(argv[1], "fasterrcnn") != NULL) {
+    // Faster RCNN accepts a List[Tensor] as main input
+    std::vector<torch::Tensor> images;
+    images.push_back(torch::rand({3, 256, 275}));
+    images.push_back(torch::rand({3, 256, 275}));
+    inputs.push_back(images);
+  } else {
+    inputs.push_back(torch::rand({1, 3, 10, 10}));
+  }
+  auto out = model.forward(inputs);
+  std::cout << out << "\n";
+
+  if (torch::cuda::is_available()) {
+    // Move model and inputs to GPU
+    model.to(torch::kCUDA);
+
+    // Add GPU inputs
+    inputs.clear();
+    torch::TensorOptions options = torch::TensorOptions{torch::kCUDA};
+    if (std::strstr(argv[1], "fasterrcnn") != NULL) {
+      // Faster RCNN accepts a List[Tensor] as main input
+      std::vector<torch::Tensor> images;
+      images.push_back(torch::rand({3, 256, 275}, options));
+      images.push_back(torch::rand({3, 256, 275}, options));
+      inputs.push_back(images);
+    } else {
+      inputs.push_back(torch::rand({1, 3, 10, 10}, options));
+    }
+
+    auto gpu_out = model.forward(inputs);
+    std::cout << gpu_out << "\n";
+  }
+}
diff --git a/examples/cpp/script_model.py b/examples/cpp/script_model.py
new file mode 100644
index 00000000000..e91e888e7be
--- /dev/null
+++ b/examples/cpp/script_model.py
@@ -0,0 +1,10 @@
+import torch
+from torchvision import models
+
+for model, name in (
+    (models.resnet18(weights=None), "resnet18"),
+    (models.detection.fasterrcnn_resnet50_fpn(weights=None, weights_backbone=None), "fasterrcnn_resnet50_fpn"),
+):
+    model.eval()
+    traced_model = torch.jit.script(model)
+    traced_model.save(f"{name}.pt")
diff --git a/mypy.ini b/mypy.ini
index de78c1e6c64..d6f3cb16963 100644
--- a/mypy.ini
+++ b/mypy.ini
@@ -23,17
+23,7 @@ allow_redefinition = True [mypy-torchvision.prototype.transforms.*] -; untyped definitions and calls -disallow_untyped_defs = True - -; None and Optional handling -no_implicit_optional = True - -; warnings -warn_unused_ignores = True - -; miscellaneous strictness flags -allow_redefinition = True +ignore_errors = True [mypy-torchvision.prototype.datasets.*] @@ -47,6 +37,10 @@ ignore_errors = True ignore_errors = True +[mypy-torchvision.io.video_reader] + +ignore_errors = True + [mypy-torchvision.models.densenet.*] ignore_errors=True @@ -95,6 +89,10 @@ ignore_errors = True ignore_errors = True +[mypy-torchvision.transforms._functional_pil] + +ignore_errors = True + [mypy-torchvision.transforms.functional.*] ignore_errors = True diff --git a/packaging/windows/internal/build_cpp_example.bat b/packaging/windows/internal/build_cpp_example.bat index e3f7afe9f02..129c574e391 100644 --- a/packaging/windows/internal/build_cpp_example.bat +++ b/packaging/windows/internal/build_cpp_example.bat @@ -1,3 +1,3 @@ @echo on set CL=/I"C:\Program Files (x86)\torchvision\include" -msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" hello-world.vcxproj -maxcpucount:%1 +msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" run_model.vcxproj -maxcpucount:%1 diff --git a/packaging/windows/internal/build_frcnn.bat b/packaging/windows/internal/build_frcnn.bat deleted file mode 100644 index 36e3757d01c..00000000000 --- a/packaging/windows/internal/build_frcnn.bat +++ /dev/null @@ -1,3 +0,0 @@ -@echo on -set CL=/I"C:\Program Files (x86)\torchvision\include" -msbuild "-p:Configuration=Release" "-p:BuildInParallel=true" "-p:MultiProcessorCompilation=true" "-p:CL_MPCount=%1" test_frcnn_tracing.vcxproj -maxcpucount:%1 diff --git a/references/classification/README.md b/references/classification/README.md index 203dae5dbc4..65ee416bf89 100644 --- a/references/classification/README.md +++ b/references/classification/README.md @@ -120,7 +120,7 @@ Here `$MODEL` is one of `efficientnet_v2_s` and `efficientnet_v2_m`. Note that the Small variant had a `$TRAIN_SIZE` of `300` and a `$EVAL_SIZE` of `384`, while the Medium `384` and `480` respectively. Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 4 nodes, each with 8 GPUs (for a total of 32 GPUs), +For generating the pre-trained weights, we trained with 4 nodes, each with 8 GPUs (for a total of 32 GPUs), and `--batch_size 32`. The weights of the Large variant are ported from the original paper rather than trained from scratch. See the `EfficientNet_V2_L_Weights` entry for their exact preprocessing transforms. @@ -167,7 +167,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), +For generating the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), and `--batch_size 64`. #### vit_b_32 @@ -180,7 +180,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. 
-For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 256`. #### vit_l_16 @@ -193,7 +193,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 64`. #### vit_l_32 @@ -206,7 +206,7 @@ torchrun --nproc_per_node=8 train.py\ ``` Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), +For generating the pre-trained weights, we trained with 8 nodes, each with 8 GPUs (for a total of 64 GPUs), and `--batch_size 64`. @@ -221,7 +221,7 @@ torchrun --nproc_per_node=8 train.py\ Here `$MODEL` is one of `convnext_tiny`, `convnext_small`, `convnext_base` and `convnext_large`. Note that each variant had its `--val-resize-size` optimized in a post-training step, see their `Weights` entry for their exact value. Note that the above command corresponds to training on a single node with 8 GPUs. -For generatring the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), +For generating the pre-trained weights, we trained with 2 nodes, each with 8 GPUs (for a total of 16 GPUs), and `--batch_size 64`. diff --git a/references/classification/transforms.py b/references/classification/transforms.py index 5443437d29d..96236608eec 100644 --- a/references/classification/transforms.py +++ b/references/classification/transforms.py @@ -19,9 +19,9 @@ def get_mixup_cutmix(*, mixup_alpha, cutmix_alpha, num_classes, use_v2): ) if cutmix_alpha > 0: mixup_cutmix.append( - transforms_module.CutMix(alpha=mixup_alpha, num_classes=num_classes) + transforms_module.CutMix(alpha=cutmix_alpha, num_classes=num_classes) if use_v2 - else RandomCutMix(num_classes=num_classes, p=1.0, alpha=mixup_alpha) + else RandomCutMix(num_classes=num_classes, p=1.0, alpha=cutmix_alpha) ) if not mixup_cutmix: return None diff --git a/setup.py b/setup.py index f101f5d5adc..fedbc370f72 100644 --- a/setup.py +++ b/setup.py @@ -209,7 +209,6 @@ def get_extensions(): if sys.platform == "win32": define_macros += [("torchvision_EXPORTS", None)] - define_macros += [("USE_PYTHON", None)] extra_compile_args["cxx"].append("/MP") if debug_mode: @@ -254,9 +253,6 @@ def get_extensions(): image_library = [] image_link_flags = [] - if sys.platform == "win32": - image_macros += [("USE_PYTHON", None)] - # Locating libPNG libpng = shutil.which("libpng-config") pngfix = shutil.which("pngfix") @@ -332,7 +328,11 @@ def get_extensions(): image_macros += [("NVJPEG_FOUND", str(int(use_nvjpeg)))] image_path = os.path.join(extensions_dir, "io", "image") - image_src = glob.glob(os.path.join(image_path, "*.cpp")) + glob.glob(os.path.join(image_path, "cpu", "*.cpp")) + image_src = ( + glob.glob(os.path.join(image_path, "*.cpp")) + + glob.glob(os.path.join(image_path, "cpu", "*.cpp")) + + glob.glob(os.path.join(image_path, "cpu", "giflib", "*.c")) + ) if is_rocm_pytorch: image_src += glob.glob(os.path.join(image_path, "hip", "*.cpp")) @@ -341,18 +341,17 @@ def get_extensions(): else: image_src += glob.glob(os.path.join(image_path, 
"cuda", "*.cpp")) - if use_png or use_jpeg: - ext_modules.append( - extension( - "torchvision.image", - image_src, - include_dirs=image_include + include_dirs + [image_path], - library_dirs=image_library + library_dirs, - define_macros=image_macros, - libraries=image_link_flags, - extra_compile_args=extra_compile_args, - ) + ext_modules.append( + extension( + "torchvision.image", + image_src, + include_dirs=image_include + include_dirs + [image_path], + library_dirs=image_library + library_dirs, + define_macros=image_macros, + libraries=image_link_flags, + extra_compile_args=extra_compile_args, ) + ) # Locating ffmpeg ffmpeg_exe = shutil.which("ffmpeg") @@ -555,6 +554,7 @@ def run(self): zip_safe=False, install_requires=requirements, extras_require={ + "gdown": ["gdown>=4.7.3"], "scipy": ["scipy"], }, ext_modules=get_extensions(), diff --git a/test/test_datasets.py b/test/test_datasets.py index aa100aa55c1..38a5fe33e3e 100644 --- a/test/test_datasets.py +++ b/test/test_datasets.py @@ -2553,7 +2553,7 @@ class OxfordIIITPetTestCase(datasets_utils.ImageDatasetTestCase): ADDITIONAL_CONFIGS = combinations_grid( split=("trainval", "test"), - target_types=("category", "segmentation", ["category", "segmentation"], []), + target_types=("category", "binary-category", "segmentation", ["category", "segmentation"], []), ) def inject_fake_data(self, tmpdir, config): diff --git a/test/test_image.py b/test/test_image.py index 3d9d612b5f3..619017df407 100644 --- a/test/test_image.py +++ b/test/test_image.py @@ -1,17 +1,20 @@ import glob import io import os +import re import sys from pathlib import Path import numpy as np import pytest +import requests import torch import torchvision.transforms.functional as F from common_utils import assert_equal, needs_cuda -from PIL import __version__ as PILLOW_VERSION, Image, ImageOps +from PIL import __version__ as PILLOW_VERSION, Image, ImageOps, ImageSequence from torchvision.io.image import ( _read_png_16, + decode_gif, decode_image, decode_jpeg, decode_png, @@ -548,5 +551,58 @@ def test_pathlib_support(tmpdir): write_png(img, write_path) +@pytest.mark.parametrize( + "name", ("gifgrid", "fire", "porsche", "treescap", "treescap-interlaced", "solid2", "x-trans", "earth") +) +@pytest.mark.parametrize("scripted", (True, False)) +def test_decode_gif(tmpdir, name, scripted): + # Using test images from GIFLIB + # https://sourceforge.net/p/giflib/code/ci/master/tree/pic/, we assert PIL + # and torchvision decoded outputs are equal. + # We're not testing against "welcome2" because PIL and GIFLIB disagee on what + # the background color should be (likely a difference in the way they handle + # transparency?) + # 'earth' image is from wikipedia, licensed under CC BY-SA 3.0 + # https://creativecommons.org/licenses/by-sa/3.0/ + # it allows to properly test for transparency, TOP-LEFT offsets, and + # disposal modes. 
+ + path = tmpdir / f"{name}.gif" + if name == "earth": + url = "https://upload.wikimedia.org/wikipedia/commons/2/2c/Rotating_earth_%28large%29.gif" + else: + url = f"https://sourceforge.net/p/giflib/code/ci/master/tree/pic/{name}.gif?format=raw" + with open(path, "wb") as f: + f.write(requests.get(url).content) + + encoded_bytes = read_file(path) + f = torch.jit.script(decode_gif) if scripted else decode_gif + tv_out = f(encoded_bytes) + if tv_out.ndim == 3: + tv_out = tv_out[None] + + assert tv_out.is_contiguous(memory_format=torch.channels_last) + + # For some reason, not using Image.open() as a CM causes "ResourceWarning: unclosed file" + with Image.open(path) as pil_img: + pil_seq = ImageSequence.Iterator(pil_img) + + for pil_frame, tv_frame in zip(pil_seq, tv_out): + pil_frame = F.pil_to_tensor(pil_frame.convert("RGB")) + torch.testing.assert_close(tv_frame, pil_frame, atol=0, rtol=0) + + +def test_decode_gif_errors(): + encoded_data = torch.randint(0, 256, (100,), dtype=torch.uint8) + with pytest.raises(RuntimeError, match="Input tensor must be 1-dimensional"): + decode_gif(encoded_data[None]) + with pytest.raises(RuntimeError, match="Input tensor must have uint8 data type"): + decode_gif(encoded_data.float()) + with pytest.raises(RuntimeError, match="Input tensor must be contiguous"): + decode_gif(encoded_data[::2]) + with pytest.raises(RuntimeError, match=re.escape("DGifOpenFileName() failed - 103")): + decode_gif(encoded_data) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/test/test_ops.py b/test/test_ops.py index 52a66f380a6..99b259f73f5 100644 --- a/test/test_ops.py +++ b/test/test_ops.py @@ -14,6 +14,7 @@ from common_utils import assert_equal, cpu_and_cuda, cpu_and_cuda_and_mps, needs_cuda, needs_mps from PIL import Image from torch import nn, Tensor +from torch._dynamo.utils import is_compile_supported from torch.autograd import gradcheck from torch.nn.modules.utils import _pair from torchvision import models, ops @@ -529,6 +530,10 @@ def test_autocast_cpu(self, aligned, deterministic, x_dtype, rois_dtype): def test_backward(self, seed, device, contiguous, deterministic): if deterministic and device == "cpu": pytest.skip("cpu is always deterministic, don't retest") + if deterministic and device == "mps": + pytest.skip("no deterministic implementation for mps") + if deterministic and not is_compile_supported(device): + pytest.skip("deterministic implementation only if torch.compile supported") super().test_backward(seed, device, contiguous, deterministic) def _make_rois(self, img_size, num_imgs, dtype, num_rois=1000): diff --git a/test/test_transforms_v2.py b/test/test_transforms_v2.py index 24574eb1a43..8a47a589508 100644 --- a/test/test_transforms_v2.py +++ b/test/test_transforms_v2.py @@ -111,8 +111,10 @@ def _check_kernel_scripted_vs_eager(kernel, input, *args, rtol, atol, **kwargs): input = input.as_subclass(torch.Tensor) with ignore_jit_no_profile_information_warning(): - actual = kernel_scripted(input, *args, **kwargs) - expected = kernel(input, *args, **kwargs) + with freeze_rng_state(): + actual = kernel_scripted(input, *args, **kwargs) + with freeze_rng_state(): + expected = kernel(input, *args, **kwargs) assert_close(actual, expected, rtol=rtol, atol=atol) @@ -2169,26 +2171,30 @@ def test_image_correctness(self, brightness_factor): class TestCutMixMixUp: class DummyDataset: - def __init__(self, size, num_classes): + def __init__(self, size, num_classes, one_hot_labels): self.size = size self.num_classes = num_classes + self.one_hot_labels = 
one_hot_labels
             assert size < num_classes
 
         def __getitem__(self, idx):
             img = torch.rand(3, 100, 100)
             label = idx  # This ensures all labels in a batch are unique and makes testing easier
+            if self.one_hot_labels:
+                label = torch.nn.functional.one_hot(torch.tensor(label), num_classes=self.num_classes)
             return img, label
 
         def __len__(self):
             return self.size
 
     @pytest.mark.parametrize("T", [transforms.CutMix, transforms.MixUp])
-    def test_supported_input_structure(self, T):
+    @pytest.mark.parametrize("one_hot_labels", (True, False))
+    def test_supported_input_structure(self, T, one_hot_labels):
         batch_size = 32
         num_classes = 100
 
-        dataset = self.DummyDataset(size=batch_size, num_classes=num_classes)
+        dataset = self.DummyDataset(size=batch_size, num_classes=num_classes, one_hot_labels=one_hot_labels)
 
         cutmix_mixup = T(num_classes=num_classes)
 
@@ -2198,7 +2204,7 @@ def test_supported_input_structure(self, T):
         img, target = next(iter(dl))
         input_img_size = img.shape[-3:]
         assert isinstance(img, torch.Tensor) and isinstance(target, torch.Tensor)
-        assert target.shape == (batch_size,)
+        assert target.shape == ((batch_size, num_classes) if one_hot_labels else (batch_size,))
 
         def check_output(img, target):
             assert img.shape == (batch_size, *input_img_size)
@@ -2209,7 +2215,7 @@ def check_output(img, target):
 
         # After Dataloader, as unpacked input
         img, target = next(iter(dl))
-        assert target.shape == (batch_size,)
+        assert target.shape == ((batch_size, num_classes) if one_hot_labels else (batch_size,))
 
         img, target = cutmix_mixup(img, target)
         check_output(img, target)
@@ -2264,7 +2270,7 @@ def test_error(self, T):
         with pytest.raises(ValueError, match="Could not infer where the labels are"):
             cutmix_mixup({"img": imgs, "Nothing_else": 3})
 
-        with pytest.raises(ValueError, match="labels tensor should be of shape"):
+        with pytest.raises(ValueError, match="labels should be index based"):
             # Note: the error message isn't ideal, but that's because the label heuristic found the img as the label
             # It's OK, it's an edge-case. The important thing is that this fails loudly instead of passing silently
             cutmix_mixup(imgs)
@@ -2272,22 +2278,21 @@ def test_error(self, T):
         with pytest.raises(ValueError, match="When using the default labels_getter"):
             cutmix_mixup(imgs, "not_a_tensor")
 
-        with pytest.raises(ValueError, match="labels tensor should be of shape"):
-            cutmix_mixup(imgs, torch.randint(0, 2, size=(2, 3)))
-
         with pytest.raises(ValueError, match="Expected a batched input with 4 dims"):
             cutmix_mixup(imgs[None, None], torch.randint(0, num_classes, size=(batch_size,)))
 
         with pytest.raises(ValueError, match="does not match the batch size of the labels"):
             cutmix_mixup(imgs, torch.randint(0, num_classes, size=(batch_size + 1,)))
 
-        with pytest.raises(ValueError, match="labels tensor should be of shape"):
-            # The purpose of this check is more about documenting the current
-            # behaviour of what happens on a Compose(), rather than actually
-            # asserting the expected behaviour. We may support Compose() in the
-            # future, e.g. for 2 consecutive CutMix?
-            labels = torch.randint(0, num_classes, size=(batch_size,))
-            transforms.Compose([cutmix_mixup, cutmix_mixup])(imgs, labels)
+        with pytest.raises(ValueError, match="When passing 2D labels"):
+            wrong_num_classes = num_classes + 1
+            T(alpha=0.5, num_classes=num_classes)(imgs, torch.randint(0, 2, size=(batch_size, wrong_num_classes)))
+
+        with pytest.raises(ValueError, match="but got a tensor of shape"):
+            cutmix_mixup(imgs, torch.randint(0, 2, size=(2, 3, 4)))
+
+        with pytest.raises(ValueError, match="num_classes must be passed"):
+            T(alpha=0.5)(imgs, torch.randint(0, num_classes, size=(batch_size,)))
 
     @pytest.mark.parametrize("key", ("labels", "LABELS", "LaBeL", "SOME_WEIRD_KEY_THAT_HAS_LABeL_IN_IT"))
@@ -3235,6 +3240,78 @@ def test_functional_image_correctness(self, dimensions, kernel_size, sigma, dtyp
     torch.testing.assert_close(actual, expected, rtol=0, atol=1)
 
 
+class TestGaussianNoise:
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_kernel(self, make_input):
+        check_kernel(
+            F.gaussian_noise,
+            make_input(dtype=torch.float32),
+            # This cannot pass because the noise on a batch is not per-image
+            check_batched_vs_unbatched=False,
+        )
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_functional(self, make_input):
+        check_functional(F.gaussian_noise, make_input(dtype=torch.float32))
+
+    @pytest.mark.parametrize(
+        ("kernel", "input_type"),
+        [
+            (F.gaussian_noise, torch.Tensor),
+            (F.gaussian_noise_image, tv_tensors.Image),
+            (F.gaussian_noise_video, tv_tensors.Video),
+        ],
+    )
+    def test_functional_signature(self, kernel, input_type):
+        check_functional_kernel_signature_match(F.gaussian_noise, kernel=kernel, input_type=input_type)
+
+    @pytest.mark.parametrize(
+        "make_input",
+        [make_image_tensor, make_image, make_video],
+    )
+    def test_transform(self, make_input):
+        def adapter(_, input, __):
+            # This transform doesn't support uint8 so we have to convert the auto-generated uint8 tensors to float32
+            # Same for PIL images
+            for key, value in input.items():
+                if isinstance(value, torch.Tensor) and not value.is_floating_point():
+                    input[key] = value.to(torch.float32)
+                if isinstance(value, PIL.Image.Image):
+                    input[key] = F.pil_to_tensor(value).to(torch.float32)
+            return input
+
+        check_transform(transforms.GaussianNoise(), make_input(dtype=torch.float32), check_sample_input=adapter)
+
+    def test_bad_input(self):
+        with pytest.raises(ValueError, match="Gaussian Noise is not implemented for PIL images."):
+            F.gaussian_noise(make_image_pil())
+        with pytest.raises(ValueError, match="Input tensor is expected to be in float dtype"):
+            F.gaussian_noise(make_image(dtype=torch.uint8))
+        with pytest.raises(ValueError, match="sigma shouldn't be negative"):
+            F.gaussian_noise(make_image(dtype=torch.float32), sigma=-1)
+
+    def test_clip(self):
+        img = make_image(dtype=torch.float32)
+
+        out = F.gaussian_noise(img, mean=100, clip=False)
+        assert out.min() > 50
+
+        out = F.gaussian_noise(img, mean=100, clip=True)
+        assert (out == 1).all()
+
+        out = F.gaussian_noise(img, mean=-100, clip=False)
+        assert out.min() < -50
+
+        out = F.gaussian_noise(img, mean=-100, clip=True)
+        assert (out == 0).all()
+
+
 class TestAutoAugmentTransforms:
     # These transforms have a lot of branches in their `forward()` passes which are conditioned on random sampling.
     # It's typically very hard to test the effect on some parameters without heavy mocking logic.
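The `GaussianNoise` transform and `gaussian_noise` functional covered by the
tests above can be used as follows (a minimal sketch with illustrative values;
float input is required, and `clip=True` keeps the output in [0, 1]):

```python
import torch
from torchvision.transforms import v2

img = torch.rand(3, 224, 224)  # float input; uint8 raises a ValueError
noisy = v2.GaussianNoise(mean=0.0, sigma=0.1, clip=True)(img)
```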
@@ -5200,6 +5277,11 @@ def test_transform(self, make_input, dtype, device):
             transforms.LinearTransformation(*self._make_matrix_and_vector(input)),
             input,
             check_sample_input=self._sample_input_adapter,
+            # Compat check is failing on M1 with:
+            # AssertionError: Tensor-likes are not close!
+            # Mismatched elements: 1 / 561 (0.2%)
+            # See https://github.com/pytorch/vision/issues/8453
+            check_v1_compatibility=(sys.platform != "darwin"),
         )
 
     def test_transform_error(self):
diff --git a/test/test_utils.py b/test/test_utils.py
index ac394b51d63..e89bef4a6d9 100644
--- a/test/test_utils.py
+++ b/test/test_utils.py
@@ -355,6 +355,13 @@ def test_draw_keypoints_vanilla():
     assert_equal(img, img_cp)
 
 
+def test_draw_keypoints_K_equals_one():
+    # Non-regression test for https://github.com/pytorch/vision/pull/8439
+    img = torch.full((3, 100, 100), 0, dtype=torch.uint8)
+    keypoints = torch.tensor([[[10, 10]]], dtype=torch.float)
+    utils.draw_keypoints(img, keypoints)
+
+
 @pytest.mark.parametrize("colors", ["red", "#FF00FF", (1, 34, 122)])
 def test_draw_keypoints_colored(colors):
     # Keypoints is declared on top as global variable
diff --git a/test/tracing/frcnn/CMakeLists.txt b/test/tracing/frcnn/CMakeLists.txt
deleted file mode 100644
index 8ede462e34b..00000000000
--- a/test/tracing/frcnn/CMakeLists.txt
+++ /dev/null
@@ -1,13 +0,0 @@
-cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
-project(test_frcnn_tracing)
-
-find_package(Torch REQUIRED)
-find_package(TorchVision REQUIRED)
-
-# This due to some headers importing Python.h
-find_package(Python3 COMPONENTS Development)
-
-add_executable(test_frcnn_tracing test_frcnn_tracing.cpp)
-target_compile_features(test_frcnn_tracing PUBLIC cxx_range_for)
-target_link_libraries(test_frcnn_tracing ${TORCH_LIBRARIES} TorchVision::TorchVision Python3::Python)
-set_property(TARGET test_frcnn_tracing PROPERTY CXX_STANDARD 17)
diff --git a/test/tracing/frcnn/test_frcnn_tracing.cpp b/test/tracing/frcnn/test_frcnn_tracing.cpp
deleted file mode 100644
index f5f350b6b02..00000000000
--- a/test/tracing/frcnn/test_frcnn_tracing.cpp
+++ /dev/null
@@ -1,58 +0,0 @@
-#include
-#include
-#include
-#include
-
-
-int main() {
-  torch::DeviceType device_type;
-  device_type = torch::kCPU;
-
-  torch::jit::script::Module module;
-  try {
-    std::cout << "Loading model\n";
-    // Deserialize the ScriptModule from a file using torch::jit::load().
-    module = torch::jit::load("fasterrcnn_resnet50_fpn.pt");
-    std::cout << "Model loaded\n";
-  } catch (const torch::Error& e) {
-    std::cout << "error loading the model\n";
-    return -1;
-  } catch (const std::exception& e) {
-    std::cout << "Other error: " << e.what() << "\n";
-    return -1;
-  }
-
-  // TorchScript models require a List[IValue] as input
-  std::vector<torch::jit::IValue> inputs;
-
-  // Faster RCNN accepts a List[Tensor] as main input
-  std::vector<torch::Tensor> images;
-  images.push_back(torch::rand({3, 256, 275}));
-  images.push_back(torch::rand({3, 256, 275}));
-
-  inputs.push_back(images);
-  auto output = module.forward(inputs);
-
-  std::cout << "ok\n";
-  std::cout << "output" << output << "\n";
-
-  if (torch::cuda::is_available()) {
-    // Move traced model to GPU
-    module.to(torch::kCUDA);
-
-    // Add GPU inputs
-    images.clear();
-    inputs.clear();
-
-    torch::TensorOptions options = torch::TensorOptions{torch::kCUDA};
-    images.push_back(torch::rand({3, 256, 275}, options));
-    images.push_back(torch::rand({3, 256, 275}, options));
-
-    inputs.push_back(images);
-    auto output = module.forward(inputs);
-
-    std::cout << "ok\n";
-    std::cout << "output" << output << "\n";
-  }
-  return 0;
-}
diff --git a/test/tracing/frcnn/trace_model.py b/test/tracing/frcnn/trace_model.py
deleted file mode 100644
index b5ec50bdab1..00000000000
--- a/test/tracing/frcnn/trace_model.py
+++ /dev/null
@@ -1,13 +0,0 @@
-import os.path as osp
-
-import torch
-import torchvision
-
-HERE = osp.dirname(osp.abspath(__file__))
-ASSETS = osp.dirname(osp.dirname(HERE))
-
-model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None, weights_backbone=None)
-model.eval()
-
-traced_model = torch.jit.script(model)
-traced_model.save("fasterrcnn_resnet50_fpn.pt")
diff --git a/torchvision/csrc/io/image/cpu/decode_gif.cpp b/torchvision/csrc/io/image/cpu/decode_gif.cpp
new file mode 100644
index 00000000000..183d42e86a4
--- /dev/null
+++ b/torchvision/csrc/io/image/cpu/decode_gif.cpp
@@ -0,0 +1,173 @@
+#include "decode_gif.h"
+#include <cstring>
+#include "giflib/gif_lib.h"
+
+namespace vision {
+namespace image {
+
+typedef struct reader_helper_t {
+  uint8_t const* encoded_data; // input tensor data pointer
+  size_t encoded_data_size; // size of input tensor in bytes
+  size_t num_bytes_read; // number of bytes read so far in the tensor
+} reader_helper_t;
+
+// This function is used by GIFLIB routines to read the encoded bytes.
+// It reads `len` bytes and writes them into `buf`. The data is read from the
+// input tensor passed to decode_gif() starting at the `num_bytes_read`
+// position.
+int read_from_tensor(GifFileType* gifFile, GifByteType* buf, int len) {
+  // the UserData field was set in DGifOpen()
+  reader_helper_t* reader_helper =
+      static_cast<reader_helper_t*>(gifFile->UserData);
+
+  size_t num_bytes_to_read = std::min(
+      (size_t)len,
+      reader_helper->encoded_data_size - reader_helper->num_bytes_read);
+  std::memcpy(
+      buf,
+      reader_helper->encoded_data + reader_helper->num_bytes_read,
+      num_bytes_to_read);
+  reader_helper->num_bytes_read += num_bytes_to_read;
+  return num_bytes_to_read;
+}
+
+torch::Tensor decode_gif(const torch::Tensor& encoded_data) {
+  // LibGif docs: https://giflib.sourceforge.net/intro.html
+  // Refer over there for more details on the libgif API, API ref, and a
+  // detailed description of the GIF format.
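+  // The decoded output is a uint8 tensor of shape (num_frames, 3, H, W) in
+  // channels-last memory format; the leading dimension is squeezed away at
+  // the end of this function when the file contains a single image.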
+
+  TORCH_CHECK(encoded_data.is_contiguous(), "Input tensor must be contiguous.");
+  TORCH_CHECK(
+      encoded_data.dtype() == torch::kU8,
+      "Input tensor must have uint8 data type, got ",
+      encoded_data.dtype());
+  TORCH_CHECK(
+      encoded_data.dim() == 1,
+      "Input tensor must be 1-dimensional, got ",
+      encoded_data.dim(),
+      " dims.");
+
+  int error = D_GIF_SUCCEEDED;
+
+  // We're using DGifOpen. The other entrypoints of libgif are
+  // DGifOpenFileName and DGifOpenFileHandle but we don't want to use those,
+  // since we need to read the encoded bytes from a tensor of encoded bytes, not
+  // from a file (for consistency with existing jpeg and png decoders). Using
+  // DGifOpen is the only way to read from a custom source.
+  // For that we need to provide a reader function `read_from_tensor` that
+  // reads from the tensor, and we have to keep track of the number of bytes
+  // read so far: this is why we need the reader_helper struct.
+
+  // TODO: We are potentially doing an unnecessary copy of the encoded bytes:
+  // - 1 copy in from file to tensor (in read_file())
+  // - 1 copy from tensor to GIFLIB buffers (in read_from_tensor())
+  // Since we're vendoring GIFLIB we can potentially modify the calls to
+  // InternalRead() and just set the `buf` pointer to the tensor data directly.
+  // That might even save allocation of those buffers.
+  // If we do that, we'd have to make sure the buffers are never written to by
+  // GIFLIB, otherwise we'd be overriding the tensor data.
+  reader_helper_t reader_helper;
+  reader_helper.encoded_data = encoded_data.data_ptr<uint8_t>();
+  reader_helper.encoded_data_size = encoded_data.numel();
+  reader_helper.num_bytes_read = 0;
+  GifFileType* gifFile =
+      DGifOpen(static_cast<void*>(&reader_helper), read_from_tensor, &error);
+
+  TORCH_CHECK(
+      (gifFile != nullptr) && (error == D_GIF_SUCCEEDED),
+      "DGifOpenFileName() failed - ",
+      error);
+
+  if (DGifSlurp(gifFile) == GIF_ERROR) {
+    auto gifFileError = gifFile->Error;
+    DGifCloseFile(gifFile, &error);
+    TORCH_CHECK(false, "DGifSlurp() failed - ", gifFileError);
+  }
+  auto num_images = gifFile->ImageCount;
+
+  // This check should already be done within DGifSlurp(); this is just to be safe
+  TORCH_CHECK(num_images > 0, "GIF file should contain at least one image!");
+
+  GifColorType bg = {0, 0, 0};
+  if (gifFile->SColorMap) {
+    bg = gifFile->SColorMap->Colors[gifFile->SBackGroundColor];
+  }
+
+  // The GIFLIB docs say that the canvas's height and width are potentially
+  // ignored by modern viewers, so to be on the safe side we set the output
+  // height to max(canvas_height, first_image_height). Same for width.
+  // https://giflib.sourceforge.net/whatsinagif/bits_and_bytes.html
+  auto out_h =
+      std::max(gifFile->SHeight, gifFile->SavedImages[0].ImageDesc.Height);
+  auto out_w =
+      std::max(gifFile->SWidth, gifFile->SavedImages[0].ImageDesc.Width);
+
+  // We output a channels-last tensor for consistency with other image decoders.
+  // Torchvision's resize tends to be faster on uint8 channels-last tensors.
+  auto options = torch::TensorOptions()
+                     .dtype(torch::kU8)
+                     .memory_format(torch::MemoryFormat::ChannelsLast);
+  auto out = torch::empty(
+      {int64_t(num_images), 3, int64_t(out_h), int64_t(out_w)}, options);
+  auto out_a = out.accessor<uint8_t, 4>();
+  for (int i = 0; i < num_images; i++) {
+    const SavedImage& img = gifFile->SavedImages[i];
+
+    GraphicsControlBlock gcb;
+    DGifSavedExtensionToGCB(gifFile, i, &gcb);
+
+    const GifImageDesc& desc = img.ImageDesc;
+    const ColorMapObject* cmap =
+        desc.ColorMap ?
desc.ColorMap : gifFile->SColorMap; + TORCH_CHECK( + cmap != nullptr, + "Global and local color maps are missing. This should never happen!"); + + // When going from one image to another, there is a "disposal method" which + // specifies how to handle the transition. E.g. DISPOSE_DO_NOT means that + // the current image should essentially be drawn on top of the previous + // canvas. The pixels of that previous canvas will appear on the new one if + // either: + // - a pixel is transparent in the current image + // - the current image is smaller than the canvas, hence exposing its pixels + // The "background" disposal method means that the current canvas should be + // set to the background color. + // We only support these 2 modes and default to "background" when the + // disposal method is unspecified, or when it's set to "DISPOSE_PREVIOUS" + // which according to GIFLIB is not widely supported. + // (https://giflib.sourceforge.net/whatsinagif/animation_and_transparency.html). + if (i > 0 && gcb.DisposalMode == DISPOSE_DO_NOT) { + out[i] = out[i - 1]; + } else { + // Background. If bg wasn't defined, it will be (0, 0, 0) + for (int h = 0; h < gifFile->SHeight; h++) { + for (int w = 0; w < gifFile->SWidth; w++) { + out_a[i][0][h][w] = bg.Red; + out_a[i][1][h][w] = bg.Green; + out_a[i][2][h][w] = bg.Blue; + } + } + } + + for (int h = 0; h < desc.Height; h++) { + for (int w = 0; w < desc.Width; w++) { + auto c = img.RasterBits[h * desc.Width + w]; + if (c == gcb.TransparentColor) { + continue; + } + GifColorType rgb = cmap->Colors[c]; + out_a[i][0][h + desc.Top][w + desc.Left] = rgb.Red; + out_a[i][1][h + desc.Top][w + desc.Left] = rgb.Green; + out_a[i][2][h + desc.Top][w + desc.Left] = rgb.Blue; + } + } + } + + out = out.squeeze(0); // remove batch dim if there's only one image + + DGifCloseFile(gifFile, &error); + TORCH_CHECK(error == D_GIF_SUCCEEDED, "DGifCloseFile() failed - ", error); + + return out; +} + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_gif.h b/torchvision/csrc/io/image/cpu/decode_gif.h new file mode 100644 index 00000000000..68d5073c91b --- /dev/null +++ b/torchvision/csrc/io/image/cpu/decode_gif.h @@ -0,0 +1,12 @@ +#pragma once + +#include + +namespace vision { +namespace image { + +// encoded_data tensor must be 1D uint8 and contiguous +C10_EXPORT torch::Tensor decode_gif(const torch::Tensor& encoded_data); + +} // namespace image +} // namespace vision diff --git a/torchvision/csrc/io/image/cpu/decode_image.cpp b/torchvision/csrc/io/image/cpu/decode_image.cpp index dbf349b06ca..1f09da17597 100644 --- a/torchvision/csrc/io/image/cpu/decode_image.cpp +++ b/torchvision/csrc/io/image/cpu/decode_image.cpp @@ -1,5 +1,6 @@ #include "decode_image.h" +#include "decode_gif.h" #include "decode_jpeg.h" #include "decode_png.h" @@ -23,16 +24,24 @@ torch::Tensor decode_image( const uint8_t jpeg_signature[3] = {255, 216, 255}; // == "\xFF\xD8\xFF" const uint8_t png_signature[4] = {137, 80, 78, 71}; // == "\211PNG" + const uint8_t gif_signature_1[6] = { + 0x47, 0x49, 0x46, 0x38, 0x39, 0x61}; // == "GIF89a" + const uint8_t gif_signature_2[6] = { + 0x47, 0x49, 0x46, 0x38, 0x37, 0x61}; // == "GIF87a" if (memcmp(jpeg_signature, datap, 3) == 0) { return decode_jpeg(data, mode, apply_exif_orientation); } else if (memcmp(png_signature, datap, 4) == 0) { return decode_png( data, mode, /*allow_16_bits=*/false, apply_exif_orientation); + } else if ( + memcmp(gif_signature_1, datap, 6) == 0 || + memcmp(gif_signature_2, datap, 6) == 0) { + return 
decode_gif(data); } else { TORCH_CHECK( false, - "Unsupported image file. Only jpeg and png ", + "Unsupported image file. Only jpeg, png and gif ", "are currently supported."); } } diff --git a/torchvision/csrc/io/image/cpu/giflib/README b/torchvision/csrc/io/image/cpu/giflib/README new file mode 100644 index 00000000000..7353453e32e --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/README @@ -0,0 +1,28 @@ +These files come from the GIFLIB project (https://giflib.sourceforge.net/) and +are licensed under the MIT license. + +Some modifications have been made to the original files: +- Remove use of "register" keyword in gifalloc.c for C++17 compatibility. +- Declare loop variable i in DGifGetImageHeader as int instead of unsigned int. + +Below is the original license text from the COPYING file of the GIFLIB project: + += MIT LICENSE + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c b/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c new file mode 100644 index 00000000000..297f12f15c4 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/dgif_lib.c @@ -0,0 +1,1312 @@ +/****************************************************************************** + +dgif_lib.c - GIF decoding + +The functions here and in egif_lib.c are partitioned carefully so that +if you only require one of read and write capability, only one of these +two modules will be linked. Preserve this property! + +*****************************************************************************/ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright (C) Eric S. Raymond + +#include +#include +#include +#include +#include +#include + +#ifdef _WIN32 +#include +#else +#include +#endif /* _WIN32 */ + +#include "gif_lib.h" +#include "gif_lib_private.h" + +/* compose unsigned little endian value */ +#define UNSIGNED_LITTLE_ENDIAN(lo, hi) ((lo) | ((hi) << 8)) + +/* avoid extra function call in case we use fread (TVT) */ +static int InternalRead(GifFileType *gif, GifByteType *buf, int len) { + // fprintf(stderr, "### Read: %d\n", len); + return (((GifFilePrivateType *)gif->Private)->Read + ? 
((GifFilePrivateType *)gif->Private)->Read(gif, buf, len) + : fread(buf, 1, len, + ((GifFilePrivateType *)gif->Private)->File)); +} + +static int DGifGetWord(GifFileType *GifFile, GifWord *Word); +static int DGifSetupDecompress(GifFileType *GifFile); +static int DGifDecompressLine(GifFileType *GifFile, GifPixelType *Line, + int LineLen); +static int DGifGetPrefixChar(const GifPrefixType *Prefix, int Code, + int ClearCode); +static int DGifDecompressInput(GifFileType *GifFile, int *Code); +static int DGifBufferedInput(GifFileType *GifFile, GifByteType *Buf, + GifByteType *NextByte); + +/****************************************************************************** + Open a new GIF file for read, given by its name. + Returns dynamically allocated GifFileType pointer which serves as the GIF + info record. +******************************************************************************/ +GifFileType *DGifOpenFileName(const char *FileName, int *Error) { + int FileHandle; + GifFileType *GifFile; + + if ((FileHandle = open(FileName, O_RDONLY)) == -1) { + if (Error != NULL) { + *Error = D_GIF_ERR_OPEN_FAILED; + } + return NULL; + } + + GifFile = DGifOpenFileHandle(FileHandle, Error); + return GifFile; +} + +/****************************************************************************** + Update a new GIF file, given its file handle. + Returns dynamically allocated GifFileType pointer which serves as the GIF + info record. +******************************************************************************/ +GifFileType *DGifOpenFileHandle(int FileHandle, int *Error) { + char Buf[GIF_STAMP_LEN + 1]; + GifFileType *GifFile; + GifFilePrivateType *Private; + FILE *f; + + GifFile = (GifFileType *)malloc(sizeof(GifFileType)); + if (GifFile == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + (void)close(FileHandle); + return NULL; + } + + /*@i1@*/ memset(GifFile, '\0', sizeof(GifFileType)); + + /* Belt and suspenders, in case the null pointer isn't zero */ + GifFile->SavedImages = NULL; + GifFile->SColorMap = NULL; + + Private = (GifFilePrivateType *)calloc(1, sizeof(GifFilePrivateType)); + if (Private == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + (void)close(FileHandle); + free((char *)GifFile); + return NULL; + } + + /*@i1@*/ memset(Private, '\0', sizeof(GifFilePrivateType)); + +#ifdef _WIN32 + _setmode(FileHandle, O_BINARY); /* Make sure it is in binary mode. 
*/ +#endif /* _WIN32 */ + + f = fdopen(FileHandle, "rb"); /* Make it into a stream: */ + + /*@-mustfreeonly@*/ + GifFile->Private = (void *)Private; + Private->FileHandle = FileHandle; + Private->File = f; + Private->FileState = FILE_STATE_READ; + Private->Read = NULL; /* don't use alternate input method (TVT) */ + GifFile->UserData = NULL; /* TVT */ + /*@=mustfreeonly@*/ + + /* Let's see if this is a GIF file: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, (unsigned char *)Buf, GIF_STAMP_LEN) != + GIF_STAMP_LEN) { + if (Error != NULL) { + *Error = D_GIF_ERR_READ_FAILED; + } + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + /* Check for GIF prefix at start of file */ + Buf[GIF_STAMP_LEN] = 0; + if (strncmp(GIF_STAMP, Buf, GIF_VERSION_POS) != 0) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_GIF_FILE; + } + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + if (DGifGetScreenDesc(GifFile) == GIF_ERROR) { + (void)fclose(f); + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + GifFile->Error = 0; + + /* What version of GIF? */ + Private->gif89 = (Buf[GIF_VERSION_POS + 1] == '9'); + + return GifFile; +} + +/****************************************************************************** + GifFileType constructor with user supplied input function (TVT) +******************************************************************************/ +GifFileType *DGifOpen(void *userData, InputFunc readFunc, int *Error) { + char Buf[GIF_STAMP_LEN + 1]; + GifFileType *GifFile; + GifFilePrivateType *Private; + + GifFile = (GifFileType *)malloc(sizeof(GifFileType)); + if (GifFile == NULL) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + return NULL; + } + + memset(GifFile, '\0', sizeof(GifFileType)); + + /* Belt and suspenders, in case the null pointer isn't zero */ + GifFile->SavedImages = NULL; + GifFile->SColorMap = NULL; + + Private = (GifFilePrivateType *)calloc(1, sizeof(GifFilePrivateType)); + if (!Private) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_ENOUGH_MEM; + } + free((char *)GifFile); + return NULL; + } + /*@i1@*/ memset(Private, '\0', sizeof(GifFilePrivateType)); + + GifFile->Private = (void *)Private; + Private->FileHandle = 0; + Private->File = NULL; + Private->FileState = FILE_STATE_READ; + + Private->Read = readFunc; /* TVT */ + GifFile->UserData = userData; /* TVT */ + + /* Lets see if this is a GIF file: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, (unsigned char *)Buf, GIF_STAMP_LEN) != + GIF_STAMP_LEN) { + if (Error != NULL) { + *Error = D_GIF_ERR_READ_FAILED; + } + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + /* Check for GIF prefix at start of file */ + Buf[GIF_STAMP_LEN] = '\0'; + if (strncmp(GIF_STAMP, Buf, GIF_VERSION_POS) != 0) { + if (Error != NULL) { + *Error = D_GIF_ERR_NOT_GIF_FILE; + } + free((char *)Private); + free((char *)GifFile); + return NULL; + } + + if (DGifGetScreenDesc(GifFile) == GIF_ERROR) { + free((char *)Private); + free((char *)GifFile); + if (Error != NULL) { + *Error = D_GIF_ERR_NO_SCRN_DSCR; + } + return NULL; + } + + GifFile->Error = 0; + + /* What version of GIF? */ + Private->gif89 = (Buf[GIF_VERSION_POS + 1] == '9'); + + return GifFile; +} + +/****************************************************************************** + This routine should be called before any other DGif calls. Note that + this routine is called automatically from DGif file open routines. 
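+ As an illustrative sketch only (not part of giflib), a caller decoding from
+ an in-memory buffer would pass a read callback to DGifOpen(), which calls
+ this routine internally before returning:
+******************************************************************************/
+
+/* Hypothetical example of an InputFunc that reads from a memory buffer. */
+struct ExampleMemReader {
+	const unsigned char *data; /* encoded GIF bytes */
+	size_t len;                /* total size of the buffer */
+	size_t pos;                /* current read offset */
+};
+static int exampleReadFromMemory(GifFileType *gif, GifByteType *buf, int len) {
+	struct ExampleMemReader *r = (struct ExampleMemReader *)gif->UserData;
+	size_t n = r->len - r->pos;
+	if ((size_t)len < n) {
+		n = (size_t)len;
+	}
+	memcpy(buf, r->data + r->pos, n);
+	r->pos += n;
+	return (int)n; /* a short read signals EOF to the decoder */
+}
+
+/*****************************************************************************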
+******************************************************************************/ +int DGifGetScreenDesc(GifFileType *GifFile) { + int BitsPerPixel; + bool SortFlag; + GifByteType Buf[3]; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* Put the screen descriptor into the file: */ + if (DGifGetWord(GifFile, &GifFile->SWidth) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->SHeight) == GIF_ERROR) { + return GIF_ERROR; + } + + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + return GIF_ERROR; + } + GifFile->SColorResolution = (((Buf[0] & 0x70) + 1) >> 4) + 1; + SortFlag = (Buf[0] & 0x08) != 0; + BitsPerPixel = (Buf[0] & 0x07) + 1; + GifFile->SBackGroundColor = Buf[1]; + GifFile->AspectByte = Buf[2]; + if (Buf[0] & 0x80) { /* Do we have global color map? */ + int i; + + GifFile->SColorMap = GifMakeMapObject(1 << BitsPerPixel, NULL); + if (GifFile->SColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + + /* Get the global color map: */ + GifFile->SColorMap->SortFlag = SortFlag; + for (i = 0; i < GifFile->SColorMap->ColorCount; i++) { + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + GifFile->SColorMap->Colors[i].Red = Buf[0]; + GifFile->SColorMap->Colors[i].Green = Buf[1]; + GifFile->SColorMap->Colors[i].Blue = Buf[2]; + } + } else { + GifFile->SColorMap = NULL; + } + + /* + * No check here for whether the background color is in range for the + * screen color map. Possibly there should be. + */ + + return GIF_OK; +} + +const char *DGifGetGifVersion(GifFileType *GifFile) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (Private->gif89) { + return GIF89_STAMP; + } else { + return GIF87_STAMP; + } +} + +/****************************************************************************** + This routine should be called before any attempt to read an image. 
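+ A typical caller (illustrative sketch only, not giflib API) dispatches on
+ the returned record type until the trailer is seen:
+******************************************************************************/
+
+/* Hypothetical example of the record-dispatch loop. After an image
+ * descriptor, the raster must be drained (e.g. with DGifGetLine()) before
+ * the next record is requested. */
+static int exampleRecordLoop(GifFileType *gif) {
+	GifRecordType type;
+	GifByteType *ext;
+	int extCode;
+	do {
+		if (DGifGetRecordType(gif, &type) == GIF_ERROR) {
+			return GIF_ERROR;
+		}
+		if (type == IMAGE_DESC_RECORD_TYPE) {
+			if (DGifGetImageDesc(gif) == GIF_ERROR) {
+				return GIF_ERROR;
+			}
+			/* ...read the raster here... */
+		} else if (type == EXTENSION_RECORD_TYPE) {
+			if (DGifGetExtension(gif, &extCode, &ext) == GIF_ERROR) {
+				return GIF_ERROR;
+			}
+			while (ext != NULL) { /* skip the sub-blocks */
+				if (DGifGetExtensionNext(gif, &ext) == GIF_ERROR) {
+					return GIF_ERROR;
+				}
+			}
+		}
+	} while (type != TERMINATE_RECORD_TYPE);
+	return GIF_OK;
+}
+
+/*****************************************************************************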
+******************************************************************************/ +int DGifGetRecordType(GifFileType *GifFile, GifRecordType *Type) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + // fprintf(stderr, "### DGifGetRecordType: %02x\n", Buf); + switch (Buf) { + case DESCRIPTOR_INTRODUCER: + *Type = IMAGE_DESC_RECORD_TYPE; + break; + case EXTENSION_INTRODUCER: + *Type = EXTENSION_RECORD_TYPE; + break; + case TERMINATOR_INTRODUCER: + *Type = TERMINATE_RECORD_TYPE; + break; + default: + *Type = UNDEFINED_RECORD_TYPE; + GifFile->Error = D_GIF_ERR_WRONG_RECORD; + return GIF_ERROR; + } + + return GIF_OK; +} + +int DGifGetImageHeader(GifFileType *GifFile) { + unsigned int BitsPerPixel; + GifByteType Buf[3]; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifGetWord(GifFile, &GifFile->Image.Left) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Top) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Width) == GIF_ERROR || + DGifGetWord(GifFile, &GifFile->Image.Height) == GIF_ERROR) { + return GIF_ERROR; + } + if (InternalRead(GifFile, Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + return GIF_ERROR; + } + BitsPerPixel = (Buf[0] & 0x07) + 1; + GifFile->Image.Interlace = (Buf[0] & 0x40) ? true : false; + + /* Setup the colormap */ + if (GifFile->Image.ColorMap) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + } + /* Does this image have local color map? */ + if (Buf[0] & 0x80) { + int i; + + GifFile->Image.ColorMap = + GifMakeMapObject(1 << BitsPerPixel, NULL); + if (GifFile->Image.ColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + + /* Get the image local color map: */ + for (i = 0; i < GifFile->Image.ColorMap->ColorCount; i++) { + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 3) != 3) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Error = D_GIF_ERR_READ_FAILED; + GifFile->Image.ColorMap = NULL; + return GIF_ERROR; + } + GifFile->Image.ColorMap->Colors[i].Red = Buf[0]; + GifFile->Image.ColorMap->Colors[i].Green = Buf[1]; + GifFile->Image.ColorMap->Colors[i].Blue = Buf[2]; + } + } + + Private->PixelCount = + (long)GifFile->Image.Width * (long)GifFile->Image.Height; + + /* Reset decompress algorithm parameters. */ + return DGifSetupDecompress(GifFile); +} + +/****************************************************************************** + This routine should be called before any attempt to read an image. + Note it is assumed the Image desc. header has been read. 
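+ After it succeeds, the image raster is typically drained line by line, as
+ in this illustrative sketch (not part of giflib):
+******************************************************************************/
+
+/* Hypothetical example: read a non-interlaced image row by row into a
+ * caller-provided buffer of Width * Height pixels. */
+static int exampleReadRaster(GifFileType *gif, GifPixelType *raster) {
+	int y;
+	for (y = 0; y < gif->Image.Height; y++) {
+		if (DGifGetLine(gif, raster + (size_t)y * gif->Image.Width,
+		                gif->Image.Width) == GIF_ERROR) {
+			return GIF_ERROR;
+		}
+	}
+	return GIF_OK; /* interlaced images need the 4-pass order instead */
+}
+
+/*****************************************************************************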
+******************************************************************************/ +int DGifGetImageDesc(GifFileType *GifFile) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + SavedImage *sp; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifGetImageHeader(GifFile) == GIF_ERROR) { + return GIF_ERROR; + } + + if (GifFile->SavedImages) { + SavedImage *new_saved_images = (SavedImage *)reallocarray( + GifFile->SavedImages, (GifFile->ImageCount + 1), + sizeof(SavedImage)); + if (new_saved_images == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + GifFile->SavedImages = new_saved_images; + } else { + if ((GifFile->SavedImages = + (SavedImage *)malloc(sizeof(SavedImage))) == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + } + + sp = &GifFile->SavedImages[GifFile->ImageCount]; + memcpy(&sp->ImageDesc, &GifFile->Image, sizeof(GifImageDesc)); + if (GifFile->Image.ColorMap != NULL) { + sp->ImageDesc.ColorMap = + GifMakeMapObject(GifFile->Image.ColorMap->ColorCount, + GifFile->Image.ColorMap->Colors); + if (sp->ImageDesc.ColorMap == NULL) { + GifFile->Error = D_GIF_ERR_NOT_ENOUGH_MEM; + return GIF_ERROR; + } + } + sp->RasterBits = (unsigned char *)NULL; + sp->ExtensionBlockCount = 0; + sp->ExtensionBlocks = (ExtensionBlock *)NULL; + + GifFile->ImageCount++; + + return GIF_OK; +} + +/****************************************************************************** + Get one full scanned line (Line) of length LineLen from GIF file. +******************************************************************************/ +int DGifGetLine(GifFileType *GifFile, GifPixelType *Line, int LineLen) { + GifByteType *Dummy; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (!LineLen) { + LineLen = GifFile->Image.Width; + } + + if ((Private->PixelCount -= LineLen) > 0xffff0000UL) { + GifFile->Error = D_GIF_ERR_DATA_TOO_BIG; + return GIF_ERROR; + } + + if (DGifDecompressLine(GifFile, Line, LineLen) == GIF_OK) { + if (Private->PixelCount == 0) { + /* We probably won't be called any more, so let's clean + * up everything before we return: need to flush out all + * the rest of image until an empty block (size 0) + * detected. We use GetCodeNext. + */ + do { + if (DGifGetCodeNext(GifFile, &Dummy) == + GIF_ERROR) { + return GIF_ERROR; + } + } while (Dummy != NULL); + } + return GIF_OK; + } else { + return GIF_ERROR; + } +} + +/****************************************************************************** + Put one pixel (Pixel) into GIF file. 
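+ (This is the decoding counterpart of EGifPutPixel: it reads a single pixel
+ from the GIF file.)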
+******************************************************************************/ +int DGifGetPixel(GifFileType *GifFile, GifPixelType Pixel) { + GifByteType *Dummy; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + if (--Private->PixelCount > 0xffff0000UL) { + GifFile->Error = D_GIF_ERR_DATA_TOO_BIG; + return GIF_ERROR; + } + + if (DGifDecompressLine(GifFile, &Pixel, 1) == GIF_OK) { + if (Private->PixelCount == 0) { + /* We probably won't be called any more, so let's clean + * up everything before we return: need to flush out all + * the rest of image until an empty block (size 0) + * detected. We use GetCodeNext. + */ + do { + if (DGifGetCodeNext(GifFile, &Dummy) == + GIF_ERROR) { + return GIF_ERROR; + } + } while (Dummy != NULL); + } + return GIF_OK; + } else { + return GIF_ERROR; + } +} + +/****************************************************************************** + Get an extension block (see GIF manual) from GIF file. This routine only + returns the first data block, and DGifGetExtensionNext should be called + after this one until NULL extension is returned. + The Extension should NOT be freed by the user (not dynamically allocated). + Note it is assumed the Extension description header has been read. +******************************************************************************/ +int DGifGetExtension(GifFileType *GifFile, int *ExtCode, + GifByteType **Extension) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + // fprintf(stderr, "### -> DGifGetExtension:\n"); + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + *ExtCode = Buf; + // fprintf(stderr, "### <- DGifGetExtension: %02x, about to call + // next\n", Buf); + + return DGifGetExtensionNext(GifFile, Extension); +} + +/****************************************************************************** + Get a following extension block (see GIF manual) from GIF file. This + routine should be called until NULL Extension is returned. + The Extension should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetExtensionNext(GifFileType *GifFile, GifByteType **Extension) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + // fprintf(stderr, "### -> DGifGetExtensionNext\n"); + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + // fprintf(stderr, "### DGifGetExtensionNext sees %d\n", Buf); + + if (Buf > 0) { + *Extension = Private->Buf; /* Use private unused buffer. */ + (*Extension)[0] = + Buf; /* Pascal strings notation (pos. 0 is len.). 
*/ + /* coverity[tainted_data,check_return] */ + if (InternalRead(GifFile, &((*Extension)[1]), Buf) != Buf) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + } else { + *Extension = NULL; + } + // fprintf(stderr, "### <- DGifGetExtensionNext: %p\n", Extension); + + return GIF_OK; +} + +/****************************************************************************** + Extract a Graphics Control Block from raw extension data +******************************************************************************/ + +int DGifExtensionToGCB(const size_t GifExtensionLength, + const GifByteType *GifExtension, + GraphicsControlBlock *GCB) { + if (GifExtensionLength != 4) { + return GIF_ERROR; + } + + GCB->DisposalMode = (GifExtension[0] >> 2) & 0x07; + GCB->UserInputFlag = (GifExtension[0] & 0x02) != 0; + GCB->DelayTime = + UNSIGNED_LITTLE_ENDIAN(GifExtension[1], GifExtension[2]); + if (GifExtension[0] & 0x01) { + GCB->TransparentColor = (int)GifExtension[3]; + } else { + GCB->TransparentColor = NO_TRANSPARENT_COLOR; + } + + return GIF_OK; +} + +/****************************************************************************** + Extract the Graphics Control Block for a saved image, if it exists. +******************************************************************************/ + +int DGifSavedExtensionToGCB(GifFileType *GifFile, int ImageIndex, + GraphicsControlBlock *GCB) { + int i; + + if (ImageIndex < 0 || ImageIndex > GifFile->ImageCount - 1) { + return GIF_ERROR; + } + + GCB->DisposalMode = DISPOSAL_UNSPECIFIED; + GCB->UserInputFlag = false; + GCB->DelayTime = 0; + GCB->TransparentColor = NO_TRANSPARENT_COLOR; + + for (i = 0; i < GifFile->SavedImages[ImageIndex].ExtensionBlockCount; + i++) { + ExtensionBlock *ep = + &GifFile->SavedImages[ImageIndex].ExtensionBlocks[i]; + if (ep->Function == GRAPHICS_EXT_FUNC_CODE) { + return DGifExtensionToGCB(ep->ByteCount, ep->Bytes, + GCB); + } + } + + return GIF_ERROR; +} + +/****************************************************************************** + This routine should be called last, to close the GIF file. 
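+ The GifFileType handle is freed on every path, success or failure, so it
+ must not be used afterwards. An illustrative sketch (not giflib API):
+******************************************************************************/
+
+/* Hypothetical example: close the file and report the numeric error code. */
+static void exampleClose(GifFileType *gif) {
+	int err = D_GIF_SUCCEEDED;
+	if (DGifCloseFile(gif, &err) == GIF_ERROR) {
+		fprintf(stderr, "DGifCloseFile() failed - %d\n", err);
+	}
+	/* gif has been freed in all cases; do not touch it again. */
+}
+
+/*****************************************************************************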
+******************************************************************************/ +int DGifCloseFile(GifFileType *GifFile, int *ErrorCode) { + GifFilePrivateType *Private; + + if (GifFile == NULL || GifFile->Private == NULL) { + return GIF_ERROR; + } + + if (GifFile->Image.ColorMap) { + GifFreeMapObject(GifFile->Image.ColorMap); + GifFile->Image.ColorMap = NULL; + } + + if (GifFile->SColorMap) { + GifFreeMapObject(GifFile->SColorMap); + GifFile->SColorMap = NULL; + } + + if (GifFile->SavedImages) { + GifFreeSavedImages(GifFile); + GifFile->SavedImages = NULL; + } + + GifFreeExtensions(&GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks); + + Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_ERR_NOT_READABLE; + } + free((char *)GifFile->Private); + free(GifFile); + return GIF_ERROR; + } + + if (Private->File && (fclose(Private->File) != 0)) { + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_ERR_CLOSE_FAILED; + } + free((char *)GifFile->Private); + free(GifFile); + return GIF_ERROR; + } + + free((char *)GifFile->Private); + free(GifFile); + if (ErrorCode != NULL) { + *ErrorCode = D_GIF_SUCCEEDED; + } + return GIF_OK; +} + +/****************************************************************************** + Get 2 bytes (word) from the given file: +******************************************************************************/ +static int DGifGetWord(GifFileType *GifFile, GifWord *Word) { + unsigned char c[2]; + + /* coverity[check_return] */ + if (InternalRead(GifFile, c, 2) != 2) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + *Word = (GifWord)UNSIGNED_LITTLE_ENDIAN(c[0], c[1]); + return GIF_OK; +} + +/****************************************************************************** + Get the image code in compressed form. This routine can be called if the + information needed to be piped out as is. Obviously this is much faster + than decoding and encoding again. This routine should be followed by calls + to DGifGetCodeNext, until NULL block is returned. + The block should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetCode(GifFileType *GifFile, int *CodeSize, GifByteType **CodeBlock) { + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + *CodeSize = Private->BitsPerPixel; + + return DGifGetCodeNext(GifFile, CodeBlock); +} + +/****************************************************************************** + Continue to get the image code in compressed form. This routine should be + called until NULL block is returned. + The block should NOT be freed by the user (not dynamically allocated). +******************************************************************************/ +int DGifGetCodeNext(GifFileType *GifFile, GifByteType **CodeBlock) { + GifByteType Buf; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + /* coverity[tainted_data_argument] */ + /* coverity[check_return] */ + if (InternalRead(GifFile, &Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + + /* coverity[lower_bounds] */ + if (Buf > 0) { + *CodeBlock = Private->Buf; /* Use private unused buffer. */ + (*CodeBlock)[0] = + Buf; /* Pascal strings notation (pos. 
0 is len.). */ + /* coverity[tainted_data] */ + if (InternalRead(GifFile, &((*CodeBlock)[1]), Buf) != Buf) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + } else { + *CodeBlock = NULL; + Private->Buf[0] = 0; /* Make sure the buffer is empty! */ + Private->PixelCount = + 0; /* And local info. indicate image read. */ + } + + return GIF_OK; +} + +/****************************************************************************** + Setup the LZ decompression for this image: +******************************************************************************/ +static int DGifSetupDecompress(GifFileType *GifFile) { + int i, BitsPerPixel; + GifByteType CodeSize; + GifPrefixType *Prefix; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + /* coverity[check_return] */ + if (InternalRead(GifFile, &CodeSize, 1) < + 1) { /* Read Code size from file. */ + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; /* Failed to read Code size. */ + } + BitsPerPixel = CodeSize; + + /* this can only happen on a severely malformed GIF */ + if (BitsPerPixel > 8) { + GifFile->Error = + D_GIF_ERR_READ_FAILED; /* somewhat bogus error code */ + return GIF_ERROR; /* Failed to read Code size. */ + } + + Private->Buf[0] = 0; /* Input Buffer empty. */ + Private->BitsPerPixel = BitsPerPixel; + Private->ClearCode = (1 << BitsPerPixel); + Private->EOFCode = Private->ClearCode + 1; + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = BitsPerPixel + 1; /* Number of bits per code. */ + Private->MaxCode1 = 1 << Private->RunningBits; /* Max. code + 1. */ + Private->StackPtr = 0; /* No pixels on the pixel stack. */ + Private->LastCode = NO_SUCH_CODE; + Private->CrntShiftState = 0; /* No information in CrntShiftDWord. */ + Private->CrntShiftDWord = 0; + + Prefix = Private->Prefix; + for (i = 0; i <= LZ_MAX_CODE; i++) { + Prefix[i] = NO_SUCH_CODE; + } + + return GIF_OK; +} + +/****************************************************************************** + The LZ decompression routine: + This version decompress the given GIF file into Line of length LineLen. + This routine can be called few times (one per scan line, for example), in + order the complete the whole image. +******************************************************************************/ +static int DGifDecompressLine(GifFileType *GifFile, GifPixelType *Line, + int LineLen) { + int i = 0; + int j, CrntCode, EOFCode, ClearCode, CrntPrefix, LastCode, StackPtr; + GifByteType *Stack, *Suffix; + GifPrefixType *Prefix; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + StackPtr = Private->StackPtr; + Prefix = Private->Prefix; + Suffix = Private->Suffix; + Stack = Private->Stack; + EOFCode = Private->EOFCode; + ClearCode = Private->ClearCode; + LastCode = Private->LastCode; + + if (StackPtr > LZ_MAX_CODE) { + return GIF_ERROR; + } + + if (StackPtr != 0) { + /* Let pop the stack off before continueing to read the GIF + * file: */ + while (StackPtr != 0 && i < LineLen) { + Line[i++] = Stack[--StackPtr]; + } + } + + while (i < LineLen) { /* Decode LineLen items. */ + if (DGifDecompressInput(GifFile, &CrntCode) == GIF_ERROR) { + return GIF_ERROR; + } + + if (CrntCode == EOFCode) { + /* Note however that usually we will not be here as we + * will stop decoding as soon as we got all the pixel, + * or EOF code will not be read at all, and + * DGifGetLine/Pixel clean everything. 
*/ + GifFile->Error = D_GIF_ERR_EOF_TOO_SOON; + return GIF_ERROR; + } else if (CrntCode == ClearCode) { + /* We need to start over again: */ + for (j = 0; j <= LZ_MAX_CODE; j++) { + Prefix[j] = NO_SUCH_CODE; + } + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = Private->BitsPerPixel + 1; + Private->MaxCode1 = 1 << Private->RunningBits; + LastCode = Private->LastCode = NO_SUCH_CODE; + } else { + /* Its regular code - if in pixel range simply add it to + * output stream, otherwise trace to codes linked list + * until the prefix is in pixel range: */ + if (CrntCode < ClearCode) { + /* This is simple - its pixel scalar, so add it + * to output: */ + Line[i++] = CrntCode; + } else { + /* Its a code to needed to be traced: trace the + * linked list until the prefix is a pixel, + * while pushing the suffix pixels on our stack. + * If we done, pop the stack in reverse (thats + * what stack is good for!) order to output. */ + if (Prefix[CrntCode] == NO_SUCH_CODE) { + CrntPrefix = LastCode; + + /* Only allowed if CrntCode is exactly + * the running code: In that case + * CrntCode = XXXCode, CrntCode or the + * prefix code is last code and the + * suffix char is exactly the prefix of + * last code! */ + if (CrntCode == + Private->RunningCode - 2) { + Suffix[Private->RunningCode - + 2] = Stack[StackPtr++] = + DGifGetPrefixChar( + Prefix, LastCode, + ClearCode); + } else { + Suffix[Private->RunningCode - + 2] = Stack[StackPtr++] = + DGifGetPrefixChar( + Prefix, CrntCode, + ClearCode); + } + } else { + CrntPrefix = CrntCode; + } + + /* Now (if image is O.K.) we should not get a + * NO_SUCH_CODE during the trace. As we might + * loop forever, in case of defective image, we + * use StackPtr as loop counter and stop before + * overflowing Stack[]. */ + while (StackPtr < LZ_MAX_CODE && + CrntPrefix > ClearCode && + CrntPrefix <= LZ_MAX_CODE) { + Stack[StackPtr++] = Suffix[CrntPrefix]; + CrntPrefix = Prefix[CrntPrefix]; + } + if (StackPtr >= LZ_MAX_CODE || + CrntPrefix > LZ_MAX_CODE) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + /* Push the last character on stack: */ + Stack[StackPtr++] = CrntPrefix; + + /* Now lets pop all the stack into output: */ + while (StackPtr != 0 && i < LineLen) { + Line[i++] = Stack[--StackPtr]; + } + } + if (LastCode != NO_SUCH_CODE && + Private->RunningCode - 2 < (LZ_MAX_CODE + 1) && + Prefix[Private->RunningCode - 2] == NO_SUCH_CODE) { + Prefix[Private->RunningCode - 2] = LastCode; + + if (CrntCode == Private->RunningCode - 2) { + /* Only allowed if CrntCode is exactly + * the running code: In that case + * CrntCode = XXXCode, CrntCode or the + * prefix code is last code and the + * suffix char is exactly the prefix of + * last code! */ + Suffix[Private->RunningCode - 2] = + DGifGetPrefixChar(Prefix, LastCode, + ClearCode); + } else { + Suffix[Private->RunningCode - 2] = + DGifGetPrefixChar(Prefix, CrntCode, + ClearCode); + } + } + LastCode = CrntCode; + } + } + + Private->LastCode = LastCode; + Private->StackPtr = StackPtr; + + return GIF_OK; +} + +/****************************************************************************** + Routine to trace the Prefixes linked list until we get a prefix which is + not code, but a pixel value (less than ClearCode). Returns that pixel value. + If image is defective, we might loop here forever, so we limit the loops to + the maximum possible if image O.k. - LZ_MAX_CODE times. 
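+ For example (values illustrative): with ClearCode 256, Prefix[300] = 258 and
+ Prefix[258] = 65, a call with Code 300 walks 300 -> 258 -> 65 and returns
+ the pixel value 65.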
+******************************************************************************/ +static int DGifGetPrefixChar(const GifPrefixType *Prefix, int Code, + int ClearCode) { + int i = 0; + + while (Code > ClearCode && i++ <= LZ_MAX_CODE) { + if (Code > LZ_MAX_CODE) { + return NO_SUCH_CODE; + } + Code = Prefix[Code]; + } + return Code; +} + +/****************************************************************************** + Interface for accessing the LZ codes directly. Set Code to the real code + (12bits), or to -1 if EOF code is returned. +******************************************************************************/ +int DGifGetLZCodes(GifFileType *GifFile, int *Code) { + GifByteType *CodeBlock; + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + if (!IS_READABLE(Private)) { + /* This file was NOT open for reading: */ + GifFile->Error = D_GIF_ERR_NOT_READABLE; + return GIF_ERROR; + } + + if (DGifDecompressInput(GifFile, Code) == GIF_ERROR) { + return GIF_ERROR; + } + + if (*Code == Private->EOFCode) { + /* Skip rest of codes (hopefully only NULL terminating block): + */ + do { + if (DGifGetCodeNext(GifFile, &CodeBlock) == GIF_ERROR) { + return GIF_ERROR; + } + } while (CodeBlock != NULL); + + *Code = -1; + } else if (*Code == Private->ClearCode) { + /* We need to start over again: */ + Private->RunningCode = Private->EOFCode + 1; + Private->RunningBits = Private->BitsPerPixel + 1; + Private->MaxCode1 = 1 << Private->RunningBits; + } + + return GIF_OK; +} + +/****************************************************************************** + The LZ decompression input routine: + This routine is responsable for the decompression of the bit stream from + 8 bits (bytes) packets, into the real codes. + Returns GIF_OK if read successfully. +******************************************************************************/ +static int DGifDecompressInput(GifFileType *GifFile, int *Code) { + static const unsigned short CodeMasks[] = { + 0x0000, 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, + 0x007f, 0x00ff, 0x01ff, 0x03ff, 0x07ff, 0x0fff}; + + GifFilePrivateType *Private = (GifFilePrivateType *)GifFile->Private; + + GifByteType NextByte; + + /* The image can't contain more than LZ_BITS per code. */ + if (Private->RunningBits > LZ_BITS) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + + while (Private->CrntShiftState < Private->RunningBits) { + /* Needs to get more bytes from input stream for next code: */ + if (DGifBufferedInput(GifFile, Private->Buf, &NextByte) == + GIF_ERROR) { + return GIF_ERROR; + } + Private->CrntShiftDWord |= ((unsigned long)NextByte) + << Private->CrntShiftState; + Private->CrntShiftState += 8; + } + *Code = Private->CrntShiftDWord & CodeMasks[Private->RunningBits]; + + Private->CrntShiftDWord >>= Private->RunningBits; + Private->CrntShiftState -= Private->RunningBits; + + /* If code cannot fit into RunningBits bits, must raise its size. Note + * however that codes above 4095 are used for special signaling. + * If we're using LZ_BITS bits already and we're at the max code, just + * keep using the table as it is, don't increment Private->RunningCode. 
+ */ + if (Private->RunningCode < LZ_MAX_CODE + 2 && + ++Private->RunningCode > Private->MaxCode1 && + Private->RunningBits < LZ_BITS) { + Private->MaxCode1 <<= 1; + Private->RunningBits++; + } + return GIF_OK; +} + +/****************************************************************************** + This routines read one GIF data block at a time and buffers it internally + so that the decompression routine could access it. + The routine returns the next byte from its internal buffer (or read next + block in if buffer empty) and returns GIF_OK if succesful. +******************************************************************************/ +static int DGifBufferedInput(GifFileType *GifFile, GifByteType *Buf, + GifByteType *NextByte) { + if (Buf[0] == 0) { + /* Needs to read the next buffer - this one is empty: */ + /* coverity[check_return] */ + if (InternalRead(GifFile, Buf, 1) != 1) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + /* There shouldn't be any empty data blocks here as the LZW spec + * says the LZW termination code should come first. Therefore + * we shouldn't be inside this routine at that point. + */ + if (Buf[0] == 0) { + GifFile->Error = D_GIF_ERR_IMAGE_DEFECT; + return GIF_ERROR; + } + if (InternalRead(GifFile, &Buf[1], Buf[0]) != Buf[0]) { + GifFile->Error = D_GIF_ERR_READ_FAILED; + return GIF_ERROR; + } + *NextByte = Buf[1]; + Buf[1] = 2; /* We use now the second place as last char read! */ + Buf[0]--; + } else { + *NextByte = Buf[Buf[1]++]; + Buf[0]--; + } + + return GIF_OK; +} + +/****************************************************************************** + This routine is called in case of error during parsing image. We need to + decrease image counter and reallocate memory for saved images. Not decreasing + ImageCount may lead to null pointer dereference, because the last element in + SavedImages may point to the spoilt image and null pointer buffers. +*******************************************************************************/ +void DGifDecreaseImageCounter(GifFileType *GifFile) { + GifFile->ImageCount--; + if (GifFile->SavedImages[GifFile->ImageCount].RasterBits != NULL) { + free(GifFile->SavedImages[GifFile->ImageCount].RasterBits); + } + + // Realloc array according to the new image counter. + SavedImage *correct_saved_images = (SavedImage *)reallocarray( + GifFile->SavedImages, GifFile->ImageCount, sizeof(SavedImage)); + if (correct_saved_images != NULL) { + GifFile->SavedImages = correct_saved_images; + } +} + +/****************************************************************************** + This routine reads an entire GIF into core, hanging all its state info off + the GifFileType pointer. Call DGifOpenFileName() or DGifOpenFileHandle() + first to initialize I/O. Its inverse is EGifSpew(). 
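+ A minimal caller might look like the following sketch (illustrative only;
+ the file name and helper are hypothetical):
+*******************************************************************************/
+
+/* Hypothetical example: decode all frames of a GIF in one call. */
+static int exampleSlurp(const char *path) {
+	int err = D_GIF_SUCCEEDED;
+	int closeErr = D_GIF_SUCCEEDED;
+	int i;
+	GifFileType *gif = DGifOpenFileName(path, &err);
+	if (gif == NULL) {
+		return err;
+	}
+	if (DGifSlurp(gif) == GIF_ERROR) {
+		err = gif->Error;
+	} else {
+		for (i = 0; i < gif->ImageCount; i++) {
+			SavedImage *frame = &gif->SavedImages[i];
+			/* frame->RasterBits holds Width * Height palette
+			 * indices; resolve them through the local color map
+			 * (frame->ImageDesc.ColorMap) or, when that is NULL,
+			 * the global one (gif->SColorMap). */
+			(void)frame;
+		}
+	}
+	DGifCloseFile(gif, &closeErr);
+	return err != D_GIF_SUCCEEDED ? err : closeErr;
+}
+
+/******************************************************************************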
+*******************************************************************************/ +int DGifSlurp(GifFileType *GifFile) { + size_t ImageSize; + GifRecordType RecordType; + SavedImage *sp; + GifByteType *ExtData; + int ExtFunction; + + GifFile->ExtensionBlocks = NULL; + GifFile->ExtensionBlockCount = 0; + + do { + if (DGifGetRecordType(GifFile, &RecordType) == GIF_ERROR) { + return (GIF_ERROR); + } + + switch (RecordType) { + case IMAGE_DESC_RECORD_TYPE: + if (DGifGetImageDesc(GifFile) == GIF_ERROR) { + return (GIF_ERROR); + } + + sp = &GifFile->SavedImages[GifFile->ImageCount - 1]; + /* Allocate memory for the image */ + if (sp->ImageDesc.Width <= 0 || + sp->ImageDesc.Height <= 0 || + sp->ImageDesc.Width > + (INT_MAX / sp->ImageDesc.Height)) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + ImageSize = sp->ImageDesc.Width * sp->ImageDesc.Height; + + if (ImageSize > (SIZE_MAX / sizeof(GifPixelType))) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + sp->RasterBits = (unsigned char *)reallocarray( + NULL, ImageSize, sizeof(GifPixelType)); + + if (sp->RasterBits == NULL) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + + if (sp->ImageDesc.Interlace) { + int i, j; + /* + * The way an interlaced image should be read - + * offsets and jumps... + */ + static const int InterlacedOffset[] = {0, 4, 2, + 1}; + static const int InterlacedJumps[] = {8, 8, 4, + 2}; + /* Need to perform 4 passes on the image */ + for (i = 0; i < 4; i++) { + for (j = InterlacedOffset[i]; + j < sp->ImageDesc.Height; + j += InterlacedJumps[i]) { + if (DGifGetLine( + GifFile, + sp->RasterBits + + j * sp->ImageDesc + .Width, + sp->ImageDesc.Width) == + GIF_ERROR) { + DGifDecreaseImageCounter( + GifFile); + return GIF_ERROR; + } + } + } + } else { + if (DGifGetLine(GifFile, sp->RasterBits, + ImageSize) == GIF_ERROR) { + DGifDecreaseImageCounter(GifFile); + return GIF_ERROR; + } + } + + if (GifFile->ExtensionBlocks) { + sp->ExtensionBlocks = GifFile->ExtensionBlocks; + sp->ExtensionBlockCount = + GifFile->ExtensionBlockCount; + + GifFile->ExtensionBlocks = NULL; + GifFile->ExtensionBlockCount = 0; + } + break; + + case EXTENSION_RECORD_TYPE: + if (DGifGetExtension(GifFile, &ExtFunction, &ExtData) == + GIF_ERROR) { + return (GIF_ERROR); + } + /* Create an extension block with our data */ + if (ExtData != NULL) { + if (GifAddExtensionBlock( + &GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks, ExtFunction, + ExtData[0], &ExtData[1]) == GIF_ERROR) { + return (GIF_ERROR); + } + } + for (;;) { + if (DGifGetExtensionNext(GifFile, &ExtData) == + GIF_ERROR) { + return (GIF_ERROR); + } + if (ExtData == NULL) { + break; + } + /* Continue the extension block */ + if (GifAddExtensionBlock( + &GifFile->ExtensionBlockCount, + &GifFile->ExtensionBlocks, + CONTINUE_EXT_FUNC_CODE, ExtData[0], + &ExtData[1]) == GIF_ERROR) { + return (GIF_ERROR); + } + } + break; + + case TERMINATE_RECORD_TYPE: + break; + + default: /* Should be trapped by DGifGetRecordType */ + break; + } + } while (RecordType != TERMINATE_RECORD_TYPE); + + /* Sanity check for corrupted file */ + if (GifFile->ImageCount == 0) { + GifFile->Error = D_GIF_ERR_NO_IMAG_DSCR; + return (GIF_ERROR); + } + + return (GIF_OK); +} + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_hash.c b/torchvision/csrc/io/image/cpu/giflib/gif_hash.c new file mode 100644 index 00000000000..e63a72accd4 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_hash.c @@ -0,0 +1,128 @@ 
+/*****************************************************************************
+
+gif_hash.c -- module to support the following operations:
+
+1. InitHashTable - initialize hash table.
+2. ClearHashTable - clear the hash table to an empty state.
+2. InsertHashTable - insert one item into data structure.
+3. ExistsHashTable - test if item exists in data structure.
+
+This module is used to hash the GIF codes during encoding.
+
+*****************************************************************************/
+// SPDX-License-Identifier: MIT
+// SPDX-FileCopyrightText: (C) Copyright 1989 Gershon Elber
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "gif_hash.h"
+#include "gif_lib.h"
+#include "gif_lib_private.h"
+
+/* #define DEBUG_HIT_RATE Debug number of misses per hash Insert/Exists. */
+
+#ifdef DEBUG_HIT_RATE
+static long NumberOfTests = 0, NumberOfMisses = 0;
+#endif /* DEBUG_HIT_RATE */
+
+static int KeyItem(uint32_t Item);
+
+/******************************************************************************
+ Initialize HashTable - allocate the memory needed and clear it.             *
+******************************************************************************/
+GifHashTableType *_InitHashTable(void) {
+	GifHashTableType *HashTable;
+
+	if ((HashTable = (GifHashTableType *)malloc(
+	         sizeof(GifHashTableType))) == NULL) {
+		return NULL;
+	}
+
+	_ClearHashTable(HashTable);
+
+	return HashTable;
+}
+
+/******************************************************************************
+ Routine to clear the HashTable to an empty state.                           *
+ This part is a little machine depended. Use the commented part otherwise.   *
+******************************************************************************/
+void _ClearHashTable(GifHashTableType *HashTable) {
+	memset(HashTable->HTable, 0xFF, HT_SIZE * sizeof(uint32_t));
+}
+
+/******************************************************************************
+ Routine to insert a new Item into the HashTable. The data is assumed to be  *
+ new one.                                                                    *
+******************************************************************************/
+void _InsertHashTable(GifHashTableType *HashTable, uint32_t Key, int Code) {
+	int HKey = KeyItem(Key);
+	uint32_t *HTable = HashTable->HTable;
+
+#ifdef DEBUG_HIT_RATE
+	NumberOfTests++;
+	NumberOfMisses++;
+#endif /* DEBUG_HIT_RATE */
+
+	while (HT_GET_KEY(HTable[HKey]) != 0xFFFFFL) {
+#ifdef DEBUG_HIT_RATE
+		NumberOfMisses++;
+#endif /* DEBUG_HIT_RATE */
+		HKey = (HKey + 1) & HT_KEY_MASK;
+	}
+	HTable[HKey] = HT_PUT_KEY(Key) | HT_PUT_CODE(Code);
+}
+
+/******************************************************************************
+ Routine to test if given Key exists in HashTable and if so returns its code *
+ Returns the Code if key was found, -1 if not.                               *
+******************************************************************************/
+int _ExistsHashTable(GifHashTableType *HashTable, uint32_t Key) {
+	int HKey = KeyItem(Key);
+	uint32_t *HTable = HashTable->HTable, HTKey;
+
+#ifdef DEBUG_HIT_RATE
+	NumberOfTests++;
+	NumberOfMisses++;
+#endif /* DEBUG_HIT_RATE */
+
+	while ((HTKey = HT_GET_KEY(HTable[HKey])) != 0xFFFFFL) {
+#ifdef DEBUG_HIT_RATE
+		NumberOfMisses++;
+#endif /* DEBUG_HIT_RATE */
+		if (Key == HTKey) {
+			return HT_GET_CODE(HTable[HKey]);
+		}
+		HKey = (HKey + 1) & HT_KEY_MASK;
+	}
+
+	return -1;
+}
+
+/******************************************************************************
+ Routine to generate an HKey for the hashtable out of the given unique key.  *
+ The given Key is assumed to be 20 bits as follows: lower 8 bits are the    *
+ new postfix character, while the upper 12 bits are the prefix code.        *
+ Because the average hit ratio is only 2 (2 hash references per entry),     *
+ evaluating more complex keys (such as twin prime keys) does not worth it!  *
+******************************************************************************/
+static int KeyItem(uint32_t Item) {
+	return ((Item >> 12) ^ Item) & HT_KEY_MASK;
+}
+
+#ifdef DEBUG_HIT_RATE
+/******************************************************************************
+ Debugging routine to print the hit ratio - number of times the hash table   *
+ was tested per operation. This routine was used to test the KeyItem routine *
+******************************************************************************/
+void HashTablePrintHitRatio(void) {
+	printf("Hash Table Hit Ratio is %ld/%ld = %ld%%.\n", NumberOfMisses,
+	       NumberOfTests, NumberOfMisses * 100 / NumberOfTests);
+}
+#endif /* DEBUG_HIT_RATE */
+
+/* end */
diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_hash.h b/torchvision/csrc/io/image/cpu/giflib/gif_hash.h
new file mode 100644
index 00000000000..009cb5b8081
--- /dev/null
+++ b/torchvision/csrc/io/image/cpu/giflib/gif_hash.h
@@ -0,0 +1,42 @@
+/******************************************************************************
+
+gif_hash.h - magfic constants and declarations for GIF LZW
+
+******************************************************************************/
+// SPDX-License-Identifier: MIT
+
+#ifndef _GIF_HASH_H_
+#define _GIF_HASH_H_
+
+#ifndef _WIN32
+#include <unistd.h>
+#endif /* _WIN32 */
+#include <stdint.h>
+
+#define HT_SIZE 8192       /* 12bits = 4096 or twice as big! */
+#define HT_KEY_MASK 0x1FFF /* 13bits keys */
+#define HT_KEY_NUM_BITS 13 /* 13bits keys */
+#define HT_MAX_KEY 8191    /* 13bits - 1, maximal code possible */
+#define HT_MAX_CODE 4095   /* Biggest code possible in 12 bits. */
+
+/* The 32 bits of the long are divided into two parts for the key & code:   */
+/* 1. The code is 12 bits as our compression algorithm is limited to 12bits */
+/* 2. The key is 12 bits Prefix code + 8 bit new char or 20 bits.           */
+/* The key is the upper 20 bits.  The code is the lower 12.                 */
+#define HT_GET_KEY(l) (l >> 12)
+#define HT_GET_CODE(l) (l & 0x0FFF)
+#define HT_PUT_KEY(l) (l << 12)
+#define HT_PUT_CODE(l) (l & 0x0FFF)
+
+typedef struct GifHashTableType {
+	uint32_t HTable[HT_SIZE];
+} GifHashTableType;
+
+GifHashTableType *_InitHashTable(void);
+void _ClearHashTable(GifHashTableType *HashTable);
+void _InsertHashTable(GifHashTableType *HashTable, uint32_t Key, int Code);
+int _ExistsHashTable(GifHashTableType *HashTable, uint32_t Key);
+
+#endif /* _GIF_HASH_H_ */
+
+/* end */
diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_lib.h b/torchvision/csrc/io/image/cpu/giflib/gif_lib.h
new file mode 100644
index 00000000000..d0c61d51682
--- /dev/null
+++ b/torchvision/csrc/io/image/cpu/giflib/gif_lib.h
@@ -0,0 +1,291 @@
+/******************************************************************************
+
+gif_lib.h - service library for decoding and encoding GIF images
+
+SPDX-License-Identifier: MIT
+
+*****************************************************************************/
+
+#ifndef _GIF_LIB_H_
+#define _GIF_LIB_H_ 1
+
+#ifdef __cplusplus
+extern "C" {
+#endif /* __cplusplus */
+
+#define GIFLIB_MAJOR 5
+#define GIFLIB_MINOR 2
+#define GIFLIB_RELEASE 2
+
+#define GIF_ERROR 0
+#define GIF_OK 1
+
+#include <stdbool.h>
+#include <stddef.h>
+
+#define GIF_STAMP "GIFVER" /* First chars in file - GIF stamp.
*/ +#define GIF_STAMP_LEN sizeof(GIF_STAMP) - 1 +#define GIF_VERSION_POS 3 /* Version first character in stamp. */ +#define GIF87_STAMP "GIF87a" /* First chars in file - GIF stamp. */ +#define GIF89_STAMP "GIF89a" /* First chars in file - GIF stamp. */ + +typedef unsigned char GifPixelType; +typedef unsigned char *GifRowType; +typedef unsigned char GifByteType; +typedef unsigned int GifPrefixType; +typedef int GifWord; + +typedef struct GifColorType { + GifByteType Red, Green, Blue; +} GifColorType; + +typedef struct ColorMapObject { + int ColorCount; + int BitsPerPixel; + bool SortFlag; + GifColorType *Colors; /* on malloc(3) heap */ +} ColorMapObject; + +typedef struct GifImageDesc { + GifWord Left, Top, Width, Height; /* Current image dimensions. */ + bool Interlace; /* Sequential/Interlaced lines. */ + ColorMapObject *ColorMap; /* The local color map */ +} GifImageDesc; + +typedef struct ExtensionBlock { + int ByteCount; + GifByteType *Bytes; /* on malloc(3) heap */ + int Function; /* The block function code */ +#define CONTINUE_EXT_FUNC_CODE 0x00 /* continuation subblock */ +#define COMMENT_EXT_FUNC_CODE 0xfe /* comment */ +#define GRAPHICS_EXT_FUNC_CODE 0xf9 /* graphics control (GIF89) */ +#define PLAINTEXT_EXT_FUNC_CODE 0x01 /* plaintext */ +#define APPLICATION_EXT_FUNC_CODE 0xff /* application block (GIF89) */ +} ExtensionBlock; + +typedef struct SavedImage { + GifImageDesc ImageDesc; + GifByteType *RasterBits; /* on malloc(3) heap */ + int ExtensionBlockCount; /* Count of extensions before image */ + ExtensionBlock *ExtensionBlocks; /* Extensions before image */ +} SavedImage; + +typedef struct GifFileType { + GifWord SWidth, SHeight; /* Size of virtual canvas */ + GifWord SColorResolution; /* How many colors can we generate? */ + GifWord SBackGroundColor; /* Background color for virtual canvas */ + GifByteType AspectByte; /* Used to compute pixel aspect ratio */ + ColorMapObject *SColorMap; /* Global colormap, NULL if nonexistent. */ + int ImageCount; /* Number of current image (both APIs) */ + GifImageDesc Image; /* Current image (low-level API) */ + SavedImage *SavedImages; /* Image sequence (high-level API) */ + int ExtensionBlockCount; /* Count extensions past last image */ + ExtensionBlock *ExtensionBlocks; /* Extensions past last image */ + int Error; /* Last error condition reported */ + void *UserData; /* hook to attach user data (TVT) */ + void *Private; /* Don't mess with this! */ +} GifFileType; + +#define GIF_ASPECT_RATIO(n) ((n) + 15.0 / 64.0) + +typedef enum { + UNDEFINED_RECORD_TYPE, + SCREEN_DESC_RECORD_TYPE, + IMAGE_DESC_RECORD_TYPE, /* Begin with ',' */ + EXTENSION_RECORD_TYPE, /* Begin with '!' */ + TERMINATE_RECORD_TYPE /* Begin with ';' */ +} GifRecordType; + +/* func type to read gif data from arbitrary sources (TVT) */ +typedef int (*InputFunc)(GifFileType *, GifByteType *, int); + +/* func type to write gif data to arbitrary targets. + * Returns count of bytes written. (MRB) + */ +typedef int (*OutputFunc)(GifFileType *, const GifByteType *, int); + +/****************************************************************************** + GIF89 structures +******************************************************************************/ + +typedef struct GraphicsControlBlock { + int DisposalMode; +#define DISPOSAL_UNSPECIFIED 0 /* No disposal specified. 
*/ +#define DISPOSE_DO_NOT 1 /* Leave image in place */ +#define DISPOSE_BACKGROUND 2 /* Set area too background color */ +#define DISPOSE_PREVIOUS 3 /* Restore to previous content */ + bool UserInputFlag; /* User confirmation required before disposal */ + int DelayTime; /* pre-display delay in 0.01sec units */ + int TransparentColor; /* Palette index for transparency, -1 if none */ +#define NO_TRANSPARENT_COLOR -1 +} GraphicsControlBlock; + +/****************************************************************************** + GIF encoding routines +******************************************************************************/ + +/* Main entry points */ +GifFileType *EGifOpenFileName(const char *GifFileName, + const bool GifTestExistence, int *Error); +GifFileType *EGifOpenFileHandle(const int GifFileHandle, int *Error); +GifFileType *EGifOpen(void *userPtr, OutputFunc writeFunc, int *Error); +int EGifSpew(GifFileType *GifFile); +const char *EGifGetGifVersion(GifFileType *GifFile); /* new in 5.x */ +int EGifCloseFile(GifFileType *GifFile, int *ErrorCode); + +#define E_GIF_SUCCEEDED 0 +#define E_GIF_ERR_OPEN_FAILED 1 /* And EGif possible errors. */ +#define E_GIF_ERR_WRITE_FAILED 2 +#define E_GIF_ERR_HAS_SCRN_DSCR 3 +#define E_GIF_ERR_HAS_IMAG_DSCR 4 +#define E_GIF_ERR_NO_COLOR_MAP 5 +#define E_GIF_ERR_DATA_TOO_BIG 6 +#define E_GIF_ERR_NOT_ENOUGH_MEM 7 +#define E_GIF_ERR_DISK_IS_FULL 8 +#define E_GIF_ERR_CLOSE_FAILED 9 +#define E_GIF_ERR_NOT_WRITEABLE 10 + +/* These are legacy. You probably do not want to call them directly */ +int EGifPutScreenDesc(GifFileType *GifFile, const int GifWidth, + const int GifHeight, const int GifColorRes, + const int GifBackGround, + const ColorMapObject *GifColorMap); +int EGifPutImageDesc(GifFileType *GifFile, const int GifLeft, const int GifTop, + const int GifWidth, const int GifHeight, + const bool GifInterlace, + const ColorMapObject *GifColorMap); +void EGifSetGifVersion(GifFileType *GifFile, const bool gif89); +int EGifPutLine(GifFileType *GifFile, GifPixelType *GifLine, int GifLineLen); +int EGifPutPixel(GifFileType *GifFile, const GifPixelType GifPixel); +int EGifPutComment(GifFileType *GifFile, const char *GifComment); +int EGifPutExtensionLeader(GifFileType *GifFile, const int GifExtCode); +int EGifPutExtensionBlock(GifFileType *GifFile, const int GifExtLen, + const void *GifExtension); +int EGifPutExtensionTrailer(GifFileType *GifFile); +int EGifPutExtension(GifFileType *GifFile, const int GifExtCode, + const int GifExtLen, const void *GifExtension); +int EGifPutCode(GifFileType *GifFile, int GifCodeSize, + const GifByteType *GifCodeBlock); +int EGifPutCodeNext(GifFileType *GifFile, const GifByteType *GifCodeBlock); + +/****************************************************************************** + GIF decoding routines +******************************************************************************/ + +/* Main entry points */ +GifFileType *DGifOpenFileName(const char *GifFileName, int *Error); +GifFileType *DGifOpenFileHandle(int GifFileHandle, int *Error); +int DGifSlurp(GifFileType *GifFile); +GifFileType *DGifOpen(void *userPtr, InputFunc readFunc, + int *Error); /* new one (TVT) */ +int DGifCloseFile(GifFileType *GifFile, int *ErrorCode); + +#define D_GIF_SUCCEEDED 0 +#define D_GIF_ERR_OPEN_FAILED 101 /* And DGif possible errors. 
*/
+#define D_GIF_ERR_READ_FAILED 102
+#define D_GIF_ERR_NOT_GIF_FILE 103
+#define D_GIF_ERR_NO_SCRN_DSCR 104
+#define D_GIF_ERR_NO_IMAG_DSCR 105
+#define D_GIF_ERR_NO_COLOR_MAP 106
+#define D_GIF_ERR_WRONG_RECORD 107
+#define D_GIF_ERR_DATA_TOO_BIG 108
+#define D_GIF_ERR_NOT_ENOUGH_MEM 109
+#define D_GIF_ERR_CLOSE_FAILED 110
+#define D_GIF_ERR_NOT_READABLE 111
+#define D_GIF_ERR_IMAGE_DEFECT 112
+#define D_GIF_ERR_EOF_TOO_SOON 113
+
+/* These are legacy. You probably do not want to call them directly */
+int DGifGetScreenDesc(GifFileType *GifFile);
+int DGifGetRecordType(GifFileType *GifFile, GifRecordType *GifType);
+int DGifGetImageHeader(GifFileType *GifFile);
+int DGifGetImageDesc(GifFileType *GifFile);
+int DGifGetLine(GifFileType *GifFile, GifPixelType *GifLine, int GifLineLen);
+int DGifGetPixel(GifFileType *GifFile, GifPixelType GifPixel);
+int DGifGetExtension(GifFileType *GifFile, int *GifExtCode,
+                     GifByteType **GifExtension);
+int DGifGetExtensionNext(GifFileType *GifFile, GifByteType **GifExtension);
+int DGifGetCode(GifFileType *GifFile, int *GifCodeSize,
+                GifByteType **GifCodeBlock);
+int DGifGetCodeNext(GifFileType *GifFile, GifByteType **GifCodeBlock);
+int DGifGetLZCodes(GifFileType *GifFile, int *GifCode);
+const char *DGifGetGifVersion(GifFileType *GifFile);
+
+/******************************************************************************
+ Error handling and reporting.
+******************************************************************************/
+extern const char *GifErrorString(int ErrorCode); /* new in 2012 - ESR */
+
+/*****************************************************************************
+ Everything below this point is new after version 1.2, supporting `slurp
+ mode' for doing I/O in two big belts with all the image-bashing in core.
+******************************************************************************/
+
+/******************************************************************************
+ Color map handling from gif_alloc.c
+******************************************************************************/
+
+extern ColorMapObject *GifMakeMapObject(int ColorCount,
+                                        const GifColorType *ColorMap);
+extern void GifFreeMapObject(ColorMapObject *Object);
+extern ColorMapObject *GifUnionColorMap(const ColorMapObject *ColorIn1,
+                                        const ColorMapObject *ColorIn2,
+                                        GifPixelType ColorTransIn2[]);
+extern int GifBitSize(int n);
+
+/******************************************************************************
+ Support for the in-core structures allocation (slurp mode).
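+ For example (illustrative, not part of this header), a caller can read the
+ frame delay of a slurped animation through DGifSavedExtensionToGCB(),
+ declared further below:
+
+	GraphicsControlBlock gcb;
+	if (DGifSavedExtensionToGCB(gif, i, &gcb) == GIF_OK) {
+		delay_ms = gcb.DelayTime * 10;  (DelayTime is in 10 ms units)
+	}
+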
+******************************************************************************/ + +extern void GifApplyTranslation(SavedImage *Image, + const GifPixelType Translation[]); +extern int GifAddExtensionBlock(int *ExtensionBlock_Count, + ExtensionBlock **ExtensionBlocks, int Function, + unsigned int Len, unsigned char ExtData[]); +extern void GifFreeExtensions(int *ExtensionBlock_Count, + ExtensionBlock **ExtensionBlocks); +extern SavedImage *GifMakeSavedImage(GifFileType *GifFile, + const SavedImage *CopyFrom); +extern void GifFreeSavedImages(GifFileType *GifFile); + +/****************************************************************************** + 5.x functions for GIF89 graphics control blocks +******************************************************************************/ + +int DGifExtensionToGCB(const size_t GifExtensionLength, + const GifByteType *GifExtension, + GraphicsControlBlock *GCB); +size_t EGifGCBToExtension(const GraphicsControlBlock *GCB, + GifByteType *GifExtension); + +int DGifSavedExtensionToGCB(GifFileType *GifFile, int ImageIndex, + GraphicsControlBlock *GCB); +int EGifGCBToSavedExtension(const GraphicsControlBlock *GCB, + GifFileType *GifFile, int ImageIndex); + +/****************************************************************************** + The library's internal utility font +******************************************************************************/ + +#define GIF_FONT_WIDTH 8 +#define GIF_FONT_HEIGHT 8 +extern const unsigned char GifAsciiTable8x8[][GIF_FONT_WIDTH]; + +extern void GifDrawText8x8(SavedImage *Image, const int x, const int y, + const char *legend, const int color); + +extern void GifDrawBox(SavedImage *Image, const int x, const int y, const int w, + const int d, const int color); + +extern void GifDrawRectangle(SavedImage *Image, const int x, const int y, + const int w, const int d, const int color); + +extern void GifDrawBoxedText8x8(SavedImage *Image, const int x, const int y, + const char *legend, const int border, + const int bg, const int fg); + +#ifdef __cplusplus +} +#endif /* __cplusplus */ +#endif /* _GIF_LIB_H */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h b/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h new file mode 100644 index 00000000000..19578d4530c --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gif_lib_private.h @@ -0,0 +1,72 @@ +/**************************************************************************** + +gif_lib_private.h - internal giflib routines and structures + +SPDX-License-Identifier: MIT + +****************************************************************************/ + +#ifndef _GIF_LIB_PRIVATE_H +#define _GIF_LIB_PRIVATE_H + +#include "gif_hash.h" +#include "gif_lib.h" + +#ifndef SIZE_MAX +#define SIZE_MAX UINTPTR_MAX +#endif + +#define EXTENSION_INTRODUCER 0x21 +#define DESCRIPTOR_INTRODUCER 0x2c +#define TERMINATOR_INTRODUCER 0x3b + +#define LZ_MAX_CODE 4095 /* Biggest code possible in 12 bits. */ +#define LZ_BITS 12 + +#define FLUSH_OUTPUT 4096 /* Impossible code, to signal flush. */ +#define FIRST_CODE 4097 /* Impossible code, to signal first. */ +#define NO_SUCH_CODE 4098 /* Impossible code, to signal empty. 
*/ + +#define FILE_STATE_WRITE 0x01 +#define FILE_STATE_SCREEN 0x02 +#define FILE_STATE_IMAGE 0x04 +#define FILE_STATE_READ 0x08 + +#define IS_READABLE(Private) (Private->FileState & FILE_STATE_READ) +#define IS_WRITEABLE(Private) (Private->FileState & FILE_STATE_WRITE) + +typedef struct GifFilePrivateType { + GifWord FileState, FileHandle, /* Where all this data goes to! */ + BitsPerPixel, /* Bits per pixel (Codes uses at least this + 1). */ + ClearCode, /* The CLEAR LZ code. */ + EOFCode, /* The EOF LZ code. */ + RunningCode, /* The next code algorithm can generate. */ + RunningBits, /* The number of bits required to represent + RunningCode. */ + MaxCode1, /* 1 bigger than max. possible code, in RunningBits bits. + */ + LastCode, /* The code before the current code. */ + CrntCode, /* Current algorithm code. */ + StackPtr, /* For character stack (see below). */ + CrntShiftState; /* Number of bits in CrntShiftDWord. */ + unsigned long CrntShiftDWord; /* For bytes decomposition into codes. */ + unsigned long PixelCount; /* Number of pixels in image. */ + FILE *File; /* File as stream. */ + InputFunc Read; /* function to read gif input (TVT) */ + OutputFunc Write; /* function to write gif output (MRB) */ + GifByteType Buf[256]; /* Compressed input is buffered here. */ + GifByteType Stack[LZ_MAX_CODE]; /* Decoded pixels are stacked here. */ + GifByteType Suffix[LZ_MAX_CODE + 1]; /* So we can trace the codes. */ + GifPrefixType Prefix[LZ_MAX_CODE + 1]; + GifHashTableType *HashTable; + bool gif89; +} GifFilePrivateType; + +#ifndef HAVE_REALLOCARRAY +extern void *openbsd_reallocarray(void *optr, size_t nmemb, size_t size); +#define reallocarray openbsd_reallocarray +#endif + +#endif /* _GIF_LIB_PRIVATE_H */ + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/gifalloc.c b/torchvision/csrc/io/image/cpu/giflib/gifalloc.c new file mode 100644 index 00000000000..926d54ebcf7 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/gifalloc.c @@ -0,0 +1,425 @@ +/***************************************************************************** + + GIF construction tools + +****************************************************************************/ +// SPDX-License-Identifier: MIT +// SPDX-FileCopyrightText: Copyright (C) Eric S. Raymond + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "gif_lib.h" +#include "gif_lib_private.h" + +#define MAX(x, y) (((x) > (y)) ? (x) : (y)) + +/****************************************************************************** + Miscellaneous utility functions +******************************************************************************/ + +/* return smallest bitfield size n will fit in */ +int GifBitSize(int n) { + int i; + + for (i = 1; i <= 8; i++) { + if ((1 << i) >= n) { + break; + } + } + return (i); +} + +/****************************************************************************** + Color map object functions +******************************************************************************/ + +/* + * Allocate a color map of given size; initialize with contents of + * ColorMap if that pointer is non-NULL. + */ +ColorMapObject *GifMakeMapObject(int ColorCount, const GifColorType *ColorMap) { + ColorMapObject *Object; + + /*** FIXME: Our ColorCount has to be a power of two. Is it necessary to + * make the user know that or should we automatically round up instead?
+ */ + if (ColorCount != (1 << GifBitSize(ColorCount))) { + return ((ColorMapObject *)NULL); + } + + Object = (ColorMapObject *)malloc(sizeof(ColorMapObject)); + if (Object == (ColorMapObject *)NULL) { + return ((ColorMapObject *)NULL); + } + + Object->Colors = + (GifColorType *)calloc(ColorCount, sizeof(GifColorType)); + if (Object->Colors == (GifColorType *)NULL) { + free(Object); + return ((ColorMapObject *)NULL); + } + + Object->ColorCount = ColorCount; + Object->BitsPerPixel = GifBitSize(ColorCount); + Object->SortFlag = false; + + if (ColorMap != NULL) { + memcpy((char *)Object->Colors, (char *)ColorMap, + ColorCount * sizeof(GifColorType)); + } + + return (Object); +} + +/******************************************************************************* + Free a color map object +*******************************************************************************/ +void GifFreeMapObject(ColorMapObject *Object) { + if (Object != NULL) { + (void)free(Object->Colors); + (void)free(Object); + } +} + +#ifdef DEBUG +void DumpColorMap(ColorMapObject *Object, FILE *fp) { + if (Object != NULL) { + int i, j, Len = Object->ColorCount; + + for (i = 0; i < Len; i += 4) { + for (j = 0; j < 4 && j < Len; j++) { + (void)fprintf(fp, "%3d: %02x %02x %02x ", + i + j, Object->Colors[i + j].Red, + Object->Colors[i + j].Green, + Object->Colors[i + j].Blue); + } + (void)fprintf(fp, "\n"); + } + } +} +#endif /* DEBUG */ + +/******************************************************************************* + Compute the union of two given color maps and return it. If result can't + fit into 256 colors, NULL is returned, the allocated union otherwise. + ColorIn1 is copied as is to ColorUnion, while colors from ColorIn2 are + copied iff they didn't exist before. ColorTransIn2 maps the old + ColorIn2 into the ColorUnion color map table. +*******************************************************************************/ +ColorMapObject *GifUnionColorMap(const ColorMapObject *ColorIn1, + const ColorMapObject *ColorIn2, + GifPixelType ColorTransIn2[]) { + int i, j, CrntSlot, RoundUpTo, NewGifBitSize; + ColorMapObject *ColorUnion; + + /* + * We don't worry about duplicates within either color map; if + * the caller wants to resolve those, he can perform unions + * with an empty color map. + */ + + /* Allocate table which will hold the result for sure. */ + ColorUnion = GifMakeMapObject( + MAX(ColorIn1->ColorCount, ColorIn2->ColorCount) * 2, NULL); + + if (ColorUnion == NULL) { + return (NULL); + } + + /* + * Copy ColorIn1 to ColorUnion. + */ + for (i = 0; i < ColorIn1->ColorCount; i++) { + ColorUnion->Colors[i] = ColorIn1->Colors[i]; + } + CrntSlot = ColorIn1->ColorCount; + + /* + * Potentially obnoxious hack: + * + * Back CrntSlot down past all contiguous {0, 0, 0} slots at the end + * of table 1. This is very useful if your display is limited to + * 16 colors.
+ */ + while (ColorIn1->Colors[CrntSlot - 1].Red == 0 && + ColorIn1->Colors[CrntSlot - 1].Green == 0 && + ColorIn1->Colors[CrntSlot - 1].Blue == 0) { + CrntSlot--; + } + + /* Copy ColorIn2 to ColorUnion (use old colors if they exist): */ + for (i = 0; i < ColorIn2->ColorCount && CrntSlot <= 256; i++) { + /* Let's see if this color already exists: */ + for (j = 0; j < ColorIn1->ColorCount; j++) { + if (memcmp(&ColorIn1->Colors[j], &ColorIn2->Colors[i], + sizeof(GifColorType)) == 0) { + break; + } + } + + if (j < ColorIn1->ColorCount) { + ColorTransIn2[i] = j; /* color exists in Color1 */ + } else { + /* Color is new - copy it to a new slot: */ + ColorUnion->Colors[CrntSlot] = ColorIn2->Colors[i]; + ColorTransIn2[i] = CrntSlot++; + } + } + + if (CrntSlot > 256) { + GifFreeMapObject(ColorUnion); + return ((ColorMapObject *)NULL); + } + + NewGifBitSize = GifBitSize(CrntSlot); + RoundUpTo = (1 << NewGifBitSize); + + if (RoundUpTo != ColorUnion->ColorCount) { + GifColorType *Map = ColorUnion->Colors; + + /* + * Zero out slots up to next power of 2. + * We know these slots exist because of the way ColorUnion's + * start dimension was computed. + */ + for (j = CrntSlot; j < RoundUpTo; j++) { + Map[j].Red = Map[j].Green = Map[j].Blue = 0; + } + + /* perhaps we can shrink the map? */ + if (RoundUpTo < ColorUnion->ColorCount) { + GifColorType *new_map = (GifColorType *)reallocarray( + Map, RoundUpTo, sizeof(GifColorType)); + if (new_map == NULL) { + GifFreeMapObject(ColorUnion); + return ((ColorMapObject *)NULL); + } + ColorUnion->Colors = new_map; + } + } + + ColorUnion->ColorCount = RoundUpTo; + ColorUnion->BitsPerPixel = NewGifBitSize; + + return (ColorUnion); +} + +/******************************************************************************* + Apply a given color translation to the raster bits of an image +*******************************************************************************/ +void GifApplyTranslation(SavedImage *Image, const GifPixelType Translation[]) { + int i; + int RasterSize = + Image->ImageDesc.Height * Image->ImageDesc.Width; + + for (i = 0; i < RasterSize; i++) { + Image->RasterBits[i] = Translation[Image->RasterBits[i]]; + } +} + +/****************************************************************************** + Extension record functions +******************************************************************************/ +int GifAddExtensionBlock(int *ExtensionBlockCount, + ExtensionBlock **ExtensionBlocks, int Function, + unsigned int Len, unsigned char ExtData[]) { + ExtensionBlock *ep; + + if (*ExtensionBlocks == NULL) { + *ExtensionBlocks = + (ExtensionBlock *)malloc(sizeof(ExtensionBlock)); + } else { + ExtensionBlock *ep_new = (ExtensionBlock *)reallocarray( + *ExtensionBlocks, (*ExtensionBlockCount + 1), + sizeof(ExtensionBlock)); + if (ep_new == NULL) { + return (GIF_ERROR); + } + *ExtensionBlocks = ep_new; + } + + if (*ExtensionBlocks == NULL) { + return (GIF_ERROR); + } + + ep = &(*ExtensionBlocks)[(*ExtensionBlockCount)++]; + + ep->Function = Function; + ep->ByteCount = Len; + ep->Bytes = (GifByteType *)malloc(ep->ByteCount); + if (ep->Bytes == NULL) { + return (GIF_ERROR); + } + + if (ExtData != NULL) { + memcpy(ep->Bytes, ExtData, Len); + } + + return (GIF_OK); +} + +void GifFreeExtensions(int *ExtensionBlockCount, + ExtensionBlock **ExtensionBlocks) { + ExtensionBlock *ep; + + if (*ExtensionBlocks == NULL) { + return; + } + + for (ep = *ExtensionBlocks; + ep < (*ExtensionBlocks + *ExtensionBlockCount); ep++) { + (void)free((char *)ep->Bytes); + } + 
(void)free((char *)*ExtensionBlocks); + *ExtensionBlocks = NULL; + *ExtensionBlockCount = 0; +} + +/****************************************************************************** + Image block allocation functions +******************************************************************************/ + +/* Private Function: + * Frees the last image in the GifFile->SavedImages array + */ +void FreeLastSavedImage(GifFileType *GifFile) { + SavedImage *sp; + + if ((GifFile == NULL) || (GifFile->SavedImages == NULL)) { + return; + } + + /* Remove one SavedImage from the GifFile */ + GifFile->ImageCount--; + sp = &GifFile->SavedImages[GifFile->ImageCount]; + + /* Deallocate its Colormap */ + if (sp->ImageDesc.ColorMap != NULL) { + GifFreeMapObject(sp->ImageDesc.ColorMap); + sp->ImageDesc.ColorMap = NULL; + } + + /* Deallocate the image data */ + if (sp->RasterBits != NULL) { + free((char *)sp->RasterBits); + } + + /* Deallocate any extensions */ + GifFreeExtensions(&sp->ExtensionBlockCount, &sp->ExtensionBlocks); + + /*** FIXME: We could realloc the GifFile->SavedImages structure but is + * there a point to it? Saves some memory but we'd have to do it every + * time. If this is used in GifFreeSavedImages then it would be + * inefficient (The whole array is going to be deallocated.) If we just + * use it when we want to free the last Image it's convenient to do it + * here. + */ +} + +/* + * Append an image block to the SavedImages array + */ +SavedImage *GifMakeSavedImage(GifFileType *GifFile, + const SavedImage *CopyFrom) { + // cppcheck-suppress ctunullpointer + if (GifFile->SavedImages == NULL) { + GifFile->SavedImages = (SavedImage *)malloc(sizeof(SavedImage)); + } else { + SavedImage *newSavedImages = (SavedImage *)reallocarray( + GifFile->SavedImages, (GifFile->ImageCount + 1), + sizeof(SavedImage)); + if (newSavedImages == NULL) { + return ((SavedImage *)NULL); + } + GifFile->SavedImages = newSavedImages; + } + if (GifFile->SavedImages == NULL) { + return ((SavedImage *)NULL); + } else { + SavedImage *sp = &GifFile->SavedImages[GifFile->ImageCount++]; + + if (CopyFrom != NULL) { + memcpy((char *)sp, CopyFrom, sizeof(SavedImage)); + + /* + * Make our own allocated copies of the heap fields in + * the copied record. This guards against potential + * aliasing problems. 
+ */ + + /* first, the local color map */ + if (CopyFrom->ImageDesc.ColorMap != NULL) { + sp->ImageDesc.ColorMap = GifMakeMapObject( + CopyFrom->ImageDesc.ColorMap->ColorCount, + CopyFrom->ImageDesc.ColorMap->Colors); + if (sp->ImageDesc.ColorMap == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + } + + /* next, the raster */ + sp->RasterBits = (unsigned char *)reallocarray( + NULL, + (CopyFrom->ImageDesc.Height * + CopyFrom->ImageDesc.Width), + sizeof(GifPixelType)); + if (sp->RasterBits == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + memcpy(sp->RasterBits, CopyFrom->RasterBits, + sizeof(GifPixelType) * + CopyFrom->ImageDesc.Height * + CopyFrom->ImageDesc.Width); + + /* finally, the extension blocks */ + if (CopyFrom->ExtensionBlocks != NULL) { + sp->ExtensionBlocks = + (ExtensionBlock *)reallocarray( + NULL, CopyFrom->ExtensionBlockCount, + sizeof(ExtensionBlock)); + if (sp->ExtensionBlocks == NULL) { + FreeLastSavedImage(GifFile); + return (SavedImage *)(NULL); + } + memcpy(sp->ExtensionBlocks, + CopyFrom->ExtensionBlocks, + sizeof(ExtensionBlock) * + CopyFrom->ExtensionBlockCount); + } + } else { + memset((char *)sp, '\0', sizeof(SavedImage)); + } + + return (sp); + } +} + +void GifFreeSavedImages(GifFileType *GifFile) { + SavedImage *sp; + + if ((GifFile == NULL) || (GifFile->SavedImages == NULL)) { + return; + } + for (sp = GifFile->SavedImages; + sp < GifFile->SavedImages + GifFile->ImageCount; sp++) { + if (sp->ImageDesc.ColorMap != NULL) { + GifFreeMapObject(sp->ImageDesc.ColorMap); + sp->ImageDesc.ColorMap = NULL; + } + + if (sp->RasterBits != NULL) { + free((char *)sp->RasterBits); + } + + GifFreeExtensions(&sp->ExtensionBlockCount, + &sp->ExtensionBlocks); + } + free((char *)GifFile->SavedImages); + GifFile->SavedImages = NULL; +} + +/* end */ diff --git a/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c b/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c new file mode 100644 index 00000000000..e09ab245ad4 --- /dev/null +++ b/torchvision/csrc/io/image/cpu/giflib/openbsd-reallocarray.c @@ -0,0 +1,73 @@ +/* + * SPDX-FileCopyrightText: Copyright (C) 2008 Otto Moerbeek + * SPDX-License-Identifier: MIT + */ + +#include <errno.h> +#include <stdint.h> +#include <stdlib.h> +#include <string.h> + +#ifndef SIZE_MAX +#define SIZE_MAX UINTPTR_MAX +#endif + +/* + * This is sqrt(SIZE_MAX+1), as s1*s2 <= SIZE_MAX + * if both s1 < MUL_NO_OVERFLOW and s2 < MUL_NO_OVERFLOW + */ +#define MUL_NO_OVERFLOW ((size_t)1 << (sizeof(size_t) * 4)) + +void *openbsd_reallocarray(void *optr, size_t nmemb, size_t size) { + if ((nmemb >= MUL_NO_OVERFLOW || size >= MUL_NO_OVERFLOW) && + nmemb > 0 && SIZE_MAX / nmemb < size) { + errno = ENOMEM; + return NULL; + } + /* + * Head off variations in realloc behavior on different + * platforms (reported by MarkR) + * + * The behaviour of reallocarray is implementation-defined if + * nmemb or size is zero. It can return NULL or non-NULL + * depending on the platform. + * https://www.securecoding.cert.org/confluence/display/c/MEM04-C.Beware+of+zero-lengthallocations + * + * Here are some extracts from realloc man pages on different platforms. + * + * void *realloc( void *memblock, size_t size ); + * + * Windows: + * + * If there is not enough available memory to expand the block + * to the given size, the original block is left unchanged, + * and NULL is returned. If size is zero, then the block + * pointed to by memblock is freed; the return value is NULL, + * and memblock is left pointing at a freed block.
+ * + * OpenBSD: + * + * If size or nmemb is equal to 0, a unique pointer to an + * access protected, zero sized object is returned. Access via + * this pointer will generate a SIGSEGV exception. + * + * Linux: + * + * If size was equal to 0, either NULL or a pointer suitable + * to be passed to free() is returned. + * + * OS X: + * + * If size is zero and ptr is not NULL, a new, minimum sized + * object is allocated and the original object is freed. + * + * It looks like images with zero width or height can trigger + * this, and fuzzing behaviour will differ by platform, so + * fuzzing on one platform may not detect zero-size allocation + * problems on other platforms. + */ + if (size == 0 || nmemb == 0) { + return NULL; + } + return realloc(optr, size * nmemb); +} diff --git a/torchvision/csrc/io/image/image.cpp b/torchvision/csrc/io/image/image.cpp index fb5ee874acb..2909938f0bd 100644 --- a/torchvision/csrc/io/image/image.cpp +++ b/torchvision/csrc/io/image/image.cpp @@ -1,26 +1,21 @@ #include "image.h" #include <ATen/core/op_registration/op_registration.h> -#ifdef USE_PYTHON -#include <Python.h> -#endif // If we are in a Windows environment, we need to define // initialization functions for the _custom_ops extension -#ifdef USE_PYTHON #ifdef _WIN32 -PyMODINIT_FUNC PyInit_image(void) { - // No need to do anything. +void* PyInit_image(void) { return nullptr; } #endif -#endif // USE_PYTHON namespace vision { namespace image { static auto registry = torch::RegisterOperators() + .op("image::decode_gif", &decode_gif) .op("image::decode_png(Tensor data, int mode, bool allow_16_bits = False, bool apply_exif_orientation=False) -> Tensor", &decode_png) .op("image::encode_png", &encode_png) diff --git a/torchvision/csrc/io/image/image.h b/torchvision/csrc/io/image/image.h index 05bac44c77d..457b1548d4d 100644 --- a/torchvision/csrc/io/image/image.h +++ b/torchvision/csrc/io/image/image.h @@ -1,5 +1,6 @@ #pragma once +#include "cpu/decode_gif.h" #include "cpu/decode_image.h" #include "cpu/decode_jpeg.h" #include "cpu/decode_png.h" diff --git a/torchvision/csrc/io/video_reader/video_reader.cpp b/torchvision/csrc/io/video_reader/video_reader.cpp index 78b0a64d1cb..f9a5e9085d8 100644 --- a/torchvision/csrc/io/video_reader/video_reader.cpp +++ b/torchvision/csrc/io/video_reader/video_reader.cpp @@ -1,22 +1,15 @@ #include "video_reader.h" -#ifdef USE_PYTHON -#include <Python.h> -#endif - #include "../decoder/memory_buffer.h" #include "../decoder/sync_decoder.h" -#ifdef USE_PYTHON // If we are in a Windows environment, we need to define // initialization functions for the _custom_ops extension #ifdef _WIN32 -PyMODINIT_FUNC PyInit_video_reader(void) { - // No need to do anything.
+void* PyInit_video_reader(void) { return nullptr; } #endif -#endif // USE_PYTHON using namespace ffmpeg; diff --git a/torchvision/csrc/macros.h b/torchvision/csrc/macros.h index 64ca89429a9..f907280e24e 100644 --- a/torchvision/csrc/macros.h +++ b/torchvision/csrc/macros.h @@ -9,14 +9,3 @@ #else #define VISION_API #endif - -#if (defined __cpp_inline_variables) || __cplusplus >= 201703L -#define VISION_INLINE_VARIABLE inline -#else -#ifdef _MSC_VER -#define VISION_INLINE_VARIABLE __declspec(selectany) -#define HINT_MSVC_LINKER_INCLUDE_SYMBOL -#else -#define VISION_INLINE_VARIABLE __attribute__((weak)) -#endif -#endif diff --git a/torchvision/csrc/vision.cpp b/torchvision/csrc/vision.cpp index b7040cdf4de..806e870a83f 100644 --- a/torchvision/csrc/vision.cpp +++ b/torchvision/csrc/vision.cpp @@ -1,10 +1,5 @@ #include "vision.h" -#ifndef MOBILE -#ifdef USE_PYTHON -#include <Python.h> -#endif -#endif #include <torch/library.h> #ifdef WITH_CUDA @@ -16,14 +11,10 @@ // If we are in a Windows environment, we need to define // initialization functions for the _custom_ops extension. -// For PyMODINIT_FUNC to work, we need to include Python.h #if !defined(MOBILE) && defined(_WIN32) -#ifdef USE_PYTHON -PyMODINIT_FUNC PyInit__C(void) { - // No need to do anything. +void* PyInit__C(void) { return nullptr; } -#endif // USE_PYTHON #endif // !defined(MOBILE) && defined(_WIN32) namespace vision { diff --git a/torchvision/csrc/vision.h b/torchvision/csrc/vision.h index 22f8c6cdd38..651ef3ca143 100644 --- a/torchvision/csrc/vision.h +++ b/torchvision/csrc/vision.h @@ -7,10 +7,6 @@ namespace vision { VISION_API int64_t cuda_version(); namespace detail { -extern "C" VISION_INLINE_VARIABLE auto _register_ops = &cuda_version; -#ifdef HINT_MSVC_LINKER_INCLUDE_SYMBOL -#pragma comment(linker, "/include:_register_ops") -#endif - +extern "C" inline auto _register_ops = &cuda_version; } // namespace detail } // namespace vision diff --git a/torchvision/datasets/cityscapes.py b/torchvision/datasets/cityscapes.py index 6f7281f2574..969642553a1 100644 --- a/torchvision/datasets/cityscapes.py +++ b/torchvision/datasets/cityscapes.py @@ -188,7 +188,7 @@ def __getitem__(self, index: int) -> Tuple[Any, Any]: if t == "polygon": target = self._load_json(self.targets[index][i]) else: - target = Image.open(self.targets[index][i]) + target = Image.open(self.targets[index][i]) # type: ignore[assignment] targets.append(target) diff --git a/torchvision/datasets/oxford_iiit_pet.py b/torchvision/datasets/oxford_iiit_pet.py index 9fe78901626..1d6d990fdf9 100644 --- a/torchvision/datasets/oxford_iiit_pet.py +++ b/torchvision/datasets/oxford_iiit_pet.py @@ -19,6 +19,7 @@ class OxfordIIITPet(VisionDataset): ``segmentation``. Can also be a list to output a tuple with all specified target types. The types represent: - ``category`` (int): Label for one of the 37 pet categories. + - ``binary-category`` (int): Binary label for cat or dog. - ``segmentation`` (PIL image): Segmentation trimap of the image. If empty, ``None`` will be returned as target.
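For context, the new ``binary-category`` target type composes with the existing ones. A minimal usage sketch, not part of this patch (the ``./data`` root path is a hypothetical local directory):

    from torchvision.datasets import OxfordIIITPet

    # download=True fetches the images and annotations archives on first use
    dataset = OxfordIIITPet(root="./data", split="trainval", target_types=["category", "binary-category"], download=True)
    img, (category, binary_category) = dataset[0]
    # Map the integer labels back to class names via the new bin_classes attribute
    print(dataset.classes[category], dataset.bin_classes[binary_category])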
@@ -34,7 +35,7 @@ class OxfordIIITPet(VisionDataset): ("https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz", "5c4f3ee8e5d25df40f4fd59a7f44e54c"), ("https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz", "95a8c909bbe2e81eed6a22bccdf3f68f"), ) - _VALID_TARGET_TYPES = ("category", "segmentation") + _VALID_TARGET_TYPES = ("category", "binary-category", "segmentation") def __init__( self, @@ -67,12 +68,15 @@ def __init__( image_ids = [] self._labels = [] + self._bin_labels = [] with open(self._anns_folder / f"{self._split}.txt") as file: for line in file: - image_id, label, *_ = line.strip().split() + image_id, label, bin_label, _ = line.strip().split() image_ids.append(image_id) self._labels.append(int(label) - 1) + self._bin_labels.append(int(bin_label) - 1) + self.bin_classes = ["Cat", "Dog"] self.classes = [ " ".join(part.title() for part in raw_cls.split("_")) for raw_cls, _ in sorted( @@ -80,6 +84,7 @@ def __init__( key=lambda image_id_and_label: image_id_and_label[1], ) ] + self.bin_class_to_idx = dict(zip(self.bin_classes, range(len(self.bin_classes)))) self.class_to_idx = dict(zip(self.classes, range(len(self.classes)))) self._images = [self._images_folder / f"{image_id}.jpg" for image_id in image_ids] @@ -95,6 +100,8 @@ def __getitem__(self, idx: int) -> Tuple[Any, Any]: for target_type in self._target_types: if target_type == "category": target.append(self._labels[idx]) + elif target_type == "binary-category": + target.append(self._bin_labels[idx]) else: # target_type == "segmentation" target.append(Image.open(self._segs[idx])) diff --git a/torchvision/io/__init__.py b/torchvision/io/__init__.py index 8427095cea6..f38ce687a0d 100644 --- a/torchvision/io/__init__.py +++ b/torchvision/io/__init__.py @@ -21,6 +21,7 @@ VideoMetaData, ) from .image import ( + decode_gif, decode_image, decode_jpeg, decode_png, diff --git a/torchvision/io/image.py b/torchvision/io/image.py index 8d3b294b32e..5caf9b111ac 100644 --- a/torchvision/io/image.py +++ b/torchvision/io/image.py @@ -225,7 +225,7 @@ def decode_image( input: torch.Tensor, mode: ImageReadMode = ImageReadMode.UNCHANGED, apply_exif_orientation: bool = False ) -> torch.Tensor: """ - Detects whether an image is a JPEG or PNG and performs the appropriate + Detects whether an image is a JPEG, PNG or GIF and performs the appropriate operation to decode the image into a 3 dimensional RGB or grayscale Tensor. Optionally converts the image to the desired format. @@ -237,9 +237,9 @@ def decode_image( mode (ImageReadMode): the read mode used for optionally converting the image. Default: ``ImageReadMode.UNCHANGED``. See ``ImageReadMode`` class for more information on various - available modes. + available modes. Ignored for GIFs. apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor. - Default: False. + Ignored for GIFs. Default: False. Returns: output (Tensor[image_channels, image_height, image_width]) @@ -254,7 +254,7 @@ def read_image( path: str, mode: ImageReadMode = ImageReadMode.UNCHANGED, apply_exif_orientation: bool = False ) -> torch.Tensor: """ - Reads a JPEG or PNG image into a 3 dimensional RGB or grayscale Tensor. + Reads a JPEG, PNG or GIF image into a 3 dimensional RGB or grayscale Tensor. Optionally converts the image to the desired format. The values of the output tensor are uint8 in [0, 255]. @@ -263,9 +263,9 @@ def read_image( mode (ImageReadMode): the read mode used for optionally converting the image. Default: ``ImageReadMode.UNCHANGED``.
See ``ImageReadMode`` class for more information on various - available modes. + available modes. Ignored for GIFs. apply_exif_orientation (bool): apply EXIF orientation transformation to the output tensor. - Default: False. + Ignored for GIFs. Default: False. Returns: output (Tensor[image_channels, image_height, image_width]) @@ -279,3 +279,23 @@ def read_image( def _read_png_16(path: str, mode: ImageReadMode = ImageReadMode.UNCHANGED) -> torch.Tensor: data = read_file(path) return torch.ops.image.decode_png(data, mode.value, True) + + +def decode_gif(input: torch.Tensor) -> torch.Tensor: + """ + Decode a GIF image into a 3 or 4 dimensional RGB Tensor. + + The values of the output tensor are uint8 between 0 and 255. + The output tensor has shape ``(C, H, W)`` if there is only one image in the + GIF, and ``(N, C, H, W)`` if there are ``N`` images. + + Args: + input (Tensor[1]): a one dimensional contiguous uint8 tensor containing + the raw bytes of the GIF image. + + Returns: + output (Tensor[image_channels, image_height, image_width] or Tensor[num_images, image_channels, image_height, image_width]) + """ + if not torch.jit.is_scripting() and not torch.jit.is_tracing(): + _log_api_usage_once(decode_gif) + return torch.ops.image.decode_gif(input) diff --git a/torchvision/io/video.py b/torchvision/io/video.py index 73e61dac18e..9b2eacbab11 100644 --- a/torchvision/io/video.py +++ b/torchvision/io/video.py @@ -244,7 +244,7 @@ def read_video( Reads a video from a file, returning both the video frames and the audio frames Args: - filename (str): path to the video file + filename (str): path to the video file. If using the pyav backend, this can be whatever ``av.open`` accepts. start_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional): The start presentation time of the video end_pts (int if pts_unit = 'pts', float / Fraction if pts_unit = 'sec', optional): @@ -267,10 +267,9 @@ def read_video( from torchvision import get_video_backend - if not os.path.exists(filename): - raise RuntimeError(f"File not found: {filename}") - if get_video_backend() != "pyav": + if not os.path.exists(filename): + raise RuntimeError(f"File not found: {filename}") vframes, aframes, info = _video_opt._read_video(filename, start_pts, end_pts, pts_unit) else: _check_av_available() diff --git a/torchvision/models/detection/faster_rcnn.py b/torchvision/models/detection/faster_rcnn.py index 0dc9a580ffe..18474ee84f4 100644 --- a/torchvision/models/detection/faster_rcnn.py +++ b/torchvision/models/detection/faster_rcnn.py @@ -73,8 +73,12 @@ class FasterRCNN(GeneralizedRCNN): The backbone should return a single Tensor or an OrderedDict[Tensor]. num_classes (int): number of output classes of the model (including the background). If box_predictor is specified, num_classes should be None. - min_size (int): minimum size of the image to be rescaled before feeding it to the backbone - max_size (int): maximum size of the image to be rescaled before feeding it to the backbone + min_size (int): Images are rescaled before feeding them to the backbone: + we attempt to preserve the aspect ratio and scale the shorter edge + to ``min_size``. If the resulting longer edge exceeds ``max_size``, + then downscale so that the longer edge does not exceed ``max_size``. + This may result in the shorter edge being lower than ``min_size``. + max_size (int): See ``min_size``. image_mean (Tuple[float, float, float]): mean values used for input normalization.
They are generally the mean values of the dataset on which the backbone has been trained on diff --git a/torchvision/models/detection/fcos.py b/torchvision/models/detection/fcos.py index dd846aea9ad..a86ad2f424c 100644 --- a/torchvision/models/detection/fcos.py +++ b/torchvision/models/detection/fcos.py @@ -299,8 +299,12 @@ class FCOS(nn.Module): channels that each feature map has (and it should be the same for all feature maps). The backbone should return a single Tensor or an OrderedDict[Tensor]. num_classes (int): number of output classes of the model (including the background). - min_size (int): minimum size of the image to be rescaled before feeding it to the backbone - max_size (int): maximum size of the image to be rescaled before feeding it to the backbone + min_size (int): Images are rescaled before feeding them to the backbone: + we attempt to preserve the aspect ratio and scale the shorter edge + to ``min_size``. If the resulting longer edge exceeds ``max_size``, + then downscale so that the longer edge does not exceed ``max_size``. + This may result in the shorter edge being lower than ``min_size``. + max_size (int): See ``min_size``. image_mean (Tuple[float, float, float]): mean values used for input normalization. They are generally the mean values of the dataset on which the backbone has been trained on diff --git a/torchvision/models/detection/keypoint_rcnn.py b/torchvision/models/detection/keypoint_rcnn.py index 987df6603b8..0494a7329ee 100644 --- a/torchvision/models/detection/keypoint_rcnn.py +++ b/torchvision/models/detection/keypoint_rcnn.py @@ -60,8 +60,12 @@ class KeypointRCNN(FasterRCNN): The backbone should return a single Tensor or an OrderedDict[Tensor]. num_classes (int): number of output classes of the model (including the background). If box_predictor is specified, num_classes should be None. - min_size (int): minimum size of the image to be rescaled before feeding it to the backbone - max_size (int): maximum size of the image to be rescaled before feeding it to the backbone + min_size (int): Images are rescaled before feeding them to the backbone: + we attempt to preserve the aspect ratio and scale the shorter edge + to ``min_size``. If the resulting longer edge exceeds ``max_size``, + then downscale so that the longer edge does not exceed ``max_size``. + This may result in the shorter edge being lower than ``min_size``. + max_size (int): See ``min_size``. image_mean (Tuple[float, float, float]): mean values used for input normalization. They are generally the mean values of the dataset on which the backbone has been trained on diff --git a/torchvision/models/detection/mask_rcnn.py b/torchvision/models/detection/mask_rcnn.py index 862eee49fda..cdabbfd26ca 100644 --- a/torchvision/models/detection/mask_rcnn.py +++ b/torchvision/models/detection/mask_rcnn.py @@ -61,8 +61,12 @@ class MaskRCNN(FasterRCNN): The backbone should return a single Tensor or an OrderedDict[Tensor]. num_classes (int): number of output classes of the model (including the background). If box_predictor is specified, num_classes should be None. - min_size (int): minimum size of the image to be rescaled before feeding it to the backbone - max_size (int): maximum size of the image to be rescaled before feeding it to the backbone + min_size (int): Images are rescaled before feeding them to the backbone: + we attempt to preserve the aspect ratio and scale the shorter edge + to ``min_size``.
If the resulting longer edge exceeds ``max_size``, + then downscale so that the longer edge does not exceed ``max_size``. + This may result in the shorter edge being lower than ``min_size``. + max_size (int): See ``min_size``. image_mean (Tuple[float, float, float]): mean values used for input normalization. They are generally the mean values of the dataset on which the backbone has been trained on diff --git a/torchvision/models/detection/retinanet.py b/torchvision/models/detection/retinanet.py index 3a9cf80d1d5..a8cc7755014 100644 --- a/torchvision/models/detection/retinanet.py +++ b/torchvision/models/detection/retinanet.py @@ -352,8 +352,12 @@ class RetinaNet(nn.Module): channels that each feature map has (and it should be the same for all feature maps). The backbone should return a single Tensor or an OrderedDict[Tensor]. num_classes (int): number of output classes of the model (including the background). - min_size (int): minimum size of the image to be rescaled before feeding it to the backbone - max_size (int): maximum size of the image to be rescaled before feeding it to the backbone + min_size (int): Images are rescaled before feeding them to the backbone: + we attempt to preserve the aspect ratio and scale the shorter edge + to ``min_size``. If the resulting longer edge exceeds ``max_size``, + then downscale so that the longer edge does not exceed ``max_size``. + This may result in the shorter edge being lower than ``min_size``. + max_size (int): See ``min_size``. image_mean (Tuple[float, float, float]): mean values used for input normalization. They are generally the mean values of the dataset on which the backbone has been trained on diff --git a/torchvision/models/feature_extraction.py b/torchvision/models/feature_extraction.py index d8c2dca4afe..f42bc124c7b 100644 --- a/torchvision/models/feature_extraction.py +++ b/torchvision/models/feature_extraction.py @@ -204,6 +204,7 @@ def get_graph_node_names( model: nn.Module, tracer_kwargs: Optional[Dict[str, Any]] = None, suppress_diff_warning: bool = False, + concrete_args: Optional[Dict[str, Any]] = None, ) -> Tuple[List[str], List[str]]: """ Dev utility to return node names in order of execution. See note on node @@ -232,10 +233,13 @@ def get_graph_node_names( {"autowrap_modules": (math, torchvision.ops,),"leaf_modules": _get_leaf_modules_for_ops(),} WARNING: In case the user provides tracer_kwargs, above default arguments will be appended to the user provided dictionary. - suppress_diff_warning (bool, optional): whether to suppress a warning when there are discrepancies between the train and eval version of the graph. Defaults to False. + concrete_args (Optional[Dict[str, any]]): Concrete arguments that should + not be treated as Proxies. According to the `Pytorch docs + `_, + this parameter's API may not be guaranteed.
Returns: tuple(list, list): a list of node names from tracing the model in @@ -249,9 +253,9 @@ def get_graph_node_names( tracer_kwargs = _set_default_tracer_kwargs(tracer_kwargs) is_training = model.training train_tracer = NodePathTracer(**tracer_kwargs) - train_tracer.trace(model.train()) + train_tracer.trace(model.train(), concrete_args=concrete_args) eval_tracer = NodePathTracer(**tracer_kwargs) - eval_tracer.trace(model.eval()) + eval_tracer.trace(model.eval(), concrete_args=concrete_args) train_nodes = list(train_tracer.node_to_qualname.values()) eval_nodes = list(eval_tracer.node_to_qualname.values()) if not suppress_diff_warning: @@ -334,6 +338,7 @@ def create_feature_extractor( eval_return_nodes: Optional[Union[List[str], Dict[str, str]]] = None, tracer_kwargs: Optional[Dict[str, Any]] = None, suppress_diff_warning: bool = False, + concrete_args: Optional[Dict[str, Any]] = None, ) -> fx.GraphModule: """ Creates a new graph module that returns intermediate nodes from a given @@ -398,6 +403,10 @@ def create_feature_extractor( suppress_diff_warning (bool, optional): whether to suppress a warning when there are discrepancies between the train and eval version of the graph. Defaults to False. + concrete_args (Optional[Dict[str, any]]): Concrete arguments that should + not be treated as Proxies. According to the `Pytorch docs + `_, + this parameter's API may not be guaranteed. Examples:: @@ -482,7 +491,7 @@ def to_strdict(n) -> Dict[str, str]: # Instantiate our NodePathTracer and use that to trace the model tracer = NodePathTracer(**tracer_kwargs) - graph = tracer.trace(model) + graph = tracer.trace(model, concrete_args=concrete_args) name = model.__class__.__name__ if isinstance(model, nn.Module) else model.__name__ graph_module = fx.GraphModule(tracer.root, graph, name) diff --git a/torchvision/ops/roi_align.py b/torchvision/ops/roi_align.py index 0d505c140ee..ac1ec8b429a 100644 --- a/torchvision/ops/roi_align.py +++ b/torchvision/ops/roi_align.py @@ -1,9 +1,11 @@ +import functools from typing import List, Union import torch import torch._dynamo import torch.fx from torch import nn, Tensor +from torch._dynamo.utils import is_compile_supported from torch.jit.annotations import BroadcastingList2 from torch.nn.modules.utils import _pair from torchvision.extension import _assert_has_ops, _has_ops @@ -12,6 +14,24 @@ from ._utils import check_roi_boxes_shape, convert_boxes_to_roi_format +def lazy_compile(**compile_kwargs): + """Lazily wrap a function with torch.compile on the first call + + This avoids eagerly importing dynamo. + """ + + def decorate_fn(fn): + @functools.wraps(fn) + def compile_hook(*args, **kwargs): + compiled_fn = torch.compile(fn, **compile_kwargs) + globals()[fn.__name__] = functools.wraps(fn)(compiled_fn) + return compiled_fn(*args, **kwargs) + + return compile_hook + + return decorate_fn + + # NB: all inputs are tensors def _bilinear_interpolate( input, # [N, C, H, W] @@ -86,15 +106,13 @@ def maybe_cast(tensor): return tensor -# This is a slow but pure Python and differentiable implementation of -# roi_align. It potentially is a good basis for Inductor compilation -# (but I have not benchmarked it) but today it is solely used for the -# fact that its backwards can be implemented deterministically, -# which is needed for the PT2 benchmark suite. -# +# This is a pure Python and differentiable implementation of roi_align. When +# run in eager mode, it uses a lot of memory, but when compiled it has +# acceptable memory usage. 
The main point of this implementation is that +# its backwards is deterministic. # It is transcribed directly off of the roi_align CUDA kernel, see # https://dev-discuss.pytorch.org/t/a-pure-python-implementation-of-roi-align-that-looks-just-like-its-cuda-kernel/1266 -@torch._dynamo.allow_in_graph +@lazy_compile(dynamic=True) def _roi_align(input, rois, spatial_scale, pooled_height, pooled_width, sampling_ratio, aligned): orig_dtype = input.dtype @@ -232,7 +250,9 @@ def roi_align( if not isinstance(rois, torch.Tensor): rois = convert_boxes_to_roi_format(rois) if not torch.jit.is_scripting(): - if not _has_ops() or (torch.are_deterministic_algorithms_enabled() and (input.is_cuda or input.is_mps)): + if ( + not _has_ops() or (torch.are_deterministic_algorithms_enabled() and (input.is_cuda or input.is_mps)) + ) and is_compile_supported(input.device.type): return _roi_align(input, rois, spatial_scale, output_size[0], output_size[1], sampling_ratio, aligned) _assert_has_ops() return torch.ops.torchvision.roi_align( diff --git a/torchvision/prototype/utils/_internal.py b/torchvision/prototype/utils/_internal.py index 3dee4b59a7a..da1038f206d 100644 --- a/torchvision/prototype/utils/_internal.py +++ b/torchvision/prototype/utils/_internal.py @@ -99,7 +99,7 @@ def fromfile( class ReadOnlyTensorBuffer: def __init__(self, tensor: torch.Tensor) -> None: - self._memory = memoryview(tensor.numpy()) + self._memory = memoryview(tensor.numpy()) # type: ignore[arg-type] self._cursor: int = 0 def tell(self) -> int: diff --git a/torchvision/transforms/_functional_tensor.py b/torchvision/transforms/_functional_tensor.py index 88dc9ca21cc..348f01bb1e6 100644 --- a/torchvision/transforms/_functional_tensor.py +++ b/torchvision/transforms/_functional_tensor.py @@ -722,10 +722,10 @@ def perspective( return _apply_grid_transform(img, grid, interpolation, fill=fill) -def _get_gaussian_kernel1d(kernel_size: int, sigma: float) -> Tensor: +def _get_gaussian_kernel1d(kernel_size: int, sigma: float, dtype: torch.dtype, device: torch.device) -> Tensor: ksize_half = (kernel_size - 1) * 0.5 - x = torch.linspace(-ksize_half, ksize_half, steps=kernel_size) + x = torch.linspace(-ksize_half, ksize_half, steps=kernel_size, dtype=dtype, device=device) pdf = torch.exp(-0.5 * (x / sigma).pow(2)) kernel1d = pdf / pdf.sum() @@ -735,8 +735,8 @@ def _get_gaussian_kernel1d(kernel_size: int, sigma: float) -> Tensor: def _get_gaussian_kernel2d( kernel_size: List[int], sigma: List[float], dtype: torch.dtype, device: torch.device ) -> Tensor: - kernel1d_x = _get_gaussian_kernel1d(kernel_size[0], sigma[0]).to(device, dtype=dtype) - kernel1d_y = _get_gaussian_kernel1d(kernel_size[1], sigma[1]).to(device, dtype=dtype) + kernel1d_x = _get_gaussian_kernel1d(kernel_size[0], sigma[0], dtype, device) + kernel1d_y = _get_gaussian_kernel1d(kernel_size[1], sigma[1], dtype, device) kernel2d = torch.mm(kernel1d_y[:, None], kernel1d_x[None, :]) return kernel2d diff --git a/torchvision/transforms/v2/__init__.py b/torchvision/transforms/v2/__init__.py index 6dccb8a5b78..33d83f1fe3f 100644 --- a/torchvision/transforms/v2/__init__.py +++ b/torchvision/transforms/v2/__init__.py @@ -45,6 +45,7 @@ from ._misc import ( ConvertImageDtype, GaussianBlur, + GaussianNoise, Identity, Lambda, LinearTransformation, diff --git a/torchvision/transforms/v2/_augment.py b/torchvision/transforms/v2/_augment.py index cc645d6c8a8..f085ef3ca6e 100644 --- a/torchvision/transforms/v2/_augment.py +++ b/torchvision/transforms/v2/_augment.py @@ -1,7 +1,7 @@ import math import 
numbers import warnings -from typing import Any, Callable, Dict, List, Sequence, Tuple, Union +from typing import Any, Callable, Dict, List, Optional, Sequence, Tuple, Union import PIL.Image import torch @@ -142,7 +142,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: class _BaseMixUpCutMix(Transform): - def __init__(self, *, alpha: float = 1.0, num_classes: int, labels_getter="default") -> None: + def __init__(self, *, alpha: float = 1.0, num_classes: Optional[int] = None, labels_getter="default") -> None: super().__init__() self.alpha = float(alpha) self._dist = torch.distributions.Beta(torch.tensor([alpha]), torch.tensor([alpha])) @@ -162,10 +162,21 @@ def forward(self, *inputs): labels = self._labels_getter(inputs) if not isinstance(labels, torch.Tensor): raise ValueError(f"The labels must be a tensor, but got {type(labels)} instead.") - elif labels.ndim != 1: + if labels.ndim not in (1, 2): raise ValueError( - f"labels tensor should be of shape (batch_size,) " f"but got shape {labels.shape} instead." + f"labels should be index based with shape (batch_size,) " + f"or probability based with shape (batch_size, num_classes), " + f"but got a tensor of shape {labels.shape} instead." ) + if labels.ndim == 2 and self.num_classes is not None and labels.shape[-1] != self.num_classes: + raise ValueError( + f"When passing 2D labels, " + f"the number of elements in last dimension must match num_classes: " + f"{labels.shape[-1]} != {self.num_classes}. " + f"You can leave num_classes as None." + ) + if labels.ndim == 1 and self.num_classes is None: + raise ValueError("num_classes must be passed if the labels are index-based (1D)") params = { "labels": labels, @@ -198,7 +209,8 @@ def _check_image_or_video(self, inpt: torch.Tensor, *, batch_size: int): ) def _mixup_label(self, label: torch.Tensor, *, lam: float) -> torch.Tensor: - label = one_hot(label, num_classes=self.num_classes) + if label.ndim == 1: + label = one_hot(label, num_classes=self.num_classes) # type: ignore[arg-type] if not label.dtype.is_floating_point: label = label.float() return label.roll(1, 0).mul_(1.0 - lam).add_(label.mul(lam)) @@ -223,7 +235,8 @@ class MixUp(_BaseMixUpCutMix): Args: alpha (float, optional): hyperparameter of the Beta distribution used for mixup. Default is 1. - num_classes (int): number of classes in the batch. Used for one-hot-encoding. + num_classes (int, optional): number of classes in the batch. Used for one-hot-encoding. + Can be None only if the labels are already one-hot-encoded. labels_getter (callable or "default", optional): indicates how to identify the labels in the input. By default, this will pick the second parameter as the labels if it's a tensor. This covers the most common scenario where this transform is called as ``MixUp()(imgs_batch, labels_batch)``. @@ -271,7 +284,8 @@ class CutMix(_BaseMixUpCutMix): Args: alpha (float, optional): hyperparameter of the Beta distribution used for mixup. Default is 1. - num_classes (int): number of classes in the batch. Used for one-hot-encoding. + num_classes (int, optional): number of classes in the batch. Used for one-hot-encoding. + Can be None only if the labels are already one-hot-encoded. labels_getter (callable or "default", optional): indicates how to identify the labels in the input. By default, this will pick the second parameter as the labels if it's a tensor. This covers the most common scenario where this transform is called as ``CutMix()(imgs_batch, labels_batch)``.
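To illustrate the two label formats that ``MixUp`` and ``CutMix`` now accept, here is a minimal sketch; the batch size of 4 and the 10 classes are illustrative assumptions, not values from this patch:

    import torch
    from torchvision.transforms.v2 import MixUp

    imgs = torch.rand(4, 3, 224, 224)
    labels = torch.randint(0, 10, (4,))

    # Index-based (1D) labels: num_classes is required for one-hot encoding
    mixed_imgs, mixed_labels = MixUp(num_classes=10)(imgs, labels)

    # Probability-based (2D) labels, e.g. already one-hot encoded: num_classes can stay None
    one_hot_labels = torch.nn.functional.one_hot(labels, num_classes=10).float()
    mixed_imgs, mixed_labels = MixUp()(imgs, one_hot_labels)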
diff --git a/torchvision/transforms/v2/_auto_augment.py b/torchvision/transforms/v2/_auto_augment.py index 193a1b280c6..4dd7ba343aa 100644 --- a/torchvision/transforms/v2/_auto_augment.py +++ b/torchvision/transforms/v2/_auto_augment.py @@ -1,5 +1,5 @@ import math -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union +from typing import Any, Callable, cast, Dict, List, Optional, Tuple, Type, Union import PIL.Image import torch @@ -94,6 +94,8 @@ def _apply_image_or_video_transform( interpolation: Union[InterpolationMode, int], fill: Dict[Union[Type, str], _FillTypeJIT], ) -> ImageOrVideo: + # Note: this cast is wrong and is only here to make mypy happy (it disagrees with torchscript) + image = cast(torch.Tensor, image) fill_ = _get_fill(fill, type(image)) if transform_id == "Identity": @@ -322,7 +324,7 @@ def _get_policies( def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_size(image_or_video) + height, width = get_size(image_or_video) # type: ignore[arg-type] policy = self._policies[int(torch.randint(len(self._policies), ()))] @@ -411,7 +413,7 @@ def __init__( def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_size(image_or_video) + height, width = get_size(image_or_video) # type: ignore[arg-type] for _ in range(self.num_ops): transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -480,7 +482,7 @@ def __init__( def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_size(image_or_video) + height, width = get_size(image_or_video) # type: ignore[arg-type] transform_id, (magnitudes_fn, signed) = self._get_random_item(self._AUGMENTATION_SPACE) @@ -572,7 +574,7 @@ def _sample_dirichlet(self, params: torch.Tensor) -> torch.Tensor: def forward(self, *inputs: Any) -> Any: flat_inputs_with_spec, orig_image_or_video = self._flatten_and_extract_image_or_video(inputs) - height, width = get_size(orig_image_or_video) + height, width = get_size(orig_image_or_video) # type: ignore[arg-type] if isinstance(orig_image_or_video, torch.Tensor): image_or_video = orig_image_or_video @@ -613,9 +615,7 @@ def forward(self, *inputs: Any) -> Any: else: magnitude = 0.0 - aug = self._apply_image_or_video_transform( - aug, transform_id, magnitude, interpolation=self.interpolation, fill=self._fill - ) + aug = self._apply_image_or_video_transform(aug, transform_id, magnitude, interpolation=self.interpolation, fill=self._fill) # type: ignore[assignment] mix.add_(combined_weights[:, i].reshape(batch_dims) * aug) mix = mix.reshape(orig_dims).to(dtype=image_or_video.dtype) diff --git a/torchvision/transforms/v2/_misc.py b/torchvision/transforms/v2/_misc.py index ad2c08150cc..6d62539ccd7 100644 --- a/torchvision/transforms/v2/_misc.py +++ b/torchvision/transforms/v2/_misc.py @@ -205,6 +205,33 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: return self._call_kernel(F.gaussian_blur, inpt, self.kernel_size, **params) +class GaussianNoise(Transform): + """Add gaussian noise to images or videos. + + The input tensor is expected to be in [..., 1 or 3, H, W] format, + where ... means it can have an arbitrary number of leading dimensions. + Each image or frame in a batch will be transformed independently i.e. the + noise added to each image will be different. 
+ + The input tensor is also expected to be of float dtype in ``[0, 1]``. + This transform does not support PIL images. + + Args: + mean (float): Mean of the sampled normal distribution. Default is 0. + sigma (float): Standard deviation of the sampled normal distribution. Default is 0.1. + clip (bool, optional): Whether to clip the values in ``[0, 1]`` after adding noise. Default is True. + """ + + def __init__(self, mean: float = 0.0, sigma: float = 0.1, clip=True) -> None: + super().__init__() + self.mean = mean + self.sigma = sigma + self.clip = clip + + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + return self._call_kernel(F.gaussian_noise, inpt, mean=self.mean, sigma=self.sigma, clip=self.clip) + + class ToDtype(Transform): """Converts the input to a specific dtype, optionally scaling the values for images or videos. diff --git a/torchvision/transforms/v2/functional/__init__.py b/torchvision/transforms/v2/functional/__init__.py index 4d4bbf2e86d..d5705d55c4b 100644 --- a/torchvision/transforms/v2/functional/__init__.py +++ b/torchvision/transforms/v2/functional/__init__.py @@ -136,6 +136,9 @@ gaussian_blur, gaussian_blur_image, gaussian_blur_video, + gaussian_noise, + gaussian_noise_image, + gaussian_noise_video, normalize, normalize_image, normalize_video, diff --git a/torchvision/transforms/v2/functional/_color.py b/torchvision/transforms/v2/functional/_color.py index 3025f876dff..34d1e101dbd 100644 --- a/torchvision/transforms/v2/functional/_color.py +++ b/torchvision/transforms/v2/functional/_color.py @@ -687,7 +687,7 @@ def permute_channels(inpt: torch.Tensor, permutation: List[int]) -> torch.Tensor Example: >>> rgb_image = torch.rand(3, 256, 256) - >>> bgr_image = F.permutate_channels(rgb_image, permutation=[2, 1, 0]) + >>> bgr_image = F.permute_channels(rgb_image, permutation=[2, 1, 0]) Args: permutation (List[int]): Valid permutation of the input channel indices. 
The index of the element determines the @@ -730,7 +730,7 @@ def permute_channels_image(image: torch.Tensor, permutation: List[int]) -> torch @_register_kernel_internal(permute_channels, PIL.Image.Image) -def _permute_channels_image_pil(image: PIL.Image.Image, permutation: List[int]) -> PIL.Image: +def _permute_channels_image_pil(image: PIL.Image.Image, permutation: List[int]) -> PIL.Image.Image: return to_pil_image(permute_channels_image(pil_to_tensor(image), permutation=permutation)) diff --git a/torchvision/transforms/v2/functional/_geometry.py b/torchvision/transforms/v2/functional/_geometry.py index 2a1250ddf6c..67338d1a839 100644 --- a/torchvision/transforms/v2/functional/_geometry.py +++ b/torchvision/transforms/v2/functional/_geometry.py @@ -113,7 +113,7 @@ def vertical_flip_image(image: torch.Tensor) -> torch.Tensor: @_register_kernel_internal(vertical_flip, PIL.Image.Image) -def _vertical_flip_image_pil(image: PIL.Image) -> PIL.Image: +def _vertical_flip_image_pil(image: PIL.Image.Image) -> PIL.Image.Image: return _FP.vflip(image) @@ -194,7 +194,7 @@ def resize( # according to our benchmarks on eager, non-AVX CPUs should still prefer u8->f32->interpolate->u8 path for bilinear def _do_native_uint8_resize_on_cpu(interpolation: InterpolationMode) -> bool: if interpolation == InterpolationMode.BILINEAR: - if torch._dynamo.is_compiling(): + if torch.compiler.is_compiling(): return True else: return "AVX2" in torch.backends.cpu.get_cpu_capability() @@ -525,7 +525,7 @@ def _get_inverse_affine_matrix( def _compute_affine_output_size(matrix: List[float], w: int, h: int) -> Tuple[int, int]: - if torch._dynamo.is_compiling() and not torch.jit.is_scripting(): + if torch.compiler.is_compiling() and not torch.jit.is_scripting(): return _compute_affine_output_size_python(matrix, w, h) else: return _compute_affine_output_size_tensor(matrix, w, h) diff --git a/torchvision/transforms/v2/functional/_misc.py b/torchvision/transforms/v2/functional/_misc.py index 12d064f6638..84b686d50f9 100644 --- a/torchvision/transforms/v2/functional/_misc.py +++ b/torchvision/transforms/v2/functional/_misc.py @@ -181,6 +181,44 @@ def gaussian_blur_video( return gaussian_blur_image(video, kernel_size, sigma) + +def gaussian_noise(inpt: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> torch.Tensor: + """See :class:`~torchvision.transforms.v2.GaussianNoise`""" + if torch.jit.is_scripting(): + return gaussian_noise_image(inpt, mean=mean, sigma=sigma, clip=clip) + + _log_api_usage_once(gaussian_noise) + + kernel = _get_kernel(gaussian_noise, type(inpt)) + return kernel(inpt, mean=mean, sigma=sigma, clip=clip) + + +@_register_kernel_internal(gaussian_noise, torch.Tensor) +@_register_kernel_internal(gaussian_noise, tv_tensors.Image) +def gaussian_noise_image(image: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> torch.Tensor: + if not image.is_floating_point(): + raise ValueError(f"Input tensor is expected to be in float dtype, got dtype={image.dtype}") + if sigma < 0: + raise ValueError(f"sigma shouldn't be negative.
Got {sigma}") + + noise = mean + torch.randn_like(image) * sigma + out = image + noise + if clip: + out = torch.clamp(out, 0, 1) + return out + + +@_register_kernel_internal(gaussian_noise, tv_tensors.Video) +def gaussian_noise_video(video: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True) -> torch.Tensor: + return gaussian_noise_image(video, mean=mean, sigma=sigma, clip=clip) + + +@_register_kernel_internal(gaussian_noise, PIL.Image.Image) +def _gaussian_noise_pil( + video: torch.Tensor, mean: float = 0.0, sigma: float = 0.1, clip: bool = True +) -> PIL.Image.Image: + raise ValueError("Gaussian Noise is not implemented for PIL images.") + + def to_dtype(inpt: torch.Tensor, dtype: torch.dtype = torch.float, scale: bool = False) -> torch.Tensor: """See :func:`~torchvision.transforms.v2.ToDtype` for details.""" if torch.jit.is_scripting(): diff --git a/torchvision/utils.py b/torchvision/utils.py index 94b3ec65c87..6b2d19ec3dd 100644 --- a/torchvision/utils.py +++ b/torchvision/utils.py @@ -392,10 +392,10 @@ def draw_keypoints( # validate visibility if visibility is None: # set default visibility = torch.ones(keypoints.shape[:-1], dtype=torch.bool) - # If the last dimension is 1, e.g., after calling split([2, 1], dim=-1) on the output of a keypoint-prediction - # model, make sure visibility has shape (num_instances, K). - # Iff K = 1, this has unwanted behavior, but K=1 does not really make sense in the first place. - visibility = visibility.squeeze(-1) + if visibility.ndim == 3: + # If visibility was passed as pred.split([2, 1], dim=-1), it will be of shape (num_instances, K, 1). + # We make sure it is of shape (num_instances, K). This isn't documented, we're just being nice. + visibility = visibility.squeeze(-1) if visibility.ndim != 2: raise ValueError(f"visibility must be of shape (num_instances, K). Got ndim={visibility.ndim}") if visibility.shape != keypoints.shape[:-1]: