From adf944c05c11972ec89a69750c644f230ed8fa75 Mon Sep 17 00:00:00 2001 From: Gabriel Rasskin <43894452+grasskin@users.noreply.github.com> Date: Fri, 12 Jan 2024 19:43:25 +0100 Subject: [PATCH 01/30] Add `version()` API to unify with Keras and KerasNLP (#2199) * Unify `version` API with keras and keras_nlp * Formatting * Update to keep `version` parity with KerasNLP, support nightly version string * Update version_utils.py * Update version_utils.py --- keras_cv/__init__.py | 4 ++-- keras_cv/version_utils.py | 23 +++++++++++++++++++++++ pip_build.py | 13 ++++++++++++- setup.py | 20 ++++++++++++++++++++ 4 files changed, 57 insertions(+), 3 deletions(-) create mode 100644 keras_cv/version_utils.py diff --git a/keras_cv/__init__.py b/keras_cv/__init__.py index 1c9e7bcdf3..36c7d3511b 100644 --- a/keras_cv/__init__.py +++ b/keras_cv/__init__.py @@ -41,5 +41,5 @@ from keras_cv.core import FactorSampler from keras_cv.core import NormalFactorSampler from keras_cv.core import UniformFactorSampler - -__version__ = "0.8.2" +from keras_cv.version_utils import __version__ +from keras_cv.version_utils import version diff --git a/keras_cv/version_utils.py b/keras_cv/version_utils.py new file mode 100644 index 0000000000..527546c643 --- /dev/null +++ b/keras_cv/version_utils.py @@ -0,0 +1,23 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_cv.api_export import keras_cv_export + +# Unique source of truth for the version number. +__version__ = "0.8.2" + + +@keras_cv_export("keras_cv.version") +def version(): + return __version__ diff --git a/pip_build.py b/pip_build.py index 29f574001f..ba61963697 100644 --- a/pip_build.py +++ b/pip_build.py @@ -64,11 +64,22 @@ def export_version_string(version, is_nightly=False): ) f.write(setup_contents) + # Overwrite the version string with our package version. 
+ with open(os.path.join(package, "src", "version_utils.py")) as f: + version_contents = f.readlines() + with open(os.path.join(package, "src", "version_utils.py"), "w") as f: + for line in version_contents: + if line.startswith("__version__"): + f.write(f'__version__ = "{version}"\n') + else: + f.write(line) + # Make sure to export the __version__ string with open(os.path.join(package, "__init__.py")) as f: init_contents = f.read() with open(os.path.join(package, "__init__.py"), "w") as f: - f.write(init_contents + "\n\n" + f'__version__ = "{version}"\n') + f.write(init_contents) + f.write("from keras_cv.src.version_utils import __version__\n") def copy_source_to_build_directory(root_path): diff --git a/setup.py b/setup.py index ffe7cbb4a8..19dc42248c 100644 --- a/setup.py +++ b/setup.py @@ -21,6 +21,21 @@ from setuptools import setup from setuptools.dist import Distribution + +def read(rel_path): + here = os.path.abspath(os.path.dirname(__file__)) + with open(os.path.join(here, rel_path)) as fp: + return fp.read() + + +def get_version(rel_path): + for line in read(rel_path).splitlines(): + if line.startswith("__version__"): + delim = '"' if '"' in line else "'" + return line.split(delim)[1] + raise RuntimeError("Unable to find version string.") + + BUILD_WITH_CUSTOM_OPS = ( "BUILD_WITH_CUSTOM_OPS" in os.environ and os.environ["BUILD_WITH_CUSTOM_OPS"] == "true" @@ -28,6 +43,10 @@ HERE = pathlib.Path(__file__).parent README = (HERE / "README.md").read_text() +if os.path.exists("keras_cv/version_utils.py"): + VERSION = get_version("keras_cv/version_utils.py") +else: + VERSION = get_version("keras_cv/src/version_utils.py") class BinaryDistribution(Distribution): @@ -45,6 +64,7 @@ def is_pure(self): description="Industry-strength computer Vision extensions for Keras.", long_description=README, long_description_content_type="text/markdown", + version=VERSION, url="https://github.com/keras-team/keras-cv", author="Keras team", author_email="keras-cv@google.com", From 9fbf36b9bd57f2fb8f4c93192106fd60e8c4e605 Mon Sep 17 00:00:00 2001 From: Sachin Prasad Date: Fri, 12 Jan 2024 15:18:56 -0800 Subject: [PATCH 02/30] Update random_crop_and_zoom.py (#2294) * Update random_crop_and_zoom.py * Update description * rename file --- ...random_crop_and_zoom.py => random_crop_and_resize_demo.py} | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) rename examples/layers/preprocessing/classification/{random_crop_and_zoom.py => random_crop_and_resize_demo.py} (90%) diff --git a/examples/layers/preprocessing/classification/random_crop_and_zoom.py b/examples/layers/preprocessing/classification/random_crop_and_resize_demo.py similarity index 90% rename from examples/layers/preprocessing/classification/random_crop_and_zoom.py rename to examples/layers/preprocessing/classification/random_crop_and_resize_demo.py index 3fe8aa1e00..cb8f9e5ffe 100644 --- a/examples/layers/preprocessing/classification/random_crop_and_zoom.py +++ b/examples/layers/preprocessing/classification/random_crop_and_resize_demo.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""random_resized_crop_demo.py.py shows how to use the RandomResizedCrop -preprocessing layer. Operates on an image of elephant. In this script the image +"""This demo example shows how to use the RandomCropAndResize preprocessing +layer. Operates on an image of elephant. In this script the image is loaded, then are passed through the preprocessing layers. Finally, they are shown using matplotlib. 
""" From a6a9cc23432a6a319f19356f9bae9ca67e1bc78e Mon Sep 17 00:00:00 2001 From: Haifeng Jin <5476582+haifeng-jin@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:57:05 -0800 Subject: [PATCH 03/30] bug fix (#2303) --- keras_cv/models/stable_diffusion/stable_diffusion.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/keras_cv/models/stable_diffusion/stable_diffusion.py b/keras_cv/models/stable_diffusion/stable_diffusion.py index 299f44d3d0..a68923dc78 100644 --- a/keras_cv/models/stable_diffusion/stable_diffusion.py +++ b/keras_cv/models/stable_diffusion/stable_diffusion.py @@ -209,7 +209,10 @@ def generate_image( latent = self._get_initial_diffusion_noise(batch_size, seed) # Iterative reverse diffusion stage - timesteps = np.arange(1, 1000, 1000 // num_steps) + num_timesteps = 1000 + ratio = (num_timesteps - 1) / (num_steps - 1) + timesteps = (np.arange(0, num_steps) * ratio).round().astype(np.int64) + alphas, alphas_prev = self._get_initial_alphas(timesteps) progbar = keras.utils.Progbar(len(timesteps)) iteration = 0 From 5796b76f9c30f37d8a3e4c60fda3ed9b25567635 Mon Sep 17 00:00:00 2001 From: Hamid Ali Date: Thu, 18 Jan 2024 01:59:15 +0500 Subject: [PATCH 04/30] Add BASNet Segmentation Model (#2006) (#2271) * BASNet model initial code structure * adding test and initial preset details * adding comments * cleaning and formatting code * keras 3 support added * disabling preset test for BASNet --- keras_cv/models/__init__.py | 1 + keras_cv/models/segmentation/__init__.py | 1 + .../models/segmentation/basnet/__init__.py | 15 + keras_cv/models/segmentation/basnet/basnet.py | 454 ++++++++++++++++++ .../segmentation/basnet/basnet_presets.py | 51 ++ .../models/segmentation/basnet/basnet_test.py | 138 ++++++ 6 files changed, 660 insertions(+) create mode 100644 keras_cv/models/segmentation/basnet/__init__.py create mode 100644 keras_cv/models/segmentation/basnet/basnet.py create mode 100644 keras_cv/models/segmentation/basnet/basnet_presets.py create mode 100644 keras_cv/models/segmentation/basnet/basnet_test.py diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index ae775ed824..b9b90b946a 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -190,6 +190,7 @@ from keras_cv.models.object_detection.yolo_v8.yolo_v8_detector import ( YOLOV8Detector, ) +from keras_cv.models.segmentation import BASNet from keras_cv.models.segmentation import DeepLabV3Plus from keras_cv.models.segmentation import SAMMaskDecoder from keras_cv.models.segmentation import SAMPromptEncoder diff --git a/keras_cv/models/segmentation/__init__.py b/keras_cv/models/segmentation/__init__.py index aa4ffab4a4..13a9795dda 100644 --- a/keras_cv/models/segmentation/__init__.py +++ b/keras_cv/models/segmentation/__init__.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from keras_cv.models.segmentation.basnet import BASNet from keras_cv.models.segmentation.deeplab_v3_plus import DeepLabV3Plus from keras_cv.models.segmentation.segformer import SegFormer from keras_cv.models.segmentation.segment_anything import SAMMaskDecoder diff --git a/keras_cv/models/segmentation/basnet/__init__.py b/keras_cv/models/segmentation/basnet/__init__.py new file mode 100644 index 0000000000..b51fd6c004 --- /dev/null +++ b/keras_cv/models/segmentation/basnet/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_cv.models.segmentation.basnet.basnet import BASNet diff --git a/keras_cv/models/segmentation/basnet/basnet.py b/keras_cv/models/segmentation/basnet/basnet.py new file mode 100644 index 0000000000..2803d4425c --- /dev/null +++ b/keras_cv/models/segmentation/basnet/basnet.py @@ -0,0 +1,454 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.models import utils +from keras_cv.models.backbones.backbone_presets import backbone_presets +from keras_cv.models.backbones.resnet_v1.resnet_v1_backbone import ( + apply_basic_block as resnet_basic_block, +) +from keras_cv.models.segmentation.basnet.basnet_presets import basnet_presets +from keras_cv.models.segmentation.basnet.basnet_presets import ( + presets_no_weights, +) +from keras_cv.models.segmentation.basnet.basnet_presets import ( + presets_with_weights, +) +from keras_cv.models.task import Task +from keras_cv.utils.python_utils import classproperty + + +@keras_cv_export( + [ + "keras_cv.models.BASNet", + "keras_cv.models.segmentation.BASNet", + ] +) +class BASNet(Task): + """ + A Keras model implementing the BASNet architecture for semantic + segmentation. + + References: + - [BASNet: Boundary-Aware Segmentation Network for Mobile and Web Applications](https://arxiv.org/abs/2101.04704) + + Args: + backbone: `keras.Model`. The backbone network for the model that is + used as a feature extractor for BASNet prediction encoder. Currently + supported backbones are ResNet18 and ResNet34. Default backbone is + `keras_cv.models.ResNet34Backbone()` + (Note: Do not specify 'input_shape', 'input_tensor', or 'include_rescaling' + within the backbone. Please provide these while initializing the + 'BASNet' model.) + num_classes: int, the number of classes for the segmentation model. 
+ input_shape: optional shape tuple, defaults to (None, None, 3). + input_tensor: optional Keras tensor (i.e., output of `layers.Input()`) + to use as image input for the model. + include_rescaling: bool, whether to rescale the inputs. If set + to `True`, inputs will be passed through a `Rescaling(1/255.0)` + layer. + projection_filters: int, number of filters in the convolution layer + projecting low-level features from the `backbone`. + prediction_heads: (Optional) List of `keras.layers.Layer` defining + the prediction module head for the model. If not provided, a + default head is created with a Conv2D layer followed by resizing. + refinement_head: (Optional) a `keras.layers.Layer` defining the + refinement module head for the model. If not provided, a default + head is created with a Conv2D layer. + + Examples: + ```python + + import keras_cv + + images = np.ones(shape=(1, 288, 288, 3)) + labels = np.zeros(shape=(1, 288, 288, 1)) + + # Note: Do not specify 'input_shape', 'input_tensor', or + # 'include_rescaling' within the backbone. + backbone = keras_cv.models.ResNet34Backbone() + model = keras_cv.models.segmentation.BASNet( + backbone=backbone, + num_classes=1, + input_shape=[288, 288, 3], + include_rescaling=False + ) + + # Evaluate model + output = model(images) + pred_labels = output[0] + + # Train model + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(from_logits=False), + metrics=["accuracy"], + ) + model.fit(images, labels, epochs=3) + ``` + """ # noqa: E501 + + def __init__( + self, + backbone, + num_classes, + input_shape=(None, None, 3), + input_tensor=None, + include_rescaling=False, + projection_filters=64, + prediction_heads=None, + refinement_head=None, + **kwargs, + ): + if not isinstance(backbone, keras.layers.Layer) or not isinstance( + backbone, keras.Model + ): + raise ValueError( + "Argument `backbone` must be a `keras.layers.Layer` instance" + f" or `keras.Model`. Received instead" + f" backbone={backbone} (of type {type(backbone)})." + ) + + if backbone.input_shape != (None, None, None, 3): + raise ValueError( + "Do not specify 'input_shape' or 'input_tensor' within the" + " 'BASNet' backbone. \nPlease provide 'input_shape' or" + " 'input_tensor' while initializing the 'BASNet' model." + ) + + inputs = utils.parse_model_inputs(input_shape, input_tensor) + x = inputs + + if include_rescaling: + x = keras.layers.Rescaling(1 / 255.0)(x) + + if prediction_heads is None: + prediction_heads = [] + for size in (1, 2, 4, 8, 16, 32, 32): + head_layers = [ + keras.layers.Conv2D( + num_classes, kernel_size=(3, 3), padding="same" + ) + ] + if size != 1: + head_layers.append( + keras.layers.UpSampling2D( + size=size, interpolation="bilinear" + ) + ) + prediction_heads.append(keras.Sequential(head_layers)) + + if refinement_head is None: + refinement_head = keras.Sequential( + [ + keras.layers.Conv2D( + num_classes, kernel_size=(3, 3), padding="same" + ), + ] + ) + + # Prediction model. + predict_model = basnet_predict( + x, backbone, projection_filters, prediction_heads + ) + + # Refinement model. + refine_model = basnet_rrm( + predict_model, projection_filters, refinement_head + ) + + outputs = refine_model.outputs # Combine outputs. + outputs.extend(predict_model.outputs) + + outputs = [ + keras.layers.Activation("sigmoid", dtype="float32")(_) + for _ in outputs + ] # Activations. 
+ + super().__init__(inputs=inputs, outputs=outputs, **kwargs) + + self.backbone = backbone + self.num_classes = num_classes + self.input_tensor = input_tensor + self.include_rescaling = include_rescaling + self.projection_filters = projection_filters + self.prediction_heads = prediction_heads + self.refinement_head = refinement_head + + def get_config(self): + return { + "backbone": keras.saving.serialize_keras_object(self.backbone), + "num_classes": self.num_classes, + "input_shape": self.input_shape[1:], + "input_tensor": keras.saving.serialize_keras_object( + self.input_tensor + ), + "include_rescaling": self.include_rescaling, + "projection_filters": self.projection_filters, + "prediction_heads": [ + keras.saving.serialize_keras_object(prediction_head) + for prediction_head in self.prediction_heads + ], + "refinement_head": keras.saving.serialize_keras_object( + self.refinement_head + ), + } + + @classmethod + def from_config(cls, config): + if "backbone" in config and isinstance(config["backbone"], dict): + input_shape = (None, None, 3) + if isinstance(config["backbone"]["config"]["input_shape"], list): + input_shape = list(input_shape) + if config["backbone"]["config"]["input_shape"] != input_shape: + config["input_shape"] = config["backbone"]["config"][ + "input_shape" + ] + config["backbone"]["config"]["input_shape"] = input_shape + config["backbone"] = keras.layers.deserialize(config["backbone"]) + + if "input_tensor" in config and isinstance( + config["input_tensor"], dict + ): + config["input_tensor"] = keras.layers.deserialize( + config["input_tensor"] + ) + + if "prediction_heads" in config and isinstance( + config["prediction_heads"], list + ): + for i in range(len(config["prediction_heads"])): + if isinstance(config["prediction_heads"][i], dict): + config["prediction_heads"][i] = keras.layers.deserialize( + config["prediction_heads"][i] + ) + + if "refinement_head" in config and isinstance( + config["refinement_head"], dict + ): + config["refinement_head"] = keras.layers.deserialize( + config["refinement_head"] + ) + return super().from_config(config) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + filtered_backbone_presets = copy.deepcopy( + { + k: v + for k, v in backbone_presets.items() + if k in ("resnet18", "resnet34") + } + ) + + return copy.deepcopy({**filtered_backbone_presets, **basnet_presets}) + + @classproperty + def presets_with_weights(cls): + """ + Dictionary of preset names and configurations that include weights. + """ + return copy.deepcopy(presets_with_weights) + + @classproperty + def presets_without_weights(cls): + """ + Dictionary of preset names and configurations that has no weights. + """ + return copy.deepcopy(presets_no_weights) + + @classproperty + def backbone_presets(cls): + """ + Dictionary of preset names and configurations of compatible backbones. + """ + filtered_backbone_presets = copy.deepcopy( + { + k: v + for k, v in backbone_presets.items() + if k in ("resnet18", "resnet34") + } + ) + filtered_presets = copy.deepcopy(filtered_backbone_presets) + return filtered_presets + + +def convolution_block(x_input, filters, dilation=1): + """ + Apply convolution + batch normalization + ReLU activation. + + Args: + x_input: Input keras tensor. + filters: int, number of output filters in the convolution. + dilation: int, dilation rate for the convolution operation. + Defaults to 1. + + Returns: + A tensor with convolution, batch normalization, and ReLU + activation applied. 
+ """ + x = keras.layers.Conv2D( + filters, (3, 3), padding="same", dilation_rate=dilation + )(x_input) + x = keras.layers.BatchNormalization()(x) + return keras.layers.Activation("relu")(x) + + +def get_resnet_block(_resnet, block_num): + """ + Extract and return a specific ResNet block. + + Args: + _resnet: `keras.Model`. ResNet model instance. + block_num: int, block number to extract. + + Returns: + A Keras Model representing the specified ResNet block. + """ + + extractor_levels = ["P2", "P3", "P4", "P5"] + return keras.models.Model( + inputs=_resnet.get_layer(f"v2_stack_{block_num}_block1_1_conv").input, + outputs=_resnet.get_layer( + _resnet.pyramid_level_inputs[extractor_levels[block_num]] + ).output, + name=f"resnet_block{block_num + 1}", + ) + + +def basnet_predict(x_input, backbone, filters, segmentation_heads): + """ + BASNet Prediction Module. + + This module outputs a coarse label map by integrating heavy + encoder, bridge, and decoder blocks. + + Args: + x_input: Input keras tensor. + backbone: `keras.Model`. The backbone network used as a feature + extractor for BASNet prediction encoder. + filters: int, the number of filters. + segmentation_heads: List of `keras.layers.Layer`, A list of Keras + layers serving as the segmentation head for prediction module. + + + Returns: + A Keras Model that integrates the encoder, bridge, and decoder + blocks for coarse label map prediction. + """ + num_stages = 6 + + x = x_input + + # -------------Encoder-------------- + x = keras.layers.Conv2D(filters, kernel_size=(3, 3), padding="same")(x) + + encoder_blocks = [] + for i in range(num_stages): + if i < 4: # First four stages are adopted from ResNet backbone. + x = get_resnet_block(backbone, i)(x) + encoder_blocks.append(x) + else: # Last 2 stages consist of three basic resnet blocks. + x = keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x) + for j in range(3): + x = resnet_basic_block( + x, + filters=x.shape[3], + conv_shortcut=False, + name=f"v1_basic_block_{i + 1}_{j + 1}", + ) + encoder_blocks.append(x) + + # -------------Bridge------------- + x = convolution_block(x, filters=filters * 8, dilation=2) + x = convolution_block(x, filters=filters * 8, dilation=2) + x = convolution_block(x, filters=filters * 8, dilation=2) + encoder_blocks.append(x) + + # -------------Decoder------------- + decoder_blocks = [] + for i in reversed(range(num_stages)): + if i != (num_stages - 1): # Except first, scale other decoder stages. + x = keras.layers.UpSampling2D(size=2, interpolation="bilinear")(x) + + x = keras.layers.concatenate([encoder_blocks[i], x], axis=-1) + x = convolution_block(x, filters=filters * 8) + x = convolution_block(x, filters=filters * 8) + x = convolution_block(x, filters=filters * 8) + decoder_blocks.append(x) + + decoder_blocks.reverse() # Change order from last to first decoder stage. + decoder_blocks.append(encoder_blocks[-1]) # Copy bridge to decoder. + + # -------------Side Outputs-------------- + decoder_blocks = [ + segmentation_head(decoder_block) # Prediction segmentation head. + for segmentation_head, decoder_block in zip( + segmentation_heads, decoder_blocks + ) + ] + + return keras.models.Model(inputs=[x_input], outputs=decoder_blocks) + + +def basnet_rrm(base_model, filters, segmentation_head): + """ + BASNet Residual Refinement Module (RRM). + + This module outputs a fine label map by integrating light encoder, + bridge, and decoder blocks. + + Args: + base_model: Keras model used as the base or coarse label map. + filters: int, the number of filters. 
+ segmentation_head: a `keras.layers.Layer`, A Keras layer serving + as the segmentation head for refinement module. + + Returns: + A Keras Model that constructs the Residual Refinement Module (RRM). + """ + num_stages = 4 + + x_input = base_model.output[0] + + # -------------Encoder-------------- + x = keras.layers.Conv2D(filters, kernel_size=(3, 3), padding="same")( + x_input + ) + + encoder_blocks = [] + for _ in range(num_stages): + x = convolution_block(x, filters=filters) + encoder_blocks.append(x) + x = keras.layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x) + + # -------------Bridge-------------- + x = convolution_block(x, filters=filters) + + # -------------Decoder-------------- + for i in reversed(range(num_stages)): + x = keras.layers.UpSampling2D(size=2, interpolation="bilinear")(x) + x = keras.layers.concatenate([encoder_blocks[i], x], axis=-1) + x = convolution_block(x, filters=filters) + + x = segmentation_head(x) # Refinement segmentation head. + + # ------------- refined = coarse + residual + x = keras.layers.Add()([x_input, x]) # Add prediction + refinement output + + return keras.models.Model(inputs=base_model.input, outputs=[x]) diff --git a/keras_cv/models/segmentation/basnet/basnet_presets.py b/keras_cv/models/segmentation/basnet/basnet_presets.py new file mode 100644 index 0000000000..69d323fd0f --- /dev/null +++ b/keras_cv/models/segmentation/basnet/basnet_presets.py @@ -0,0 +1,51 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BASNet model preset configurations.""" + +from keras_cv.models.backbones.resnet_v1 import resnet_v1_backbone_presets + +presets_no_weights = { + "basnet_resnet18": { + "metadata": { + "description": "BASNet with a ResNet18 v1 backbone.", + "params": 98780872, + "official_name": "BASNet", + "path": "basnet_resnet18", + }, + "config": { + "backbone": resnet_v1_backbone_presets.backbone_presets["resnet18"], + "num_classes": 1, + "input_shape": (288, 288, 3), + }, + }, + "basnet_resnet34": { + "metadata": { + "description": "BASNet with a ResNet34 v1 backbone.", + "params": 108896456, + "official_name": "BASNet", + "path": "basnet_resnet34", + }, + "config": { + "backbone": resnet_v1_backbone_presets.backbone_presets["resnet34"], + "num_classes": 1, + "input_shape": (288, 288, 3), + }, + }, +} + +presets_with_weights = { + # TODO: Add BASNet preset with weights +} + +basnet_presets = {**presets_no_weights, **presets_with_weights} diff --git a/keras_cv/models/segmentation/basnet/basnet_test.py b/keras_cv/models/segmentation/basnet/basnet_test.py new file mode 100644 index 0000000000..81ebd8e13e --- /dev/null +++ b/keras_cv/models/segmentation/basnet/basnet_test.py @@ -0,0 +1,138 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest +import tensorflow as tf +from absl.testing import parameterized + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.backend.config import keras_3 +from keras_cv.models import BASNet +from keras_cv.models import ResNet34Backbone +from keras_cv.tests.test_case import TestCase + + +class BASNetTest(TestCase): + def test_basnet_construction(self): + backbone = ResNet34Backbone() + model = BASNet( + input_shape=[288, 288, 3], backbone=backbone, num_classes=1 + ) + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(), + metrics=["accuracy"], + ) + + @pytest.mark.large + def test_basnet_call(self): + backbone = ResNet34Backbone() + model = BASNet( + input_shape=[288, 288, 3], backbone=backbone, num_classes=1 + ) + images = np.random.uniform(size=(2, 288, 288, 3)) + _ = model(images) + _ = model.predict(images) + + @pytest.mark.large + @pytest.mark.filterwarnings("ignore::UserWarning") + def test_weights_change(self): + input_size = [288, 288, 3] + target_size = [288, 288, 1] + + images = np.ones([1] + input_size) + labels = np.random.uniform(size=[1] + target_size) + ds = tf.data.Dataset.from_tensor_slices((images, labels)) + ds = ds.repeat(2) + ds = ds.batch(2) + + backbone = ResNet34Backbone() + model = BASNet( + input_shape=[288, 288, 3], backbone=backbone, num_classes=1 + ) + model_metrics = ["accuracy"] + if keras_3(): + model_metrics = ["accuracy" for _ in range(8)] + + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(), + metrics=model_metrics, + ) + + original_weights = model.refinement_head.get_weights() + model.fit(ds, epochs=1) + updated_weights = model.refinement_head.get_weights() + + for w1, w2 in zip(original_weights, updated_weights): + self.assertNotAllEqual(w1, w2) + self.assertFalse(ops.any(ops.isnan(w2))) + + @pytest.mark.large + def test_with_model_preset_forward_pass(self): + self.skipTest("Skipping preset test until BASNet weights are added.") + model = BASNet.from_preset( + "basnet_resnet34", + ) + image = np.ones((1, 288, 288, 3)) + output = ops.expand_dims(ops.argmax(model(image), axis=-1), axis=-1) + output = output[0] + expected_output = np.zeros((1, 288, 288, 1)) + self.assertAllClose(output, expected_output) + + @pytest.mark.large + def test_saved_model(self): + target_size = [288, 288, 3] + + backbone = ResNet34Backbone() + model = BASNet( + input_shape=[288, 288, 3], backbone=backbone, num_classes=1 + ) + + input_batch = np.ones(shape=[2] + target_size) + model_output = model(input_batch) + + save_path = os.path.join(self.get_temp_dir(), "model.keras") + if keras_3(): + model.save(save_path) + else: + model.save(save_path, save_format="keras_v3") + restored_model = keras.models.load_model(save_path) + + # Check we got the real object back. + self.assertIsInstance(restored_model, BASNet) + + # Check that output matches. 
+ restored_output = restored_model(input_batch) + self.assertAllClose(model_output, restored_output) + + +@pytest.mark.large +class BASNetSmokeTest(TestCase): + @parameterized.named_parameters( + *[(preset, preset) for preset in ["resnet18", "resnet34"]] + ) + def test_backbone_preset(self, preset): + model = BASNet.from_preset( + preset, + num_classes=1, + ) + xs = np.random.uniform(size=(1, 128, 128, 3)) + output = model(xs)[0] + + self.assertEqual(output.shape, (1, 128, 128, 1)) From 01e5c34091e67a63186d148c9cc3b9e1fbeb1dd8 Mon Sep 17 00:00:00 2001 From: Sachin Prasad Date: Thu, 18 Jan 2024 15:59:38 -0800 Subject: [PATCH 05/30] Fix image.shape type (#2305) Fixed image.shape issue for tensorflow backend --- keras_cv/layers/object_detection/anchor_generator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/layers/object_detection/anchor_generator.py b/keras_cv/layers/object_detection/anchor_generator.py index 30dd421afd..effc125143 100644 --- a/keras_cv/layers/object_detection/anchor_generator.py +++ b/keras_cv/layers/object_detection/anchor_generator.py @@ -172,7 +172,7 @@ def __call__(self, image=None, image_shape=None): "Expected `image` to be a Tensor of rank 3. Got " f"image.shape.rank={len(image.shape)}" ) - image_shape = image.shape + image_shape = tuple(image.shape) results = {} for key, generator in self.anchor_generators.items(): From d73aa8cb64252130fc4fb4ed9d0601a9c46d159f Mon Sep 17 00:00:00 2001 From: Sachin Prasad Date: Mon, 29 Jan 2024 13:02:18 -0800 Subject: [PATCH 06/30] Create workflow for auto assignment of issues and for stale issues (#2313) * Create auto-assignment.yaml * Create auto-assignment.js * Create stale-issue-pr.yaml * Rename auto-assignment.yaml to auto-assignment.yml * Rename stale-issue-pr.yaml to stale-issue-pr.yml --- .github/workflows/auto-assignment.yml | 21 ++++++++ .github/workflows/scripts/auto-assignment.js | 43 +++++++++++++++++ .github/workflows/stale-issue-pr.yml | 50 ++++++++++++++++++++ 3 files changed, 114 insertions(+) create mode 100644 .github/workflows/auto-assignment.yml create mode 100644 .github/workflows/scripts/auto-assignment.js create mode 100644 .github/workflows/stale-issue-pr.yml diff --git a/.github/workflows/auto-assignment.yml b/.github/workflows/auto-assignment.yml new file mode 100644 index 0000000000..de72da8ba2 --- /dev/null +++ b/.github/workflows/auto-assignment.yml @@ -0,0 +1,21 @@ +name: auto-assignment +on: + issues: + types: + - opened + +permissions: + contents: read + issues: write + pull-requests: write + +jobs: + welcome: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/github-script@v7 + with: + script: | + const script = require('./\.github/workflows/scripts/auto-assignment.js') + script({github, context}) diff --git a/.github/workflows/scripts/auto-assignment.js b/.github/workflows/scripts/auto-assignment.js new file mode 100644 index 0000000000..d08b06a8b7 --- /dev/null +++ b/.github/workflows/scripts/auto-assignment.js @@ -0,0 +1,43 @@ +/** Automatically assign issues and PRs to users in the `assigneesList` + * on a rotating basis. + + @param {!object} + GitHub objects can call GitHub APIs using their built-in library functions. + The context object contains issue and PR details. +*/ + +module.exports = async ({ github, context }) => { + let issueNumber; + let assigneesList; + // Is this an issue? If so, assign the issue number. Otherwise, assign the PR number. + if (context.payload.issue) { + //assignee List for issues. 
+ assigneesList = ["SuryanarayanaY", "sachinprasadhs"]; + issueNumber = context.payload.issue.number; + } else { + //assignee List for PRs. + assigneesList = []; + issueNumber = context.payload.number; + } + console.log("assignee list", assigneesList); + console.log("entered auto assignment for this issue: ", issueNumber); + if (!assigneesList.length) { + console.log("No assignees found for this repo."); + return; + } + let noOfAssignees = assigneesList.length; + let selection = issueNumber % noOfAssignees; + let assigneeForIssue = assigneesList[selection]; + + console.log( + "issue Number = ", + issueNumber + " , assigning to: ", + assigneeForIssue + ); + return github.rest.issues.addAssignees({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + assignees: [assigneeForIssue], + }); +}; diff --git a/.github/workflows/stale-issue-pr.yml b/.github/workflows/stale-issue-pr.yml new file mode 100644 index 0000000000..034fb4c266 --- /dev/null +++ b/.github/workflows/stale-issue-pr.yml @@ -0,0 +1,50 @@ +name: Close inactive issues +on: + schedule: + - cron: "30 1 * * *" +jobs: + close-issues: + runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write + steps: + - name: Awaiting response issues + uses: actions/stale@v9 + with: + days-before-issue-stale: 14 + days-before-issue-close: 14 + stale-issue-label: "stale" + # reason for closed the issue default value is not_planned + close-issue-reason: completed + only-labels: "stat:awaiting response from contributor" + stale-issue-message: > + This issue is stale because it has been open for 14 days with no activity. + It will be closed if no further activity occurs. Thank you. + # List of labels to remove when issues/PRs unstale. + labels-to-remove-when-unstale: "stat:awaiting response from contributor" + close-issue-message: > + This issue was closed because it has been inactive for 28 days. + Please reopen if you'd like to work on this further. + days-before-pr-stale: 14 + days-before-pr-close: 14 + stale-pr-message: "This PR is stale because it has been open for 14 days with no activity. It will be closed if no further activity occurs. Thank you." + close-pr-message: "This PR was closed because it has been inactive for 28 days. Please reopen if you'd like to work on this further." + repo-token: ${{ secrets.GITHUB_TOKEN }} + - name: Contribution issues + uses: actions/stale@v9 + with: + days-before-issue-stale: 180 + days-before-issue-close: 365 + stale-issue-label: "stale" + # reason for closed the issue default value is not_planned + close-issue-reason: not_planned + any-of-labels: "stat:contributions welcome,good first issue" + # List of labels to remove when issues/PRs unstale. + labels-to-remove-when-unstale: "stat:contributions welcome,good first issue" + stale-issue-message: > + This issue is stale because it has been open for 180 days with no activity. + It will be closed if no further activity occurs. Thank you. + close-issue-message: > + This issue was closed because it has been inactive for more than 1 year. 
+ repo-token: ${{ secrets.GITHUB_TOKEN }} From 813d43d69319dfbfd265b3a14d54e1e642254b1d Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Tue, 30 Jan 2024 13:03:02 -0600 Subject: [PATCH 07/30] Fix format and Update Vectorized Base (#2323) --- benchmarks/vectorized_randomly_zoomed_crop.py | 8 ++++---- .../base_image_augmentation_layer.py | 12 +++++------ .../preprocessing/random_crop_and_resize.py | 8 ++++---- ...ectorized_base_image_augmentation_layer.py | 20 +++++++++++-------- ...ized_base_image_augmentation_layer_test.py | 12 +++++++++++ .../layers/regularization/squeeze_excite.py | 8 ++++---- keras_cv/layers/vit_det_layers.py | 6 +++--- .../object_detection/box_coco_metrics.py | 6 +++--- .../backbones/densenet/densenet_backbone.py | 6 +++--- .../backbones/resnet_v1/resnet_v1_backbone.py | 6 +++--- .../backbones/resnet_v2/resnet_v2_backbone.py | 6 +++--- .../backbones/vit_det/vit_det_backbone.py | 6 +++--- keras_cv/models/legacy/darknet.py | 1 - keras_cv/models/legacy/mlp_mixer.py | 1 - .../yolo_v8/yolo_v8_backbone.py | 6 +++--- .../yolo_v8/yolo_v8_detector.py | 6 +++--- .../stable_diffusion/noise_scheduler.py | 4 +--- 17 files changed, 67 insertions(+), 55 deletions(-) diff --git a/benchmarks/vectorized_randomly_zoomed_crop.py b/benchmarks/vectorized_randomly_zoomed_crop.py index 4e807fd1ab..3a207ed2e3 100644 --- a/benchmarks/vectorized_randomly_zoomed_crop.py +++ b/benchmarks/vectorized_randomly_zoomed_crop.py @@ -249,10 +249,10 @@ def from_config(cls, config): config["zoom_factor"] ) if isinstance(config["aspect_ratio_factor"], dict): - config[ - "aspect_ratio_factor" - ] = keras.utils.deserialize_keras_object( - config["aspect_ratio_factor"] + config["aspect_ratio_factor"] = ( + keras.utils.deserialize_keras_object( + config["aspect_ratio_factor"] + ) ) return cls(**config) diff --git a/keras_cv/layers/preprocessing/base_image_augmentation_layer.py b/keras_cv/layers/preprocessing/base_image_augmentation_layer.py index ef2e9cefe7..167da7ad0b 100644 --- a/keras_cv/layers/preprocessing/base_image_augmentation_layer.py +++ b/keras_cv/layers/preprocessing/base_image_augmentation_layer.py @@ -236,15 +236,15 @@ def _compute_output_signature(self, inputs): bounding_boxes = inputs.get(BOUNDING_BOXES, None) if bounding_boxes is not None: - fn_output_signature[ - BOUNDING_BOXES - ] = self._compute_bounding_box_signature(bounding_boxes) + fn_output_signature[BOUNDING_BOXES] = ( + self._compute_bounding_box_signature(bounding_boxes) + ) segmentation_masks = inputs.get(SEGMENTATION_MASKS, None) if segmentation_masks is not None: - fn_output_signature[ - SEGMENTATION_MASKS - ] = self.compute_image_signature(segmentation_masks) + fn_output_signature[SEGMENTATION_MASKS] = ( + self.compute_image_signature(segmentation_masks) + ) keypoints = inputs.get(KEYPOINTS, None) if keypoints is not None: diff --git a/keras_cv/layers/preprocessing/random_crop_and_resize.py b/keras_cv/layers/preprocessing/random_crop_and_resize.py index 593515ad09..cd947d5835 100644 --- a/keras_cv/layers/preprocessing/random_crop_and_resize.py +++ b/keras_cv/layers/preprocessing/random_crop_and_resize.py @@ -272,10 +272,10 @@ def from_config(cls, config): config["crop_area_factor"] ) if isinstance(config["aspect_ratio_factor"], dict): - config[ - "aspect_ratio_factor" - ] = keras.utils.deserialize_keras_object( - config["aspect_ratio_factor"] + config["aspect_ratio_factor"] = ( + keras.utils.deserialize_keras_object( + config["aspect_ratio_factor"] + ) ) return cls(**config) diff 
--git a/keras_cv/layers/preprocessing/vectorized_base_image_augmentation_layer.py b/keras_cv/layers/preprocessing/vectorized_base_image_augmentation_layer.py index 3d9fc8e52a..fd36e22065 100644 --- a/keras_cv/layers/preprocessing/vectorized_base_image_augmentation_layer.py +++ b/keras_cv/layers/preprocessing/vectorized_base_image_augmentation_layer.py @@ -17,6 +17,7 @@ from keras_cv import bounding_box from keras_cv.api_export import keras_cv_export +from keras_cv.backend import config from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.backend import scope @@ -412,6 +413,8 @@ def _batch_augment(self, inputs): def call(self, inputs): # try to convert a given backend native tensor to TensorFlow tensor # before passing it over to TFDataScope + is_tf_backend = config.backend() == "tensorflow" + is_in_tf_graph = not tf.executing_eagerly() contains_ragged = lambda y: any( tree.map_structure( lambda x: isinstance(x, (tf.RaggedTensor, tf.SparseTensor)), @@ -419,7 +422,7 @@ def call(self, inputs): ) ) inputs_contain_ragged = contains_ragged(inputs) - if not inputs_contain_ragged: + if not is_tf_backend and not inputs_contain_ragged: inputs = tree.map_structure( lambda x: tf.convert_to_tensor(x), inputs ) @@ -443,13 +446,14 @@ def call(self, inputs): # backend native tensors. This is to avoid breaking TF data # pipelines that can't easily be ported to become backend # agnostic. - if not inputs_contain_ragged and not contains_ragged(outputs): - outputs = tree.map_structure( - # some layers return None, handle that case when - # converting to tensors - lambda x: ops.convert_to_tensor(x) if x is not None else x, - outputs, - ) + if not is_tf_backend and not is_in_tf_graph: + if not inputs_contain_ragged and not contains_ragged(outputs): + outputs = tree.map_structure( + # some layers return None, handle that case when + # converting to tensors + lambda x: ops.convert_to_tensor(x) if x is not None else x, + outputs, + ) return outputs def _format_inputs(self, inputs): diff --git a/keras_cv/layers/preprocessing/vectorized_base_image_augmentation_layer_test.py b/keras_cv/layers/preprocessing/vectorized_base_image_augmentation_layer_test.py index 3ebdfdb820..c2d0daa840 100644 --- a/keras_cv/layers/preprocessing/vectorized_base_image_augmentation_layer_test.py +++ b/keras_cv/layers/preprocessing/vectorized_base_image_augmentation_layer_test.py @@ -549,3 +549,15 @@ def test_converts_ragged_to_dense_segmentation_masks(self): {"images": images, "segmentation_masks": segmentation_masks} ) self.assertTrue(isinstance(result["segmentation_masks"], tf.Tensor)) + + def test_in_tf_data_pipeline(self): + images = np.random.randn(4, 100, 100, 3).astype("float32") + train_ds = tf.data.Dataset.from_tensor_slices(images) + train_ds = train_ds.map(lambda x: {"images": x}) + train_ds = train_ds.map( + VectorizedRandomAddLayer(fixed_value=2.0) + ).batch(4) + for output in train_ds.take(1): + pass + self.assertTrue(isinstance(output["images"], tf.Tensor)) + self.assertAllClose(output["images"], images + 2.0) diff --git a/keras_cv/layers/regularization/squeeze_excite.py b/keras_cv/layers/regularization/squeeze_excite.py index cb03cc6942..8cbcc5bd94 100644 --- a/keras_cv/layers/regularization/squeeze_excite.py +++ b/keras_cv/layers/regularization/squeeze_excite.py @@ -118,10 +118,10 @@ def get_config(self): @classmethod def from_config(cls, config): if isinstance(config["squeeze_activation"], dict): - config[ - "squeeze_activation" - ] = keras.saving.deserialize_keras_object( - 
config["squeeze_activation"] + config["squeeze_activation"] = ( + keras.saving.deserialize_keras_object( + config["squeeze_activation"] + ) ) if isinstance(config["excite_activation"], dict): config["excite_activation"] = keras.saving.deserialize_keras_object( diff --git a/keras_cv/layers/vit_det_layers.py b/keras_cv/layers/vit_det_layers.py index 9311a957f5..2e053db4cb 100644 --- a/keras_cv/layers/vit_det_layers.py +++ b/keras_cv/layers/vit_det_layers.py @@ -430,9 +430,9 @@ def __init__( key_dim=self.project_dim // self.num_heads, use_bias=use_bias, use_rel_pos=use_rel_pos, - input_size=input_size - if window_size == 0 - else (window_size, window_size), + input_size=( + input_size if window_size == 0 else (window_size, window_size) + ), ) self.mlp_block = MLP( mlp_dim, diff --git a/keras_cv/metrics/object_detection/box_coco_metrics.py b/keras_cv/metrics/object_detection/box_coco_metrics.py index a59af8c767..47d86ba1c2 100644 --- a/keras_cv/metrics/object_detection/box_coco_metrics.py +++ b/keras_cv/metrics/object_detection/box_coco_metrics.py @@ -212,9 +212,9 @@ def result_fn(self, force=False): ) result = {} for i, key in enumerate(METRIC_NAMES): - result[ - self.name_prefix() + METRIC_MAPPING[key] - ] = py_func_result[i] + result[self.name_prefix() + METRIC_MAPPING[key]] = ( + py_func_result[i] + ) return result obj.result = types.MethodType(result_fn, obj) diff --git a/keras_cv/models/backbones/densenet/densenet_backbone.py b/keras_cv/models/backbones/densenet/densenet_backbone.py index 28109b64fa..251f3601ec 100644 --- a/keras_cv/models/backbones/densenet/densenet_backbone.py +++ b/keras_cv/models/backbones/densenet/densenet_backbone.py @@ -119,9 +119,9 @@ def __init__( name=f"conv{len(stackwise_num_repeats) + 1}", ) - pyramid_level_inputs[ - f"P{len(stackwise_num_repeats) + 1}" - ] = utils.get_tensor_input_name(x) + pyramid_level_inputs[f"P{len(stackwise_num_repeats) + 1}"] = ( + utils.get_tensor_input_name(x) + ) x = keras.layers.BatchNormalization( axis=BN_AXIS, epsilon=BN_EPSILON, name="bn" )(x) diff --git a/keras_cv/models/backbones/resnet_v1/resnet_v1_backbone.py b/keras_cv/models/backbones/resnet_v1/resnet_v1_backbone.py index 61046234d3..07c896613c 100644 --- a/keras_cv/models/backbones/resnet_v1/resnet_v1_backbone.py +++ b/keras_cv/models/backbones/resnet_v1/resnet_v1_backbone.py @@ -130,9 +130,9 @@ def __init__( first_shortcut=(block_type == "block" or stack_index > 0), name=f"v2_stack_{stack_index}", ) - pyramid_level_inputs[ - f"P{stack_index + 2}" - ] = utils.get_tensor_input_name(x) + pyramid_level_inputs[f"P{stack_index + 2}"] = ( + utils.get_tensor_input_name(x) + ) # Create model. 
super().__init__(inputs=inputs, outputs=x, **kwargs) diff --git a/keras_cv/models/backbones/resnet_v2/resnet_v2_backbone.py b/keras_cv/models/backbones/resnet_v2/resnet_v2_backbone.py index a31841f7fc..6a0cc74740 100644 --- a/keras_cv/models/backbones/resnet_v2/resnet_v2_backbone.py +++ b/keras_cv/models/backbones/resnet_v2/resnet_v2_backbone.py @@ -136,9 +136,9 @@ def __init__( first_shortcut=(block_type == "block" or stack_index > 0), name=f"v2_stack_{stack_index}", ) - pyramid_level_inputs[ - f"P{stack_index + 2}" - ] = utils.get_tensor_input_name(x) + pyramid_level_inputs[f"P{stack_index + 2}"] = ( + utils.get_tensor_input_name(x) + ) x = keras.layers.BatchNormalization( axis=BN_AXIS, epsilon=BN_EPSILON, name="post_bn" diff --git a/keras_cv/models/backbones/vit_det/vit_det_backbone.py b/keras_cv/models/backbones/vit_det/vit_det_backbone.py index c2c21ab98e..beb730f4df 100644 --- a/keras_cv/models/backbones/vit_det/vit_det_backbone.py +++ b/keras_cv/models/backbones/vit_det/vit_det_backbone.py @@ -144,9 +144,9 @@ def __init__( num_heads=num_heads, use_bias=use_bias, use_rel_pos=use_rel_pos, - window_size=window_size - if i not in global_attention_indices - else 0, + window_size=( + window_size if i not in global_attention_indices else 0 + ), input_size=(img_size // patch_size, img_size // patch_size), )(x) x = keras.models.Sequential( diff --git a/keras_cv/models/legacy/darknet.py b/keras_cv/models/legacy/darknet.py index ea7fd429f2..2dc14d499d 100644 --- a/keras_cv/models/legacy/darknet.py +++ b/keras_cv/models/legacy/darknet.py @@ -76,7 +76,6 @@ @keras.utils.register_keras_serializable(package="keras_cv.models") class DarkNet(keras.Model): - """Represents the DarkNet architecture. The DarkNet architecture is commonly used for detection tasks. It is diff --git a/keras_cv/models/legacy/mlp_mixer.py b/keras_cv/models/legacy/mlp_mixer.py index a48544f905..170d0a4c6f 100644 --- a/keras_cv/models/legacy/mlp_mixer.py +++ b/keras_cv/models/legacy/mlp_mixer.py @@ -143,7 +143,6 @@ def apply_mixer_block(x, tokens_mlp_dim, channels_mlp_dim, name=None): @keras.utils.register_keras_serializable(package="keras_cv.models") class MLPMixer(keras.Model): - """Instantiates the MLP Mixer architecture. 
Args: diff --git a/keras_cv/models/object_detection/yolo_v8/yolo_v8_backbone.py b/keras_cv/models/object_detection/yolo_v8/yolo_v8_backbone.py index f4bd99fafa..a2bf4bdd3b 100644 --- a/keras_cv/models/object_detection/yolo_v8/yolo_v8_backbone.py +++ b/keras_cv/models/object_detection/yolo_v8/yolo_v8_backbone.py @@ -178,9 +178,9 @@ def __init__( activation=activation, name=f"{stack_name}_spp_fast", ) - pyramid_level_inputs[ - f"P{stack_id + 2}" - ] = utils.get_tensor_input_name(x) + pyramid_level_inputs[f"P{stack_id + 2}"] = ( + utils.get_tensor_input_name(x) + ) super().__init__(inputs=inputs, outputs=x, **kwargs) self.pyramid_level_inputs = pyramid_level_inputs diff --git a/keras_cv/models/object_detection/yolo_v8/yolo_v8_detector.py b/keras_cv/models/object_detection/yolo_v8/yolo_v8_detector.py index bfba44945c..6c17c71a72 100644 --- a/keras_cv/models/object_detection/yolo_v8/yolo_v8_detector.py +++ b/keras_cv/models/object_detection/yolo_v8/yolo_v8_detector.py @@ -663,9 +663,9 @@ def from_config(cls, config): if prediction_decoder is not None and isinstance( prediction_decoder, dict ): - config[ - "prediction_decoder" - ] = keras.saving.deserialize_keras_object(prediction_decoder) + config["prediction_decoder"] = ( + keras.saving.deserialize_keras_object(prediction_decoder) + ) return cls(**config) @classproperty diff --git a/keras_cv/models/stable_diffusion/noise_scheduler.py b/keras_cv/models/stable_diffusion/noise_scheduler.py index bd1c0dc51e..c5c100848c 100644 --- a/keras_cv/models/stable_diffusion/noise_scheduler.py +++ b/keras_cv/models/stable_diffusion/noise_scheduler.py @@ -54,9 +54,7 @@ def __init__( elif beta_schedule == "scaled_linear": # this schedule is very specific to the latent diffusion model. self.betas = ( - ops.linspace( - beta_start**0.5, beta_end**0.5, train_timesteps - ) + ops.linspace(beta_start**0.5, beta_end**0.5, train_timesteps) ** 2 ) else: From d04fbccb9fcf3b32df1143d0bf976206cdaa4695 Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Tue, 30 Jan 2024 14:57:09 -0600 Subject: [PATCH 08/30] Fix CI Test for Basnet OOM and PyCoCo Test Failure for JAX (#2322) --- .kokoro/github/ubuntu/gpu/build.sh | 4 ++-- keras_cv/metrics/coco/pycoco_wrapper.py | 3 +++ keras_cv/models/segmentation/basnet/basnet_test.py | 14 +++++++++----- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.kokoro/github/ubuntu/gpu/build.sh b/.kokoro/github/ubuntu/gpu/build.sh index c145fae536..9d07218317 100644 --- a/.kokoro/github/ubuntu/gpu/build.sh +++ b/.kokoro/github/ubuntu/gpu/build.sh @@ -51,7 +51,7 @@ pip install --no-deps -e "." 
--progress-bar off # Run Extra Large Tests for Continuous builds if [ "${RUN_XLARGE:-0}" == "1" ] then - pytest --check_gpu --run_large --run_extra_large --durations 0 \ + pytest --cache-clear --check_gpu --run_large --run_extra_large --durations 0 \ keras_cv/bounding_box \ keras_cv/callbacks \ keras_cv/losses \ @@ -65,7 +65,7 @@ then keras_cv/models/segmentation \ keras_cv/models/stable_diffusion else - pytest --check_gpu --run_large --durations 0 \ + pytest --cache-clear --check_gpu --run_large --durations 0 \ keras_cv/bounding_box \ keras_cv/callbacks \ keras_cv/losses \ diff --git a/keras_cv/metrics/coco/pycoco_wrapper.py b/keras_cv/metrics/coco/pycoco_wrapper.py index 3c09784388..659cdef0a0 100644 --- a/keras_cv/metrics/coco/pycoco_wrapper.py +++ b/keras_cv/metrics/coco/pycoco_wrapper.py @@ -125,6 +125,9 @@ def _convert_predictions_to_coco_annotations(predictions): num_batches = len(predictions["source_id"]) for i in range(num_batches): batch_size = predictions["source_id"][i].shape[0] + predictions["detection_boxes"][i] = predictions["detection_boxes"][ + i + ].copy() for j in range(batch_size): max_num_detections = predictions["num_detections"][i][j] predictions["detection_boxes"][i][j] = _yxyx_to_xywh( diff --git a/keras_cv/models/segmentation/basnet/basnet_test.py b/keras_cv/models/segmentation/basnet/basnet_test.py index 81ebd8e13e..3571f4e005 100644 --- a/keras_cv/models/segmentation/basnet/basnet_test.py +++ b/keras_cv/models/segmentation/basnet/basnet_test.py @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. +import gc import os import numpy as np @@ -23,13 +24,13 @@ from keras_cv.backend import ops from keras_cv.backend.config import keras_3 from keras_cv.models import BASNet -from keras_cv.models import ResNet34Backbone +from keras_cv.models import ResNet18Backbone from keras_cv.tests.test_case import TestCase class BASNetTest(TestCase): def test_basnet_construction(self): - backbone = ResNet34Backbone() + backbone = ResNet18Backbone() model = BASNet( input_shape=[288, 288, 3], backbone=backbone, num_classes=1 ) @@ -41,7 +42,7 @@ def test_basnet_construction(self): @pytest.mark.large def test_basnet_call(self): - backbone = ResNet34Backbone() + backbone = ResNet18Backbone() model = BASNet( input_shape=[288, 288, 3], backbone=backbone, num_classes=1 ) @@ -61,7 +62,7 @@ def test_weights_change(self): ds = ds.repeat(2) ds = ds.batch(2) - backbone = ResNet34Backbone() + backbone = ResNet18Backbone() model = BASNet( input_shape=[288, 288, 3], backbone=backbone, num_classes=1 ) @@ -99,7 +100,7 @@ def test_with_model_preset_forward_pass(self): def test_saved_model(self): target_size = [288, 288, 3] - backbone = ResNet34Backbone() + backbone = ResNet18Backbone() model = BASNet( input_shape=[288, 288, 3], backbone=backbone, num_classes=1 ) @@ -112,6 +113,9 @@ def test_saved_model(self): model.save(save_path) else: model.save(save_path, save_format="keras_v3") + # Free up model memory + del model + gc.collect() restored_model = keras.models.load_model(save_path) # Check we got the real object back. 
From c67a0c75ad774d1bf006a572af53838144a23f43 Mon Sep 17 00:00:00 2001 From: Tirth Patel Date: Tue, 30 Jan 2024 14:38:37 -0800 Subject: [PATCH 09/30] Reduce memory consumption for BasNet tests (#2325) --- .../models/segmentation/basnet/basnet_test.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/keras_cv/models/segmentation/basnet/basnet_test.py b/keras_cv/models/segmentation/basnet/basnet_test.py index 3571f4e005..88408c134c 100644 --- a/keras_cv/models/segmentation/basnet/basnet_test.py +++ b/keras_cv/models/segmentation/basnet/basnet_test.py @@ -32,7 +32,7 @@ class BASNetTest(TestCase): def test_basnet_construction(self): backbone = ResNet18Backbone() model = BASNet( - input_shape=[288, 288, 3], backbone=backbone, num_classes=1 + input_shape=[64, 64, 3], backbone=backbone, num_classes=1 ) model.compile( optimizer="adam", @@ -44,17 +44,17 @@ def test_basnet_construction(self): def test_basnet_call(self): backbone = ResNet18Backbone() model = BASNet( - input_shape=[288, 288, 3], backbone=backbone, num_classes=1 + input_shape=[64, 64, 3], backbone=backbone, num_classes=1 ) - images = np.random.uniform(size=(2, 288, 288, 3)) + images = np.random.uniform(size=(2, 64, 64, 3)) _ = model(images) _ = model.predict(images) @pytest.mark.large @pytest.mark.filterwarnings("ignore::UserWarning") def test_weights_change(self): - input_size = [288, 288, 3] - target_size = [288, 288, 1] + input_size = [64, 64, 3] + target_size = [64, 64, 1] images = np.ones([1] + input_size) labels = np.random.uniform(size=[1] + target_size) @@ -64,7 +64,7 @@ def test_weights_change(self): backbone = ResNet18Backbone() model = BASNet( - input_shape=[288, 288, 3], backbone=backbone, num_classes=1 + input_shape=[64, 64, 3], backbone=backbone, num_classes=1 ) model_metrics = ["accuracy"] if keras_3(): @@ -77,7 +77,7 @@ def test_weights_change(self): ) original_weights = model.refinement_head.get_weights() - model.fit(ds, epochs=1) + model.fit(ds, epochs=1, batch_size=1) updated_weights = model.refinement_head.get_weights() for w1, w2 in zip(original_weights, updated_weights): @@ -98,11 +98,11 @@ def test_with_model_preset_forward_pass(self): @pytest.mark.large def test_saved_model(self): - target_size = [288, 288, 3] + target_size = [64, 64, 3] backbone = ResNet18Backbone() model = BASNet( - input_shape=[288, 288, 3], backbone=backbone, num_classes=1 + input_shape=[64, 64, 3], backbone=backbone, num_classes=1 ) input_batch = np.ones(shape=[2] + target_size) From 34fe74c8284b448f906f8b3017d48438f60ea4ba Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Wed, 31 Jan 2024 05:26:35 -0600 Subject: [PATCH 10/30] Remove Bazel steps from Contributing Guide (#2324) --- CONTRIBUTING.md | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 7c04b1c707..6cd2487eed 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -112,15 +112,6 @@ The first line relies on having an installation of [the GitHub CLI](https://gith Following these commands you should be able to run the tests using `pytest keras_cv`. Please report any issues running tests following these steps. -Note that this will _not_ install custom ops. 
If you'd like to install custom ops from source, you can compile the binaries and add them to your local environment manually (requires Bazel): - -```shell -python build_deps/configure.py - -bazel build keras_cv/custom_ops:all -mv bazel-bin/keras_cv/custom_ops/*.so keras_cv/custom_ops -``` - ## Run tests KerasCV is tested using [PyTest](https://docs.pytest.org/en/6.2.x/). @@ -148,18 +139,6 @@ You can run the unit tests for KerasCV by running: pytest keras_cv/ ``` -### Tests that require custom ops - -For tests that require custom ops, you'll have to compile the custom ops and make them available to your local Python code: - -```shell -python build_deps/configure.py -bazel build keras_cv/custom_ops:all -cp bazel-bin/keras_cv/custom_ops/*.so keras_cv/custom_ops/ -``` - -Tests which use custom ops are disabled by default, but can be run by setting the environment variable `TEST_CUSTOM_OPS=true`. - ## Formatting the Code We use `flake8`, `isort`, `black` and `clang-format` for code formatting. You can run From 09601ba2e0cfc0d171dd9bbf559da963b517254c Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Wed, 31 Jan 2024 12:26:55 -0600 Subject: [PATCH 11/30] Update version to 0.8.3 (#2327) --- keras_cv/version_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/version_utils.py b/keras_cv/version_utils.py index 527546c643..7b0568bdd7 100644 --- a/keras_cv/version_utils.py +++ b/keras_cv/version_utils.py @@ -15,7 +15,7 @@ from keras_cv.api_export import keras_cv_export # Unique source of truth for the version number. -__version__ = "0.8.2" +__version__ = "0.8.3" @keras_cv_export("keras_cv.version") From 94d7327735a2921b4da37939a5d61cd6fb9b525a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 5 Feb 2024 11:13:20 -0800 Subject: [PATCH 12/30] Bump the github-actions group with 3 updates (#2330) Bumps the github-actions group with 3 updates: [actions/cache](https://github.com/actions/cache), [actions/upload-artifact](https://github.com/actions/upload-artifact) and [github/codeql-action](https://github.com/github/codeql-action). Updates `actions/cache` from 3 to 4 - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/v3...v4) Updates `actions/upload-artifact` from 4.0.0 to 4.3.0 - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/c7d193f32edcb7bfad88892161225aeda64e9392...26f96dfa697d77e81fd5907df203aa23a56210a8) Updates `github/codeql-action` from 3.22.12 to 3.23.2 - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/012739e5082ff0c22ca6d6ab32e07c36df03c4a4...b7bf0a3ed3ecfa44160715d7c442788f65f0f923) --- updated-dependencies: - dependency-name: actions/cache dependency-type: direct:production update-type: version-update:semver-major dependency-group: github-actions - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/actions.yml | 6 +++--- .github/workflows/nightly.yml | 2 +- .github/workflows/release.yml | 2 +- .github/workflows/scorecard.yml | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index a80bc1b735..8be69b967b 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -26,7 +26,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} @@ -65,7 +65,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} @@ -110,7 +110,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 4ec23461b3..ded0a461b2 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -27,7 +27,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index afe38eb519..7a471e938a 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -25,7 +25,7 @@ jobs: python -m pip install --upgrade pip setuptools echo "::set-output name=dir::$(pip cache dir)" - name: pip cache - uses: actions/cache@v3 + uses: actions/cache@v4 with: path: ${{ steps.pip-cache.outputs.dir }} key: ${{ runner.os }}-pip-${{ hashFiles('setup.py') }} diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index ff310c9dee..98509aef93 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -45,7 +45,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@c7d193f32edcb7bfad88892161225aeda64e9392 # v4.0.0 + uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 with: name: SARIF file path: results.sarif @@ -53,6 +53,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@012739e5082ff0c22ca6d6ab32e07c36df03c4a4 # v3.22.12 + uses: github/codeql-action/upload-sarif@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2 with: sarif_file: results.sarif From 1def8ba37763fa5185761bd0a266fb9acda590c2 Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Fri, 9 Feb 2024 13:15:19 -0600 Subject: [PATCH 13/30] Pin Jax Version in GPU CI (#2338) --- requirements-jax-cuda.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements-jax-cuda.txt b/requirements-jax-cuda.txt index 6f5ff522e0..a2d69180b8 100644 --- a/requirements-jax-cuda.txt +++ b/requirements-jax-cuda.txt @@ -7,7 +7,8 @@ torch>=2.1.0 torchvision>=0.16.0 # Jax with cuda support. +# TODO: 0.4.24 has an updated Cuda version breaks Jax CI. --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -jax[cuda12_pip] +jax[cuda12_pip]==0.4.23 -r requirements-common.txt \ No newline at end of file From 9a61e1a253eade55c4dfff1c71516ea5a82bef9b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 12 Feb 2024 11:05:05 -0800 Subject: [PATCH 14/30] Bump the python group with 5 updates (#2340) Bumps the python group with 5 updates: | Package | From | To | | --- | --- | --- | | [tf-nightly-cpu](https://github.com/tensorflow/tensorflow) | `2.16.0.dev20240104` | `2.16.0.dev20240209` | | torch | `2.1.2+cu121` | `2.2.0+cu121` | | torchvision | `0.16.2+cu121` | `0.17.0+cu121` | | [tf-nightly[and-cuda]](https://github.com/tensorflow/tensorflow) | `2.16.0.dev20240104` | `2.16.0.dev20240209` | | [jax[cuda12_pip]](https://github.com/google/jax) | `0.4.23` | `0.4.24` | Updates `tf-nightly-cpu` from 2.16.0.dev20240104 to 2.16.0.dev20240209 - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/commits) Updates `torch` from 2.1.2+cu121 to 2.2.0+cu121 Updates `torchvision` from 0.16.2+cu121 to 0.17.0+cu121 Updates `tf-nightly[and-cuda]` from 2.16.0.dev20240104 to 2.16.0.dev20240209 - [Release notes](https://github.com/tensorflow/tensorflow/releases) - [Changelog](https://github.com/tensorflow/tensorflow/blob/master/RELEASE.md) - [Commits](https://github.com/tensorflow/tensorflow/commits) Updates `jax[cuda12_pip]` from 0.4.23 to 0.4.24 - [Release notes](https://github.com/google/jax/releases) - [Changelog](https://github.com/google/jax/blob/main/CHANGELOG.md) - [Commits](https://github.com/google/jax/compare/jax-v0.4.23...jax-v0.4.24) --- updated-dependencies: - dependency-name: tf-nightly-cpu dependency-type: direct:production update-type: version-update:semver-patch dependency-group: python - dependency-name: torch dependency-type: direct:production update-type: version-update:semver-minor dependency-group: python - dependency-name: torchvision dependency-type: direct:production update-type: version-update:semver-minor dependency-group: python - dependency-name: tf-nightly[and-cuda] dependency-type: direct:production update-type: version-update:semver-patch dependency-group: python - dependency-name: jax[cuda12_pip] dependency-type: direct:production update-type: version-update:semver-patch dependency-group: python ... 
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- requirements-jax-cuda.txt | 4 ++-- requirements-tensorflow-cuda.txt | 2 +- requirements-torch-cuda.txt | 6 +++--- requirements.txt | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/requirements-jax-cuda.txt b/requirements-jax-cuda.txt index a2d69180b8..99157c6d66 100644 --- a/requirements-jax-cuda.txt +++ b/requirements-jax-cuda.txt @@ -1,5 +1,5 @@ # Tensorflow cpu-only version. -tf-nightly-cpu==2.16.0.dev20240104 # Pin a working nightly until rc0. +tf-nightly-cpu==2.16.0.dev20240209 # Pin a working nightly until rc0. # Torch cpu-only version. --extra-index-url https://download.pytorch.org/whl/cpu @@ -9,6 +9,6 @@ torchvision>=0.16.0 # Jax with cuda support. # TODO: 0.4.24 has an updated Cuda version breaks Jax CI. --find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -jax[cuda12_pip]==0.4.23 +jax[cuda12_pip]==0.4.24 -r requirements-common.txt \ No newline at end of file diff --git a/requirements-tensorflow-cuda.txt b/requirements-tensorflow-cuda.txt index 3889e83934..d27982d236 100644 --- a/requirements-tensorflow-cuda.txt +++ b/requirements-tensorflow-cuda.txt @@ -1,5 +1,5 @@ # Tensorflow with cuda support. -tf-nightly[and-cuda]==2.16.0.dev20240104 # Pin a working nightly until rc0. +tf-nightly[and-cuda]==2.16.0.dev20240209 # Pin a working nightly until rc0. # Torch cpu-only version. --extra-index-url https://download.pytorch.org/whl/cpu diff --git a/requirements-torch-cuda.txt b/requirements-torch-cuda.txt index 5bfbc37546..a86868f903 100644 --- a/requirements-torch-cuda.txt +++ b/requirements-torch-cuda.txt @@ -1,10 +1,10 @@ # Tensorflow cpu-only version. -tf-nightly-cpu==2.16.0.dev20240104 # Pin a working nightly until rc0. +tf-nightly-cpu==2.16.0.dev20240209 # Pin a working nightly until rc0. # Torch with cuda support. --extra-index-url https://download.pytorch.org/whl/cu121 -torch==2.1.2+cu121 -torchvision==0.16.2+cu121 +torch==2.2.0+cu121 +torchvision==0.17.0+cu121 # Jax cpu-only version. jax[cpu] diff --git a/requirements.txt b/requirements.txt index 02792b29b7..b4791d9109 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ # Tensorflow. -tf-nightly-cpu==2.16.0.dev20240104 # Pin a working nightly until rc0. +tf-nightly-cpu==2.16.0.dev20240209 # Pin a working nightly until rc0. # Torch. --extra-index-url https://download.pytorch.org/whl/cpu From 3d0d1ddc3069446f49b877093c0b466a3fec87a0 Mon Sep 17 00:00:00 2001 From: Sachin Prasad Date: Mon, 12 Feb 2024 11:05:31 -0800 Subject: [PATCH 15/30] Update auto-assignment.js (#2337) --- .github/workflows/scripts/auto-assignment.js | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scripts/auto-assignment.js b/.github/workflows/scripts/auto-assignment.js index d08b06a8b7..f4979e71af 100644 --- a/.github/workflows/scripts/auto-assignment.js +++ b/.github/workflows/scripts/auto-assignment.js @@ -12,11 +12,11 @@ module.exports = async ({ github, context }) => { // Is this an issue? If so, assign the issue number. Otherwise, assign the PR number. if (context.payload.issue) { //assignee List for issues. - assigneesList = ["SuryanarayanaY", "sachinprasadhs"]; + assigneesList = ["sachinprasadhs"]; issueNumber = context.payload.issue.number; } else { //assignee List for PRs. 
- assigneesList = []; + assigneesList = ["sampathweb", "divyashreepathihalli"]; issueNumber = context.payload.number; } console.log("assignee list", assigneesList); From d42aa0772cb45f5f6410ee7b5f4a14b8d7b4638e Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Tue, 13 Feb 2024 12:01:29 -0600 Subject: [PATCH 16/30] Revert back to JAX 0.4.23 (#2342) * Revert back to JAX 0.4.23 * Revert back to JAX 0.4.23 * Remove keras_core and use backend --- .github/dependabot.yml | 3 +++ keras_cv/conftest.py | 3 +-- requirements-jax-cuda.txt | 2 +- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/dependabot.yml b/.github/dependabot.yml index 0df37b1230..6267930e9e 100644 --- a/.github/dependabot.yml +++ b/.github/dependabot.yml @@ -21,3 +21,6 @@ updates: python: patterns: - "*" + ignore: + # TODO: ignore all updates for JAX GPU due to cuda version issue + - dependency-name: "jax[cuda12_pip]" diff --git a/keras_cv/conftest.py b/keras_cv/conftest.py index eaee5024b9..6d5630df53 100644 --- a/keras_cv/conftest.py +++ b/keras_cv/conftest.py @@ -12,7 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import keras_core import pytest import tensorflow as tf from packaging import version @@ -101,7 +100,7 @@ def pytest_collection_modifyitems(config, items): reason="This test is only supported on Keras 2", ) skip_tf_only = pytest.mark.skipif( - keras_3() and keras_core.backend.backend() != "tensorflow", + keras_3() and backend_config.backend() != "tensorflow", reason="This test is only supported on TensorFlow", ) for item in items: diff --git a/requirements-jax-cuda.txt b/requirements-jax-cuda.txt index 99157c6d66..b3bb025e42 100644 --- a/requirements-jax-cuda.txt +++ b/requirements-jax-cuda.txt @@ -9,6 +9,6 @@ torchvision>=0.16.0 # Jax with cuda support. # TODO: 0.4.24 has an updated Cuda version breaks Jax CI. 
--find-links https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -jax[cuda12_pip]==0.4.24 +jax[cuda12_pip]==0.4.23 -r requirements-common.txt \ No newline at end of file From e71ee60c6719e2b7fb064f0915b8a5794d6e3969 Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Tue, 13 Feb 2024 14:50:26 -0600 Subject: [PATCH 17/30] move conftest.py to top level (#2345) --- keras_cv/conftest.py => conftest.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename keras_cv/conftest.py => conftest.py (100%) diff --git a/keras_cv/conftest.py b/conftest.py similarity index 100% rename from keras_cv/conftest.py rename to conftest.py From 79a1c26567ddc1313200a88476d235bb2d3020a7 Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Tue, 13 Feb 2024 15:05:50 -0600 Subject: [PATCH 18/30] Fix format of Custom Ops build (#2346) --- keras_cv/custom_ops/BUILD | 64 ++++++++++++++++++++++++++++----------- 1 file changed, 47 insertions(+), 17 deletions(-) diff --git a/keras_cv/custom_ops/BUILD b/keras_cv/custom_ops/BUILD index 37b551dfbe..13f1342ea1 100644 --- a/keras_cv/custom_ops/BUILD +++ b/keras_cv/custom_ops/BUILD @@ -11,38 +11,68 @@ cc_library( name = "box_util", srcs = ["box_util.cc"], hdrs = ["box_util.h"], + copts = select({ + ":windows": [ + "/DEIGEN_STRONG_INLINE=inline", + "-DTENSORFLOW_MONOLITHIC_BUILD", + "/DPLATFORM_WINDOWS", + "/DEIGEN_HAS_C99_MATH", + "/DTENSORFLOW_USE_EIGEN_THREADPOOL", + "/DEIGEN_AVOID_STL_ARRAY", + "/Iexternal/gemmlowp", + "/wd4018", + "/wd4577", + "/DNOGDI", + "/UTF_COMPILE_LIBRARY", + ], + "//conditions:default": [ + "-pthread", + "-std=c++17", + ], + }), deps = [ "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:tf_header_lib", ], - copts = select({ - ":windows": ["/DEIGEN_STRONG_INLINE=inline", "-DTENSORFLOW_MONOLITHIC_BUILD", "/DPLATFORM_WINDOWS", "/DEIGEN_HAS_C99_MATH", "/DTENSORFLOW_USE_EIGEN_THREADPOOL", "/DEIGEN_AVOID_STL_ARRAY", "/Iexternal/gemmlowp", "/wd4018", "/wd4577", "/DNOGDI", "/UTF_COMPILE_LIBRARY"], - "//conditions:default": ["-pthread", "-std=c++17"], - }), ) cc_binary( - name = '_keras_cv_custom_ops.so', + name = "_keras_cv_custom_ops.so", srcs = [ "kernels/pairwise_iou_kernel.cc", - "ops/pairwise_iou_op.cc", - "kernels/withinbox_op.cc", - "ops/withinbox_op.cc", "kernels/within_any_box_op.cc", + "kernels/withinbox_op.cc", + "ops/pairwise_iou_op.cc", "ops/within_any_box_op.cc", + "ops/withinbox_op.cc", ], + copts = select({ + ":windows": [ + "/DEIGEN_STRONG_INLINE=inline", + "-DTENSORFLOW_MONOLITHIC_BUILD", + "/DPLATFORM_WINDOWS", + "/DEIGEN_HAS_C99_MATH", + "/DTENSORFLOW_USE_EIGEN_THREADPOOL", + "/DEIGEN_AVOID_STL_ARRAY", + "/Iexternal/gemmlowp", + "/wd4018", + "/wd4577", + "/DNOGDI", + "/UTF_COMPILE_LIBRARY", + ], + "//conditions:default": [ + "-pthread", + "-std=c++17", + ], + }), + features = select({ + ":windows": ["windows_export_all_symbols"], + "//conditions:default": [], + }), linkshared = 1, deps = [ + ":box_util", "@local_config_tf//:libtensorflow_framework", "@local_config_tf//:tf_header_lib", - ":box_util", ], - features = select({ - ":windows": ["windows_export_all_symbols"], - "//conditions:default": [], - }), - copts = select({ - ":windows": ["/DEIGEN_STRONG_INLINE=inline", "-DTENSORFLOW_MONOLITHIC_BUILD", "/DPLATFORM_WINDOWS", "/DEIGEN_HAS_C99_MATH", "/DTENSORFLOW_USE_EIGEN_THREADPOOL", "/DEIGEN_AVOID_STL_ARRAY", "/Iexternal/gemmlowp", "/wd4018", "/wd4577", "/DNOGDI", "/UTF_COMPILE_LIBRARY"], - "//conditions:default": 
["-pthread", "-std=c++17"], - }), ) From 2562f857d4fb0d120f9b40e0b29f77fec941d4f1 Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Tue, 13 Feb 2024 16:52:30 -0600 Subject: [PATCH 19/30] Fix Custom Ops Build (#2347) --- keras_cv/custom_ops/BUILD | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/keras_cv/custom_ops/BUILD b/keras_cv/custom_ops/BUILD index 13f1342ea1..7b627d6cca 100644 --- a/keras_cv/custom_ops/BUILD +++ b/keras_cv/custom_ops/BUILD @@ -1,6 +1,8 @@ -licenses(["notice"]) # Apache 2.0 - -package(default_visibility = ["//visibility:public"]) +package( + # copybara:uncomment default_applicable_licenses = ["//keras_cv:license"], + default_visibility = ["//visibility:public"], + licenses = ["notice"], +) config_setting( name = "windows", From 3fb584cae2dee309070825d0ea3a66c7294d6491 Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Tue, 13 Feb 2024 17:28:48 -0600 Subject: [PATCH 20/30] Add Waymo folder License to the top (#2348) --- LICENSE | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 57 insertions(+), 4 deletions(-) diff --git a/LICENSE b/LICENSE index f2e54070a8..c7a6ac1f74 100644 --- a/LICENSE +++ b/LICENSE @@ -1,3 +1,5 @@ +Files: keras_cv/* + Copyright © 2023 The KerasCV Authors All code in this repository excluding the code located in keras_cv/layers/preprocessing_3d/waymo is licensed under the Apache License, @@ -206,7 +208,58 @@ folder is licensed under terms appearing below. See the License for the specific language governing permissions and limitations under the License. -# The following applies only to the code appearing in -# keras_cv/layers/preprocessing_3d/waymo - -License: https://github.com/keras-team/keras-cv/blob/master/keras_cv/layers/preprocessing_3d/waymo/LICENSE +--- + +Files: keras_cv/layers/preprocessing_3d/waymo/* + +Copyright (c) 2023 Waymo LLC. All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above +copyright notice, this list of conditions and the following disclaimer +in the documentation and/or other materials provided with the +distribution. +3. Neither the name of the copyright holder nor the names of its +contributors may be used to endorse or promote products derived +from this software without specific prior written permission. + +Additional IP Rights Grant (Patents) +"Works" means the code located at keras_cv/layers/preprocessing_3d/waymo +licensed from Waymo LLC ("Waymo") for inclusion in the KerasCV project at +github.com/keras-team/keras-cv. “Patents" means the pending U.S. Patent App. +No. 63/418,259 and any issued patents arising therefrom. Subject to the terms +and conditions of this license, Waymo hereby grants to you a limited worldwide, +non-exclusive, royalty-free, personal patent license to make, have made, use, +and import the Works, where such license applies only to those Patent claims +that are necessarily infringed by the Works executing the ”preprocessing_3d” +augmentation library on 3D perception tasks using the +“lidaraugment_keraspolicy.py” file. 
This grant does not include claims that +would be infringed by combining the Works with other works, utilizing the Works +on other tasks, or as a consequence of further modification of the Works. If +you or your agent or exclusive licensee institute or order or agree to the +institution of patent litigation or any other patent enforcement activity +against any entity (including a cross-claim or counterclaim in a lawsuit) +alleging that the Works or any activity using the Works to execute functions for +3D perception tasks constitutes direct or contributory patent infringement, or +inducement of patent infringement, then any patent rights granted to you under +this license for the Works shall terminate as of the date such litigation is +filed. + +DISCLAIMER + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. From 0c6944ff679953af7cbb6442f31475fc8bc403b9 Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Tue, 13 Feb 2024 18:08:38 -0600 Subject: [PATCH 21/30] update ops build (#2349) --- keras_cv/custom_ops/BUILD | 1 - 1 file changed, 1 deletion(-) diff --git a/keras_cv/custom_ops/BUILD b/keras_cv/custom_ops/BUILD index 7b627d6cca..dcf45ab878 100644 --- a/keras_cv/custom_ops/BUILD +++ b/keras_cv/custom_ops/BUILD @@ -1,5 +1,4 @@ package( - # copybara:uncomment default_applicable_licenses = ["//keras_cv:license"], default_visibility = ["//visibility:public"], licenses = ["notice"], ) From 0f9e34d3c470838f84d3645bc7e0091029c29273 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 20 Feb 2024 11:06:44 -0600 Subject: [PATCH 22/30] Bump the github-actions group with 2 updates (#2343) Bumps the github-actions group with 2 updates: [actions/upload-artifact](https://github.com/actions/upload-artifact) and [github/codeql-action](https://github.com/github/codeql-action). 
Updates `actions/upload-artifact` from 4.3.0 to 4.3.1 - [Release notes](https://github.com/actions/upload-artifact/releases) - [Commits](https://github.com/actions/upload-artifact/compare/26f96dfa697d77e81fd5907df203aa23a56210a8...5d5d22a31266ced268874388b861e4b58bb5c2f3) Updates `github/codeql-action` from 3.23.2 to 3.24.1 - [Release notes](https://github.com/github/codeql-action/releases) - [Changelog](https://github.com/github/codeql-action/blob/main/CHANGELOG.md) - [Commits](https://github.com/github/codeql-action/compare/b7bf0a3ed3ecfa44160715d7c442788f65f0f923...e675ced7a7522a761fc9c8eb26682c8b27c42b2b) --- updated-dependencies: - dependency-name: actions/upload-artifact dependency-type: direct:production update-type: version-update:semver-patch dependency-group: github-actions - dependency-name: github/codeql-action dependency-type: direct:production update-type: version-update:semver-minor dependency-group: github-actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/scorecard.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml index 98509aef93..8bcdbe833a 100644 --- a/.github/workflows/scorecard.yml +++ b/.github/workflows/scorecard.yml @@ -45,7 +45,7 @@ jobs: # Upload the results as artifacts (optional). Commenting out will disable uploads of run results in SARIF # format to the repository Actions tab. - name: "Upload artifact" - uses: actions/upload-artifact@26f96dfa697d77e81fd5907df203aa23a56210a8 # v4.3.0 + uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 with: name: SARIF file path: results.sarif @@ -53,6 +53,6 @@ jobs: # Upload the results to GitHub's code scanning dashboard. 
- name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@b7bf0a3ed3ecfa44160715d7c442788f65f0f923 # v3.23.2 + uses: github/codeql-action/upload-sarif@e675ced7a7522a761fc9c8eb26682c8b27c42b2b # v3.24.1 with: sarif_file: results.sarif From 15db57c056d790255715397aa33da2aef30fd52d Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Wed, 21 Feb 2024 17:11:27 -0800 Subject: [PATCH 23/30] Add CLIP to KerasCV (#2331) * clip refactor * code cleanup and reformat * update encoder name * update clip encoder name * update clip encoder name in image encoder * add weights conversion script * update setup to install keras-nlp * new black formatting * add preset file * update array * update clip prests kaggle handle * update text model * update text encoder * update position embeddings * update positonal embeddings * add attention masks * update expanded mask * revert previous commit * change causal masks * undo previous commit * update attention masks * update clip encoder * add print statements * update the pooler output * remove print statements * add tests and preset * cleanup and reformat * update build * add copywrite to presets file * fix build state errors * update github actions and add preprocessor test * incorporate review comments * add modifications from review * change import checks * update keras_nlp import check * update kokoro tests * update kaggle preset version * update install instructions for keras-nlp --------- Co-authored-by: Divyashree Sreepathihalli --- .github/workflows/actions.yml | 3 + .kokoro/github/ubuntu/gpu/build.sh | 5 + keras_cv/models/__init__.py | 1 + keras_cv/models/feature_extractor/__init__.py | 13 + .../models/feature_extractor/clip/__init__.py | 23 + .../feature_extractor/clip/clip_encoder.py | 321 +++++ .../clip/clip_image_model.py | 170 +++ .../feature_extractor/clip/clip_model.py | 188 +++ .../feature_extractor/clip/clip_model_test.py | 135 +++ .../feature_extractor/clip/clip_presets.py | 81 ++ .../feature_extractor/clip/clip_processor.py | 131 +++ .../feature_extractor/clip/clip_text_model.py | 118 ++ .../feature_extractor/clip/clip_tokenizer.py | 186 +++ .../clip_weights_conversion.ipynb | 1032 +++++++++++++++++ requirements-common.txt | 2 +- 15 files changed, 2408 insertions(+), 1 deletion(-) create mode 100644 keras_cv/models/feature_extractor/__init__.py create mode 100644 keras_cv/models/feature_extractor/clip/__init__.py create mode 100644 keras_cv/models/feature_extractor/clip/clip_encoder.py create mode 100644 keras_cv/models/feature_extractor/clip/clip_image_model.py create mode 100644 keras_cv/models/feature_extractor/clip/clip_model.py create mode 100644 keras_cv/models/feature_extractor/clip/clip_model_test.py create mode 100644 keras_cv/models/feature_extractor/clip/clip_presets.py create mode 100644 keras_cv/models/feature_extractor/clip/clip_processor.py create mode 100644 keras_cv/models/feature_extractor/clip/clip_text_model.py create mode 100644 keras_cv/models/feature_extractor/clip/clip_tokenizer.py create mode 100644 keras_cv/tools/checkpoint_conversion/clip_weights_conversion.ipynb diff --git a/.github/workflows/actions.yml b/.github/workflows/actions.yml index 8be69b967b..316e623c57 100644 --- a/.github/workflows/actions.yml +++ b/.github/workflows/actions.yml @@ -38,6 +38,8 @@ jobs: pip install torch>=2.0.1+cpu pip install "jax[cpu]" pip install keras-core + pip install keras-nlp-nightly --no-deps + pip install tensorflow-text==2.15 pip install -e ".[tests]" --progress-bar off --upgrade - name: Test with 
pytest env: @@ -75,6 +77,7 @@ jobs: run: | pip install -r requirements.txt pip install -e ".[tests]" --progress-bar off --upgrade + pip install keras-nlp-nightly - name: Test with pytest env: TEST_CUSTOM_OPS: false # TODO(ianstenbit): test custom ops, or figure out what our story is here diff --git a/.kokoro/github/ubuntu/gpu/build.sh b/.kokoro/github/ubuntu/gpu/build.sh index 9d07218317..fedfcd0566 100644 --- a/.kokoro/github/ubuntu/gpu/build.sh +++ b/.kokoro/github/ubuntu/gpu/build.sh @@ -29,21 +29,26 @@ then pip install --extra-index-url https://download.pytorch.org/whl/cpu torch==2.1.0+cpu pip install torchvision~=0.16.0 pip install "jax[cpu]" + pip install keras-nlp-nightly --no-deps + pip install tensorflow-text==2.15 elif [ "$KERAS_BACKEND" == "tensorflow" ] then echo "TensorFlow backend detected." pip install -r requirements-tensorflow-cuda.txt --progress-bar off + pip install keras-nlp-nightly elif [ "$KERAS_BACKEND" == "jax" ] then echo "JAX backend detected." pip install -r requirements-jax-cuda.txt --progress-bar off + pip install keras-nlp-nightly elif [ "$KERAS_BACKEND" == "torch" ] then echo "PyTorch backend detected." pip install -r requirements-torch-cuda.txt --progress-bar off + pip install keras-nlp-nightly fi pip install --no-deps -e "." --progress-bar off diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index b9b90b946a..8e6a849a95 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -183,6 +183,7 @@ from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetLBackbone from keras_cv.models.backbones.vit_det.vit_det_backbone import ViTDetBackbone from keras_cv.models.classification.image_classifier import ImageClassifier +from keras_cv.models.feature_extractor.clip import CLIP from keras_cv.models.object_detection.retinanet.retinanet import RetinaNet from keras_cv.models.object_detection.yolo_v8.yolo_v8_backbone import ( YOLOV8Backbone, diff --git a/keras_cv/models/feature_extractor/__init__.py b/keras_cv/models/feature_extractor/__init__.py new file mode 100644 index 0000000000..3992ffb59a --- /dev/null +++ b/keras_cv/models/feature_extractor/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/keras_cv/models/feature_extractor/clip/__init__.py b/keras_cv/models/feature_extractor/clip/__init__.py new file mode 100644 index 0000000000..8826871115 --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/__init__.py @@ -0,0 +1,23 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_cv.models.feature_extractor.clip.clip_image_model import ( + CLIPImageEncoder, +) +from keras_cv.models.feature_extractor.clip.clip_model import CLIP +from keras_cv.models.feature_extractor.clip.clip_processor import CLIPProcessor +from keras_cv.models.feature_extractor.clip.clip_text_model import ( + CLIPTextEncoder, +) +from keras_cv.models.feature_extractor.clip.clip_tokenizer import CLIPTokenizer diff --git a/keras_cv/models/feature_extractor/clip/clip_encoder.py b/keras_cv/models/feature_extractor/clip/clip_encoder.py new file mode 100644 index 0000000000..aeb345c857 --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/clip_encoder.py @@ -0,0 +1,321 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import numpy as np + +from keras_cv.backend import keras +from keras_cv.backend import ops + + +def get_initializer(initializer_range=0.02): + """ + Creates a `keras.initializers.TruncatedNormal` with the given range. + + Args: + initializer_range (*float*, defaults to 0.02): Standard deviation of the + initializer range. + + Returns: + `keras.initializers.TruncatedNormal`: The truncated normal initializer. 
+ """ + return keras.initializers.TruncatedNormal(stddev=initializer_range) + + +class QuickGELU(keras.layers.Layer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def call(self, x): + return x * ops.sigmoid(1.702 * x) + + +class ResidualAttention(keras.layers.Layer): + def __init__( + self, + proj_dim, + num_heads, + num_hidden_layers, + **kwargs, + ): + super().__init__(**kwargs) + self.proj_dim = proj_dim + self.num_heads = num_heads + self.num_hidden_layers = num_hidden_layers + self.fc_std = np.power(2 * self.proj_dim, -0.5) * 0.02 + + self.in_proj_std = ( + np.power(self.proj_dim, -0.5) + * (np.power(2 * self.num_hidden_layers, -0.5)) + * 0.02 + ) + self.attn = CLIPAttention( + self.proj_dim, + self.num_heads, + self.num_hidden_layers, + name="multi_head_attention", + ) + self.ln_1 = keras.layers.LayerNormalization(epsilon=1e-5, name="ln_1") + self.mlp_dense_1 = keras.layers.Dense( + self.proj_dim * 4, + name="c_fc", + ) + self.mlp_activation = QuickGELU(name="gelu") + self.mlp_dense_2 = keras.layers.Dense( + self.proj_dim, + name="c_proj", + ) + self.ln_2 = keras.layers.LayerNormalization(epsilon=1e-5, name="ln_2") + + def attention(self, x, causal_attention_mask=None, attention_mask=None): + mask = None + if causal_attention_mask is not None: + mask = ( + ops.cast(causal_attention_mask, dtype=x.dtype) + if causal_attention_mask is not None + else None + ) + if attention_mask is not None: + attention_mask = ( + ops.cast(attention_mask, dtype=x.dtype) + if attention_mask is not None + else None + ) + mask = ops.add(causal_attention_mask, attention_mask) + + return self.attn( + x, + attention_mask=mask, + )[0] + + def build(self, input_shape): + super().build(input_shape) + self.attn.build(None) + self.ln_1.build([None, None, self.proj_dim]) + self.mlp_dense_1.build([None, None, self.proj_dim]) + self.mlp_dense_2.build([None, None, self.proj_dim * 4]) + self.ln_2.build([None, None, self.proj_dim]) + + def call(self, x, causal_attention_mask=None, attention_mask=None): + residual = x + x = self.ln_1(x) + x = self.attention( + x, + causal_attention_mask=causal_attention_mask, + attention_mask=attention_mask, + ) + x = x + residual + residual = x + x = self.mlp_dense_1(self.ln_2(residual)) + x = self.mlp_activation(x) + x = self.mlp_dense_2(x) + x = residual + x + return x + + def compute_output_shape(self, inputs_shape): + return inputs_shape + + def get_config(self): + config = super().get_config() + config.update( + { + "proj_dim": self.proj_dim, + "num_heads": self.num_heads, + "num_hidden_layers": self.num_hidden_layers, + } + ) + return config + + +class CLIPEncoder(keras.layers.Layer): + def __init__(self, width, num_layers, heads, **kwargs): + super().__init__(**kwargs) + self.width = width + self.num_layers = num_layers + self.heads = heads + self.resblocks = [ + ResidualAttention( + self.width, + self.heads, + self.num_layers, + ) + for _ in range(self.num_layers) + ] + + def build(self, input_shape): + super().build(input_shape) + for block in self.resblocks: + block.build(input_shape) + + def call( + self, + x, + causal_attention_mask=None, + attention_mask=None, + ): + for block in self.resblocks: + x = block( + x, + causal_attention_mask=causal_attention_mask, + attention_mask=attention_mask, + ) + return x + + def compute_output_shape(self, inputs_shape): + return inputs_shape + + def get_config(self): + config = super().get_config() + config.update( + { + "width": self.width, + "num_layers": self.num_layers, + "heads": self.heads, + } + ) + return 
config + + +class CLIPAttention(keras.layers.Layer): + """ + Adapted from https://github.com/huggingface/transformers/blob/main/src/transformers/models/clip/modeling_clip.py # noqa: E501 + """ + + def __init__( + self, proj_dim, num_heads, num_hidden_layers, dropout=0.0, **kwargs + ): + super().__init__(**kwargs) + + self.proj_dim = proj_dim + self.num_heads = num_heads + self.num_hidden_layers = num_hidden_layers + self.dropout = dropout + self.head_dim = self.proj_dim // self.num_heads + if self.head_dim * self.num_heads != self.proj_dim: + raise ValueError( + f"proj_dim must be divisible by num_heads (got `proj_dim`" + f": {self.proj_dim} and `num_heads`:" + f" {self.num_heads})." + ) + + self.scale = self.head_dim**-0.5 + in_proj_std = ( + (self.proj_dim**-0.5) + * ((2 * self.num_hidden_layers) ** -0.5) + * 0.02 + ) + out_proj_std = (self.proj_dim**-0.5) * 0.02 + self.q_proj = keras.layers.Dense( + units=self.proj_dim, + kernel_initializer=get_initializer(in_proj_std), + name="q_proj", + ) + self.k_proj = keras.layers.Dense( + units=self.proj_dim, + kernel_initializer=get_initializer(in_proj_std), + name="k_proj", + ) + self.v_proj = keras.layers.Dense( + units=self.proj_dim, + kernel_initializer=get_initializer(in_proj_std), + name="v_proj", + ) + self.out_proj = keras.layers.Dense( + units=self.proj_dim, + kernel_initializer=get_initializer(out_proj_std), + name="out_proj", + ) + + def build(self, input_shape): + super().build(input_shape) + self.q_proj.build([None, None, self.proj_dim]) + self.k_proj.build([None, None, self.proj_dim]) + self.v_proj.build([None, None, self.proj_dim]) + self.out_proj.build([None, None, self.proj_dim]) + + def _transpose_for_scores(self, tensor, batch_size): + """ + Adapted from https://github.com/huggingface/transformers/blob/8e164c5400b7b413c7b8fb32e35132001effc970/src/transformers/models/bert/modeling_tf_bert.py#L252 # noqa: E501 + """ + # [batch_size, seq_len, all_head_dim] -> + # [batch_size, seq_len, num_heads, head_dim] + tensor = ops.reshape( + tensor, (batch_size, -1, self.num_heads, self.head_dim) + ) + # [batch_size, seq_len, num_heads, head_dim] -> + # [batch_size, num_heads, seq_len, head_dim] + return ops.transpose(tensor, axes=[0, 2, 1, 3]) + + def call( + self, + x, + attention_mask=None, + output_attentions=None, + training=False, + ): + batch_size = ops.shape(x)[0] + mixed_query_layer = self.q_proj(inputs=x) + mixed_key_layer = self.k_proj(inputs=x) + mixed_value_layer = self.v_proj(inputs=x) + query_layer = self._transpose_for_scores(mixed_query_layer, batch_size) + key_layer = self._transpose_for_scores(mixed_key_layer, batch_size) + value_layer = self._transpose_for_scores(mixed_value_layer, batch_size) + + # Scaled dot product between key and query = raw attention scores. + attention_scores = ops.matmul( + query_layer, ops.transpose(key_layer, axes=[0, 1, 3, 2]) + ) + dk = ops.cast(ops.sqrt(self.head_dim), dtype=attention_scores.dtype) + attention_scores = ops.divide( + attention_scores, dk + ) # (batch_size, num_heads, seq_len_q, seq_len_k) + + if attention_mask is not None: + # Apply the attention mask (precomputed for all layers in the + # call() function) + attention_scores = ops.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = ops.softmax(attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. 
+ dropout_attention_probs = keras.layers.Dropout(self.dropout)( + inputs=attention_probs, training=training + ) + + attn_output = ops.matmul(dropout_attention_probs, value_layer) + attn_output = ops.transpose(attn_output, axes=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, proj_dim) + attn_output = ops.reshape(attn_output, (batch_size, -1, self.proj_dim)) + + attn_output = self.out_proj(attn_output, training=training) + outputs = ( + (attn_output, attention_probs) + if output_attentions + else (attn_output,) + ) + + return outputs + + def get_config(self): + config = super().get_config() + config.update( + { + "proj_dim": self.proj_dim, + "num_heads": self.num_heads, + "num_hidden_layers": self.num_hidden_layers, + "dropout": self.dropout, + } + ) + return config diff --git a/keras_cv/models/feature_extractor/clip/clip_image_model.py b/keras_cv/models/feature_extractor/clip/clip_image_model.py new file mode 100644 index 0000000000..1718768116 --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/clip_image_model.py @@ -0,0 +1,170 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.feature_extractor.clip.clip_encoder import CLIPEncoder +from keras_cv.models.feature_extractor.clip.clip_encoder import get_initializer + + +class CLIPPatchingAndEmbedding(keras.layers.Layer): + def __init__( + self, width, patch_size, input_resolution, output_dim, **kwargs + ): + super().__init__(**kwargs) + + self.conv1 = keras.layers.Conv2D( + filters=width, + kernel_size=patch_size, + strides=patch_size, + padding="valid", + use_bias=False, + data_format="channels_last", + kernel_initializer=get_initializer(0.02), + name="patch_embed.embedding", + ) + self.width = width + self.input_resolution = input_resolution + self.patch_size = patch_size + self.num_patches = ops.power( + (self.input_resolution // self.patch_size), 2 + ) + self.class_embedding_initializer = get_initializer( + ops.power(self.width, -0.5) * 0.02 + ) + self.output_dim = output_dim + + def build(self, input_shape): + super().build(input_shape) + self.conv1.build(input_shape) + self.class_embedding = self.add_weight( + shape=((self.width,)), + initializer=self.class_embedding_initializer, + name="patch_embed.class_embedding", + ) + + self.positional_embedding = self.add_weight( + shape=( + ( + (self.input_resolution // self.patch_size) ** 2 + 1, + self.width, + ) + ), + trainable=True, + name="patch_embed.positional_embedding", + ) + + def call(self, x): + batch_size = ops.shape(x)[0] + patch_embeddings = self.conv1(x) # shape = [*, grid, grid, channel] + + patch_embeddings = ops.reshape( + patch_embeddings, (batch_size, self.num_patches, -1) + ) + class_embeds = ops.broadcast_to( + self.class_embedding, (batch_size, 1, self.width) + ) + embeddings = ops.concatenate( + [class_embeds, patch_embeddings], axis=1 + ) # shape = [*, grid ** 2 + 1, width] + positional_embedding = self.positional_embedding + embeddings = 
embeddings + positional_embedding + return embeddings + + def get_config(self): + config = super().get_config() + config.update( + { + "width": self.width, + "patch_size": self.patch_size, + "input_resolution": self.input_resolution, + "output_dim": self.output_dim, + } + ) + return config + + +class CLIPImageEncoder(keras.Model): + def __init__( + self, + input_resolution, + patch_size, + width, + num_layers, + heads, + output_dim, + **kwargs, + ): + super().__init__( + **kwargs, + ) + self.input_resolution = input_resolution + self.width = width + self.patch_size = patch_size + self.output_dim = output_dim + self.heads = heads + self.num_layers = num_layers + + self.embeddings = CLIPPatchingAndEmbedding( + width=self.width, + patch_size=self.patch_size, + input_resolution=self.input_resolution, + output_dim=self.output_dim, + name="clip_patch_embedding", + ) + self.pre_norm = keras.layers.LayerNormalization( + epsilon=1e-5, name="ln_1" + ) + self.encoder = CLIPEncoder( + self.width, + self.num_layers, + self.heads, + name="clip_encoder", + ) + self.post_norm = keras.layers.LayerNormalization( + epsilon=1e-5, name="ln_2" + ) + self.image_projector = keras.layers.Dense( + output_dim, name="vision_projector", use_bias=False + ) + + def build(self, input_shape): + super().build(input_shape) + self.embeddings.build(input_shape) + self.pre_norm.build([None, None, self.width]) + self.encoder.build(None) + self.post_norm.build([None, self.width]) + self.image_projector.build([None, None, self.width]) + + def call(self, image): + x = self.embeddings(image) + x = self.pre_norm(x) + x = self.encoder(x) + x = self.post_norm(x[:, 0, :]) + image_projected_embeddings = self.image_projector(x) + return image_projected_embeddings + + def get_config(self): + config = super().get_config() + config.update( + { + "input_resolution": self.input_resolution, + "patch_size": self.patch_size, + "width": self.width, + "layers": self.num_layers, + "heads": self.heads, + "output_dim": self.output_dim, + } + ) + return config diff --git a/keras_cv/models/feature_extractor/clip/clip_model.py b/keras_cv/models/feature_extractor/clip/clip_model.py new file mode 100644 index 0000000000..e81dbd5d09 --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/clip_model.py @@ -0,0 +1,188 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import copy
+
+from keras_cv.api_export import keras_cv_export
+from keras_cv.backend import keras
+from keras_cv.backend import ops
+from keras_cv.models.feature_extractor.clip.clip_image_model import (
+    CLIPImageEncoder,
+)
+from keras_cv.models.feature_extractor.clip.clip_presets import (  # noqa: E501
+    clip_presets,
+)
+from keras_cv.models.feature_extractor.clip.clip_text_model import (
+    CLIPTextEncoder,
+)
+from keras_cv.models.task import Task
+from keras_cv.utils.python_utils import classproperty
+
+try:
+    import keras_nlp
+except ImportError:
+    keras_nlp = None
+
+
+@keras_cv_export(["keras_cv.models.CLIP"])
+class CLIP(Task):
+    """
+    CLIP implements the Contrastive Language-Image Pretraining (CLIP)
+    architecture, which enables joint learning of visual and textual
+    representations for various downstream tasks. The default base model
+    architecture will be set to clip-vit-base-patch32.
+
+    Args:
+        embed_dim (int): The dimensionality of the joint embedding space for
+            images and texts.
+        image_resolution (int): The resolution of the input images (both height
+            and width).
+        vision_layers (int): The number of layers in the vision (image) encoder.
+        vision_width (int): The width of the hidden layers in the vision
+            encoder.
+        vision_patch_size (int): The size of each square patch in the input
+            images.
+        context_length (int): The maximum length of the contextualized text
+            sequences.
+        vocab_size (int): The size of the vocabulary for tokenization.
+        transformer_width (int): The width of the hidden layers in the
+            transformer-based text encoder.
+        transformer_heads (int): The number of attention heads in the
+            transformer-based text encoder.
+        transformer_layers (int): The number of layers in the transformer-based
+            text encoder.
+    """
+
+    def __init__(
+        self,
+        embed_dim=512,
+        image_resolution=224,
+        vision_layers=12,
+        vision_width=768,
+        vision_patch_size=32,
+        context_length=77,
+        vocab_size=49408,
+        transformer_width=768,
+        transformer_heads=8,
+        transformer_layers=12,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if keras_nlp is None:
+            raise ValueError(
+                "ClipTokenizer requires keras-nlp.
Please install " + "using pip `pip install -U keras-nlp && pip install -U keras`" + ) + self.embed_dim = embed_dim + self.image_resolution = image_resolution + self.vision_layers = vision_layers + self.vision_width = vision_width + self.vision_patch_size = vision_patch_size + self.context_length = context_length + self.vocab_size = vocab_size + self.transformer_width = transformer_width + self.transformer_heads = transformer_heads + self.transformer_layers = transformer_layers + + vision_heads = self.vision_width // 64 + self.image_encoder = CLIPImageEncoder( + input_resolution=self.image_resolution, + patch_size=self.vision_patch_size, + width=self.vision_width, + num_layers=self.vision_layers, + heads=vision_heads, + output_dim=self.embed_dim, + name="image_encoder", + ) + self.text_encoder = CLIPTextEncoder( + transformer_width=self.transformer_width, + transformer_layers=self.transformer_layers, + transformer_heads=self.transformer_heads, + vocab_size=self.vocab_size, + embed_dim=self.embed_dim, + context_length=self.context_length, + name="text_encoder", + ) + + self.logit_scale = keras.Variable( + ops.ones([]) * ops.log(1 / 0.07), name="logit_scale" + ) + self.image_embeddings = None + self.text_embeddings = None + + def build(self, input_shape): + super().build(input_shape) + self.text_encoder.build([None, self.context_length]) + self.image_encoder.build( + [None, self.image_resolution, self.image_resolution, 3] + ) + + def encode_images(self, image): + return self.image_encoder(image) + + def encode_text(self, text, attention_mask=None): + return self.text_encoder(text, attention_mask=attention_mask) + + def call(self, image, text, attention_mask=None): + self.image_embeddings = self.encode_images(image) + self.text_embeddings = self.encode_text( + text, attention_mask=attention_mask + ) + normalize_image_features = ops.sqrt( + ops.sum(ops.power(self.image_embeddings, 2), keepdims=True) + ) + normalize_text_features = ops.sqrt( + ops.sum(ops.power(self.text_embeddings, 2), keepdims=True) + ) + self.image_embeddings = self.image_embeddings / normalize_image_features + self.text_embeddings = self.text_embeddings / normalize_text_features + logit_scale = ops.exp(self.logit_scale) + logits_per_image = ( + ops.matmul( + self.image_embeddings, + ops.transpose(self.text_embeddings), + ) + * logit_scale + ) + logits_per_text = ops.transpose(logits_per_image) + + return logits_per_image, logits_per_text + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy({**clip_presets}) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy({**clip_presets}) + + def get_config(self): + config = super().get_config() + config.update( + { + "embed_dim": self.embed_dim, + "image_resolution": self.image_resolution, + "vision_layers": self.vision_layers, + "vision_width": self.vision_width, + "vision_patch_size": self.vision_patch_size, + "context_length": self.context_length, + "vocab_size": self.vocab_size, + "transformer_width": self.transformer_width, + "transformer_heads": self.transformer_heads, + "transformer_layers": self.transformer_layers, + } + ) + return config diff --git a/keras_cv/models/feature_extractor/clip/clip_model_test.py b/keras_cv/models/feature_extractor/clip/clip_model_test.py new file mode 100644 index 0000000000..d5c777c653 --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/clip_model_test.py @@ -0,0 +1,135 @@ +# 
Copyright 2022 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest +from tensorflow import data as tf_data + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.backend.config import keras_3 +from keras_cv.models import CLIP +from keras_cv.models.feature_extractor.clip import CLIPProcessor +from keras_cv.tests.test_case import TestCase + +VOCAB_PATH = keras.utils.get_file( + None, + "https://storage.googleapis.com/keras-cv/models/clip/vocab.json", +) +MERGE_PATH = keras.utils.get_file( + None, + "https://storage.googleapis.com/keras-cv/models/clip/merges.txt", +) + +MODEL_PATH = keras.utils.get_file( + None, + "https://storage.googleapis.com/keras-cv/models/clip/clip-vit-base-patch32.weights.h5", # noqa: E501 +) + + +class CLIPTest(TestCase): + @pytest.mark.large + def test_clip_model_golden_values(self): + model = CLIP() + model.load_weights(MODEL_PATH) + processed_image = np.ones(shape=[1, 224, 224, 3]) + processed_text = np.ones(shape=[3, 77]) + attention_mask = np.ones(shape=[3, 77]) + image_logits, text_logits = model( + processed_image, processed_text, attention_mask + ) + print(image_logits) + self.assertAllClose(image_logits, [[2.932678, 2.932678, 2.932675]]) + self.assertAllClose( + text_logits, ops.transpose([[2.932678, 2.932678, 2.932675]]) + ) + + def test_clip_preprocessor(self): + processor = CLIPProcessor(224, VOCAB_PATH, MERGE_PATH) + processed_text, attention_mask = processor.process_texts( + ["mountains", "cat on tortoise"] + ) + self.assertAllClose( + processed_text[:, :3], [[49406, 5873, 49407], [49406, 2368, 525]] + ) + self.assertAllClose( + attention_mask[0, :5], [True, True, True, False, False] + ) + + def test_clip_preprocessor_tf_data(self): + processor = CLIPProcessor(224, VOCAB_PATH, MERGE_PATH) + text_input = ["a bus", "a dog", "a cat"] + dataset = tf_data.Dataset.from_tensor_slices(text_input) + dataset.map(processor.process_texts) + + @pytest.mark.large + def test_presets(self): + self.skipTest("TODO: Enable after Kaggle model is public") + model = CLIP.from_preset("clip-vit-base-patch32") + processed_image = np.ones(shape=[1, 224, 224, 3]) + processed_text = np.ones(shape=[3, 77]) + attention_mask = np.ones(shape=[3, 77]) + image_logits, text_logits = model( + processed_image, processed_text, attention_mask + ) + + @pytest.mark.large + def test_image_encoder_golden_values(self): + model = CLIP() + model.load_weights(MODEL_PATH) + processed_image = np.ones(shape=[1, 224, 224, 3]) + processed_text = np.ones(shape=[3, 77]) + attention_mask = np.ones(shape=[3, 77]) + model(processed_image, processed_text, attention_mask) + self.assertAllClose( + model.image_embeddings[:, :5], + [[0.023215, 0.026526, 0.008914, -0.091689, 0.021791]], + ) + + @pytest.mark.large + def test_text_encoder_golden_values(self): + model = CLIP() + processed_image = np.ones(shape=[1, 224, 224, 3]) + processed_text = np.ones(shape=[3, 77]) + attention_mask = np.ones(shape=[3, 77]) + 
model(processed_image, processed_text, attention_mask) + print(model.text_embeddings) + self.assertAllClose( + model.text_embeddings[0, :3], + [-0.018502, 0.000906, 0.020372], + ) + + @pytest.mark.large # Saving is slow, so mark these large. + def test_saved_model(self): + model = CLIP() + processed_image = np.ones(shape=[1, 224, 224, 3]) + processed_text = np.ones(shape=[3, 77]) + attention_mask = np.ones(shape=[3, 77]) + model_output, _ = model(processed_image, processed_text, attention_mask) + save_path = os.path.join(self.get_temp_dir(), "model.keras") + if keras_3(): + model.save(save_path) + else: + model.save(save_path, save_format="keras_v3") + restored_model = keras.models.load_model(save_path) + + # Check we got the real object back. + self.assertIsInstance(restored_model, CLIP) + # Check that output matches. + restored_output, _ = restored_model( + processed_image, processed_text, attention_mask + ) + self.assertAllClose(model_output, restored_output) diff --git a/keras_cv/models/feature_extractor/clip/clip_presets.py b/keras_cv/models/feature_extractor/clip/clip_presets.py new file mode 100644 index 0000000000..6b4d98727e --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/clip_presets.py @@ -0,0 +1,81 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""CLIP presets.""" + +clip_presets = { + "clip-vit-base-patch16": { + "metadata": { + "description": ( + "The model uses a ViT-B/16 Transformer architecture as an " + "image encoder and uses a masked self-attention Transformer as " + "a text encoder. These encoders are trained to maximize the " + "similarity of (image, text) pairs via a contrastive loss. The " + "model uses a patch size of 16 and input images of size (224, " + "224)" + ), + "params": 149620737, + "official_name": "CLIP", + "path": "clip", + }, + "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-base-patch16/2", + }, + "clip-vit-base-patch32": { + "metadata": { + "description": ( + "The model uses a ViT-B/32 Transformer architecture as an " + "image encoder and uses a masked self-attention Transformer as " + "a text encoder. These encoders are trained to maximize the " + "similarity of (image, text) pairs via a contrastive loss.The " + "model uses a patch size of 32 and input images of size (224, " + "224)" + ), + "params": 151277313, + "official_name": "CLIP", + "path": "clip", + }, + "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-base-patch32/2", + }, + "clip-vit-large-patch14": { + "metadata": { + "description": ( + "The model uses a ViT-L/14 Transformer architecture as an " + "image encoder and uses a masked self-attention Transformer as " + "a text encoder. 
These encoders are trained to maximize the " + "similarity of (image, text) pairs via a contrastive loss.The " + "model uses a patch size of 14 and input images of size (224, " + "224)" + ), + "params": 427616513, + "official_name": "CLIP", + "path": "clip", + }, + "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-large-patch14/2", + }, + "clip-vit-large-patch14-336": { + "metadata": { + "description": ( + "The model uses a ViT-L/14 Transformer architecture as an " + "image encoder and uses a masked self-attention Transformer as " + "a text encoder. These encoders are trained to maximize the " + "similarity of (image, text) pairs via a contrastive loss.The " + "model uses a patch size of 14 and input images of size (336, " + "336)" + ), + "params": 427944193, + "official_name": "CLIP", + "path": "clip", + }, + "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-large-patch14-336/2", # noqa: E501 + }, +} diff --git a/keras_cv/models/feature_extractor/clip/clip_processor.py b/keras_cv/models/feature_extractor/clip/clip_processor.py new file mode 100644 index 0000000000..80e616cc02 --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/clip_processor.py @@ -0,0 +1,131 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from keras_nlp.layers import StartEndPacker + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.feature_extractor.clip.clip_tokenizer import CLIPTokenizer + + +@keras_cv_export("keras_cv.models.feature_extractors.CLIPProcessor") +class CLIPProcessor: + """ + CLIPProcessor is a utility class that provides functionality for processing + images and texts in the context of the CLIP (Contrastive Language-Image + Pretraining) model. + + Args: + input_resolution (int): The resolution of input images. + vocabulary (str): string or dict, maps token to integer ids. If it is a + string, it should be the file path to a json file. + merges: string or list, contains the merge rule. If it is a string, it + should be the file path to merge rules. The merge rule file should + have one merge rule per line. + + Methods: + process_images(image_path: List[str]): Transforms an image located at + the specified path. + + process_texts(texts: Union[str, List[str]], context_length: int = 77): + Processes a single text or a list of texts, returning packed token + sequences. 
+ + """ + + def __init__(self, input_resolution, vocabulary, merges, **kwargs): + self.input_resolution = input_resolution + self.vocabulary = vocabulary + self.merges = merges + self.image_transform = self.transform_image + self.tokenizer = CLIPTokenizer( + vocabulary=self.vocabulary, + merges=self.merges, + unsplittable_tokens=[""], + ) + self.packer = StartEndPacker( + start_value=self.tokenizer.token_to_id("<|startoftext|>"), + end_value=self.tokenizer.token_to_id("<|endoftext|>"), + pad_value=None, + sequence_length=77, + return_padding_mask=True, + ) + + def transform_image(self, image_path): + input_resolution = self.input_resolution + mean = ops.array([0.48145466, 0.4578275, 0.40821073]) + std = ops.array([0.26862954, 0.26130258, 0.27577711]) + + image = keras.utils.load_img(image_path) + image = keras.utils.img_to_array(image) + image = ( + ops.image.resize( + image, + (input_resolution, input_resolution), + interpolation="bicubic", + ) + / 255.0 + ) + central_fraction = input_resolution / image.shape[0] + width, height = image.shape[0], image.shape[1] + left = ops.cast((width - width * central_fraction) / 2, dtype="int32") + top = ops.cast((height - height * central_fraction) / 2, dtype="int32") + right = ops.cast((width + width * central_fraction) / 2, dtype="int32") + bottom = ops.cast( + (height + height * central_fraction) / 2, dtype="int32" + ) + + image = ops.slice( + image, [left, top, 0], [right - left, bottom - top, 3] + ) + + image = (image - mean) / std + return image + + def process_images(self, images): + if isinstance(images, str): + images = [images] + + def process_image(image): + if isinstance(image, str): + return self.image_transform(image) + + processed_images = list(map(process_image, images)) + processed_images = ops.stack(processed_images) + return processed_images + + def process_texts(self, texts, context_length: int = 77): + if isinstance(texts, str): + texts = [texts] + + def pack_tokens(text): + return self.packer( + self.tokenizer(text), + sequence_length=context_length, + add_start_value=True, + add_end_value=True, + ) + + return pack_tokens(texts) + + def get_config(self): + config = super().get_config() + config.update( + { + "input_resolution": self.input_resolution, + "vocabulary": self.vocabulary, + "merges": self.merges, + } + ) + return config diff --git a/keras_cv/models/feature_extractor/clip/clip_text_model.py b/keras_cv/models/feature_extractor/clip/clip_text_model.py new file mode 100644 index 0000000000..5fc92990d2 --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/clip_text_model.py @@ -0,0 +1,118 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
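
For reference, a minimal end-to-end sketch of how the new CLIP model and CLIPProcessor added above fit together, mirroring the calls used in clip_model_test.py; the vocab.json, merges.txt, and cat.jpg paths are illustrative placeholders rather than files shipped with this patch, and the randomly initialized CLIP() would need load_weights() or a published preset to give meaningful scores.

from keras_cv.models import CLIP
from keras_cv.models.feature_extractor.clip import CLIPProcessor

# Tokenize text and preprocess images with the processor added in this patch.
processor = CLIPProcessor(224, "vocab.json", "merges.txt")
image = processor.process_images(["cat.jpg"])
text, attention_mask = processor.process_texts(
    ["a photo of a cat", "a photo of a dog"]
)

# Randomly initialized model; weights would come from load_weights() or,
# once the Kaggle handles are public, CLIP.from_preset("clip-vit-base-patch32").
model = CLIP()
image_logits, text_logits = model(image, text, attention_mask)
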
+from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.feature_extractor.clip.clip_encoder import CLIPEncoder + + +class CLIPTextEncoder(keras.Model): + def __init__( + self, + transformer_width, + transformer_layers, + transformer_heads, + vocab_size, + embed_dim, + context_length, + **kwargs, + ): + super().__init__( + **kwargs, + ) + self.transformer_width = transformer_width + self.transformer_layers = transformer_layers + self.transformer_heads = transformer_heads + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.context_length = context_length + self.token_embedding = keras.layers.Embedding( + vocab_size, + transformer_width, + name="token_embedding", + ) + self.positional_embedding = keras.layers.Embedding( + self.context_length, + transformer_width, + name="positional_embedding", + ) + + self.encoder = CLIPEncoder( + width=transformer_width, + num_layers=transformer_layers, + heads=transformer_heads, + name="clip_encoder", + ) + self.ln_final = keras.layers.LayerNormalization(name="ln_final") + + self.text_projector = keras.layers.Dense( + embed_dim, name="text_projector", use_bias=False + ) + + def build(self, input_shape): + super().build(input_shape) + self.token_embedding.build(input_shape) + self.positional_embedding.build([1, self.context_length]) + self.encoder.build(None) + self.ln_final.build([None, None, self.transformer_width]) + self.text_projector.build([None, None, self.transformer_width]) + + def call(self, inputs, attention_mask=None): + token_embedding = self.token_embedding(inputs) + position_ids = ops.expand_dims( + ops.arange(self.context_length, dtype="int32"), 0 + ) + position_embedding = self.positional_embedding(position_ids) + position_embedding = ops.tile( + position_embedding, repeats=(inputs.shape[0], 1, 1) + ) + causal_attention_mask = ops.ones( + (self.context_length, self.context_length) + ) + # Zero out the lower diagonal + causal_attention_mask = ops.triu(causal_attention_mask) + causal_attention_mask = ops.cast(causal_attention_mask, "float32") + attention_mask = ops.cast(attention_mask, dtype="float32") + expanded_mask = ops.tile( + attention_mask[:, None, None, :], (1, 1, self.context_length, 1) + ) + expanded_mask = (1.0 - expanded_mask) * (-1e8) + encoded_output = self.encoder( + token_embedding + position_embedding, + causal_attention_mask=causal_attention_mask, + attention_mask=expanded_mask, + ) + layer_norm = self.ln_final(encoded_output) + indices = ops.expand_dims( + ops.cast(ops.argmax(inputs, axis=-1), "int32"), axis=-1 + ) + selected_features = ops.take_along_axis( + layer_norm, indices[:, :, None], axis=1 + ) + text_features = self.text_projector(selected_features) + output = ops.squeeze(text_features, axis=1) + return output + + def get_config(self): + config = super().get_config() + config.update( + { + "transformer_width": self.transformer_width, + "transformer_layers": self.transformer_layers, + "transformer_heads": self.transformer_heads, + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + "context_length": self.context_length, + } + ) + return config diff --git a/keras_cv/models/feature_extractor/clip/clip_tokenizer.py b/keras_cv/models/feature_extractor/clip/clip_tokenizer.py new file mode 100644 index 0000000000..66b4d7cef6 --- /dev/null +++ b/keras_cv/models/feature_extractor/clip/clip_tokenizer.py @@ -0,0 +1,186 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except 
in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import regex as re +import tensorflow as tf +import tensorflow_text as tf_text + +try: + import keras_nlp + from keras_nlp.tokenizers import BytePairTokenizer +except ImportError: + keras_nlp = None + +# As python and TF handles special spaces differently, we need to +# manually handle special spaces during string split. +SPECIAL_WHITESPACES = r"\x{a0}\x{2009}\x{202f}\x{3000}" +SPLIT_PATTERN_1 = ( + r"'s|'t|'re|'ve|'m|'ll|'d" + + r"|[\s{special_spaces}]+[\n\r\t\f६{special_spaces}]| ?\p{L}+|" + + r" ?[\p{N}]+| ?[^\s\p{L}\p{N}{special_spaces}]+" +) +SPLIT_PATTERN_1 = SPLIT_PATTERN_1.replace( + "{special_spaces}", SPECIAL_WHITESPACES +) +SPLIT_PATTERN_2 = rf"""[\s६{SPECIAL_WHITESPACES}]$""" + + +def split_strings_for_bpe(inputs, unsplittable_tokens=None): + # We need to recreate the exact behavior of token presplitting in the + # original gpt2 tokenizer which uses a lookahead. As re2 does not + # support lookahead match, we are using an alternative insert a special + # token "६" before leading space of non-space characters and after the + # trailing space, e.g., " keras" will be "६ keras". + inputs = tf.strings.regex_replace( + inputs, rf"( )([^\s{SPECIAL_WHITESPACES}])", r"६\1\2" + ) + inputs = tf.strings.regex_replace( + inputs, rf"(\s{SPECIAL_WHITESPACES})$", r"\1६" + ) + inputs = tf.strings.regex_replace(inputs, r"\s", "") + if unsplittable_tokens: + alts = create_alts_for_unsplittable_tokens(unsplittable_tokens) + for token, alt in zip(unsplittable_tokens, alts): + escaped_token = re.escape(token) + inputs = tf_text.regex_split(inputs, escaped_token, escaped_token) + inputs = tf.strings.regex_replace(inputs, escaped_token, alt) + raw_tokens = tf_text.regex_split(inputs, SPLIT_PATTERN_1, SPLIT_PATTERN_1) + # Second pass splits out the last whilespace char or "६". + raw_tokens = tf_text.regex_split( + raw_tokens, SPLIT_PATTERN_2, SPLIT_PATTERN_2 + ) + if unsplittable_tokens: + # Replace special tokens alternate with originals. + for token, alt in zip(unsplittable_tokens, alts): + escaped_alt = re.escape(alt) + raw_tokens = tf.strings.regex_replace( + raw_tokens, escaped_alt, token + ) + + # Add '' to the end of each token + tokens_with_end_tag = tf.strings.regex_replace( + raw_tokens, r"(\p{L}+)", r"\1" + ) + + while tokens_with_end_tag.shape.rank > 2: + tokens_with_end_tag = tokens_with_end_tag.merge_dims(1, 2) + + return remove_strings_from_inputs(tokens_with_end_tag, "६") + + +def create_alts_for_unsplittable_tokens(unsplittable_tokens): + # Create alternates for all special tokens that will be not split during + # tokenization. + alts = [] + prefix = "Ĵ" + # Trim out splitters. 
+ replace_pattern = r"'|\s+|[^\p{L}\p{N}]+" + for token in unsplittable_tokens: + token = re.sub(replace_pattern, "", token) + alts.append(prefix + token) + return alts + + +def remove_strings_from_inputs(tensor, string_to_remove): + """Remove certain strings from input tensor.""" + non_empty_mask = tensor != string_to_remove + flatten_indexes = tf.where(non_empty_mask) + flatten_result = tf.gather_nd(tensor, flatten_indexes) + row_lengths = tf.reduce_sum(tf.cast(non_empty_mask, "int64"), axis=1) + result = tf.RaggedTensor.from_row_lengths( + values=flatten_result, + row_lengths=row_lengths, + ) + return result + + +class CLIPTokenizer(BytePairTokenizer): + def __init__(self, **kwargs): + super().__init__(**kwargs) + if keras_nlp is None: + raise ValueError( + "ClipTokenizer requires keras-nlp. Please install " + "using pip `pip install -U keras-nlp && pip install -U keras`" + ) + + def _bpe_merge_and_update_cache(self, tokens): + """Process unseen tokens and add to cache.""" + words = self._transform_bytes(tokens) + tokenized_words = self._bpe_merge(words) + + # For each word, join all its token by a whitespace, + # e.g., ["dragon", "fly"] => "dragon fly" for hash purpose. + tokenized_words = tf.strings.reduce_join( + tokenized_words, + axis=1, + ) + self.cache.insert(tokens, tokenized_words) + + def tokenize(self, inputs): + self._check_vocabulary() + if not isinstance(inputs, (tf.Tensor, tf.RaggedTensor)): + inputs = tf.convert_to_tensor(inputs) + + if self.add_prefix_space: + inputs = tf.strings.join([" ", inputs]) + + scalar_input = inputs.shape.rank == 0 + if scalar_input: + inputs = tf.expand_dims(inputs, 0) + + raw_tokens = split_strings_for_bpe(inputs, self.unsplittable_tokens) + token_row_splits = raw_tokens.row_splits + flat_tokens = raw_tokens.flat_values + # Check cache. + cache_lookup = self.cache.lookup(flat_tokens) + cache_mask = cache_lookup == "" + + has_unseen_words = tf.math.reduce_any( + (cache_lookup == "") & (flat_tokens != "") + ) + + def process_unseen_tokens(): + unseen_tokens = tf.boolean_mask(flat_tokens, cache_mask) + self._bpe_merge_and_update_cache(unseen_tokens) + return self.cache.lookup(flat_tokens) + + # If `has_unseen_words == True`, it means not all tokens are in cache, + # we will process the unseen tokens. Otherwise return the cache lookup. + tokenized_words = tf.cond( + has_unseen_words, + process_unseen_tokens, + lambda: cache_lookup, + ) + tokens = tf.strings.split(tokenized_words, sep=" ") + if self.compute_dtype != tf.string: + # Encode merged tokens. + tokens = self.token_to_id_map.lookup(tokens) + + # Unflatten to match input. + tokens = tf.RaggedTensor.from_row_splits( + tokens.flat_values, + tf.gather(tokens.row_splits, token_row_splits), + ) + + # Convert to a dense output if `sequence_length` is set. 
+ if self.sequence_length: + output_shape = tokens.shape.as_list() + output_shape[-1] = self.sequence_length + tokens = tokens.to_tensor(shape=output_shape) + + # Convert to a dense output if input in scalar + if scalar_input: + tokens = tf.squeeze(tokens, 0) + tf.ensure_shape(tokens, shape=[self.sequence_length]) + + return tokens diff --git a/keras_cv/tools/checkpoint_conversion/clip_weights_conversion.ipynb b/keras_cv/tools/checkpoint_conversion/clip_weights_conversion.ipynb new file mode 100644 index 0000000000..13e443669a --- /dev/null +++ b/keras_cv/tools/checkpoint_conversion/clip_weights_conversion.ipynb @@ -0,0 +1,1032 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "0DhV6hzOMY0W" + }, + "source": [ + "# Setup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "cRzYR-oFgxt1", + "outputId": "e4b01fcd-9f71-4ba7-b8a2-1796f7ef260d" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m950.8/950.8 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for keras-cv (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m415.4/415.4 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.2/5.2 MB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting keras==3.0.2\n", + " Downloading keras-3.0.2-py3-none-any.whl (1.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (1.4.0)\n", + "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (1.23.5)\n", + "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (13.7.0)\n", + "Requirement already satisfied: namex in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (0.0.7)\n", + "Requirement already satisfied: h5py in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (3.9.0)\n", + "Requirement already satisfied: dm-tree in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (0.1.8)\n", + "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras==3.0.2) (3.0.0)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras==3.0.2) (2.16.1)\n", + "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich->keras==3.0.2) (0.1.2)\n", + "Installing collected packages: keras\n", + " Attempting uninstall: keras\n", + " Found existing installation: keras 2.15.0\n", + " Uninstalling keras-2.15.0:\n", + " Successfully uninstalled keras-2.15.0\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take 
into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.0.2 which is incompatible.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed keras-3.0.2\n" + ] + } + ], + "source": [ + "!pip install -q git+https://github.com/divyashreepathihalli/keras-cv.git@CLIP_refactor\n", + "!pip install -q keras-nlp\n", + "!pip install -q tf-keras\n", + "!pip install -q tensorflow-text\n", + "!pip install keras==3.0.2" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mdGT8Em4Mc4b" + }, + "source": [ + "# Import" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GDvJmQuug4-x" + }, + "outputs": [], + "source": [ + "from keras_cv.models.feature_extractor.clip import CLIPProcessor\n", + "import keras\n", + "from keras_cv.models import CLIP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "nuFgha2jTshi", + "outputId": "b99d73eb-cc97-47d0-f46e-687c9e8b8237" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "--2024-02-02 22:19:20-- https://i.imgur.com/8H7XCH0.jpg\n", + "Resolving i.imgur.com (i.imgur.com)... 151.101.52.193\n", + "Connecting to i.imgur.com (i.imgur.com)|151.101.52.193|:443... connected.\n", + "HTTP request sent, awaiting response... 200 OK\n", + "Length: 44544 (44K) [image/jpeg]\n", + "Saving to: ‘cat.jpg’\n", + "\n", + "\rcat.jpg 0%[ ] 0 --.-KB/s \rcat.jpg 100%[===================>] 43.50K --.-KB/s in 0.01s \n", + "\n", + "2024-02-02 22:19:20 (3.58 MB/s) - ‘cat.jpg’ saved [44544/44544]\n", + "\n", + "--2024-02-02 22:19:20-- http://images.cocodataset.org/val2017/000000039769.jpg\n", + "Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.78.4, 3.5.1.13, 52.217.139.73, ...\n", + "Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.78.4|:80... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 173131 (169K) [image/jpeg]\n", + "Saving to: ‘test.jpg’\n", + "\n", + "test.jpg 100%[===================>] 169.07K --.-KB/s in 0.06s \n", + "\n", + "2024-02-02 22:19:20 (2.67 MB/s) - ‘test.jpg’ saved [173131/173131]\n", + "\n" + ] + } + ], + "source": [ + "!wget https://i.imgur.com/8H7XCH0.jpg -O cat.jpg\n", + "!wget http://images.cocodataset.org/val2017/000000039769.jpg -O test.jpg" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "X3kkmK6h_gFH" + }, + "outputs": [], + "source": [ + "# @title Select which model weights you would like to convert\n", + "MODEL_CONFIGS = {\n", + " \"CLIP_B32\": {\n", + " \"embed_dim\": 512,\n", + " \"context_length\": 77,\n", + " \"vocab_size\": 49408,\n", + " \"transformer_width\": 512,\n", + " \"transformer_heads\": 8,\n", + " \"transformer_layers\": 12,\n", + " \"vision_layers\": 12,\n", + " \"vision_width\": 768,\n", + " \"image_resolution\": 224,\n", + " \"vision_patch_size\": 32,\n", + " },\n", + " \"CLIP_B16\": {\n", + " \"embed_dim\": 512,\n", + " \"context_length\": 77,\n", + " \"vocab_size\": 49408,\n", + " \"transformer_width\": 512,\n", + " \"transformer_heads\": 8,\n", + " \"transformer_layers\": 12,\n", + " \"vision_layers\": 12,\n", + " \"vision_width\": 768,\n", + " \"image_resolution\": 224,\n", + " \"vision_patch_size\": 16,\n", + " },\n", + " \"CLIP_L14\": {\n", + " \"embed_dim\": 768,\n", + " \"context_length\": 77,\n", + " \"vocab_size\": 49408,\n", + " \"transformer_width\": 768,\n", + " \"transformer_heads\": 12,\n", + " \"transformer_layers\": 12,\n", + " \"vision_layers\": 24,\n", + " \"vision_width\": 1024,\n", + " \"image_resolution\": 224,\n", + " \"vision_patch_size\": 14,\n", + " },\n", + " \"CLIP_L14_336\": {\n", + " \"embed_dim\": 768,\n", + " \"context_length\": 77,\n", + " \"vocab_size\": 49408,\n", + " \"transformer_width\": 768,\n", + " \"transformer_heads\": 12,\n", + " \"transformer_layers\": 12,\n", + " \"vision_layers\": 24,\n", + " \"vision_width\": 1024,\n", + " \"image_resolution\": 336,\n", + " \"vision_patch_size\": 14,\n", + " },\n", + "}\n", + "model_map_hf = {\n", + " \"CLIP_B16\": \"openai/clip-vit-base-patch32\",\n", + " \"CLIP_B32\": \"openai/clip-vit-base-patch16\",\n", + " \"CLIP_L14\": \"openai/clip-vit-large-patch14\",\n", + " \"CLIP_L14_336\": \"openai/clip-vit-large-patch14-336\",\n", + "}\n", + "config_name = \"CLIP_L14_336\" # @param [\"CLIP_B16\", \"CLIP_B32\", \"CLIP_L14\", \"CLIP_L14_336\"]\n", + "config_name_hf = model_map_hf[config_name]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2l3Ll7dMMd-m" + }, + "source": [ + "# Keras 3 CLIP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "urhuhwq0Dczo" + }, + "outputs": [], + "source": [ + "embed_dim = MODEL_CONFIGS[config_name][\"embed_dim\"]\n", + "context_length = MODEL_CONFIGS[config_name][\"context_length\"]\n", + "vocab_size = MODEL_CONFIGS[config_name][\"vocab_size\"]\n", + "transformer_width = MODEL_CONFIGS[config_name][\"transformer_width\"]\n", + "transformer_heads = MODEL_CONFIGS[config_name][\"transformer_heads\"]\n", + "transformer_layers = MODEL_CONFIGS[config_name][\"transformer_layers\"]\n", + "vision_layers = MODEL_CONFIGS[config_name][\"vision_layers\"]\n", + "vision_width = MODEL_CONFIGS[config_name][\"vision_width\"]\n", + "vision_patch_size = MODEL_CONFIGS[config_name][\"vision_patch_size\"]\n", + "image_resolution = MODEL_CONFIGS[config_name][\"image_resolution\"]\n", + "model = CLIP(\n", + " 
embed_dim,\n", + " image_resolution,\n", + " vision_layers,\n", + " vision_width,\n", + " vision_patch_size,\n", + " context_length,\n", + " vocab_size,\n", + " transformer_width,\n", + " transformer_heads,\n", + " transformer_layers,\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 193 + }, + "id": "uE6x7gfqa3Ee", + "outputId": "9a080569-7ab9-49ad-8589-87f335ef2f31" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
Model: \"clip\"\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1mModel: \"clip\"\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n",
+       "┃ Layer (type)                        Output Shape                       Param # ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n",
+       "│ image_encoder (CLIPImageEncoder)   │ ?                             │ 0 (unbuilt) │\n",
+       "├────────────────────────────────────┼───────────────────────────────┼─────────────┤\n",
+       "│ text_encoder (CLIPTextEncoder)     │ ?                             │ 0 (unbuilt) │\n",
+       "└────────────────────────────────────┴───────────────────────────────┴─────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n", + "│ image_encoder (\u001b[38;5;33mCLIPImageEncoder\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n", + "├────────────────────────────────────┼───────────────────────────────┼─────────────┤\n", + "│ text_encoder (\u001b[38;5;33mCLIPTextEncoder\u001b[0m) │ ? │ \u001b[38;5;34m0\u001b[0m (unbuilt) │\n", + "└────────────────────────────────────┴───────────────────────────────┴─────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Total params: 39,425 (154.00 KB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m39,425\u001b[0m (154.00 KB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Trainable params: 39,425 (154.00 KB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m39,425\u001b[0m (154.00 KB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Non-trainable params: 0 (0.00 B)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "buXKlNfGTenW" + }, + "outputs": [], + "source": [ + "processor = CLIPProcessor(224, \"vocab.json\", \"merges.txt\")\n", + "image = processor.process_images([\"cat.jpg\"])\n", + "text_input = [\n", + " \"photo of a cat on a tortoise\",\n", + " \"tortoise on a dog\",\n", + " \"a photo of a tortoise\",\n", + "]\n", + "text = processor.process_texts(text_input)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "BHSpMv0PT5SX", + "outputId": "566c92c4-fbf3-4e2d-87f1-6112b2cff96f" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "tf.Tensor([[ 0.42190465 0.6262117 -0.2368357 ]], shape=(1, 3), dtype=float32)\n", + "tortoise on a dog\n" + ] + } + ], + "source": [ + "image_logits, text_logits = model(image, text)\n", + "output = keras.layers.Softmax()(image_logits)\n", + "print(image_logits)\n", + "print(text_input[keras.ops.argmax(output)])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 193 + }, + "id": "GgNBvYCTtmA3", + "outputId": "35b9a26c-325e-4535-c33b-3f67ab112e19" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
Model: \"clip\"\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1mModel: \"clip\"\u001b[0m\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n",
+       "┃ Layer (type)                        Output Shape                       Param # ┃\n",
+       "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n",
+       "│ image_encoder (CLIPImageEncoder)   │ ?                             │  87,849,216 │\n",
+       "├────────────────────────────────────┼───────────────────────────────┼─────────────┤\n",
+       "│ text_encoder (CLIPTextEncoder)     │ ?                             │  63,428,096 │\n",
+       "└────────────────────────────────────┴───────────────────────────────┴─────────────┘\n",
+       "
\n" + ], + "text/plain": [ + "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n", + "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", + "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n", + "│ image_encoder (\u001b[38;5;33mCLIPImageEncoder\u001b[0m) │ ? │ \u001b[38;5;34m87,849,216\u001b[0m │\n", + "├────────────────────────────────────┼───────────────────────────────┼─────────────┤\n", + "│ text_encoder (\u001b[38;5;33mCLIPTextEncoder\u001b[0m) │ ? │ \u001b[38;5;34m63,428,096\u001b[0m │\n", + "└────────────────────────────────────┴───────────────────────────────┴─────────────┘\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Total params: 151,277,313 (577.08 MB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m151,277,313\u001b[0m (577.08 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Trainable params: 151,277,313 (577.08 MB)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m151,277,313\u001b[0m (577.08 MB)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
 Non-trainable params: 0 (0.00 B)\n",
+       "
\n" + ], + "text/plain": [ + "\u001b[1m Non-trainable params: \u001b[0m\u001b[38;5;34m0\u001b[0m (0.00 B)\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "P8DWYq_hVFnz" + }, + "source": [ + "# HF CLIP" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3W2prd6C0pxe" + }, + "outputs": [], + "source": [ + "from PIL import Image\n", + "import requests\n", + "\n", + "from transformers import CLIPProcessor as CP\n", + "from transformers import CLIPModel as CM" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "EntuvOq1MhwU", + "outputId": "e154a367-2f94-4fa1-e97d-d2f32db7a2cf" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config[\"id2label\"]` will be overriden.\n", + "`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config[\"bos_token_id\"]` will be overriden.\n", + "`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config[\"eos_token_id\"]` will be overriden.\n" + ] + } + ], + "source": [ + "model_hf = CM.from_pretrained(config_name_hf)\n", + "processor = CP.from_pretrained(config_name_hf)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ep8DRTkv3AwS", + "outputId": "770756bc-8829-484f-b6e5-763fe81e24d0" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "tensor([[0.9957, 0.0023, 0.0020]], grad_fn=)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "url = \"https://i.imgur.com/8H7XCH0.jpg\"\n", + "image_hf = Image.open(requests.get(url, stream=True).raw)\n", + "text_inputs = [\n", + " \"photo of a cat on a tortoise\",\n", + " \"tortoise on a dog\",\n", + " \"a photo of a tortoise\",\n", + "]\n", + "inputs = processor(\n", + " text=text_inputs, images=image_hf, return_tensors=\"pt\", padding=True\n", + ")\n", + "\n", + "outputs = model_hf(**inputs)\n", + "logits_per_image = (\n", + " outputs.logits_per_image\n", + ") # this is the image-text similarity score\n", + "probs = logits_per_image.softmax(\n", + " dim=1\n", + ") # we can take the softmax to get the label probabilitiesprobs\n", + "probs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wPa0cVnY3cBC" + }, + "outputs": [], + "source": [ + "# hugging face weights\n", + "hf_wts = model_hf.state_dict()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ArkCHlVZVKfM" + }, + "source": [ + "# Copy weights" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "TUCpKltRG4Gd" + }, + "source": [ + "##vision encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tn_U02N7U2VN" + }, + "outputs": [], + "source": [ + "model.logit_scale.assign(hf_wts.pop(\"logit_scale\").numpy())\n", + "model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_patching_and_embedding\"\n", + ").class_embedding.assign(\n", + " hf_wts.pop(\"vision_model.embeddings.class_embedding\").numpy()\n", + ")\n", + "model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_patching_and_embedding\"\n", + 
").positional_embedding.assign(\n", + " hf_wts.pop(\"vision_model.embeddings.position_embedding.weight\").numpy()\n", + ")\n", + "model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_patching_and_embedding\"\n", + ").conv1.weights[0].assign(\n", + " hf_wts.pop(\"vision_model.embeddings.patch_embedding.weight\")\n", + " .permute(3, 2, 1, 0)\n", + " .numpy()\n", + ")\n", + "model.get_layer(\"image_encoder\").get_layer(\"ln_1\").weights[0].assign(\n", + " hf_wts.pop(\"vision_model.pre_layrnorm.weight\").numpy()\n", + ")\n", + "model.get_layer(\"image_encoder\").get_layer(\"ln_1\").weights[1].assign(\n", + " hf_wts.pop(\"vision_model.pre_layrnorm.bias\").numpy()\n", + ")\n", + "model.get_layer(\"image_encoder\").get_layer(\"ln_2\").weights[0].assign(\n", + " hf_wts.pop(\"vision_model.post_layernorm.weight\").numpy()\n", + ")\n", + "model.get_layer(\"image_encoder\").get_layer(\"ln_2\").weights[1].assign(\n", + " hf_wts.pop(\"vision_model.post_layernorm.bias\").numpy()\n", + ")\n", + "model.get_layer(\"image_encoder\").get_layer(\"vision_projector\").weights[\n", + " 0\n", + "].assign(hf_wts.pop(\"visual_projection.weight\").transpose(1, 0).numpy())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "qptfuWobZcbT" + }, + "outputs": [], + "source": [ + "for i in range(0, MODEL_CONFIGS[config_name][\"vision_layers\"]):\n", + " if i == 0:\n", + " residual_attention = f\"residual_attention\"\n", + " else:\n", + " residual_attention = f\"residual_attention_{i}\"\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).attn.q_proj.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.q_proj.weight\")\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).attn.q_proj.weights[1].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.q_proj.bias\")\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).attn.k_proj.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.k_proj.weight\")\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).attn.k_proj.weights[1].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.k_proj.bias\")\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).attn.v_proj.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.v_proj.weight\")\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).attn.v_proj.weights[1].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.v_proj.bias\")\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).attn.out_proj.weights[1].assign(\n", + " hf_wts.pop(\n", + " f\"vision_model.encoder.layers.{i}.self_attn.out_proj.bias\"\n", + " ).numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).attn.out_proj.weights[0].assign(\n", + " hf_wts.pop(\n", + " 
f\"vision_model.encoder.layers.{i}.self_attn.out_proj.weight\"\n", + " ).numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).ln_1.weights[0].assign(\n", + " hf_wts.pop(\n", + " f\"vision_model.encoder.layers.{i}.layer_norm1.weight\"\n", + " ).numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).ln_1.weights[1].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.layer_norm1.bias\").numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).ln_2.weights[0].assign(\n", + " hf_wts.pop(\n", + " f\"vision_model.encoder.layers.{i}.layer_norm2.weight\"\n", + " ).numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).ln_2.weights[1].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.layer_norm2.bias\").numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).mlp.get_layer(\"c_fc\").weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc1.weight\")\n", + " .transpose(1, 0)\n", + " .numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).mlp.get_layer(\"c_fc\").weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc1.bias\").numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).mlp.get_layer(\"c_proj\").weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc2.weight\")\n", + " .transpose(1, 0)\n", + " .numpy()\n", + " )\n", + " model.get_layer(\"image_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(residual_attention).mlp.get_layer(\"c_proj\").weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc2.bias\").numpy()\n", + " )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1RN2aVrYG8T3" + }, + "source": [ + "## Text encoder" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5FtDROnynb0N" + }, + "outputs": [], + "source": [ + "num_transformer_layers = MODEL_CONFIGS[config_name][\"vision_layers\"]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_1AD7TcbdWEC" + }, + "outputs": [], + "source": [ + "model.get_layer(\"text_encoder\").get_layer(\"text_projector\").weights[0].assign(\n", + " hf_wts.pop(\"text_projection.weight\").numpy()\n", + ")\n", + "model.get_layer(\"text_encoder\").get_layer(\"token_embedding\").weights[0].assign(\n", + " hf_wts.pop(\"text_model.embeddings.token_embedding.weight\").numpy()\n", + ")\n", + "model.get_layer(\"text_encoder\").positional_embedding.assign(\n", + " hf_wts.pop(\"text_model.embeddings.position_embedding.weight\").numpy()\n", + ")\n", + "model.get_layer(\"text_encoder\").get_layer(\"ln_final\").weights[0].assign(\n", + " hf_wts.pop(\"text_model.final_layer_norm.weight\")\n", + ")\n", + "model.get_layer(\"text_encoder\").get_layer(\"ln_final\").weights[1].assign(\n", + " hf_wts.pop(\"text_model.final_layer_norm.bias\")\n", + 
")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "s6leOiFO6V2U" + }, + "outputs": [], + "source": [ + "for i in range(MODEL_CONFIGS[config_name][\"transformer_layers\"]):\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).attn.k_proj.weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.k_proj.weight\")\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).attn.k_proj.weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.k_proj.bias\")\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).attn.q_proj.weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.q_proj.weight\")\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).attn.q_proj.weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.q_proj.bias\")\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).attn.v_proj.weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.v_proj.weight\")\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).attn.v_proj.weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.v_proj.bias\")\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).attn.out_proj.weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.out_proj.weight\")\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).attn.out_proj.weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.out_proj.bias\")\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).ln_1.weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.layer_norm1.weight\").numpy()\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).ln_1.weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.layer_norm1.bias\").numpy()\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " 
).ln_2.weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.layer_norm2.weight\").numpy()\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).ln_2.weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.layer_norm2.bias\").numpy()\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).mlp.get_layer(\n", + " \"c_fc\"\n", + " ).weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc1.weight\")\n", + " .transpose(1, 0)\n", + " .numpy()\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).mlp.get_layer(\n", + " \"c_fc\"\n", + " ).weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc1.bias\").numpy()\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).mlp.get_layer(\n", + " \"c_proj\"\n", + " ).weights[\n", + " 0\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc2.weight\")\n", + " .transpose(1, 0)\n", + " .numpy()\n", + " )\n", + " model.get_layer(\"text_encoder\").get_layer(\n", + " \"clip_encoder\"\n", + " ).resblocks.get_layer(\n", + " f\"residual_attention_{num_transformer_layers+i}\"\n", + " ).mlp.get_layer(\n", + " \"c_proj\"\n", + " ).weights[\n", + " 1\n", + " ].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc2.bias\").numpy()\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Bgen7hxCCeZ7", + "outputId": "c777d6f1-4aa7-4f3e-8fd7-759364364c44" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "odict_keys([])" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# verify that we copied all weights\n", + "hf_wts.keys()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wlfDdO-mid62" + }, + "source": [ + "# save weights" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "QscCUUZFiqBV" + }, + "outputs": [], + "source": [ + "model.save_weights(\"clip-vit-base-patch32.weights.h5\")" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/requirements-common.txt b/requirements-common.txt index fc21cc5f96..29f7ee9a19 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -13,4 +13,4 @@ isort black pytest build -namex \ No newline at end of file +namex From cc450d7fc9773ce7231b4f53726a988f91d5e2e1 Mon Sep 17 00:00:00 2001 From: Pranav Prajapati <94780581+pranavvp16@users.noreply.github.com> Date: Fri, 23 Feb 2024 00:12:07 +0530 Subject: [PATCH 24/30] Migrate VGG16 from legacy to backbone (#2341) * Add VGG16 to backbone from legacy * Add backbone tests * Add model to __init__.py * Fix code format for vgg16 backbone --- keras_cv/models/__init__.py | 1 + 
keras_cv/models/backbones/vgg16/__init__.py | 13 ++ .../models/backbones/vgg16/vgg16_backbone.py | 219 ++++++++++++++++++ .../backbones/vgg16/vgg16_backbone_test.py | 75 ++++++ 4 files changed, 308 insertions(+) create mode 100644 keras_cv/models/backbones/vgg16/__init__.py create mode 100644 keras_cv/models/backbones/vgg16/vgg16_backbone.py create mode 100644 keras_cv/models/backbones/vgg16/vgg16_backbone_test.py diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index 8e6a849a95..77c3ad33d9 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -178,6 +178,7 @@ from keras_cv.models.backbones.resnet_v2.resnet_v2_backbone import ( ResNetV2Backbone, ) +from keras_cv.models.backbones.vgg16.vgg16_backbone import VGG16Backbone from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetBBackbone from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetHBackbone from keras_cv.models.backbones.vit_det.vit_det_aliases import ViTDetLBackbone diff --git a/keras_cv/models/backbones/vgg16/__init__.py b/keras_cv/models/backbones/vgg16/__init__.py new file mode 100644 index 0000000000..3992ffb59a --- /dev/null +++ b/keras_cv/models/backbones/vgg16/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/keras_cv/models/backbones/vgg16/vgg16_backbone.py b/keras_cv/models/backbones/vgg16/vgg16_backbone.py new file mode 100644 index 0000000000..901ab0d582 --- /dev/null +++ b/keras_cv/models/backbones/vgg16/vgg16_backbone.py @@ -0,0 +1,219 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras import layers + +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone + + +class VGG16Backbone(Backbone): + """ + Reference: + - [Very Deep Convolutional Networks for Large-Scale Image Recognition](https://arxiv.org/abs/1409.1556) + (ICLR 2015) + This class represents Keras Backbone of VGG16 model. + Args: + include_rescaling: bool, whether to rescale the inputs. If set to + True, inputs will be passed through a `Rescaling(1/255.0)` layer. + include_top: bool, whether to include the 3 fully-connected + layers at the top of the network. If provided, num_classes must be + provided. + num_classes: int, optional number of classes to classify images into, + only to be specified if `include_top` is True. + input_shape: tuple, optional shape tuple, defaults to (224, 224, 3). 
+ input_tensor: Tensor, optional Keras tensor (i.e. output of + `layers.Input()`) to use as image input for the model. + pooling: bool, Optional pooling mode for feature extraction + when `include_top` is `False`. + - `None` means that the output of the model will be + the 4D tensor output of the + last convolutional block. + - `avg` means that global average pooling + will be applied to the output of the + last convolutional block, and thus + the output of the model will be a 2D tensor. + - `max` means that global max pooling will + be applied. + classifier_activation:`str` or callable. The activation function to use + on the "top" layer. Ignored unless `include_top=True`. Set + `classifier_activation=None` to return the logits of the "top" layer. + When loading pretrained weights, `classifier_activation` can only + be `None` or `"softmax"`. + name: (Optional) name to pass to the model, defaults to "VGG16". + Returns: + A `keras.Model` instance. + """ # noqa: E501 + + def __init__( + self, + include_rescaling, + include_top, + input_tensor=None, + num_classes=None, + input_shape=(224, 224, 3), + pooling=None, + classifier_activation="softmax", + name="VGG16", + **kwargs, + ): + + if include_top and num_classes is None: + raise ValueError( + "If `include_top` is True, you should specify `num_classes`. " + f"Received: num_classes={num_classes}" + ) + + if include_top and pooling: + raise ValueError( + f"`pooling` must be `None` when `include_top=True`." + f"Received pooling={pooling} and include_top={include_top}. " + ) + + img_input = utils.parse_model_inputs(input_shape, input_tensor) + x = img_input + + if include_rescaling: + x = layers.Rescaling(scale=1 / 255.0)(x) + + x = apply_vgg_block( + x=x, + num_layers=2, + filters=64, + kernel_size=(3, 3), + activation="relu", + padding="same", + max_pool=True, + name="block1", + ) + + x = apply_vgg_block( + x=x, + num_layers=2, + filters=128, + kernel_size=(3, 3), + activation="relu", + padding="same", + max_pool=True, + name="block2", + ) + + x = apply_vgg_block( + x=x, + num_layers=3, + filters=256, + kernel_size=(3, 3), + activation="relu", + padding="same", + max_pool=True, + name="block3", + ) + + x = apply_vgg_block( + x=x, + num_layers=3, + filters=512, + kernel_size=(3, 3), + activation="relu", + padding="same", + max_pool=True, + name="block4", + ) + + x = apply_vgg_block( + x=x, + num_layers=3, + filters=512, + kernel_size=(3, 3), + activation="relu", + padding="same", + max_pool=True, + name="block5", + ) + + if include_top: + x = layers.Flatten(name="flatten")(x) + x = layers.Dense(4096, activation="relu", name="fc1")(x) + x = layers.Dense(4096, activation="relu", name="fc2")(x) + x = layers.Dense( + num_classes, + activation=classifier_activation, + name="predictions", + )(x) + else: + if pooling == "avg": + x = layers.GlobalAveragePooling2D()(x) + elif pooling == "max": + x = layers.GlobalMaxPooling2D()(x) + + super().__init__(inputs=img_input, outputs=x, name=name, **kwargs) + + self.include_rescaling = include_rescaling + self.include_top = include_top + self.num_classes = num_classes + self.input_tensor = input_tensor + self.pooling = pooling + self.classifier_activation = classifier_activation + + def get_config(self): + return { + "include_rescaling": self.include_rescaling, + "include_top": self.include_top, + "name": self.name, + "input_shape": self.input_shape[1:], + "input_tensor": self.input_tensor, + "pooling": self.pooling, + "num_classes": self.num_classes, + "classifier_activation": self.classifier_activation, + 
"trainable": self.trainable, + } + + +def apply_vgg_block( + x, + num_layers, + filters, + kernel_size, + activation, + padding, + max_pool, + name, +): + """ + Applies VGG block + Args: + x: Tensor, input tensor to pass through network + num_layers: int, number of CNN layers in the block + filters: int, filter size of each CNN layer in block + kernel_size: int (or) tuple, kernel size for CNN layer in block + activation: str (or) callable, activation function for each CNN layer in + block + padding: str (or) callable, padding function for each CNN layer in block + max_pool: bool, whether to add MaxPooling2D layer at end of block + name: str, name of the block + + Returns: + keras.KerasTensor + """ + for num in range(1, num_layers + 1): + x = layers.Conv2D( + filters, + kernel_size, + activation=activation, + padding=padding, + name=f"{name}_conv{num}", + )(x) + if max_pool: + x = layers.MaxPooling2D((2, 2), (2, 2), name=f"{name}_pool")(x) + return x diff --git a/keras_cv/models/backbones/vgg16/vgg16_backbone_test.py b/keras_cv/models/backbones/vgg16/vgg16_backbone_test.py new file mode 100644 index 0000000000..d7a8c9724f --- /dev/null +++ b/keras_cv/models/backbones/vgg16/vgg16_backbone_test.py @@ -0,0 +1,75 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os + +import numpy as np +import pytest + +from keras_cv.backend import keras +from keras_cv.models import VGG16Backbone +from keras_cv.tests.test_case import TestCase + + +class VGG16BackboneTest(TestCase): + def setUp(self): + self.img_input = np.ones((2, 224, 224, 3), dtype="float32") + + def test_valid_call(self): + model = VGG16Backbone( + input_shape=(224, 224, 3), + include_top=False, + include_rescaling=False, + pooling="avg", + ) + model(self.img_input) + + def test_valid_call_with_rescaling(self): + model = VGG16Backbone( + input_shape=(224, 224, 3), + include_top=False, + include_rescaling=True, + pooling="avg", + ) + model(self.img_input) + + def test_valid_call_with_top(self): + model = VGG16Backbone( + input_shape=(224, 224, 3), + include_top=True, + include_rescaling=False, + num_classes=2, + ) + model(self.img_input) + + @pytest.mark.large + def test_saved_model(self): + model = VGG16Backbone( + input_shape=(224, 224, 3), + include_top=False, + include_rescaling=False, + num_classes=2, + pooling="avg", + ) + model_output = model(self.img_input) + save_path = os.path.join(self.get_temp_dir(), "vgg16.keras") + model.save(save_path) + restored_model = keras.models.load_model(save_path) + + # Check the restored model is instance of VGG16Backbone + self.assertIsInstance(restored_model, VGG16Backbone) + + # Check if the restored model gives the same output + restored_model_output = restored_model(self.img_input) + self.assertAllClose(model_output, restored_model_output) From 2f5e0574772505b9eab721dbc011c7a2f39a960d Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Thu, 22 Feb 2024 16:26:16 -0800 Subject: [PATCH 25/30] CLIP : Enable CLIP.from_preset() Kaggle tests (#2357) * update kaggle handle and weights conversion file * code reformat --------- Co-authored-by: Divyashree Sreepathihalli --- .../feature_extractor/clip/clip_model.py | 2 +- .../feature_extractor/clip/clip_model_test.py | 10 +- .../feature_extractor/clip/clip_presets.py | 8 +- .../clip_weights_conversion.ipynb | 3568 +++++++++++++++-- 4 files changed, 3233 insertions(+), 355 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_model.py b/keras_cv/models/feature_extractor/clip/clip_model.py index e81dbd5d09..c3e6d49caf 100644 --- a/keras_cv/models/feature_extractor/clip/clip_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_model.py @@ -72,7 +72,7 @@ def __init__( vision_patch_size=32, context_length=77, vocab_size=49408, - transformer_width=768, + transformer_width=512, transformer_heads=8, transformer_layers=12, **kwargs, diff --git a/keras_cv/models/feature_extractor/clip/clip_model_test.py b/keras_cv/models/feature_extractor/clip/clip_model_test.py index d5c777c653..14304b73ef 100644 --- a/keras_cv/models/feature_extractor/clip/clip_model_test.py +++ b/keras_cv/models/feature_extractor/clip/clip_model_test.py @@ -52,9 +52,9 @@ def test_clip_model_golden_values(self): processed_image, processed_text, attention_mask ) print(image_logits) - self.assertAllClose(image_logits, [[2.932678, 2.932678, 2.932675]]) + self.assertAllClose(image_logits, [[1.896713, 1.896713, 1.896713]]) self.assertAllClose( - text_logits, ops.transpose([[2.932678, 2.932678, 2.932675]]) + text_logits, ops.transpose([[1.896713, 1.896713, 1.896713]]) ) def test_clip_preprocessor(self): @@ -77,8 +77,8 @@ def test_clip_preprocessor_tf_data(self): @pytest.mark.large def test_presets(self): - self.skipTest("TODO: Enable after Kaggle model is public") - model = CLIP.from_preset("clip-vit-base-patch32") 
+ # self.skipTest("TODO: Enable after Kaggle model is public") + model = CLIP.from_preset("clip-vit-base-patch16") processed_image = np.ones(shape=[1, 224, 224, 3]) processed_text = np.ones(shape=[3, 77]) attention_mask = np.ones(shape=[3, 77]) @@ -109,7 +109,7 @@ def test_text_encoder_golden_values(self): print(model.text_embeddings) self.assertAllClose( model.text_embeddings[0, :3], - [-0.018502, 0.000906, 0.020372], + [0.007531, -0.038361, -0.035686], ) @pytest.mark.large # Saving is slow, so mark these large. diff --git a/keras_cv/models/feature_extractor/clip/clip_presets.py b/keras_cv/models/feature_extractor/clip/clip_presets.py index 6b4d98727e..656c9ad8ed 100644 --- a/keras_cv/models/feature_extractor/clip/clip_presets.py +++ b/keras_cv/models/feature_extractor/clip/clip_presets.py @@ -28,7 +28,7 @@ "official_name": "CLIP", "path": "clip", }, - "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-base-patch16/2", + "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-base-patch16/4", }, "clip-vit-base-patch32": { "metadata": { @@ -44,7 +44,7 @@ "official_name": "CLIP", "path": "clip", }, - "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-base-patch32/2", + "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-base-patch32/4", }, "clip-vit-large-patch14": { "metadata": { @@ -60,7 +60,7 @@ "official_name": "CLIP", "path": "clip", }, - "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-large-patch14/2", + "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-large-patch14/4", }, "clip-vit-large-patch14-336": { "metadata": { @@ -76,6 +76,6 @@ "official_name": "CLIP", "path": "clip", }, - "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-large-patch14-336/2", # noqa: E501 + "kaggle_handle": "kaggle://keras/clip/keras/clip-vit-large-patch14-336/4", # noqa: E501 }, } diff --git a/keras_cv/tools/checkpoint_conversion/clip_weights_conversion.ipynb b/keras_cv/tools/checkpoint_conversion/clip_weights_conversion.ipynb index 13e443669a..ff3bb4c991 100644 --- a/keras_cv/tools/checkpoint_conversion/clip_weights_conversion.ipynb +++ b/keras_cv/tools/checkpoint_conversion/clip_weights_conversion.ipynb @@ -17,7 +17,7 @@ "base_uri": "https://localhost:8080/" }, "id": "cRzYR-oFgxt1", - "outputId": "e4b01fcd-9f71-4ba7-b8a2-1796f7ef260d" + "outputId": "80b8db20-da09-43bd-9b70-fad93b1e1ca1" }, "outputs": [ { @@ -27,30 +27,14 @@ " Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m950.8/950.8 kB\u001b[0m \u001b[31m5.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m950.8/950.8 kB\u001b[0m \u001b[31m6.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", "\u001b[?25h Building wheel for keras-cv (pyproject.toml) ... 
\u001b[?25l\u001b[?25hdone\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m415.4/415.4 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.2/5.2 MB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hCollecting keras==3.0.2\n", - " Downloading keras-3.0.2-py3-none-any.whl (1.0 MB)\n", - "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m8.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", - "\u001b[?25hRequirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (1.4.0)\n", - "Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (1.23.5)\n", - "Requirement already satisfied: rich in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (13.7.0)\n", - "Requirement already satisfied: namex in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (0.0.7)\n", - "Requirement already satisfied: h5py in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (3.9.0)\n", - "Requirement already satisfied: dm-tree in /usr/local/lib/python3.10/dist-packages (from keras==3.0.2) (0.1.8)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras==3.0.2) (3.0.0)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /usr/local/lib/python3.10/dist-packages (from rich->keras==3.0.2) (2.16.1)\n", - "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.10/dist-packages (from markdown-it-py>=2.2.0->rich->keras==3.0.2) (0.1.2)\n", - "Installing collected packages: keras\n", - " Attempting uninstall: keras\n", - " Found existing installation: keras 2.15.0\n", - " Uninstalling keras-2.15.0:\n", - " Successfully uninstalled keras-2.15.0\n", - "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m465.2/465.2 kB\u001b[0m \u001b[31m3.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.2/5.2 MB\u001b[0m \u001b[31m36.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.0/1.0 MB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", "tensorflow 2.15.0 requires keras<2.16,>=2.15.0, but you have keras 3.0.2 which is incompatible.\u001b[0m\u001b[31m\n", - "\u001b[0mSuccessfully installed keras-3.0.2\n" + "\u001b[0m" ] } ], @@ -59,29 +43,7 @@ "!pip install -q keras-nlp\n", "!pip install -q tf-keras\n", "!pip install -q tensorflow-text\n", - "!pip install keras==3.0.2" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mdGT8Em4Mc4b" - }, - "source": [ - "# Import" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "GDvJmQuug4-x" - }, - "outputs": [], - "source": [ - "from keras_cv.models.feature_extractor.clip import CLIPProcessor\n", - "import keras\n", - "from keras_cv.models import CLIP" + "!pip install -q keras==3.0.2" ] }, { @@ -92,48 +54,94 @@ "base_uri": "https://localhost:8080/" }, "id": "nuFgha2jTshi", - "outputId": "b99d73eb-cc97-47d0-f46e-687c9e8b8237" + "outputId": "63d4160e-42b3-4f6b-e672-ba30c9402d25" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "--2024-02-02 22:19:20-- https://i.imgur.com/8H7XCH0.jpg\n", - "Resolving i.imgur.com (i.imgur.com)... 151.101.52.193\n", - "Connecting to i.imgur.com (i.imgur.com)|151.101.52.193|:443... connected.\n", + "--2024-02-21 20:54:06-- https://i.imgur.com/8H7XCH0.jpg\n", + "Resolving i.imgur.com (i.imgur.com)... 146.75.76.193\n", + "Connecting to i.imgur.com (i.imgur.com)|146.75.76.193|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 44544 (44K) [image/jpeg]\n", "Saving to: ‘cat.jpg’\n", "\n", "\rcat.jpg 0%[ ] 0 --.-KB/s \rcat.jpg 100%[===================>] 43.50K --.-KB/s in 0.01s \n", "\n", - "2024-02-02 22:19:20 (3.58 MB/s) - ‘cat.jpg’ saved [44544/44544]\n", + "2024-02-21 20:54:06 (4.16 MB/s) - ‘cat.jpg’ saved [44544/44544]\n", "\n", - "--2024-02-02 22:19:20-- http://images.cocodataset.org/val2017/000000039769.jpg\n", - "Resolving images.cocodataset.org (images.cocodataset.org)... 52.216.78.4, 3.5.1.13, 52.217.139.73, ...\n", - "Connecting to images.cocodataset.org (images.cocodataset.org)|52.216.78.4|:80... connected.\n", + "--2024-02-21 20:54:06-- http://images.cocodataset.org/val2017/000000039769.jpg\n", + "Resolving images.cocodataset.org (images.cocodataset.org)... 52.217.206.137, 16.182.42.89, 54.231.201.177, ...\n", + "Connecting to images.cocodataset.org (images.cocodataset.org)|52.217.206.137|:80... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 173131 (169K) [image/jpeg]\n", - "Saving to: ‘test.jpg’\n", + "Saving to: ‘two_cats.jpg’\n", + "\n", + "two_cats.jpg 100%[===================>] 169.07K --.-KB/s in 0.09s \n", "\n", - "test.jpg 100%[===================>] 169.07K --.-KB/s in 0.06s \n", + "2024-02-21 20:54:07 (1.77 MB/s) - ‘two_cats.jpg’ saved [173131/173131]\n", "\n", - "2024-02-02 22:19:20 (2.67 MB/s) - ‘test.jpg’ saved [173131/173131]\n", + "--2024-02-21 20:54:07-- https://i.imgur.com/PpgZzP4.jpeg\n", + "Resolving i.imgur.com (i.imgur.com)... 146.75.76.193\n", + "Connecting to i.imgur.com (i.imgur.com)|146.75.76.193|:443... connected.\n", + "HTTP request sent, awaiting response... 
200 OK\n", + "Length: 1610285 (1.5M) [image/jpeg]\n", + "Saving to: ‘mountain.jpg’\n", + "\n", + "mountain.jpg 100%[===================>] 1.54M --.-KB/s in 0.06s \n", + "\n", + "2024-02-21 20:54:07 (27.6 MB/s) - ‘mountain.jpg’ saved [1610285/1610285]\n", "\n" ] } ], "source": [ "!wget https://i.imgur.com/8H7XCH0.jpg -O cat.jpg\n", - "!wget http://images.cocodataset.org/val2017/000000039769.jpg -O test.jpg" + "!wget http://images.cocodataset.org/val2017/000000039769.jpg -O two_cats.jpg\n", + "!wget https://i.imgur.com/PpgZzP4.jpeg -O mountain.jpg" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mdGT8Em4Mc4b" + }, + "source": [ + "# Import" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0mtj1abS2cVf" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"torch\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "GDvJmQuug4-x" + }, + "outputs": [], + "source": [ + "from keras_cv.models.feature_extractor.clip import CLIPProcessor\n", + "import keras\n", + "from keras_cv.models import CLIP" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "cellView": "form", "id": "X3kkmK6h_gFH" }, "outputs": [], @@ -190,8 +198,8 @@ " },\n", "}\n", "model_map_hf = {\n", - " \"CLIP_B16\": \"openai/clip-vit-base-patch32\",\n", - " \"CLIP_B32\": \"openai/clip-vit-base-patch16\",\n", + " \"CLIP_B16\": \"openai/clip-vit-base-patch16\",\n", + " \"CLIP_B32\": \"openai/clip-vit-base-patch32\",\n", " \"CLIP_L14\": \"openai/clip-vit-large-patch14\",\n", " \"CLIP_L14_336\": \"openai/clip-vit-large-patch14-336\",\n", "}\n", @@ -249,7 +257,7 @@ "height": 193 }, "id": "uE6x7gfqa3Ee", - "outputId": "9a080569-7ab9-49ad-8589-87f335ef2f31" + "outputId": "f55fc358-04a4-42ce-c397-3f81a238ab1e" }, "outputs": [ { @@ -293,11 +301,11 @@ { "data": { "text/html": [ - "
 Total params: 39,425 (154.00 KB)\n",
+       "
 Total params: 1 (4.00 B)\n",
        "
\n" ], "text/plain": [ - "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m39,425\u001b[0m (154.00 KB)\n" + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m1\u001b[0m (4.00 B)\n" ] }, "metadata": {}, @@ -306,11 +314,11 @@ { "data": { "text/html": [ - "
 Trainable params: 39,425 (154.00 KB)\n",
+       "
 Trainable params: 1 (4.00 B)\n",
        "
\n" ], "text/plain": [ - "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m39,425\u001b[0m (154.00 KB)\n" + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m1\u001b[0m (4.00 B)\n" ] }, "metadata": {}, @@ -342,14 +350,23 @@ }, "outputs": [], "source": [ - "processor = CLIPProcessor(224, \"vocab.json\", \"merges.txt\")\n", - "image = processor.process_images([\"cat.jpg\"])\n", - "text_input = [\n", - " \"photo of a cat on a tortoise\",\n", - " \"tortoise on a dog\",\n", - " \"a photo of a tortoise\",\n", - "]\n", - "text = processor.process_texts(text_input)" + "processor = CLIPProcessor(\n", + " MODEL_CONFIGS[config_name][\"image_resolution\"], \"vocab.json\", \"merges.txt\"\n", + ")\n", + "image = processor.process_images([\"two_cats.jpg\"])\n", + "text_input = [\"mountains\", \"cat on tortoise\", \"two cats\"]\n", + "text, attention_mask = processor.process_texts(text_input)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BHSpMv0PT5SX" + }, + "outputs": [], + "source": [ + "image_logits, text_logits = model(image, text, attention_mask)" ] }, { @@ -359,24 +376,23 @@ "colab": { "base_uri": "https://localhost:8080/" }, - "id": "BHSpMv0PT5SX", - "outputId": "566c92c4-fbf3-4e2d-87f1-6112b2cff96f" + "id": "JPn0gACJjKy5", + "outputId": "cbc7313a-4ddd-4021-9e84-fa668987849d" }, "outputs": [ { - "name": "stdout", - "output_type": "stream", - "text": [ - "tf.Tensor([[ 0.42190465 0.6262117 -0.2368357 ]], shape=(1, 3), dtype=float32)\n", - "tortoise on a dog\n" - ] + "data": { + "text/plain": [ + "tensor([[3.7318, 3.7792, 3.7633]], grad_fn=)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "image_logits, text_logits = model(image, text)\n", - "output = keras.layers.Softmax()(image_logits)\n", - "print(image_logits)\n", - "print(text_input[keras.ops.argmax(output)])" + "image_logits" ] }, { @@ -388,7 +404,7 @@ "height": 193 }, "id": "GgNBvYCTtmA3", - "outputId": "35b9a26c-325e-4535-c33b-3f67ab112e19" + "outputId": "a667a9e5-397e-4299-fdc1-8899621112ad" }, "outputs": [ { @@ -410,9 +426,9 @@ "
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n",
        "┃ Layer (type)                        Output Shape                       Param # ┃\n",
        "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n",
-       "│ image_encoder (CLIPImageEncoder)   │ ?                             │  87,849,216 │\n",
+       "│ image_encoder (CLIPImageEncoder)   │ ?                             │ 304,293,888 │\n",
        "├────────────────────────────────────┼───────────────────────────────┼─────────────┤\n",
-       "│ text_encoder (CLIPTextEncoder)     │ ?                             │  63,428,096 │\n",
+       "│ text_encoder (CLIPTextEncoder)     │ ?                             │ 123,650,304 │\n",
        "└────────────────────────────────────┴───────────────────────────────┴─────────────┘\n",
        "
\n" ], @@ -420,9 +436,9 @@ "┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n", "┃\u001b[1m \u001b[0m\u001b[1mLayer (type) \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mOutput Shape \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1m Param #\u001b[0m\u001b[1m \u001b[0m┃\n", "┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n", - "│ image_encoder (\u001b[38;5;33mCLIPImageEncoder\u001b[0m) │ ? │ \u001b[38;5;34m87,849,216\u001b[0m │\n", + "│ image_encoder (\u001b[38;5;33mCLIPImageEncoder\u001b[0m) │ ? │ \u001b[38;5;34m304,293,888\u001b[0m │\n", "├────────────────────────────────────┼───────────────────────────────┼─────────────┤\n", - "│ text_encoder (\u001b[38;5;33mCLIPTextEncoder\u001b[0m) │ ? │ \u001b[38;5;34m63,428,096\u001b[0m │\n", + "│ text_encoder (\u001b[38;5;33mCLIPTextEncoder\u001b[0m) │ ? │ \u001b[38;5;34m123,650,304\u001b[0m │\n", "└────────────────────────────────────┴───────────────────────────────┴─────────────┘\n" ] }, @@ -432,11 +448,11 @@ { "data": { "text/html": [ - "
 Total params: 151,277,313 (577.08 MB)\n",
+       "
 Total params: 427,944,193 (1.59 GB)\n",
        "
\n" ], "text/plain": [ - "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m151,277,313\u001b[0m (577.08 MB)\n" + "\u001b[1m Total params: \u001b[0m\u001b[38;5;34m427,944,193\u001b[0m (1.59 GB)\n" ] }, "metadata": {}, @@ -445,11 +461,11 @@ { "data": { "text/html": [ - "
 Trainable params: 151,277,313 (577.08 MB)\n",
+       "
 Trainable params: 427,944,193 (1.59 GB)\n",
        "
\n" ], "text/plain": [ - "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m151,277,313\u001b[0m (577.08 MB)\n" + "\u001b[1m Trainable params: \u001b[0m\u001b[38;5;34m427,944,193\u001b[0m (1.59 GB)\n" ] }, "metadata": {}, @@ -502,25 +518,239 @@ "execution_count": null, "metadata": { "colab": { - "base_uri": "https://localhost:8080/" + "base_uri": "https://localhost:8080/", + "height": 432, + "referenced_widgets": [ + "46636db47838400cb7407fc2ab0720eb", + "718081783f7f411599ba5bac18748697", + "effcd9cbd407405cbbffe4e76b19df72", + "7d4e0cdb53474d50a0b70a6f2d9a0eba", + "b3bf65796de8494e82506f6849316426", + "5c1c89c4eafc4dc7888236023a4716fd", + "3e48c9a0c27d49829a0637c9c727ed8f", + "9b1db90efb8b4ef096c5254494c063db", + "c9fdbe12b7934fd8accc09645df0f214", + "a5b8a21a005d4ef7b253b782dae3b88d", + "002207337b4b47c9a6b8ddda823128ba", + "f359ba8ef0cf4841b40acafcd770480c", + "0861f873665f4514abfd2b09ad944ab8", + "b8924ff1bbb5409c8dc141e65c4cdcdb", + "4b62b742aacb45c7a286a5d85a0194da", + "dfbe3622fad14e798c9f413a134fd107", + "0eda9fca096945a48059cbc2b1c9ffdf", + "e2bc0bc25b5044abb8bb1c752935922e", + "b1b3b62f8d7545938d1e2d8f8757c589", + "2d744d00de6745bda5835f0cd66e3909", + "6f708ee77df84a8cb1bff9df119ca7df", + "8b766a31754f4445bf2614da1ad45446", + "3af19b8b653c4b21a65f7e96dd463aac", + "a4eddaf970084d9ba057e7038423be01", + "f0af662a1a884fb78c693ccf0d0b6d8e", + "9acadb088a75425a8115ffd876e161bf", + "ef6fd54de3aa46af868c8c810068d9ad", + "af229a4850174254b09f850c67aefe3a", + "b007afd6777e4364a57a717088356780", + "4609b1b46de5441a9632985833bd0b05", + "5c5e2b0d9fa7435a92c95d417ed03956", + "b8bd9787d9c640e19798be15e94ede04", + "e191179e7e4048b69b47d3b9b550b459", + "5c21455d9faa4112ba6f18819f7ef038", + "a6bd1a75b94f4809b5d275db402f1751", + "10b9c5cf60b04a2c875ffe63adb55fb7", + "a6e1fe5e2caf42b2968a19df388daf66", + "1978049440924b349939aac789bdf797", + "6ac5948711754a6c9ef851f6db861e72", + "096f5fba1a1e4fbe82d0411363b8c477", + "923748d15c194b93bc71fb1046775903", + "c37415464174453b9ce13466ed6ff20c", + "15b5253136ec4e7db56e103960f4d3f6", + "f0ff7fa2d15f41b4b6fae00cb2936acd", + "418143d2ad92458094259dfca0a747cc", + "6aa0b130877c40f1ab51435705ee1459", + "5c4391af49964b7c8dc9839fe649376d", + "a45a2501e43448289e482a5713c5fa91", + "2b7b34c0eeec43aea25c723ef11d9263", + "a2cd61263e2e41e59d3e32a0bafe149a", + "9d1ecd1c6e584b7baae967ecba6eaa10", + "cb386abe77244108b8f98de9ad3f1bdd", + "77f3821d937b486e8d1b40c0f7c4c7dd", + "9551ec31f22a4e5fb3c7b6aa618d7f09", + "de2e8bd2816b4b2990a78bdb5091f097", + "c06b5a6588eb42189210d1c20ccba87a", + "da46a678b1fc40d7b660de63d9202997", + "c0e3b6e7e7304dc9877d6800f924d05e", + "e7035db245c7430c92ceb5b61c31ba14", + "d0d7ebc4ce264a6b8ae5c6ba4e18a9b3", + "0c46bf3c0a1840cfba66afef11e16cd2", + "2a1f21cd845e44c89197edc86b482b71", + "837c2d8dd75342a8bbeb1c5ce899e759", + "95649d04b8b144b091bba9e8106a44d6", + "081d380b0c52402abfd57337726b1aa3", + "5da887b8b4fd4437846c639b3ffb575b", + "79020bd42626472a85bf9047d014830f", + "1771b7a0f46e41dbaa5720effb6084ac", + "4542b8ce91584e42b7b98518726ab009", + "2b5e2622c68a46d2b407c7cfeca32ae5", + "b2d0b2f0ec7648b89fc19a1dda8462ba", + "fa6ed2fba5bf4abdaceefc180e4f9a41", + "029f9b9eea5a4bd9a70d29a3c9478cb8", + "9d61237ba4944593adbfcffd51aa6889", + "fc83fdb519174250ae312080e2918abe", + "4ec11a213b0d4fdd8300c0ea5a8f8db7", + "49807785ba664c49a6b2395ebe7fbec8", + "ec7bc6e82f2042b8b29a6f21e6db1709", + "609cc0908e6f4edd95306f72b40afd0c", + "d5611bb67e8d49f19e2700652d5309c1", + "fed0f8954a6b4e1194c63ccc9fba1238", + "174aacf5b59048b6ad27a6dffeb87950", + 
"b1cc5c487a364d3ba8d33e0aa3b2a305", + "0d9650ba583e45c18bf7c57cc6c57e4b", + "380b18596be246d6bc6fd4412fd20379", + "2d6ea61d0fa84510b44fff80ab10e553", + "098832b366c6410b824d2c210222dc24", + "e7e79c91380c478dabb2b1e7ddca647e" + ] }, "id": "EntuvOq1MhwU", - "outputId": "e154a367-2f94-4fa1-e97d-d2f32db7a2cf" + "outputId": "cbd7cd77-6d8f-4a76-dae0-24530c12eeb6" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config[\"id2label\"]` will be overriden.\n", - "`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config[\"bos_token_id\"]` will be overriden.\n", - "`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config[\"eos_token_id\"]` will be overriden.\n" + "/usr/local/lib/python3.10/dist-packages/huggingface_hub/utils/_token.py:88: UserWarning: \n", + "The secret `HF_TOKEN` does not exist in your Colab secrets.\n", + "To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.\n", + "You will be able to reuse this secret in all of your notebooks.\n", + "Please note that authentication is recommended but still optional to access public models or datasets.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "46636db47838400cb7407fc2ab0720eb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "config.json: 0%| | 0.00/4.76k [00:00)" + "tensor([[11.7865, 31.2010, 11.9718]], grad_fn=)" ] }, - "execution_count": 14, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "url = \"https://i.imgur.com/8H7XCH0.jpg\"\n", + "import torch\n", + "import numpy as np\n", + "\n", + "photo = {\n", + " \"cat\": \"https://i.imgur.com/8H7XCH0.jpg\",\n", + " \"two_cats\": \"http://images.cocodataset.org/val2017/000000039769.jpg\",\n", + " \"mountain\": \"https://i.imgur.com/PpgZzP4.jpeg\",\n", + "}\n", + "url = photo[\"cat\"]\n", "image_hf = Image.open(requests.get(url, stream=True).raw)\n", - "text_inputs = [\n", - " \"photo of a cat on a tortoise\",\n", - " \"tortoise on a dog\",\n", - " \"a photo of a tortoise\",\n", - "]\n", - "inputs = processor(\n", + "text_inputs = [\"mountains\", \"cat on tortoise\", \"two dogs\"]\n", + "inputs = processor_hf(\n", " text=text_inputs, images=image_hf, return_tensors=\"pt\", padding=True\n", ")\n", - "\n", "outputs = model_hf(**inputs)\n", "logits_per_image = (\n", " outputs.logits_per_image\n", @@ -564,28 +797,28 @@ "probs = logits_per_image.softmax(\n", " dim=1\n", ") # we can take the softmax to get the label probabilitiesprobs\n", - "probs" + "logits_per_image" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": { - "id": "wPa0cVnY3cBC" + "id": "ArkCHlVZVKfM" }, - "outputs": [], "source": [ - "# hugging face weights\n", - "hf_wts = model_hf.state_dict()" + "# Copy weights" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": { - "id": "ArkCHlVZVKfM" + "id": "wPa0cVnY3cBC" }, + "outputs": [], "source": [ - "# Copy weights" + "# hugging face weights\n", + "hf_wts = model_hf.state_dict()" ] }, { @@ -607,17 +840,17 @@ "source": [ "model.logit_scale.assign(hf_wts.pop(\"logit_scale\").numpy())\n", 
"model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_patching_and_embedding\"\n", + " \"clip_patch_embedding\"\n", ").class_embedding.assign(\n", - " hf_wts.pop(\"vision_model.embeddings.class_embedding\").numpy()\n", + " hf_wts.pop(\"vision_model.embeddings.class_embedding\").numpy().T\n", ")\n", "model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_patching_and_embedding\"\n", + " \"clip_patch_embedding\"\n", ").positional_embedding.assign(\n", " hf_wts.pop(\"vision_model.embeddings.position_embedding.weight\").numpy()\n", ")\n", "model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_patching_and_embedding\"\n", + " \"clip_patch_embedding\"\n", ").conv1.weights[0].assign(\n", " hf_wts.pop(\"vision_model.embeddings.patch_embedding.weight\")\n", " .permute(3, 2, 1, 0)\n", @@ -637,14 +870,14 @@ ")\n", "model.get_layer(\"image_encoder\").get_layer(\"vision_projector\").weights[\n", " 0\n", - "].assign(hf_wts.pop(\"visual_projection.weight\").transpose(1, 0).numpy())" + "].assign(hf_wts.pop(\"visual_projection.weight\").numpy().T)" ] }, { "cell_type": "code", "execution_count": null, "metadata": { - "id": "qptfuWobZcbT" + "id": "YRXC2HZC3FjG" }, "outputs": [], "source": [ @@ -653,104 +886,93 @@ " residual_attention = f\"residual_attention\"\n", " else:\n", " residual_attention = f\"residual_attention_{i}\"\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).attn.q_proj.weights[0].assign(\n", - " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.q_proj.weight\")\n", + "\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.q_proj.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.q_proj.weight\").T\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).attn.q_proj.weights[1].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.q_proj.weights[1].assign(\n", " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.q_proj.bias\")\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).attn.k_proj.weights[0].assign(\n", - " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.k_proj.weight\")\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.k_proj.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.k_proj.weight\").T\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).attn.k_proj.weights[1].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.k_proj.weights[1].assign(\n", " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.k_proj.bias\")\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).attn.v_proj.weights[0].assign(\n", - " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.v_proj.weight\")\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.v_proj.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.v_proj.weight\").T\n", " )\n", - " 
model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).attn.v_proj.weights[1].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.v_proj.weights[1].assign(\n", " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.v_proj.bias\")\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).attn.out_proj.weights[1].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.out_proj.weights[1].assign(\n", " hf_wts.pop(\n", " f\"vision_model.encoder.layers.{i}.self_attn.out_proj.bias\"\n", " ).numpy()\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).attn.out_proj.weights[0].assign(\n", - " hf_wts.pop(\n", - " f\"vision_model.encoder.layers.{i}.self_attn.out_proj.weight\"\n", - " ).numpy()\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.out_proj.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.self_attn.out_proj.weight\")\n", + " .numpy()\n", + " .T\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).ln_1.weights[0].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].ln_1.weights[0].assign(\n", " hf_wts.pop(\n", " f\"vision_model.encoder.layers.{i}.layer_norm1.weight\"\n", " ).numpy()\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).ln_1.weights[1].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].ln_1.weights[1].assign(\n", " hf_wts.pop(f\"vision_model.encoder.layers.{i}.layer_norm1.bias\").numpy()\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).ln_2.weights[0].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].ln_2.weights[0].assign(\n", " hf_wts.pop(\n", " f\"vision_model.encoder.layers.{i}.layer_norm2.weight\"\n", " ).numpy()\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).ln_2.weights[1].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].ln_2.weights[1].assign(\n", " hf_wts.pop(f\"vision_model.encoder.layers.{i}.layer_norm2.bias\").numpy()\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).mlp.get_layer(\"c_fc\").weights[\n", - " 0\n", - " ].assign(\n", - " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc1.weight\")\n", - " .transpose(1, 0)\n", - " .numpy()\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].mlp_dense_1.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc1.weight\").numpy().T\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).mlp.get_layer(\"c_fc\").weights[\n", - " 1\n", - " ].assign(\n", + " 
model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].mlp_dense_1.weights[1].assign(\n", " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc1.bias\").numpy()\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).mlp.get_layer(\"c_proj\").weights[\n", - " 0\n", - " ].assign(\n", - " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc2.weight\")\n", - " .transpose(1, 0)\n", - " .numpy()\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].mlp_dense_2.weights[0].assign(\n", + " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc2.weight\").numpy().T\n", " )\n", - " model.get_layer(\"image_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(residual_attention).mlp.get_layer(\"c_proj\").weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"image_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].mlp_dense_2.weights[1].assign(\n", " hf_wts.pop(f\"vision_model.encoder.layers.{i}.mlp.fc2.bias\").numpy()\n", " )" ] @@ -764,17 +986,6 @@ "## Text encoder" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5FtDROnynb0N" - }, - "outputs": [], - "source": [ - "num_transformer_layers = MODEL_CONFIGS[config_name][\"vision_layers\"]" - ] - }, { "cell_type": "code", "execution_count": null, @@ -784,14 +995,14 @@ "outputs": [], "source": [ "model.get_layer(\"text_encoder\").get_layer(\"text_projector\").weights[0].assign(\n", - " hf_wts.pop(\"text_projection.weight\").numpy()\n", + " hf_wts.pop(\"text_projection.weight\").numpy().T\n", ")\n", "model.get_layer(\"text_encoder\").get_layer(\"token_embedding\").weights[0].assign(\n", " hf_wts.pop(\"text_model.embeddings.token_embedding.weight\").numpy()\n", ")\n", - "model.get_layer(\"text_encoder\").positional_embedding.assign(\n", - " hf_wts.pop(\"text_model.embeddings.position_embedding.weight\").numpy()\n", - ")\n", + "model.get_layer(\"text_encoder\").get_layer(\"positional_embedding\").weights[\n", + " 0\n", + "].assign(hf_wts.pop(\"text_model.embeddings.position_embedding.weight\").numpy())\n", "model.get_layer(\"text_encoder\").get_layer(\"ln_final\").weights[0].assign(\n", " hf_wts.pop(\"text_model.final_layer_norm.weight\")\n", ")\n", @@ -804,165 +1015,89 @@ "cell_type": "code", "execution_count": null, "metadata": { - "id": "s6leOiFO6V2U" + "id": "IQFquy9R75G8" }, "outputs": [], "source": [ "for i in range(MODEL_CONFIGS[config_name][\"transformer_layers\"]):\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).attn.k_proj.weights[\n", - " 0\n", - " ].assign(\n", - " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.k_proj.weight\")\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.k_proj.weights[0].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.k_proj.weight\").T\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).attn.k_proj.weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.k_proj.weights[1].assign(\n", " 
hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.k_proj.bias\")\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).attn.q_proj.weights[\n", - " 0\n", - " ].assign(\n", - " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.q_proj.weight\")\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.q_proj.weights[0].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.q_proj.weight\").T\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).attn.q_proj.weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.q_proj.weights[1].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.q_proj.bias\")\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).attn.v_proj.weights[\n", - " 0\n", - " ].assign(\n", - " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.v_proj.weight\")\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.v_proj.weights[0].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.v_proj.weight\").T\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).attn.v_proj.weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.v_proj.weights[1].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.v_proj.bias\")\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).attn.out_proj.weights[\n", - " 0\n", - " ].assign(\n", - " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.out_proj.weight\")\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.out_proj.weights[0].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.out_proj.weight\").T\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).attn.out_proj.weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].attn.out_proj.weights[1].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.self_attn.out_proj.bias\")\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).ln_1.weights[\n", - " 0\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].ln_1.weights[0].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.layer_norm1.weight\").numpy()\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " 
f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).ln_1.weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].ln_1.weights[1].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.layer_norm1.bias\").numpy()\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).ln_2.weights[\n", - " 0\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].ln_2.weights[0].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.layer_norm2.weight\").numpy()\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).ln_2.weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].ln_2.weights[1].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.layer_norm2.bias\").numpy()\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).mlp.get_layer(\n", - " \"c_fc\"\n", - " ).weights[\n", - " 0\n", - " ].assign(\n", - " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc1.weight\")\n", - " .transpose(1, 0)\n", - " .numpy()\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].mlp_dense_1.weights[0].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc1.weight\").numpy().T\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).mlp.get_layer(\n", - " \"c_fc\"\n", - " ).weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].mlp_dense_1.weights[1].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc1.bias\").numpy()\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).mlp.get_layer(\n", - " \"c_proj\"\n", - " ).weights[\n", - " 0\n", - " ].assign(\n", - " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc2.weight\")\n", - " .transpose(1, 0)\n", - " .numpy()\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].mlp_dense_2.weights[0].assign(\n", + " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc2.weight\").numpy().T\n", " )\n", - " model.get_layer(\"text_encoder\").get_layer(\n", - " \"clip_encoder\"\n", - " ).resblocks.get_layer(\n", - " f\"residual_attention_{num_transformer_layers+i}\"\n", - " ).mlp.get_layer(\n", - " \"c_proj\"\n", - " ).weights[\n", - " 1\n", - " ].assign(\n", + " model.get_layer(\"text_encoder\").get_layer(\"clip_encoder\").resblocks[\n", + " i\n", + " ].mlp_dense_2.weights[1].assign(\n", " hf_wts.pop(f\"text_model.encoder.layers.{i}.mlp.fc2.bias\").numpy()\n", " )" ] @@ -975,7 +1110,7 @@ "base_uri": "https://localhost:8080/" }, "id": "Bgen7hxCCeZ7", - "outputId": "c777d6f1-4aa7-4f3e-8fd7-759364364c44" + "outputId": "e706ca82-d292-4868-9215-d8c160b3736c" }, "outputs": [ { @@ -984,7 +1119,7 @@ "odict_keys([])" ] 
}, - "execution_count": 22, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } @@ -1011,12 +1146,15 @@ }, "outputs": [], "source": [ - "model.save_weights(\"clip-vit-base-patch32.weights.h5\")" + "model.save_weights(\"model.weights.h5\")" ] } ], "metadata": { + "accelerator": "GPU", "colab": { + "gpuType": "V100", + "machine_shape": "hm", "provenance": [] }, "kernelspec": { @@ -1025,6 +1163,2746 @@ }, "language_info": { "name": "python" + }, + "widgets": { + "application/vnd.jupyter.widget-state+json": { + "002207337b4b47c9a6b8ddda823128ba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "029f9b9eea5a4bd9a70d29a3c9478cb8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "081d380b0c52402abfd57337726b1aa3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "0861f873665f4514abfd2b09ad944ab8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0eda9fca096945a48059cbc2b1c9ffdf", + "placeholder": "​", + "style": "IPY_MODEL_e2bc0bc25b5044abb8bb1c752935922e", + "value": "pytorch_model.bin: 100%" + } + }, + "096f5fba1a1e4fbe82d0411363b8c477": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": 
+ "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "46636db47838400cb7407fc2ab0720eb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_718081783f7f411599ba5bac18748697", + "IPY_MODEL_effcd9cbd407405cbbffe4e76b19df72", + "IPY_MODEL_7d4e0cdb53474d50a0b70a6f2d9a0eba" + ], + "layout": "IPY_MODEL_b3bf65796de8494e82506f6849316426" + } + }, + "49807785ba664c49a6b2395ebe7fbec8": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "4b62b742aacb45c7a286a5d85a0194da": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6f708ee77df84a8cb1bff9df119ca7df", + "placeholder": "​", + "style": "IPY_MODEL_8b766a31754f4445bf2614da1ad45446", + "value": " 1.71G/1.71G [00:14<00:00, 117MB/s]" + } + }, + "4ec11a213b0d4fdd8300c0ea5a8f8db7": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5c1c89c4eafc4dc7888236023a4716fd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + 
"_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "5c21455d9faa4112ba6f18819f7ef038": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_a6bd1a75b94f4809b5d275db402f1751", + "IPY_MODEL_10b9c5cf60b04a2c875ffe63adb55fb7", + "IPY_MODEL_a6e1fe5e2caf42b2968a19df388daf66" + ], + "layout": "IPY_MODEL_1978049440924b349939aac789bdf797" + } + }, + "5c4391af49964b7c8dc9839fe649376d": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_cb386abe77244108b8f98de9ad3f1bdd", + "max": 862328, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_77f3821d937b486e8d1b40c0f7c4c7dd", + "value": 862328 + } + }, + "5c5e2b0d9fa7435a92c95d417ed03956": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "5da887b8b4fd4437846c639b3ffb575b": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "609cc0908e6f4edd95306f72b40afd0c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + 
"state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1cc5c487a364d3ba8d33e0aa3b2a305", + "placeholder": "​", + "style": "IPY_MODEL_0d9650ba583e45c18bf7c57cc6c57e4b", + "value": "special_tokens_map.json: 100%" + } + }, + "6aa0b130877c40f1ab51435705ee1459": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a2cd61263e2e41e59d3e32a0bafe149a", + "placeholder": "​", + "style": "IPY_MODEL_9d1ecd1c6e584b7baae967ecba6eaa10", + "value": "vocab.json: 100%" + } + }, + "6ac5948711754a6c9ef851f6db861e72": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "6f708ee77df84a8cb1bff9df119ca7df": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + 
"overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "718081783f7f411599ba5bac18748697": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_5c1c89c4eafc4dc7888236023a4716fd", + "placeholder": "​", + "style": "IPY_MODEL_3e48c9a0c27d49829a0637c9c727ed8f", + "value": "config.json: 100%" + } + }, + "77f3821d937b486e8d1b40c0f7c4c7dd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "79020bd42626472a85bf9047d014830f": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_1771b7a0f46e41dbaa5720effb6084ac", + "IPY_MODEL_4542b8ce91584e42b7b98518726ab009", + "IPY_MODEL_2b5e2622c68a46d2b407c7cfeca32ae5" + ], + "layout": "IPY_MODEL_b2d0b2f0ec7648b89fc19a1dda8462ba" + } + }, + "7d4e0cdb53474d50a0b70a6f2d9a0eba": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_a5b8a21a005d4ef7b253b782dae3b88d", + "placeholder": "​", + "style": "IPY_MODEL_002207337b4b47c9a6b8ddda823128ba", + "value": " 4.76k/4.76k [00:00<00:00, 166kB/s]" + } + }, + "837c2d8dd75342a8bbeb1c5ce899e759": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, 
+ "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "8b766a31754f4445bf2614da1ad45446": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "923748d15c194b93bc71fb1046775903": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9551ec31f22a4e5fb3c7b6aa618d7f09": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "95649d04b8b144b091bba9e8106a44d6": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": 
"ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "9acadb088a75425a8115ffd876e161bf": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b8bd9787d9c640e19798be15e94ede04", + "placeholder": "​", + "style": "IPY_MODEL_e191179e7e4048b69b47d3b9b550b459", + "value": " 316/316 [00:00<00:00, 19.2kB/s]" + } + }, + "9b1db90efb8b4ef096c5254494c063db": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "9d1ecd1c6e584b7baae967ecba6eaa10": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "9d61237ba4944593adbfcffd51aa6889": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + 
"max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a2cd61263e2e41e59d3e32a0bafe149a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a45a2501e43448289e482a5713c5fa91": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9551ec31f22a4e5fb3c7b6aa618d7f09", + "placeholder": "​", + "style": "IPY_MODEL_de2e8bd2816b4b2990a78bdb5091f097", + "value": " 862k/862k [00:00<00:00, 11.3MB/s]" + } + }, + "a4eddaf970084d9ba057e7038423be01": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_af229a4850174254b09f850c67aefe3a", + "placeholder": "​", + "style": "IPY_MODEL_b007afd6777e4364a57a717088356780", + "value": "preprocessor_config.json: 100%" + } + }, + "a5b8a21a005d4ef7b253b782dae3b88d": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": 
null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "a6bd1a75b94f4809b5d275db402f1751": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_6ac5948711754a6c9ef851f6db861e72", + "placeholder": "​", + "style": "IPY_MODEL_096f5fba1a1e4fbe82d0411363b8c477", + "value": "tokenizer_config.json: 100%" + } + }, + "a6e1fe5e2caf42b2968a19df388daf66": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_15b5253136ec4e7db56e103960f4d3f6", + "placeholder": "​", + "style": "IPY_MODEL_f0ff7fa2d15f41b4b6fae00cb2936acd", + "value": " 844/844 [00:00<00:00, 64.4kB/s]" + } + }, + "af229a4850174254b09f850c67aefe3a": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b007afd6777e4364a57a717088356780": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + 
"_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "b1b3b62f8d7545938d1e2d8f8757c589": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b1cc5c487a364d3ba8d33e0aa3b2a305": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b2d0b2f0ec7648b89fc19a1dda8462ba": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": 
null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b3bf65796de8494e82506f6849316426": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "b8924ff1bbb5409c8dc141e65c4cdcdb": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_b1b3b62f8d7545938d1e2d8f8757c589", + "max": 1711974081, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2d744d00de6745bda5835f0cd66e3909", + "value": 1711974081 + } + }, + "b8bd9787d9c640e19798be15e94ede04": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "c06b5a6588eb42189210d1c20ccba87a": { + 
"model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_da46a678b1fc40d7b660de63d9202997", + "IPY_MODEL_c0e3b6e7e7304dc9877d6800f924d05e", + "IPY_MODEL_e7035db245c7430c92ceb5b61c31ba14" + ], + "layout": "IPY_MODEL_d0d7ebc4ce264a6b8ae5c6ba4e18a9b3" + } + }, + "c0e3b6e7e7304dc9877d6800f924d05e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_837c2d8dd75342a8bbeb1c5ce899e759", + "max": 524657, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_95649d04b8b144b091bba9e8106a44d6", + "value": 524657 + } + }, + "c37415464174453b9ce13466ed6ff20c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "c9fdbe12b7934fd8accc09645df0f214": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "cb386abe77244108b8f98de9ad3f1bdd": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": 
null, + "width": null + } + }, + "d0d7ebc4ce264a6b8ae5c6ba4e18a9b3": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "d5611bb67e8d49f19e2700652d5309c1": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_380b18596be246d6bc6fd4412fd20379", + "max": 389, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_2d6ea61d0fa84510b44fff80ab10e553", + "value": 389 + } + }, + "da46a678b1fc40d7b660de63d9202997": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_0c46bf3c0a1840cfba66afef11e16cd2", + "placeholder": "​", + "style": "IPY_MODEL_2a1f21cd845e44c89197edc86b482b71", + "value": "merges.txt: 100%" + } + }, + "de2e8bd2816b4b2990a78bdb5091f097": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "dfbe3622fad14e798c9f413a134fd107": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + 
"align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "e191179e7e4048b69b47d3b9b550b459": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e2bc0bc25b5044abb8bb1c752935922e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "e7035db245c7430c92ceb5b61c31ba14": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_081d380b0c52402abfd57337726b1aa3", + "placeholder": "​", + "style": "IPY_MODEL_5da887b8b4fd4437846c639b3ffb575b", + "value": " 525k/525k [00:00<00:00, 11.2MB/s]" + } + }, + "e7e79c91380c478dabb2b1e7ddca647e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "ec7bc6e82f2042b8b29a6f21e6db1709": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_609cc0908e6f4edd95306f72b40afd0c", + "IPY_MODEL_d5611bb67e8d49f19e2700652d5309c1", + "IPY_MODEL_fed0f8954a6b4e1194c63ccc9fba1238" + ], + "layout": 
"IPY_MODEL_174aacf5b59048b6ad27a6dffeb87950" + } + }, + "ef6fd54de3aa46af868c8c810068d9ad": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "effcd9cbd407405cbbffe4e76b19df72": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_9b1db90efb8b4ef096c5254494c063db", + "max": 4757, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_c9fdbe12b7934fd8accc09645df0f214", + "value": 4757 + } + }, + "f0af662a1a884fb78c693ccf0d0b6d8e": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "FloatProgressModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "FloatProgressModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "ProgressView", + "bar_style": "success", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_4609b1b46de5441a9632985833bd0b05", + "max": 316, + "min": 0, + "orientation": "horizontal", + "style": "IPY_MODEL_5c5e2b0d9fa7435a92c95d417ed03956", + "value": 316 + } + }, + "f0ff7fa2d15f41b4b6fae00cb2936acd": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "DescriptionStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "DescriptionStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "description_width": "" + } + }, + "f359ba8ef0cf4841b40acafcd770480c": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HBoxModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HBoxModel", + "_view_count": null, + 
"_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HBoxView", + "box_style": "", + "children": [ + "IPY_MODEL_0861f873665f4514abfd2b09ad944ab8", + "IPY_MODEL_b8924ff1bbb5409c8dc141e65c4cdcdb", + "IPY_MODEL_4b62b742aacb45c7a286a5d85a0194da" + ], + "layout": "IPY_MODEL_dfbe3622fad14e798c9f413a134fd107" + } + }, + "fa6ed2fba5bf4abdaceefc180e4f9a41": { + "model_module": "@jupyter-widgets/base", + "model_module_version": "1.2.0", + "model_name": "LayoutModel", + "state": { + "_model_module": "@jupyter-widgets/base", + "_model_module_version": "1.2.0", + "_model_name": "LayoutModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "LayoutView", + "align_content": null, + "align_items": null, + "align_self": null, + "border": null, + "bottom": null, + "display": null, + "flex": null, + "flex_flow": null, + "grid_area": null, + "grid_auto_columns": null, + "grid_auto_flow": null, + "grid_auto_rows": null, + "grid_column": null, + "grid_gap": null, + "grid_row": null, + "grid_template_areas": null, + "grid_template_columns": null, + "grid_template_rows": null, + "height": null, + "justify_content": null, + "justify_items": null, + "left": null, + "margin": null, + "max_height": null, + "max_width": null, + "min_height": null, + "min_width": null, + "object_fit": null, + "object_position": null, + "order": null, + "overflow": null, + "overflow_x": null, + "overflow_y": null, + "padding": null, + "right": null, + "top": null, + "visibility": null, + "width": null + } + }, + "fc83fdb519174250ae312080e2918abe": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "ProgressStyleModel", + "state": { + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "ProgressStyleModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/base", + "_view_module_version": "1.2.0", + "_view_name": "StyleView", + "bar_color": null, + "description_width": "" + } + }, + "fed0f8954a6b4e1194c63ccc9fba1238": { + "model_module": "@jupyter-widgets/controls", + "model_module_version": "1.5.0", + "model_name": "HTMLModel", + "state": { + "_dom_classes": [], + "_model_module": "@jupyter-widgets/controls", + "_model_module_version": "1.5.0", + "_model_name": "HTMLModel", + "_view_count": null, + "_view_module": "@jupyter-widgets/controls", + "_view_module_version": "1.5.0", + "_view_name": "HTMLView", + "description": "", + "description_tooltip": null, + "layout": "IPY_MODEL_098832b366c6410b824d2c210222dc24", + "placeholder": "​", + "style": "IPY_MODEL_e7e79c91380c478dabb2b1e7ddca647e", + "value": " 389/389 [00:00<00:00, 27.5kB/s]" + } + } + } } }, "nbformat": 4, From 7aac7210327772870ab31176d1d8630f6f825aa1 Mon Sep 17 00:00:00 2001 From: Ramesh Sampath <1437573+sampathweb@users.noreply.github.com> Date: Sat, 24 Feb 2024 12:11:34 -0600 Subject: [PATCH 26/30] Adds psutil to CI Build Script (#2355) * Adds psutil * Adds psutil --- .kokoro/github/ubuntu/gpu/build.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.kokoro/github/ubuntu/gpu/build.sh b/.kokoro/github/ubuntu/gpu/build.sh index fedfcd0566..76ac0631b4 100644 --- a/.kokoro/github/ubuntu/gpu/build.sh +++ b/.kokoro/github/ubuntu/gpu/build.sh @@ -20,6 +20,8 @@ nvcc --version cd "src/github/keras-cv" pip install -U pip setuptools +# psutil is used by background log reader +pip install -U psutil if [ "${KERAS2:-0}" == "1" ] then From 
e170bfea278e4c3c420162c44250f70bb9d64ae1 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 26 Feb 2024 14:49:00 -0800 Subject: [PATCH 27/30] Update clip_image_model.py (#2359) --- keras_cv/models/feature_extractor/clip/clip_image_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_image_model.py b/keras_cv/models/feature_extractor/clip/clip_image_model.py index 1718768116..69c1002f8e 100644 --- a/keras_cv/models/feature_extractor/clip/clip_image_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_image_model.py @@ -73,7 +73,7 @@ def call(self, x): patch_embeddings, (batch_size, self.num_patches, -1) ) class_embeds = ops.broadcast_to( - self.class_embedding, (batch_size, 1, self.width) + self.class_embedding.value, (batch_size, 1, self.width) ) embeddings = ops.concatenate( [class_embeds, patch_embeddings], axis=1 From 18b8d790499b3d536e05a82a6a444f4f1f3b622d Mon Sep 17 00:00:00 2001 From: Varun Singh Date: Mon, 26 Feb 2024 20:46:02 -0800 Subject: [PATCH 28/30] Fixed ROI Pooling Output Shape to Consider Multiple ROIs (#2350) (#2360) * Fixed indentation and output shape in roi pooling to consider multiple ROIs * Formatted code --- keras_cv/layers/object_detection/roi_pool.py | 19 ++++--- .../layers/object_detection/roi_pool_test.py | 50 ++++++++++++++++--- 2 files changed, 53 insertions(+), 16 deletions(-) diff --git a/keras_cv/layers/object_detection/roi_pool.py b/keras_cv/layers/object_detection/roi_pool.py index e9774e3e15..3105b1d4be 100644 --- a/keras_cv/layers/object_detection/roi_pool.py +++ b/keras_cv/layers/object_detection/roi_pool.py @@ -112,11 +112,12 @@ def _pool_single_sample(self, args): feature_map: [H, W, C] float Tensor rois: [N, 4] float Tensor Returns: - pooled_feature_map: [target_size, C] float Tensor + pooled_feature_map: [N, target_height, target_width, C] float Tensor """ feature_map, rois = args num_rois = rois.get_shape().as_list()[0] height, width, channel = feature_map.get_shape().as_list() + regions = [] # TODO (consider vectorize it for better performance) for n in range(num_rois): # [4] @@ -127,7 +128,7 @@ def _pool_single_sample(self, args): region_width = width * (roi[3] - roi[1]) h_step = region_height / self.target_height w_step = region_width / self.target_width - regions = [] + region_steps = [] for i in range(self.target_height): for j in range(self.target_width): height_start = y_start + i * h_step @@ -147,16 +148,18 @@ def _pool_single_sample(self, args): 1, width_end - width_start ) # [h_step, w_step, C] - region = feature_map[ + region_step = feature_map[ height_start:height_end, width_start:width_end, : ] # target_height * target_width * [C] - regions.append(tf.reduce_max(region, axis=[0, 1])) - regions = tf.reshape( - tf.stack(regions), - [self.target_height, self.target_width, channel], + region_steps.append(tf.reduce_max(region_step, axis=[0, 1])) + regions.append( + tf.reshape( + tf.stack(region_steps), + [self.target_height, self.target_width, channel], + ) ) - return regions + return tf.stack(regions) def get_config(self): config = { diff --git a/keras_cv/layers/object_detection/roi_pool_test.py b/keras_cv/layers/object_detection/roi_pool_test.py index c6401beebc..e605c3e5a7 100644 --- a/keras_cv/layers/object_detection/roi_pool_test.py +++ b/keras_cv/layers/object_detection/roi_pool_test.py @@ -43,7 +43,7 @@ def test_no_quantize(self): # | 56, 57, 58, 59(max) | 60, 61, 62, 63(max) | # -------------------------------------------- expected_feature_map = 
tf.reshape( - tf.constant([27, 31, 59, 63]), [1, 2, 2, 1] + tf.constant([27, 31, 59, 63]), [1, 1, 2, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -69,7 +69,7 @@ def test_roi_quantize_y(self): # | 56, 57, 58(max) | 59, 60, 61, 62(max) | 63 (removed) # -------------------------------------------- expected_feature_map = tf.reshape( - tf.constant([26, 30, 58, 62]), [1, 2, 2, 1] + tf.constant([26, 30, 58, 62]), [1, 1, 2, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -94,7 +94,7 @@ def test_roi_quantize_x(self): # | 48, 49, 50, 51(max) | 52, 53, 54, 55(max) | # -------------------------------------------- expected_feature_map = tf.reshape( - tf.constant([19, 23, 51, 55]), [1, 2, 2, 1] + tf.constant([19, 23, 51, 55]), [1, 1, 2, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -121,7 +121,7 @@ def test_roi_quantize_h(self): # | 56, 57, 58, 59(max) | 60, 61, 62, 63(max) | # -------------------------------------------- expected_feature_map = tf.reshape( - tf.constant([11, 15, 35, 39, 59, 63]), [1, 3, 2, 1] + tf.constant([11, 15, 35, 39, 59, 63]), [1, 1, 3, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -147,7 +147,7 @@ def test_roi_quantize_w(self): # | 56, 57(max) | 58, 59, 60(max) | 61, 62, 63(max) | # -------------------------------------------- expected_feature_map = tf.reshape( - tf.constant([25, 28, 31, 57, 60, 63]), [1, 2, 3, 1] + tf.constant([25, 28, 31, 57, 60, 63]), [1, 1, 2, 3, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -168,7 +168,8 @@ def test_roi_feature_map_height_smaller_than_roi(self): # ------------------repeated---------------------- # | 12, 13(max) | 14, 15(max) | expected_feature_map = tf.reshape( - tf.constant([1, 3, 1, 3, 5, 7, 9, 11, 9, 11, 13, 15]), [1, 6, 2, 1] + tf.constant([1, 3, 1, 3, 5, 7, 9, 11, 9, 11, 13, 15]), + [1, 1, 6, 2, 1], ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -189,7 +190,7 @@ def test_roi_feature_map_width_smaller_than_roi(self): # -------------------------------------------- expected_feature_map = tf.reshape( tf.constant([4, 4, 5, 6, 6, 7, 12, 12, 13, 14, 14, 15]), - [1, 2, 6, 1], + [1, 1, 2, 6, 1], ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -203,10 +204,43 @@ def test_roi_empty(self): rois = tf.reshape(tf.constant([0.0, 0.0, 0.0, 0.0]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # all outputs should be top-left pixel - self.assertAllClose(tf.ones([1, 2, 2, 1]), pooled_feature_map) + self.assertAllClose(tf.ones([1, 1, 2, 2, 1]), pooled_feature_map) def test_invalid_image_shape(self): with self.assertRaisesRegex(ValueError, "dynamic shape"): _ = ROIPooler( "rel_yxyx", target_size=[2, 2], image_shape=[None, 224, 3] ) + + def test_multiple_rois(self): + feature_map = tf.expand_dims( + tf.reshape(tf.range(0, 64), [8, 8, 1]), axis=0 + ) + + roi_pooler = ROIPooler( + bounding_box_format="yxyx", + target_size=[2, 2], + image_shape=[224, 224, 3], + ) + rois = tf.constant( + [[[0.0, 0.0, 112.0, 112.0], [0.0, 112.0, 224.0, 224.0]]], + ) + + pooled_feature_map = roi_pooler(feature_map, rois) + # the maximum value would be at bottom-right at each block, roi sharded + # into 2x2 blocks + # | 0, 1, 2, 3 | 4, 5, 6, 7 | + # | 8, 9, 10, 11 | 12, 13, 14, 15 | + # | 16, 17, 18, 19 | 20, 21, 22, 23 | + # | 24, 25, 26, 27(max) | 28, 29, 30, 31(max) | + # -------------------------------------------- + # | 32, 33, 34, 35 | 36, 37, 38, 39 | + # | 40, 41, 42, 43 | 44, 45, 46, 47 | + 
# | 48, 49, 50, 51 | 52, 53, 54, 55 | + # | 56, 57, 58, 59(max) | 60, 61, 62, 63(max) | + # -------------------------------------------- + + expected_feature_map = tf.reshape( + tf.constant([9, 11, 25, 27, 29, 31, 61, 63]), [1, 2, 2, 2, 1] + ) + self.assertAllClose(expected_feature_map, pooled_feature_map) From 9207602d943af82ce8c2bcc1e33b5f395971371c Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 26 Feb 2024 21:04:06 -0800 Subject: [PATCH 29/30] Add training scripts for yolov8 and deeplabv3plus (#2361) * add yolov8 training script * add deeplabv3plus traning script * code reformat --------- Co-authored-by: Divyashree Sreepathihalli --- .../training_scipts/Training_YOLOv8.ipynb | 3321 +++++++++++++++++ .../training_deeplab_v3_plus.ipynb | 569 +++ 2 files changed, 3890 insertions(+) create mode 100644 keras_cv/tools/training_scipts/Training_YOLOv8.ipynb create mode 100644 keras_cv/tools/training_scipts/training_deeplab_v3_plus.ipynb diff --git a/keras_cv/tools/training_scipts/Training_YOLOv8.ipynb b/keras_cv/tools/training_scipts/Training_YOLOv8.ipynb new file mode 100644 index 0000000000..dc0cf695e2 --- /dev/null +++ b/keras_cv/tools/training_scipts/Training_YOLOv8.ipynb @@ -0,0 +1,3321 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "gpuType": "A100", + "machine_shape": "hm" + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU" + }, + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rtDJ7E2lv01f" + }, + "outputs": [], + "source": [ + "!pip install keras-cv keras-core" + ] + }, + { + "cell_type": "code", + "source": [ + "!pip uninstall -y keras-cv\n", + "!pip install git+https://github.com/ianstenbit/keras-cv.git@task-aligned-assignment" + ], + "metadata": { + "id": "0D0rrgB5vJVj" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Copyright 2022 The KerasCV Authors\n", + "#\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "\"\"\"\n", + "Title: Train an Object Detection Model on Pascal VOC 2007 using KerasCV\n", + "Author: [lukewood](https://github.com/LukeWood), [tanzhenyu](https://github.com/tanzhenyu)\n", + "Date created: 2022/09/27\n", + "Last modified: 2023/03/29\n", + "Description: Use KerasCV to train a RetinaNet on Pascal VOC 2007.\n", + "\"\"\"\n", + "import resource\n", + "import sys\n", + "\n", + "import tensorflow as tf\n", + "import tensorflow_datasets as tfds\n", + "import tqdm\n", + "from tensorflow import keras\n", + "\n", + "import keras_cv\n", + "\n", + "# Temporarily need PyCOCOCallback to verify\n", + "# a 1:1 comparison with the PyMetrics version.\n", + "from keras_cv.callbacks import PyCOCOCallback\n", + "\n", + "low, high = resource.getrlimit(resource.RLIMIT_NOFILE)\n", + "resource.setrlimit(resource.RLIMIT_NOFILE, (high, high))" + ], + "metadata": { + 
"id": "eWYAJolSwMZ3", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "31435123-f99a-4c32-c374-93f38fc35e69" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Using TensorFlow backend\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from google.colab import auth\n", + "\n", + "auth.authenticate_user()" + ], + "metadata": { + "id": "bZ_jp2X1PKM5" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "try:\n", + " tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect()\n", + " strategy = tf.distribute.TPUStrategy(tpu)\n", + "except ValueError:\n", + " # MirroredStrategy is best for a single machine with one or multiple GPUs\n", + " strategy = tf.distribute.MirroredStrategy()\n", + "\n", + "BATCH_SIZE = 4\n", + "GLOBAL_BATCH_SIZE = BATCH_SIZE * strategy.num_replicas_in_sync\n", + "BASE_LR = 0.01 * GLOBAL_BATCH_SIZE / 64\n", + "print(\"Number of accelerators: \", strategy.num_replicas_in_sync)\n", + "print(\"Global Batch Size: \", GLOBAL_BATCH_SIZE)\n", + "\n", + "IMG_SIZE = 640\n", + "image_size = [IMG_SIZE, IMG_SIZE, 3]\n", + "\n", + "# data_dir=\"gs://kerascv-dataset\"\n", + "train_ds = tfds.load(\n", + " \"voc/2007\",\n", + " split=\"train+validation\",\n", + " with_info=False,\n", + " shuffle_files=True, # , data_dir=\"gs://kerascv-dataset\"\n", + ")\n", + "train_ds = train_ds.concatenate(\n", + " tfds.load(\n", + " \"voc/2012\",\n", + " split=\"train+validation\",\n", + " with_info=False,\n", + " shuffle_files=True,\n", + " # data_dir=\"gs://kerascv-dataset\"\n", + " )\n", + ")\n", + "eval_ds = tfds.load(\n", + " \"voc/2007\", split=\"test\", with_info=False\n", + ") # , data_dir=\"gs://kerascv-dataset\")\n", + "\n", + "\n", + "def unpackage_tfds_inputs(inputs, bounding_box_format):\n", + " image = inputs[\"image\"]\n", + " boxes = keras_cv.bounding_box.convert_format(\n", + " inputs[\"objects\"][\"bbox\"],\n", + " images=image,\n", + " source=\"rel_yxyx\",\n", + " target=bounding_box_format,\n", + " )\n", + " bounding_boxes = {\n", + " \"classes\": tf.cast(inputs[\"objects\"][\"label\"], dtype=tf.float32),\n", + " \"boxes\": tf.cast(boxes, dtype=tf.float32),\n", + " }\n", + " return {\n", + " \"images\": tf.cast(image, tf.float32),\n", + " \"bounding_boxes\": bounding_boxes,\n", + " }\n", + "\n", + "\n", + "train_ds = train_ds.map(\n", + " lambda inputs: unpackage_tfds_inputs(inputs, bounding_box_format=\"xywh\"),\n", + " num_parallel_calls=tf.data.AUTOTUNE,\n", + ")\n", + "eval_ds = eval_ds.map(\n", + " lambda inputs: unpackage_tfds_inputs(inputs, bounding_box_format=\"xywh\"),\n", + " num_parallel_calls=tf.data.AUTOTUNE,\n", + ")\n", + "\n", + "augmenter = keras.Sequential(\n", + " layers=[\n", + " keras_cv.layers.RandomFlip(\n", + " mode=\"horizontal\", bounding_box_format=\"xywh\"\n", + " ),\n", + " keras_cv.layers.JitteredResize(\n", + " target_size=(640, 640),\n", + " scale_factor=(0.8, 1.25),\n", + " bounding_box_format=\"xywh\",\n", + " ),\n", + " ]\n", + ")\n", + "train_ds = train_ds.apply(\n", + " tf.data.experimental.dense_to_ragged_batch(BATCH_SIZE)\n", + ")\n", + "train_ds = train_ds.map(augmenter, num_parallel_calls=tf.data.AUTOTUNE)\n", + "\n", + "\n", + "def pad_fn(inputs):\n", + " inputs[\"bounding_boxes\"] = keras_cv.bounding_box.to_dense(\n", + " inputs[\"bounding_boxes\"], max_boxes=32\n", + " )\n", + " return inputs\n", + "\n", + "\n", + "train_ds = train_ds.shuffle(8 * strategy.num_replicas_in_sync)\n", + 
"train_ds = train_ds.map(pad_fn, num_parallel_calls=tf.data.AUTOTUNE)\n", + "train_ds = train_ds.prefetch(tf.data.AUTOTUNE)\n", + "\n", + "eval_resizing = keras_cv.layers.Resizing(\n", + " 640, 640, pad_to_aspect_ratio=True, bounding_box_format=\"xywh\"\n", + ")\n", + "eval_ds = eval_ds.map(\n", + " eval_resizing,\n", + " num_parallel_calls=tf.data.AUTOTUNE,\n", + ")\n", + "eval_ds = eval_ds.apply(tf.data.experimental.dense_to_ragged_batch(BATCH_SIZE))\n", + "eval_ds = eval_ds.map(pad_fn, num_parallel_calls=tf.data.AUTOTUNE)\n", + "eval_ds = eval_ds.prefetch(tf.data.AUTOTUNE)" + ], + "metadata": { + "id": "96w4OHJgMseo", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "outputId": "b5a9d0f9-3730-4d5e-c6c3-47b8eaeae0e7" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "text": [ + "Number of accelerators: 1\n", + "Global Batch Size: 4\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "text": [ + "WARNING:tensorflow:From :73: dense_to_ragged_batch (from tensorflow.python.data.experimental.ops.batching) is deprecated and will be removed in a future version.\n", + "Instructions for updating:\n", + "Use `tf.data.Dataset.ragged_batch` instead.\n", + "WARNING:tensorflow:Layers in a Sequential model should only have a single input tensor. Received: inputs={'images': tf.RaggedTensor(values=tf.RaggedTensor(values=Tensor(\"RaggedFromVariant_2/RaggedTensorFromVariant:2\", shape=(None, 3), dtype=float32), row_splits=Tensor(\"RaggedFromVariant_2/RaggedTensorFromVariant:1\", shape=(None,), dtype=int64)), row_splits=Tensor(\"RaggedFromVariant_2/RaggedTensorFromVariant:0\", shape=(None,), dtype=int64)), 'bounding_boxes': {'classes': tf.RaggedTensor(values=Tensor(\"RaggedFromVariant_1/RaggedTensorFromVariant:1\", shape=(None,), dtype=float32), row_splits=Tensor(\"RaggedFromVariant_1/RaggedTensorFromVariant:0\", shape=(None,), dtype=int64)), 'boxes': tf.RaggedTensor(values=Tensor(\"RaggedFromVariant/RaggedTensorFromVariant:1\", shape=(None, 4), dtype=float32), row_splits=Tensor(\"RaggedFromVariant/RaggedTensorFromVariant:0\", shape=(None,), dtype=int64))}}. 
Consider rewriting this model with the Functional API.\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "with strategy.scope():\n", + " model = keras_cv.models.YOLOV8Detector(\n", + " num_classes=20,\n", + " backbone=keras_cv.models.YOLOV8Backbone.from_preset(\n", + " \"yolo_v8_m_backbone_coco\"\n", + " ),\n", + " fpn_depth=2,\n", + " bounding_box_format=\"xywh\",\n", + " )\n", + " lr_schedule = keras.optimizers.schedules.PolynomialDecay(\n", + " initial_learning_rate=BASE_LR,\n", + " decay_steps=train_ds.cardinality() * 120,\n", + " )\n", + " optimizer = tf.keras.optimizers.SGD(\n", + " learning_rate=lr_schedule,\n", + " momentum=0.937,\n", + " clipnorm=5.0,\n", + " weight_decay=5e-4,\n", + " use_ema=True,\n", + " ema_momentum=0.9999,\n", + " )\n", + "\n", + "model.compile(\n", + " optimizer=optimizer,\n", + " box_loss=\"ciou\",\n", + " classification_loss=\"binary_crossentropy\",\n", + ")\n", + "model.backbone.trainable = True\n", + "\n", + "callbacks = [\n", + " keras_cv.callbacks.PyCOCOCallback(eval_ds, bounding_box_format=\"xywh\"),\n", + " keras.callbacks.TensorBoard(\"gs://ian-kerascv/yolov8-gpu-logs-v4\"),\n", + " keras.callbacks.ModelCheckpoint(\n", + " \"./weights.h5\", save_best_only=True, save_weights_only=True\n", + " ),\n", + "]\n", + "\n", + "history = model.fit(\n", + " train_ds,\n", + " validation_data=eval_ds,\n", + " epochs=120,\n", + " callbacks=callbacks,\n", + ")" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "3kQ4z0AwMyEi", + "outputId": "e8131d4a-c12f-438b-8642-7bce0abeee7f" + }, + "execution_count": null, + "outputs": [ + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "Epoch 1/120\n", + " 6/4138 [..............................] - ETA: 7:32 - loss: 543.9800 - box_loss: 2.9202 - class_loss: 541.0598" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:tensorflow:Callback method `on_train_batch_end` is slow compared to the batch time (batch time: 0.1021s vs `on_train_batch_end` time: 0.3079s). 
Check your callbacks.\n" + ] + }, + { + "metadata": { + "tags": null + }, + "name": "stdout", + "output_type": "stream", + "text": [ + "1238/1238 [==============================] - 131s 101ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.04s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.06s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.005\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.010\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.004\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.006\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.008\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.009\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.009\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.000\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.012\n", + "4138/4138 [==============================] - 691s 151ms/step - loss: 6.6185 - box_loss: 2.2628 - class_loss: 4.3558 - val_loss: 2.1278 - val_box_loss: 1.9264 - val_class_loss: 0.2014 - val_AP: 0.0047 - val_AP50: 0.0099 - val_AP75: 0.0042 - val_APs: 0.0000e+00 - val_APm: 0.0000e+00 - val_APl: 0.0065 - val_ARmax1: 0.0075 - val_ARmax10: 0.0085 - val_ARmax100: 0.0085 - val_ARs: 0.0000e+00 - val_ARm: 0.0000e+00 - val_ARl: 0.0120\n", + "Epoch 2/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=0.89s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.27s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.010\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.021\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.009\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.004\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.005\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.015\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.010\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.014\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.014\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.004\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.008\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.020\n", + "4138/4138 [==============================] - 602s 145ms/step - loss: 1.9417 - box_loss: 1.7593 - class_loss: 0.1824 - val_loss: 1.8599 - val_box_loss: 1.6905 - val_class_loss: 0.1694 - val_AP: 0.0103 - val_AP50: 0.0209 - val_AP75: 0.0091 - val_APs: 0.0037 - val_APm: 0.0054 - val_APl: 0.0146 - val_ARmax1: 0.0100 - val_ARmax10: 
0.0142 - val_ARmax100: 0.0142 - val_ARs: 0.0043 - val_ARm: 0.0079 - val_ARl: 0.0196\n", + "Epoch 3/120\n", + "1238/1238 [==============================] - 110s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=2.04s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.42s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.018\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.033\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.018\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.005\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.012\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.024\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.024\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.032\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.032\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.007\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.018\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.039\n", + "4138/4138 [==============================] - 598s 144ms/step - loss: 1.7546 - box_loss: 1.5924 - class_loss: 0.1622 - val_loss: 1.7456 - val_box_loss: 1.5875 - val_class_loss: 0.1581 - val_AP: 0.0181 - val_AP50: 0.0327 - val_AP75: 0.0175 - val_APs: 0.0052 - val_APm: 0.0123 - val_APl: 0.0238 - val_ARmax1: 0.0245 - val_ARmax10: 0.0320 - val_ARmax100: 0.0320 - val_ARs: 0.0070 - val_ARm: 0.0183 - val_ARl: 0.0391\n", + "Epoch 4/120\n", + "1238/1238 [==============================] - 110s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=2.55s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.54s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.022\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.039\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.021\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.004\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.013\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.029\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.030\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.040\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.040\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.005\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.019\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.050\n", + "4138/4138 [==============================] - 600s 145ms/step - loss: 1.6476 - box_loss: 1.4961 - class_loss: 0.1515 - val_loss: 1.6936 - val_box_loss: 1.5437 - val_class_loss: 0.1498 - val_AP: 0.0215 - val_AP50: 0.0388 - val_AP75: 0.0210 - val_APs: 0.0045 - val_APm: 0.0125 - val_APl: 0.0286 - val_ARmax1: 0.0298 - val_ARmax10: 0.0399 - val_ARmax100: 0.0402 - val_ARs: 
0.0054 - val_ARm: 0.0191 - val_ARl: 0.0502\n", + "Epoch 5/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=2.24s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.59s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.029\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.050\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.031\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.006\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.020\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.036\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.047\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.060\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.061\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.010\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.028\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.070\n", + "4138/4138 [==============================] - 599s 145ms/step - loss: 1.5638 - box_loss: 1.4202 - class_loss: 0.1436 - val_loss: 1.6210 - val_box_loss: 1.4787 - val_class_loss: 0.1422 - val_AP: 0.0291 - val_AP50: 0.0499 - val_AP75: 0.0308 - val_APs: 0.0062 - val_APm: 0.0197 - val_APl: 0.0355 - val_ARmax1: 0.0471 - val_ARmax10: 0.0603 - val_ARmax100: 0.0606 - val_ARs: 0.0104 - val_ARm: 0.0280 - val_ARl: 0.0703\n", + "Epoch 6/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=2.84s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.61s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.042\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.068\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.044\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.009\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.024\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.052\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.061\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.077\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.077\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.015\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.037\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.090\n", + "4138/4138 [==============================] - 600s 145ms/step - loss: 1.5040 - box_loss: 1.3672 - class_loss: 0.1368 - val_loss: 1.5693 - val_box_loss: 1.4325 - val_class_loss: 0.1368 - val_AP: 0.0419 - val_AP50: 0.0681 - val_AP75: 0.0437 - val_APs: 0.0093 - val_APm: 0.0240 - val_APl: 0.0522 - val_ARmax1: 0.0608 - val_ARmax10: 0.0770 - val_ARmax100: 0.0773 - val_ARs: 0.0149 - val_ARm: 0.0368 - val_ARl: 
0.0904\n", + "Epoch 7/120\n", + "1238/1238 [==============================] - 112s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=2.87s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.61s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.046\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.074\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.050\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.007\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.024\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.056\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.073\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.089\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.089\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.012\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.036\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.102\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 1.4514 - box_loss: 1.3209 - class_loss: 0.1305 - val_loss: 1.5627 - val_box_loss: 1.4287 - val_class_loss: 0.1340 - val_AP: 0.0463 - val_AP50: 0.0745 - val_AP75: 0.0501 - val_APs: 0.0068 - val_APm: 0.0244 - val_APl: 0.0561 - val_ARmax1: 0.0731 - val_ARmax10: 0.0886 - val_ARmax100: 0.0888 - val_ARs: 0.0122 - val_ARm: 0.0364 - val_ARl: 0.1023\n", + "Epoch 8/120\n", + "1238/1238 [==============================] - 112s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=3.06s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.65s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.065\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.102\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.070\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.011\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.034\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.077\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.094\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.114\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.115\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.015\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.052\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.129\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 1.4050 - box_loss: 1.2801 - class_loss: 0.1249 - val_loss: 1.5070 - val_box_loss: 1.3815 - val_class_loss: 0.1255 - val_AP: 0.0653 - val_AP50: 0.1017 - val_AP75: 0.0695 - val_APs: 0.0106 - val_APm: 0.0342 - val_APl: 0.0768 - val_ARmax1: 0.0936 - val_ARmax10: 0.1144 - val_ARmax100: 0.1146 - val_ARs: 0.0149 - val_ARm: 0.0523 - val_ARl: 0.1287\n", + "Epoch 9/120\n", + "1238/1238 
[==============================] - 111s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=3.57s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.77s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.072\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.115\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.076\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.012\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.044\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.084\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.105\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.132\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.132\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.018\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.066\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.144\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 1.3625 - box_loss: 1.2422 - class_loss: 0.1203 - val_loss: 1.4845 - val_box_loss: 1.3633 - val_class_loss: 0.1212 - val_AP: 0.0721 - val_AP50: 0.1148 - val_AP75: 0.0762 - val_APs: 0.0118 - val_APm: 0.0445 - val_APl: 0.0836 - val_ARmax1: 0.1051 - val_ARmax10: 0.1315 - val_ARmax100: 0.1322 - val_ARs: 0.0185 - val_ARm: 0.0664 - val_ARl: 0.1444\n", + "Epoch 10/120\n", + "1238/1238 [==============================] - 112s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=3.61s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.78s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.082\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.129\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.088\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.008\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.045\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.094\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.117\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.144\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.144\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.013\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.072\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.157\n", + "4138/4138 [==============================] - 607s 146ms/step - loss: 1.3286 - box_loss: 1.2130 - class_loss: 0.1156 - val_loss: 1.4917 - val_box_loss: 1.3721 - val_class_loss: 0.1197 - val_AP: 0.0815 - val_AP50: 0.1286 - val_AP75: 0.0876 - val_APs: 0.0083 - val_APm: 0.0448 - val_APl: 0.0937 - val_ARmax1: 0.1170 - val_ARmax10: 0.1436 - val_ARmax100: 0.1445 - val_ARs: 0.0134 - val_ARm: 0.0724 - val_ARl: 0.1566\n", + "Epoch 11/120\n", + "1238/1238 [==============================] - 112s 
90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=3.60s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.40s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.105\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.162\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.112\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.012\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.052\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.122\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.143\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.172\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.173\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.020\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.080\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.191\n", + "4138/4138 [==============================] - 609s 147ms/step - loss: 1.2937 - box_loss: 1.1823 - class_loss: 0.1115 - val_loss: 1.4541 - val_box_loss: 1.3397 - val_class_loss: 0.1145 - val_AP: 0.1046 - val_AP50: 0.1617 - val_AP75: 0.1123 - val_APs: 0.0123 - val_APm: 0.0518 - val_APl: 0.1221 - val_ARmax1: 0.1429 - val_ARmax10: 0.1721 - val_ARmax100: 0.1732 - val_ARs: 0.0201 - val_ARm: 0.0803 - val_ARl: 0.1909\n", + "Epoch 12/120\n", + "1238/1238 [==============================] - 111s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=3.85s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.83s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.113\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.175\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.121\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.016\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.057\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.133\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.155\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.188\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.190\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.027\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.090\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.209\n", + "4138/4138 [==============================] - 609s 147ms/step - loss: 1.2626 - box_loss: 1.1547 - class_loss: 0.1079 - val_loss: 1.4111 - val_box_loss: 1.3020 - val_class_loss: 0.1091 - val_AP: 0.1134 - val_AP50: 0.1750 - val_AP75: 0.1213 - val_APs: 0.0162 - val_APm: 0.0569 - val_APl: 0.1331 - val_ARmax1: 0.1548 - val_ARmax10: 0.1882 - val_ARmax100: 0.1899 - val_ARs: 0.0272 - val_ARm: 0.0898 - val_ARl: 0.2093\n", + "Epoch 13/120\n", + "1238/1238 [==============================] - 111s 90ms/step\n", + "creating index...\n", + 
"index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=3.99s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.88s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.124\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.195\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.133\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.014\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.065\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.146\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.165\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.206\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.208\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.023\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.230\n", + "4138/4138 [==============================] - 607s 146ms/step - loss: 1.2352 - box_loss: 1.1307 - class_loss: 0.1045 - val_loss: 1.4255 - val_box_loss: 1.3185 - val_class_loss: 0.1070 - val_AP: 0.1244 - val_AP50: 0.1953 - val_AP75: 0.1332 - val_APs: 0.0138 - val_APm: 0.0646 - val_APl: 0.1459 - val_ARmax1: 0.1654 - val_ARmax10: 0.2064 - val_ARmax100: 0.2077 - val_ARs: 0.0229 - val_ARm: 0.1026 - val_ARl: 0.2300\n", + "Epoch 14/120\n", + "1238/1238 [==============================] - 112s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=3.86s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.83s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.132\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.203\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.142\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.013\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.062\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.156\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.178\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.216\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.218\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.023\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.096\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.243\n", + "4138/4138 [==============================] - 608s 147ms/step - loss: 1.2154 - box_loss: 1.1132 - class_loss: 0.1022 - val_loss: 1.4007 - val_box_loss: 1.2956 - val_class_loss: 0.1051 - val_AP: 0.1324 - val_AP50: 0.2034 - val_AP75: 0.1419 - val_APs: 0.0131 - val_APm: 0.0619 - val_APl: 0.1560 - val_ARmax1: 0.1782 - val_ARmax10: 0.2162 - val_ARmax100: 0.2178 - val_ARs: 0.0225 - val_ARm: 0.0956 - val_ARl: 0.2432\n", + "Epoch 15/120\n", + "1238/1238 [==============================] - 111s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating 
index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=3.96s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.87s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.133\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.206\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.141\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.014\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.065\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.156\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.177\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.216\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.217\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.021\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.106\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.239\n", + "4138/4138 [==============================] - 608s 147ms/step - loss: 1.1902 - box_loss: 1.0915 - class_loss: 0.0987 - val_loss: 1.3892 - val_box_loss: 1.2849 - val_class_loss: 0.1043 - val_AP: 0.1328 - val_AP50: 0.2063 - val_AP75: 0.1407 - val_APs: 0.0145 - val_APm: 0.0651 - val_APl: 0.1556 - val_ARmax1: 0.1765 - val_ARmax10: 0.2159 - val_ARmax100: 0.2173 - val_ARs: 0.0214 - val_ARm: 0.1058 - val_ARl: 0.2389\n", + "Epoch 16/120\n", + "1238/1238 [==============================] - 112s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.21s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.91s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.158\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.241\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.169\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.015\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.084\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.184\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.194\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.244\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.246\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.028\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.133\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.270\n", + "4138/4138 [==============================] - 609s 147ms/step - loss: 1.1717 - box_loss: 1.0749 - class_loss: 0.0968 - val_loss: 1.3653 - val_box_loss: 1.2657 - val_class_loss: 0.0995 - val_AP: 0.1579 - val_AP50: 0.2411 - val_AP75: 0.1686 - val_APs: 0.0151 - val_APm: 0.0837 - val_APl: 0.1844 - val_ARmax1: 0.1943 - val_ARmax10: 0.2435 - val_ARmax100: 0.2456 - val_ARs: 0.0282 - val_ARm: 0.1330 - val_ARl: 0.2696\n", + "Epoch 17/120\n", + "1238/1238 [==============================] - 112s 90ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + 
"Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.29s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.94s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.161\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.247\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.172\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.020\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.081\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.190\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.201\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.252\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.254\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.038\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.125\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.283\n", + "4138/4138 [==============================] - 609s 147ms/step - loss: 1.1573 - box_loss: 1.0629 - class_loss: 0.0944 - val_loss: 1.3572 - val_box_loss: 1.2598 - val_class_loss: 0.0975 - val_AP: 0.1610 - val_AP50: 0.2467 - val_AP75: 0.1725 - val_APs: 0.0197 - val_APm: 0.0810 - val_APl: 0.1901 - val_ARmax1: 0.2014 - val_ARmax10: 0.2520 - val_ARmax100: 0.2542 - val_ARs: 0.0384 - val_ARm: 0.1252 - val_ARl: 0.2830\n", + "Epoch 18/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.34s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.95s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.159\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.247\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.170\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.020\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.080\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.186\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.201\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.256\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.258\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.038\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.126\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.287\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 1.1259 - box_loss: 1.0339 - class_loss: 0.0919 - val_loss: 1.3639 - val_box_loss: 1.2669 - val_class_loss: 0.0970 - val_AP: 0.1592 - val_AP50: 0.2474 - val_AP75: 0.1698 - val_APs: 0.0199 - val_APm: 0.0800 - val_APl: 0.1865 - val_ARmax1: 0.2013 - val_ARmax10: 0.2562 - val_ARmax100: 0.2577 - val_ARs: 0.0376 - val_ARm: 0.1261 - val_ARl: 0.2873\n", + "Epoch 19/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + 
"Evaluate annotation type *bbox*\n", + "DONE (t=4.31s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.94s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.176\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.266\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.188\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.019\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.089\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.203\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.212\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.268\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.271\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.032\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.136\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.300\n", + "4138/4138 [==============================] - 602s 145ms/step - loss: 1.1077 - box_loss: 1.0178 - class_loss: 0.0900 - val_loss: 1.3421 - val_box_loss: 1.2475 - val_class_loss: 0.0945 - val_AP: 0.1757 - val_AP50: 0.2661 - val_AP75: 0.1880 - val_APs: 0.0188 - val_APm: 0.0893 - val_APl: 0.2034 - val_ARmax1: 0.2120 - val_ARmax10: 0.2684 - val_ARmax100: 0.2708 - val_ARs: 0.0320 - val_ARm: 0.1356 - val_ARl: 0.2999\n", + "Epoch 20/120\n", + "1238/1238 [==============================] - 110s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.21s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.93s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.177\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.270\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.191\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.021\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.091\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.206\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.212\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.269\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.271\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.037\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.140\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.299\n", + "4138/4138 [==============================] - 602s 145ms/step - loss: 1.0830 - box_loss: 0.9950 - class_loss: 0.0881 - val_loss: 1.3855 - val_box_loss: 1.2876 - val_class_loss: 0.0979 - val_AP: 0.1770 - val_AP50: 0.2697 - val_AP75: 0.1908 - val_APs: 0.0213 - val_APm: 0.0914 - val_APl: 0.2056 - val_ARmax1: 0.2124 - val_ARmax10: 0.2694 - val_ARmax100: 0.2712 - val_ARs: 0.0366 - val_ARm: 0.1404 - val_ARl: 0.2993\n", + "Epoch 21/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + 
"DONE (t=4.97s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.96s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.190\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.284\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.207\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.018\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.099\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.217\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.229\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.287\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.289\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.031\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.145\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.319\n", + "4138/4138 [==============================] - 603s 145ms/step - loss: 1.0674 - box_loss: 0.9810 - class_loss: 0.0863 - val_loss: 1.3425 - val_box_loss: 1.2493 - val_class_loss: 0.0931 - val_AP: 0.1900 - val_AP50: 0.2840 - val_AP75: 0.2069 - val_APs: 0.0183 - val_APm: 0.0995 - val_APl: 0.2173 - val_ARmax1: 0.2290 - val_ARmax10: 0.2867 - val_ARmax100: 0.2886 - val_ARs: 0.0306 - val_ARm: 0.1447 - val_ARl: 0.3187\n", + "Epoch 22/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.41s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.95s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.194\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.293\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.209\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.023\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.096\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.225\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.231\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.291\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.293\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.040\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.150\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.324\n", + "4138/4138 [==============================] - 602s 145ms/step - loss: 1.0508 - box_loss: 0.9663 - class_loss: 0.0845 - val_loss: 1.3495 - val_box_loss: 1.2578 - val_class_loss: 0.0917 - val_AP: 0.1935 - val_AP50: 0.2931 - val_AP75: 0.2091 - val_APs: 0.0234 - val_APm: 0.0964 - val_APl: 0.2253 - val_ARmax1: 0.2311 - val_ARmax10: 0.2913 - val_ARmax100: 0.2934 - val_ARs: 0.0398 - val_ARm: 0.1498 - val_ARl: 0.3241\n", + "Epoch 23/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.42s).\n", + "Accumulating 
evaluation results...\n", + "DONE (t=0.97s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.204\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.306\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.218\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.019\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.102\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.235\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.242\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.307\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.309\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.037\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.169\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.341\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 1.0350 - box_loss: 0.9521 - class_loss: 0.0828 - val_loss: 1.3130 - val_box_loss: 1.2233 - val_class_loss: 0.0898 - val_AP: 0.2039 - val_AP50: 0.3057 - val_AP75: 0.2180 - val_APs: 0.0193 - val_APm: 0.1022 - val_APl: 0.2348 - val_ARmax1: 0.2422 - val_ARmax10: 0.3070 - val_ARmax100: 0.3091 - val_ARs: 0.0375 - val_ARm: 0.1691 - val_ARl: 0.3412\n", + "Epoch 24/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.48s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.98s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.212\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.316\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.229\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.023\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.098\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.250\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.252\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.316\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.317\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.040\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.154\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.357\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 1.0194 - box_loss: 0.9381 - class_loss: 0.0814 - val_loss: 1.3040 - val_box_loss: 1.2155 - val_class_loss: 0.0885 - val_AP: 0.2125 - val_AP50: 0.3158 - val_AP75: 0.2290 - val_APs: 0.0232 - val_APm: 0.0982 - val_APl: 0.2502 - val_ARmax1: 0.2524 - val_ARmax10: 0.3155 - val_ARmax100: 0.3172 - val_ARs: 0.0405 - val_ARm: 0.1543 - val_ARl: 0.3571\n", + "Epoch 25/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.45s).\n", + "Accumulating evaluation results...\n", + "DONE 
(t=0.97s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.218\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.327\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.236\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.021\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.103\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.254\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.251\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.320\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.323\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.038\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.157\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.361\n", + "4138/4138 [==============================] - 603s 146ms/step - loss: 1.0019 - box_loss: 0.9219 - class_loss: 0.0800 - val_loss: 1.3165 - val_box_loss: 1.2270 - val_class_loss: 0.0894 - val_AP: 0.2179 - val_AP50: 0.3266 - val_AP75: 0.2365 - val_APs: 0.0208 - val_APm: 0.1035 - val_APl: 0.2541 - val_ARmax1: 0.2513 - val_ARmax10: 0.3197 - val_ARmax100: 0.3226 - val_ARs: 0.0379 - val_ARm: 0.1568 - val_ARl: 0.3611\n", + "Epoch 26/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.56s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.99s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.225\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.332\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.244\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.022\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.108\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.261\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.257\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.326\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.328\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.041\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.165\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.368\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.9869 - box_loss: 0.9086 - class_loss: 0.0784 - val_loss: 1.2998 - val_box_loss: 1.2129 - val_class_loss: 0.0870 - val_AP: 0.2245 - val_AP50: 0.3321 - val_AP75: 0.2440 - val_APs: 0.0215 - val_APm: 0.1084 - val_APl: 0.2607 - val_ARmax1: 0.2570 - val_ARmax10: 0.3265 - val_ARmax100: 0.3285 - val_ARs: 0.0409 - val_ARm: 0.1651 - val_ARl: 0.3679\n", + "Epoch 27/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.59s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision 
(AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.219\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.330\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.237\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.021\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.110\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.254\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.250\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.321\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.324\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.038\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.160\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.362\n", + "4138/4138 [==============================] - 603s 146ms/step - loss: 0.9912 - box_loss: 0.9128 - class_loss: 0.0784 - val_loss: 1.3152 - val_box_loss: 1.2280 - val_class_loss: 0.0871 - val_AP: 0.2189 - val_AP50: 0.3299 - val_AP75: 0.2372 - val_APs: 0.0215 - val_APm: 0.1100 - val_APl: 0.2543 - val_ARmax1: 0.2501 - val_ARmax10: 0.3212 - val_ARmax100: 0.3235 - val_ARs: 0.0384 - val_ARm: 0.1604 - val_ARl: 0.3620\n", + "Epoch 28/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.50s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.98s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.226\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.337\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.244\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.032\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.111\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.263\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.265\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.336\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.339\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.053\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.170\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.378\n", + "4138/4138 [==============================] - 603s 146ms/step - loss: 0.9677 - box_loss: 0.8910 - class_loss: 0.0767 - val_loss: 1.2900 - val_box_loss: 1.2043 - val_class_loss: 0.0857 - val_AP: 0.2261 - val_AP50: 0.3373 - val_AP75: 0.2444 - val_APs: 0.0319 - val_APm: 0.1110 - val_APl: 0.2632 - val_ARmax1: 0.2649 - val_ARmax10: 0.3365 - val_ARmax100: 0.3386 - val_ARs: 0.0533 - val_ARm: 0.1699 - val_ARl: 0.3782\n", + "Epoch 29/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.59s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | 
maxDets=100 ] = 0.236\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.349\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.253\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.026\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.110\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.279\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.265\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.341\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.343\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.044\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.168\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.387\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.9463 - box_loss: 0.8713 - class_loss: 0.0750 - val_loss: 1.2881 - val_box_loss: 1.2031 - val_class_loss: 0.0850 - val_AP: 0.2356 - val_AP50: 0.3489 - val_AP75: 0.2529 - val_APs: 0.0257 - val_APm: 0.1103 - val_APl: 0.2786 - val_ARmax1: 0.2648 - val_ARmax10: 0.3408 - val_ARmax100: 0.3434 - val_ARs: 0.0444 - val_ARm: 0.1682 - val_ARl: 0.3865\n", + "Epoch 30/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.57s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.242\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.359\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.262\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.025\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.118\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.283\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.267\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.346\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.349\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.047\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.193\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.390\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.9327 - box_loss: 0.8588 - class_loss: 0.0739 - val_loss: 1.2842 - val_box_loss: 1.2005 - val_class_loss: 0.0837 - val_AP: 0.2424 - val_AP50: 0.3590 - val_AP75: 0.2625 - val_APs: 0.0253 - val_APm: 0.1184 - val_APl: 0.2826 - val_ARmax1: 0.2675 - val_ARmax10: 0.3460 - val_ARmax100: 0.3487 - val_ARs: 0.0470 - val_ARm: 0.1929 - val_ARl: 0.3903\n", + "Epoch 31/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.17s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.98s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.240\n", + " Average 
Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.354\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.259\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.021\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.123\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.276\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.270\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.344\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.346\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.043\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.182\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.384\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.9352 - box_loss: 0.8616 - class_loss: 0.0736 - val_loss: 1.2895 - val_box_loss: 1.2052 - val_class_loss: 0.0843 - val_AP: 0.2398 - val_AP50: 0.3543 - val_AP75: 0.2587 - val_APs: 0.0212 - val_APm: 0.1228 - val_APl: 0.2762 - val_ARmax1: 0.2696 - val_ARmax10: 0.3438 - val_ARmax100: 0.3463 - val_ARs: 0.0434 - val_ARm: 0.1816 - val_ARl: 0.3837\n", + "Epoch 32/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.50s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.98s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.249\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.365\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.272\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.026\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.121\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.290\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.276\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.355\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.357\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.045\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.183\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.399\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.9127 - box_loss: 0.8407 - class_loss: 0.0720 - val_loss: 1.2811 - val_box_loss: 1.1979 - val_class_loss: 0.0832 - val_AP: 0.2487 - val_AP50: 0.3646 - val_AP75: 0.2717 - val_APs: 0.0258 - val_APm: 0.1215 - val_APl: 0.2905 - val_ARmax1: 0.2761 - val_ARmax10: 0.3550 - val_ARmax100: 0.3570 - val_ARs: 0.0448 - val_ARm: 0.1827 - val_ARl: 0.3985\n", + "Epoch 33/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.17s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.97s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.251\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | 
maxDets=100 ] = 0.371\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.276\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.033\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.118\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.297\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.279\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.356\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.359\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.050\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.196\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.405\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.8977 - box_loss: 0.8271 - class_loss: 0.0706 - val_loss: 1.2887 - val_box_loss: 1.2060 - val_class_loss: 0.0827 - val_AP: 0.2508 - val_AP50: 0.3712 - val_AP75: 0.2756 - val_APs: 0.0326 - val_APm: 0.1178 - val_APl: 0.2970 - val_ARmax1: 0.2792 - val_ARmax10: 0.3563 - val_ARmax100: 0.3586 - val_ARs: 0.0503 - val_ARm: 0.1955 - val_ARl: 0.4048\n", + "Epoch 34/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.65s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.255\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.375\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.278\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.027\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.130\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.295\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.273\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.354\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.356\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.052\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.189\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.398\n", + "4138/4138 [==============================] - 603s 146ms/step - loss: 0.9026 - box_loss: 0.8319 - class_loss: 0.0707 - val_loss: 1.2777 - val_box_loss: 1.1964 - val_class_loss: 0.0813 - val_AP: 0.2549 - val_AP50: 0.3751 - val_AP75: 0.2778 - val_APs: 0.0273 - val_APm: 0.1301 - val_APl: 0.2946 - val_ARmax1: 0.2730 - val_ARmax10: 0.3539 - val_ARmax100: 0.3564 - val_ARs: 0.0517 - val_ARm: 0.1891 - val_ARl: 0.3976\n", + "Epoch 35/120\n", + "1238/1238 [==============================] - 110s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.30s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.262\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.384\n", + " Average 
Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.285\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.030\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.132\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.305\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.286\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.368\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.371\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.056\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.190\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.418\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.8759 - box_loss: 0.8072 - class_loss: 0.0687 - val_loss: 1.2667 - val_box_loss: 1.1857 - val_class_loss: 0.0810 - val_AP: 0.2618 - val_AP50: 0.3838 - val_AP75: 0.2846 - val_APs: 0.0297 - val_APm: 0.1319 - val_APl: 0.3055 - val_ARmax1: 0.2858 - val_ARmax10: 0.3679 - val_ARmax100: 0.3710 - val_ARs: 0.0557 - val_ARm: 0.1896 - val_ARl: 0.4176\n", + "Epoch 36/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.58s).\n", + "Accumulating evaluation results...\n", + "DONE (t=0.98s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.264\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.386\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.292\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.026\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.131\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.309\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.284\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.366\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.369\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.052\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.204\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.416\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.8657 - box_loss: 0.7976 - class_loss: 0.0681 - val_loss: 1.2801 - val_box_loss: 1.1996 - val_class_loss: 0.0806 - val_AP: 0.2638 - val_AP50: 0.3863 - val_AP75: 0.2921 - val_APs: 0.0264 - val_APm: 0.1312 - val_APl: 0.3094 - val_ARmax1: 0.2841 - val_ARmax10: 0.3660 - val_ARmax100: 0.3688 - val_ARs: 0.0525 - val_ARm: 0.2043 - val_ARl: 0.4160\n", + "Epoch 37/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.28s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.270\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.394\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | 
maxDets=100 ] = 0.295\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.026\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.130\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.319\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.293\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.375\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.377\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.049\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.188\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.429\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.8567 - box_loss: 0.7897 - class_loss: 0.0669 - val_loss: 1.2754 - val_box_loss: 1.1958 - val_class_loss: 0.0796 - val_AP: 0.2705 - val_AP50: 0.3944 - val_AP75: 0.2952 - val_APs: 0.0261 - val_APm: 0.1305 - val_APl: 0.3192 - val_ARmax1: 0.2927 - val_ARmax10: 0.3745 - val_ARmax100: 0.3773 - val_ARs: 0.0494 - val_ARm: 0.1877 - val_ARl: 0.4288\n", + "Epoch 38/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.65s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.265\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.390\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.288\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.027\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.131\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.310\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.288\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.368\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.371\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.051\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.193\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.8617 - box_loss: 0.7948 - class_loss: 0.0668 - val_loss: 1.2780 - val_box_loss: 1.1982 - val_class_loss: 0.0798 - val_AP: 0.2647 - val_AP50: 0.3895 - val_AP75: 0.2877 - val_APs: 0.0271 - val_APm: 0.1308 - val_APl: 0.3099 - val_ARmax1: 0.2877 - val_ARmax10: 0.3683 - val_ARmax100: 0.3712 - val_ARs: 0.0512 - val_ARm: 0.1928 - val_ARl: 0.4172\n", + "Epoch 39/120\n", + "1238/1238 [==============================] - 110s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.67s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.278\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.408\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.301\n", + " Average 
Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.032\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.137\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.326\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.302\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.386\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.389\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.059\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.220\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.439\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.8383 - box_loss: 0.7730 - class_loss: 0.0653 - val_loss: 1.2666 - val_box_loss: 1.1879 - val_class_loss: 0.0787 - val_AP: 0.2784 - val_AP50: 0.4080 - val_AP75: 0.3013 - val_APs: 0.0319 - val_APm: 0.1374 - val_APl: 0.3264 - val_ARmax1: 0.3018 - val_ARmax10: 0.3864 - val_ARmax100: 0.3893 - val_ARs: 0.0592 - val_ARm: 0.2203 - val_ARl: 0.4393\n", + "Epoch 40/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.68s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.275\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.405\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.298\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.030\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.138\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.322\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.295\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.381\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.384\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.056\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.220\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.432\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.8355 - box_loss: 0.7700 - class_loss: 0.0654 - val_loss: 1.2627 - val_box_loss: 1.1846 - val_class_loss: 0.0781 - val_AP: 0.2753 - val_AP50: 0.4052 - val_AP75: 0.2975 - val_APs: 0.0302 - val_APm: 0.1379 - val_APl: 0.3224 - val_ARmax1: 0.2946 - val_ARmax10: 0.3813 - val_ARmax100: 0.3843 - val_ARs: 0.0557 - val_ARm: 0.2196 - val_ARl: 0.4316\n", + "Epoch 41/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.76s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.03s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.280\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.408\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.305\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= 
small | maxDets=100 ] = 0.028\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.136\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.327\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.298\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.386\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.388\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.054\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.201\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.439\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.8167 - box_loss: 0.7530 - class_loss: 0.0637 - val_loss: 1.2644 - val_box_loss: 1.1860 - val_class_loss: 0.0784 - val_AP: 0.2801 - val_AP50: 0.4076 - val_AP75: 0.3049 - val_APs: 0.0284 - val_APm: 0.1360 - val_APl: 0.3274 - val_ARmax1: 0.2981 - val_ARmax10: 0.3858 - val_ARmax100: 0.3884 - val_ARs: 0.0537 - val_ARm: 0.2014 - val_ARl: 0.4390\n", + "Epoch 42/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.71s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.03s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.280\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.411\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.305\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.029\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.145\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.325\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.304\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.393\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.396\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.053\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.227\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.443\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.8121 - box_loss: 0.7488 - class_loss: 0.0632 - val_loss: 1.2774 - val_box_loss: 1.1994 - val_class_loss: 0.0780 - val_AP: 0.2802 - val_AP50: 0.4109 - val_AP75: 0.3048 - val_APs: 0.0288 - val_APm: 0.1448 - val_APl: 0.3250 - val_ARmax1: 0.3042 - val_ARmax10: 0.3932 - val_ARmax100: 0.3957 - val_ARs: 0.0531 - val_ARm: 0.2267 - val_ARl: 0.4429\n", + "Epoch 43/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.68s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.283\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.411\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.311\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.033\n", + " 
Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.143\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.330\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.303\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.388\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.391\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.057\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.201\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.441\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.7997 - box_loss: 0.7371 - class_loss: 0.0626 - val_loss: 1.2725 - val_box_loss: 1.1946 - val_class_loss: 0.0779 - val_AP: 0.2833 - val_AP50: 0.4112 - val_AP75: 0.3107 - val_APs: 0.0331 - val_APm: 0.1429 - val_APl: 0.3305 - val_ARmax1: 0.3029 - val_ARmax10: 0.3885 - val_ARmax100: 0.3910 - val_ARs: 0.0567 - val_ARm: 0.2009 - val_ARl: 0.4407\n", + "Epoch 44/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.77s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.03s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.290\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.424\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.314\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.036\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.148\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.338\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.309\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.397\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.400\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.064\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.230\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.450\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.7893 - box_loss: 0.7278 - class_loss: 0.0614 - val_loss: 1.2544 - val_box_loss: 1.1779 - val_class_loss: 0.0765 - val_AP: 0.2903 - val_AP50: 0.4244 - val_AP75: 0.3143 - val_APs: 0.0358 - val_APm: 0.1483 - val_APl: 0.3378 - val_ARmax1: 0.3086 - val_ARmax10: 0.3971 - val_ARmax100: 0.4001 - val_ARs: 0.0640 - val_ARm: 0.2296 - val_ARl: 0.4495\n", + "Epoch 45/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.70s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.290\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.423\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.316\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.029\n", + " Average Precision (AP) @[ IoU=0.50:0.95 
| area=medium | maxDets=100 ] = 0.139\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.341\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.308\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.398\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.401\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.055\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.224\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.454\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.7990 - box_loss: 0.7371 - class_loss: 0.0619 - val_loss: 1.2662 - val_box_loss: 1.1904 - val_class_loss: 0.0758 - val_AP: 0.2895 - val_AP50: 0.4227 - val_AP75: 0.3165 - val_APs: 0.0289 - val_APm: 0.1391 - val_APl: 0.3410 - val_ARmax1: 0.3084 - val_ARmax10: 0.3979 - val_ARmax100: 0.4006 - val_ARs: 0.0549 - val_ARm: 0.2241 - val_ARl: 0.4542\n", + "Epoch 46/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.70s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.296\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.426\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.326\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.034\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.142\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.350\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.310\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.402\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.405\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.059\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.208\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.461\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.7900 - box_loss: 0.7289 - class_loss: 0.0612 - val_loss: 1.2528 - val_box_loss: 1.1772 - val_class_loss: 0.0756 - val_AP: 0.2959 - val_AP50: 0.4262 - val_AP75: 0.3258 - val_APs: 0.0342 - val_APm: 0.1417 - val_APl: 0.3502 - val_ARmax1: 0.3102 - val_ARmax10: 0.4024 - val_ARmax100: 0.4052 - val_ARs: 0.0591 - val_ARm: 0.2084 - val_ARl: 0.4609\n", + "Epoch 47/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.71s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.294\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.427\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.319\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.028\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 
0.147\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.347\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.311\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.403\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.406\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.049\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.215\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.463\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.7789 - box_loss: 0.7188 - class_loss: 0.0602 - val_loss: 1.2567 - val_box_loss: 1.1812 - val_class_loss: 0.0756 - val_AP: 0.2944 - val_AP50: 0.4271 - val_AP75: 0.3193 - val_APs: 0.0284 - val_APm: 0.1468 - val_APl: 0.3470 - val_ARmax1: 0.3115 - val_ARmax10: 0.4032 - val_ARmax100: 0.4064 - val_ARs: 0.0493 - val_ARm: 0.2145 - val_ARl: 0.4629\n", + "Epoch 48/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.77s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.297\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.430\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.321\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.031\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.150\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.349\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.311\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.398\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.401\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.060\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.237\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.453\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.7607 - box_loss: 0.7021 - class_loss: 0.0586 - val_loss: 1.2492 - val_box_loss: 1.1740 - val_class_loss: 0.0752 - val_AP: 0.2970 - val_AP50: 0.4296 - val_AP75: 0.3210 - val_APs: 0.0312 - val_APm: 0.1496 - val_APl: 0.3494 - val_ARmax1: 0.3109 - val_ARmax10: 0.3985 - val_ARmax100: 0.4011 - val_ARs: 0.0601 - val_ARm: 0.2369 - val_ARl: 0.4535\n", + "Epoch 49/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.28s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.299\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.433\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.326\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.043\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.149\n", + " Average Precision (AP) @[ 
IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.350\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.316\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.406\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.409\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.070\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.231\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.463\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.7512 - box_loss: 0.6928 - class_loss: 0.0584 - val_loss: 1.2506 - val_box_loss: 1.1753 - val_class_loss: 0.0753 - val_AP: 0.2995 - val_AP50: 0.4335 - val_AP75: 0.3263 - val_APs: 0.0429 - val_APm: 0.1490 - val_APl: 0.3504 - val_ARmax1: 0.3162 - val_ARmax10: 0.4063 - val_ARmax100: 0.4092 - val_ARs: 0.0701 - val_ARm: 0.2315 - val_ARl: 0.4631\n", + "Epoch 50/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.76s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.04s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.300\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.433\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.325\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.035\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.145\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.349\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.309\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.403\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.407\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.066\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.233\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.456\n", + "4138/4138 [==============================] - 604s 146ms/step - loss: 0.7524 - box_loss: 0.6940 - class_loss: 0.0584 - val_loss: 1.2530 - val_box_loss: 1.1776 - val_class_loss: 0.0754 - val_AP: 0.2998 - val_AP50: 0.4334 - val_AP75: 0.3254 - val_APs: 0.0348 - val_APm: 0.1449 - val_APl: 0.3493 - val_ARmax1: 0.3089 - val_ARmax10: 0.4035 - val_ARmax100: 0.4067 - val_ARs: 0.0658 - val_ARm: 0.2331 - val_ARl: 0.4563\n", + "Epoch 51/120\n", + "1238/1238 [==============================] - 110s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.34s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.308\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.445\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.335\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.038\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.151\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | 
maxDets=100 ] = 0.359\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.320\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.414\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.417\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.063\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.239\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.468\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.7398 - box_loss: 0.6825 - class_loss: 0.0573 - val_loss: 1.2493 - val_box_loss: 1.1759 - val_class_loss: 0.0734 - val_AP: 0.3077 - val_AP50: 0.4453 - val_AP75: 0.3355 - val_APs: 0.0377 - val_APm: 0.1513 - val_APl: 0.3586 - val_ARmax1: 0.3196 - val_ARmax10: 0.4140 - val_ARmax100: 0.4170 - val_ARs: 0.0629 - val_ARm: 0.2392 - val_ARl: 0.4681\n", + "Epoch 52/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.79s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.03s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.311\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.455\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.336\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.036\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.154\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.362\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.322\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.420\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.423\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.063\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.249\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.475\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.7419 - box_loss: 0.6849 - class_loss: 0.0571 - val_loss: 1.2458 - val_box_loss: 1.1725 - val_class_loss: 0.0734 - val_AP: 0.3109 - val_AP50: 0.4545 - val_AP75: 0.3357 - val_APs: 0.0356 - val_APm: 0.1545 - val_APl: 0.3621 - val_ARmax1: 0.3220 - val_ARmax10: 0.4198 - val_ARmax100: 0.4232 - val_ARs: 0.0633 - val_ARm: 0.2493 - val_ARl: 0.4754\n", + "Epoch 53/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.73s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.301\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.438\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.325\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.030\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.147\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.352\n", + " Average 
Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.315\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.406\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.409\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.060\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.240\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.460\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.7359 - box_loss: 0.6793 - class_loss: 0.0566 - val_loss: 1.2609 - val_box_loss: 1.1857 - val_class_loss: 0.0752 - val_AP: 0.3013 - val_AP50: 0.4378 - val_AP75: 0.3249 - val_APs: 0.0298 - val_APm: 0.1468 - val_APl: 0.3519 - val_ARmax1: 0.3147 - val_ARmax10: 0.4062 - val_ARmax100: 0.4090 - val_ARs: 0.0602 - val_ARm: 0.2396 - val_ARl: 0.4605\n",
[Epochs 54-94 repeat the same recorded output per epoch: a 1238-step evaluation pass (~111 s, 89 ms/step), COCO index creation and per-image *bbox* evaluation, an AP/AR printout whose values mirror the val_* metrics of the epoch summary, and a 4138-step training epoch (~605 s, 146 ms/step). Key metrics from each epoch summary:

 epoch   loss    val_loss  val_AP  val_AP50
  54    0.7176   1.2498    0.3144   0.4522
  55    0.7078   1.2627    0.3092   0.4493
  56    0.6986   1.2498    0.3095   0.4480
  57    0.7083   1.2516    0.3122   0.4505
  58    0.6981   1.2455    0.3160   0.4565
  59    0.6801   1.2500    0.3152   0.4553
  60    0.6702   1.2401    0.3192   0.4590
  61    0.6786   1.2553    0.3167   0.4597
  62    0.6773   1.2505    0.3231   0.4665
  63    0.6592   1.2489    0.3214   0.4682
  64    0.6498   1.2489    0.3250   0.4672
  65    0.6423   1.2458    0.3257   0.4712
  66    0.6395   1.2390    0.3281   0.4756
  67    0.6309   1.2445    0.3271   0.4718
  68    0.6405   1.2396    0.3300   0.4762
  69    0.6178   1.2288    0.3348   0.4811
  70    0.6195   1.2435    0.3274   0.4711
  71    0.6085   1.2360    0.3283   0.4726
  72    0.6023   1.2502    0.3315   0.4766
  73    0.5951   1.2444    0.3309   0.4767
  74    0.6105   1.2329    0.3386   0.4833
  75    0.5984   1.2381    0.3351   0.4856
  76    0.5940   1.2382    0.3371   0.4848
  77    0.5802   1.2430    0.3358   0.4874
  78    0.5713   1.2382    0.3411   0.4931
  79    0.5786   1.2447    0.3373   0.4846
  80    0.5653   1.2362    0.3384   0.4860
  81    0.5614   1.2338    0.3386   0.4893
  82    0.5578   1.2430    0.3413   0.4936
  83    0.5646   1.2417    0.3400   0.4892
  84    0.5480   1.2412    0.3410   0.4889
  85    0.5501   1.2498    0.3443   0.4940
  86    0.5503   1.2389    0.3449   0.4971
  87    0.5344   1.2376    0.3455   0.4952
  88    0.5402   1.2366    0.3502   0.5007
  89    0.5330   1.2327    0.3521   0.5021
  90    0.5319   1.2377    0.3443   0.4917
  91    0.5280   1.2316    0.3476   0.4978
  92    0.5164   1.2365    0.3503   0.5005
  93    0.5081   1.2380    0.3505   0.5036
  94    0.5134   1.2370    0.3524   0.5045

Each epoch summary also records the box_loss/class_loss components and the size-stratified val_AP*/val_AR* values, which improve along with the overall AP.]
+ "Epoch 95/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.46s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.03s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.355\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.507\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.387\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.063\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.172\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.416\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.350\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.459\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.463\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.099\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.267\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.523\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.5109 - box_loss: 0.4711 - class_loss: 0.0398 - val_loss: 1.2341 - val_box_loss: 1.1667 - val_class_loss: 0.0674 - val_AP: 0.3548 - val_AP50: 0.5072 - val_AP75: 0.3872 - val_APs: 0.0630 - val_APm: 0.1716 - val_APl: 0.4156 - val_ARmax1: 0.3501 -
val_ARmax10: 0.4590 - val_ARmax100: 0.4625 - val_ARs: 0.0989 - val_ARm: 0.2670 - val_ARl: 0.5229\n", + "Epoch 96/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.78s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.350\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.500\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.381\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.054\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.176\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.411\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.349\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.457\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.460\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.088\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.265\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.521\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.4990 - box_loss: 0.4599 - class_loss: 0.0391 - val_loss: 1.2384 - val_box_loss: 1.1706 - val_class_loss: 0.0678 - val_AP: 0.3498 - val_AP50: 0.5001 - val_AP75: 0.3811 - val_APs: 0.0537 - val_APm: 0.1756 - val_APl: 0.4107 - val_ARmax1: 0.3495 - val_ARmax10: 0.4566 - val_ARmax100: 0.4595 - val_ARs: 0.0884 - val_ARm: 0.2649 - val_ARl: 0.5205\n", + "Epoch 97/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.39s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.354\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.508\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.387\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.066\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.175\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.415\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.349\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.458\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.461\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.098\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.265\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.522\n", + "4138/4138 [==============================] - 607s 146ms/step - loss: 0.4921 - box_loss: 0.4534 - class_loss: 0.0387 - val_loss: 1.2402 - val_box_loss: 1.1727 - val_class_loss: 0.0675 - val_AP: 0.3539 - val_AP50: 0.5078 - val_AP75: 0.3872 - val_APs: 0.0656 - val_APm: 0.1753 - val_APl: 0.4152 - val_ARmax1: 0.3493 - val_ARmax10: 0.4578 - val_ARmax100: 
0.4610 - val_ARs: 0.0976 - val_ARm: 0.2654 - val_ARl: 0.5223\n", + "Epoch 98/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.71s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.344\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.493\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.377\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.058\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.169\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.405\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.343\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.448\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.451\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.094\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.262\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.512\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.4868 - box_loss: 0.4485 - class_loss: 0.0383 - val_loss: 1.2434 - val_box_loss: 1.1750 - val_class_loss: 0.0684 - val_AP: 0.3445 - val_AP50: 0.4929 - val_AP75: 0.3775 - val_APs: 0.0578 - val_APm: 0.1694 - val_APl: 0.4052 - val_ARmax1: 0.3435 - val_ARmax10: 0.4482 - val_ARmax100: 0.4511 - val_ARs: 0.0943 - val_ARm: 0.2622 - val_ARl: 0.5122\n", + "Epoch 99/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.87s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.04s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.358\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.513\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.389\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.057\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.176\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.421\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.354\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.466\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.469\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.096\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.267\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.531\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4826 - box_loss: 0.4446 - class_loss: 0.0380 - val_loss: 1.2316 - val_box_loss: 1.1647 - val_class_loss: 0.0669 - val_AP: 0.3583 - val_AP50: 0.5129 - val_AP75: 0.3888 - val_APs: 0.0567 - val_APm: 0.1757 - val_APl: 0.4208 - val_ARmax1: 0.3545 - val_ARmax10: 0.4657 - val_ARmax100: 0.4688 - val_ARs: 0.0961 - val_ARm: 
0.2671 - val_ARl: 0.5308\n", + "Epoch 100/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.79s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.354\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.507\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.384\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.052\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.176\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.417\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.350\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.460\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.464\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.092\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.270\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.526\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.4918 - box_loss: 0.4532 - class_loss: 0.0386 - val_loss: 1.2328 - val_box_loss: 1.1654 - val_class_loss: 0.0674 - val_AP: 0.3544 - val_AP50: 0.5068 - val_AP75: 0.3843 - val_APs: 0.0516 - val_APm: 0.1761 - val_APl: 0.4166 - val_ARmax1: 0.3498 - val_ARmax10: 0.4603 - val_ARmax100: 0.4638 - val_ARs: 0.0921 - val_ARm: 0.2698 - val_ARl: 0.5261\n", + "Epoch 101/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.71s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.353\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.503\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.384\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.057\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.171\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.415\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.349\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.457\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.460\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.090\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.266\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.522\n", + "4138/4138 [==============================] - 607s 146ms/step - loss: 0.4804 - box_loss: 0.4427 - class_loss: 0.0378 - val_loss: 1.2321 - val_box_loss: 1.1649 - val_class_loss: 0.0672 - val_AP: 0.3532 - val_AP50: 0.5033 - val_AP75: 0.3844 - val_APs: 0.0570 - val_APm: 0.1708 - val_APl: 0.4151 - val_ARmax1: 0.3490 - val_ARmax10: 0.4572 - val_ARmax100: 0.4602 - val_ARs: 0.0903 - val_ARm: 0.2662 - val_ARl: 0.5223\n", + "Epoch 
102/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.80s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.358\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.511\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.393\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.061\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.172\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.421\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.353\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.462\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.465\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.098\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.268\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.527\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.4729 - box_loss: 0.4358 - class_loss: 0.0372 - val_loss: 1.2324 - val_box_loss: 1.1654 - val_class_loss: 0.0669 - val_AP: 0.3580 - val_AP50: 0.5106 - val_AP75: 0.3927 - val_APs: 0.0614 - val_APm: 0.1716 - val_APl: 0.4214 - val_ARmax1: 0.3530 - val_ARmax10: 0.4619 - val_ARmax100: 0.4649 - val_ARs: 0.0981 - val_ARm: 0.2679 - val_ARl: 0.5272\n", + "Epoch 103/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.83s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.03s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.356\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.510\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.389\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.058\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.171\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.416\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.348\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.459\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.462\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.102\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.269\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.521\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4819 - box_loss: 0.4441 - class_loss: 0.0378 - val_loss: 1.2363 - val_box_loss: 1.1692 - val_class_loss: 0.0670 - val_AP: 0.3555 - val_AP50: 0.5102 - val_AP75: 0.3889 - val_APs: 0.0580 - val_APm: 0.1713 - val_APl: 0.4162 - val_ARmax1: 0.3481 - val_ARmax10: 0.4588 - val_ARmax100: 0.4621 - val_ARs: 0.1024 - val_ARm: 0.2693 - val_ARl: 0.5205\n", + "Epoch 104/120\n", + "1238/1238 
[==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.78s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.357\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.509\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.391\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.055\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.174\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.419\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.353\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.462\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.465\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.094\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.266\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.4663 - box_loss: 0.4296 - class_loss: 0.0367 - val_loss: 1.2301 - val_box_loss: 1.1631 - val_class_loss: 0.0670 - val_AP: 0.3572 - val_AP50: 0.5088 - val_AP75: 0.3912 - val_APs: 0.0551 - val_APm: 0.1741 - val_APl: 0.4193 - val_ARmax1: 0.3529 - val_ARmax10: 0.4617 - val_ARmax100: 0.4650 - val_ARs: 0.0936 - val_ARm: 0.2664 - val_ARl: 0.5254\n", + "Epoch 105/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.37s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.358\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.510\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.391\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.064\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.173\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.419\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.353\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.463\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.466\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.103\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.271\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525\n", + "4138/4138 [==============================] - 608s 147ms/step - loss: 0.4637 - box_loss: 0.4270 - class_loss: 0.0367 - val_loss: 1.2275 - val_box_loss: 1.1606 - val_class_loss: 0.0669 - val_AP: 0.3578 - val_AP50: 0.5102 - val_AP75: 0.3905 - val_APs: 0.0642 - val_APm: 0.1726 - val_APl: 0.4188 - val_ARmax1: 0.3532 - val_ARmax10: 0.4625 - val_ARmax100: 0.4657 - val_ARs: 0.1025 - val_ARm: 0.2708 - val_ARl: 0.5249\n", + "Epoch 106/120\n", + "1238/1238 [==============================] - 
111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.78s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.356\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.507\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.389\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.062\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.175\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.418\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.351\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.459\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.462\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.101\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.269\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.524\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4600 - box_loss: 0.4236 - class_loss: 0.0364 - val_loss: 1.2348 - val_box_loss: 1.1677 - val_class_loss: 0.0672 - val_AP: 0.3556 - val_AP50: 0.5073 - val_AP75: 0.3890 - val_APs: 0.0616 - val_APm: 0.1752 - val_APl: 0.4182 - val_ARmax1: 0.3508 - val_ARmax10: 0.4590 - val_ARmax100: 0.4622 - val_ARs: 0.1007 - val_ARm: 0.2685 - val_ARl: 0.5242\n", + "Epoch 107/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.80s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.354\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.504\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.387\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.060\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.172\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.415\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.352\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.458\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.461\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.104\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.267\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.520\n", + "4138/4138 [==============================] - 607s 146ms/step - loss: 0.4627 - box_loss: 0.4260 - class_loss: 0.0367 - val_loss: 1.2333 - val_box_loss: 1.1663 - val_class_loss: 0.0670 - val_AP: 0.3544 - val_AP50: 0.5044 - val_AP75: 0.3875 - val_APs: 0.0595 - val_APm: 0.1725 - val_APl: 0.4150 - val_ARmax1: 0.3516 - val_ARmax10: 0.4580 - val_ARmax100: 0.4609 - val_ARs: 0.1035 - val_ARm: 0.2668 - val_ARl: 0.5199\n", + "Epoch 108/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating 
index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.78s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.360\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.511\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.396\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.064\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.178\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.423\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.354\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.462\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.465\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.102\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.274\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.527\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4550 - box_loss: 0.4189 - class_loss: 0.0361 - val_loss: 1.2288 - val_box_loss: 1.1625 - val_class_loss: 0.0663 - val_AP: 0.3601 - val_AP50: 0.5113 - val_AP75: 0.3955 - val_APs: 0.0640 - val_APm: 0.1775 - val_APl: 0.4229 - val_ARmax1: 0.3541 - val_ARmax10: 0.4622 - val_ARmax100: 0.4654 - val_ARs: 0.1020 - val_ARm: 0.2735 - val_ARl: 0.5272\n", + "Epoch 109/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.80s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.358\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.511\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.389\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.057\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.172\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.420\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.350\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.459\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.462\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.100\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.268\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.524\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4535 - box_loss: 0.4175 - class_loss: 0.0359 - val_loss: 1.2286 - val_box_loss: 1.1620 - val_class_loss: 0.0666 - val_AP: 0.3579 - val_AP50: 0.5107 - val_AP75: 0.3893 - val_APs: 0.0572 - val_APm: 0.1718 - val_APl: 0.4205 - val_ARmax1: 0.3497 - val_ARmax10: 0.4589 - val_ARmax100: 0.4621 - val_ARs: 0.0998 - val_ARm: 0.2678 - val_ARl: 0.5235\n", + "Epoch 110/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + 
"creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.77s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.361\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.514\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.393\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.066\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.173\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.425\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.355\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.467\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.470\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.102\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.267\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.534\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.4477 - box_loss: 0.4122 - class_loss: 0.0356 - val_loss: 1.2308 - val_box_loss: 1.1642 - val_class_loss: 0.0666 - val_AP: 0.3609 - val_AP50: 0.5137 - val_AP75: 0.3932 - val_APs: 0.0662 - val_APm: 0.1730 - val_APl: 0.4254 - val_ARmax1: 0.3551 - val_ARmax10: 0.4668 - val_ARmax100: 0.4700 - val_ARs: 0.1015 - val_ARm: 0.2666 - val_ARl: 0.5338\n", + "Epoch 111/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.70s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.356\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.507\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.387\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.060\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.171\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.419\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.350\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.456\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.460\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.099\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.263\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.522\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.4427 - box_loss: 0.4074 - class_loss: 0.0352 - val_loss: 1.2370 - val_box_loss: 1.1696 - val_class_loss: 0.0674 - val_AP: 0.3563 - val_AP50: 0.5066 - val_AP75: 0.3871 - val_APs: 0.0597 - val_APm: 0.1711 - val_APl: 0.4189 - val_ARmax1: 0.3502 - val_ARmax10: 0.4564 - val_ARmax100: 0.4598 - val_ARs: 0.0988 - val_ARm: 0.2626 - val_ARl: 0.5216\n", + "Epoch 112/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index 
created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.81s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.362\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.516\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.396\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.065\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.175\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.424\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.356\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.465\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.468\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.100\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.268\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.531\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4384 - box_loss: 0.4033 - class_loss: 0.0350 - val_loss: 1.2323 - val_box_loss: 1.1656 - val_class_loss: 0.0667 - val_AP: 0.3620 - val_AP50: 0.5164 - val_AP75: 0.3960 - val_APs: 0.0653 - val_APm: 0.1751 - val_APl: 0.4244 - val_ARmax1: 0.3562 - val_ARmax10: 0.4653 - val_ARmax100: 0.4684 - val_ARs: 0.0997 - val_ARm: 0.2679 - val_ARl: 0.5309\n", + "Epoch 113/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.37s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.355\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.506\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.388\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.061\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.172\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.418\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.354\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.460\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.463\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.095\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.261\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.526\n", + "4138/4138 [==============================] - 607s 146ms/step - loss: 0.4370 - box_loss: 0.4022 - class_loss: 0.0349 - val_loss: 1.2397 - val_box_loss: 1.1725 - val_class_loss: 0.0672 - val_AP: 0.3552 - val_AP50: 0.5063 - val_AP75: 0.3877 - val_APs: 0.0606 - val_APm: 0.1716 - val_APl: 0.4175 - val_ARmax1: 0.3535 - val_ARmax10: 0.4600 - val_ARmax100: 0.4630 - val_ARs: 0.0955 - val_ARm: 0.2609 - val_ARl: 0.5264\n", + "Epoch 114/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image 
evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.81s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.360\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.514\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.395\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.063\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.179\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.423\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.353\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.463\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.466\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.101\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.269\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.530\n", + "4138/4138 [==============================] - 605s 146ms/step - loss: 0.4473 - box_loss: 0.4118 - class_loss: 0.0355 - val_loss: 1.2302 - val_box_loss: 1.1637 - val_class_loss: 0.0665 - val_AP: 0.3603 - val_AP50: 0.5143 - val_AP75: 0.3950 - val_APs: 0.0626 - val_APm: 0.1788 - val_APl: 0.4235 - val_ARmax1: 0.3533 - val_ARmax10: 0.4631 - val_ARmax100: 0.4662 - val_ARs: 0.1013 - val_ARm: 0.2686 - val_ARl: 0.5296\n", + "Epoch 115/120\n", + "1238/1238 [==============================] - 110s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.42s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.355\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.507\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.387\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.061\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.176\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.418\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.351\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.459\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.462\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.100\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.266\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.525\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4296 - box_loss: 0.3952 - class_loss: 0.0344 - val_loss: 1.2352 - val_box_loss: 1.1683 - val_class_loss: 0.0669 - val_AP: 0.3548 - val_AP50: 0.5075 - val_AP75: 0.3867 - val_APs: 0.0612 - val_APm: 0.1756 - val_APl: 0.4177 - val_ARmax1: 0.3509 - val_ARmax10: 0.4589 - val_ARmax100: 0.4618 - val_ARs: 0.0998 - val_ARm: 0.2657 - val_ARl: 0.5250\n", + "Epoch 116/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate 
annotation type *bbox*\n", + "DONE (t=4.84s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.02s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.363\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.515\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.397\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.062\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.183\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.424\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.357\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.467\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.470\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.104\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.276\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.529\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4361 - box_loss: 0.4013 - class_loss: 0.0348 - val_loss: 1.2340 - val_box_loss: 1.1675 - val_class_loss: 0.0665 - val_AP: 0.3634 - val_AP50: 0.5154 - val_AP75: 0.3966 - val_APs: 0.0619 - val_APm: 0.1833 - val_APl: 0.4239 - val_ARmax1: 0.3567 - val_ARmax10: 0.4667 - val_ARmax100: 0.4700 - val_ARs: 0.1042 - val_ARm: 0.2763 - val_ARl: 0.5293\n", + "Epoch 117/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.39s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.00s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.357\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.505\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.391\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.056\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.174\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.421\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.352\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.458\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.461\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.096\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.262\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.526\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4258 - box_loss: 0.3915 - class_loss: 0.0342 - val_loss: 1.2340 - val_box_loss: 1.1670 - val_class_loss: 0.0670 - val_AP: 0.3567 - val_AP50: 0.5053 - val_AP75: 0.3907 - val_APs: 0.0564 - val_APm: 0.1737 - val_APl: 0.4208 - val_ARmax1: 0.3516 - val_ARmax10: 0.4580 - val_ARmax100: 0.4609 - val_ARs: 0.0958 - val_ARm: 0.2620 - val_ARl: 0.5263\n", + "Epoch 118/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE 
(t=4.80s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.360\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.513\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.394\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.071\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.176\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.424\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.355\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.463\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.465\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.106\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.268\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.528\n", + "4138/4138 [==============================] - 607s 146ms/step - loss: 0.4199 - box_loss: 0.3861 - class_loss: 0.0338 - val_loss: 1.2327 - val_box_loss: 1.1661 - val_class_loss: 0.0667 - val_AP: 0.3604 - val_AP50: 0.5131 - val_AP75: 0.3940 - val_APs: 0.0706 - val_APm: 0.1765 - val_APl: 0.4237 - val_ARmax1: 0.3550 - val_ARmax10: 0.4625 - val_ARmax100: 0.4654 - val_ARs: 0.1059 - val_ARm: 0.2678 - val_ARl: 0.5282\n", + "Epoch 119/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=5.49s).\n", + "Accumulating evaluation results...\n", + "DONE (t=1.03s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.359\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.513\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.389\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.066\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.179\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.421\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.356\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.464\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.467\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.101\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.272\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.528\n", + "4138/4138 [==============================] - 607s 147ms/step - loss: 0.4292 - box_loss: 0.3946 - class_loss: 0.0345 - val_loss: 1.2323 - val_box_loss: 1.1659 - val_class_loss: 0.0665 - val_AP: 0.3593 - val_AP50: 0.5133 - val_AP75: 0.3895 - val_APs: 0.0659 - val_APm: 0.1789 - val_APl: 0.4210 - val_ARmax1: 0.3556 - val_ARmax10: 0.4644 - val_ARmax100: 0.4674 - val_ARs: 0.1006 - val_ARm: 0.2718 - val_ARl: 0.5280\n", + "Epoch 120/120\n", + "1238/1238 [==============================] - 111s 89ms/step\n", + "creating index...\n", + "index created!\n", + "creating index...\n", + "index created!\n", + "Running per image evaluation...\n", + "Evaluate annotation type *bbox*\n", + "DONE (t=4.83s).\n", + "Accumulating 
evaluation results...\n", + "DONE (t=1.01s).\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.359\n", + " Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.513\n", + " Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.391\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.066\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.176\n", + " Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.422\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.355\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.464\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.467\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.102\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.266\n", + " Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.530\n", + "4138/4138 [==============================] - 606s 146ms/step - loss: 0.4178 - box_loss: 0.3841 - class_loss: 0.0337 - val_loss: 1.2377 - val_box_loss: 1.1710 - val_class_loss: 0.0668 - val_AP: 0.3588 - val_AP50: 0.5127 - val_AP75: 0.3908 - val_APs: 0.0661 - val_APm: 0.1760 - val_APl: 0.4215 - val_ARmax1: 0.3549 - val_ARmax10: 0.4638 - val_ARmax100: 0.4671 - val_ARs: 0.1020 - val_ARm: 0.2658 - val_ARl: 0.5299\n" + ] + } + ] + }, + { + "cell_type": "code", + "source": [ + "from keras_cv import bounding_box, visualization\n", + "\n", + "\n", + "def visualize_detections(model, dataset, bounding_box_format, rows, cols):\n", + " images, y_true = next(iter(dataset.take(1)))\n", + " y_pred = model.predict(images)\n", + " y_pred = bounding_box.to_ragged(y_pred)\n", + " visualization.plot_bounding_box_gallery(\n", + " images,\n", + " value_range=(0, 255),\n", + " bounding_box_format=bounding_box_format,\n", + " y_true=y_true,\n", + " y_pred=y_pred,\n", + " scale=4,\n", + " rows=rows,\n", + " cols=cols,\n", + " show=True,\n", + " font_scale=0.7,\n", + " class_mapping=class_mapping,\n", + " )" + ], + "metadata": { + "id": "Zt_Wg_PpObgK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "class_ids = [\n", + " \"Aeroplane\",\n", + " \"Bicycle\",\n", + " \"Bird\",\n", + " \"Boat\",\n", + " \"Bottle\",\n", + " \"Bus\",\n", + " \"Car\",\n", + " \"Cat\",\n", + " \"Chair\",\n", + " \"Cow\",\n", + " \"Dining Table\",\n", + " \"Dog\",\n", + " \"Horse\",\n", + " \"Motorbike\",\n", + " \"Person\",\n", + " \"Potted Plant\",\n", + " \"Sheep\",\n", + " \"Sofa\",\n", + " \"Train\",\n", + " \"Tvmonitor\",\n", + " \"Total\",\n", + "]\n", + "class_mapping = dict(zip(range(len(class_ids)), class_ids))" + ], + "metadata": { + "id": "MOGlE8o9Obbc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "model.prediction_decoder = keras_cv.layers.MultiClassNonMaxSuppression(\n", + " bounding_box_format=\"xywh\",\n", + " from_logits=False,\n", + " confidence_threshold=0.3,\n", + " iou_threshold=0.5,\n", + ")\n", + "model.make_predict_function(force=True)\n", + "visualize_detections(model, eval_ds.shuffle(10), \"xywh\", rows=2, cols=2)\n", + "old_model = model" + ], + "metadata": { + "id": "qMTWxQQ_Op1Q", + "colab": { + "base_uri": "https://localhost:8080/", + "height": 373 + }, + "outputId": "9d21c7f8-a023-43ab-f775-41011eff7753" + }, + "execution_count": null, + "outputs": [ + { + "output_type": "error", + "ename": 
"IndexError", + "evalue": "ignored", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m )\n\u001b[1;32m 7\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmake_predict_function\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mforce\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;32mTrue\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mvisualize_detections\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0meval_ds\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshuffle\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m10\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"xywh\"\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrows\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m2\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mold_model\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m\u001b[0m in \u001b[0;36mvisualize_detections\u001b[0;34m(model, dataset, bounding_box_format, rows, cols)\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mvisualize_detections\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mmodel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mdataset\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mbounding_box_format\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mrows\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcols\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mimages\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_true\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mnext\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0miter\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdataset\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtake\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 4\u001b[0;31m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mpredict\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mimages\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 5\u001b[0m \u001b[0my_pred\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbounding_box\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_ragged\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0my_pred\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m visualization.plot_bounding_box_gallery(\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/keras/utils/traceback_utils.py\u001b[0m in \u001b[0;36merror_handler\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 68\u001b[0m \u001b[0;31m# To get the full stack trace, call:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 69\u001b[0m \u001b[0;31m# `tf.debugging.disable_traceback_filtering()`\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 70\u001b[0;31m \u001b[0;32mraise\u001b[0m 
\u001b[0me\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mwith_traceback\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfiltered_tb\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 71\u001b[0m \u001b[0;32mfinally\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 72\u001b[0m \u001b[0;32mdel\u001b[0m \u001b[0mfiltered_tb\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/usr/local/lib/python3.10/dist-packages/tensorflow/python/framework/tensor_shape.py\u001b[0m in \u001b[0;36m__getitem__\u001b[0;34m(self, key)\u001b[0m\n\u001b[1;32m 955\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 956\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_v2_behavior\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 957\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_dims\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 958\u001b[0m \u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 959\u001b[0m \u001b[0;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdims\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mkey\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mIndexError\u001b[0m: tuple index out of range" + ] + } + ] + } + ] +} \ No newline at end of file diff --git a/keras_cv/tools/training_scipts/training_deeplab_v3_plus.ipynb b/keras_cv/tools/training_scipts/training_deeplab_v3_plus.ipynb new file mode 100644 index 0000000000..e7ff38752a --- /dev/null +++ b/keras_cv/tools/training_scipts/training_deeplab_v3_plus.ipynb @@ -0,0 +1,569 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "0YC2vlsGs5tg" + }, + "source": [ + "# Semantic Segmentation with KerasCV\n", + "\n", + "**Author:** [Divyashree Sreepathihalli](https://github.com/divyashreepathihalli), [Ian Stenbit](https://github.com/ianstenbit)
\n", + "**Date created:** 2023/08/22
\n", + "**Last modified:** 2023/08/24
\n", + "**Description:** Train and use DeepLabv3+ segmentation model with KerasCV." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zEUpBnaGs5th" + }, + "source": [ + "![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_intermediate.png)\n", + "\n", + "## Background\n", + "Semantic segmentation is a type of computer vision task that involves assigning a\n", + "class label such as person, bike, or background to each individual pixel of an\n", + "image, effectively dividing the image into regions that correspond to different\n", + "fobject classes or categories.\n", + "\n", + "![](https://miro.medium.com/v2/resize:fit:4800/format:webp/1*z6ch-2BliDGLIHpOPFY_Sw.png)\n", + "\n", + "\n", + "\n", + "KerasCV offers the DeepLabv3+ model developed by Google for semantic\n", + "segmentation. This guide demonstrates how to finetune and use DeepLabv3+ model for\n", + "image semantic segmentaion with KerasCV. Its architecture that combines atrous convolutions,\n", + "contextual information aggregation, and powerful backbones to achieve accurate and\n", + "detailed semantic segmentation. The DeepLabv3+ model has been shown to achieve\n", + "state-of-the-art results on a variety of image segmentation benchmarks.\n", + "\n", + "### References\n", + "[Encoder-Decoder with Atrous Separable Convolution for Semantic Image\n", + "Segmentation](https://arxiv.org/abs/1802.02611)
\n", + "[Rethinking Atrous Convolution for Semantic Image\n", + "Segmentation](https://arxiv.org/abs/1706.05587)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vgm-Z4Rus5ti" + }, + "source": [ + "## Setup and Imports\n", + "\n", + "Let's install the dependencies and import the necessary modules.\n", + "\n", + "To run this tutorial, you will need to install the following packages:\n", + "\n", + "* `keras-cv`\n", + "* `keras-core`" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "89IDcffts5ti" + }, + "outputs": [], + "source": [ + "!pip install -q --upgrade keras-cv\n", + "!pip install -q --upgrade keras # Upgrade to Keras 3." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "aT_RCAG3s5tj" + }, + "source": [ + "After installing `keras-core` and `keras-cv`, set the backend for `keras-core`.\n", + "This guide can be run with any backend (Tensorflow, JAX, PyTorch).\n", + "\n", + "```\n", + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"jax\"\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xRyHrUEDs5tj" + }, + "outputs": [], + "source": [ + "import keras\n", + "from keras import ops\n", + "\n", + "import keras_cv\n", + "import numpy as np\n", + "\n", + "from keras_cv.datasets.pascal_voc.segmentation import load as load_voc" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "98f7WhdZs5tj" + }, + "source": [ + "## Perform semantic segmentation with a pretrained DeepLabv3+ model\n", + "\n", + "The highest level API in the KerasCV semantic segmentation API is the `keras_cv.models`\n", + "API. This API includes fully pretrained semantic segmentation models, such as\n", + "`keras_cv.models.DeepLabV3Plus`.\n", + "\n", + "Let's get started by constructing a DeepLabv3+ pretrained on the pascalvoc dataset." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "M97l1P2Ms5tj" + }, + "outputs": [], + "source": [ + "model = keras_cv.models.DeepLabV3Plus.from_preset(\n", + " \"deeplab_v3_plus_resnet50_pascalvoc\",\n", + " num_classes=21,\n", + " input_shape=[512, 512, 3],\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9lUDEOr4s5tk" + }, + "source": [ + "Let us visualize the results of this pretrained model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nUzsOeyqs5tk" + }, + "outputs": [], + "source": [ + "filepath = keras.utils.get_file(origin=\"https://i.imgur.com/gCNcJJI.jpg\")\n", + "image = keras.utils.load_img(filepath)\n", + "\n", + "resize = keras_cv.layers.Resizing(height=512, width=512)\n", + "image = resize(image)\n", + "image = keras.ops.expand_dims(np.array(image), axis=0)\n", + "preds = ops.expand_dims(ops.argmax(model(image), axis=-1), axis=-1)\n", + "keras_cv.visualization.plot_segmentation_mask_gallery(\n", + " image,\n", + " value_range=(0, 255),\n", + " num_classes=1,\n", + " y_true=None,\n", + " y_pred=preds,\n", + " scale=3,\n", + " rows=1,\n", + " cols=1,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vyqoiZcis5tk" + }, + "source": [ + "## Train a custom semantic segmentation model\n", + "In this guide, we'll assemble a full training pipeline for a KerasCV DeepLabV3 semantic\n", + "segmentation model. This includes data loading, augmentation, training, metric\n", + "evaluation, and inference!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bLz1WdoZs5tk" + }, + "source": [ + "## Download the data\n", + "\n", + "We download\n", + "[Pascal VOC dataset](https://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz)\n", + "with KerasCV datasets and split them into train dataset `train_ds` and `eval_ds`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nfB7NSHHs5tk" + }, + "outputs": [], + "source": [ + "train_ds = load_voc(split=\"sbd_train\")\n", + "eval_ds = load_voc(split=\"sbd_eval\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fFF-YE1fs5tl" + }, + "source": [ + "## Preprocess the data\n", + "\n", + "The `preprocess_tfds_inputs` utility function preprocesses the inputs to a dictionary of\n", + "`images` and `segmentation_masks`. The images and segmentation masks are resized to\n", + "512x512. The resulting dataset is then batched into groups of 4 image and segmentation\n", + "mask pairs.\n", + "\n", + "A batch of this preprocessed input training data can be visualized using the\n", + "`keras_cv.visualization.plot_segmentation_mask_gallery` function. This function takes a\n", + "batch of images and segmentation masks as input and displays them in a grid." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mD0Y8iMLs5tl" + }, + "outputs": [], + "source": [ + "def preprocess_tfds_inputs(inputs):\n", + " def unpackage_tfds_inputs(tfds_inputs):\n", + " return {\n", + " \"images\": tfds_inputs[\"image\"],\n", + " \"segmentation_masks\": tfds_inputs[\"class_segmentation\"],\n", + " }\n", + "\n", + " outputs = inputs.map(unpackage_tfds_inputs)\n", + " outputs = outputs.map(keras_cv.layers.Resizing(height=512, width=512))\n", + " outputs = outputs.batch(4, drop_remainder=True)\n", + " return outputs\n", + "\n", + "\n", + "train_ds = preprocess_tfds_inputs(train_ds)\n", + "batch = train_ds.take(1).get_single_element()\n", + "keras_cv.visualization.plot_segmentation_mask_gallery(\n", + " batch[\"images\"],\n", + " value_range=(0, 255),\n", + " num_classes=21, # The number of classes for the oxford iiit pet dataset. The VOC dataset also includes 1 class for the background.\n", + " y_true=batch[\"segmentation_masks\"],\n", + " scale=3,\n", + " rows=2,\n", + " cols=2,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "7NIGx0zHs5tl" + }, + "source": [ + "The preprocessing is applied to the evaluation dataset `eval_ds`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "t0264OIJs5tl" + }, + "outputs": [], + "source": [ + "eval_ds = preprocess_tfds_inputs(eval_ds)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KfPbd-TTs5tl" + }, + "source": [ + "## Data Augmentation\n", + "\n", + "KerasCV provides a variety of image augmentation options. In this example, we will use\n", + "the `RandomFlip` augmentation to augment the training dataset. The `RandomFlip`\n", + "augmentation randomly flips the images in the training dataset horizontally or\n", + "vertically. This can help to improve the model's robustness to changes in the orientation\n", + "of the objects in the images." 
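KerasCV preprocessing layers consume and return the same `{"images", "segmentation_masks"}` dictionary used above, which is why mapping `RandomFlip` over the dataset keeps each mask aligned with its image. The following is a minimal standalone sketch of that contract, not part of the original notebook; it assumes only that `keras_cv` and NumPy are installed and uses random arrays in place of a real VOC batch.

```
# Sketch: a KerasCV augmentation layer applied to the dict format directly.
# Random arrays stand in for a real batch of images and masks.
import numpy as np
import keras_cv

flip = keras_cv.layers.RandomFlip()
sample = {
    "images": np.random.uniform(0, 255, size=(4, 512, 512, 3)).astype("float32"),
    "segmentation_masks": np.random.randint(0, 21, size=(4, 512, 512, 1)).astype("float32"),
}
augmented = flip(sample)
print(augmented["images"].shape)              # (4, 512, 512, 3)
print(augmented["segmentation_masks"].shape)  # (4, 512, 512, 1), flipped with the image
```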
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W_0Ei44ls5tl" + }, + "outputs": [], + "source": [ + "train_ds = train_ds.map(keras_cv.layers.RandomFlip())\n", + "batch = train_ds.take(1).get_single_element()\n", + "\n", + "keras_cv.visualization.plot_segmentation_mask_gallery(\n", + " batch[\"images\"],\n", + " value_range=(0, 255),\n", + " num_classes=21,\n", + " y_true=batch[\"segmentation_masks\"],\n", + " scale=3,\n", + " rows=2,\n", + " cols=2,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "M99ecGY4s5tm" + }, + "source": [ + "## Model Configuration\n", + "\n", + "Please feel free to modify the configurations for model training and note how the\n", + "training results changes. This is an great exercise to get a better understanding of the\n", + "training pipeline.\n", + "\n", + "The learning rate schedule is used by the optimizer to calculate the learning rate for\n", + "each epoch. The optimizer then uses the learning rate to update the weights of the model.\n", + "In this case, the learning rate schedule uses a cosine decay function. A cosine decay\n", + "function starts high and then decreases over time, eventually reaching zero. The\n", + "cardinality of the VOC dataset is 2124 with a batch size of 4. The dataset cardinality\n", + "is important for learning rate decay because it determines how many steps the model\n", + "will train for. The initial learning rate is proportional to 0.007 and the decay\n", + "steps are 2124. This means that the learning rate will start at `INITIAL_LR` and then\n", + "decrease to zero over 2124 steps.\n", + "![png](/img/guides/semantic_segmentation_deeplab_v3_plus/learning_rate_schedule.png)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4zqr0oF5s5tm" + }, + "outputs": [], + "source": [ + "BATCH_SIZE = 4\n", + "INITIAL_LR = 0.007 * BATCH_SIZE / 16\n", + "EPOCHS = 1\n", + "NUM_CLASSES = 21\n", + "learning_rate = keras.optimizers.schedules.CosineDecay(\n", + " INITIAL_LR,\n", + " decay_steps=EPOCHS * 2124,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ES4SUSims5tm" + }, + "source": [ + "We instantiate a DeepLabV3+ model with a ResNet50 backbone pretrained on ImageNet classification:\n", + "`resnet50_v2_imagenet` pre-trained weights will be used as the backbone feature\n", + "extractor for the DeepLabV3Plus model. The `num_classes` parameter specifies the number of\n", + "classes that the model will be trained to segment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "LoNY90Cgs5tm" + }, + "outputs": [], + "source": [ + "model = keras_cv.models.DeepLabV3Plus.from_preset(\n", + " \"resnet50_v2_imagenet\", num_classes=NUM_CLASSES\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "wlwA_LTUs5tm" + }, + "source": [ + "## Compile the model\n", + "\n", + "The model.compile() function sets up the training process for the model. It defines the\n", + "- optimization algorithm - Stochastic Gradient Descent (SGD)\n", + "- the loss function - categorical cross-entropy\n", + "- the evaluation metrics - Mean IoU and categorical accuracy\n", + "\n", + "Semantic segmentation evaluation metrics:\n", + "\n", + "Mean Intersection over Union (MeanIoU):\n", + "MeanIoU measures how well a semantic segmentation model accurately identifies\n", + "and delineates different objects or regions in an image. 
It calculates the\n", + "overlap between predicted and actual object boundaries, providing a score\n", + "between 0 and 1, where 1 represents a perfect match.\n", + "\n", + "Categorical Accuracy:\n", + "Categorical Accuracy measures the proportion of correctly classified pixels in\n", + "an image. It gives a simple percentage indicating how accurately the model\n", + "predicts the categories of pixels in the entire image.\n", + "\n", + "In essence, MeanIoU emphasizes the accuracy of identifying specific object\n", + "boundaries, while Categorical Accuracy gives a broad overview of overall\n", + "pixel-level correctness." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uM-Im0Mjs5tn" + }, + "outputs": [], + "source": [ + "model.compile(\n", + " optimizer=keras.optimizers.SGD(\n", + " learning_rate=learning_rate,\n", + " weight_decay=0.0001,\n", + " momentum=0.9,\n", + " clipnorm=10.0,\n", + " ),\n", + " loss=keras.losses.CategoricalCrossentropy(from_logits=False),\n", + " metrics=[\n", + " keras.metrics.MeanIoU(\n", + " num_classes=NUM_CLASSES, sparse_y_true=False, sparse_y_pred=False\n", + " ),\n", + " keras.metrics.CategoricalAccuracy(),\n", + " ],\n", + ")\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Buh6A_1fs5tn" + }, + "source": [ + "The utility function `dict_to_tuple` effectively transforms the dictionaries of training\n", + "and validation datasets into tuples of images and one-hot encoded segmentation masks,\n", + "which is used during training and evaluation of the DeepLabv3+ model." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kOLcpKLbs5tn" + }, + "outputs": [], + "source": [ + "def dict_to_tuple(x):\n", + " import tensorflow as tf\n", + "\n", + " return x[\"images\"], tf.one_hot(\n", + " tf.cast(tf.squeeze(x[\"segmentation_masks\"], axis=-1), \"int32\"), 21\n", + " )\n", + "\n", + "\n", + "train_ds = train_ds.map(dict_to_tuple)\n", + "eval_ds = eval_ds.map(dict_to_tuple)\n", + "\n", + "model.fit(train_ds, validation_data=eval_ds, epochs=EPOCHS)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "r8ZSZmtPs5tn" + }, + "source": [ + "## Predictions with trained model\n", + "Now that the model training of DeepLabv3+ has completed, let's test it by making\n", + "predications\n", + "on a few sample images." 
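As a quick aside before running inference: the toy check below makes the MeanIoU versus categorical accuracy distinction described earlier concrete. It is not part of the original guide and only assumes `keras` and NumPy are available; with a single missed foreground pixel, pixel accuracy still looks good while MeanIoU drops sharply.

```
# Toy comparison on a 4-pixel "image": 3 background pixels and 1 foreground
# pixel that the model misses entirely.
import numpy as np
import keras

y_true = np.array([[0, 0, 0, 1]])  # ground truth: one foreground pixel
y_pred = np.array([[0, 0, 0, 0]])  # prediction: everything background

accuracy = keras.metrics.Accuracy()
accuracy.update_state(y_true, y_pred)

mean_iou = keras.metrics.MeanIoU(num_classes=2)
mean_iou.update_state(y_true, y_pred)

print(float(accuracy.result()))  # 0.75  -- looks acceptable
print(float(mean_iou.result()))  # 0.375 -- the missed class drags the mean IoU down
```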
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "RG07dyEUs5tn" + }, + "outputs": [], + "source": [ + "test_ds = load_voc(split=\"sbd_eval\")\n", + "test_ds = preprocess_tfds_inputs(test_ds)\n", + "\n", + "images, masks = next(iter(train_ds.take(1)))\n", + "images = ops.convert_to_tensor(images)\n", + "masks = ops.convert_to_tensor(masks)\n", + "preds = ops.expand_dims(ops.argmax(model(images), axis=-1), axis=-1)\n", + "masks = ops.expand_dims(ops.argmax(masks, axis=-1), axis=-1)\n", + "\n", + "keras_cv.visualization.plot_segmentation_mask_gallery(\n", + " images,\n", + " value_range=(0, 255),\n", + " num_classes=21,\n", + " y_true=masks,\n", + " y_pred=preds,\n", + " scale=3,\n", + " rows=1,\n", + " cols=4,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "loWDjb1_s5tn" + }, + "source": [ + "Here are some additional tips for using the KerasCV DeepLabv3+ model:\n", + "\n", + "- The model can be trained on a variety of datasets, including the COCO dataset, the\n", + "PASCAL VOC dataset, and the Cityscapes dataset.\n", + "- The model can be fine-tuned on a custom dataset to improve its performance on a\n", + "specific task.\n", + "- The model can be used to perform real-time inference on images.\n", + "- Also, try out KerasCV's SegFormer model `keras_cv.models.segmentation.SegFormer`. The\n", + "SegFormer model is a newer model that has been shown to achieve state-of-the-art results\n", + "on a variety of image segmentation benchmarks. It is based on the Swin Transformer\n", + "architecture, and it is more efficient and accurate than previous image segmentation\n", + "models." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "name": "semantic_segmentation_deeplab_v3_plus", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file From 9dd547a725c081db0264023717f43d14301ea3d2 Mon Sep 17 00:00:00 2001 From: Aritra Roy Gosthipaty Date: Tue, 27 Feb 2024 17:19:15 +0530 Subject: [PATCH 30/30] [Port] OD layers to Keras 3 (#2295) * chore: porting roi aling to keras 3 * chore: fixing the scope, using ones in place of constant * chore: porting roi generation to keras 3 with test note: the nms bit reproduces -1 instead of 0 * chore: port roi pooling * chore: fix pool and port sampler * chore: port label encoder * chore: swap get_shape with ops.shape * lint error * chore: porting sampling to keras 3 * lint fix * chore: using random from backend * chore: disabling flaky test * chore: disable roi sampler test * chore: ignore lint * chore: skipping test the right way * chore: using ops shape * chore: tests pass for all backends removed vectorized map as it was not working for jax and torch used ops convert_to_numpy in tests to make np operations work on torch tensor * chore: explicit type cast to int32 --- keras_cv/layers/object_detection/roi_align.py | 432 +++++++++--------- .../layers/object_detection/roi_generator.py | 87 ++-- .../object_detection/roi_generator_test.py | 94 ++-- keras_cv/layers/object_detection/roi_pool.py | 59 ++- .../layers/object_detection/roi_pool_test.py | 92 ++-- 
.../layers/object_detection/roi_sampler.py | 49 +- .../object_detection/roi_sampler_test.py | 142 +++--- .../object_detection/rpn_label_encoder.py | 41 +- .../rpn_label_encoder_test.py | 74 +-- keras_cv/layers/object_detection/sampling.py | 53 ++- .../layers/object_detection/sampling_test.py | 44 +- 11 files changed, 584 insertions(+), 583 deletions(-) diff --git a/keras_cv/layers/object_detection/roi_align.py b/keras_cv/layers/object_detection/roi_align.py index 2c45060147..7821311af7 100644 --- a/keras_cv/layers/object_detection/roi_align.py +++ b/keras_cv/layers/object_detection/roi_align.py @@ -12,21 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Dict -from typing import Mapping from typing import Optional -from typing import Tuple - -import tensorflow as tf -from tensorflow import keras from keras_cv import bounding_box -from keras_cv.backend import assert_tf_keras +from keras_cv.backend import keras +from keras_cv.backend import ops -def _feature_bilinear_interpolation( - features: tf.Tensor, kernel_y: tf.Tensor, kernel_x: tf.Tensor -) -> tf.Tensor: +def _feature_bilinear_interpolation(features, kernel_y, kernel_x): """ Feature bilinear interpolation. @@ -49,7 +42,7 @@ def _feature_bilinear_interpolation( A 5-D tensor representing feature crop of shape [batch_size, num_boxes, output_size, output_size, num_filters]. """ - features_shape = tf.shape(features) + features_shape = ops.shape(features) batch_size, num_boxes, output_size, num_filters = ( features_shape[0], features_shape[1], @@ -58,33 +51,39 @@ def _feature_bilinear_interpolation( ) output_size = output_size // 2 - kernel_y = tf.reshape(kernel_y, [batch_size, num_boxes, output_size * 2, 1]) - kernel_x = tf.reshape(kernel_x, [batch_size, num_boxes, 1, output_size * 2]) + kernel_y = ops.reshape( + kernel_y, [batch_size, num_boxes, output_size * 2, 1] + ) + kernel_x = ops.reshape( + kernel_x, [batch_size, num_boxes, 1, output_size * 2] + ) # Use implicit broadcast to generate the interpolation kernel. The # multiplier `4` is for avg pooling. interpolation_kernel = kernel_y * kernel_x * 4 # Interpolate the gathered features with computed interpolation kernels. - features *= tf.cast( - tf.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype + features *= ops.cast( + ops.expand_dims(interpolation_kernel, axis=-1), dtype=features.dtype ) - features = tf.reshape( + features = ops.reshape( features, [batch_size * num_boxes, output_size * 2, output_size * 2, num_filters], ) - features = tf.nn.avg_pool(features, [1, 2, 2, 1], [1, 2, 2, 1], "VALID") - features = tf.reshape( + features = ops.nn.average_pool( + features, [1, 2, 2, 1], [1, 2, 2, 1], "VALID" + ) + features = ops.reshape( features, [batch_size, num_boxes, output_size, output_size, num_filters] ) return features def _compute_grid_positions( - boxes: tf.Tensor, - boundaries: tf.Tensor, - output_size: int, - sample_offset: float, -) -> Tuple[tf.Tensor, tf.Tensor, tf.Tensor, tf.Tensor]: + boxes, + boundaries, + output_size, + sample_offset, +): """ Computes the grid position w.r.t. the corresponding feature map. 
@@ -108,10 +107,10 @@ def _compute_grid_positions( box_grid_y0y1: Tensor of size [batch_size, boxes, output_size, 2] box_grid_x0x1: Tensor of size [batch_size, boxes, output_size, 2] """ - boxes_shape = tf.shape(boxes) + boxes_shape = ops.shape(boxes) batch_size, num_boxes = boxes_shape[0], boxes_shape[1] if batch_size is None: - batch_size = tf.shape(boxes)[0] + batch_size = ops.shape(boxes)[0] box_grid_x = [] box_grid_y = [] for i in range(output_size): @@ -121,29 +120,33 @@ def _compute_grid_positions( box_grid_y.append( boxes[:, :, 0] + (i + sample_offset) * boxes[:, :, 2] / output_size ) - box_grid_x = tf.stack(box_grid_x, axis=2) - box_grid_y = tf.stack(box_grid_y, axis=2) + box_grid_x = ops.stack(box_grid_x, axis=2) + box_grid_y = ops.stack(box_grid_y, axis=2) - box_grid_y0 = tf.floor(box_grid_y) - box_grid_x0 = tf.floor(box_grid_x) - box_grid_x0 = tf.maximum(tf.cast(0.0, dtype=box_grid_x0.dtype), box_grid_x0) - box_grid_y0 = tf.maximum(tf.cast(0.0, dtype=box_grid_y0.dtype), box_grid_y0) + box_grid_y0 = ops.floor(box_grid_y) + box_grid_x0 = ops.floor(box_grid_x) + box_grid_x0 = ops.maximum( + ops.cast(0.0, dtype=box_grid_x0.dtype), box_grid_x0 + ) + box_grid_y0 = ops.maximum( + ops.cast(0.0, dtype=box_grid_y0.dtype), box_grid_y0 + ) - box_grid_x0 = tf.minimum( - box_grid_x0, tf.expand_dims(boundaries[:, :, 1], -1) + box_grid_x0 = ops.minimum( + box_grid_x0, ops.expand_dims(boundaries[:, :, 1], -1) ) - box_grid_x1 = tf.minimum( - box_grid_x0 + 1, tf.expand_dims(boundaries[:, :, 1], -1) + box_grid_x1 = ops.minimum( + box_grid_x0 + 1, ops.expand_dims(boundaries[:, :, 1], -1) ) - box_grid_y0 = tf.minimum( - box_grid_y0, tf.expand_dims(boundaries[:, :, 0], -1) + box_grid_y0 = ops.minimum( + box_grid_y0, ops.expand_dims(boundaries[:, :, 0], -1) ) - box_grid_y1 = tf.minimum( - box_grid_y0 + 1, tf.expand_dims(boundaries[:, :, 0], -1) + box_grid_y1 = ops.minimum( + box_grid_y0 + 1, ops.expand_dims(boundaries[:, :, 0], -1) ) - box_gridx0x1 = tf.stack([box_grid_x0, box_grid_x1], axis=-1) - box_gridy0y1 = tf.stack([box_grid_y0, box_grid_y1], axis=-1) + box_gridx0x1 = ops.stack([box_grid_x0, box_grid_x1], axis=-1) + box_gridy0y1 = ops.stack([box_grid_y0, box_grid_y1], axis=-1) # The RoIAlign feature f can be computed by bilinear interpolation of four # neighboring feature points f0, f1, f2, and f3. @@ -155,21 +158,21 @@ def _compute_grid_positions( lx = box_grid_x - box_grid_x0 hy = 1.0 - ly hx = 1.0 - lx - kernel_y = tf.reshape( - tf.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1] + kernel_y = ops.reshape( + ops.stack([hy, ly], axis=3), [batch_size, num_boxes, output_size, 2, 1] ) - kernel_x = tf.reshape( - tf.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1] + kernel_x = ops.reshape( + ops.stack([hx, lx], axis=3), [batch_size, num_boxes, output_size, 2, 1] ) return kernel_y, kernel_x, box_gridy0y1, box_gridx0x1 def multilevel_crop_and_resize( - features: Dict[str, tf.Tensor], - boxes: tf.Tensor, + features, + boxes, output_size: int = 7, sample_offset: float = 0.5, -) -> tf.Tensor: +): """ Crop and resize on multilevel feature pyramid. @@ -194,182 +197,181 @@ def multilevel_crop_and_resize( [batch_size, num_boxes, output_size, output_size, num_filters]. """ - with tf.name_scope("multilevel_crop_and_resize"): - levels_str = list(features.keys()) - # Levels are represented by strings with a prefix "P" to represent - # pyramid levels. The integer level can be obtained by looking at - # the value that follows the "P". 
- levels = [int(level_str[1:]) for level_str in levels_str] - min_level = min(levels) - max_level = max(levels) - features_shape = tf.shape(features[f"P{min_level}"]) - batch_size, max_feature_height, max_feature_width, num_filters = ( - features_shape[0], - features_shape[1], - features_shape[2], - features_shape[3], - ) + levels_str = list(features.keys()) + # Levels are represented by strings with a prefix "P" to represent + # pyramid levels. The integer level can be obtained by looking at + # the value that follows the "P". + levels = [int(level_str[1:]) for level_str in levels_str] + min_level = min(levels) + max_level = max(levels) + features_shape = ops.shape(features[f"P{min_level}"]) + batch_size, max_feature_height, max_feature_width, num_filters = ( + features_shape[0], + features_shape[1], + features_shape[2], + features_shape[3], + ) - num_boxes = tf.shape(boxes)[1] - - # Stack feature pyramid into a features_all of shape - # [batch_size, levels, height, width, num_filters]. - features_all = [] - feature_heights = [] - feature_widths = [] - for level in range(min_level, max_level + 1): - shape = features[f"P{level}"].get_shape().as_list() - feature_heights.append(shape[1]) - feature_widths.append(shape[2]) - # Concat tensor of [batch_size, height_l * width_l, num_filters] for - # each level. - features_all.append( - tf.reshape(features[f"P{level}"], [batch_size, -1, num_filters]) - ) - features_r2 = tf.reshape(tf.concat(features_all, 1), [-1, num_filters]) - - # Calculate height_l * width_l for each level. - level_dim_sizes = [ - feature_widths[i] * feature_heights[i] - for i in range(len(feature_widths)) - ] - # level_dim_offsets is accumulated sum of level_dim_size. - level_dim_offsets = [0] - for i in range(len(feature_widths) - 1): - level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i]) - batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1] - level_dim_offsets = tf.constant(level_dim_offsets, tf.int32) - height_dim_sizes = tf.constant(feature_widths, tf.int32) - - # Assigns boxes to the right level. - box_width = boxes[:, :, 3] - boxes[:, :, 1] - box_height = boxes[:, :, 2] - boxes[:, :, 0] - areas_sqrt = tf.sqrt( - tf.cast(box_height, tf.float32) * tf.cast(box_width, tf.float32) + num_boxes = ops.shape(boxes)[1] + + # Stack feature pyramid into a features_all of shape + # [batch_size, levels, height, width, num_filters]. + features_all = [] + feature_heights = [] + feature_widths = [] + for level in range(min_level, max_level + 1): + shape = ops.shape(features[f"P{level}"]) + feature_heights.append(shape[1]) + feature_widths.append(shape[2]) + # Concat tensor of [batch_size, height_l * width_l, num_filters] for + # each level. + features_all.append( + ops.reshape(features[f"P{level}"], [batch_size, -1, num_filters]) ) + features_r2 = ops.reshape( + ops.concatenate(features_all, 1), [-1, num_filters] + ) - # following the FPN paper to divide by 224. - levels = tf.cast( - tf.math.floordiv( - tf.math.log(tf.math.divide_no_nan(areas_sqrt, 224.0)), - tf.math.log(2.0), - ) - + 4.0, - dtype=tf.int32, - ) - # Maps levels between [min_level, max_level]. - levels = tf.minimum(max_level, tf.maximum(levels, min_level)) + # Calculate height_l * width_l for each level. + level_dim_sizes = [ + feature_widths[i] * feature_heights[i] + for i in range(len(feature_widths)) + ] + # level_dim_offsets is accumulated sum of level_dim_size. 
+ level_dim_offsets = [0] + for i in range(len(feature_widths) - 1): + level_dim_offsets.append(level_dim_offsets[i] + level_dim_sizes[i]) + batch_dim_size = level_dim_offsets[-1] + level_dim_sizes[-1] + level_dim_offsets = ( + ops.ones_like(level_dim_offsets, dtype="int32") * level_dim_offsets + ) + height_dim_sizes = ( + ops.ones_like(feature_widths, dtype="int32") * feature_widths + ) - # Projects box location and sizes to corresponding feature levels. - scale_to_level = tf.cast( - tf.pow(tf.constant(2.0), tf.cast(levels, tf.float32)), - dtype=boxes.dtype, + # Assigns boxes to the right level. + box_width = boxes[:, :, 3] - boxes[:, :, 1] + box_height = boxes[:, :, 2] - boxes[:, :, 0] + areas_sqrt = ops.sqrt( + ops.cast(box_height, "float32") * ops.cast(box_width, "float32") + ) + + # following the FPN paper to divide by 224. + levels = ops.cast( + ops.floor_divide( + ops.log(ops.divide(areas_sqrt, 224.0)), + ops.log(2.0), ) - boxes /= tf.expand_dims(scale_to_level, axis=2) - box_width /= scale_to_level - box_height /= scale_to_level - boxes = tf.concat( + + 4.0, + dtype="int32", + ) + # Maps levels between [min_level, max_level]. + levels = ops.minimum(max_level, ops.maximum(levels, min_level)) + + # Projects box location and sizes to corresponding feature levels. + scale_to_level = ops.cast( + ops.pow(2.0, ops.cast(levels, "float32")), + dtype=boxes.dtype, + ) + boxes /= ops.expand_dims(scale_to_level, axis=2) + box_width /= scale_to_level + box_height /= scale_to_level + boxes = ops.concatenate( + [ + boxes[:, :, 0:2], + ops.expand_dims(box_height, -1), + ops.expand_dims(box_width, -1), + ], + axis=-1, + ) + + # Maps levels to [0, max_level-min_level]. + levels -= min_level + level_strides = ops.pow([[2.0]], ops.cast(levels, "float32")) + boundary = ops.cast( + ops.concatenate( [ - boxes[:, :, 0:2], - tf.expand_dims(box_height, -1), - tf.expand_dims(box_width, -1), + ops.expand_dims( + [[ops.cast(max_feature_height, "float32")]] / level_strides + - 1, + axis=-1, + ), + ops.expand_dims( + [[ops.cast(max_feature_width, "float32")]] / level_strides + - 1, + axis=-1, + ), ], axis=-1, - ) - - # Maps levels to [0, max_level-min_level]. - levels -= min_level - level_strides = tf.pow([[2.0]], tf.cast(levels, tf.float32)) - boundary = tf.cast( - tf.concat( - [ - tf.expand_dims( - [[tf.cast(max_feature_height, tf.float32)]] - / level_strides - - 1, - axis=-1, - ), - tf.expand_dims( - [[tf.cast(max_feature_width, tf.float32)]] - / level_strides - - 1, - axis=-1, - ), - ], - axis=-1, - ), - boxes.dtype, - ) + ), + boxes.dtype, + ) - # Compute grid positions. - ( - kernel_y, - kernel_x, - box_gridy0y1, - box_gridx0x1, - ) = _compute_grid_positions(boxes, boundary, output_size, sample_offset) - - x_indices = tf.cast( - tf.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]), - dtype=tf.int32, - ) - y_indices = tf.cast( - tf.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]), - dtype=tf.int32, - ) + # Compute grid positions. 
+ ( + kernel_y, + kernel_x, + box_gridy0y1, + box_gridx0x1, + ) = _compute_grid_positions(boxes, boundary, output_size, sample_offset) + + x_indices = ops.cast( + ops.reshape(box_gridx0x1, [batch_size, num_boxes, output_size * 2]), + dtype="int32", + ) + y_indices = ops.cast( + ops.reshape(box_gridy0y1, [batch_size, num_boxes, output_size * 2]), + dtype="int32", + ) - batch_size_offset = tf.tile( - tf.reshape( - tf.range(batch_size) * batch_dim_size, [batch_size, 1, 1, 1] - ), - [1, num_boxes, output_size * 2, output_size * 2], - ) - # Get level offset for each box. Each box belongs to one level. - levels_offset = tf.tile( - tf.reshape( - tf.gather(level_dim_offsets, levels), - [batch_size, num_boxes, 1, 1], - ), - [1, 1, output_size * 2, output_size * 2], - ) - y_indices_offset = tf.tile( - tf.reshape( - y_indices - * tf.expand_dims(tf.gather(height_dim_sizes, levels), -1), - [batch_size, num_boxes, output_size * 2, 1], - ), - [1, 1, 1, output_size * 2], - ) - x_indices_offset = tf.tile( - tf.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]), - [1, 1, output_size * 2, 1], - ) - indices = tf.reshape( - batch_size_offset - + levels_offset - + y_indices_offset - + x_indices_offset, - [-1], - ) + batch_size_offset = ops.tile( + ops.reshape( + ops.arange(batch_size) * batch_dim_size, [batch_size, 1, 1, 1] + ), + [1, num_boxes, output_size * 2, output_size * 2], + ) + # Get level offset for each box. Each box belongs to one level. + levels_offset = ops.tile( + ops.reshape( + ops.take(level_dim_offsets, levels), + [batch_size, num_boxes, 1, 1], + ), + [1, 1, output_size * 2, output_size * 2], + ) + y_indices_offset = ops.tile( + ops.reshape( + y_indices * ops.expand_dims(ops.take(height_dim_sizes, levels), -1), + [batch_size, num_boxes, output_size * 2, 1], + ), + [1, 1, 1, output_size * 2], + ) + x_indices_offset = ops.tile( + ops.reshape(x_indices, [batch_size, num_boxes, 1, output_size * 2]), + [1, 1, output_size * 2, 1], + ) + indices = ops.reshape( + batch_size_offset + levels_offset + y_indices_offset + x_indices_offset, + [-1], + ) - # TODO(tanzhenyu): replace tf.gather with tf.gather_nd and try to get - # similar performance. - features_per_box = tf.reshape( - tf.gather(features_r2, indices), - [ - batch_size, - num_boxes, - output_size * 2, - output_size * 2, - num_filters, - ], - ) + # TODO(tanzhenyu): replace tf.gather with tf.gather_nd and try to get + # similar performance. + features_per_box = ops.reshape( + ops.take(features_r2, indices), + [ + batch_size, + num_boxes, + output_size * 2, + output_size * 2, + num_filters, + ], + ) - # Bilinear interpolation. - features_per_box = _feature_bilinear_interpolation( - features_per_box, kernel_y, kernel_x - ) - return features_per_box + # Bilinear interpolation. + features_per_box = _feature_bilinear_interpolation( + features_per_box, kernel_y, kernel_x + ) + return features_per_box # TODO(tanzhenyu): Remove this implementation once roi_pool has better @@ -395,7 +397,7 @@ def __init__( sample_offset: A `float` in [0, 1] of the subpixel sample offset. **kwargs: Additional keyword arguments passed to Layer. 
""" - assert_tf_keras("keras_cv.layers._ROIAligner") + # assert_tf_keras("keras_cv.layers._ROIAligner") self._config_dict = { "bounding_box_format": bounding_box_format, "crop_size": target_size, @@ -405,8 +407,8 @@ def __init__( def call( self, - features: Mapping[str, tf.Tensor], - boxes: tf.Tensor, + features, + boxes, training: Optional[bool] = None, ): """ diff --git a/keras_cv/layers/object_detection/roi_generator.py b/keras_cv/layers/object_detection/roi_generator.py index db779f8d3f..da99dc080f 100644 --- a/keras_cv/layers/object_detection/roi_generator.py +++ b/keras_cv/layers/object_detection/roi_generator.py @@ -12,17 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -from typing import Mapping from typing import Optional -from typing import Tuple -from typing import Union -import tensorflow as tf -from tensorflow import keras - -from keras_cv import bounding_box from keras_cv.api_export import keras_cv_export -from keras_cv.backend import assert_tf_keras +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.layers import NonMaxSuppression @keras_cv_export("keras_cv.layers.ROIGenerator") @@ -97,7 +92,6 @@ def __init__( post_nms_topk_test: int = 1000, **kwargs, ): - assert_tf_keras("keras_cv.layers.ROIGenerator") super().__init__(**kwargs) self.bounding_box_format = bounding_box_format self.pre_nms_topk_train = pre_nms_topk_train @@ -112,10 +106,10 @@ def __init__( def call( self, - multi_level_boxes: Union[tf.Tensor, Mapping[int, tf.Tensor]], - multi_level_scores: Union[tf.Tensor, Mapping[int, tf.Tensor]], + multi_level_boxes, + multi_level_scores, training: Optional[bool] = None, - ) -> Tuple[tf.Tensor, tf.Tensor]: + ): """ Args: multi_level_boxes: float Tensor. 
A dictionary or single Tensor of @@ -131,7 +125,6 @@ def call( rois: float Tensor of [batch_size, post_nms_topk, 4] roi_scores: float Tensor of [batch_size, post_nms_topk] """ - if training: pre_nms_topk = self.pre_nms_topk_train post_nms_topk = self.post_nms_topk_train @@ -144,53 +137,35 @@ def call( nms_iou_threshold = self.nms_iou_threshold_test def per_level_gen(boxes, scores): - scores_shape = scores.get_shape().as_list() - # scores can also be [batch_size, num_boxes, 1] + boxes = ops.convert_to_tensor(boxes, dtype="float32") + scores = ops.convert_to_tensor(scores, dtype="float32") + scores_shape = ops.shape(scores) + # Check if scores is a 3-dimensional tensor + # ([batch_size, num_boxes, 1]) + # If so, remove the last dimension to make it 2D if len(scores_shape) == 3: - scores = tf.squeeze(scores, axis=-1) - _, num_boxes = scores.get_shape().as_list() + scores = ops.squeeze(scores, axis=-1) + _, num_boxes = scores_shape level_pre_nms_topk = min(num_boxes, pre_nms_topk) level_post_nms_topk = min(num_boxes, post_nms_topk) - scores, sorted_indices = tf.nn.top_k( + scores, sorted_indices = ops.top_k( scores, k=level_pre_nms_topk, sorted=True ) - boxes = tf.gather(boxes, sorted_indices, batch_dims=1) - # convert from input format to yxyx for the TF NMS operation - boxes = bounding_box.convert_format( - boxes, - source=self.bounding_box_format, - target="yxyx", + boxes = ops.take_along_axis( + boxes, sorted_indices[..., None], axis=1 ) # TODO(tanzhenyu): consider supporting soft / batched nms for accl - selected_indices, num_valid = tf.image.non_max_suppression_padded( - boxes, - scores, - max_output_size=level_post_nms_topk, + boxes = NonMaxSuppression( + bounding_box_format=self.bounding_box_format, + from_logits=False, iou_threshold=nms_iou_threshold, - score_threshold=nms_score_threshold, - pad_to_max_output_size=True, - sorted_input=True, - canonicalized_coordinates=True, - ) - # convert back to input format - boxes = bounding_box.convert_format( - boxes, - source="yxyx", - target=self.bounding_box_format, - ) - level_rois = tf.gather(boxes, selected_indices, batch_dims=1) - level_roi_scores = tf.gather(scores, selected_indices, batch_dims=1) - level_rois = level_rois * tf.cast( - tf.reshape(tf.range(level_post_nms_topk), [1, -1, 1]) - < tf.reshape(num_valid, [-1, 1, 1]), - level_rois.dtype, - ) - level_roi_scores = level_roi_scores * tf.cast( - tf.reshape(tf.range(level_post_nms_topk), [1, -1]) - < tf.reshape(num_valid, [-1, 1]), - level_roi_scores.dtype, + confidence_threshold=nms_score_threshold, + max_detections=level_post_nms_topk, + )( + box_prediction=boxes, + class_prediction=scores[..., None], ) - return level_rois, level_roi_scores + return boxes["boxes"], boxes["confidence"] if not isinstance(multi_level_boxes, dict): return per_level_gen(multi_level_boxes, multi_level_scores) @@ -204,14 +179,14 @@ def per_level_gen(boxes, scores): rois.append(level_rois) roi_scores.append(level_roi_scores) - rois = tf.concat(rois, axis=1) - roi_scores = tf.concat(roi_scores, axis=1) - _, num_valid_rois = roi_scores.get_shape().as_list() + rois = ops.concatenate(rois, axis=1) + roi_scores = ops.concatenate(roi_scores, axis=1) + _, num_valid_rois = ops.shape(roi_scores) overall_top_k = min(num_valid_rois, post_nms_topk) - roi_scores, sorted_indices = tf.nn.top_k( + roi_scores, sorted_indices = ops.top_k( roi_scores, k=overall_top_k, sorted=True ) - rois = tf.gather(rois, sorted_indices, batch_dims=1) + rois = ops.take_along_axis(rois, sorted_indices[..., None], axis=1) return rois, 
roi_scores diff --git a/keras_cv/layers/object_detection/roi_generator_test.py b/keras_cv/layers/object_detection/roi_generator_test.py index 1513044b11..77d984d96f 100644 --- a/keras_cv/layers/object_detection/roi_generator_test.py +++ b/keras_cv/layers/object_detection/roi_generator_test.py @@ -12,8 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np import pytest -import tensorflow as tf from keras_cv.layers.object_detection.roi_generator import ROIGenerator from keras_cv.tests.test_case import TestCase @@ -23,7 +23,7 @@ class ROIGeneratorTest(TestCase): def test_single_tensor(self): roi_generator = ROIGenerator("xyxy", nms_iou_threshold_train=0.96) - rpn_boxes = tf.constant( + rpn_boxes = np.array( [ [ [0, 0, 10, 10], @@ -33,26 +33,33 @@ def test_single_tensor(self): ], ] ) - expected_rois = tf.gather(rpn_boxes, [[1, 3, 2]], batch_dims=1) - expected_rois = tf.concat([expected_rois, tf.zeros([1, 1, 4])], axis=1) - rpn_scores = tf.constant( + indices = [1, 3, 2] + expected_rois = np.take(rpn_boxes, indices, axis=1) + expected_rois = np.concatenate( + [expected_rois, -np.ones([1, 1, 4])], axis=1 + ) + rpn_scores = np.array( [ [0.6, 0.9, 0.2, 0.3], ] ) # selecting the 1st, then 3rd, then 2nd as they don't overlap # 0th box overlaps with 1st box - expected_roi_scores = tf.gather(rpn_scores, [[1, 3, 2]], batch_dims=1) - expected_roi_scores = tf.concat( - [expected_roi_scores, tf.zeros([1, 1])], axis=1 + expected_roi_scores = np.take(rpn_scores, indices, axis=1) + expected_roi_scores = np.concatenate( + [expected_roi_scores, -np.ones([1, 1])], axis=1 + ) + rois, roi_scores = roi_generator( + multi_level_boxes=rpn_boxes, + multi_level_scores=rpn_scores, + training=True, ) - rois, roi_scores = roi_generator(rpn_boxes, rpn_scores, training=True) self.assertAllClose(expected_rois, rois) self.assertAllClose(expected_roi_scores, roi_scores) def test_single_level_single_batch_roi_ignore_box(self): roi_generator = ROIGenerator("xyxy", nms_iou_threshold_train=0.96) - rpn_boxes = tf.constant( + rpn_boxes = np.array( [ [ [0, 0, 10, 10], @@ -62,19 +69,22 @@ def test_single_level_single_batch_roi_ignore_box(self): ], ] ) - expected_rois = tf.gather(rpn_boxes, [[1, 3, 2]], batch_dims=1) - expected_rois = tf.concat([expected_rois, tf.zeros([1, 1, 4])], axis=1) + indices = [1, 3, 2] + expected_rois = np.take(rpn_boxes, indices, axis=1) + expected_rois = np.concatenate( + [expected_rois, -np.ones([1, 1, 4])], axis=1 + ) rpn_boxes = {2: rpn_boxes} - rpn_scores = tf.constant( + rpn_scores = np.array( [ [0.6, 0.9, 0.2, 0.3], ] ) # selecting the 1st, then 3rd, then 2nd as they don't overlap # 0th box overlaps with 1st box - expected_roi_scores = tf.gather(rpn_scores, [[1, 3, 2]], batch_dims=1) - expected_roi_scores = tf.concat( - [expected_roi_scores, tf.zeros([1, 1])], axis=1 + expected_roi_scores = np.take(rpn_scores, indices, axis=1) + expected_roi_scores = np.concatenate( + [expected_roi_scores, -np.ones([1, 1])], axis=1 ) rpn_scores = {2: rpn_scores} rois, roi_scores = roi_generator(rpn_boxes, rpn_scores, training=True) @@ -85,7 +95,7 @@ def test_single_level_single_batch_roi_all_box(self): # for iou between 1st and 2nd box is 0.9604, so setting to 0.97 to # such that NMS would treat them as different ROIs roi_generator = ROIGenerator("xyxy", nms_iou_threshold_train=0.97) - rpn_boxes = tf.constant( + rpn_boxes = np.array( [ [ [0, 0, 10, 10], @@ -95,17 +105,16 @@ def test_single_level_single_batch_roi_all_box(self): ], ] ) - 
expected_rois = tf.gather(rpn_boxes, [[1, 0, 3, 2]], batch_dims=1) + indices = [1, 0, 3, 2] + expected_rois = np.take(rpn_boxes, indices, axis=1) rpn_boxes = {2: rpn_boxes} - rpn_scores = tf.constant( + rpn_scores = np.array( [ [0.6, 0.9, 0.2, 0.3], ] ) # selecting the 1st, then 0th, then 3rd, then 2nd as they don't overlap - expected_roi_scores = tf.gather( - rpn_scores, [[1, 0, 3, 2]], batch_dims=1 - ) + expected_roi_scores = np.take(rpn_scores, indices, axis=1) rpn_scores = {2: rpn_scores} rois, roi_scores = roi_generator(rpn_boxes, rpn_scores, training=True) self.assertAllClose(expected_rois, rois) @@ -113,7 +122,7 @@ def test_single_level_single_batch_roi_all_box(self): def test_single_level_propose_rois(self): roi_generator = ROIGenerator("xyxy") - rpn_boxes = tf.constant( + rpn_boxes = np.array( [ [ [0, 0, 10, 10], @@ -129,21 +138,22 @@ def test_single_level_propose_rois(self): ], ] ) - expected_rois = tf.gather( - rpn_boxes, [[1, 3, 2], [1, 3, 0]], batch_dims=1 + indices = np.array([[1, 3, 2], [1, 3, 0]]) + expected_rois = np.take_along_axis( + rpn_boxes, indices[:, :, None], axis=1 + ) + expected_rois = np.concatenate( + [expected_rois, -np.ones([2, 1, 4])], axis=1 ) - expected_rois = tf.concat([expected_rois, tf.zeros([2, 1, 4])], axis=1) rpn_boxes = {2: rpn_boxes} - rpn_scores = tf.constant([[0.6, 0.9, 0.2, 0.3], [0.1, 0.8, 0.3, 0.5]]) + rpn_scores = np.array([[0.6, 0.9, 0.2, 0.3], [0.1, 0.8, 0.3, 0.5]]) # 1st batch -- selecting the 1st, then 3rd, then 2nd as they don't # overlap # 2nd batch -- selecting the 1st, then 3rd, then 0th as they don't # overlap - expected_roi_scores = tf.gather( - rpn_scores, [[1, 3, 2], [1, 3, 0]], batch_dims=1 - ) - expected_roi_scores = tf.concat( - [expected_roi_scores, tf.zeros([2, 1])], axis=1 + expected_roi_scores = np.take_along_axis(rpn_scores, indices, axis=1) + expected_roi_scores = np.concatenate( + [expected_roi_scores, -np.ones([2, 1])], axis=1 ) rpn_scores = {2: rpn_scores} rois, roi_scores = roi_generator(rpn_boxes, rpn_scores, training=True) @@ -152,7 +162,7 @@ def test_single_level_propose_rois(self): def test_two_level_single_batch_propose_rois_ignore_box(self): roi_generator = ROIGenerator("xyxy") - rpn_boxes = tf.constant( + rpn_boxes = np.array( [ [ [0, 0, 10, 10], @@ -168,7 +178,7 @@ def test_two_level_single_batch_propose_rois_ignore_box(self): ], ] ) - expected_rois = tf.constant( + expected_rois = np.array( [ [ [0.1, 0.1, 9.9, 9.9], @@ -177,13 +187,13 @@ def test_two_level_single_batch_propose_rois_ignore_box(self): [2, 2, 8, 8], [5, 5, 10, 10], [2, 2, 4, 4], - [0, 0, 0, 0], - [0, 0, 0, 0], + [-1, -1, -1, -1], + [-1, -1, -1, -1], ] ] ) rpn_boxes = {2: rpn_boxes[0:1], 3: rpn_boxes[1:2]} - rpn_scores = tf.constant([[0.6, 0.9, 0.2, 0.3], [0.1, 0.8, 0.3, 0.5]]) + rpn_scores = np.array([[0.6, 0.9, 0.2, 0.3], [0.1, 0.8, 0.3, 0.5]]) # 1st batch -- selecting the 1st, then 3rd, then 2nd as they don't # overlap # 2nd batch -- selecting the 1st, then 3rd, then 0th as they don't @@ -196,8 +206,8 @@ def test_two_level_single_batch_propose_rois_ignore_box(self): 0.3, 0.2, 0.1, - 0.0, - 0.0, + -1.0, + -1.0, ] ] rpn_scores = {2: rpn_scores[0:1], 3: rpn_scores[1:2]} @@ -207,7 +217,7 @@ def test_two_level_single_batch_propose_rois_ignore_box(self): def test_two_level_single_batch_propose_rois_all_box(self): roi_generator = ROIGenerator("xyxy", nms_iou_threshold_train=0.99) - rpn_boxes = tf.constant( + rpn_boxes = np.array( [ [ [0, 0, 10, 10], @@ -223,7 +233,7 @@ def test_two_level_single_batch_propose_rois_all_box(self): ], ] ) - expected_rois 
= tf.constant( + expected_rois = np.array( [ [ [0.1, 0.1, 9.9, 9.9], @@ -238,7 +248,7 @@ def test_two_level_single_batch_propose_rois_all_box(self): ] ) rpn_boxes = {2: rpn_boxes[0:1], 3: rpn_boxes[1:2]} - rpn_scores = tf.constant([[0.6, 0.9, 0.2, 0.3], [0.1, 0.8, 0.3, 0.5]]) + rpn_scores = np.array([[0.6, 0.9, 0.2, 0.3], [0.1, 0.8, 0.3, 0.5]]) # 1st batch -- selecting the 1st, then 0th, then 3rd, then 2nd as they # don't overlap # 2nd batch -- selecting the 1st, then 3rd, then 2nd, then 0th as they diff --git a/keras_cv/layers/object_detection/roi_pool.py b/keras_cv/layers/object_detection/roi_pool.py index 3105b1d4be..6edbe5ea06 100644 --- a/keras_cv/layers/object_detection/roi_pool.py +++ b/keras_cv/layers/object_detection/roi_pool.py @@ -12,12 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tensorflow as tf -from tensorflow import keras from keras_cv import bounding_box from keras_cv.api_export import keras_cv_export -from keras_cv.backend import assert_tf_keras +from keras_cv.backend import keras +from keras_cv.backend import ops @keras_cv_export("keras_cv.layers.ROIPooler") @@ -59,7 +58,6 @@ def __init__( image_shape, **kwargs, ): - assert_tf_keras("keras_cv.layers.ROIPooler") if not isinstance(target_size, (tuple, list)): raise ValueError( "Expected `target_size` to be tuple or list, got " @@ -101,10 +99,14 @@ def call(self, feature_map, rois): target="rel_yxyx", image_shape=self.image_shape, ) - pooled_feature_map = tf.vectorized_map( - self._pool_single_sample, (feature_map, rois) - ) - return pooled_feature_map + pooled_feature_map = list() + for batch_idx in range(ops.shape(feature_map)[0]): + pooled_feature_map.append( + self._pool_single_sample( + args=[feature_map[batch_idx], rois[batch_idx]] + ) + ) + return ops.stack(pooled_feature_map, axis=0) def _pool_single_sample(self, args): """ @@ -115,8 +117,8 @@ def _pool_single_sample(self, args): pooled_feature_map: [N, target_height, target_width, C] float Tensor """ feature_map, rois = args - num_rois = rois.get_shape().as_list()[0] - height, width, channel = feature_map.get_shape().as_list() + num_rois = ops.shape(rois)[0] + height, width, channel = ops.shape(feature_map) regions = [] # TODO (consider vectorize it for better performance) for n in range(num_rois): @@ -133,33 +135,42 @@ def _pool_single_sample(self, args): for j in range(self.target_width): height_start = y_start + i * h_step height_end = height_start + h_step - height_start = tf.cast(height_start, tf.int32) - height_end = tf.cast(height_end, tf.int32) + height_start = ops.cast(height_start, "int32") + height_end = ops.cast(height_end, "int32") # if feature_map shape smaller than roi, h_step would be 0 # in this case the result will be feature_map[0, 0, ...] 
- height_end = height_start + tf.maximum( + height_end = height_start + ops.maximum( 1, height_end - height_start ) width_start = x_start + j * w_step width_end = width_start + w_step - width_start = tf.cast(width_start, tf.int32) - width_end = tf.cast(width_end, tf.int32) - width_end = width_start + tf.maximum( + width_start = ops.cast(width_start, "int32") + width_end = ops.cast(width_end, "int32") + width_end = width_start + ops.maximum( 1, width_end - width_start ) # [h_step, w_step, C] - region_step = feature_map[ - height_start:height_end, width_start:width_end, : - ] - # target_height * target_width * [C] - region_steps.append(tf.reduce_max(region_step, axis=[0, 1])) + region_step = ops.slice( + inputs=feature_map, + start_indices=[ + height_start, + width_start, + ops.convert_to_tensor(0, dtype="int32"), + ], + shape=[ + height_end - height_start, + width_end - width_start, + ops.shape(feature_map)[-1], + ], + ) + region_steps.append(ops.max(region_step, axis=[0, 1])) regions.append( - tf.reshape( - tf.stack(region_steps), + ops.reshape( + ops.stack(region_steps), [self.target_height, self.target_width, channel], ) ) - return tf.stack(regions) + return ops.stack(regions) def get_config(self): config = { diff --git a/keras_cv/layers/object_detection/roi_pool_test.py b/keras_cv/layers/object_detection/roi_pool_test.py index e605c3e5a7..be51f121e1 100644 --- a/keras_cv/layers/object_detection/roi_pool_test.py +++ b/keras_cv/layers/object_detection/roi_pool_test.py @@ -12,23 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import tensorflow as tf +import numpy as np from keras_cv.layers.object_detection.roi_pool import ROIPooler from keras_cv.tests.test_case import TestCase -@pytest.mark.tf_keras_only class ROIPoolTest(TestCase): def test_no_quantize(self): roi_pooler = ROIPooler( "rel_yxyx", target_size=[2, 2], image_shape=[224, 224, 3] ) - feature_map = tf.expand_dims( - tf.reshape(tf.range(64), [8, 8, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(64), [8, 8, 1]), axis=0 ) - rois = tf.reshape(tf.constant([0.0, 0.0, 1.0, 1.0]), [1, 1, 4]) + rois = np.reshape(np.array([0.0, 0.0, 1.0, 1.0]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # the maximum value would be at bottom-right at each block, roi sharded # into 2x2 blocks @@ -42,8 +40,8 @@ def test_no_quantize(self): # | 48, 49, 50, 51 | 52, 53, 54, 55 | # | 56, 57, 58, 59(max) | 60, 61, 62, 63(max) | # -------------------------------------------- - expected_feature_map = tf.reshape( - tf.constant([27, 31, 59, 63]), [1, 1, 2, 2, 1] + expected_feature_map = np.reshape( + np.array([27, 31, 59, 63]), [1, 1, 2, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -51,10 +49,10 @@ def test_roi_quantize_y(self): roi_pooler = ROIPooler( "yxyx", target_size=[2, 2], image_shape=[224, 224, 3] ) - feature_map = tf.expand_dims( - tf.reshape(tf.range(64), [8, 8, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(64), [8, 8, 1]), axis=0 ) - rois = tf.reshape(tf.constant([0.0, 0.0, 224, 220]), [1, 1, 4]) + rois = np.reshape(np.array([0.0, 0.0, 224, 220]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # the maximum value would be at bottom-right at each block, roi sharded # into 2x2 blocks @@ -68,8 +66,8 @@ def test_roi_quantize_y(self): # | 48, 49, 50 | 51, 52, 53, 54 | 55 (removed) # | 56, 57, 58(max) | 59, 60, 61, 62(max) | 63 (removed) # 
-------------------------------------------- - expected_feature_map = tf.reshape( - tf.constant([26, 30, 58, 62]), [1, 1, 2, 2, 1] + expected_feature_map = np.reshape( + np.array([26, 30, 58, 62]), [1, 1, 2, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -77,10 +75,10 @@ def test_roi_quantize_x(self): roi_pooler = ROIPooler( "yxyx", target_size=[2, 2], image_shape=[224, 224, 3] ) - feature_map = tf.expand_dims( - tf.reshape(tf.range(64), [8, 8, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(64), [8, 8, 1]), axis=0 ) - rois = tf.reshape(tf.constant([0.0, 0.0, 220, 224]), [1, 1, 4]) + rois = np.reshape(np.array([0.0, 0.0, 220, 224]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # the maximum value would be at bottom-right at each block, roi sharded # into 2x2 blocks @@ -93,8 +91,8 @@ def test_roi_quantize_x(self): # | 40, 41, 42, 43 | 44, 45, 46, 47 | # | 48, 49, 50, 51(max) | 52, 53, 54, 55(max) | # -------------------------------------------- - expected_feature_map = tf.reshape( - tf.constant([19, 23, 51, 55]), [1, 1, 2, 2, 1] + expected_feature_map = np.reshape( + np.array([19, 23, 51, 55]), [1, 1, 2, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -102,10 +100,10 @@ def test_roi_quantize_h(self): roi_pooler = ROIPooler( "yxyx", target_size=[3, 2], image_shape=[224, 224, 3] ) - feature_map = tf.expand_dims( - tf.reshape(tf.range(64), [8, 8, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(64), [8, 8, 1]), axis=0 ) - rois = tf.reshape(tf.constant([0.0, 0.0, 224, 224]), [1, 1, 4]) + rois = np.reshape(np.array([0.0, 0.0, 224, 224]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # the maximum value would be at bottom-right at each block, roi sharded # into 3x2 blocks @@ -120,8 +118,8 @@ def test_roi_quantize_h(self): # | 48, 49, 50, 51 | 52, 53, 54, 55 | # | 56, 57, 58, 59(max) | 60, 61, 62, 63(max) | # -------------------------------------------- - expected_feature_map = tf.reshape( - tf.constant([11, 15, 35, 39, 59, 63]), [1, 1, 3, 2, 1] + expected_feature_map = np.reshape( + np.array([11, 15, 35, 39, 59, 63]), [1, 1, 3, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -129,10 +127,10 @@ def test_roi_quantize_w(self): roi_pooler = ROIPooler( "yxyx", target_size=[2, 3], image_shape=[224, 224, 3] ) - feature_map = tf.expand_dims( - tf.reshape(tf.range(64), [8, 8, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(64), [8, 8, 1]), axis=0 ) - rois = tf.reshape(tf.constant([0.0, 0.0, 224, 224]), [1, 1, 4]) + rois = np.reshape(np.array([0.0, 0.0, 224, 224]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # the maximum value would be at bottom-right at each block, roi sharded # into 2x3 blocks @@ -146,8 +144,8 @@ def test_roi_quantize_w(self): # | 48, 49 | 50, 51, 52 | 53, 54, 55 | # | 56, 57(max) | 58, 59, 60(max) | 61, 62, 63(max) | # -------------------------------------------- - expected_feature_map = tf.reshape( - tf.constant([25, 28, 31, 57, 60, 63]), [1, 1, 2, 3, 1] + expected_feature_map = np.reshape( + np.array([25, 28, 31, 57, 60, 63]), [1, 1, 2, 3, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -155,10 +153,10 @@ def test_roi_feature_map_height_smaller_than_roi(self): roi_pooler = ROIPooler( "yxyx", target_size=[6, 2], image_shape=[224, 224, 3] ) - feature_map = tf.expand_dims( - tf.reshape(tf.range(16), [4, 4, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(16), [4, 4, 
1]), axis=0 ) - rois = tf.reshape(tf.constant([0.0, 0.0, 224, 224]), [1, 1, 4]) + rois = np.reshape(np.array([0.0, 0.0, 224, 224]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # | 0, 1(max) | 2, 3(max) | # ------------------repeated---------------------- @@ -167,8 +165,8 @@ def test_roi_feature_map_height_smaller_than_roi(self): # | 8, 9(max) | 10, 11(max) | # ------------------repeated---------------------- # | 12, 13(max) | 14, 15(max) | - expected_feature_map = tf.reshape( - tf.constant([1, 3, 1, 3, 5, 7, 9, 11, 9, 11, 13, 15]), + expected_feature_map = np.reshape( + np.array([1, 3, 1, 3, 5, 7, 9, 11, 9, 11, 13, 15]), [1, 1, 6, 2, 1], ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -177,10 +175,10 @@ def test_roi_feature_map_width_smaller_than_roi(self): roi_pooler = ROIPooler( "yxyx", target_size=[2, 6], image_shape=[224, 224, 3] ) - feature_map = tf.expand_dims( - tf.reshape(tf.range(16), [4, 4, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(16), [4, 4, 1]), axis=0 ) - rois = tf.reshape(tf.constant([0.0, 0.0, 224, 224]), [1, 1, 4]) + rois = np.reshape(np.array([0.0, 0.0, 224, 224]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # | 0 | 1 | 2 | 3 | # | 4(max) | 5(max) | 6(max) | 7(max) | @@ -188,8 +186,8 @@ def test_roi_feature_map_width_smaller_than_roi(self): # | 8 | 9 | 10 | 11 | # | 12(max) | 13(max) | 14(max) | 15(max) | # -------------------------------------------- - expected_feature_map = tf.reshape( - tf.constant([4, 4, 5, 6, 6, 7, 12, 12, 13, 14, 14, 15]), + expected_feature_map = np.reshape( + np.array([4, 4, 5, 6, 6, 7, 12, 12, 13, 14, 14, 15]), [1, 1, 2, 6, 1], ) self.assertAllClose(expected_feature_map, pooled_feature_map) @@ -198,13 +196,13 @@ def test_roi_empty(self): roi_pooler = ROIPooler( "yxyx", target_size=[2, 2], image_shape=[224, 224, 3] ) - feature_map = tf.expand_dims( - tf.reshape(tf.range(1, 65), [8, 8, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(1, 65), [8, 8, 1]), axis=0 ) - rois = tf.reshape(tf.constant([0.0, 0.0, 0.0, 0.0]), [1, 1, 4]) + rois = np.reshape(np.array([0.0, 0.0, 0.0, 0.0]), [1, 1, 4]) pooled_feature_map = roi_pooler(feature_map, rois) # all outputs should be top-left pixel - self.assertAllClose(tf.ones([1, 1, 2, 2, 1]), pooled_feature_map) + self.assertAllClose(np.ones([1, 1, 2, 2, 1]), pooled_feature_map) def test_invalid_image_shape(self): with self.assertRaisesRegex(ValueError, "dynamic shape"): @@ -213,8 +211,8 @@ def test_invalid_image_shape(self): ) def test_multiple_rois(self): - feature_map = tf.expand_dims( - tf.reshape(tf.range(0, 64), [8, 8, 1]), axis=0 + feature_map = np.expand_dims( + np.reshape(np.arange(0, 64), [8, 8, 1]), axis=0 ) roi_pooler = ROIPooler( @@ -222,7 +220,7 @@ def test_multiple_rois(self): target_size=[2, 2], image_shape=[224, 224, 3], ) - rois = tf.constant( + rois = np.array( [[[0.0, 0.0, 112.0, 112.0], [0.0, 112.0, 224.0, 224.0]]], ) @@ -240,7 +238,7 @@ def test_multiple_rois(self): # | 56, 57, 58, 59(max) | 60, 61, 62, 63(max) | # -------------------------------------------- - expected_feature_map = tf.reshape( - tf.constant([9, 11, 25, 27, 29, 31, 61, 63]), [1, 2, 2, 2, 1] + expected_feature_map = np.reshape( + np.array([9, 11, 25, 27, 29, 31, 61, 63]), [1, 2, 2, 2, 1] ) self.assertAllClose(expected_feature_map, pooled_feature_map) diff --git a/keras_cv/layers/object_detection/roi_sampler.py b/keras_cv/layers/object_detection/roi_sampler.py index fe63e31ba9..56d774dba5 100644 --- 
a/keras_cv/layers/object_detection/roi_sampler.py +++ b/keras_cv/layers/object_detection/roi_sampler.py @@ -12,11 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -import tensorflow as tf -from tensorflow import keras - from keras_cv import bounding_box -from keras_cv.backend import assert_tf_keras +from keras_cv.backend import keras +from keras_cv.backend import ops from keras_cv.bounding_box import iou from keras_cv.layers.object_detection import box_matcher from keras_cv.layers.object_detection import sampling @@ -69,7 +67,6 @@ def __init__( append_gt_boxes: bool = True, **kwargs, ): - assert_tf_keras("keras_cv.layers._ROISampler") super().__init__(**kwargs) self.bounding_box_format = bounding_box_format self.roi_matcher = roi_matcher @@ -84,9 +81,9 @@ def __init__( def call( self, - rois: tf.Tensor, - gt_boxes: tf.Tensor, - gt_classes: tf.Tensor, + rois, + gt_boxes, + gt_classes, ): """ Args: @@ -102,11 +99,11 @@ def call( """ if self.append_gt_boxes: # num_rois += num_gt - rois = tf.concat([rois, gt_boxes], axis=1) - num_rois = rois.get_shape().as_list()[1] + rois = ops.concatenate([rois, gt_boxes], axis=1) + num_rois = ops.shape(rois)[1] if num_rois is None: raise ValueError( - f"`rois` must have static shape, got {rois.get_shape()}" + f"`rois` must have static shape, got {ops.shape(rois)}" ) if num_rois < self.num_sampled_rois: raise ValueError( @@ -126,27 +123,27 @@ def call( # [batch_size, num_rois] | [batch_size, num_rois] matched_gt_cols, matched_vals = self.roi_matcher(similarity_mat) # [batch_size, num_rois] - positive_matches = tf.math.equal(matched_vals, 1) - negative_matches = tf.math.equal(matched_vals, -1) + positive_matches = ops.equal(matched_vals, 1) + negative_matches = ops.equal(matched_vals, -1) self._positives.update_state( - tf.reduce_sum(tf.cast(positive_matches, tf.float32), axis=-1) + ops.sum(ops.cast(positive_matches, "float32"), axis=-1) ) self._negatives.update_state( - tf.reduce_sum(tf.cast(negative_matches, tf.float32), axis=-1) + ops.sum(ops.cast(negative_matches, "float32"), axis=-1) ) # [batch_size, num_rois, 1] - background_mask = tf.expand_dims( - tf.logical_not(positive_matches), axis=-1 + background_mask = ops.expand_dims( + ops.logical_not(positive_matches), axis=-1 ) # [batch_size, num_rois, 1] matched_gt_classes = target_gather._target_gather( gt_classes, matched_gt_cols ) # also set all background matches to `background_class` - matched_gt_classes = tf.where( + matched_gt_classes = ops.where( background_mask, - tf.cast( - self.background_class * tf.ones_like(matched_gt_classes), + ops.cast( + self.background_class * ops.ones_like(matched_gt_classes), gt_classes.dtype, ), matched_gt_classes, @@ -163,9 +160,9 @@ def call( variance=[0.1, 0.1, 0.2, 0.2], ) # also set all background matches to 0 coordinates - encoded_matched_gt_boxes = tf.where( + encoded_matched_gt_boxes = ops.where( background_mask, - tf.zeros_like(matched_gt_boxes), + ops.zeros_like(matched_gt_boxes), encoded_matched_gt_boxes, ) # [batch_size, num_rois] @@ -176,7 +173,7 @@ def call( self.positive_fraction, ) # [batch_size, num_sampled_rois] in the range of [0, num_rois) - sampled_indicators, sampled_indices = tf.math.top_k( + sampled_indicators, sampled_indices = ops.top_k( sampled_indicators, k=self.num_sampled_rois, sorted=True ) # [batch_size, num_sampled_rois, 4] @@ -192,12 +189,12 @@ def call( # [batch_size, num_sampled_rois, 1] # all negative samples will be ignored in regression sampled_box_weights = 
target_gather._target_gather( - tf.cast(positive_matches[..., tf.newaxis], gt_boxes.dtype), + ops.cast(positive_matches[..., None], gt_boxes.dtype), sampled_indices, ) # [batch_size, num_sampled_rois, 1] - sampled_indicators = sampled_indicators[..., tf.newaxis] - sampled_class_weights = tf.cast(sampled_indicators, gt_classes.dtype) + sampled_indicators = sampled_indicators[..., None] + sampled_class_weights = ops.cast(sampled_indicators, gt_classes.dtype) return ( sampled_rois, sampled_gt_boxes, diff --git a/keras_cv/layers/object_detection/roi_sampler_test.py b/keras_cv/layers/object_detection/roi_sampler_test.py index a0ab5c92c2..bc7c7888e8 100644 --- a/keras_cv/layers/object_detection/roi_sampler_test.py +++ b/keras_cv/layers/object_detection/roi_sampler_test.py @@ -12,15 +12,15 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import tensorflow as tf +import numpy as np + +from keras_cv.backend import ops from keras_cv.layers.object_detection.box_matcher import BoxMatcher from keras_cv.layers.object_detection.roi_sampler import _ROISampler from keras_cv.tests.test_case import TestCase -@pytest.mark.tf_keras_only class ROISamplerTest(TestCase): def test_roi_sampler(self): box_matcher = BoxMatcher(thresholds=[0.3], match_values=[-1, 1]) @@ -31,7 +31,7 @@ def test_roi_sampler(self): num_sampled_rois=2, append_gt_boxes=False, ) - rois = tf.constant( + rois = np.array( [ [0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5], @@ -39,35 +39,37 @@ def test_roi_sampler(self): [7.5, 7.5, 12.5, 12.5], ] ) - rois = rois[tf.newaxis, ...] + rois = rois[np.newaxis, ...] # the 3rd box will generate 0 IOUs and not sampled. - gt_boxes = tf.constant( + gt_boxes = np.array( [[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5], [-1, -1, -1, -1]] ) - gt_boxes = gt_boxes[tf.newaxis, ...] - gt_classes = tf.constant([[2, 10, -1]], dtype=tf.int32) - gt_classes = gt_classes[..., tf.newaxis] + gt_boxes = gt_boxes[np.newaxis, ...] + gt_classes = np.array([[2, 10, -1]], dtype=np.int32) + gt_classes = gt_classes[..., np.newaxis] _, sampled_gt_boxes, _, sampled_gt_classes, _ = roi_sampler( rois, gt_boxes, gt_classes ) # given we only choose 1 positive sample, and `append_label` is False, # only the 2nd ROI is chosen. - expected_gt_boxes = tf.constant( - [[0.0, 0.0, 0, 0.0], [0.0, 0.0, 0, 0.0]] - ) - expected_gt_boxes = expected_gt_boxes[tf.newaxis, ...] + expected_gt_boxes = np.array([[0.0, 0.0, 0, 0.0], [0.0, 0.0, 0, 0.0]]) + expected_gt_boxes = expected_gt_boxes[np.newaxis, ...] # only the 2nd ROI is chosen, and the negative ROI is mapped to 0. - expected_gt_classes = tf.constant([[10], [0]], dtype=tf.int32) - expected_gt_classes = expected_gt_classes[tf.newaxis, ...] + expected_gt_classes = np.array([[10], [0]], dtype=np.int32) + expected_gt_classes = expected_gt_classes[np.newaxis, ...] 
self.assertAllClose( - tf.reduce_max(expected_gt_boxes), tf.reduce_max(sampled_gt_boxes) + np.max(expected_gt_boxes), + np.max(ops.convert_to_numpy(sampled_gt_boxes)), ) self.assertAllClose( - tf.reduce_min(expected_gt_classes), - tf.reduce_min(sampled_gt_classes), + np.min(expected_gt_classes), + np.min(ops.convert_to_numpy(sampled_gt_classes)), ) def test_roi_sampler_small_threshold(self): + self.skipTest( + "TODO: resolving flaky test, https://github.com/keras-team/keras-cv/issues/2336" # noqa + ) box_matcher = BoxMatcher(thresholds=[0.1], match_values=[-1, 1]) roi_sampler = _ROISampler( bounding_box_format="xyxy", @@ -76,7 +78,7 @@ def test_roi_sampler_small_threshold(self): num_sampled_rois=2, append_gt_boxes=False, ) - rois = tf.constant( + rois = np.array( [ [0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5], @@ -84,14 +86,14 @@ def test_roi_sampler_small_threshold(self): [7.5, 7.5, 12.5, 12.5], ] ) - rois = rois[tf.newaxis, ...] + rois = rois[np.newaxis, ...] # the 3rd box will generate 0 IOUs and not sampled. - gt_boxes = tf.constant( + gt_boxes = np.array( [[10, 10, 15, 15], [2.6, 2.6, 7.6, 7.6], [-1, -1, -1, -1]] ) - gt_boxes = gt_boxes[tf.newaxis, ...] - gt_classes = tf.constant([[2, 10, -1]], dtype=tf.int32) - gt_classes = gt_classes[..., tf.newaxis] + gt_boxes = gt_boxes[np.newaxis, ...] + gt_classes = np.array([[2, 10, -1]], dtype=np.int32) + gt_classes = gt_classes[..., np.newaxis] sampled_rois, sampled_gt_boxes, _, sampled_gt_classes, _ = roi_sampler( rois, gt_boxes, gt_classes ) @@ -99,25 +101,23 @@ def test_roi_sampler_small_threshold(self): # only the 2nd ROI is chosen. No negative samples exist given we # select positive_threshold to be 0.1. (the minimum IOU is 1/7) # given num_sampled_rois=2, it selects the 1st ROI as well. - expected_rois = tf.constant([[5, 5, 10, 10], [0.0, 0.0, 5.0, 5.0]]) - expected_rois = expected_rois[tf.newaxis, ...] + expected_rois = np.array([[5, 5, 10, 10], [0.0, 0.0, 5.0, 5.0]]) + expected_rois = expected_rois[np.newaxis, ...] # all ROIs are matched to the 2nd gt box. # the boxes are encoded by dimensions, so the result is # tx, ty = (5.1 - 5.0) / 5 = 0.02, tx, ty = (5.1 - 2.5) / 5 = 0.52 # then divide by 0.1 as box variance. expected_gt_boxes = ( - tf.constant([[0.02, 0.02, 0.0, 0.0], [0.52, 0.52, 0.0, 0.0]]) / 0.1 + np.array([[0.02, 0.02, 0.0, 0.0], [0.52, 0.52, 0.0, 0.0]]) / 0.1 ) - expected_gt_boxes = expected_gt_boxes[tf.newaxis, ...] + expected_gt_boxes = expected_gt_boxes[np.newaxis, ...] # only the 2nd ROI is chosen, and the negative ROI is mapped to 0. - expected_gt_classes = tf.constant([[10], [10]], dtype=tf.int32) - expected_gt_classes = expected_gt_classes[tf.newaxis, ...] + expected_gt_classes = np.array([[10], [10]], dtype=np.int32) + expected_gt_classes = expected_gt_classes[np.newaxis, ...] + self.assertAllClose(np.max(expected_rois, 1), np.max(sampled_rois, 1)) self.assertAllClose( - tf.reduce_max(expected_rois, 1), tf.reduce_max(sampled_rois, 1) - ) - self.assertAllClose( - tf.reduce_max(expected_gt_boxes, 1), - tf.reduce_max(sampled_gt_boxes, 1), + np.max(expected_gt_boxes, 1), + np.max(sampled_gt_boxes, 1), ) self.assertAllClose(expected_gt_classes, sampled_gt_classes) @@ -132,7 +132,7 @@ def test_roi_sampler_large_threshold(self): num_sampled_rois=2, append_gt_boxes=False, ) - rois = tf.constant( + rois = np.array( [ [0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5], @@ -140,22 +140,22 @@ def test_roi_sampler_large_threshold(self): [7.5, 7.5, 12.5, 12.5], ] ) - rois = rois[tf.newaxis, ...] + rois = rois[np.newaxis, ...] 
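
The assertion changes above follow one backend-agnostic pattern used throughout these test diffs: inputs are built as plain NumPy arrays, the layer runs on whichever backend Keras is configured for, and outputs are converted back with ops.convert_to_numpy before comparing. A minimal standalone sketch of that pattern (the cast below is only a stand-in for a real layer output, not library code):

import numpy as np

from keras_cv.backend import ops

# Expected values stay as NumPy; backend outputs are converted before the
# comparison, so the same assertion works on TensorFlow, JAX, and Torch.
expected = np.array([[10], [0]], dtype=np.int32)
actual = ops.cast(expected, "int32")  # stand-in for a backend tensor returned by a layer
np.testing.assert_allclose(np.max(expected), np.max(ops.convert_to_numpy(actual)))
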
# the 3rd box will generate 0 IOUs and not sampled. - gt_boxes = tf.constant( + gt_boxes = np.array( [[10, 10, 15, 15], [2.6, 2.6, 7.6, 7.6], [-1, -1, -1, -1]] ) - gt_boxes = gt_boxes[tf.newaxis, ...] - gt_classes = tf.constant([[2, 10, -1]], dtype=tf.int32) - gt_classes = gt_classes[..., tf.newaxis] + gt_boxes = gt_boxes[np.newaxis, ...] + gt_classes = np.array([[2, 10, -1]], dtype=np.int32) + gt_classes = gt_classes[..., np.newaxis] _, sampled_gt_boxes, _, sampled_gt_classes, _ = roi_sampler( rois, gt_boxes, gt_classes ) # all ROIs are negative matches, so they are mapped to 0. - expected_gt_boxes = tf.zeros([1, 2, 4], dtype=tf.float32) + expected_gt_boxes = np.zeros([1, 2, 4], dtype=np.float32) # only the 2nd ROI is chosen, and the negative ROI is mapped to 0. - expected_gt_classes = tf.constant([[0], [0]], dtype=tf.int32) - expected_gt_classes = expected_gt_classes[tf.newaxis, ...] + expected_gt_classes = np.array([[0], [0]], dtype=np.int32) + expected_gt_classes = expected_gt_classes[np.newaxis, ...] # self.assertAllClose(expected_rois, sampled_rois) self.assertAllClose(expected_gt_boxes, sampled_gt_boxes) self.assertAllClose(expected_gt_classes, sampled_gt_classes) @@ -172,7 +172,7 @@ def test_roi_sampler_large_threshold_custom_bg_class(self): num_sampled_rois=2, append_gt_boxes=False, ) - rois = tf.constant( + rois = np.array( [ [0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5], @@ -180,23 +180,23 @@ def test_roi_sampler_large_threshold_custom_bg_class(self): [7.5, 7.5, 12.5, 12.5], ] ) - rois = rois[tf.newaxis, ...] + rois = rois[np.newaxis, ...] # the 3rd box will generate 0 IOUs and not sampled. - gt_boxes = tf.constant( + gt_boxes = np.array( [[10, 10, 15, 15], [2.6, 2.6, 7.6, 7.6], [-1, -1, -1, -1]] ) - gt_boxes = gt_boxes[tf.newaxis, ...] - gt_classes = tf.constant([[2, 10, -1]], dtype=tf.int32) - gt_classes = gt_classes[..., tf.newaxis] + gt_boxes = gt_boxes[np.newaxis, ...] + gt_classes = np.array([[2, 10, -1]], dtype=np.int32) + gt_classes = gt_classes[..., np.newaxis] _, sampled_gt_boxes, _, sampled_gt_classes, _ = roi_sampler( rois, gt_boxes, gt_classes ) # all ROIs are negative matches, so they are mapped to 0. - expected_gt_boxes = tf.zeros([1, 2, 4], dtype=tf.float32) + expected_gt_boxes = np.zeros([1, 2, 4], dtype=np.float32) # only the 2nd ROI is chosen, and the negative ROI is mapped to -1 from # customization. - expected_gt_classes = tf.constant([[-1], [-1]], dtype=tf.int32) - expected_gt_classes = expected_gt_classes[tf.newaxis, ...] + expected_gt_classes = np.array([[-1], [-1]], dtype=np.int32) + expected_gt_classes = expected_gt_classes[np.newaxis, ...] # self.assertAllClose(expected_rois, sampled_rois) self.assertAllClose(expected_gt_boxes, sampled_gt_boxes) self.assertAllClose(expected_gt_classes, sampled_gt_classes) @@ -212,7 +212,7 @@ def test_roi_sampler_large_threshold_append_gt_boxes(self): num_sampled_rois=2, append_gt_boxes=True, ) - rois = tf.constant( + rois = np.array( [ [0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5], @@ -220,24 +220,28 @@ def test_roi_sampler_large_threshold_append_gt_boxes(self): [7.5, 7.5, 12.5, 12.5], ] ) - rois = rois[tf.newaxis, ...] + rois = rois[np.newaxis, ...] # the 3rd box will generate 0 IOUs and not sampled. - gt_boxes = tf.constant( + gt_boxes = np.array( [[10, 10, 15, 15], [2.6, 2.6, 7.6, 7.6], [-1, -1, -1, -1]] ) - gt_boxes = gt_boxes[tf.newaxis, ...] - gt_classes = tf.constant([[2, 10, -1]], dtype=tf.int32) - gt_classes = gt_classes[..., tf.newaxis] + gt_boxes = gt_boxes[np.newaxis, ...] 
+ gt_classes = np.array([[2, 10, -1]], dtype=np.int32) + gt_classes = gt_classes[..., np.newaxis] _, sampled_gt_boxes, _, sampled_gt_classes, _ = roi_sampler( rois, gt_boxes, gt_classes ) # the selected gt boxes should be [0, 0, 0, 0], and [10, 10, 15, 15] # but the 2nd will be encoded to 0. - self.assertAllClose(tf.reduce_min(sampled_gt_boxes), 0) - self.assertAllClose(tf.reduce_max(sampled_gt_boxes), 0) + self.assertAllClose(np.min(ops.convert_to_numpy(sampled_gt_boxes)), 0) + self.assertAllClose(np.max(ops.convert_to_numpy(sampled_gt_boxes)), 0) # the selected gt classes should be [0, 2 or 10] - self.assertAllLessEqual(tf.reduce_max(sampled_gt_classes), 10) - self.assertAllGreaterEqual(tf.reduce_min(sampled_gt_classes), 0) + self.assertAllLessEqual( + np.max(ops.convert_to_numpy(sampled_gt_classes)), 10 + ) + self.assertAllGreaterEqual( + np.min(ops.convert_to_numpy(sampled_gt_classes)), 0 + ) def test_roi_sampler_large_num_sampled_rois(self): box_matcher = BoxMatcher(thresholds=[0.95], match_values=[-1, 1]) @@ -248,7 +252,7 @@ def test_roi_sampler_large_num_sampled_rois(self): num_sampled_rois=200, append_gt_boxes=True, ) - rois = tf.constant( + rois = np.array( [ [0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5], @@ -256,14 +260,14 @@ def test_roi_sampler_large_num_sampled_rois(self): [7.5, 7.5, 12.5, 12.5], ] ) - rois = rois[tf.newaxis, ...] + rois = rois[np.newaxis, ...] # the 3rd box will generate 0 IOUs and not sampled. - gt_boxes = tf.constant( + gt_boxes = np.array( [[10, 10, 15, 15], [2.6, 2.6, 7.6, 7.6], [-1, -1, -1, -1]] ) - gt_boxes = gt_boxes[tf.newaxis, ...] - gt_classes = tf.constant([[2, 10, -1]], dtype=tf.int32) - gt_classes = gt_classes[..., tf.newaxis] + gt_boxes = gt_boxes[np.newaxis, ...] + gt_classes = np.array([[2, 10, -1]], dtype=np.int32) + gt_classes = gt_classes[..., np.newaxis] with self.assertRaisesRegex(ValueError, "must be less than"): _, _, _ = roi_sampler(rois, gt_boxes, gt_classes) diff --git a/keras_cv/layers/object_detection/rpn_label_encoder.py b/keras_cv/layers/object_detection/rpn_label_encoder.py index 5cd9d88415..97c2ca5c80 100644 --- a/keras_cv/layers/object_detection/rpn_label_encoder.py +++ b/keras_cv/layers/object_detection/rpn_label_encoder.py @@ -12,13 +12,11 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from typing import Mapping - -import tensorflow as tf -from tensorflow import keras +import tree from keras_cv import bounding_box -from keras_cv.backend import assert_tf_keras +from keras_cv.backend import keras +from keras_cv.backend import ops from keras_cv.bounding_box import iou from keras_cv.layers.object_detection import box_matcher from keras_cv.layers.object_detection import sampling @@ -73,7 +71,6 @@ def __init__( box_variance=[0.1, 0.1, 0.2, 0.2], **kwargs, ): - assert_tf_keras("keras_cv.layers._RpnLabelEncoder") super().__init__(**kwargs) self.anchor_format = anchor_format self.ground_truth_box_format = ground_truth_box_format @@ -92,9 +89,9 @@ def __init__( def call( self, - anchors_dict: Mapping[str, tf.Tensor], - gt_boxes: tf.Tensor, - gt_classes: tf.Tensor, + anchors_dict, + gt_boxes, + gt_classes, ): """ Args: @@ -112,7 +109,7 @@ def call( anchors = anchors_dict if isinstance(anchors, dict): pack = True - anchors = tf.concat(tf.nest.flatten(anchors), axis=0) + anchors = ops.concatenate(tree.flatten(anchors), axis=0) anchors = bounding_box.convert_format( anchors, source=self.anchor_format, target="yxyx" ) @@ -126,14 +123,14 @@ def call( # [num_anchors] or [batch_size, num_anchors] matched_gt_indices, matched_vals = self.box_matcher(similarity_mat) # [num_anchors] or [batch_size, num_anchors] - positive_matches = tf.math.equal(matched_vals, 1) + positive_matches = ops.equal(matched_vals, 1) # currently SyncOnReadVariable does not support `assign_add` in # cross-replica. # self._positives.update_state( # tf.reduce_sum(tf.cast(positive_matches, tf.float32), axis=-1) # ) - negative_matches = tf.math.equal(matched_vals, -1) + negative_matches = ops.equal(matched_vals, -1) # [num_anchors, 4] or [batch_size, num_anchors, 4] matched_gt_boxes = target_gather._target_gather( gt_boxes, matched_gt_indices @@ -148,18 +145,18 @@ def call( variance=self.box_variance, ) # [num_anchors, 1] or [batch_size, num_anchors, 1] - box_sample_weights = tf.cast( - positive_matches[..., tf.newaxis], gt_boxes.dtype + box_sample_weights = ops.cast( + positive_matches[..., None], gt_boxes.dtype ) # [num_anchors, 1] or [batch_size, num_anchors, 1] - positive_mask = tf.expand_dims(positive_matches, axis=-1) + positive_mask = ops.expand_dims(positive_matches, axis=-1) # set all negative and ignored matches to 0, and all positive matches to # 1 [num_anchors, 1] or [batch_size, num_anchors, 1] - positive_classes = tf.ones_like(positive_mask, dtype=gt_classes.dtype) - negative_classes = tf.zeros_like(positive_mask, dtype=gt_classes.dtype) + positive_classes = ops.ones_like(positive_mask, dtype=gt_classes.dtype) + negative_classes = ops.zeros_like(positive_mask, dtype=gt_classes.dtype) # [num_anchors, 1] or [batch_size, num_anchors, 1] - class_targets = tf.where( + class_targets = ops.where( positive_mask, positive_classes, negative_classes ) # [num_anchors] or [batch_size, num_anchors] @@ -170,8 +167,8 @@ def call( self.positive_fraction, ) # [num_anchors, 1] or [batch_size, num_anchors, 1] - class_sample_weights = tf.cast( - sampled_indicators[..., tf.newaxis], gt_classes.dtype + class_sample_weights = ops.cast( + sampled_indicators[..., None], gt_classes.dtype ) if pack: encoded_box_targets = self.unpack_targets( @@ -192,7 +189,7 @@ def call( ) def unpack_targets(self, targets, anchors_dict): - target_shape = len(targets.get_shape().as_list()) + target_shape = len(ops.shape(targets)) if target_shape != 2 and target_shape != 3: raise ValueError( "unpacking targets must be rank 2 or rank 3, got " @@ -201,7 
+198,7 @@ def unpack_targets(self, targets, anchors_dict): unpacked_targets = {} count = 0 for level, anchors in anchors_dict.items(): - num_anchors_lvl = anchors.get_shape().as_list()[0] + num_anchors_lvl = ops.shape(anchors)[0] if target_shape == 2: unpacked_targets[level] = targets[ count : count + num_anchors_lvl, ... diff --git a/keras_cv/layers/object_detection/rpn_label_encoder_test.py b/keras_cv/layers/object_detection/rpn_label_encoder_test.py index ac6891010a..910bc03757 100644 --- a/keras_cv/layers/object_detection/rpn_label_encoder_test.py +++ b/keras_cv/layers/object_detection/rpn_label_encoder_test.py @@ -12,14 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import tensorflow as tf +import numpy as np +from keras_cv.backend import ops from keras_cv.layers.object_detection.rpn_label_encoder import _RpnLabelEncoder from keras_cv.tests.test_case import TestCase -@pytest.mark.tf_keras_only class RpnLabelEncoderTest(TestCase): def test_rpn_label_encoder(self): rpn_encoder = _RpnLabelEncoder( @@ -30,7 +29,7 @@ def test_rpn_label_encoder(self): positive_fraction=0.5, samples_per_image=2, ) - rois = tf.constant( + rois = np.array( [ [0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5], @@ -39,15 +38,15 @@ def test_rpn_label_encoder(self): ] ) # the 3rd box will generate 0 IOUs and not sampled. - gt_boxes = tf.constant([[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5]]) - gt_classes = tf.constant([2, 10, -1], dtype=tf.int32) - gt_classes = gt_classes[..., tf.newaxis] + gt_boxes = np.array([[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5]]) + gt_classes = np.array([2, 10, -1], dtype=np.int32) + gt_classes = gt_classes[..., np.newaxis] box_targets, box_weights, cls_targets, cls_weights = rpn_encoder( rois, gt_boxes, gt_classes ) # all rois will be matched to the 2nd gt boxes, and encoded expected_box_targets = ( - tf.constant( + np.array( [ [0.5, 0.5, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], @@ -59,15 +58,18 @@ def test_rpn_label_encoder(self): ) self.assertAllClose(expected_box_targets, box_targets) # only foreground and background classes - self.assertAllClose(tf.reduce_max(cls_targets), 1.0) - self.assertAllClose(tf.reduce_min(cls_targets), 0.0) + self.assertAllClose(np.max(ops.convert_to_numpy(cls_targets)), 1.0) + self.assertAllClose(np.min(ops.convert_to_numpy(cls_targets)), 0.0) # all weights between 0 and 1 - self.assertAllClose(tf.reduce_max(cls_weights), 1.0) - self.assertAllClose(tf.reduce_min(cls_weights), 0.0) - self.assertAllClose(tf.reduce_max(box_weights), 1.0) - self.assertAllClose(tf.reduce_min(box_weights), 0.0) + self.assertAllClose(np.max(ops.convert_to_numpy(cls_weights)), 1.0) + self.assertAllClose(np.min(ops.convert_to_numpy(cls_weights)), 0.0) + self.assertAllClose(np.max(ops.convert_to_numpy(box_weights)), 1.0) + self.assertAllClose(np.min(ops.convert_to_numpy(box_weights)), 0.0) def test_rpn_label_encoder_multi_level(self): + self.skipTest( + "TODO: resolving flaky test, https://github.com/keras-team/keras-cv/issues/2336" # noqa + ) rpn_encoder = _RpnLabelEncoder( anchor_format="xyxy", ground_truth_box_format="xyxy", @@ -77,18 +79,18 @@ def test_rpn_label_encoder_multi_level(self): samples_per_image=2, ) rois = { - 2: tf.constant([[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5]]), - 3: tf.constant([[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]), + 2: np.array([[0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5]]), + 3: np.array([[5, 5, 10, 10], [7.5, 7.5, 12.5, 12.5]]), } # the 3rd box will generate 0 IOUs and not sampled. 
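
The encoder hunk above drops tf.nest.flatten / tf.concat in favor of the backend-neutral tree.flatten and ops.concatenate when anchors arrive as a per-level dict. A minimal sketch of that pattern, assuming the dm-tree package is installed and using the same integer level keys as the test (shapes are illustrative only):

import numpy as np
import tree

from keras_cv.backend import ops

anchors_dict = {
    2: np.zeros([4, 4], dtype="float32"),  # 4 anchors at level 2
    3: np.zeros([2, 4], dtype="float32"),  # 2 anchors at level 3
}
# tree.flatten walks the dict in sorted key order, so the per-level anchors
# concatenate deterministically regardless of backend.
anchors = ops.concatenate(tree.flatten(anchors_dict), axis=0)
# 6 total anchors, 4 coordinates each
print(ops.shape(anchors))
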
- gt_boxes = tf.constant([[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5]]) - gt_classes = tf.constant([2, 10, -1], dtype=tf.float32) - gt_classes = gt_classes[..., tf.newaxis] + gt_boxes = np.array([[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5]]) + gt_classes = np.array([2, 10, -1], dtype=np.float32) + gt_classes = gt_classes[..., np.newaxis] _, _, _, cls_weights = rpn_encoder(rois, gt_boxes, gt_classes) # the 2nd level found 2 positive matches, the 3rd level found no match expected_cls_weights = { - 2: tf.constant([[0.0], [1.0]]), - 3: tf.constant([[0.0], [1.0]]), + 2: np.array([[0.0], [1.0]]), + 3: np.array([[0.0], [1.0]]), } self.assertAllClose(expected_cls_weights[2], cls_weights[2]) self.assertAllClose(expected_cls_weights[3], cls_weights[3]) @@ -102,7 +104,7 @@ def test_rpn_label_encoder_batched(self): positive_fraction=0.5, samples_per_image=2, ) - rois = tf.constant( + rois = np.array( [ [0, 0, 5, 5], [2.5, 2.5, 7.5, 7.5], @@ -111,18 +113,18 @@ def test_rpn_label_encoder_batched(self): ] ) # the 3rd box will generate 0 IOUs and not sampled. - gt_boxes = tf.constant([[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5]]) - gt_classes = tf.constant([2, 10, -1], dtype=tf.int32) - gt_classes = gt_classes[..., tf.newaxis] - rois = rois[tf.newaxis, ...] - gt_boxes = gt_boxes[tf.newaxis, ...] - gt_classes = gt_classes[tf.newaxis, ...] + gt_boxes = np.array([[10, 10, 15, 15], [2.5, 2.5, 7.5, 7.5]]) + gt_classes = np.array([2, 10, -1], dtype=np.int32) + gt_classes = gt_classes[..., np.newaxis] + rois = rois[np.newaxis, ...] + gt_boxes = gt_boxes[np.newaxis, ...] + gt_classes = gt_classes[np.newaxis, ...] box_targets, box_weights, cls_targets, cls_weights = rpn_encoder( rois, gt_boxes, gt_classes ) # all rois will be matched to the 2nd gt boxes, and encoded expected_box_targets = ( - tf.constant( + np.array( [ [0.5, 0.5, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], @@ -132,13 +134,13 @@ def test_rpn_label_encoder_batched(self): ) / 0.1 ) - expected_box_targets = expected_box_targets[tf.newaxis, ...] + expected_box_targets = expected_box_targets[np.newaxis, ...] self.assertAllClose(expected_box_targets, box_targets) # only foreground and background classes - self.assertAllClose(tf.reduce_max(cls_targets), 1.0) - self.assertAllClose(tf.reduce_min(cls_targets), 0.0) + self.assertAllClose(np.max(ops.convert_to_numpy(cls_targets)), 1.0) + self.assertAllClose(np.min(ops.convert_to_numpy(cls_targets)), 0.0) # all weights between 0 and 1 - self.assertAllClose(tf.reduce_max(cls_weights), 1.0) - self.assertAllClose(tf.reduce_min(cls_weights), 0.0) - self.assertAllClose(tf.reduce_max(box_weights), 1.0) - self.assertAllClose(tf.reduce_min(box_weights), 0.0) + self.assertAllClose(np.max(ops.convert_to_numpy(cls_weights)), 1.0) + self.assertAllClose(np.min(ops.convert_to_numpy(cls_weights)), 0.0) + self.assertAllClose(np.max(ops.convert_to_numpy(box_weights)), 1.0) + self.assertAllClose(np.min(ops.convert_to_numpy(box_weights)), 0.0) diff --git a/keras_cv/layers/object_detection/sampling.py b/keras_cv/layers/object_detection/sampling.py index ce1674bfa4..0cfc36d489 100644 --- a/keras_cv/layers/object_detection/sampling.py +++ b/keras_cv/layers/object_detection/sampling.py @@ -12,12 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
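
The balanced_sample changes that follow port the function to backend-neutral ops while keeping the same sampling trick: score positives near 1, negatives near 0.5, add a small random jitter, and take top-k twice. A NumPy-only sketch of that idea (not the library code; shapes and seed are illustrative):

import numpy as np

rng = np.random.default_rng(0)

positive = np.array([True, False, False, True, False, False])
negative = ~positive
num_samples, positive_fraction = 4, 0.5

# Positives score near 1, negatives near 0.5, everything else 0; the jitter
# randomizes which members of each group win the top-k.
values = np.where(positive, 1.0 + rng.uniform(-0.2, 0.2, positive.shape), 0.0)
values = np.where(negative, 0.5 + rng.uniform(-0.2, 0.2, negative.shape), values)

num_pos = int(num_samples * positive_fraction)
pos_idx = np.argsort(-values)[:num_pos]
values[pos_idx] = 0.0          # don't pick the same index twice
values[positive] = 0.0         # drop any unsampled extra positives
neg_idx = np.argsort(-values)[: num_samples - num_pos]

indicators = np.zeros(positive.shape, dtype=np.float32)
indicators[np.concatenate([pos_idx, neg_idx])] = 1.0
indicators[~(positive | negative)] = 0.0  # ignore entries that matched neither
print(indicators)  # two positives and two negatives are marked with 1.0
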
-import tensorflow as tf +from keras_cv.backend import ops +from keras_cv.backend import random def balanced_sample( - positive_matches: tf.Tensor, - negative_matches: tf.Tensor, + positive_matches, + negative_matches, num_samples: int, positive_fraction: float, ): @@ -40,7 +41,7 @@ def balanced_sample( indicating the index is not sampled. """ - N = positive_matches.get_shape().as_list()[-1] + N = ops.shape(positive_matches)[-1] if N < num_samples: raise ValueError( "passed in {positive_matches.shape} has less element than " @@ -48,35 +49,37 @@ def balanced_sample( ) # random_val = tf.random.uniform(tf.shape(positive_matches), minval=0., # maxval=1.) - zeros = tf.zeros_like(positive_matches, dtype=tf.float32) - ones = tf.ones_like(positive_matches, dtype=tf.float32) - ones_rand = ones + tf.random.uniform(ones.shape, minval=-0.2, maxval=0.2) - halfs = 0.5 * tf.ones_like(positive_matches, dtype=tf.float32) - halfs_rand = halfs + tf.random.uniform(halfs.shape, minval=-0.2, maxval=0.2) + zeros = ops.zeros_like(positive_matches, dtype="float32") + ones = ops.ones_like(positive_matches, dtype="float32") + ones_rand = ones + random.uniform(ops.shape(ones), minval=-0.2, maxval=0.2) + halfs = 0.5 * ops.ones_like(positive_matches, dtype="float32") + halfs_rand = halfs + random.uniform( + ops.shape(halfs), minval=-0.2, maxval=0.2 + ) values = zeros - values = tf.where(positive_matches, ones_rand, values) - values = tf.where(negative_matches, halfs_rand, values) + values = ops.where(positive_matches, ones_rand, values) + values = ops.where(negative_matches, halfs_rand, values) num_pos_samples = int(num_samples * positive_fraction) - valid_matches = tf.logical_or(positive_matches, negative_matches) + valid_matches = ops.logical_or(positive_matches, negative_matches) # this might contain negative samples as well - _, positive_indices = tf.math.top_k(values, k=num_pos_samples) - selected_indicators = tf.cast( - tf.reduce_sum(tf.one_hot(positive_indices, depth=N), axis=-2), tf.bool + _, positive_indices = ops.top_k(values, k=num_pos_samples) + selected_indicators = ops.cast( + ops.sum(ops.one_hot(positive_indices, N), axis=-2), dtype="bool" ) # setting all selected samples to zeros - values = tf.where(selected_indicators, zeros, values) + values = ops.where(selected_indicators, zeros, values) # setting all excessive positive matches to zeros as well - values = tf.where(positive_matches, zeros, values) + values = ops.where(positive_matches, zeros, values) num_neg_samples = num_samples - num_pos_samples - _, negative_indices = tf.math.top_k(values, k=num_neg_samples) - selected_indices = tf.concat([positive_indices, negative_indices], axis=-1) - selected_indicators = tf.reduce_sum( - tf.one_hot(selected_indices, depth=N), axis=-2 + _, negative_indices = ops.top_k(values, k=num_neg_samples) + selected_indices = ops.concatenate( + [positive_indices, negative_indices], axis=-1 ) - selected_indicators = tf.minimum( - selected_indicators, tf.ones_like(selected_indicators) + selected_indicators = ops.sum(ops.one_hot(selected_indices, N), axis=-2) + selected_indicators = ops.minimum( + selected_indicators, ops.ones_like(selected_indicators) ) - selected_indicators = tf.where( - valid_matches, selected_indicators, tf.zeros_like(selected_indicators) + selected_indicators = ops.where( + valid_matches, selected_indicators, ops.zeros_like(selected_indicators) ) return selected_indicators diff --git a/keras_cv/layers/object_detection/sampling_test.py b/keras_cv/layers/object_detection/sampling_test.py index 
9a3fb0aa1e..5394e2e6b3 100644 --- a/keras_cv/layers/object_detection/sampling_test.py +++ b/keras_cv/layers/object_detection/sampling_test.py @@ -12,17 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. -import pytest -import tensorflow as tf +import numpy as np +from keras_cv.backend import ops from keras_cv.layers.object_detection.sampling import balanced_sample from keras_cv.tests.test_case import TestCase -@pytest.mark.tf_keras_only class BalancedSamplingTest(TestCase): def test_balanced_sampling(self): - positive_matches = tf.constant( + positive_matches = np.array( [ True, False, @@ -36,7 +35,7 @@ def test_balanced_sampling(self): False, ] ) - negative_matches = tf.constant( + negative_matches = np.array( [False, True, True, True, True, True, True, True, True, True] ) num_samples = 5 @@ -48,7 +47,7 @@ def test_balanced_sampling(self): self.assertAllClose(res[0], 1) def test_balanced_batched_sampling(self): - positive_matches = tf.constant( + positive_matches = np.array( [ [ True, @@ -76,7 +75,7 @@ def test_balanced_batched_sampling(self): ], ] ) - negative_matches = tf.constant( + negative_matches = np.array( [ [False, True, True, True, True, True, True, True, True, True], [True, True, True, True, True, True, False, True, True, True], @@ -95,7 +94,7 @@ def test_balanced_batched_sampling(self): self.assertAllClose(res[1][6], 1) def test_balanced_sampling_over_positive_fraction(self): - positive_matches = tf.constant( + positive_matches = np.array( [ True, False, @@ -109,7 +108,7 @@ def test_balanced_sampling_over_positive_fraction(self): False, ] ) - negative_matches = tf.constant( + negative_matches = np.array( [False, True, True, True, True, True, True, True, True, True] ) num_samples = 5 @@ -121,7 +120,7 @@ def test_balanced_sampling_over_positive_fraction(self): self.assertAllClose(res[0], 1) def test_balanced_sampling_under_positive_fraction(self): - positive_matches = tf.constant( + positive_matches = np.array( [ True, False, @@ -135,7 +134,7 @@ def test_balanced_sampling_under_positive_fraction(self): False, ] ) - negative_matches = tf.constant( + negative_matches = np.array( [False, True, True, True, True, True, True, True, True, True] ) num_samples = 5 @@ -145,10 +144,11 @@ def test_balanced_sampling_under_positive_fraction(self): ) # no positive is chosen self.assertAllClose(res[0], 0) - self.assertAllClose(tf.reduce_sum(res), 5) + print(res) + self.assertAllClose(np.sum(ops.convert_to_numpy(res)), 5) def test_balanced_sampling_over_num_samples(self): - positive_matches = tf.constant( + positive_matches = np.array( [ True, False, @@ -162,7 +162,7 @@ def test_balanced_sampling_over_num_samples(self): False, ] ) - negative_matches = tf.constant( + negative_matches = np.array( [False, True, True, True, True, True, True, True, True, True] ) # users want to get 20 samples, but only 10 are available @@ -177,7 +177,7 @@ def test_balanced_sampling_over_num_samples(self): ) def test_balanced_sampling_no_positive(self): - positive_matches = tf.constant( + positive_matches = np.array( [ False, False, @@ -192,7 +192,7 @@ def test_balanced_sampling_no_positive(self): ] ) # the rest are neither positive nor negative, but ignored matches - negative_matches = tf.constant( + negative_matches = np.array( [False, False, True, False, False, True, False, False, True, False] ) num_samples = 5 @@ -204,11 +204,11 @@ def test_balanced_sampling_no_positive(self): self.assertAllClose(res, [0, 0, 1, 0, 0, 1, 0, 0, 1, 0]) def 
test_balanced_sampling_no_negative(self): - positive_matches = tf.constant( + positive_matches = np.array( [True, True, False, False, False, False, False, False, False, False] ) # 2-9 indices are neither positive nor negative, they're ignored matches - negative_matches = tf.constant([False] * 10) + negative_matches = np.array([False] * 10) num_samples = 5 positive_fraction = 0.5 res = balanced_sample( @@ -218,11 +218,13 @@ def test_balanced_sampling_no_negative(self): self.assertAllClose(res, [1, 1, 0, 0, 0, 0, 0, 0, 0, 0]) def test_balanced_sampling_many_samples(self): - positive_matches = tf.random.uniform( - [2, 1000], minval=0, maxval=1, dtype=tf.float32 + positive_matches = np.random.uniform( + size=[2, 1000], + low=0, + high=1, ) positive_matches = positive_matches > 0.98 - negative_matches = tf.logical_not(positive_matches) + negative_matches = np.logical_not(positive_matches) num_samples = 256 positive_fraction = 0.25 _ = balanced_sample(