diff --git a/.gitignore b/.gitignore index 6a59b32803..68d68189bd 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ __pycache__/ .vscode/ .devcontainer/ .coverage +.history diff --git a/keras_cv/backend/__init__.py b/keras_cv/backend/__init__.py index da703722b9..7440acbd38 100644 --- a/keras_cv/backend/__init__.py +++ b/keras_cv/backend/__init__.py @@ -76,6 +76,7 @@ from keras_cv.backend import config # noqa: E402 from keras_cv.backend import ops # noqa: E402 +from keras_cv.backend import random # noqa: E402 from keras_cv.backend import tf_ops # noqa: E402 diff --git a/keras_cv/backend/random.py b/keras_cv/backend/random.py new file mode 100644 index 0000000000..21d4b08c7d --- /dev/null +++ b/keras_cv/backend/random.py @@ -0,0 +1,20 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_cv.backend.config import multi_backend + +if multi_backend(): + from keras_core.random import * # noqa: F403, F401 +else: + from keras_core.src.backend.tensorflow.random import * # noqa: F403, F401 diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index c8b01f2769..342a942f64 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -19,6 +19,9 @@ from keras_cv.layers.augmenter import Augmenter from keras_cv.layers.feature_pyramid import FeaturePyramid from keras_cv.layers.fusedmbconv import FusedMBConvBlock +from keras_cv.layers.hierarchical_transformer_encoder import ( + HierarchicalTransformerEncoder, +) from keras_cv.layers.mbconv import MBConvBlock from keras_cv.layers.object_detection.anchor_generator import AnchorGenerator from keras_cv.layers.object_detection.box_matcher import BoxMatcher @@ -32,6 +35,9 @@ CenterNetLabelEncoder, ) from keras_cv.layers.object_detection_3d.voxelization import DynamicVoxelization +from keras_cv.layers.overlapping_patching_embedding import ( + OverlappingPatchingAndEmbedding, +) from keras_cv.layers.preprocessing.aug_mix import AugMix from keras_cv.layers.preprocessing.auto_contrast import AutoContrast from keras_cv.layers.preprocessing.base_image_augmentation_layer import ( @@ -124,6 +130,9 @@ from keras_cv.layers.regularization.dropblock_2d import DropBlock2D from keras_cv.layers.regularization.squeeze_excite import SqueezeAndExcite2D from keras_cv.layers.regularization.stochastic_depth import StochasticDepth +from keras_cv.layers.segformer_multihead_attention import ( + SegFormerMultiheadAttention, +) from keras_cv.layers.spatial_pyramid import SpatialPyramidPooling from keras_cv.layers.transformer_encoder import TransformerEncoder from keras_cv.layers.vit_layers import PatchingAndEmbedding diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py new file mode 100644 index 0000000000..ee67a17b56 --- /dev/null +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -0,0 +1,140 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.layers.regularization.drop_path import DropPath +from keras_cv.layers.segformer_multihead_attention import ( + SegFormerMultiheadAttention, +) + + +@keras_cv_export("keras_cv.layers.HierarchicalTransformerEncoder") +class HierarchicalTransformerEncoder(keras.layers.Layer): + """ + Hierarchical transformer encoder block implementation as a Keras Layer. + The layer uses `SegFormerMultiheadAttention` as a `MultiHeadAttention` + alternative for computational efficiency, and is meant to be used + within the SegFormer architecture. + + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 + - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 + + Args: + project_dim: integer, the dimensionality of the projection of the + encoder, and output of the `SegFormerMultiheadAttention` layer. + Due to the residual addition the input dimensionality has to be + equal to the output dimensionality. + num_heads: integer, the number of heads for the + `SegFormerMultiheadAttention` layer. + drop_prob: float, the probability of dropping a random + sample using the `DropPath` layer. Defaults to `0.0`. + layer_norm_epsilon: float, the epsilon for + `LayerNormalization` layers. Defaults to `1e-06` + sr_ratio: integer, the ratio to use within + `SegFormerMultiheadAttention`. If set to > 1, a `Conv2D` + layer is used to reduce the length of the sequence. Defaults to `1`. 
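For reference: besides the attention branch described above, each encoder block has a feed-forward branch implemented by the `MixFFN` inner class at the bottom of this file. A minimal standalone sketch of what it computes follows; the 3x3 depthwise convolution between the two dense projections mixes tokens spatially, which is how SegFormer supplies positional information without explicit positional embeddings. The `mix_ffn` helper name and the example sizes are illustrative only, not part of the API:

```python
import numpy as np

from keras_cv.backend import keras
from keras_cv.backend import ops


def mix_ffn(x, channels, mid_channels):
    """Illustrative stand-in for the `MixFFN` inner class defined below:
    Dense -> reshape to (B, H, W, C) -> 3x3 DepthwiseConv2D -> flatten -> GELU -> Dense.
    H and W are recovered from the sequence length, so it must be a perfect square."""
    x = keras.layers.Dense(mid_channels)(x)
    seq_len = int(x.shape[1])
    h = w = int(seq_len**0.5)  # assumes a square feature map (H == W)
    x = ops.reshape(x, (-1, h, w, mid_channels))
    x = keras.layers.DepthwiseConv2D(kernel_size=3, padding="same")(x)
    x = ops.reshape(x, (-1, seq_len, mid_channels))
    x = ops.nn.gelu(x)
    return keras.layers.Dense(channels)(x)


tokens = np.random.uniform(size=(1, 56 * 56, 32)).astype("float32")
print(mix_ffn(tokens, channels=32, mid_channels=128).shape)  # (1, 3136, 32)
```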
+ + Basic usage: + + ``` + project_dim = 1024 + num_heads = 4 + patch_size = 16 + + encoded_patches = keras_cv.layers.OverlappingPatchingAndEmbedding( + project_dim=project_dim, patch_size=patch_size)(img_batch) + + trans_encoded = keras_cv.layers.HierarchicalTransformerEncoder(project_dim=project_dim, + num_heads=num_heads, + sr_ratio=1)(encoded_patches) + + print(trans_encoded.shape) # (1, 3136, 1024) + ``` + """ + + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.project_dim = project_dim + self.num_heads = num_heads + self.drop_prop = drop_prob + + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.attn = SegFormerMultiheadAttention( + project_dim, num_heads, sr_ratio + ) + self.drop_path = DropPath(drop_prob) + self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.mlp = self.MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + ) + + def build(self, input_shape): + super().build(input_shape) + self.H = ops.sqrt(ops.cast(input_shape[1], "float32")) + self.W = ops.sqrt(ops.cast(input_shape[2], "float32")) + + def call(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + def get_config(self): + config = super().get_config() + config.update( + { + "mlp": keras.saving.serialize_keras_object(self.mlp), + "project_dim": self.project_dim, + "num_heads": self.num_heads, + "drop_prop": self.drop_prop, + } + ) + return config + + class MixFFN(keras.layers.Layer): + def __init__(self, channels, mid_channels): + super().__init__() + self.fc1 = keras.layers.Dense(mid_channels) + self.dwconv = keras.layers.DepthwiseConv2D( + kernel_size=3, + strides=1, + padding="same", + ) + self.fc2 = keras.layers.Dense(channels) + + def call(self, x): + x = self.fc1(x) + shape = ops.shape(x) + H, W = int(math.sqrt(shape[1])), int(math.sqrt(shape[1])) + B, C = shape[0], shape[2] + x = ops.reshape(x, (B, H, W, C)) + x = self.dwconv(x) + x = ops.reshape(x, (B, -1, C)) + x = ops.nn.gelu(x) + x = self.fc2(x) + return x diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py new file mode 100644 index 0000000000..69060087ec --- /dev/null +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -0,0 +1,85 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops + + +@keras_cv_export("keras_cv.layers.OverlappingPatchingAndEmbedding") +class OverlappingPatchingAndEmbedding(keras.layers.Layer): + def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): + """ + Overlapping Patching and Embedding layer. Differs from `PatchingAndEmbedding` + in that the patch size does not affect the sequence length. It's fully derived + from the `stride` parameter. 
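For reference, a quick sketch of the sequence-length arithmetic behind the claim above: with `padding="same"` in the projection below, the token count depends only on `stride`, not on `patch_size`. The input sizes are assumed for illustration:

```python
import math


def num_patches(height, width, stride):
    # Conv2D with padding="same" yields ceil(dim / stride) positions per axis,
    # independent of the kernel (patch) size.
    return math.ceil(height / stride) * math.ceil(width / stride)


print(num_patches(224, 224, stride=4))  # 3136, matching the (1, 3136, ...) shapes above
print(num_patches(224, 224, stride=2))  # 12544 -- halving the stride quadruples the tokens
```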
Additionally, no positional embedding is done + as part of the layer - only a projection using a `Conv2D` layer. + + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 + - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 + + Args: + project_dim: integer, the dimensionality of the projection. + Defaults to `32`. + patch_size: integer, the size of the patches to encode. + Defaults to `7`. + stride: integer, the stride to use for the patching before + projection. Defaults to `4`. + + Basic usage: + + ``` + project_dim = 1024 + patch_size = 16 + + encoded_patches = keras_cv.layers.OverlappingPatchingAndEmbedding( + project_dim=project_dim, patch_size=patch_size)(img_batch) + + print(encoded_patches.shape) # (1, 3136, 1024) + ``` + """ + super().__init__(**kwargs) + + self.project_dim = project_dim + self.patch_size = patch_size + self.stride = stride + + self.proj = keras.layers.Conv2D( + filters=project_dim, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = x.shape + x = ops.reshape(x, (-1, shape[1] * shape[2], shape[3])) + x = self.norm(x) + return x + + def get_config(self): + config = super().get_config() + config.update( + { + "project_dim": self.project_dim, + "patch_size": self.patch_size, + "stride": self.stride, + } + ) + return config diff --git a/keras_cv/layers/regularization/drop_path.py b/keras_cv/layers/regularization/drop_path.py index e254f29493..4475e2365f 100644 --- a/keras_cv/layers/regularization/drop_path.py +++ b/keras_cv/layers/regularization/drop_path.py @@ -12,13 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. -from tensorflow import keras - from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.backend import random @keras_cv_export("keras_cv.layers.DropPath") -class DropPath(keras.__internal__.layers.BaseRandomLayer): +class DropPath(keras.layers.Layer): """ Implements the DropPath layer. DropPath randomly drops samples during training with a probability of `rate`.
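For reference, the `call()` hunk below replaces `keras.backend.random_bernoulli` with a backend-agnostic mask built from `random.uniform`. A minimal NumPy sketch of the equivalent computation, with shapes assumed for illustration:

```python
import numpy as np

rate = 0.5
x = np.ones((8, 16, 16, 4), dtype="float32")  # (batch, ..., features)

# One keep/drop decision per sample, broadcast over all remaining axes.
drop_map_shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # (8, 1, 1, 1)
drop_map = (np.random.uniform(size=drop_map_shape) > rate).astype(x.dtype)

# Surviving samples are scaled by 1 / (1 - rate) so the expected activation is unchanged.
out = x / (1.0 - rate) * drop_map
print(out.shape)       # (8, 16, 16, 4)
print(np.unique(out))  # subset of {0.0, 2.0} for rate=0.5
```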
Note that this layer drops individual @@ -47,7 +48,7 @@ class DropPath(keras.__internal__.layers.BaseRandomLayer): """ # noqa: E501 def __init__(self, rate=0.5, seed=None, **kwargs): - super().__init__(seed=seed, **kwargs) + super().__init__(**kwargs) self.rate = rate self.seed = seed @@ -55,12 +56,13 @@ def call(self, x, training=None): if self.rate == 0.0 or not training: return x else: - keep_prob = 1 - self.rate - drop_map_shape = (x.shape[0],) + (1,) * (len(x.shape) - 1) - drop_map = keras.backend.random_bernoulli( - drop_map_shape, p=keep_prob, seed=self.seed + batch_size = x.shape[0] or ops.shape(x)[0] + drop_map_shape = (batch_size,) + (1,) * (len(x.shape) - 1) + drop_map = ops.cast( + random.uniform(drop_map_shape, seed=self.seed) > self.rate, + x.dtype, ) - x = x / keep_prob + x = x / (1.0 - self.rate) x = x * drop_map return x diff --git a/keras_cv/layers/regularization/drop_path_test.py b/keras_cv/layers/regularization/drop_path_test.py index 22f63b5223..00b4b790f0 100644 --- a/keras_cv/layers/regularization/drop_path_test.py +++ b/keras_cv/layers/regularization/drop_path_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np +import pytest import tensorflow as tf from keras_cv.layers import DropPath @@ -23,7 +25,7 @@ class DropPathTest(TestCase): def test_input_unchanged_in_eval_mode(self): layer = DropPath(rate=0.5, seed=42) - inputs = tf.random.uniform(self.FEATURE_SHAPE) + inputs = np.random.uniform(size=self.FEATURE_SHAPE) outputs = layer(inputs, training=False) @@ -31,7 +33,7 @@ def test_input_unchanged_in_eval_mode(self): def test_input_unchanged_with_rate_equal_to_zero(self): layer = DropPath(rate=0, seed=42) - inputs = tf.random.uniform(self.FEATURE_SHAPE) + inputs = np.random.uniform(size=self.FEATURE_SHAPE) outputs = layer(inputs, training=True) @@ -39,7 +41,7 @@ def test_input_unchanged_with_rate_equal_to_zero(self): def test_input_gets_partially_zeroed_out_in_train_mode(self): layer = DropPath(rate=0.2, seed=42) - inputs = tf.random.uniform(self.FEATURE_SHAPE) + inputs = np.random.uniform(size=self.FEATURE_SHAPE) outputs = layer(inputs, training=True) @@ -48,9 +50,11 @@ def test_input_gets_partially_zeroed_out_in_train_mode(self): self.assertGreaterEqual(non_zeros_inputs, non_zeros_outputs) + # Because randomness is inconsistent across backends, we just test with 1. + @pytest.mark.tf_keras_only def test_strict_input_gets_partially_zeroed_out_in_train_mode(self): - layer = DropPath(rate=0.5, seed=42) - inputs = tf.random.uniform(self.FEATURE_SHAPE) + layer = DropPath(rate=0.5, seed=10) + inputs = np.random.uniform(size=self.FEATURE_SHAPE) total_non_zero_inputs = 0 total_non_zero_outputs = 0 @@ -66,6 +70,6 @@ def test_strict_input_gets_partially_zeroed_out_in_train_mode(self): self.assertAllInRange( total_non_zero_outputs, - int(0.49 * tf.cast(total_non_zero_inputs, tf.float32)), - int(0.51 * tf.cast(total_non_zero_inputs, tf.float32)), + int(0.40 * tf.cast(total_non_zero_inputs, tf.float32)), + int(0.60 * tf.cast(total_non_zero_inputs, tf.float32)), ) diff --git a/keras_cv/layers/segformer_multihead_attention.py b/keras_cv/layers/segformer_multihead_attention.py new file mode 100644 index 0000000000..203773d4ea --- /dev/null +++ b/keras_cv/layers/segformer_multihead_attention.py @@ -0,0 +1,132 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops + + +@keras_cv_export("keras_cv.layers.SegFormerMultiheadAttention") +class SegFormerMultiheadAttention(keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + """ + Efficient MultiHeadAttention implementation as a Keras layer. + A huge bottleneck in scaling transformers is the self-attention layer + with an O(n^2) complexity. + + SegFormerMultiheadAttention performs a sequence reduction (SR) operation + with a given ratio, to reduce the sequence length before performing key and value projections, + reducing the O(n^2) complexity to O(n^2/R) where R is the sequence reduction ratio. + + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 + - [NVlabs' official implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 + - [@sithu31296's reimplementation](https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py) # noqa: E501 + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/efficient_attention.py) # noqa: E501 + + Args: + project_dim: integer, the dimensionality of the projection + of the `SegFormerMultiheadAttention` layer. + num_heads: integer, the number of heads to use in the + attention computation. + sr_ratio: integer, the sequence reduction ratio to perform + on the sequence before key and value projections. 
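For reference, a shape walk-through of the sequence reduction performed in `call()` below, using assumed sizes (a 224x224 input patched at stride 4, `sr_ratio=8`), to show where the quadratic attention cost shrinks:

```python
# N tokens from a 224x224 input patched at stride 4; sizes assumed for illustration.
N, R = 56 * 56, 8

# Queries keep the full sequence: q has length N.
# Keys/values are taken after a Conv2D with kernel = stride = sr_ratio, so the
# 56x56 map becomes 7x7 and their sequence length is N / R**2.
reduced = N // R**2
print("attention scores per head:", (N, reduced))  # (3136, 49) instead of (3136, 3136)
print("relative cost:", (N * reduced) / (N * N))   # 0.015625, i.e. 64x fewer scores
```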
+ + Basic usage: + + ``` + tensor = tf.random.uniform([1, 196, 32]) + output = keras_cv.layers.SegFormerMultiheadAttention(project_dim=768, + num_heads=2, + sr_ratio=4)(tensor) + print(output.shape) # (1, 196, 32) + ``` + """ + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x): + input_shape = ops.shape(x) + H, W = int(math.sqrt(input_shape[1])), int(math.sqrt(input_shape[1])) + B, C = input_shape[0], input_shape[2] + + q = self.q(x) + q = ops.reshape( + q, + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + q = ops.transpose(q, [0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = ops.reshape( + ops.transpose(x, [0, 2, 1]), + (B, H, W, C), + ) + x = self.sr(x) + x = ops.reshape(x, [input_shape[0], input_shape[2], -1]) + x = ops.transpose(x, [0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = ops.transpose( + ops.reshape( + k, + [B, -1, self.num_heads, C // self.num_heads], + ), + [0, 2, 1, 3], + ) + + v = ops.transpose( + ops.reshape( + v, + [B, -1, self.num_heads, C // self.num_heads], + ), + [0, 2, 1, 3], + ) + + attn = (q @ ops.transpose(k, [0, 1, 3, 2])) * self.scale + attn = ops.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = ops.reshape( + ops.transpose(attn, [0, 2, 1, 3]), + [input_shape[0], input_shape[1], input_shape[2]], + ) + + x = self.proj(attn) + return x diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index 4191c07575..9c83a3891a 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -112,6 +112,27 @@ from keras_cv.models.backbones.efficientnet_v2.efficientnet_v2_aliases import ( EfficientNetV2SBackbone, ) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB0Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB1Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB2Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB3Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB4Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB5Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTBackbone, +) from keras_cv.models.backbones.mobilenet_v3.mobilenet_v3_aliases import ( MobileNetV3LargeBackbone, ) @@ -166,5 +187,12 @@ YOLOV8Detector, ) from keras_cv.models.segmentation import DeepLabV3Plus +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormer +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB0 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB1 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB2 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB3 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB4 +from 
keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB5 from keras_cv.models.stable_diffusion import StableDiffusion from keras_cv.models.stable_diffusion import StableDiffusionV2 diff --git a/keras_cv/models/backbones/mix_transformer/__init__.py b/keras_cv/models/backbones/mix_transformer/__init__.py new file mode 100644 index 0000000000..3992ffb59a --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py new file mode 100644 index 0000000000..7c7ea6a8b6 --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py @@ -0,0 +1,262 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( + MiTBackbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets, +) +from keras_cv.utils.python_utils import classproperty + +ALIAS_DOCSTRING = """MiT model. + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning](https://keras.io/guides/transfer_learning/). + + Args: + include_rescaling: bool, whether to rescale the inputs. If set to + True, inputs will be passed through a `Rescaling(scale=1 / 255)` + layer. Defaults to True. + input_shape: optional shape tuple, defaults to (None, None, 3). + input_tensor: optional Keras tensor (i.e., output of `layers.Input()`) + to use as image input for the model. 
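For reference, these aliases are typically consumed as feature extractors through `pyramid_level_inputs` (populated by `MiTBackbone` further down). A sketch of pulling out the multi-scale features; it assumes, as `get_feature_extractor` does for the SegFormer head, that the stored names resolve to layers of the functional model, and the printed shapes correspond to MiT-B0 at 224x224:

```python
import numpy as np

import keras_cv
from keras_cv.backend import keras

backbone = keras_cv.models.MiTB0Backbone()
# Maps "P1".."P4" to the names of the per-stage output layers.
extractor = keras.Model(
    backbone.inputs,
    {
        level: backbone.get_layer(name).output
        for level, name in backbone.pyramid_level_inputs.items()
    },
)
features = extractor(np.ones((1, 224, 224, 3), dtype="float32"))
for level, feature in features.items():
    print(level, feature.shape)
# P1 (1, 56, 56, 32), P2 (1, 28, 28, 64), P3 (1, 14, 14, 160), P4 (1, 7, 7, 256)
```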
+ + Examples: + ```python + input_data = tf.ones(shape=(8, 224, 224, 3)) + + # Randomly initialized backbone + model = {name}Backbone() + output = model(input_data) + ``` +""" # noqa: E501 + + +class MiTB0Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(224, 224, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("mit_b0", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "mit_b0_imagenet": copy.deepcopy( + backbone_presets["mit_b0_imagenet"] + ), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class MiTB1Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(224, 224, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("mit_b1", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB2Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(224, 224, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("mit_b2", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB3Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(224, 224, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("mit_b3", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB4Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(224, 224, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("mit_b4", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB5Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(224, 224, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } 
+ ) + return MiTBackbone.from_preset("mit_b5", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +setattr( + MiTB0Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB0"), +) + +setattr( + MiTB1Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB1"), +) + +setattr( + MiTB2Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB2"), +) + +setattr( + MiTB3Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB3"), +) + +setattr( + MiTB4Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB4"), +) + +setattr( + MiTB5Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB5"), +) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py new file mode 100644 index 0000000000..bf6a1a6ec2 --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -0,0 +1,188 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MiT backbone model. + +References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) + - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/models/classification/mix_transformer/mit_tf.py) + - [Based on the NVlabs' official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) + - [Inspired by @sithu31296's reimplementation](https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py) +""" # noqa: E501 + +import copy + +import numpy as np + +from keras_cv import layers as cv_layers +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets_with_weights, +) +from keras_cv.utils.python_utils import classproperty + + +@keras_cv_export("keras_cv.models.MiTBackbone") +class MiTBackbone(Backbone): + def __init__( + self, + include_rescaling, + depths, + input_shape=(224, 224, 3), + input_tensor=None, + embedding_dims=None, + **kwargs, + ): + """A Keras model implementing the MixTransformer architecture to be + used as a backbone for the SegFormer architecture. 
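For reference, a condensed, runnable sketch of what one of the four stages assembled in `__init__` below does: overlapping patch embedding, a stack of hierarchical encoders, a layer norm, and a reshape back to a spatial map that feeds the next stage and the feature pyramid. The `mit_stage` helper and the MiT-B0-style hyperparameters are illustrative only:

```python
import numpy as np

from keras_cv import layers as cv_layers
from keras_cv.backend import keras


def mit_stage(x, project_dim, depth, num_heads, sr_ratio, patch_size, stride):
    """Sketch of one MiTBackbone stage: embed -> `depth` encoders -> norm -> spatial map."""
    new_h, new_w = x.shape[1] // stride, x.shape[2] // stride
    x = cv_layers.OverlappingPatchingAndEmbedding(project_dim, patch_size, stride)(x)
    for _ in range(depth):
        x = cv_layers.HierarchicalTransformerEncoder(project_dim, num_heads, sr_ratio)(x)
    x = keras.layers.LayerNormalization()(x)
    return keras.layers.Reshape((new_h, new_w, project_dim))(x)


x = np.ones((1, 224, 224, 3), dtype="float32")
x = mit_stage(x, project_dim=32, depth=2, num_heads=1, sr_ratio=8, patch_size=7, stride=4)
print(x.shape)  # (1, 56, 56, 32) -- the stage output exposed as "P1" for MiT-B0
```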
+ + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) # noqa: E501 + - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/classification/mix_transformer) # noqa: E501 + + Args: + include_rescaling: bool, whether to rescale the inputs. If set + to `True`, inputs will be passed through a `Rescaling(1/255.0)` + layer. + depths: the number of transformer encoders to be used per stage in the + network + embedding_dims: the embedding dims per hierarchical stage, used as + the levels of the feature pyramid + input_shape: optional shape tuple, defaults to (None, None, 3). + input_tensor: optional Keras tensor (i.e. output of `keras.layers.Input()`) + to use as image input for the model. + + Examples: + + Using the class with a `backbone`: + + ```python + import tensorflow as tf + import keras_cv + + images = np.ones(shape=(1, 96, 96, 3)) + labels = np.zeros(shape=(1, 96, 96, 1)) + backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0_imagenet") + + # Evaluate model + model(images) + + # Train model + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(from_logits=False), + metrics=["accuracy"], + ) + model.fit(images, labels, epochs=3) + ``` + """ + drop_path_rate = 0.1 + dpr = [x for x in np.linspace(0.0, drop_path_rate, sum(depths))] + blockwise_num_heads = [1, 2, 5, 8] + blockwise_sr_ratios = [8, 4, 2, 1] + num_stages = 4 + + cur = 0 + patch_embedding_layers = [] + transformer_blocks = [] + layer_norms = [] + + for i in range(num_stages): + patch_embed_layer = cv_layers.OverlappingPatchingAndEmbedding( + project_dim=embedding_dims[0] if i == 0 else embedding_dims[i], + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + name=f"patch_and_embed_{i}", + ) + patch_embedding_layers.append(patch_embed_layer) + + transformer_block = [ + cv_layers.HierarchicalTransformerEncoder( + project_dim=embedding_dims[i], + num_heads=blockwise_num_heads[i], + sr_ratio=blockwise_sr_ratios[i], + drop_prob=dpr[cur + k], + name=f"hierarchical_encoder_{i}_{k}", + ) + for k in range(depths[i]) + ] + transformer_blocks.append(transformer_block) + cur += depths[i] + layer_norms.append(keras.layers.LayerNormalization()) + + inputs = utils.parse_model_inputs(input_shape, input_tensor) + x = inputs + + if include_rescaling: + x = keras.layers.Rescaling(scale=1 / 255)(x) + + pyramid_level_inputs = [] + for i in range(num_stages): + # Compute new height/width after the `proj` + # call in `OverlappingPatchingAndEmbedding` + stride = 4 if i == 0 else 2 + new_height, new_width = ( + int(ops.shape(x)[1] / stride), + int(ops.shape(x)[2] / stride), + ) + + x = patch_embedding_layers[i](x) + for blk in transformer_blocks[i]: + x = blk(x) + x = layer_norms[i](x) + x = keras.layers.Reshape( + (new_height, new_width, -1), name=f"output_level_{i}" + )(x) + pyramid_level_inputs.append(utils.get_tensor_input_name(x)) + + super().__init__(inputs=inputs, outputs=x, **kwargs) + + self.depths = depths + self.embedding_dims = embedding_dims + self.include_rescaling = include_rescaling + self.input_tensor = input_tensor + self.pyramid_level_inputs = { + f"P{i + 1}": name for i, name in enumerate(pyramid_level_inputs) + } + + def get_config(self): + config = super().get_config() + config.update( + { + "depths": self.depths, + "embedding_dims": self.embedding_dims, + "include_rescaling": self.include_rescaling, + "input_shape": self.input_shape[1:], + 
"input_tensor": self.input_tensor, + } + ) + return config + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy(backbone_presets) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy(backbone_presets_with_weights) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py new file mode 100644 index 0000000000..a4c1c2a3e1 --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py @@ -0,0 +1,153 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MiT model preset configurations.""" + +backbone_presets_no_weights = { + "mit_b0": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 3321962, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv>MiTBackbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, + }, + }, + "mit_b1": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 13156554, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [2, 2, 2, 2], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, + }, + }, + "mit_b2": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 16 transformer blocks." + ), + "params": 24201418, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 6, 3], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, + }, + }, + "mit_b3": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 28 transformer blocks." + ), + "params": 44077258, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 18, 3], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, + }, + }, + "mit_b4": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 41 transformer blocks." + ), + "params": 60847818, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 8, 27, 3], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, + }, + }, + "mit_b5": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 52 transformer blocks." 
+ ), + "params": 81448138, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 6, 40, 3], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, + }, + }, +} + +backbone_presets_with_weights = { + "mit_b0_imagenet": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks. Pre-trained on ImageNet-1K and scores 69% top-1 accuracy on the validation set." # noqa: E501 + ), + "params": 3321962, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv>MiTBackbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, + }, + "weights_url": "https://storage.googleapis.com/keras-cv/models/mitb0/imagenet/classification-v0.h5", # noqa: E501 + "weights_hash": "8e0c416cd330b6fa0bcfb3a5ccc43edcbcabf6a463aee3c2a9b6a1398c207d10", # noqa: E501 + }, +} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py new file mode 100644 index 0000000000..0bc443ee92 --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py @@ -0,0 +1,100 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for loading pretrained model presets.""" + +import numpy as np +import pytest + +from keras_cv.backend import ops +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB0Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( + MiTBackbone, +) +from keras_cv.tests.test_case import TestCase + + +@pytest.mark.large +class MixTransformerPresetSmokeTest(TestCase): + """ + A smoke test for MixTransformer presets we run continuously. + This only tests the smallest weights we have available. Run with: + `pytest keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py --run_large` # noqa: E501 + """ + + def setUp(self): + self.input_batch = np.ones(shape=(2, 224, 224, 3)) + + def test_backbone_output(self): + model = MiTBackbone.from_preset("mit_b0") + model(self.input_batch) + + def test_backbone_output_with_weights(self): + model = MiTBackbone.from_preset("mit_b0_imagenet") + + # The forward pass from a preset should be stable! + # This test should catch cases where we unintentionally change our + # network code in a way that would invalidate our preset weights. + # We should only update these numbers if we are updating a weights + # file, or have found a discrepancy with the upstream source. 
+ + outputs = model(np.ones(shape=(1, 224, 224, 3))) + expected = [-0.603472, -0.180627, -1.92137, -0.004339, 2.396384] + # Keep a high tolerance, so we are robust to different hardware. + self.assertAllClose( + ops.convert_to_numpy(outputs[0, 0, 0, :5]), + expected, + atol=0.01, + rtol=0.01, + ) + + def test_applications_model_output(self): + model = MiTB0Backbone() + model(self.input_batch) + + def test_applications_model_output_with_preset(self): + model = MiTB0Backbone.from_preset("mit_b0_imagenet") + model(self.input_batch) + + def test_preset_docstring(self): + """Check we did our docstring formatting correctly.""" + for name in MiTBackbone.presets: + self.assertRegex(MiTBackbone.from_preset.__doc__, name) + + def test_unknown_preset_error(self): + # Not a preset name + with self.assertRaises(ValueError): + MiTBackbone.from_preset("mit_b0_clowntown") + + def test_load_weights_error(self): + # Try to load weights when none available + with self.assertRaises(ValueError): + MiTBackbone.from_preset("mit_b0", load_weights=True) + + +@pytest.mark.extra_large +class MixTransformerPresetFullTest(TestCase): + """ + Test the full enumeration of our preset. + This tests every preset for Mix Transformer and is only run manually. + Run with: + `pytest keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py --run_extra_large` # noqa: E501 + """ + + def test_load_mix_transformer(self): + input_data = np.ones(shape=(2, 224, 224, 3)) + for preset in MiTBackbone.presets: + model = MiTBackbone.from_preset(preset) + model(input_data) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_test.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_test.py new file mode 100644 index 0000000000..f24596bdfe --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_test.py @@ -0,0 +1,69 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest +from absl.testing import parameterized + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB0Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( + MiTBackbone, +) +from keras_cv.tests.test_case import TestCase + + +class MixTransformerBackboneTest(TestCase): + def setUp(self): + self.input_batch = np.ones(shape=(2, 224, 224, 3)) + + def test_valid_call(self): + model = MiTB0Backbone() + model(self.input_batch) + + @pytest.mark.large # Saving is slow, so mark these large. + def test_saved_model(self): + model = MiTB0Backbone( + include_rescaling=False, + ) + model_output = model(self.input_batch) + save_path = os.path.join(self.get_temp_dir(), "mit_backbone.keras") + model.save(save_path) + restored_model = keras.models.load_model(save_path) + + # Check we got the real object back. 
+ self.assertIsInstance(restored_model, MiTBackbone) + + # Check that output matches. + restored_output = restored_model(self.input_batch) + self.assertAllClose( + ops.convert_to_numpy(model_output), + ops.convert_to_numpy(restored_output), + ) + + @parameterized.named_parameters( + ("one_channel", 1), + ("four_channels", 4), + ) + def test_application_variable_input_channels(self, num_channels): + model = MiTB0Backbone( + input_shape=(224, 224, num_channels), + include_rescaling=False, + ) + self.assertEqual(model.output_shape, (None, 7, 7, 256)) diff --git a/keras_cv/models/segmentation/__init__.py b/keras_cv/models/segmentation/__init__.py index 122dc4191e..f25ee4ea7c 100644 --- a/keras_cv/models/segmentation/__init__.py +++ b/keras_cv/models/segmentation/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from keras_cv.models.segmentation.deeplab_v3_plus import DeepLabV3Plus +from keras_cv.models.segmentation.segformer import SegFormer diff --git a/keras_cv/models/segmentation/segformer/__init__.py b/keras_cv/models/segmentation/segformer/__init__.py new file mode 100644 index 0000000000..59d29582c2 --- /dev/null +++ b/keras_cv/models/segmentation/segformer/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from keras_cv.models.segmentation.segformer.segformer import SegFormer diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py new file mode 100644 index 0000000000..0985b13749 --- /dev/null +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -0,0 +1,175 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.models.segmentation.segformer.segformer_presets import ( # noqa: E501 + presets, +) +from keras_cv.models.segmentation.segformer.segformer_presets import ( # noqa: E501 + presets_with_weights, +) +from keras_cv.models.task import Task +from keras_cv.utils.python_utils import classproperty +from keras_cv.utils.train import get_feature_extractor + + +@keras_cv_export("keras_cv.models.segmentation.SegFormer") +class SegFormer(Task): + """A Keras model implementing the SegFormer architecture for semantic + segmentation. 
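For reference, the decoder wired up in `__init__` below is SegFormer's all-MLP head. A condensed sketch of the same steps on assumed MiT-B0 feature shapes (224x224 input, `projection_filters=256`, 19 classes), so the data flow is visible without tracing the functional-model code:

```python
import numpy as np

from keras_cv.backend import keras

# Assumed MiT-B0 pyramid features for a 224x224 input.
features = [
    np.ones((1, 56, 56, 32), dtype="float32"),   # P1
    np.ones((1, 28, 28, 64), dtype="float32"),   # P2
    np.ones((1, 14, 14, 160), dtype="float32"),  # P3
    np.ones((1, 7, 7, 256), dtype="float32"),    # P4
]
H, W, num_classes, projection_filters = 56, 56, 19, 256

# 1) Project every level to the same width and 2) resize it to the P1 resolution.
projected = [
    keras.layers.Resizing(H, W, interpolation="bilinear")(
        keras.layers.Dense(projection_filters)(f)
    )
    for f in features
]
# 3) Concatenate and fuse with a 1x1 conv + BN + ReLU, 4) predict per-pixel classes,
# 5) upsample back to the input resolution.
fused = keras.layers.Conv2D(projection_filters, 1, use_bias=False)(
    keras.layers.Concatenate(axis=3)(projected[::-1])
)
fused = keras.layers.Activation("relu")(keras.layers.BatchNormalization()(fused))
seg = keras.layers.Conv2D(num_classes, 1, activation="softmax")(
    keras.layers.Dropout(0.1)(fused)
)
print(keras.layers.Resizing(224, 224, interpolation="bilinear")(seg).shape)
# (1, 224, 224, 19)
```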
+ + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) # noqa: E501 + - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/segmentation/segformer) # noqa: E501 + + Args: + backbone: `keras.Model`. The backbone network for the model that is + used as a feature extractor for the SegFormer encoder. + It is *intended* to be used only with the MiT backbone model which + was created specifically for SegFormers. It should either be a + `keras_cv.models.backbones.backbone.Backbone` or a `tf.keras.Model` + that implements the `pyramid_level_inputs` property with keys + "P2", "P3", "P4", and "P5" and layer names as + values. + num_classes: int, the number of classes for the detection model, + including the background class. + projection_filters: int, number of filters in the + convolution layer projecting the concatenated features into + a segmentation map. Defaults to 256`. + + Examples: + + Using the class with a `backbone`: + + ```python + import tensorflow as tf + import keras_cv + + images = np.ones(shape=(1, 96, 96, 3)) + labels = np.zeros(shape=(1, 96, 96, 1)) + backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0_imagenet") + model = keras_cv.models.segmentation.SegFormer( + num_classes=1, backbone=backbone, + ) + + # Evaluate model + model(images) + + # Train model + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(from_logits=False), + metrics=["accuracy"], + ) + model.fit(images, labels, epochs=3) + ``` + """ + + def __init__( + self, + backbone, + num_classes, + projection_filters=256, + **kwargs, + ): + if not isinstance(backbone, keras.layers.Layer) or not isinstance( + backbone, keras.Model + ): + raise ValueError( + "Argument `backbone` must be a `keras.layers.Layer` instance " + f" or `keras.Model`. Received instead " + f"backbone={backbone} (of type {type(backbone)})." 
+ ) + + inputs = backbone.input + + feature_extractor = get_feature_extractor( + backbone, list(backbone.pyramid_level_inputs.values()) + ) + # Multi-level dictionary + features = list(feature_extractor(inputs).values()) + + # Get H and W of level one output + _, H, W, _ = features[0].shape + # Project all multi-level outputs onto the same dimensionality + # and feature map shape + multi_layer_outs = [] + for feature_dim, feature in zip(backbone.embedding_dims, features): + out = keras.layers.Dense( + projection_filters, name=f"linear_{feature_dim}" + )(feature) + out = keras.layers.Resizing(H, W, interpolation="bilinear")(out) + multi_layer_outs.append(out) + + # Concat now-equal feature maps + concatenated_outs = keras.layers.Concatenate(axis=3)( + multi_layer_outs[::-1] + ) + + # Fuse concatenated features into a segmentation map + seg = keras.Sequential( + [ + keras.layers.Conv2D( + filters=projection_filters, kernel_size=1, use_bias=False + ), + keras.layers.BatchNormalization(), + keras.layers.Activation("relu"), + ] + )(concatenated_outs) + + seg = keras.layers.Dropout(0.1)(seg) + seg = keras.layers.Conv2D( + filters=num_classes, kernel_size=1, activation="softmax" + )(seg) + + output = keras.layers.Resizing( + height=inputs.shape[1], + width=inputs.shape[2], + interpolation="bilinear", + )(seg) + + super().__init__( + inputs=inputs, + outputs=output, + **kwargs, + ) + + self.num_classes = num_classes + self.projection_filters = projection_filters + self.backbone = backbone + + def get_config(self): + config = super().get_config() + config.update( + { + "num_classes": self.num_classes, + "projection_filters": self.projection_filters, + "backbone": keras.saving.serialize_keras_object(self.backbone), + } + ) + return config + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy(presets) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy(presets_with_weights) diff --git a/keras_cv/models/segmentation/segformer/segformer_aliases.py b/keras_cv/models/segmentation/segformer/segformer_aliases.py new file mode 100644 index 0000000000..03547f60f2 --- /dev/null +++ b/keras_cv/models/segmentation/segformer/segformer_aliases.py @@ -0,0 +1,244 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_cv.models.segmentation.segformer.segformer import SegFormer +from keras_cv.models.segmentation.segformer.segformer_presets import presets +from keras_cv.utils.python_utils import classproperty + +ALIAS_DOCSTRING = """SegFormer model. + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning](https://keras.io/guides/transfer_learning/). + + Args: + backbone: a KerasCV backbone for feature extraction. + num_classes: the number of classes for segmentation, including the background class. 
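For reference, the head defined above ends in a per-pixel softmax over `num_classes`, resized back to the input resolution, so dense targets are expected as one-hot masks of shape `(H, W, num_classes)`. A minimal training sketch with assumed sizes; the tests below use `BinaryCrossentropy` on two-channel masks, while `CategoricalCrossentropy` is the usual choice for one-hot targets:

```python
import numpy as np

import keras_cv
from keras_cv.backend import keras

num_classes = 2
backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0", input_shape=(512, 512, 3))
model = keras_cv.models.SegFormer(backbone=backbone, num_classes=num_classes)

images = np.ones((1, 512, 512, 3), dtype="float32")
class_ids = np.zeros((1, 512, 512), dtype="int32")         # integer masks...
one_hot = np.eye(num_classes, dtype="float32")[class_ids]  # ...to shape (1, 512, 512, 2)

model.compile(optimizer="adam", loss=keras.losses.CategoricalCrossentropy())
model.fit(images, one_hot, epochs=1)
```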
+ + Examples: + ```python + input_data = tf.ones(shape=(8, 224, 224, 3)) + + # Randomly initialized backbone + backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0_imagenet") + segformer = keras_cv.models.SegFormer(backbone=backbone, num_classes=19) + output = model(input_data) + ``` +""" # noqa: E501 + + +class SegFormerB0(SegFormer): + def __new__( + cls, + num_classes, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b0", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b0": copy.deepcopy(presets["segformer_b0"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB1(SegFormer): + def __new__( + cls, + num_classes, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b1", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b1": copy.deepcopy(presets["segformer_b1"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB2(SegFormer): + def __new__( + cls, + num_classes, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b2", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b2": copy.deepcopy(presets["segformer_b2"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB3(SegFormer): + def __new__( + cls, + num_classes, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b3", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b3": copy.deepcopy(presets["segformer_b3"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB4(SegFormer): + def __new__( + cls, + num_classes, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b4", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b4": copy.deepcopy(presets["segformer_b4"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB5(SegFormer): + def __new__( + cls, + num_classes, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b5", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b5": copy.deepcopy(presets["segformer_b5"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that 
include + weights.""" + return cls.presets + + +setattr( + SegFormerB0, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB0"), +) + +setattr( + SegFormerB1, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB1"), +) + +setattr( + SegFormerB2, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB2"), +) + +setattr( + SegFormerB3, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB3"), +) + +setattr( + SegFormerB4, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB4"), +) + +setattr( + SegFormerB5, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB5"), +) diff --git a/keras_cv/models/segmentation/segformer/segformer_presets.py b/keras_cv/models/segmentation/segformer/segformer_presets.py new file mode 100644 index 0000000000..e19e2ec9ba --- /dev/null +++ b/keras_cv/models/segmentation/segformer/segformer_presets.py @@ -0,0 +1,105 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SegFormer model preset configurations.""" + +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets, +) + +presets_no_weights = { + "segformer_b0": { + "metadata": { + "description": ("SegFormer model with MiTB0 backbone."), + "params": 3719027, + "official_name": "SegFormerB0", + "path": "segformer_b0", + }, + "class_name": "keras_cv>SegFormer", + "config": { + "backbone": backbone_presets["mit_b0"], + }, + }, + "segformer_b1": { + "metadata": { + "description": ("SegFormer model with MiTB1 backbone."), + "params": 13682643, + "official_name": "SegFormerB1", + "path": "segformer_b1", + }, + "class_name": "keras_cv>SegFormer", + "config": {"backbone": backbone_presets["mit_b1"]}, + }, + "segformer_b2": { + "metadata": { + "description": ("SegFormer model with MiTB2 backbone."), + "params": 24727507, + "official_name": "SegFormerB2", + "path": "segformer_b2", + }, + "class_name": "keras_cv>SegFormer", + "config": {"backbone": backbone_presets["mit_b2"]}, + }, + "segformer_b3": { + "metadata": { + "description": ("SegFormer model with MiTB3 backbone."), + "params": 44603347, + "official_name": "SegFormerB3", + "path": "segformer_b3", + }, + "class_name": "keras_cv>SegFormer", + "config": {"backbone": backbone_presets["mit_b3"]}, + }, + "segformer_b4": { + "metadata": { + "description": ("SegFormer model with MiTB4 backbone."), + "params": 61373907, + "official_name": "SegFormerB4", + "path": "segformer_b4", + }, + "class_name": "keras_cv>SegFormer", + "config": {"backbone": backbone_presets["mit_b4"]}, + }, + "segformer_b5": { + "metadata": { + "description": ("SegFormer model with MiTB5 backbone."), + "params": 81974227, + "official_name": "SegFormerB5", + "path": "segformer_b5", + }, + "class_name": "keras_cv>SegFormer", + "config": {"backbone": backbone_presets["mit_b5"]}, + }, +} + +presets_with_weights = { + "segformer_b0_imagenet": { + "metadata": { + "description": ( + "SegFormer model with a pretrained MiTB0 backbone." 
+ ), + "params": 3719027, + "official_name": "SegFormerB0", + "path": "segformer_b0", + }, + "class_name": "keras_cv>SegFormer", + "config": { + "backbone": backbone_presets["mit_b0_imagenet"], + }, + }, +} + +presets = { + **presets_no_weights, + **presets_with_weights, +} diff --git a/keras_cv/models/segmentation/segformer/segformer_test.py b/keras_cv/models/segmentation/segformer/segformer_test.py new file mode 100644 index 0000000000..0990e0e88f --- /dev/null +++ b/keras_cv/models/segmentation/segformer/segformer_test.py @@ -0,0 +1,92 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest +import tensorflow as tf + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models import MiTBackbone +from keras_cv.models import SegFormer +from keras_cv.tests.test_case import TestCase + + +class SegFormerTest(TestCase): + def test_segformer_construction(self): + backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) + model = SegFormer(backbone=backbone, num_classes=1) + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(), + metrics=["accuracy"], + ) + + @pytest.mark.large + def test_segformer_call(self): + backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) + model = SegFormer(backbone=backbone, num_classes=1) + images = np.random.uniform(size=(2, 512, 512, 3)) + _ = model(images) + _ = model.predict(images) + + @pytest.mark.large + def test_weights_change(self): + target_size = [512, 512, 2] + + images = tf.ones(shape=[1] + [512, 512, 3]) + labels = tf.zeros(shape=[1] + target_size) + ds = tf.data.Dataset.from_tensor_slices((images, labels)) + ds = ds.repeat(2) + ds = ds.batch(2) + + backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) + model = SegFormer(backbone=backbone, num_classes=2) + + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(), + metrics=["accuracy"], + ) + + original_weights = model.get_weights() + model.fit(ds, epochs=1) + updated_weights = model.get_weights() + + for w1, w2 in zip(original_weights, updated_weights): + self.assertNotAllEqual(w1, w2) + self.assertFalse(ops.any(ops.isnan(w2))) + + @pytest.mark.large # Saving is slow, so mark these large. + def test_saved_model(self): + target_size = [512, 512, 3] + + backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) + model = SegFormer(backbone=backbone, num_classes=1) + + input_batch = np.ones(shape=[2] + target_size) + model_output = model(input_batch) + + save_path = os.path.join(self.get_temp_dir(), "model.keras") + model.save(save_path, save_format="keras_v3") + restored_model = keras.models.load_model(save_path) + + # Check we got the real object back. + self.assertIsInstance(restored_model, SegFormer) + + # Check that output matches. + restored_output = restored_model(input_batch) + self.assertAllClose(model_output, restored_output)
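For reviewers skimming the diff: the functional-model block at the top of this section is SegFormer's all-MLP decode head. The following standalone sketch is not part of the change; it replays the same computation eagerly on hypothetical feature maps shaped like MiT-B0's four stage outputs for a 512x512 input (strides 4/8/16/32, embedding dims 32/64/160/256). The projection width of 256 and `num_classes=19` are illustrative assumptions.

```python
import numpy as np

from keras_cv.backend import keras

# Hypothetical multi-scale features, MiT-B0-like for a 512x512 input.
features = [
    np.random.uniform(size=(1, 128, 128, 32)).astype("float32"),
    np.random.uniform(size=(1, 64, 64, 64)).astype("float32"),
    np.random.uniform(size=(1, 32, 32, 160)).astype("float32"),
    np.random.uniform(size=(1, 16, 16, 256)).astype("float32"),
]
projection_filters = 256  # assumed head width
num_classes = 19

# Project every level to a common width, then resize to the stride-4 grid.
_, H, W, _ = features[0].shape
projected = [
    keras.layers.Resizing(H, W, interpolation="bilinear")(
        keras.layers.Dense(projection_filters)(f)
    )
    for f in features
]

# Concatenate (deepest level first), fuse with 1x1 conv + BN + ReLU,
# then predict per-pixel class scores and upsample to the input size.
x = keras.layers.Concatenate(axis=-1)(projected[::-1])
x = keras.layers.Conv2D(projection_filters, kernel_size=1, use_bias=False)(x)
x = keras.layers.BatchNormalization()(x)
x = keras.layers.Activation("relu")(x)
x = keras.layers.Dropout(0.1)(x)
x = keras.layers.Conv2D(num_classes, kernel_size=1, activation="softmax")(x)
masks = keras.layers.Resizing(512, 512, interpolation="bilinear")(x)
print(masks.shape)  # (1, 512, 512, 19)
```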
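And a minimal end-to-end sketch of the workflow the new tests exercise, assuming this branch of keras_cv is installed. The optimizer, loss, input shape, and the toy all-background labels are illustrative; the `SegFormerB0` alias and `SegFormer.from_preset("segformer_b0", num_classes=...)` defined above should be equivalent shortcuts to the explicit backbone construction shown here.

```python
import numpy as np

from keras_cv.backend import keras
from keras_cv.models import MiTBackbone
from keras_cv.models import SegFormer

# Build the task model from a MiT-B0 backbone, as in segformer_test.py.
backbone = MiTBackbone.from_preset("mit_b0", input_shape=(512, 512, 3))
model = SegFormer(backbone=backbone, num_classes=2)
model.compile(
    optimizer=keras.optimizers.Adam(1e-4),
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=["accuracy"],
)

# Toy batch: random images plus one-hot per-pixel labels (all background).
images = np.random.uniform(size=(2, 512, 512, 3)).astype("float32")
labels = np.zeros((2, 512, 512, 2), dtype="float32")
labels[..., 0] = 1.0

model.fit(images, labels, epochs=1, batch_size=1)
masks = model.predict(images)  # (2, 512, 512, 2), softmax over classes
```

As in `test_saved_model`, the resulting model should round-trip through `model.save(path, save_format="keras_v3")` and `keras.models.load_model(path)`, restoring a `SegFormer` instance with matching outputs.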