From dc4189228e121246fc797975763f6af6f2416bcd Mon Sep 17 00:00:00 2001 From: David Landup Date: Thu, 13 Jul 2023 20:12:36 +0200 Subject: [PATCH 01/53] initial dump --- .../segmentation/segformer/segformer.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 keras_cv/models/segmentation/segformer/segformer.py diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py new file mode 100644 index 0000000000..c1fcc39200 --- /dev/null +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -0,0 +1,44 @@ +import tensorflow as tf + +from deepvision.layers.segformer_segmentation_head import SegFormerHead +from deepvision.utils.utils import parse_model_inputs + + +class __SegFormerTF(tf.keras.Model): + def __init__( + self, + num_classes=None, + backbone=None, + embed_dim=None, + input_shape=None, + input_tensor=None, + softmax_output=None, + **kwargs + ): + inputs = parse_model_inputs("tensorflow", input_shape, input_tensor) + x = inputs + y = backbone(x) + y = SegFormerHead( + in_dims=backbone.output_channels, + embed_dim=embed_dim, + num_classes=num_classes, + name="segformer_head", + backend="tensorflow", + )(y) + output = tf.keras.layers.Resizing( + height=x.shape[1], width=x.shape[2], interpolation="bilinear" + )(y) + if softmax_output: + output = tf.keras.layers.Activation("softmax", name="output_activation")( + output + ) + + super().__init__( + inputs=inputs, + outputs=output, + **kwargs, + ) + + self.num_classes = num_classes + self.embed_dim = embed_dim + self.softmax_output = softmax_output \ No newline at end of file From e5677e6679a25689dd65b40246821939d16c785e Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 17 Jul 2023 12:05:01 +0200 Subject: [PATCH 02/53] add all basic layers, port roughly to keras core ops --- ...ient_multihead_attention_20230717115601.py | 0 ...ient_multihead_attention_20230717115636.py | 199 ++++++++++++ ...ient_multihead_attention_20230717115647.py | 202 +++++++++++++ ...ient_multihead_attention_20230717115709.py | 148 +++++++++ ...ient_multihead_attention_20230717115745.py | 147 +++++++++ ...ient_multihead_attention_20230717115849.py | 145 +++++++++ ...ient_multihead_attention_20230717115857.py | 145 +++++++++ ...ient_multihead_attention_20230717115932.py | 146 +++++++++ ...ient_multihead_attention_20230717115958.py | 142 +++++++++ ...ient_multihead_attention_20230717120016.py | 140 +++++++++ ...ient_multihead_attention_20230717120020.py | 140 +++++++++ ...ient_multihead_attention_20230717120108.py | 140 +++++++++ ...ient_multihead_attention_20230717120147.py | 138 +++++++++ ...ient_multihead_attention_20230717120153.py | 83 ++++++ ...ient_multihead_attention_20230717120213.py | 83 ++++++ ...ical_transformer_encoder_20230717115219.py | 0 ...ical_transformer_encoder_20230717115222.py | 0 ...ical_transformer_encoder_20230717115257.py | 146 +++++++++ ...ical_transformer_encoder_20230717115327.py | 53 ++++ ...ical_transformer_encoder_20230717115542.py | 31 ++ ...ical_transformer_encoder_20230717120252.py | 32 ++ ...ical_transformer_encoder_20230717120400.py | 56 ++++ ...ical_transformer_encoder_20230717120409.py | 55 ++++ ...ical_transformer_encoder_20230717120411.py | 55 ++++ ...ical_transformer_encoder_20230717120414.py | 55 ++++ ...pping_patching_embedding_20230717115127.py | 0 ...pping_patching_embedding_20230717115248.py | 133 +++++++++ ...pping_patching_embedding_20230717115350.py | 24 ++ ...pping_patching_embedding_20230717115401.py | 22 ++ 
...pping_patching_embedding_20230717115451.py | 22 ++ ...pping_patching_embedding_20230717115505.py | 22 ++ ...pping_patching_embedding_20230717115509.py | 22 ++ ...pping_patching_embedding_20230717115518.py | 22 ++ ...mix_transformer_backbone_20230717113450.py | 0 ...mix_transformer_backbone_20230717113606.py | 158 ++++++++++ ...mix_transformer_backbone_20230717113615.py | 158 ++++++++++ ...mix_transformer_backbone_20230717113618.py | 156 ++++++++++ ...mix_transformer_backbone_20230717115045.py | 146 +++++++++ ...mix_transformer_backbone_20230717115058.py | 144 +++++++++ ...sformer_backbone_presets_20230717113721.py | 0 ...sformer_backbone_presets_20230717113912.py | 282 ++++++++++++++++++ ...sformer_backbone_presets_20230717114045.py | 126 ++++++++ ...sformer_backbone_presets_20230717114112.py | 126 ++++++++ ...sformer_backbone_presets_20230717114332.py | 114 +++++++ ...sformer_backbone_presets_20230717114420.py | 114 +++++++ ...sformer_backbone_presets_20230717114436.py | 114 +++++++ .../layers/efficient_multihead_attention.py | 83 ++++++ .../hierarchical_transformer_encoder.py | 55 ++++ .../layers/overlapping_patching_embedding.py | 22 ++ .../mix_transformer_backbone.py | 144 +++++++++ .../mix_transformer_backbone_presets.py | 114 +++++++ .../segmentation/segformer/segformer.py | 8 +- 52 files changed, 4808 insertions(+), 4 deletions(-) create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115601.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115636.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115647.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115709.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115745.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115849.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115857.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115932.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115958.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120016.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120020.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120108.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120147.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120153.py create mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120213.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115219.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115222.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115257.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115327.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115542.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120252.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120400.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120409.py create mode 100644 
.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120411.py create mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120414.py create mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115127.py create mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115248.py create mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115350.py create mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115401.py create mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115451.py create mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115505.py create mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115509.py create mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115518.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113450.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113606.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113615.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113618.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115045.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115058.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113721.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113912.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114045.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114112.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114332.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114420.py create mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114436.py create mode 100644 keras_cv/layers/efficient_multihead_attention.py create mode 100644 keras_cv/layers/hierarchical_transformer_encoder.py create mode 100644 keras_cv/layers/overlapping_patching_embedding.py create mode 100644 keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py create mode 100644 keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115601.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115601.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115636.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115636.py new file mode 100644 index 0000000000..11a54263ed --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717115636.py @@ -0,0 +1,199 @@ +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +class __EfficientMultiheadAttentionPT(nn.Module): + def 
__init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = nn.Linear(project_dim, project_dim) + self.kv = nn.Linear(project_dim, project_dim * 2) + self.proj = nn.Linear(project_dim, project_dim) + + if sr_ratio > 1: + self.sr = nn.Conv2d( + in_channels=project_dim, + out_channels=project_dim, + kernel_size=sr_ratio, + stride=sr_ratio, + padding=same_padding(sr_ratio, sr_ratio), + ) + self.norm = nn.LayerNorm(project_dim) + + def forward(self, x, H, W): + batch_size, seq_len, project_dim = x.shape + q = ( + self.q(x) + .reshape( + batch_size, + seq_len, + self.num_heads, + project_dim // self.num_heads, + ) + .permute(0, 2, 1, 3) + ) + + if self.sr_ratio > 1: + x = x.permute(0, 2, 1).reshape(batch_size, project_dim, H, W) + x = self.sr(x).reshape(batch_size, project_dim, -1).permute(0, 2, 1) + x = self.norm(x) + + k, v = ( + self.kv(x) + .reshape( + batch_size, -1, 2, self.num_heads, project_dim // self.num_heads + ) + .permute(2, 0, 3, 1, 4) + ) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + + x = (attn @ v).transpose(1, 2).reshape(batch_size, seq_len, project_dim) + x = self.proj(x) + return x + + +class __EfficientMultiheadAttentionTF(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = tf.keras.layers.Dense(project_dim) + self.k = tf.keras.layers.Dense(project_dim) + self.v = tf.keras.layers.Dense(project_dim) + self.proj = tf.keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = tf.keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = tf.keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = tf.shape(x) + + q = self.q(x) + q = tf.reshape( + q, + shape=[ + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ], + ) + + q = tf.transpose(q, [0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = tf.reshape( + tf.transpose(x, [0, 2, 1]), + shape=[input_shape[0], H, W, input_shape[2]], + ) + x = self.sr(x) + x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) + x = tf.transpose(x, [0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = tf.transpose( + tf.reshape( + k, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + v = tf.transpose( + tf.reshape( + v, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. 
+ The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115647.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115647.py new file mode 100644 index 0000000000..46e20038cf --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717115647.py @@ -0,0 +1,202 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class __EfficientMultiheadAttentionPT(nn.Module): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = nn.Linear(project_dim, project_dim) + self.kv = nn.Linear(project_dim, project_dim * 2) + self.proj = nn.Linear(project_dim, project_dim) + + if sr_ratio > 1: + self.sr = nn.Conv2d( + in_channels=project_dim, + out_channels=project_dim, + kernel_size=sr_ratio, + stride=sr_ratio, + padding=same_padding(sr_ratio, sr_ratio), + ) + self.norm = nn.LayerNorm(project_dim) + + def forward(self, x, H, W): + batch_size, seq_len, project_dim = x.shape + q = ( + self.q(x) + .reshape( + batch_size, + seq_len, + self.num_heads, + project_dim // self.num_heads, + ) + .permute(0, 2, 1, 3) + ) + + if self.sr_ratio > 1: + x = x.permute(0, 2, 1).reshape(batch_size, project_dim, H, W) + x = self.sr(x).reshape(batch_size, project_dim, -1).permute(0, 2, 1) + x = self.norm(x) + + k, v = ( + self.kv(x) + .reshape( + batch_size, -1, 2, self.num_heads, project_dim // self.num_heads + ) + .permute(2, 0, 3, 1, 4) + ) + + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + + x = (attn @ v).transpose(1, 2).reshape(batch_size, seq_len, project_dim) + x = self.proj(x) + return x + + +class __EfficientMultiheadAttentionTF(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = tf.keras.layers.Dense(project_dim) + self.k = tf.keras.layers.Dense(project_dim) + self.v = 
tf.keras.layers.Dense(project_dim) + self.proj = tf.keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = tf.keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = tf.keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = tf.shape(x) + + q = self.q(x) + q = tf.reshape( + q, + shape=[ + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ], + ) + + q = tf.transpose(q, [0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = tf.reshape( + tf.transpose(x, [0, 2, 1]), + shape=[input_shape[0], H, W, input_shape[2]], + ) + x = self.sr(x) + x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) + x = tf.transpose(x, [0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = tf.transpose( + tf.reshape( + k, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + v = tf.transpose( + tf.reshape( + v, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. 
Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115709.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115709.py new file mode 100644 index 0000000000..c758bc53c4 --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717115709.py @@ -0,0 +1,148 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class __EfficientMultiheadAttentionTF(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = tf.keras.layers.Dense(project_dim) + self.k = tf.keras.layers.Dense(project_dim) + self.v = tf.keras.layers.Dense(project_dim) + self.proj = tf.keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = tf.keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = tf.keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = tf.shape(x) + + q = self.q(x) + q = tf.reshape( + q, + shape=[ + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ], + ) + + q = tf.transpose(q, [0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = tf.reshape( + tf.transpose(x, [0, 2, 1]), + shape=[input_shape[0], H, W, input_shape[2]], + ) + x = self.sr(x) + x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) + x = tf.transpose(x, [0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = tf.transpose( + tf.reshape( + k, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + v = tf.transpose( + tf.reshape( + v, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. 
+ + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115745.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115745.py new file mode 100644 index 0000000000..2c69162376 --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717115745.py @@ -0,0 +1,147 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + shape=[ + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ], + ) + + q = tf.transpose(q, [0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = tf.reshape( + tf.transpose(x, [0, 2, 1]), + shape=[input_shape[0], H, W, input_shape[2]], + ) + x = self.sr(x) + x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) + x = tf.transpose(x, [0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = tf.transpose( + tf.reshape( + k, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + v = tf.transpose( + tf.reshape( + v, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": 
__EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115849.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115849.py new file mode 100644 index 0000000000..6efe64dc32 --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717115849.py @@ -0,0 +1,145 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = x.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]) + x = x.reshape((input_shape[0], H, W, input_shape[2])) + x = self.sr(x) + x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) + x = tf.transpose(x, [0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = tf.transpose( + tf.reshape( + k, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + v = tf.transpose( + tf.reshape( + v, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), 
+ [0, 2, 1, 3], + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. 
Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115857.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115857.py new file mode 100644 index 0000000000..a3f69e6631 --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717115857.py @@ -0,0 +1,145 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = x.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]) + x = x.reshape((input_shape[0], H, W, input_shape[2])) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = tf.transpose( + tf.reshape( + k, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + v = tf.transpose( + tf.reshape( + v, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. 
+ + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115932.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115932.py new file mode 100644 index 0000000000..2ce2c70dac --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717115932.py @@ -0,0 +1,146 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = x.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = tf.transpose( + tf.reshape( + k, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + v = tf.transpose( + tf.reshape( + v, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def 
EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115958.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115958.py new file mode 100644 index 0000000000..03ae5581af --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717115958.py @@ -0,0 +1,142 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = x.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = x.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ) + + v = tf.transpose( + tf.reshape( + v, + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], + ), + [0, 2, 1, 3], + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * 
self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120016.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120016.py new file mode 100644 index 0000000000..b27f82defd --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717120016.py @@ -0,0 +1,140 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = x.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = 
self.k(x) + v = self.v(x) + + k = x.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ) + + v = ( + x.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ), + ) + + attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. 
Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120020.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120020.py new file mode 100644 index 0000000000..7c21c3fcdc --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717120020.py @@ -0,0 +1,140 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = x.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = x.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ) + + v = ( + x.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ), + ) + + attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale + attn = tf.nn.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. 
+ + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120108.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120108.py new file mode 100644 index 0000000000..d7a6325330 --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717120108.py @@ -0,0 +1,140 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = x.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = x.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ) + + v = ( + x.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ), + ) + + attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale + attn = keras.nn.ops.softmax(attn, axis=-1) + + attn = attn @ v + attn = tf.transpose(attn, [0, 2, 1, 3]) + attn = tf.reshape( + attn, shape=[input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, 
num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120147.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120147.py new file mode 100644 index 0000000000..93dd8d79fe --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717120147.py @@ -0,0 +1,138 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = q.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = k.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ) + v = ( + v.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ), + ) + + attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale + attn = keras.nn.ops.softmax(attn, axis=-1) + + attn = 
attn @ v + attn = attn.transpose(attn, [0, 2, 1, 3]).reshape( + [input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x + + +LAYER_BACKBONES = { + "tensorflow": __EfficientMultiheadAttentionTF, + "pytorch": __EfficientMultiheadAttentionPT, +} + + +def EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="pytorch" +): + """ + `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. + The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. + + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection for the keys, values and queries + num_heads: the number of attention heads to apply + sr_ratio: the reduction ratio for the sequence length + backend: the backend framework to use + + Basic usage: + + ``` + tensor = torch.rand(1, 196, 32) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(tensor, H=14, W=14) + + print(output.shape) # torch.Size([1, 196, 32]) + + tensor = tf.random.uniform([1, 196, 32]) + output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='tensorflow')(tensor, H=14, W=14) + print(output.shape) # (1, 196, 32) + ``` + + """ + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio + ) + + return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120153.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120153.py new file mode 100644 index 0000000000..ae333c8ae4 --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717120153.py @@ -0,0 +1,83 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = shape(x) + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = q.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = k.transpose([0, 2, 1, 3]).reshape( + [ + 
input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ) + v = ( + v.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ), + ) + + attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale + attn = keras.nn.ops.softmax(attn, axis=-1) + + attn = attn @ v + attn = attn.transpose(attn, [0, 2, 1, 3]).reshape( + [input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120213.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120213.py new file mode 100644 index 0000000000..718a5b216b --- /dev/null +++ b/.history/keras_cv/layers/efficient_multihead_attention_20230717120213.py @@ -0,0 +1,83 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = x.shape + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = q.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = k.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ) + v = ( + v.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ), + ) + + attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale + attn = keras.nn.ops.softmax(attn, axis=-1) + + attn = attn @ v + attn = attn.transpose(attn, [0, 2, 1, 3]).reshape( + [input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115219.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115219.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115222.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115222.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115257.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115257.py new file mode 100644 index 0000000000..3e66db6f04 --- /dev/null +++ b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115257.py @@ -0,0 +1,146 @@ +# Copyright 2023 David Landup +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file 
except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tensorflow as tf +from torch import nn + +from deepvision.layers.efficient_attention import EfficientMultiheadAttention +from deepvision.layers.mix_ffn import MixFFN +from deepvision.layers.stochasticdepth import StochasticDepth + + +class __HierarchicalTransformerEncoderPT(nn.Module): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + name=None, + ): + super().__init__() + self.norm1 = nn.LayerNorm(project_dim) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio + ) + self.drop_path = StochasticDepth(drop_prob, backend="pytorch") + self.norm2 = nn.LayerNorm(project_dim, eps=layer_norm_epsilon) + self.mlp = MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + backend="pytorch", + ) + + def forward(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class __HierarchicalTransformerEncoderTF(tf.keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = tf.keras.layers.LayerNormalization( + epsilon=layer_norm_epsilon + ) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="tensorflow" + ) + self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") + self.norm2 = tf.keras.layers.LayerNormalization( + epsilon=layer_norm_epsilon + ) + self.mlp = MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + backend="tensorflow", + ) + + def call(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +LAYER_BACKBONES = { + "tensorflow": __HierarchicalTransformerEncoderTF, + "pytorch": __HierarchicalTransformerEncoderPT, +} + + +def HierarchicalTransformerEncoder( + project_dim, + num_heads, + sr_ratio, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + backend=None, + name=None, +): + """ + TransformerEncoder variant, which uses `deepvision.layers.EfficientMultiheadAttention` in lieu of `torch.nn.MultiheadAttention` or `tf.keras.layers.MultiHeadAttention`. + `EfficientMultiheadAttention` shorten the sequence they operate on by a reduction factor, to reduce computational cost. + The `HierarchicalTransformerEncoder` is designed to encode feature maps at multiple spatial levels, similar to how CNNs encode multiple spatial levels. 
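As a rough, purely illustrative sketch of the shape arithmetic behind that reduction factor (the strided, `padding="same"` convolution inside `EfficientMultiheadAttention` is what produces the shorter key/value sequence; the helper below is hypothetical and not part of the layer):

```python
# Illustrative sketch: how sr_ratio shrinks the key/value sequence length.
import math

def attention_shapes(H, W, sr_ratio):
    q_len = H * W                                    # query length stays full
    kv_len = math.ceil(H / sr_ratio) * math.ceil(W / sr_ratio)
    return q_len, kv_len                             # score matrix: q_len x kv_len

print(attention_shapes(H=14, W=14, sr_ratio=4))
# (196, 16) -> a 196x16 attention matrix instead of 196x196
```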
+ + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + Args: + project_dim: the dimensionality of the projection of the encoder, and output of the `EfficientMultiheadAttention` + num_heads: the number of heads for the `EfficientMultiheadAttention` layer + sr_ratio: the reduction ratio to apply within the `EfficientMultiheadAttention` layer + layer_norm_epsilon: default 1e-06, the epsilon for Layer Normalization layers + drop_prob: the drop probability for the `DropPath` layers + backend: the backend framework to use + + Basic usage: + + ``` + # (B, SEQ_LEN, CHANNELS) + inp = torch.rand(1, 3136, 32) + H, W = 56 + + output = deepvision.layers.HierarchicalTransformerEncoder(project_dim=32, + num_heads=2, + sr_ratio=4, + backend='pytorch')(inp, H, W) + print(output.shape) # torch.Size([1, 3136, 32]) + ``` + """ + + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + project_dim=project_dim, + num_heads=num_heads, + sr_ratio=sr_ratio, + drop_prob=drop_prob, + layer_norm_epsilon=layer_norm_epsilon, + name=name, + ) + + return layer diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115327.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115327.py new file mode 100644 index 0000000000..08f218aee3 --- /dev/null +++ b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115327.py @@ -0,0 +1,53 @@ +# Copyright 2023 David Landup +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
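A small, purely illustrative sketch of the shape flow through the MixFFN block used by the encoder defined below (`mix_ffn_shapes` is a hypothetical helper; the 4x expansion mirrors `mid_channels=int(project_dim * 4)` in the encoder):

```python
# Illustrative only: MixFFN is a Dense expansion, a 3x3 depthwise conv applied
# on the restored (H, W) grid (which lets the FFN mix local spatial
# information), GELU, then a Dense projection back to the input width.
def mix_ffn_shapes(batch, H, W, channels, expansion=4):
    seq = (batch, H * W, channels)                     # token input
    expanded = (batch, H * W, channels * expansion)    # after fc1
    spatial = (batch, H, W, channels * expansion)      # reshaped for depthwise conv
    return seq, expanded, spatial, seq                 # fc2 projects back to `channels`

print(mix_ffn_shapes(1, 56, 56, 32))
# ((1, 3136, 32), (1, 3136, 128), (1, 56, 56, 128), (1, 3136, 32))
```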
+ +import tensorflow as tf +from torch import nn + +from deepvision.layers.efficient_attention import EfficientMultiheadAttention +from deepvision.layers.mix_ffn import MixFFN +from deepvision.layers.stochasticdepth import StochasticDepth + + +class HierarchicalTransformerEncoder(tf.keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = tf.keras.layers.LayerNormalization( + epsilon=layer_norm_epsilon + ) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="tensorflow" + ) + self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") + self.norm2 = tf.keras.layers.LayerNormalization( + epsilon=layer_norm_epsilon + ) + self.mlp = MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + backend="tensorflow", + ) + + def call(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115542.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115542.py new file mode 100644 index 0000000000..61bd26d176 --- /dev/null +++ b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115542.py @@ -0,0 +1,31 @@ +from keras_cv.backend import keras + + +@keras.saving.register_keras_serializable(package="keras_cv") +class HierarchicalTransformerEncoder(keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="tensorflow" + ) + self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") + self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.mlp = MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + backend="tensorflow", + ) + + def call(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120252.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120252.py new file mode 100644 index 0000000000..bceb8d818a --- /dev/null +++ b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120252.py @@ -0,0 +1,32 @@ +from keras_cv.backend import keras +from keras_cv.layers import StochasticDepth + + +@keras.saving.register_keras_serializable(package="keras_cv") +class HierarchicalTransformerEncoder(keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="tensorflow" + ) + self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") + self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.mlp = MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + backend="tensorflow", + ) + + def call(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + 
self.drop_path(self.mlp(self.norm2(x), H, W)) + return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120400.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120400.py new file mode 100644 index 0000000000..ce7d623922 --- /dev/null +++ b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120400.py @@ -0,0 +1,56 @@ +from keras_cv.backend import keras +from keras_cv.layers import StochasticDepth +from keras_cv.layers import EfficientMultiheadAttention + + +@keras.saving.register_keras_serializable(package="keras_cv") +class HierarchicalTransformerEncoder(keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="tensorflow" + ) + self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") + self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.mlp = MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + backend="tensorflow", + ) + + def call(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class __MixFFN(keras.layers.Layer): + def __init__(self, channels, mid_channels): + super().__init__() + self.fc1 = keras.layers.Dense(mid_channels) + self.dwconv = keras.layers.DepthwiseConv2D( + kernel_size=3, + strides=1, + padding="same", + ) + self.fc2 = keras.layers.Dense(channels) + + def call(self, x, H, W): + x = self.fc1(x) + # B, DIM, C + input_shape = x.shape + x = x.reshape((input_shape[0], H, W, input_shape[-1])) + x = self.dwconv(x) + x = x.reshape((input_shape[0], -1, input_shape[-1])) + x = keras.nn.ops.gelu(x) + x = self.fc2(x) + return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120409.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120409.py new file mode 100644 index 0000000000..b9fbfa0728 --- /dev/null +++ b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120409.py @@ -0,0 +1,55 @@ +from keras_cv.backend import keras +from keras_cv.layers import StochasticDepth +from keras_cv.layers import EfficientMultiheadAttention + + +@keras.saving.register_keras_serializable(package="keras_cv") +class HierarchicalTransformerEncoder(keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio, backend="tensorflow" + ) + self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") + self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.mlp = __MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + ) + + def call(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class __MixFFN(keras.layers.Layer): + def __init__(self, channels, mid_channels): + super().__init__() + self.fc1 = keras.layers.Dense(mid_channels) + self.dwconv = keras.layers.DepthwiseConv2D( + kernel_size=3, + strides=1, + padding="same", + ) + self.fc2 = 
keras.layers.Dense(channels) + + def call(self, x, H, W): + x = self.fc1(x) + # B, DIM, C + input_shape = x.shape + x = x.reshape((input_shape[0], H, W, input_shape[-1])) + x = self.dwconv(x) + x = x.reshape((input_shape[0], -1, input_shape[-1])) + x = keras.nn.ops.gelu(x) + x = self.fc2(x) + return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120411.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120411.py new file mode 100644 index 0000000000..20b3b0dee7 --- /dev/null +++ b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120411.py @@ -0,0 +1,55 @@ +from keras_cv.backend import keras +from keras_cv.layers import StochasticDepth +from keras_cv.layers import EfficientMultiheadAttention + + +@keras.saving.register_keras_serializable(package="keras_cv") +class HierarchicalTransformerEncoder(keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio + ) + self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") + self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.mlp = __MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + ) + + def call(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class __MixFFN(keras.layers.Layer): + def __init__(self, channels, mid_channels): + super().__init__() + self.fc1 = keras.layers.Dense(mid_channels) + self.dwconv = keras.layers.DepthwiseConv2D( + kernel_size=3, + strides=1, + padding="same", + ) + self.fc2 = keras.layers.Dense(channels) + + def call(self, x, H, W): + x = self.fc1(x) + # B, DIM, C + input_shape = x.shape + x = x.reshape((input_shape[0], H, W, input_shape[-1])) + x = self.dwconv(x) + x = x.reshape((input_shape[0], -1, input_shape[-1])) + x = keras.nn.ops.gelu(x) + x = self.fc2(x) + return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120414.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120414.py new file mode 100644 index 0000000000..846f0146d6 --- /dev/null +++ b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120414.py @@ -0,0 +1,55 @@ +from keras_cv.backend import keras +from keras_cv.layers import StochasticDepth +from keras_cv.layers import EfficientMultiheadAttention + + +@keras.saving.register_keras_serializable(package="keras_cv") +class HierarchicalTransformerEncoder(keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio + ) + self.drop_path = StochasticDepth(drop_prob) + self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.mlp = __MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + ) + + def call(self, x, H, W): + x = x + self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class __MixFFN(keras.layers.Layer): + def __init__(self, channels, mid_channels): + super().__init__() + self.fc1 = 
keras.layers.Dense(mid_channels) + self.dwconv = keras.layers.DepthwiseConv2D( + kernel_size=3, + strides=1, + padding="same", + ) + self.fc2 = keras.layers.Dense(channels) + + def call(self, x, H, W): + x = self.fc1(x) + # B, DIM, C + input_shape = x.shape + x = x.reshape((input_shape[0], H, W, input_shape[-1])) + x = self.dwconv(x) + x = x.reshape((input_shape[0], -1, input_shape[-1])) + x = keras.nn.ops.gelu(x) + x = self.fc2(x) + return x diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115127.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115127.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115248.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115248.py new file mode 100644 index 0000000000..e88c3ba844 --- /dev/null +++ b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115248.py @@ -0,0 +1,133 @@ +# Copyright 2023 David Landup +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +import tensorflow as tf +from torch import nn + + +class __OverlappingPatchingAndEmbeddingPT(nn.Module): + def __init__( + self, + in_channels=3, + out_channels=32, + patch_size=7, + stride=4, + name=None, + ): + super().__init__() + self.proj = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=patch_size, + stride=stride, + padding=patch_size // 2, + ) + self.norm = nn.LayerNorm(out_channels) + + def forward(self, x): + x = self.proj(x) + _, _, H, W = x.shape + x = x.flatten(2).transpose(1, 2) + x = self.norm(x) + return x, H, W + + +class __OverlappingPatchingAndEmbeddingTF(tf.keras.layers.Layer): + def __init__( + self, in_channels=3, out_channels=32, patch_size=7, stride=4, **kwargs + ): + super().__init__(**kwargs) + self.proj = tf.keras.layers.Conv2D( + filters=out_channels, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = tf.keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = tf.shape(x) + x = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]]) + x = self.norm(x) + return x, shape[1], shape[2] + + +LAYER_BACKBONES = { + "tensorflow": __OverlappingPatchingAndEmbeddingTF, + "pytorch": __OverlappingPatchingAndEmbeddingPT, +} + + +def OverlappingPatchingAndEmbedding( + in_channels=3, + out_channels=32, + patch_size=7, + stride=4, + backend=None, + name=None, +): + """ + ViT-inspired PatchingAndEmbedding, modified to merge overlapping patches for the SegFormer architecture. 
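A quick, purely illustrative sketch of the patching arithmetic (the 224/7/4 numbers simply mirror the usage example further down; `patch_embedding_shape` is a hypothetical helper): because the convolution uses `kernel_size=patch_size` with a smaller `stride`, neighbouring patches overlap, and the output grid is flattened into the token sequence fed to the transformer encoders.

```python
# Illustrative only: output shape of the overlapping patch embedding.
import math

def patch_embedding_shape(H, W, out_channels, patch_size=7, stride=4):
    # With padding="same", only the stride sets the output grid size;
    # patch_size just controls how much neighbouring patches overlap.
    H_out = math.ceil(H / stride)
    W_out = math.ceil(W / stride)
    return (H_out * W_out, out_channels), (H_out, W_out)

print(patch_embedding_shape(224, 224, out_channels=64))
# ((3136, 64), (56, 56)) -- matches the usage example below
```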
+ + Reference: + - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) + + + Args: + in_channels: the number of channels in the input tensor + out_channels: the projection dimensionality + patch_size: the patch size/kernel size to apply in the convolutional layer used to patchify + stride: the stride to apply in the convolutional layer used to patchify + backend: the backend framework to use + + Basic usage: + + ``` + inp = torch.rand(1, 3, 224, 224) + output, H, W = deepvision.layers.OverlappingPatchingAndEmbedding(in_channels=3, + out_channels=64, + patch_size=7, + stride=4, + backend='pytorch')(inp) + print(output.shape) # torch.Size([1, 3136, 64]) + + + inp = tf.random.uniform(1, 224, 224, 3) + output, H, W = deepvision.layers.OverlappingPatchingAndEmbedding(in_channels=3, + out_channels=64, + patch_size=7, + stride=4, + backend='tensorflow')(inp) + print(output.shape) # (1, 3136, 64) + ``` + """ + + layer_class = LAYER_BACKBONES.get(backend) + if layer_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" + ) + + layer = layer_class( + in_channels=in_channels, + out_channels=out_channels, + patch_size=patch_size, + stride=stride, + name=name, + ) + + return layer diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115350.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115350.py new file mode 100644 index 0000000000..3fda36f09a --- /dev/null +++ b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115350.py @@ -0,0 +1,24 @@ +import tensorflow as tf +from torch import nn + + +class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): + def __init__( + self, in_channels=3, out_channels=32, patch_size=7, stride=4, **kwargs + ): + super().__init__(**kwargs) + self.proj = tf.keras.layers.Conv2D( + filters=out_channels, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = tf.keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = tf.shape(x) + x = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]]) + x = self.norm(x) + return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115401.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115401.py new file mode 100644 index 0000000000..19bffa8b66 --- /dev/null +++ b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115401.py @@ -0,0 +1,22 @@ +import tensorflow as tf +from torch import nn + + +class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): + def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): + super().__init__(**kwargs) + self.proj = tf.keras.layers.Conv2D( + filters=out_channels, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = tf.keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = tf.shape(x) + x = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]]) + x = self.norm(x) + return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115451.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115451.py new file mode 100644 index 0000000000..6036d169b5 --- /dev/null +++ b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115451.py @@ -0,0 +1,22 @@ +from keras_cv.backend import keras + + 
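# The layer below patchifies an image with a strided Conv2D whose kernel is
# larger than its stride, so neighbouring patches overlap; the result is then
# flattened to (batch, height * width, channels) and layer-normalized.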
+@keras.saving.register_keras_serializable(package="keras_cv") +class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): + def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): + super().__init__(**kwargs) + self.proj = tf.keras.layers.Conv2D( + filters=out_channels, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = tf.keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = tf.shape(x) + x = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]]) + x = self.norm(x) + return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115505.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115505.py new file mode 100644 index 0000000000..ab04114697 --- /dev/null +++ b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115505.py @@ -0,0 +1,22 @@ +from keras_cv.backend import keras + + +@keras.saving.register_keras_serializable(package="keras_cv") +class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): + def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): + super().__init__(**kwargs) + self.proj = keras.layers.Conv2D( + filters=out_channels, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = x.shape + x = x.reshape([-1, shape[1] * shape[2], shape[3]]) + x = self.norm(x) + return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115509.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115509.py new file mode 100644 index 0000000000..622f2b6ba7 --- /dev/null +++ b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115509.py @@ -0,0 +1,22 @@ +from keras_cv.backend import keras + + +@keras.saving.register_keras_serializable(package="keras_cv") +class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): + def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): + super().__init__(**kwargs) + self.proj = keras.layers.Conv2D( + filters=out_channels, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = x.shape + x = x.reshape((-1, shape[1] * shape[2], shape[3])) + x = self.norm(x) + return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115518.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115518.py new file mode 100644 index 0000000000..3e3e5daaef --- /dev/null +++ b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115518.py @@ -0,0 +1,22 @@ +from keras_cv.backend import keras + + +@keras.saving.register_keras_serializable(package="keras_cv") +class OverlappingPatchingAndEmbedding(keras.layers.Layer): + def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): + super().__init__(**kwargs) + self.proj = keras.layers.Conv2D( + filters=out_channels, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = x.shape + x = x.reshape((-1, shape[1] * shape[2], shape[3])) + x = self.norm(x) + return x, shape[1], shape[2] diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113450.py 
b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113450.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113606.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113606.py new file mode 100644 index 0000000000..d741da96ed --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113606.py @@ -0,0 +1,158 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MobileNet v3 backbone model. + +References: + - [Searching for MobileNetV3](https://arxiv.org/pdf/1905.02244.pdf) + (ICCV 2019) + - [Based on the original keras.applications MobileNetv3](https://github.com/keras-team/keras/blob/master/keras/applications/mobilenet_v3.py) +""" # noqa: E501 + +import tensorflow as tf +from tensorflow.keras import layers + + +class MiT(tf.keras.models.Model): + def __init__( + self, + input_shape=None, + input_tensor=None, + classes=None, + include_top=None, + embed_dims=None, + depths=None, + as_backbone=None, + pooling=None, + **kwargs, + ): + if include_top and not classes: + raise ValueError( + "If `include_top` is True, you should specify `classes`. " + f"Received: classes={classes}" + ) + + if include_top and pooling: + raise ValueError( + f"`pooling` must be `None` when `include_top=True`." + f"Received pooling={pooling} and include_top={include_top}. " + ) + + if include_top and as_backbone: + raise ValueError( + f"`as_backbone` must be `False` when `include_top=True`." + f"Received as_backbone={as_backbone} and include_top={include_top}. " + ) + + if as_backbone and classes: + raise ValueError( + f"`as_backbone` must be `False` when `classes` are set." + f"Received as_backbone={as_backbone} and classes={classes}. 
" + ) + + drop_path_rate = 0.1 + dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] + blockwise_num_heads = [1, 2, 5, 8] + blockwise_sr_ratios = [8, 4, 2, 1] + num_stages = 4 + + cur = 0 + patch_embedding_layers = [] + transformer_blocks = [] + layer_norms = [] + + for i in range(num_stages): + patch_embed_layer = OverlappingPatchingAndEmbedding( + in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], + out_channels=embed_dims[0] if i == 0 else embed_dims[i], + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + backend="tensorflow", + name=f"patch_and_embed_{i}", + ) + patch_embedding_layers.append(patch_embed_layer) + + transformer_block = [ + HierarchicalTransformerEncoder( + project_dim=embed_dims[i], + num_heads=blockwise_num_heads[i], + sr_ratio=blockwise_sr_ratios[i], + drop_prob=dpr[cur + k], + backend="tensorflow", + name=f"hierarchical_encoder_{i}_{k}", + ) + for k in range(depths[i]) + ] + transformer_blocks.append(transformer_block) + cur += depths[i] + layer_norms.append(layers.LayerNormalization()) + + inputs = parse_model_inputs("tensorflow", input_shape, input_tensor) + x = inputs + + B = tf.shape(x)[0] + outputs = [] + for i in range(num_stages): + x, H, W = patch_embedding_layers[i](x) + for blk in transformer_blocks[i]: + x = blk(x, H, W) + x = layer_norms[i](x) + C = tf.shape(x)[-1] + x = tf.reshape(x, [B, H, W, C]) + outputs.append(x) + + if include_top: + output = layers.GlobalAveragePooling2D(name="avg_pool")(x) + output = layers.Dense( + classes, activation="softmax", name="predictions" + )(output) + elif as_backbone: + output = outputs + else: + if pooling == "avg": + output = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + output = layers.GlobalMaxPooling2D(name="max_pool")(x) + + super().__init__( + inputs=inputs, + outputs=output, + **kwargs, + ) + + self.channels = embed_dims + self.num_stages = num_stages + self.output_channels = embed_dims + self.classes = classes + self.include_top = include_top + self.as_backbone = as_backbone + self.pooling = pooling + + self.patch_embedding_layers = [] + self.transformer_blocks = [] + + def get_config(self): + config = super().get_config() + config.update( + { + "channels": self.channels, + "num_stages": self.num_stages, + "output_channels": self.output_channels, + "classes": self.classes, + "include_top": self.include_top, + "as_backbone": self.as_backbone, + "pooling": self.pooling, + } + ) + return config diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113615.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113615.py new file mode 100644 index 0000000000..01c71257fd --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113615.py @@ -0,0 +1,158 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MiT backbone model. 
+ +References: + - [Searching for MobileNetV3](https://arxiv.org/pdf/1905.02244.pdf) + (ICCV 2019) + - [Based on the original keras.applications MobileNetv3](https://github.com/keras-team/keras/blob/master/keras/applications/mobilenet_v3.py) +""" # noqa: E501 + +import tensorflow as tf +from tensorflow.keras import layers + + +class MiT(tf.keras.models.Model): + def __init__( + self, + input_shape=None, + input_tensor=None, + classes=None, + include_top=None, + embed_dims=None, + depths=None, + as_backbone=None, + pooling=None, + **kwargs, + ): + if include_top and not classes: + raise ValueError( + "If `include_top` is True, you should specify `classes`. " + f"Received: classes={classes}" + ) + + if include_top and pooling: + raise ValueError( + f"`pooling` must be `None` when `include_top=True`." + f"Received pooling={pooling} and include_top={include_top}. " + ) + + if include_top and as_backbone: + raise ValueError( + f"`as_backbone` must be `False` when `include_top=True`." + f"Received as_backbone={as_backbone} and include_top={include_top}. " + ) + + if as_backbone and classes: + raise ValueError( + f"`as_backbone` must be `False` when `classes` are set." + f"Received as_backbone={as_backbone} and classes={classes}. " + ) + + drop_path_rate = 0.1 + dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] + blockwise_num_heads = [1, 2, 5, 8] + blockwise_sr_ratios = [8, 4, 2, 1] + num_stages = 4 + + cur = 0 + patch_embedding_layers = [] + transformer_blocks = [] + layer_norms = [] + + for i in range(num_stages): + patch_embed_layer = OverlappingPatchingAndEmbedding( + in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], + out_channels=embed_dims[0] if i == 0 else embed_dims[i], + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + backend="tensorflow", + name=f"patch_and_embed_{i}", + ) + patch_embedding_layers.append(patch_embed_layer) + + transformer_block = [ + HierarchicalTransformerEncoder( + project_dim=embed_dims[i], + num_heads=blockwise_num_heads[i], + sr_ratio=blockwise_sr_ratios[i], + drop_prob=dpr[cur + k], + backend="tensorflow", + name=f"hierarchical_encoder_{i}_{k}", + ) + for k in range(depths[i]) + ] + transformer_blocks.append(transformer_block) + cur += depths[i] + layer_norms.append(layers.LayerNormalization()) + + inputs = parse_model_inputs("tensorflow", input_shape, input_tensor) + x = inputs + + B = tf.shape(x)[0] + outputs = [] + for i in range(num_stages): + x, H, W = patch_embedding_layers[i](x) + for blk in transformer_blocks[i]: + x = blk(x, H, W) + x = layer_norms[i](x) + C = tf.shape(x)[-1] + x = tf.reshape(x, [B, H, W, C]) + outputs.append(x) + + if include_top: + output = layers.GlobalAveragePooling2D(name="avg_pool")(x) + output = layers.Dense( + classes, activation="softmax", name="predictions" + )(output) + elif as_backbone: + output = outputs + else: + if pooling == "avg": + output = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + output = layers.GlobalMaxPooling2D(name="max_pool")(x) + + super().__init__( + inputs=inputs, + outputs=output, + **kwargs, + ) + + self.channels = embed_dims + self.num_stages = num_stages + self.output_channels = embed_dims + self.classes = classes + self.include_top = include_top + self.as_backbone = as_backbone + self.pooling = pooling + + self.patch_embedding_layers = [] + self.transformer_blocks = [] + + def get_config(self): + config = super().get_config() + config.update( + { + "channels": self.channels, + "num_stages": self.num_stages, + 
"output_channels": self.output_channels, + "classes": self.classes, + "include_top": self.include_top, + "as_backbone": self.as_backbone, + "pooling": self.pooling, + } + ) + return config diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113618.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113618.py new file mode 100644 index 0000000000..6a8f44bd77 --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113618.py @@ -0,0 +1,156 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MiT backbone model. + +References: + +""" # noqa: E501 + +import tensorflow as tf +from tensorflow.keras import layers + + +class MiT(tf.keras.models.Model): + def __init__( + self, + input_shape=None, + input_tensor=None, + classes=None, + include_top=None, + embed_dims=None, + depths=None, + as_backbone=None, + pooling=None, + **kwargs, + ): + if include_top and not classes: + raise ValueError( + "If `include_top` is True, you should specify `classes`. " + f"Received: classes={classes}" + ) + + if include_top and pooling: + raise ValueError( + f"`pooling` must be `None` when `include_top=True`." + f"Received pooling={pooling} and include_top={include_top}. " + ) + + if include_top and as_backbone: + raise ValueError( + f"`as_backbone` must be `False` when `include_top=True`." + f"Received as_backbone={as_backbone} and include_top={include_top}. " + ) + + if as_backbone and classes: + raise ValueError( + f"`as_backbone` must be `False` when `classes` are set." + f"Received as_backbone={as_backbone} and classes={classes}. 
" + ) + + drop_path_rate = 0.1 + dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] + blockwise_num_heads = [1, 2, 5, 8] + blockwise_sr_ratios = [8, 4, 2, 1] + num_stages = 4 + + cur = 0 + patch_embedding_layers = [] + transformer_blocks = [] + layer_norms = [] + + for i in range(num_stages): + patch_embed_layer = OverlappingPatchingAndEmbedding( + in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], + out_channels=embed_dims[0] if i == 0 else embed_dims[i], + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + backend="tensorflow", + name=f"patch_and_embed_{i}", + ) + patch_embedding_layers.append(patch_embed_layer) + + transformer_block = [ + HierarchicalTransformerEncoder( + project_dim=embed_dims[i], + num_heads=blockwise_num_heads[i], + sr_ratio=blockwise_sr_ratios[i], + drop_prob=dpr[cur + k], + backend="tensorflow", + name=f"hierarchical_encoder_{i}_{k}", + ) + for k in range(depths[i]) + ] + transformer_blocks.append(transformer_block) + cur += depths[i] + layer_norms.append(layers.LayerNormalization()) + + inputs = parse_model_inputs("tensorflow", input_shape, input_tensor) + x = inputs + + B = tf.shape(x)[0] + outputs = [] + for i in range(num_stages): + x, H, W = patch_embedding_layers[i](x) + for blk in transformer_blocks[i]: + x = blk(x, H, W) + x = layer_norms[i](x) + C = tf.shape(x)[-1] + x = tf.reshape(x, [B, H, W, C]) + outputs.append(x) + + if include_top: + output = layers.GlobalAveragePooling2D(name="avg_pool")(x) + output = layers.Dense( + classes, activation="softmax", name="predictions" + )(output) + elif as_backbone: + output = outputs + else: + if pooling == "avg": + output = layers.GlobalAveragePooling2D(name="avg_pool")(x) + elif pooling == "max": + output = layers.GlobalMaxPooling2D(name="max_pool")(x) + + super().__init__( + inputs=inputs, + outputs=output, + **kwargs, + ) + + self.channels = embed_dims + self.num_stages = num_stages + self.output_channels = embed_dims + self.classes = classes + self.include_top = include_top + self.as_backbone = as_backbone + self.pooling = pooling + + self.patch_embedding_layers = [] + self.transformer_blocks = [] + + def get_config(self): + config = super().get_config() + config.update( + { + "channels": self.channels, + "num_stages": self.num_stages, + "output_channels": self.output_channels, + "classes": self.classes, + "include_top": self.include_top, + "as_backbone": self.as_backbone, + "pooling": self.pooling, + } + ) + return config diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115045.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115045.py new file mode 100644 index 0000000000..e770d5db34 --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115045.py @@ -0,0 +1,146 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MiT backbone model. 
+ +References: + +""" # noqa: E501 + +from keras_cv import layers as cv_layers +from keras_cv.backend import keras +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets_with_weights, +) +from keras_cv.utils.python_utils import classproperty + + +@keras.saving.register_keras_serializable(package="keras_cv.models") +class MiT(Backbone): + def __init__( + self, + input_shape=None, + input_tensor=None, + classes=None, + include_top=None, + embed_dims=None, + depths=None, + pooling=None, + **kwargs, + ): + if include_top and not classes: + raise ValueError( + "If `include_top` is True, you should specify `classes`. " + f"Received: classes={classes}" + ) + + if include_top and pooling: + raise ValueError( + f"`pooling` must be `None` when `include_top=True`." + f"Received pooling={pooling} and include_top={include_top}. " + ) + + if include_top and as_backbone: + raise ValueError( + f"`as_backbone` must be `False` when `include_top=True`." + f"Received as_backbone={as_backbone} and include_top={include_top}. " + ) + + drop_path_rate = 0.1 + dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] + blockwise_num_heads = [1, 2, 5, 8] + blockwise_sr_ratios = [8, 4, 2, 1] + num_stages = 4 + + cur = 0 + patch_embedding_layers = [] + transformer_blocks = [] + layer_norms = [] + + for i in range(num_stages): + patch_embed_layer = OverlappingPatchingAndEmbedding( + in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], + out_channels=embed_dims[0] if i == 0 else embed_dims[i], + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + backend="tensorflow", + name=f"patch_and_embed_{i}", + ) + patch_embedding_layers.append(patch_embed_layer) + + transformer_block = [ + HierarchicalTransformerEncoder( + project_dim=embed_dims[i], + num_heads=blockwise_num_heads[i], + sr_ratio=blockwise_sr_ratios[i], + drop_prob=dpr[cur + k], + backend="tensorflow", + name=f"hierarchical_encoder_{i}_{k}", + ) + for k in range(depths[i]) + ] + transformer_blocks.append(transformer_block) + cur += depths[i] + layer_norms.append(keras.layers.LayerNormalization()) + + inputs = utils.parse_model_inputs(input_shape, input_tensor) + x = inputs + + batch_size = x.shape[0] + pyramid_level_inputs = [] + for i in range(num_stages): + x, H, W = patch_embedding_layers[i](x) + for blk in transformer_blocks[i]: + x = blk(x, H, W) + x = layer_norms[i](x) + C = x.shape[-1] + x = x.reshape((batch_size, H, W, C)) + pyramid_level_inputs.append(x) + + super().__init__( + inputs=inputs, + outputs=x, + **kwargs, + ) + + self.channels = embed_dims + self.num_stages = num_stages + self.output_channels = embed_dims + self.classes = classes + self.include_top = include_top + self.pyramid_level_inputs = pyramid_level_inputs + self.pooling = pooling + + self.patch_embedding_layers = [] + self.transformer_blocks = [] + + def get_config(self): + config = super().get_config() + config.update( + { + "channels": self.channels, + "num_stages": self.num_stages, + "output_channels": self.output_channels, + "classes": self.classes, + "include_top": self.include_top, + "as_backbone": self.as_backbone, + "pooling": self.pooling, + } + ) + return config diff --git 
a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115058.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115058.py new file mode 100644 index 0000000000..77fd559970 --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115058.py @@ -0,0 +1,144 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MiT backbone model. + +References: + +""" # noqa: E501 + +from keras_cv import layers as cv_layers +from keras_cv.backend import keras +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets_with_weights, +) +from keras_cv.utils.python_utils import classproperty + + +@keras.saving.register_keras_serializable(package="keras_cv.models") +class MiT(Backbone): + def __init__( + self, + input_shape=None, + input_tensor=None, + classes=None, + include_top=None, + embed_dims=None, + depths=None, + pooling=None, + **kwargs, + ): + if include_top and not classes: + raise ValueError( + "If `include_top` is True, you should specify `classes`. " + f"Received: classes={classes}" + ) + + if include_top and pooling: + raise ValueError( + f"`pooling` must be `None` when `include_top=True`." + f"Received pooling={pooling} and include_top={include_top}. " + ) + + if include_top and as_backbone: + raise ValueError( + f"`as_backbone` must be `False` when `include_top=True`." + f"Received as_backbone={as_backbone} and include_top={include_top}. 
" + ) + + drop_path_rate = 0.1 + dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] + blockwise_num_heads = [1, 2, 5, 8] + blockwise_sr_ratios = [8, 4, 2, 1] + num_stages = 4 + + cur = 0 + patch_embedding_layers = [] + transformer_blocks = [] + layer_norms = [] + + for i in range(num_stages): + patch_embed_layer = OverlappingPatchingAndEmbedding( + in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], + out_channels=embed_dims[0] if i == 0 else embed_dims[i], + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + name=f"patch_and_embed_{i}", + ) + patch_embedding_layers.append(patch_embed_layer) + + transformer_block = [ + HierarchicalTransformerEncoder( + project_dim=embed_dims[i], + num_heads=blockwise_num_heads[i], + sr_ratio=blockwise_sr_ratios[i], + drop_prob=dpr[cur + k], + name=f"hierarchical_encoder_{i}_{k}", + ) + for k in range(depths[i]) + ] + transformer_blocks.append(transformer_block) + cur += depths[i] + layer_norms.append(keras.layers.LayerNormalization()) + + inputs = utils.parse_model_inputs(input_shape, input_tensor) + x = inputs + + batch_size = x.shape[0] + pyramid_level_inputs = [] + for i in range(num_stages): + x, H, W = patch_embedding_layers[i](x) + for blk in transformer_blocks[i]: + x = blk(x, H, W) + x = layer_norms[i](x) + C = x.shape[-1] + x = x.reshape((batch_size, H, W, C)) + pyramid_level_inputs.append(x) + + super().__init__( + inputs=inputs, + outputs=x, + **kwargs, + ) + + self.channels = embed_dims + self.num_stages = num_stages + self.output_channels = embed_dims + self.classes = classes + self.include_top = include_top + self.pyramid_level_inputs = pyramid_level_inputs + self.pooling = pooling + + self.patch_embedding_layers = [] + self.transformer_blocks = [] + + def get_config(self): + config = super().get_config() + config.update( + { + "channels": self.channels, + "num_stages": self.num_stages, + "output_channels": self.output_channels, + "classes": self.classes, + "include_top": self.include_top, + "as_backbone": self.as_backbone, + "pooling": self.pooling, + } + ) + return config diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113721.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113721.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113912.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113912.py new file mode 100644 index 0000000000..0f21b0b687 --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113912.py @@ -0,0 +1,282 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""MobileNetV3 model preset configurations.""" + +backbone_presets_no_weights = { + "mobilenet_v3_small": { + "metadata": { + "description": ( + "MobileNetV3 model with 14 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 933502, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "stackwise_expansion": [ + 1, + 72.0 / 16, + 88.0 / 24, + 4, + 6, + 6, + 3, + 3, + 6, + 6, + 6, + ], + "stackwise_filters": [16, 24, 24, 40, 40, 40, 48, 48, 96, 96, 96], + "stackwise_kernel_size": [3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5], + "stackwise_stride": [2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1], + "stackwise_se_ratio": [ + 0.25, + None, + None, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + 0.25, + ], + "stackwise_activation": [ + "relu", + "relu", + "relu", + "hard_swish", + "hard_swish", + "hard_swish", + "hard_swish", + "hard_swish", + "hard_swish", + "hard_swish", + "hard_swish", + ], + "include_rescaling": True, + "input_shape": (None, None, 3), + "input_tensor": None, + "alpha": 1.0, + }, + }, + "mobilenet_v3_large": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + }, + }, +} + +backbone_presets_with_weights = {} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} + + +MODEL_CONFIGS = { + "B0": {"embedding_dims": [32, 64, 160, 256], "depths": [2, 2, 2, 2]}, + "B1": {"embedding_dims": [64, 128, 320, 512], "depths": [2, 2, 2, 2]}, + "B2": {"embedding_dims": [64, 128, 320, 512], "depths": [3, 4, 6, 3]}, + "B3": {"embedding_dims": [64, 128, 320, 512], "depths": [3, 4, 18, 3]}, + "B4": {"embedding_dims": [64, 128, 320, 512], "depths": [3, 8, 27, 3]}, + "B5": {"embedding_dims": [64, 128, 320, 512], "depths": [3, 6, 40, 3]}, +} + +MODEL_BACKBONES = {"tensorflow": __MiTTF, "pytorch": __MiTPT} + + +def MiTB0( + backend, + include_top, + classes=None, + input_shape=(None, None, 3), + input_tensor=None, + pooling=None, + as_backbone=False, + **kwargs, +): + model_class = MODEL_BACKBONES.get(backend) + if model_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" + ) + return model_class( + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + embed_dims=MODEL_CONFIGS["B0"]["embedding_dims"], + depths=MODEL_CONFIGS["B0"]["depths"], + classes=classes, + include_top=include_top, + as_backbone=as_backbone, + **kwargs, + ) + + +def MiTB1( + backend, + include_top, + classes=None, + input_shape=(None, None, 3), + input_tensor=None, + pooling=None, + as_backbone=False, + **kwargs, +): + model_class = MODEL_BACKBONES.get(backend) + if model_class is None: + raise ValueError( + f"Backend not supported: {backend}. 
Supported backbones are {MODEL_BACKBONES.keys()}" + ) + return model_class( + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + embed_dims=MODEL_CONFIGS["B1"]["embedding_dims"], + depths=MODEL_CONFIGS["B1"]["depths"], + classes=classes, + include_top=include_top, + as_backbone=as_backbone, + **kwargs, + ) + + +def MiTB2( + backend, + include_top, + classes=None, + input_shape=(None, None, 3), + input_tensor=None, + pooling=None, + as_backbone=False, + **kwargs, +): + model_class = MODEL_BACKBONES.get(backend) + if model_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" + ) + return model_class( + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + embed_dims=MODEL_CONFIGS["B2"]["embedding_dims"], + depths=MODEL_CONFIGS["B2"]["depths"], + classes=classes, + include_top=include_top, + as_backbone=as_backbone, + **kwargs, + ) + + +def MiTB3( + backend, + include_top, + classes=None, + input_shape=(None, None, 3), + input_tensor=None, + pooling=None, + as_backbone=False, + **kwargs, +): + model_class = MODEL_BACKBONES.get(backend) + if model_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" + ) + return model_class( + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + embed_dims=MODEL_CONFIGS["B3"]["embedding_dims"], + depths=MODEL_CONFIGS["B3"]["depths"], + classes=classes, + include_top=include_top, + as_backbone=as_backbone, + **kwargs, + ) + + +def MiTB4( + backend, + include_top, + classes=None, + input_shape=(None, None, 3), + input_tensor=None, + pooling=None, + as_backbone=False, + **kwargs, +): + model_class = MODEL_BACKBONES.get(backend) + if model_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" + ) + return model_class( + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + embed_dims=MODEL_CONFIGS["B4"]["embedding_dims"], + depths=MODEL_CONFIGS["B4"]["depths"], + classes=classes, + include_top=include_top, + as_backbone=as_backbone, + **kwargs, + ) + + +def MiTB5( + backend, + include_top, + classes=None, + input_shape=(None, None, 3), + input_tensor=None, + pooling=None, + as_backbone=False, + **kwargs, +): + model_class = MODEL_BACKBONES.get(backend) + if model_class is None: + raise ValueError( + f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" + ) + return model_class( + input_shape=input_shape, + input_tensor=input_tensor, + pooling=pooling, + embed_dims=MODEL_CONFIGS["B5"]["embedding_dims"], + depths=MODEL_CONFIGS["B5"]["depths"], + classes=classes, + include_top=include_top, + as_backbone=as_backbone, + **kwargs, + ) diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114045.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114045.py new file mode 100644 index 0000000000..197b217d1c --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114045.py @@ -0,0 +1,126 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MobileNetV3 model preset configurations.""" + +backbone_presets_no_weights = { + "MiT_B0": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B1": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B2": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 6, 3], + }, + }, + "MiT_B3": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 18, 3], + }, + }, + "MiT_B4": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 8, 27, 3], + }, + }, + "MiT_B5": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." 
+ ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 6, 40, 3], + }, + }, +} + +backbone_presets_with_weights = {} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114112.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114112.py new file mode 100644 index 0000000000..197b217d1c --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114112.py @@ -0,0 +1,126 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MobileNetV3 model preset configurations.""" + +backbone_presets_no_weights = { + "MiT_B0": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B1": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B2": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 6, 3], + }, + }, + "MiT_B3": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 18, 3], + }, + }, + "MiT_B4": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." 
+ ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 8, 27, 3], + }, + }, + "MiT_B5": { + "metadata": { + "description": ( + "MobileNetV3 model with 28 layers where the batch " + "normalization and hard-swish activation are applied after the " + "convolution layers." + ), + "params": 2994518, + "official_name": "MobileNetV3", + "path": "mobilenetv3", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 6, 40, 3], + }, + }, +} + +backbone_presets_with_weights = {} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114332.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114332.py new file mode 100644 index 0000000000..1313c0a151 --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114332.py @@ -0,0 +1,114 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MiT model preset configurations.""" + +backbone_presets_no_weights = { + "MiT_B0": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 2994518, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B1": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 2994518, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B2": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 16 transformer blocks." + ), + "params": 2994518, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 6, 3], + }, + }, + "MiT_B3": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 28 transformer blocks." + ), + "params": 2994518, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 18, 3], + }, + }, + "MiT_B4": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 41 transformer blocks." 
+ ), + "params": 2994518, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 8, 27, 3], + }, + }, + "MiT_B5": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 52 transformer blocks." + ), + "params": 2994518, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 6, 40, 3], + }, + }, +} + +backbone_presets_with_weights = {} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114420.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114420.py new file mode 100644 index 0000000000..5267a09ace --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114420.py @@ -0,0 +1,114 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MiT model preset configurations.""" + +backbone_presets_no_weights = { + "MiT_B0": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 3321962, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B1": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 13156554, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B2": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 16 transformer blocks." + ), + "params": 24201418, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 6, 3], + }, + }, + "MiT_B3": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 28 transformer blocks." + ), + "params": 44077258, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 18, 3], + }, + }, + "MiT_B4": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 41 transformer blocks." + ), + "params": 60847818, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 8, 27, 3], + }, + }, + "MiT_B5": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 52 transformer blocks." 
+ ), + "params": 81448138, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MobileNetV3Backbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 6, 40, 3], + }, + }, +} + +backbone_presets_with_weights = {} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114436.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114436.py new file mode 100644 index 0000000000..b890282a91 --- /dev/null +++ b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114436.py @@ -0,0 +1,114 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MiT model preset configurations.""" + +backbone_presets_no_weights = { + "MiT_B0": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 3321962, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B1": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 13156554, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B2": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 16 transformer blocks." + ), + "params": 24201418, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 6, 3], + }, + }, + "MiT_B3": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 28 transformer blocks." + ), + "params": 44077258, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 18, 3], + }, + }, + "MiT_B4": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 41 transformer blocks." + ), + "params": 60847818, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 8, 27, 3], + }, + }, + "MiT_B5": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 52 transformer blocks." 
+ ), + "params": 81448138, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 6, 40, 3], + }, + }, +} + +backbone_presets_with_weights = {} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py new file mode 100644 index 0000000000..718a5b216b --- /dev/null +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -0,0 +1,83 @@ +from keras_cv.backend import keras + +""" +Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +""" + + +@keras.saving.register_keras_serializable(package="keras_cv") +class EfficientMultiheadAttention(tf.keras.layers.Layer): + def __init__(self, project_dim, num_heads, sr_ratio): + super().__init__() + self.num_heads = num_heads + self.sr_ratio = sr_ratio + self.scale = (project_dim // num_heads) ** -0.5 + self.q = keras.layers.Dense(project_dim) + self.k = keras.layers.Dense(project_dim) + self.v = keras.layers.Dense(project_dim) + self.proj = keras.layers.Dense(project_dim) + + if sr_ratio > 1: + self.sr = keras.layers.Conv2D( + filters=project_dim, + kernel_size=sr_ratio, + strides=sr_ratio, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x, H, W): + input_shape = x.shape + + q = self.q(x) + q = q.reshape( + ( + input_shape[0], + input_shape[1], + self.num_heads, + input_shape[2] // self.num_heads, + ), + ) + + q = q.transpose([0, 2, 1, 3]) + + if self.sr_ratio > 1: + x = x.transpose(x, [0, 2, 1]).reshape( + (input_shape[0], H, W, input_shape[2]) + ) + x = self.sr(x) + x = x.reshape([input_shape[0], input_shape[2], -1]) + x = x.transpose([0, 2, 1]) + x = self.norm(x) + + k = self.k(x) + v = self.v(x) + + k = k.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ) + v = ( + v.transpose([0, 2, 1, 3]).reshape( + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ] + ), + ) + + attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale + attn = keras.nn.ops.softmax(attn, axis=-1) + + attn = attn @ v + attn = attn.transpose(attn, [0, 2, 1, 3]).reshape( + [input_shape[0], input_shape[1], input_shape[2]] + ) + x = self.proj(attn) + return x diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py new file mode 100644 index 0000000000..846f0146d6 --- /dev/null +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -0,0 +1,55 @@ +from keras_cv.backend import keras +from keras_cv.layers import StochasticDepth +from keras_cv.layers import EfficientMultiheadAttention + + +@keras.saving.register_keras_serializable(package="keras_cv") +class HierarchicalTransformerEncoder(keras.layers.Layer): + def __init__( + self, + project_dim, + num_heads, + sr_ratio=1, + drop_prob=0.0, + layer_norm_epsilon=1e-6, + **kwargs, + ): + super().__init__(**kwargs) + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.attn = EfficientMultiheadAttention( + project_dim, num_heads, sr_ratio + ) + self.drop_path = StochasticDepth(drop_prob) + self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) + self.mlp = __MixFFN( + channels=project_dim, + mid_channels=int(project_dim * 4), + ) + + def call(self, x, H, W): + x = x + 
self.drop_path(self.attn(self.norm1(x), H, W)) + x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + return x + + +class __MixFFN(keras.layers.Layer): + def __init__(self, channels, mid_channels): + super().__init__() + self.fc1 = keras.layers.Dense(mid_channels) + self.dwconv = keras.layers.DepthwiseConv2D( + kernel_size=3, + strides=1, + padding="same", + ) + self.fc2 = keras.layers.Dense(channels) + + def call(self, x, H, W): + x = self.fc1(x) + # B, DIM, C + input_shape = x.shape + x = x.reshape((input_shape[0], H, W, input_shape[-1])) + x = self.dwconv(x) + x = x.reshape((input_shape[0], -1, input_shape[-1])) + x = keras.nn.ops.gelu(x) + x = self.fc2(x) + return x diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py new file mode 100644 index 0000000000..3e3e5daaef --- /dev/null +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -0,0 +1,22 @@ +from keras_cv.backend import keras + + +@keras.saving.register_keras_serializable(package="keras_cv") +class OverlappingPatchingAndEmbedding(keras.layers.Layer): + def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): + super().__init__(**kwargs) + self.proj = keras.layers.Conv2D( + filters=out_channels, + kernel_size=patch_size, + strides=stride, + padding="same", + ) + self.norm = keras.layers.LayerNormalization() + + def call(self, x): + x = self.proj(x) + # B, H, W, C + shape = x.shape + x = x.reshape((-1, shape[1] * shape[2], shape[3])) + x = self.norm(x) + return x, shape[1], shape[2] diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py new file mode 100644 index 0000000000..77fd559970 --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -0,0 +1,144 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""MiT backbone model. + +References: + +""" # noqa: E501 + +from keras_cv import layers as cv_layers +from keras_cv.backend import keras +from keras_cv.models import utils +from keras_cv.models.backbones.backbone import Backbone +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets_with_weights, +) +from keras_cv.utils.python_utils import classproperty + + +@keras.saving.register_keras_serializable(package="keras_cv.models") +class MiT(Backbone): + def __init__( + self, + input_shape=None, + input_tensor=None, + classes=None, + include_top=None, + embed_dims=None, + depths=None, + pooling=None, + **kwargs, + ): + if include_top and not classes: + raise ValueError( + "If `include_top` is True, you should specify `classes`. " + f"Received: classes={classes}" + ) + + if include_top and pooling: + raise ValueError( + f"`pooling` must be `None` when `include_top=True`." 
+ f"Received pooling={pooling} and include_top={include_top}. " + ) + + if include_top and as_backbone: + raise ValueError( + f"`as_backbone` must be `False` when `include_top=True`." + f"Received as_backbone={as_backbone} and include_top={include_top}. " + ) + + drop_path_rate = 0.1 + dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] + blockwise_num_heads = [1, 2, 5, 8] + blockwise_sr_ratios = [8, 4, 2, 1] + num_stages = 4 + + cur = 0 + patch_embedding_layers = [] + transformer_blocks = [] + layer_norms = [] + + for i in range(num_stages): + patch_embed_layer = OverlappingPatchingAndEmbedding( + in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], + out_channels=embed_dims[0] if i == 0 else embed_dims[i], + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + name=f"patch_and_embed_{i}", + ) + patch_embedding_layers.append(patch_embed_layer) + + transformer_block = [ + HierarchicalTransformerEncoder( + project_dim=embed_dims[i], + num_heads=blockwise_num_heads[i], + sr_ratio=blockwise_sr_ratios[i], + drop_prob=dpr[cur + k], + name=f"hierarchical_encoder_{i}_{k}", + ) + for k in range(depths[i]) + ] + transformer_blocks.append(transformer_block) + cur += depths[i] + layer_norms.append(keras.layers.LayerNormalization()) + + inputs = utils.parse_model_inputs(input_shape, input_tensor) + x = inputs + + batch_size = x.shape[0] + pyramid_level_inputs = [] + for i in range(num_stages): + x, H, W = patch_embedding_layers[i](x) + for blk in transformer_blocks[i]: + x = blk(x, H, W) + x = layer_norms[i](x) + C = x.shape[-1] + x = x.reshape((batch_size, H, W, C)) + pyramid_level_inputs.append(x) + + super().__init__( + inputs=inputs, + outputs=x, + **kwargs, + ) + + self.channels = embed_dims + self.num_stages = num_stages + self.output_channels = embed_dims + self.classes = classes + self.include_top = include_top + self.pyramid_level_inputs = pyramid_level_inputs + self.pooling = pooling + + self.patch_embedding_layers = [] + self.transformer_blocks = [] + + def get_config(self): + config = super().get_config() + config.update( + { + "channels": self.channels, + "num_stages": self.num_stages, + "output_channels": self.output_channels, + "classes": self.classes, + "include_top": self.include_top, + "as_backbone": self.as_backbone, + "pooling": self.pooling, + } + ) + return config diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py new file mode 100644 index 0000000000..b890282a91 --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py @@ -0,0 +1,114 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""MiT model preset configurations.""" + +backbone_presets_no_weights = { + "MiT_B0": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." 
+ ), + "params": 3321962, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B1": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 13156554, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [2, 2, 2, 2], + }, + }, + "MiT_B2": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 16 transformer blocks." + ), + "params": 24201418, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 6, 3], + }, + }, + "MiT_B3": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 28 transformer blocks." + ), + "params": 44077258, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 4, 18, 3], + }, + }, + "MiT_B4": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 41 transformer blocks." + ), + "params": 60847818, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 8, 27, 3], + }, + }, + "MiT_B5": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 52 transformer blocks." + ), + "params": 81448138, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [64, 128, 320, 512], + "depths": [3, 6, 40, 3], + }, + }, +} + +backbone_presets_with_weights = {} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index c1fcc39200..e71c0a8bf7 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -29,9 +29,9 @@ def __init__( height=x.shape[1], width=x.shape[2], interpolation="bilinear" )(y) if softmax_output: - output = tf.keras.layers.Activation("softmax", name="output_activation")( - output - ) + output = tf.keras.layers.Activation( + "softmax", name="output_activation" + )(output) super().__init__( inputs=inputs, @@ -41,4 +41,4 @@ def __init__( self.num_classes = num_classes self.embed_dim = embed_dim - self.softmax_output = softmax_output \ No newline at end of file + self.softmax_output = softmax_output From 7bd1056bc857bb36c57264825d7d8c95708161f5 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 17 Jul 2023 12:06:29 +0200 Subject: [PATCH 03/53] updated .gitignore --- .gitignore | 1 + ...ient_multihead_attention_20230717115601.py | 0 ...ient_multihead_attention_20230717115636.py | 199 ------------ ...ient_multihead_attention_20230717115647.py | 202 ------------- ...ient_multihead_attention_20230717115709.py | 148 --------- ...ient_multihead_attention_20230717115745.py | 147 --------- ...ient_multihead_attention_20230717115849.py | 145 --------- ...ient_multihead_attention_20230717115857.py | 145 --------- ...ient_multihead_attention_20230717115932.py | 146 --------- ...ient_multihead_attention_20230717115958.py | 142 --------- ...ient_multihead_attention_20230717120016.py | 140 
--------- ...ient_multihead_attention_20230717120020.py | 140 --------- ...ient_multihead_attention_20230717120108.py | 140 --------- ...ient_multihead_attention_20230717120147.py | 138 --------- ...ient_multihead_attention_20230717120153.py | 83 ------ ...ient_multihead_attention_20230717120213.py | 83 ------ ...ical_transformer_encoder_20230717115219.py | 0 ...ical_transformer_encoder_20230717115222.py | 0 ...ical_transformer_encoder_20230717115257.py | 146 --------- ...ical_transformer_encoder_20230717115327.py | 53 ---- ...ical_transformer_encoder_20230717115542.py | 31 -- ...ical_transformer_encoder_20230717120252.py | 32 -- ...ical_transformer_encoder_20230717120400.py | 56 ---- ...ical_transformer_encoder_20230717120409.py | 55 ---- ...ical_transformer_encoder_20230717120411.py | 55 ---- ...ical_transformer_encoder_20230717120414.py | 55 ---- ...pping_patching_embedding_20230717115127.py | 0 ...pping_patching_embedding_20230717115248.py | 133 --------- ...pping_patching_embedding_20230717115350.py | 24 -- ...pping_patching_embedding_20230717115401.py | 22 -- ...pping_patching_embedding_20230717115451.py | 22 -- ...pping_patching_embedding_20230717115505.py | 22 -- ...pping_patching_embedding_20230717115509.py | 22 -- ...pping_patching_embedding_20230717115518.py | 22 -- ...mix_transformer_backbone_20230717113450.py | 0 ...mix_transformer_backbone_20230717113606.py | 158 ---------- ...mix_transformer_backbone_20230717113615.py | 158 ---------- ...mix_transformer_backbone_20230717113618.py | 156 ---------- ...mix_transformer_backbone_20230717115045.py | 146 --------- ...mix_transformer_backbone_20230717115058.py | 144 --------- ...sformer_backbone_presets_20230717113721.py | 0 ...sformer_backbone_presets_20230717113912.py | 282 ------------------ ...sformer_backbone_presets_20230717114045.py | 126 -------- ...sformer_backbone_presets_20230717114112.py | 126 -------- ...sformer_backbone_presets_20230717114332.py | 114 ------- ...sformer_backbone_presets_20230717114420.py | 114 ------- ...sformer_backbone_presets_20230717114436.py | 114 ------- 47 files changed, 1 insertion(+), 4386 deletions(-) delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115601.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115636.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115647.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115709.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115745.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115849.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115857.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115932.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717115958.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120016.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120020.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120108.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120147.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120153.py delete mode 100644 .history/keras_cv/layers/efficient_multihead_attention_20230717120213.py delete mode 100644 
.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115219.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115222.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115257.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115327.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717115542.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120252.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120400.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120409.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120411.py delete mode 100644 .history/keras_cv/layers/hierarchical_transformer_encoder_20230717120414.py delete mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115127.py delete mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115248.py delete mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115350.py delete mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115401.py delete mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115451.py delete mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115505.py delete mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115509.py delete mode 100644 .history/keras_cv/layers/overlapping_patching_embedding_20230717115518.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113450.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113606.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113615.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113618.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115045.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115058.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113721.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113912.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114045.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114112.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114332.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114420.py delete mode 100644 .history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114436.py diff --git a/.gitignore b/.gitignore index 6a59b32803..68d68189bd 100644 --- a/.gitignore +++ b/.gitignore @@ -16,3 +16,4 @@ __pycache__/ .vscode/ .devcontainer/ .coverage +.history diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115601.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115601.py deleted 
file mode 100644 index e69de29bb2..0000000000 diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115636.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115636.py deleted file mode 100644 index 11a54263ed..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717115636.py +++ /dev/null @@ -1,199 +0,0 @@ -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -class __EfficientMultiheadAttentionPT(nn.Module): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = nn.Linear(project_dim, project_dim) - self.kv = nn.Linear(project_dim, project_dim * 2) - self.proj = nn.Linear(project_dim, project_dim) - - if sr_ratio > 1: - self.sr = nn.Conv2d( - in_channels=project_dim, - out_channels=project_dim, - kernel_size=sr_ratio, - stride=sr_ratio, - padding=same_padding(sr_ratio, sr_ratio), - ) - self.norm = nn.LayerNorm(project_dim) - - def forward(self, x, H, W): - batch_size, seq_len, project_dim = x.shape - q = ( - self.q(x) - .reshape( - batch_size, - seq_len, - self.num_heads, - project_dim // self.num_heads, - ) - .permute(0, 2, 1, 3) - ) - - if self.sr_ratio > 1: - x = x.permute(0, 2, 1).reshape(batch_size, project_dim, H, W) - x = self.sr(x).reshape(batch_size, project_dim, -1).permute(0, 2, 1) - x = self.norm(x) - - k, v = ( - self.kv(x) - .reshape( - batch_size, -1, 2, self.num_heads, project_dim // self.num_heads - ) - .permute(2, 0, 3, 1, 4) - ) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - - x = (attn @ v).transpose(1, 2).reshape(batch_size, seq_len, project_dim) - x = self.proj(x) - return x - - -class __EfficientMultiheadAttentionTF(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = tf.keras.layers.Dense(project_dim) - self.k = tf.keras.layers.Dense(project_dim) - self.v = tf.keras.layers.Dense(project_dim) - self.proj = tf.keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = tf.keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = tf.keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = tf.shape(x) - - q = self.q(x) - q = tf.reshape( - q, - shape=[ - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ], - ) - - q = tf.transpose(q, [0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = tf.reshape( - tf.transpose(x, [0, 2, 1]), - shape=[input_shape[0], H, W, input_shape[2]], - ) - x = self.sr(x) - x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) - x = tf.transpose(x, [0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = tf.transpose( - tf.reshape( - k, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - v = tf.transpose( - tf.reshape( - v, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = 
self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115647.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115647.py deleted file mode 100644 index 46e20038cf..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717115647.py +++ /dev/null @@ -1,202 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class __EfficientMultiheadAttentionPT(nn.Module): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = nn.Linear(project_dim, project_dim) - self.kv = nn.Linear(project_dim, project_dim * 2) - self.proj = nn.Linear(project_dim, project_dim) - - if sr_ratio > 1: - self.sr = nn.Conv2d( - in_channels=project_dim, - out_channels=project_dim, - kernel_size=sr_ratio, - stride=sr_ratio, - padding=same_padding(sr_ratio, sr_ratio), - ) - self.norm = nn.LayerNorm(project_dim) - - def forward(self, x, H, W): - batch_size, seq_len, project_dim = x.shape - q = ( - self.q(x) - .reshape( - batch_size, - seq_len, - self.num_heads, - project_dim // self.num_heads, - ) - .permute(0, 2, 1, 3) - ) - - if self.sr_ratio > 1: - x = x.permute(0, 2, 1).reshape(batch_size, project_dim, H, W) - x = self.sr(x).reshape(batch_size, project_dim, -1).permute(0, 2, 1) - x = self.norm(x) - - k, v = ( - self.kv(x) - .reshape( - batch_size, -1, 2, self.num_heads, project_dim // self.num_heads - ) - .permute(2, 0, 3, 1, 4) - ) - - attn = (q @ k.transpose(-2, -1)) * self.scale - attn = attn.softmax(dim=-1) - - x = (attn @ v).transpose(1, 
2).reshape(batch_size, seq_len, project_dim) - x = self.proj(x) - return x - - -class __EfficientMultiheadAttentionTF(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = tf.keras.layers.Dense(project_dim) - self.k = tf.keras.layers.Dense(project_dim) - self.v = tf.keras.layers.Dense(project_dim) - self.proj = tf.keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = tf.keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = tf.keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = tf.shape(x) - - q = self.q(x) - q = tf.reshape( - q, - shape=[ - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ], - ) - - q = tf.transpose(q, [0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = tf.reshape( - tf.transpose(x, [0, 2, 1]), - shape=[input_shape[0], H, W, input_shape[2]], - ) - x = self.sr(x) - x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) - x = tf.transpose(x, [0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = tf.transpose( - tf.reshape( - k, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - v = tf.transpose( - tf.reshape( - v, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. 
Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115709.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115709.py deleted file mode 100644 index c758bc53c4..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717115709.py +++ /dev/null @@ -1,148 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class __EfficientMultiheadAttentionTF(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = tf.keras.layers.Dense(project_dim) - self.k = tf.keras.layers.Dense(project_dim) - self.v = tf.keras.layers.Dense(project_dim) - self.proj = tf.keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = tf.keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = tf.keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = tf.shape(x) - - q = self.q(x) - q = tf.reshape( - q, - shape=[ - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ], - ) - - q = tf.transpose(q, [0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = tf.reshape( - tf.transpose(x, [0, 2, 1]), - shape=[input_shape[0], H, W, input_shape[2]], - ) - x = self.sr(x) - x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) - x = tf.transpose(x, [0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = tf.transpose( - tf.reshape( - k, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - v = tf.transpose( - tf.reshape( - v, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. 
- - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115745.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115745.py deleted file mode 100644 index 2c69162376..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717115745.py +++ /dev/null @@ -1,147 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - shape=[ - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ], - ) - - q = tf.transpose(q, [0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = tf.reshape( - tf.transpose(x, [0, 2, 1]), - shape=[input_shape[0], H, W, input_shape[2]], - ) - x = self.sr(x) - x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) - x = tf.transpose(x, [0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = tf.transpose( - tf.reshape( - k, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - v = tf.transpose( - tf.reshape( - v, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": 
__EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115849.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115849.py deleted file mode 100644 index 6efe64dc32..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717115849.py +++ /dev/null @@ -1,145 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = x.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]) - x = x.reshape((input_shape[0], H, W, input_shape[2])) - x = self.sr(x) - x = tf.reshape(x, [input_shape[0], input_shape[2], -1]) - x = tf.transpose(x, [0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = tf.transpose( - tf.reshape( - k, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - v = tf.transpose( - tf.reshape( - v, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], 
- ), - [0, 2, 1, 3], - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. 
Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115857.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115857.py deleted file mode 100644 index a3f69e6631..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717115857.py +++ /dev/null @@ -1,145 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = x.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]) - x = x.reshape((input_shape[0], H, W, input_shape[2])) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = tf.transpose( - tf.reshape( - k, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - v = tf.transpose( - tf.reshape( - v, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. 
- - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115932.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115932.py deleted file mode 100644 index 2ce2c70dac..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717115932.py +++ /dev/null @@ -1,146 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = x.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) - ) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = tf.transpose( - tf.reshape( - k, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - v = tf.transpose( - tf.reshape( - v, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def 
EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717115958.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717115958.py deleted file mode 100644 index 03ae5581af..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717115958.py +++ /dev/null @@ -1,142 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = x.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) - ) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = x.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ) - - v = tf.transpose( - tf.reshape( - v, - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], - ), - [0, 2, 1, 3], - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * 
self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120016.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120016.py deleted file mode 100644 index b27f82defd..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717120016.py +++ /dev/null @@ -1,140 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = x.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) - ) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k 
= self.k(x) - v = self.v(x) - - k = x.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ) - - v = ( - x.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ), - ) - - attn = (q @ tf.transpose(k, [0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. 
Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120020.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120020.py deleted file mode 100644 index 7c21c3fcdc..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717120020.py +++ /dev/null @@ -1,140 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = x.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) - ) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = x.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ) - - v = ( - x.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ), - ) - - attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale - attn = tf.nn.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. 
- - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120108.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120108.py deleted file mode 100644 index d7a6325330..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717120108.py +++ /dev/null @@ -1,140 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = x.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) - ) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = x.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ) - - v = ( - x.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ), - ) - - attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale - attn = keras.nn.ops.softmax(attn, axis=-1) - - attn = attn @ v - attn = tf.transpose(attn, [0, 2, 1, 3]) - attn = tf.reshape( - attn, shape=[input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - 
project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120147.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120147.py deleted file mode 100644 index 93dd8d79fe..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717120147.py +++ /dev/null @@ -1,138 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = q.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) - ) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = k.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ) - v = ( - v.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ), - ) - - attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale - attn = keras.nn.ops.softmax(attn, 
axis=-1) - - attn = attn @ v - attn = attn.transpose(attn, [0, 2, 1, 3]).reshape( - [input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x - - -LAYER_BACKBONES = { - "tensorflow": __EfficientMultiheadAttentionTF, - "pytorch": __EfficientMultiheadAttentionPT, -} - - -def EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="pytorch" -): - """ - `EfficientMultiheadAttention` is a standard scaled softmax attention layer, but shortens the sequence it operates on by a reduction factor, to reduce computational cost. - The layer is meant to be used as part of the `deepvision.layers.HierarchicalTransformerEncoder` for the SegFormer architecture. - - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection for the keys, values and queries - num_heads: the number of attention heads to apply - sr_ratio: the reduction ratio for the sequence length - backend: the backend framework to use - - Basic usage: - - ``` - tensor = torch.rand(1, 196, 32) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(tensor, H=14, W=14) - - print(output.shape) # torch.Size([1, 196, 32]) - - tensor = tf.random.uniform([1, 196, 32]) - output = deepvision.layers.EfficientMultiheadAttention(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='tensorflow')(tensor, H=14, W=14) - print(output.shape) # (1, 196, 32) - ``` - - """ - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, num_heads=num_heads, sr_ratio=sr_ratio - ) - - return layer diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120153.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120153.py deleted file mode 100644 index ae333c8ae4..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717120153.py +++ /dev/null @@ -1,83 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = shape(x) - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = q.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) - ) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = k.transpose([0, 2, 1, 
3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ) - v = ( - v.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ), - ) - - attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale - attn = keras.nn.ops.softmax(attn, axis=-1) - - attn = attn @ v - attn = attn.transpose(attn, [0, 2, 1, 3]).reshape( - [input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x diff --git a/.history/keras_cv/layers/efficient_multihead_attention_20230717120213.py b/.history/keras_cv/layers/efficient_multihead_attention_20230717120213.py deleted file mode 100644 index 718a5b216b..0000000000 --- a/.history/keras_cv/layers/efficient_multihead_attention_20230717120213.py +++ /dev/null @@ -1,83 +0,0 @@ -from keras_cv.backend import keras - -""" -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - - -@keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): - def __init__(self, project_dim, num_heads, sr_ratio): - super().__init__() - self.num_heads = num_heads - self.sr_ratio = sr_ratio - self.scale = (project_dim // num_heads) ** -0.5 - self.q = keras.layers.Dense(project_dim) - self.k = keras.layers.Dense(project_dim) - self.v = keras.layers.Dense(project_dim) - self.proj = keras.layers.Dense(project_dim) - - if sr_ratio > 1: - self.sr = keras.layers.Conv2D( - filters=project_dim, - kernel_size=sr_ratio, - strides=sr_ratio, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x, H, W): - input_shape = x.shape - - q = self.q(x) - q = q.reshape( - ( - input_shape[0], - input_shape[1], - self.num_heads, - input_shape[2] // self.num_heads, - ), - ) - - q = q.transpose([0, 2, 1, 3]) - - if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) - ) - x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) - x = self.norm(x) - - k = self.k(x) - v = self.v(x) - - k = k.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ) - v = ( - v.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ), - ) - - attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale - attn = keras.nn.ops.softmax(attn, axis=-1) - - attn = attn @ v - attn = attn.transpose(attn, [0, 2, 1, 3]).reshape( - [input_shape[0], input_shape[1], input_shape[2]] - ) - x = self.proj(attn) - return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115219.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115219.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115222.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115222.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115257.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115257.py deleted file mode 100644 index 3e66db6f04..0000000000 --- a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115257.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright 2023 David Landup -# -# Licensed under the Apache License, Version 2.0 (the 
"License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import tensorflow as tf -from torch import nn - -from deepvision.layers.efficient_attention import EfficientMultiheadAttention -from deepvision.layers.mix_ffn import MixFFN -from deepvision.layers.stochasticdepth import StochasticDepth - - -class __HierarchicalTransformerEncoderPT(nn.Module): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - name=None, - ): - super().__init__() - self.norm1 = nn.LayerNorm(project_dim) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio - ) - self.drop_path = StochasticDepth(drop_prob, backend="pytorch") - self.norm2 = nn.LayerNorm(project_dim, eps=layer_norm_epsilon) - self.mlp = MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - backend="pytorch", - ) - - def forward(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) - return x - - -class __HierarchicalTransformerEncoderTF(tf.keras.layers.Layer): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.norm1 = tf.keras.layers.LayerNormalization( - epsilon=layer_norm_epsilon - ) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="tensorflow" - ) - self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") - self.norm2 = tf.keras.layers.LayerNormalization( - epsilon=layer_norm_epsilon - ) - self.mlp = MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - backend="tensorflow", - ) - - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) - return x - - -LAYER_BACKBONES = { - "tensorflow": __HierarchicalTransformerEncoderTF, - "pytorch": __HierarchicalTransformerEncoderPT, -} - - -def HierarchicalTransformerEncoder( - project_dim, - num_heads, - sr_ratio, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - backend=None, - name=None, -): - """ - TransformerEncoder variant, which uses `deepvision.layers.EfficientMultiheadAttention` in lieu of `torch.nn.MultiheadAttention` or `tf.keras.layers.MultiHeadAttention`. - `EfficientMultiheadAttention` shorten the sequence they operate on by a reduction factor, to reduce computational cost. - The `HierarchicalTransformerEncoder` is designed to encode feature maps at multiple spatial levels, similar to how CNNs encode multiple spatial levels. 
- - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - Args: - project_dim: the dimensionality of the projection of the encoder, and output of the `EfficientMultiheadAttention` - num_heads: the number of heads for the `EfficientMultiheadAttention` layer - sr_ratio: the reduction ratio to apply within the `EfficientMultiheadAttention` layer - layer_norm_epsilon: default 1e-06, the epsilon for Layer Normalization layers - drop_prob: the drop probability for the `DropPath` layers - backend: the backend framework to use - - Basic usage: - - ``` - # (B, SEQ_LEN, CHANNELS) - inp = torch.rand(1, 3136, 32) - H, W = 56 - - output = deepvision.layers.HierarchicalTransformerEncoder(project_dim=32, - num_heads=2, - sr_ratio=4, - backend='pytorch')(inp, H, W) - print(output.shape) # torch.Size([1, 3136, 32]) - ``` - """ - - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - project_dim=project_dim, - num_heads=num_heads, - sr_ratio=sr_ratio, - drop_prob=drop_prob, - layer_norm_epsilon=layer_norm_epsilon, - name=name, - ) - - return layer diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115327.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115327.py deleted file mode 100644 index 08f218aee3..0000000000 --- a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115327.py +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright 2023 David Landup -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import tensorflow as tf -from torch import nn - -from deepvision.layers.efficient_attention import EfficientMultiheadAttention -from deepvision.layers.mix_ffn import MixFFN -from deepvision.layers.stochasticdepth import StochasticDepth - - -class HierarchicalTransformerEncoder(tf.keras.layers.Layer): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.norm1 = tf.keras.layers.LayerNormalization( - epsilon=layer_norm_epsilon - ) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="tensorflow" - ) - self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") - self.norm2 = tf.keras.layers.LayerNormalization( - epsilon=layer_norm_epsilon - ) - self.mlp = MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - backend="tensorflow", - ) - - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) - return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115542.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115542.py deleted file mode 100644 index 61bd26d176..0000000000 --- a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717115542.py +++ /dev/null @@ -1,31 +0,0 @@ -from keras_cv.backend import keras - - -@keras.saving.register_keras_serializable(package="keras_cv") -class HierarchicalTransformerEncoder(keras.layers.Layer): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="tensorflow" - ) - self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") - self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.mlp = MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - backend="tensorflow", - ) - - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) - return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120252.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120252.py deleted file mode 100644 index bceb8d818a..0000000000 --- a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120252.py +++ /dev/null @@ -1,32 +0,0 @@ -from keras_cv.backend import keras -from keras_cv.layers import StochasticDepth - - -@keras.saving.register_keras_serializable(package="keras_cv") -class HierarchicalTransformerEncoder(keras.layers.Layer): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="tensorflow" - ) - self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") - self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.mlp = MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - backend="tensorflow", - ) - - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + 
self.drop_path(self.mlp(self.norm2(x), H, W)) - return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120400.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120400.py deleted file mode 100644 index ce7d623922..0000000000 --- a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120400.py +++ /dev/null @@ -1,56 +0,0 @@ -from keras_cv.backend import keras -from keras_cv.layers import StochasticDepth -from keras_cv.layers import EfficientMultiheadAttention - - -@keras.saving.register_keras_serializable(package="keras_cv") -class HierarchicalTransformerEncoder(keras.layers.Layer): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="tensorflow" - ) - self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") - self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.mlp = MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - backend="tensorflow", - ) - - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) - return x - - -class __MixFFN(keras.layers.Layer): - def __init__(self, channels, mid_channels): - super().__init__() - self.fc1 = keras.layers.Dense(mid_channels) - self.dwconv = keras.layers.DepthwiseConv2D( - kernel_size=3, - strides=1, - padding="same", - ) - self.fc2 = keras.layers.Dense(channels) - - def call(self, x, H, W): - x = self.fc1(x) - # B, DIM, C - input_shape = x.shape - x = x.reshape((input_shape[0], H, W, input_shape[-1])) - x = self.dwconv(x) - x = x.reshape((input_shape[0], -1, input_shape[-1])) - x = keras.nn.ops.gelu(x) - x = self.fc2(x) - return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120409.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120409.py deleted file mode 100644 index b9fbfa0728..0000000000 --- a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120409.py +++ /dev/null @@ -1,55 +0,0 @@ -from keras_cv.backend import keras -from keras_cv.layers import StochasticDepth -from keras_cv.layers import EfficientMultiheadAttention - - -@keras.saving.register_keras_serializable(package="keras_cv") -class HierarchicalTransformerEncoder(keras.layers.Layer): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio, backend="tensorflow" - ) - self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") - self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.mlp = __MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - ) - - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) - return x - - -class __MixFFN(keras.layers.Layer): - def __init__(self, channels, mid_channels): - super().__init__() - self.fc1 = keras.layers.Dense(mid_channels) - self.dwconv = keras.layers.DepthwiseConv2D( - kernel_size=3, - strides=1, - padding="same", - ) - self.fc2 = 
keras.layers.Dense(channels) - - def call(self, x, H, W): - x = self.fc1(x) - # B, DIM, C - input_shape = x.shape - x = x.reshape((input_shape[0], H, W, input_shape[-1])) - x = self.dwconv(x) - x = x.reshape((input_shape[0], -1, input_shape[-1])) - x = keras.nn.ops.gelu(x) - x = self.fc2(x) - return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120411.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120411.py deleted file mode 100644 index 20b3b0dee7..0000000000 --- a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120411.py +++ /dev/null @@ -1,55 +0,0 @@ -from keras_cv.backend import keras -from keras_cv.layers import StochasticDepth -from keras_cv.layers import EfficientMultiheadAttention - - -@keras.saving.register_keras_serializable(package="keras_cv") -class HierarchicalTransformerEncoder(keras.layers.Layer): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio - ) - self.drop_path = StochasticDepth(drop_prob, backend="tensorflow") - self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.mlp = __MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - ) - - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) - return x - - -class __MixFFN(keras.layers.Layer): - def __init__(self, channels, mid_channels): - super().__init__() - self.fc1 = keras.layers.Dense(mid_channels) - self.dwconv = keras.layers.DepthwiseConv2D( - kernel_size=3, - strides=1, - padding="same", - ) - self.fc2 = keras.layers.Dense(channels) - - def call(self, x, H, W): - x = self.fc1(x) - # B, DIM, C - input_shape = x.shape - x = x.reshape((input_shape[0], H, W, input_shape[-1])) - x = self.dwconv(x) - x = x.reshape((input_shape[0], -1, input_shape[-1])) - x = keras.nn.ops.gelu(x) - x = self.fc2(x) - return x diff --git a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120414.py b/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120414.py deleted file mode 100644 index 846f0146d6..0000000000 --- a/.history/keras_cv/layers/hierarchical_transformer_encoder_20230717120414.py +++ /dev/null @@ -1,55 +0,0 @@ -from keras_cv.backend import keras -from keras_cv.layers import StochasticDepth -from keras_cv.layers import EfficientMultiheadAttention - - -@keras.saving.register_keras_serializable(package="keras_cv") -class HierarchicalTransformerEncoder(keras.layers.Layer): - def __init__( - self, - project_dim, - num_heads, - sr_ratio=1, - drop_prob=0.0, - layer_norm_epsilon=1e-6, - **kwargs, - ): - super().__init__(**kwargs) - self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.attn = EfficientMultiheadAttention( - project_dim, num_heads, sr_ratio - ) - self.drop_path = StochasticDepth(drop_prob) - self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.mlp = __MixFFN( - channels=project_dim, - mid_channels=int(project_dim * 4), - ) - - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) - return x - - -class __MixFFN(keras.layers.Layer): - def __init__(self, channels, mid_channels): - super().__init__() - 
self.fc1 = keras.layers.Dense(mid_channels) - self.dwconv = keras.layers.DepthwiseConv2D( - kernel_size=3, - strides=1, - padding="same", - ) - self.fc2 = keras.layers.Dense(channels) - - def call(self, x, H, W): - x = self.fc1(x) - # B, DIM, C - input_shape = x.shape - x = x.reshape((input_shape[0], H, W, input_shape[-1])) - x = self.dwconv(x) - x = x.reshape((input_shape[0], -1, input_shape[-1])) - x = keras.nn.ops.gelu(x) - x = self.fc2(x) - return x diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115127.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115127.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115248.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115248.py deleted file mode 100644 index e88c3ba844..0000000000 --- a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115248.py +++ /dev/null @@ -1,133 +0,0 @@ -# Copyright 2023 David Landup -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - - -import tensorflow as tf -from torch import nn - - -class __OverlappingPatchingAndEmbeddingPT(nn.Module): - def __init__( - self, - in_channels=3, - out_channels=32, - patch_size=7, - stride=4, - name=None, - ): - super().__init__() - self.proj = nn.Conv2d( - in_channels=in_channels, - out_channels=out_channels, - kernel_size=patch_size, - stride=stride, - padding=patch_size // 2, - ) - self.norm = nn.LayerNorm(out_channels) - - def forward(self, x): - x = self.proj(x) - _, _, H, W = x.shape - x = x.flatten(2).transpose(1, 2) - x = self.norm(x) - return x, H, W - - -class __OverlappingPatchingAndEmbeddingTF(tf.keras.layers.Layer): - def __init__( - self, in_channels=3, out_channels=32, patch_size=7, stride=4, **kwargs - ): - super().__init__(**kwargs) - self.proj = tf.keras.layers.Conv2D( - filters=out_channels, - kernel_size=patch_size, - strides=stride, - padding="same", - ) - self.norm = tf.keras.layers.LayerNormalization() - - def call(self, x): - x = self.proj(x) - # B, H, W, C - shape = tf.shape(x) - x = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]]) - x = self.norm(x) - return x, shape[1], shape[2] - - -LAYER_BACKBONES = { - "tensorflow": __OverlappingPatchingAndEmbeddingTF, - "pytorch": __OverlappingPatchingAndEmbeddingPT, -} - - -def OverlappingPatchingAndEmbedding( - in_channels=3, - out_channels=32, - patch_size=7, - stride=4, - backend=None, - name=None, -): - """ - ViT-inspired PatchingAndEmbedding, modified to merge overlapping patches for the SegFormer architecture. 
- - Reference: - - ["SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers"](https://arxiv.org/pdf/2105.15203v2.pdf) - - - Args: - in_channels: the number of channels in the input tensor - out_channels: the projection dimensionality - patch_size: the patch size/kernel size to apply in the convolutional layer used to patchify - stride: the stride to apply in the convolutional layer used to patchify - backend: the backend framework to use - - Basic usage: - - ``` - inp = torch.rand(1, 3, 224, 224) - output, H, W = deepvision.layers.OverlappingPatchingAndEmbedding(in_channels=3, - out_channels=64, - patch_size=7, - stride=4, - backend='pytorch')(inp) - print(output.shape) # torch.Size([1, 3136, 64]) - - - inp = tf.random.uniform(1, 224, 224, 3) - output, H, W = deepvision.layers.OverlappingPatchingAndEmbedding(in_channels=3, - out_channels=64, - patch_size=7, - stride=4, - backend='tensorflow')(inp) - print(output.shape) # (1, 3136, 64) - ``` - """ - - layer_class = LAYER_BACKBONES.get(backend) - if layer_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {LAYER_BACKBONES.keys()}" - ) - - layer = layer_class( - in_channels=in_channels, - out_channels=out_channels, - patch_size=patch_size, - stride=stride, - name=name, - ) - - return layer diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115350.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115350.py deleted file mode 100644 index 3fda36f09a..0000000000 --- a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115350.py +++ /dev/null @@ -1,24 +0,0 @@ -import tensorflow as tf -from torch import nn - - -class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): - def __init__( - self, in_channels=3, out_channels=32, patch_size=7, stride=4, **kwargs - ): - super().__init__(**kwargs) - self.proj = tf.keras.layers.Conv2D( - filters=out_channels, - kernel_size=patch_size, - strides=stride, - padding="same", - ) - self.norm = tf.keras.layers.LayerNormalization() - - def call(self, x): - x = self.proj(x) - # B, H, W, C - shape = tf.shape(x) - x = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]]) - x = self.norm(x) - return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115401.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115401.py deleted file mode 100644 index 19bffa8b66..0000000000 --- a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115401.py +++ /dev/null @@ -1,22 +0,0 @@ -import tensorflow as tf -from torch import nn - - -class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): - def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): - super().__init__(**kwargs) - self.proj = tf.keras.layers.Conv2D( - filters=out_channels, - kernel_size=patch_size, - strides=stride, - padding="same", - ) - self.norm = tf.keras.layers.LayerNormalization() - - def call(self, x): - x = self.proj(x) - # B, H, W, C - shape = tf.shape(x) - x = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]]) - x = self.norm(x) - return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115451.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115451.py deleted file mode 100644 index 6036d169b5..0000000000 --- a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115451.py +++ /dev/null @@ -1,22 +0,0 @@ -from keras_cv.backend import keras - - 
-@keras.saving.register_keras_serializable(package="keras_cv") -class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): - def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): - super().__init__(**kwargs) - self.proj = tf.keras.layers.Conv2D( - filters=out_channels, - kernel_size=patch_size, - strides=stride, - padding="same", - ) - self.norm = tf.keras.layers.LayerNormalization() - - def call(self, x): - x = self.proj(x) - # B, H, W, C - shape = tf.shape(x) - x = tf.reshape(x, [-1, shape[1] * shape[2], shape[3]]) - x = self.norm(x) - return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115505.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115505.py deleted file mode 100644 index ab04114697..0000000000 --- a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115505.py +++ /dev/null @@ -1,22 +0,0 @@ -from keras_cv.backend import keras - - -@keras.saving.register_keras_serializable(package="keras_cv") -class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): - def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): - super().__init__(**kwargs) - self.proj = keras.layers.Conv2D( - filters=out_channels, - kernel_size=patch_size, - strides=stride, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x): - x = self.proj(x) - # B, H, W, C - shape = x.shape - x = x.reshape([-1, shape[1] * shape[2], shape[3]]) - x = self.norm(x) - return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115509.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115509.py deleted file mode 100644 index 622f2b6ba7..0000000000 --- a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115509.py +++ /dev/null @@ -1,22 +0,0 @@ -from keras_cv.backend import keras - - -@keras.saving.register_keras_serializable(package="keras_cv") -class OverlappingPatchingAndEmbedding(tf.keras.layers.Layer): - def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): - super().__init__(**kwargs) - self.proj = keras.layers.Conv2D( - filters=out_channels, - kernel_size=patch_size, - strides=stride, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x): - x = self.proj(x) - # B, H, W, C - shape = x.shape - x = x.reshape((-1, shape[1] * shape[2], shape[3])) - x = self.norm(x) - return x, shape[1], shape[2] diff --git a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115518.py b/.history/keras_cv/layers/overlapping_patching_embedding_20230717115518.py deleted file mode 100644 index 3e3e5daaef..0000000000 --- a/.history/keras_cv/layers/overlapping_patching_embedding_20230717115518.py +++ /dev/null @@ -1,22 +0,0 @@ -from keras_cv.backend import keras - - -@keras.saving.register_keras_serializable(package="keras_cv") -class OverlappingPatchingAndEmbedding(keras.layers.Layer): - def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): - super().__init__(**kwargs) - self.proj = keras.layers.Conv2D( - filters=out_channels, - kernel_size=patch_size, - strides=stride, - padding="same", - ) - self.norm = keras.layers.LayerNormalization() - - def call(self, x): - x = self.proj(x) - # B, H, W, C - shape = x.shape - x = x.reshape((-1, shape[1] * shape[2], shape[3])) - x = self.norm(x) - return x, shape[1], shape[2] diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113450.py 
b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113450.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113606.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113606.py deleted file mode 100644 index d741da96ed..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113606.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""MobileNet v3 backbone model. - -References: - - [Searching for MobileNetV3](https://arxiv.org/pdf/1905.02244.pdf) - (ICCV 2019) - - [Based on the original keras.applications MobileNetv3](https://github.com/keras-team/keras/blob/master/keras/applications/mobilenet_v3.py) -""" # noqa: E501 - -import tensorflow as tf -from tensorflow.keras import layers - - -class MiT(tf.keras.models.Model): - def __init__( - self, - input_shape=None, - input_tensor=None, - classes=None, - include_top=None, - embed_dims=None, - depths=None, - as_backbone=None, - pooling=None, - **kwargs, - ): - if include_top and not classes: - raise ValueError( - "If `include_top` is True, you should specify `classes`. " - f"Received: classes={classes}" - ) - - if include_top and pooling: - raise ValueError( - f"`pooling` must be `None` when `include_top=True`." - f"Received pooling={pooling} and include_top={include_top}. " - ) - - if include_top and as_backbone: - raise ValueError( - f"`as_backbone` must be `False` when `include_top=True`." - f"Received as_backbone={as_backbone} and include_top={include_top}. " - ) - - if as_backbone and classes: - raise ValueError( - f"`as_backbone` must be `False` when `classes` are set." - f"Received as_backbone={as_backbone} and classes={classes}. 
" - ) - - drop_path_rate = 0.1 - dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] - blockwise_num_heads = [1, 2, 5, 8] - blockwise_sr_ratios = [8, 4, 2, 1] - num_stages = 4 - - cur = 0 - patch_embedding_layers = [] - transformer_blocks = [] - layer_norms = [] - - for i in range(num_stages): - patch_embed_layer = OverlappingPatchingAndEmbedding( - in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], - out_channels=embed_dims[0] if i == 0 else embed_dims[i], - patch_size=7 if i == 0 else 3, - stride=4 if i == 0 else 2, - backend="tensorflow", - name=f"patch_and_embed_{i}", - ) - patch_embedding_layers.append(patch_embed_layer) - - transformer_block = [ - HierarchicalTransformerEncoder( - project_dim=embed_dims[i], - num_heads=blockwise_num_heads[i], - sr_ratio=blockwise_sr_ratios[i], - drop_prob=dpr[cur + k], - backend="tensorflow", - name=f"hierarchical_encoder_{i}_{k}", - ) - for k in range(depths[i]) - ] - transformer_blocks.append(transformer_block) - cur += depths[i] - layer_norms.append(layers.LayerNormalization()) - - inputs = parse_model_inputs("tensorflow", input_shape, input_tensor) - x = inputs - - B = tf.shape(x)[0] - outputs = [] - for i in range(num_stages): - x, H, W = patch_embedding_layers[i](x) - for blk in transformer_blocks[i]: - x = blk(x, H, W) - x = layer_norms[i](x) - C = tf.shape(x)[-1] - x = tf.reshape(x, [B, H, W, C]) - outputs.append(x) - - if include_top: - output = layers.GlobalAveragePooling2D(name="avg_pool")(x) - output = layers.Dense( - classes, activation="softmax", name="predictions" - )(output) - elif as_backbone: - output = outputs - else: - if pooling == "avg": - output = layers.GlobalAveragePooling2D(name="avg_pool")(x) - elif pooling == "max": - output = layers.GlobalMaxPooling2D(name="max_pool")(x) - - super().__init__( - inputs=inputs, - outputs=output, - **kwargs, - ) - - self.channels = embed_dims - self.num_stages = num_stages - self.output_channels = embed_dims - self.classes = classes - self.include_top = include_top - self.as_backbone = as_backbone - self.pooling = pooling - - self.patch_embedding_layers = [] - self.transformer_blocks = [] - - def get_config(self): - config = super().get_config() - config.update( - { - "channels": self.channels, - "num_stages": self.num_stages, - "output_channels": self.output_channels, - "classes": self.classes, - "include_top": self.include_top, - "as_backbone": self.as_backbone, - "pooling": self.pooling, - } - ) - return config diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113615.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113615.py deleted file mode 100644 index 01c71257fd..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113615.py +++ /dev/null @@ -1,158 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""MiT backbone model. 
- -References: - - [Searching for MobileNetV3](https://arxiv.org/pdf/1905.02244.pdf) - (ICCV 2019) - - [Based on the original keras.applications MobileNetv3](https://github.com/keras-team/keras/blob/master/keras/applications/mobilenet_v3.py) -""" # noqa: E501 - -import tensorflow as tf -from tensorflow.keras import layers - - -class MiT(tf.keras.models.Model): - def __init__( - self, - input_shape=None, - input_tensor=None, - classes=None, - include_top=None, - embed_dims=None, - depths=None, - as_backbone=None, - pooling=None, - **kwargs, - ): - if include_top and not classes: - raise ValueError( - "If `include_top` is True, you should specify `classes`. " - f"Received: classes={classes}" - ) - - if include_top and pooling: - raise ValueError( - f"`pooling` must be `None` when `include_top=True`." - f"Received pooling={pooling} and include_top={include_top}. " - ) - - if include_top and as_backbone: - raise ValueError( - f"`as_backbone` must be `False` when `include_top=True`." - f"Received as_backbone={as_backbone} and include_top={include_top}. " - ) - - if as_backbone and classes: - raise ValueError( - f"`as_backbone` must be `False` when `classes` are set." - f"Received as_backbone={as_backbone} and classes={classes}. " - ) - - drop_path_rate = 0.1 - dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] - blockwise_num_heads = [1, 2, 5, 8] - blockwise_sr_ratios = [8, 4, 2, 1] - num_stages = 4 - - cur = 0 - patch_embedding_layers = [] - transformer_blocks = [] - layer_norms = [] - - for i in range(num_stages): - patch_embed_layer = OverlappingPatchingAndEmbedding( - in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], - out_channels=embed_dims[0] if i == 0 else embed_dims[i], - patch_size=7 if i == 0 else 3, - stride=4 if i == 0 else 2, - backend="tensorflow", - name=f"patch_and_embed_{i}", - ) - patch_embedding_layers.append(patch_embed_layer) - - transformer_block = [ - HierarchicalTransformerEncoder( - project_dim=embed_dims[i], - num_heads=blockwise_num_heads[i], - sr_ratio=blockwise_sr_ratios[i], - drop_prob=dpr[cur + k], - backend="tensorflow", - name=f"hierarchical_encoder_{i}_{k}", - ) - for k in range(depths[i]) - ] - transformer_blocks.append(transformer_block) - cur += depths[i] - layer_norms.append(layers.LayerNormalization()) - - inputs = parse_model_inputs("tensorflow", input_shape, input_tensor) - x = inputs - - B = tf.shape(x)[0] - outputs = [] - for i in range(num_stages): - x, H, W = patch_embedding_layers[i](x) - for blk in transformer_blocks[i]: - x = blk(x, H, W) - x = layer_norms[i](x) - C = tf.shape(x)[-1] - x = tf.reshape(x, [B, H, W, C]) - outputs.append(x) - - if include_top: - output = layers.GlobalAveragePooling2D(name="avg_pool")(x) - output = layers.Dense( - classes, activation="softmax", name="predictions" - )(output) - elif as_backbone: - output = outputs - else: - if pooling == "avg": - output = layers.GlobalAveragePooling2D(name="avg_pool")(x) - elif pooling == "max": - output = layers.GlobalMaxPooling2D(name="max_pool")(x) - - super().__init__( - inputs=inputs, - outputs=output, - **kwargs, - ) - - self.channels = embed_dims - self.num_stages = num_stages - self.output_channels = embed_dims - self.classes = classes - self.include_top = include_top - self.as_backbone = as_backbone - self.pooling = pooling - - self.patch_embedding_layers = [] - self.transformer_blocks = [] - - def get_config(self): - config = super().get_config() - config.update( - { - "channels": self.channels, - "num_stages": self.num_stages, - 
"output_channels": self.output_channels, - "classes": self.classes, - "include_top": self.include_top, - "as_backbone": self.as_backbone, - "pooling": self.pooling, - } - ) - return config diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113618.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113618.py deleted file mode 100644 index 6a8f44bd77..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717113618.py +++ /dev/null @@ -1,156 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""MiT backbone model. - -References: - -""" # noqa: E501 - -import tensorflow as tf -from tensorflow.keras import layers - - -class MiT(tf.keras.models.Model): - def __init__( - self, - input_shape=None, - input_tensor=None, - classes=None, - include_top=None, - embed_dims=None, - depths=None, - as_backbone=None, - pooling=None, - **kwargs, - ): - if include_top and not classes: - raise ValueError( - "If `include_top` is True, you should specify `classes`. " - f"Received: classes={classes}" - ) - - if include_top and pooling: - raise ValueError( - f"`pooling` must be `None` when `include_top=True`." - f"Received pooling={pooling} and include_top={include_top}. " - ) - - if include_top and as_backbone: - raise ValueError( - f"`as_backbone` must be `False` when `include_top=True`." - f"Received as_backbone={as_backbone} and include_top={include_top}. " - ) - - if as_backbone and classes: - raise ValueError( - f"`as_backbone` must be `False` when `classes` are set." - f"Received as_backbone={as_backbone} and classes={classes}. 
" - ) - - drop_path_rate = 0.1 - dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] - blockwise_num_heads = [1, 2, 5, 8] - blockwise_sr_ratios = [8, 4, 2, 1] - num_stages = 4 - - cur = 0 - patch_embedding_layers = [] - transformer_blocks = [] - layer_norms = [] - - for i in range(num_stages): - patch_embed_layer = OverlappingPatchingAndEmbedding( - in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], - out_channels=embed_dims[0] if i == 0 else embed_dims[i], - patch_size=7 if i == 0 else 3, - stride=4 if i == 0 else 2, - backend="tensorflow", - name=f"patch_and_embed_{i}", - ) - patch_embedding_layers.append(patch_embed_layer) - - transformer_block = [ - HierarchicalTransformerEncoder( - project_dim=embed_dims[i], - num_heads=blockwise_num_heads[i], - sr_ratio=blockwise_sr_ratios[i], - drop_prob=dpr[cur + k], - backend="tensorflow", - name=f"hierarchical_encoder_{i}_{k}", - ) - for k in range(depths[i]) - ] - transformer_blocks.append(transformer_block) - cur += depths[i] - layer_norms.append(layers.LayerNormalization()) - - inputs = parse_model_inputs("tensorflow", input_shape, input_tensor) - x = inputs - - B = tf.shape(x)[0] - outputs = [] - for i in range(num_stages): - x, H, W = patch_embedding_layers[i](x) - for blk in transformer_blocks[i]: - x = blk(x, H, W) - x = layer_norms[i](x) - C = tf.shape(x)[-1] - x = tf.reshape(x, [B, H, W, C]) - outputs.append(x) - - if include_top: - output = layers.GlobalAveragePooling2D(name="avg_pool")(x) - output = layers.Dense( - classes, activation="softmax", name="predictions" - )(output) - elif as_backbone: - output = outputs - else: - if pooling == "avg": - output = layers.GlobalAveragePooling2D(name="avg_pool")(x) - elif pooling == "max": - output = layers.GlobalMaxPooling2D(name="max_pool")(x) - - super().__init__( - inputs=inputs, - outputs=output, - **kwargs, - ) - - self.channels = embed_dims - self.num_stages = num_stages - self.output_channels = embed_dims - self.classes = classes - self.include_top = include_top - self.as_backbone = as_backbone - self.pooling = pooling - - self.patch_embedding_layers = [] - self.transformer_blocks = [] - - def get_config(self): - config = super().get_config() - config.update( - { - "channels": self.channels, - "num_stages": self.num_stages, - "output_channels": self.output_channels, - "classes": self.classes, - "include_top": self.include_top, - "as_backbone": self.as_backbone, - "pooling": self.pooling, - } - ) - return config diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115045.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115045.py deleted file mode 100644 index e770d5db34..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115045.py +++ /dev/null @@ -1,146 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""MiT backbone model. 
- -References: - -""" # noqa: E501 - -from keras_cv import layers as cv_layers -from keras_cv.backend import keras -from keras_cv.models import utils -from keras_cv.models.backbones.backbone import Backbone -from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 - backbone_presets, -) -from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 - backbone_presets_with_weights, -) -from keras_cv.utils.python_utils import classproperty - - -@keras.saving.register_keras_serializable(package="keras_cv.models") -class MiT(Backbone): - def __init__( - self, - input_shape=None, - input_tensor=None, - classes=None, - include_top=None, - embed_dims=None, - depths=None, - pooling=None, - **kwargs, - ): - if include_top and not classes: - raise ValueError( - "If `include_top` is True, you should specify `classes`. " - f"Received: classes={classes}" - ) - - if include_top and pooling: - raise ValueError( - f"`pooling` must be `None` when `include_top=True`." - f"Received pooling={pooling} and include_top={include_top}. " - ) - - if include_top and as_backbone: - raise ValueError( - f"`as_backbone` must be `False` when `include_top=True`." - f"Received as_backbone={as_backbone} and include_top={include_top}. " - ) - - drop_path_rate = 0.1 - dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] - blockwise_num_heads = [1, 2, 5, 8] - blockwise_sr_ratios = [8, 4, 2, 1] - num_stages = 4 - - cur = 0 - patch_embedding_layers = [] - transformer_blocks = [] - layer_norms = [] - - for i in range(num_stages): - patch_embed_layer = OverlappingPatchingAndEmbedding( - in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], - out_channels=embed_dims[0] if i == 0 else embed_dims[i], - patch_size=7 if i == 0 else 3, - stride=4 if i == 0 else 2, - backend="tensorflow", - name=f"patch_and_embed_{i}", - ) - patch_embedding_layers.append(patch_embed_layer) - - transformer_block = [ - HierarchicalTransformerEncoder( - project_dim=embed_dims[i], - num_heads=blockwise_num_heads[i], - sr_ratio=blockwise_sr_ratios[i], - drop_prob=dpr[cur + k], - backend="tensorflow", - name=f"hierarchical_encoder_{i}_{k}", - ) - for k in range(depths[i]) - ] - transformer_blocks.append(transformer_block) - cur += depths[i] - layer_norms.append(keras.layers.LayerNormalization()) - - inputs = utils.parse_model_inputs(input_shape, input_tensor) - x = inputs - - batch_size = x.shape[0] - pyramid_level_inputs = [] - for i in range(num_stages): - x, H, W = patch_embedding_layers[i](x) - for blk in transformer_blocks[i]: - x = blk(x, H, W) - x = layer_norms[i](x) - C = x.shape[-1] - x = x.reshape((batch_size, H, W, C)) - pyramid_level_inputs.append(x) - - super().__init__( - inputs=inputs, - outputs=x, - **kwargs, - ) - - self.channels = embed_dims - self.num_stages = num_stages - self.output_channels = embed_dims - self.classes = classes - self.include_top = include_top - self.pyramid_level_inputs = pyramid_level_inputs - self.pooling = pooling - - self.patch_embedding_layers = [] - self.transformer_blocks = [] - - def get_config(self): - config = super().get_config() - config.update( - { - "channels": self.channels, - "num_stages": self.num_stages, - "output_channels": self.output_channels, - "classes": self.classes, - "include_top": self.include_top, - "as_backbone": self.as_backbone, - "pooling": self.pooling, - } - ) - return config diff --git 
a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115058.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115058.py deleted file mode 100644 index 77fd559970..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_20230717115058.py +++ /dev/null @@ -1,144 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""MiT backbone model. - -References: - -""" # noqa: E501 - -from keras_cv import layers as cv_layers -from keras_cv.backend import keras -from keras_cv.models import utils -from keras_cv.models.backbones.backbone import Backbone -from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 - backbone_presets, -) -from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 - backbone_presets_with_weights, -) -from keras_cv.utils.python_utils import classproperty - - -@keras.saving.register_keras_serializable(package="keras_cv.models") -class MiT(Backbone): - def __init__( - self, - input_shape=None, - input_tensor=None, - classes=None, - include_top=None, - embed_dims=None, - depths=None, - pooling=None, - **kwargs, - ): - if include_top and not classes: - raise ValueError( - "If `include_top` is True, you should specify `classes`. " - f"Received: classes={classes}" - ) - - if include_top and pooling: - raise ValueError( - f"`pooling` must be `None` when `include_top=True`." - f"Received pooling={pooling} and include_top={include_top}. " - ) - - if include_top and as_backbone: - raise ValueError( - f"`as_backbone` must be `False` when `include_top=True`." - f"Received as_backbone={as_backbone} and include_top={include_top}. 
" - ) - - drop_path_rate = 0.1 - dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] - blockwise_num_heads = [1, 2, 5, 8] - blockwise_sr_ratios = [8, 4, 2, 1] - num_stages = 4 - - cur = 0 - patch_embedding_layers = [] - transformer_blocks = [] - layer_norms = [] - - for i in range(num_stages): - patch_embed_layer = OverlappingPatchingAndEmbedding( - in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], - out_channels=embed_dims[0] if i == 0 else embed_dims[i], - patch_size=7 if i == 0 else 3, - stride=4 if i == 0 else 2, - name=f"patch_and_embed_{i}", - ) - patch_embedding_layers.append(patch_embed_layer) - - transformer_block = [ - HierarchicalTransformerEncoder( - project_dim=embed_dims[i], - num_heads=blockwise_num_heads[i], - sr_ratio=blockwise_sr_ratios[i], - drop_prob=dpr[cur + k], - name=f"hierarchical_encoder_{i}_{k}", - ) - for k in range(depths[i]) - ] - transformer_blocks.append(transformer_block) - cur += depths[i] - layer_norms.append(keras.layers.LayerNormalization()) - - inputs = utils.parse_model_inputs(input_shape, input_tensor) - x = inputs - - batch_size = x.shape[0] - pyramid_level_inputs = [] - for i in range(num_stages): - x, H, W = patch_embedding_layers[i](x) - for blk in transformer_blocks[i]: - x = blk(x, H, W) - x = layer_norms[i](x) - C = x.shape[-1] - x = x.reshape((batch_size, H, W, C)) - pyramid_level_inputs.append(x) - - super().__init__( - inputs=inputs, - outputs=x, - **kwargs, - ) - - self.channels = embed_dims - self.num_stages = num_stages - self.output_channels = embed_dims - self.classes = classes - self.include_top = include_top - self.pyramid_level_inputs = pyramid_level_inputs - self.pooling = pooling - - self.patch_embedding_layers = [] - self.transformer_blocks = [] - - def get_config(self): - config = super().get_config() - config.update( - { - "channels": self.channels, - "num_stages": self.num_stages, - "output_channels": self.output_channels, - "classes": self.classes, - "include_top": self.include_top, - "as_backbone": self.as_backbone, - "pooling": self.pooling, - } - ) - return config diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113721.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113721.py deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113912.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113912.py deleted file mode 100644 index 0f21b0b687..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717113912.py +++ /dev/null @@ -1,282 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-"""MobileNetV3 model preset configurations.""" - -backbone_presets_no_weights = { - "mobilenet_v3_small": { - "metadata": { - "description": ( - "MobileNetV3 model with 14 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 933502, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "stackwise_expansion": [ - 1, - 72.0 / 16, - 88.0 / 24, - 4, - 6, - 6, - 3, - 3, - 6, - 6, - 6, - ], - "stackwise_filters": [16, 24, 24, 40, 40, 40, 48, 48, 96, 96, 96], - "stackwise_kernel_size": [3, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5], - "stackwise_stride": [2, 2, 1, 2, 1, 1, 1, 1, 2, 1, 1], - "stackwise_se_ratio": [ - 0.25, - None, - None, - 0.25, - 0.25, - 0.25, - 0.25, - 0.25, - 0.25, - 0.25, - 0.25, - ], - "stackwise_activation": [ - "relu", - "relu", - "relu", - "hard_swish", - "hard_swish", - "hard_swish", - "hard_swish", - "hard_swish", - "hard_swish", - "hard_swish", - "hard_swish", - ], - "include_rescaling": True, - "input_shape": (None, None, 3), - "input_tensor": None, - "alpha": 1.0, - }, - }, - "mobilenet_v3_large": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [32, 64, 160, 256], - "depths": [2, 2, 2, 2], - }, - }, -} - -backbone_presets_with_weights = {} - -backbone_presets = { - **backbone_presets_no_weights, - **backbone_presets_with_weights, -} - - -MODEL_CONFIGS = { - "B0": {"embedding_dims": [32, 64, 160, 256], "depths": [2, 2, 2, 2]}, - "B1": {"embedding_dims": [64, 128, 320, 512], "depths": [2, 2, 2, 2]}, - "B2": {"embedding_dims": [64, 128, 320, 512], "depths": [3, 4, 6, 3]}, - "B3": {"embedding_dims": [64, 128, 320, 512], "depths": [3, 4, 18, 3]}, - "B4": {"embedding_dims": [64, 128, 320, 512], "depths": [3, 8, 27, 3]}, - "B5": {"embedding_dims": [64, 128, 320, 512], "depths": [3, 6, 40, 3]}, -} - -MODEL_BACKBONES = {"tensorflow": __MiTTF, "pytorch": __MiTPT} - - -def MiTB0( - backend, - include_top, - classes=None, - input_shape=(None, None, 3), - input_tensor=None, - pooling=None, - as_backbone=False, - **kwargs, -): - model_class = MODEL_BACKBONES.get(backend) - if model_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" - ) - return model_class( - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - embed_dims=MODEL_CONFIGS["B0"]["embedding_dims"], - depths=MODEL_CONFIGS["B0"]["depths"], - classes=classes, - include_top=include_top, - as_backbone=as_backbone, - **kwargs, - ) - - -def MiTB1( - backend, - include_top, - classes=None, - input_shape=(None, None, 3), - input_tensor=None, - pooling=None, - as_backbone=False, - **kwargs, -): - model_class = MODEL_BACKBONES.get(backend) - if model_class is None: - raise ValueError( - f"Backend not supported: {backend}. 
Supported backbones are {MODEL_BACKBONES.keys()}" - ) - return model_class( - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - embed_dims=MODEL_CONFIGS["B1"]["embedding_dims"], - depths=MODEL_CONFIGS["B1"]["depths"], - classes=classes, - include_top=include_top, - as_backbone=as_backbone, - **kwargs, - ) - - -def MiTB2( - backend, - include_top, - classes=None, - input_shape=(None, None, 3), - input_tensor=None, - pooling=None, - as_backbone=False, - **kwargs, -): - model_class = MODEL_BACKBONES.get(backend) - if model_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" - ) - return model_class( - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - embed_dims=MODEL_CONFIGS["B2"]["embedding_dims"], - depths=MODEL_CONFIGS["B2"]["depths"], - classes=classes, - include_top=include_top, - as_backbone=as_backbone, - **kwargs, - ) - - -def MiTB3( - backend, - include_top, - classes=None, - input_shape=(None, None, 3), - input_tensor=None, - pooling=None, - as_backbone=False, - **kwargs, -): - model_class = MODEL_BACKBONES.get(backend) - if model_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" - ) - return model_class( - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - embed_dims=MODEL_CONFIGS["B3"]["embedding_dims"], - depths=MODEL_CONFIGS["B3"]["depths"], - classes=classes, - include_top=include_top, - as_backbone=as_backbone, - **kwargs, - ) - - -def MiTB4( - backend, - include_top, - classes=None, - input_shape=(None, None, 3), - input_tensor=None, - pooling=None, - as_backbone=False, - **kwargs, -): - model_class = MODEL_BACKBONES.get(backend) - if model_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" - ) - return model_class( - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - embed_dims=MODEL_CONFIGS["B4"]["embedding_dims"], - depths=MODEL_CONFIGS["B4"]["depths"], - classes=classes, - include_top=include_top, - as_backbone=as_backbone, - **kwargs, - ) - - -def MiTB5( - backend, - include_top, - classes=None, - input_shape=(None, None, 3), - input_tensor=None, - pooling=None, - as_backbone=False, - **kwargs, -): - model_class = MODEL_BACKBONES.get(backend) - if model_class is None: - raise ValueError( - f"Backend not supported: {backend}. Supported backbones are {MODEL_BACKBONES.keys()}" - ) - return model_class( - input_shape=input_shape, - input_tensor=input_tensor, - pooling=pooling, - embed_dims=MODEL_CONFIGS["B5"]["embedding_dims"], - depths=MODEL_CONFIGS["B5"]["depths"], - classes=classes, - include_top=include_top, - as_backbone=as_backbone, - **kwargs, - ) diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114045.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114045.py deleted file mode 100644 index 197b217d1c..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114045.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""MobileNetV3 model preset configurations.""" - -backbone_presets_no_weights = { - "MiT_B0": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [32, 64, 160, 256], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B1": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B2": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 6, 3], - }, - }, - "MiT_B3": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 18, 3], - }, - }, - "MiT_B4": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 8, 27, 3], - }, - }, - "MiT_B5": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." 
- ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 6, 40, 3], - }, - }, -} - -backbone_presets_with_weights = {} - -backbone_presets = { - **backbone_presets_no_weights, - **backbone_presets_with_weights, -} diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114112.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114112.py deleted file mode 100644 index 197b217d1c..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114112.py +++ /dev/null @@ -1,126 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""MobileNetV3 model preset configurations.""" - -backbone_presets_no_weights = { - "MiT_B0": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [32, 64, 160, 256], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B1": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B2": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 6, 3], - }, - }, - "MiT_B3": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 18, 3], - }, - }, - "MiT_B4": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." 
- ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 8, 27, 3], - }, - }, - "MiT_B5": { - "metadata": { - "description": ( - "MobileNetV3 model with 28 layers where the batch " - "normalization and hard-swish activation are applied after the " - "convolution layers." - ), - "params": 2994518, - "official_name": "MobileNetV3", - "path": "mobilenetv3", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 6, 40, 3], - }, - }, -} - -backbone_presets_with_weights = {} - -backbone_presets = { - **backbone_presets_no_weights, - **backbone_presets_with_weights, -} diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114332.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114332.py deleted file mode 100644 index 1313c0a151..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114332.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""MiT model preset configurations.""" - -backbone_presets_no_weights = { - "MiT_B0": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 8 transformer blocks." - ), - "params": 2994518, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [32, 64, 160, 256], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B1": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 8 transformer blocks." - ), - "params": 2994518, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B2": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 16 transformer blocks." - ), - "params": 2994518, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 6, 3], - }, - }, - "MiT_B3": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 28 transformer blocks." - ), - "params": 2994518, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 18, 3], - }, - }, - "MiT_B4": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 41 transformer blocks." 
- ), - "params": 2994518, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 8, 27, 3], - }, - }, - "MiT_B5": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 52 transformer blocks." - ), - "params": 2994518, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 6, 40, 3], - }, - }, -} - -backbone_presets_with_weights = {} - -backbone_presets = { - **backbone_presets_no_weights, - **backbone_presets_with_weights, -} diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114420.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114420.py deleted file mode 100644 index 5267a09ace..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114420.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""MiT model preset configurations.""" - -backbone_presets_no_weights = { - "MiT_B0": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 8 transformer blocks." - ), - "params": 3321962, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [32, 64, 160, 256], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B1": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 8 transformer blocks." - ), - "params": 13156554, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B2": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 16 transformer blocks." - ), - "params": 24201418, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 6, 3], - }, - }, - "MiT_B3": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 28 transformer blocks." - ), - "params": 44077258, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 18, 3], - }, - }, - "MiT_B4": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 41 transformer blocks." - ), - "params": 60847818, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 8, 27, 3], - }, - }, - "MiT_B5": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 52 transformer blocks." 
- ), - "params": 81448138, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MobileNetV3Backbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 6, 40, 3], - }, - }, -} - -backbone_presets_with_weights = {} - -backbone_presets = { - **backbone_presets_no_weights, - **backbone_presets_with_weights, -} diff --git a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114436.py b/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114436.py deleted file mode 100644 index b890282a91..0000000000 --- a/.history/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_20230717114436.py +++ /dev/null @@ -1,114 +0,0 @@ -# Copyright 2023 The KerasCV Authors -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""MiT model preset configurations.""" - -backbone_presets_no_weights = { - "MiT_B0": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 8 transformer blocks." - ), - "params": 3321962, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MiTBackbone", - "config": { - "embedding_dims": [32, 64, 160, 256], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B1": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 8 transformer blocks." - ), - "params": 13156554, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MiTBackbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [2, 2, 2, 2], - }, - }, - "MiT_B2": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 16 transformer blocks." - ), - "params": 24201418, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MiTBackbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 6, 3], - }, - }, - "MiT_B3": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 28 transformer blocks." - ), - "params": 44077258, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MiTBackbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 4, 18, 3], - }, - }, - "MiT_B4": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 41 transformer blocks." - ), - "params": 60847818, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MiTBackbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 8, 27, 3], - }, - }, - "MiT_B5": { - "metadata": { - "description": ( - "MiT (MixTransformer) model with 52 transformer blocks." 
- ), - "params": 81448138, - "official_name": "MiT", - "path": "mit", - }, - "class_name": "keras_cv.models>MiTBackbone", - "config": { - "embedding_dims": [64, 128, 320, 512], - "depths": [3, 6, 40, 3], - }, - }, -} - -backbone_presets_with_weights = {} - -backbone_presets = { - **backbone_presets_no_weights, - **backbone_presets_with_weights, -} From 03470dfc2a0fb3db19f69cdb5cf8cb9622cfc275 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 17 Jul 2023 12:27:37 +0200 Subject: [PATCH 04/53] segformer head and formatting --- keras_cv/layers/__init__.py | 6 ++ .../hierarchical_transformer_encoder.py | 2 +- keras_cv/models/__init__.py | 4 ++ .../mix_transformer_backbone.py | 12 +--- keras_cv/models/segmentation/__init__.py | 1 + .../models/segmentation/segformer/__init__.py | 1 + .../segmentation/segformer/segformer.py | 61 ++++++++++++++++--- 7 files changed, 67 insertions(+), 20 deletions(-) create mode 100644 keras_cv/models/segmentation/segformer/__init__.py diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index a8a49287c9..1ccfe46c52 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -16,6 +16,9 @@ from tensorflow.keras.layers import RandomHeight from tensorflow.keras.layers import RandomWidth +from keras_cv.layers.efficient_multihead_attention import ( + EfficientMultiheadAttention, +) from keras_cv.layers.feature_pyramid import FeaturePyramid from keras_cv.layers.fusedmbconv import FusedMBConvBlock from keras_cv.layers.mbconv import MBConvBlock @@ -31,6 +34,9 @@ CenterNetLabelEncoder, ) from keras_cv.layers.object_detection_3d.voxelization import DynamicVoxelization +from keras_cv.layers.overlapping_patching_embedding import ( + OverlappingPatchingAndEmbedding, +) from keras_cv.layers.preprocessing.aug_mix import AugMix from keras_cv.layers.preprocessing.auto_contrast import AutoContrast from keras_cv.layers.preprocessing.base_image_augmentation_layer import ( diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index 846f0146d6..73b00647b0 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -1,6 +1,6 @@ from keras_cv.backend import keras -from keras_cv.layers import StochasticDepth from keras_cv.layers import EfficientMultiheadAttention +from keras_cv.layers import StochasticDepth @keras.saving.register_keras_serializable(package="keras_cv") diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index d159d42af4..686d883f1d 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -67,6 +67,9 @@ from keras_cv.models.backbones.efficientnet_v2.efficientnet_v2_aliases import ( EfficientNetV2SBackbone, ) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( + MiTBackbone, +) from keras_cv.models.backbones.mobilenet_v3.mobilenet_v3_aliases import ( MobileNetV3LargeBackbone, ) @@ -124,5 +127,6 @@ MultiHeadCenterPillar, ) from keras_cv.models.segmentation import DeepLabV3Plus +from keras_cv.models.segmentation import SegFormer from keras_cv.models.stable_diffusion import StableDiffusion from keras_cv.models.stable_diffusion import StableDiffusionV2 diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 77fd559970..57c99f127a 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ 
b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -32,7 +32,7 @@ @keras.saving.register_keras_serializable(package="keras_cv.models") -class MiT(Backbone): +class MiTBackbone(Backbone): def __init__( self, input_shape=None, @@ -56,12 +56,6 @@ def __init__( f"Received pooling={pooling} and include_top={include_top}. " ) - if include_top and as_backbone: - raise ValueError( - f"`as_backbone` must be `False` when `include_top=True`." - f"Received as_backbone={as_backbone} and include_top={include_top}. " - ) - drop_path_rate = 0.1 dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] blockwise_num_heads = [1, 2, 5, 8] @@ -74,7 +68,7 @@ def __init__( layer_norms = [] for i in range(num_stages): - patch_embed_layer = OverlappingPatchingAndEmbedding( + patch_embed_layer = cv_layers.OverlappingPatchingAndEmbedding( in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], out_channels=embed_dims[0] if i == 0 else embed_dims[i], patch_size=7 if i == 0 else 3, @@ -84,7 +78,7 @@ def __init__( patch_embedding_layers.append(patch_embed_layer) transformer_block = [ - HierarchicalTransformerEncoder( + cv_layers.HierarchicalTransformerEncoder( project_dim=embed_dims[i], num_heads=blockwise_num_heads[i], sr_ratio=blockwise_sr_ratios[i], diff --git a/keras_cv/models/segmentation/__init__.py b/keras_cv/models/segmentation/__init__.py index 122dc4191e..f25ee4ea7c 100644 --- a/keras_cv/models/segmentation/__init__.py +++ b/keras_cv/models/segmentation/__init__.py @@ -13,3 +13,4 @@ # limitations under the License. from keras_cv.models.segmentation.deeplab_v3_plus import DeepLabV3Plus +from keras_cv.models.segmentation.segformer import SegFormer diff --git a/keras_cv/models/segmentation/segformer/__init__.py b/keras_cv/models/segmentation/segformer/__init__.py new file mode 100644 index 0000000000..e76527fdde --- /dev/null +++ b/keras_cv/models/segmentation/segformer/__init__.py @@ -0,0 +1 @@ +from keras_cv.models.segmentation.segformer.segformer import SegFormer diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index e71c0a8bf7..223d37179b 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -1,10 +1,10 @@ -import tensorflow as tf +from keras_cv.backend import keras +from keras_cv.models import utils +from keras_cv.models.task import Task -from deepvision.layers.segformer_segmentation_head import SegFormerHead -from deepvision.utils.utils import parse_model_inputs - -class __SegFormerTF(tf.keras.Model): +@keras.utils.register_keras_serializable(package="keras_cv") +class SegFormer(Task): def __init__( self, num_classes=None, @@ -13,9 +13,9 @@ def __init__( input_shape=None, input_tensor=None, softmax_output=None, - **kwargs + **kwargs, ): - inputs = parse_model_inputs("tensorflow", input_shape, input_tensor) + inputs = utils.parse_model_inputs(input_shape, input_tensor) x = inputs y = backbone(x) y = SegFormerHead( @@ -23,13 +23,12 @@ def __init__( embed_dim=embed_dim, num_classes=num_classes, name="segformer_head", - backend="tensorflow", )(y) - output = tf.keras.layers.Resizing( + output = keras.layers.Resizing( height=x.shape[1], width=x.shape[2], interpolation="bilinear" )(y) if softmax_output: - output = tf.keras.layers.Activation( + output = keras.layers.Activation( "softmax", name="output_activation" )(output) @@ -42,3 +41,45 @@ def __init__( self.num_classes = num_classes self.embed_dim = embed_dim 
self.softmax_output = softmax_output + + +class SegFormerHead(keras.layers.Layer): + def __init__(self, in_dims, embed_dim=256, num_classes=19, **kwargs): + super().__init__(**kwargs) + self.linear_layers = [] + + for i in in_dims: + self.linear_layers.append( + keras.layers.Dense(embed_dim, name=f"linear_{i}") + ) + + # To fuse multiple layer outputs into a single feature map using a Conv2d + self.linear_fuse = keras.Sequential( + [ + keras.layers.Conv2D( + filters=embed_dim, kernel_size=1, use_bias=False + ), + keras.layers.BatchNormalization(), + keras.layers.Activation("relu"), + ] + ) + self.dropout = keras.layers.Dropout(0.1) + # Final segmentation output + self.seg_out = keras.layers.Conv2D(filters=num_classes, kernel_size=1) + + def call(self, features): + B, H, W, _ = features[0].shape + outs = [] + + for feature, layer in zip(features, self.linear_layers): + feature = layer(feature) + feature = keras.image.resize( + feature, size=(H, W), method="bilinear" + ) + outs.append(feature) + + seg = self.linear_fuse(keras.ops.concat(outs[::-1], axis=3)) + seg = self.dropout(seg) + seg = self.seg_out(seg) + + return seg From cb1c702b9c124fb5fda6a91706f5cc2f2d1c1706 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 17 Jul 2023 12:35:32 +0200 Subject: [PATCH 05/53] cleanup --- .../models/backbones/mix_transformer/__init__.py | 13 +++++++++++++ .../mix_transformer/mix_transformer_backbone.py | 1 - keras_cv/models/segmentation/segformer/segformer.py | 13 +++++++------ 3 files changed, 20 insertions(+), 7 deletions(-) create mode 100644 keras_cv/models/backbones/mix_transformer/__init__.py diff --git a/keras_cv/models/backbones/mix_transformer/__init__.py b/keras_cv/models/backbones/mix_transformer/__init__.py new file mode 100644 index 0000000000..3992ffb59a --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/__init__.py @@ -0,0 +1,13 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
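At this point in the series, the task model wires a MiT backbone into the `SegFormerHead` defined above: each backbone feature map is projected to a shared `embed_dim`, upsampled to the resolution of the earliest (highest-resolution) stage, concatenated, fused with a 1x1 convolution, and finally resized back to the input resolution. A minimal usage sketch of that constructor follows; the preset name and the `from_preset` call are assumptions (the backbone presets are only wired up in later patches), while the argument names match the diff above.

```python
import numpy as np
from keras_cv.models import MiTBackbone, SegFormer

# Assumed preset name; MiT backbone presets are registered later in this series.
backbone = MiTBackbone.from_preset("MiT_B0", input_shape=(224, 224, 3))

# `num_classes` and `embed_dim` follow the SegFormer constructor in the diff above.
model = SegFormer(backbone=backbone, num_classes=19, embed_dim=256)

images = np.random.uniform(size=(1, 224, 224, 3)).astype("float32")
logits = model.predict(images)  # (1, 224, 224, 19) per-pixel class scores
```

Since no softmax is applied here, the output is raw per-pixel logits, suitable for a loss configured with `from_logits=True`.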
diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 57c99f127a..da16be7b98 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -131,7 +131,6 @@ def get_config(self): "output_channels": self.output_channels, "classes": self.classes, "include_top": self.include_top, - "as_backbone": self.as_backbone, "pooling": self.pooling, } ) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 223d37179b..1be42001ad 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -12,7 +12,6 @@ def __init__( embed_dim=None, input_shape=None, input_tensor=None, - softmax_output=None, **kwargs, ): inputs = utils.parse_model_inputs(input_shape, input_tensor) @@ -27,10 +26,6 @@ def __init__( output = keras.layers.Resizing( height=x.shape[1], width=x.shape[2], interpolation="bilinear" )(y) - if softmax_output: - output = keras.layers.Activation( - "softmax", name="output_activation" - )(output) super().__init__( inputs=inputs, @@ -40,7 +35,13 @@ def __init__( self.num_classes = num_classes self.embed_dim = embed_dim - self.softmax_output = softmax_output + + def get_config(self): + return { + "num_classes": self.num_classes, + "backbone": self.backbone, + "embed_dim": self.embed_dim, + } class SegFormerHead(keras.layers.Layer): From 22f8fdf614d3cc1ce69802bda2e886d7986d079a Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 17 Jul 2023 12:51:43 +0200 Subject: [PATCH 06/53] remove tf call --- keras_cv/layers/efficient_multihead_attention.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index 718a5b216b..7ce82d9a49 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -6,7 +6,7 @@ @keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(tf.keras.layers.Layer): +class EfficientMultiheadAttention(keras.layers.Layer): def __init__(self, project_dim, num_heads, sr_ratio): super().__init__() self.num_heads = num_heads From 5c9803a43b11a8ffe4402b1f74614c138d8ad673 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 17 Jul 2023 13:24:10 +0200 Subject: [PATCH 07/53] remove tf --- .../backbones/mix_transformer/mix_transformer_backbone.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index da16be7b98..65b5a983e7 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -29,6 +29,7 @@ backbone_presets_with_weights, ) from keras_cv.utils.python_utils import classproperty +import numpy as np @keras.saving.register_keras_serializable(package="keras_cv.models") @@ -57,7 +58,7 @@ def __init__( ) drop_path_rate = 0.1 - dpr = [x.numpy() for x in tf.linspace(0.0, drop_path_rate, sum(depths))] + dpr = [x for x in np.linspace(0.0, drop_path_rate, sum(depths))] blockwise_num_heads = [1, 2, 5, 8] blockwise_sr_ratios = [8, 4, 2, 1] num_stages = 4 From 314dc6b946977b60955a369a6de582498864d1c9 Mon Sep 
17 00:00:00 2001 From: DavidLandup0 Date: Mon, 17 Jul 2023 14:33:19 +0200 Subject: [PATCH 08/53] migrating to more keras ops --- keras_cv/layers/__init__.py | 3 ++ .../layers/efficient_multihead_attention.py | 42 ++++++++-------- .../hierarchical_transformer_encoder.py | 48 ++++++++++--------- .../layers/overlapping_patching_embedding.py | 2 +- .../mix_transformer_backbone.py | 4 +- 5 files changed, 54 insertions(+), 45 deletions(-) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index 1ccfe46c52..4a5e236140 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -21,6 +21,9 @@ ) from keras_cv.layers.feature_pyramid import FeaturePyramid from keras_cv.layers.fusedmbconv import FusedMBConvBlock +from keras_cv.layers.hierarchical_transformer_encoder import ( + HierarchicalTransformerEncoder, +) from keras_cv.layers.mbconv import MBConvBlock from keras_cv.layers.object_detection.anchor_generator import AnchorGenerator from keras_cv.layers.object_detection.box_matcher import BoxMatcher diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index 7ce82d9a49..e465ab08d5 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -30,7 +30,8 @@ def call(self, x, H, W): input_shape = x.shape q = self.q(x) - q = q.reshape( + q = keras.ops.reshape.reshape( + q, ( input_shape[0], input_shape[1], @@ -42,42 +43,45 @@ def call(self, x, H, W): q = q.transpose([0, 2, 1, 3]) if self.sr_ratio > 1: - x = x.transpose(x, [0, 2, 1]).reshape( - (input_shape[0], H, W, input_shape[2]) + x = keras.ops.reshape( + keras.ops.transpose(x, [0, 2, 1]), + (input_shape[0], H, W, input_shape[2]), ) x = self.sr(x) - x = x.reshape([input_shape[0], input_shape[2], -1]) - x = x.transpose([0, 2, 1]) + x = keras.ops.reshape(x, [input_shape[0], input_shape[2], -1]) + x = keras.ops.transpose(x, [0, 2, 1]) x = self.norm(x) k = self.k(x) v = self.v(x) - k = k.transpose([0, 2, 1, 3]).reshape( + k = keras.ops.reshape( + keras.ops.transpose(k, [0, 2, 1, 3]), [ input_shape[0], -1, self.num_heads, input_shape[2] // self.num_heads, - ] + ], ) - v = ( - v.transpose([0, 2, 1, 3]).reshape( - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ] - ), + + v = keras.ops.reshape( + keras.ops.transpose(v, [0, 2, 1, 3]), + [ + input_shape[0], + -1, + self.num_heads, + input_shape[2] // self.num_heads, + ], ) - attn = (q @ x.transpose([0, 1, 3, 2])) * self.scale + attn = (q @ keras.ops.transpose(x, [0, 1, 3, 2])) * self.scale attn = keras.nn.ops.softmax(attn, axis=-1) attn = attn @ v - attn = attn.transpose(attn, [0, 2, 1, 3]).reshape( - [input_shape[0], input_shape[1], input_shape[2]] + attn = keras.ops.reshape( + keras.ops.transpose(attn, [0, 2, 1, 3]), + [input_shape[0], input_shape[1], input_shape[2]], ) x = self.proj(attn) return x diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index 73b00647b0..5d2fb43667 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -1,6 +1,8 @@ from keras_cv.backend import keras -from keras_cv.layers import EfficientMultiheadAttention -from keras_cv.layers import StochasticDepth +from keras_cv.layers.efficient_multihead_attention import ( + EfficientMultiheadAttention, +) +from keras_cv.layers.regularization.stochastic_depth import StochasticDepth 
@keras.saving.register_keras_serializable(package="keras_cv") @@ -21,7 +23,7 @@ def __init__( ) self.drop_path = StochasticDepth(drop_prob) self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.mlp = __MixFFN( + self.mlp = self.__MixFFN( channels=project_dim, mid_channels=int(project_dim * 4), ) @@ -31,25 +33,25 @@ def call(self, x, H, W): x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) return x + class __MixFFN(keras.layers.Layer): + def __init__(self, channels, mid_channels): + super().__init__() + self.fc1 = keras.layers.Dense(mid_channels) + self.dwconv = keras.layers.DepthwiseConv2D( + kernel_size=3, + strides=1, + padding="same", + ) + self.fc2 = keras.layers.Dense(channels) -class __MixFFN(keras.layers.Layer): - def __init__(self, channels, mid_channels): - super().__init__() - self.fc1 = keras.layers.Dense(mid_channels) - self.dwconv = keras.layers.DepthwiseConv2D( - kernel_size=3, - strides=1, - padding="same", - ) - self.fc2 = keras.layers.Dense(channels) + def call(self, x, H, W): + x = self.fc1(x) + # B, DIM, C + input_shape = x.shape - def call(self, x, H, W): - x = self.fc1(x) - # B, DIM, C - input_shape = x.shape - x = x.reshape((input_shape[0], H, W, input_shape[-1])) - x = self.dwconv(x) - x = x.reshape((input_shape[0], -1, input_shape[-1])) - x = keras.nn.ops.gelu(x) - x = self.fc2(x) - return x + x = keras.ops.reshape(x, (input_shape[0], H, W, input_shape[-1])) + x = self.dwconv(x) + x = keras.ops.reshape(x, (input_shape[0], -1, input_shape[-1])) + x = keras.nn.ops.gelu(x) + x = self.fc2(x) + return x diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index 3e3e5daaef..d108c3bc50 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -17,6 +17,6 @@ def call(self, x): x = self.proj(x) # B, H, W, C shape = x.shape - x = x.reshape((-1, shape[1] * shape[2], shape[3])) + x = keras.ops.reshape(x, (-1, shape[1] * shape[2], shape[3])) x = self.norm(x) return x, shape[1], shape[2] diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 65b5a983e7..f3071c6d59 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -18,6 +18,8 @@ """ # noqa: E501 +import numpy as np + from keras_cv import layers as cv_layers from keras_cv.backend import keras from keras_cv.models import utils @@ -29,7 +31,6 @@ backbone_presets_with_weights, ) from keras_cv.utils.python_utils import classproperty -import numpy as np @keras.saving.register_keras_serializable(package="keras_cv.models") @@ -70,7 +71,6 @@ def __init__( for i in range(num_stages): patch_embed_layer = cv_layers.OverlappingPatchingAndEmbedding( - in_channels=input_shape[-1] if i == 0 else embed_dims[i - 1], out_channels=embed_dims[0] if i == 0 else embed_dims[i], patch_size=7 if i == 0 else 3, stride=4 if i == 0 else 2, From 7a0151b7fb12ff4475df39403c7bd1e255e6dfc0 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sun, 23 Jul 2023 21:58:54 +0200 Subject: [PATCH 09/53] cleanups and fixes --- .../layers/efficient_multihead_attention.py | 52 ++++++++++--------- .../hierarchical_transformer_encoder.py | 39 +++++++++----- .../layers/overlapping_patching_embedding.py | 2 +- .../mix_transformer_backbone.py | 38 +++++--------- 4 files changed, 66 insertions(+), 65 deletions(-) 
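Before the cleanup hunks below, it helps to make the shape bookkeeping explicit: the attention layer only receives a flattened `(batch, seq_len, channels)` sequence, so the spatial size is recovered as `H = W = sqrt(seq_len)`, which silently assumes a square feature map, and the `sr_ratio` reduction shrinks the key/value sequence by roughly `sr_ratio**2` before attention is computed. A self-contained sketch of that reduction using plain `tf.keras` (an illustration of the idea, not the exact KerasCV code):

```python
import numpy as np
from tensorflow import keras  # plain tf.keras is enough for this standalone sketch

batch, seq_len, channels, sr_ratio = 2, 196, 64, 4
height = width = int(np.sqrt(seq_len))  # assumes the tokens came from a square 14x14 map

tokens = np.random.uniform(size=(batch, seq_len, channels)).astype("float32")

# Fold the tokens back into a spatial map, shrink it with a strided conv,
# and flatten again: keys/values now cover roughly seq_len / sr_ratio**2 tokens.
spatial = tokens.reshape(batch, height, width, channels)
reduce = keras.layers.Conv2D(filters=channels, kernel_size=sr_ratio, strides=sr_ratio)
reduced = reduce(spatial)                                    # (2, 3, 3, 64)
kv_tokens = np.asarray(reduced).reshape(batch, -1, channels)

print(tokens.shape, kv_tokens.shape)  # (2, 196, 64) (2, 9, 64)
```

With the blockwise `sr_ratio` values of 8, 4, 2 and 1 used across the four stages, the quadratic attention cost stays manageable at the high-resolution early stages where the sequence is longest.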
diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index e465ab08d5..11edec72f2 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -1,3 +1,5 @@ +import math + from keras_cv.backend import keras """ @@ -26,11 +28,18 @@ def __init__(self, project_dim, num_heads, sr_ratio): ) self.norm = keras.layers.LayerNormalization() - def call(self, x, H, W): - input_shape = x.shape + def call(self, x): + input_shape = keras.ops.shape(x) + H, W = keras.ops.sqrt( + keras.ops.cast(input_shape[1], "float32") + ), keras.ops.sqrt(keras.ops.cast(input_shape[1], "float32")) + B, C = keras.ops.cast(input_shape[0], "float32"), keras.ops.cast( + input_shape[2], "float32" + ) q = self.q(x) - q = keras.ops.reshape.reshape( + + q = keras.ops.reshape( q, ( input_shape[0], @@ -39,13 +48,12 @@ def call(self, x, H, W): input_shape[2] // self.num_heads, ), ) - - q = q.transpose([0, 2, 1, 3]) + q = keras.ops.transpose(q, [0, 2, 1, 3]) if self.sr_ratio > 1: x = keras.ops.reshape( keras.ops.transpose(x, [0, 2, 1]), - (input_shape[0], H, W, input_shape[2]), + (B, H, W, C), ) x = self.sr(x) x = keras.ops.reshape(x, [input_shape[0], input_shape[2], -1]) @@ -55,28 +63,24 @@ def call(self, x, H, W): k = self.k(x) v = self.v(x) - k = keras.ops.reshape( - keras.ops.transpose(k, [0, 2, 1, 3]), - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], + k = keras.ops.transpose( + keras.ops.reshape( + k, + [B, -1, self.num_heads, C // self.num_heads], + ), + [0, 2, 1, 3], ) - v = keras.ops.reshape( - keras.ops.transpose(v, [0, 2, 1, 3]), - [ - input_shape[0], - -1, - self.num_heads, - input_shape[2] // self.num_heads, - ], + v = keras.ops.transpose( + keras.ops.reshape( + v, + [B, -1, self.num_heads, C // self.num_heads], + ), + [0, 2, 1, 3], ) - attn = (q @ keras.ops.transpose(x, [0, 1, 3, 2])) * self.scale - attn = keras.nn.ops.softmax(attn, axis=-1) + attn = (q @ keras.ops.transpose(k, [0, 1, 3, 2])) * self.scale + attn = keras.ops.nn.softmax(attn, axis=-1) attn = attn @ v attn = keras.ops.reshape( diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index 5d2fb43667..bb83c2cefa 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -1,8 +1,10 @@ +import math + from keras_cv.backend import keras from keras_cv.layers.efficient_multihead_attention import ( EfficientMultiheadAttention, ) -from keras_cv.layers.regularization.stochastic_depth import StochasticDepth +from keras_cv.layers.regularization.drop_path import DropPath @keras.saving.register_keras_serializable(package="keras_cv") @@ -21,19 +23,23 @@ def __init__( self.attn = EfficientMultiheadAttention( project_dim, num_heads, sr_ratio ) - self.drop_path = StochasticDepth(drop_prob) + self.drop_path = DropPath(drop_prob) self.norm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.mlp = self.__MixFFN( + self.mlp = self.MixFFN( channels=project_dim, mid_channels=int(project_dim * 4), ) - def call(self, x, H, W): - x = x + self.drop_path(self.attn(self.norm1(x), H, W)) - x = x + self.drop_path(self.mlp(self.norm2(x), H, W)) + def build(self, input_shape): + self.H = keras.ops.sqrt(keras.ops.cast(input_shape[1], "float32")) + self.W = keras.ops.sqrt(keras.ops.cast(input_shape[2], "float32")) + + def call(self, x): + x = x + self.drop_path(self.attn(self.norm1(x))) + x = x 
+ self.drop_path(self.mlp(self.norm2(x))) return x - class __MixFFN(keras.layers.Layer): + class MixFFN(keras.layers.Layer): def __init__(self, channels, mid_channels): super().__init__() self.fc1 = keras.layers.Dense(mid_channels) @@ -44,14 +50,19 @@ def __init__(self, channels, mid_channels): ) self.fc2 = keras.layers.Dense(channels) - def call(self, x, H, W): + def call(self, x): x = self.fc1(x) - # B, DIM, C - input_shape = x.shape - - x = keras.ops.reshape(x, (input_shape[0], H, W, input_shape[-1])) + shape = keras.ops.shape(x) + B, C = keras.ops.cast(shape[0], "float32"), keras.ops.cast( + shape[-1], "float32" + ) + H, W = keras.ops.sqrt( + keras.ops.cast(shape[1], "float32") + ), keras.ops.sqrt(keras.ops.cast(shape[1], "float32")) + # print(B, C, H, W) + x = keras.ops.reshape(x, (B, H, W, C)) x = self.dwconv(x) - x = keras.ops.reshape(x, (input_shape[0], -1, input_shape[-1])) - x = keras.nn.ops.gelu(x) + x = keras.ops.reshape(x, (B, -1, C)) + x = keras.ops.nn.gelu(x) x = self.fc2(x) return x diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index d108c3bc50..463507e109 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -19,4 +19,4 @@ def call(self, x): shape = x.shape x = keras.ops.reshape(x, (-1, shape[1] * shape[2], shape[3])) x = self.norm(x) - return x, shape[1], shape[2] + return x diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index f3071c6d59..389d728ead 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -39,25 +39,10 @@ def __init__( self, input_shape=None, input_tensor=None, - classes=None, - include_top=None, embed_dims=None, depths=None, - pooling=None, **kwargs, ): - if include_top and not classes: - raise ValueError( - "If `include_top` is True, you should specify `classes`. " - f"Received: classes={classes}" - ) - - if include_top and pooling: - raise ValueError( - f"`pooling` must be `None` when `include_top=True`." - f"Received pooling={pooling} and include_top={include_top}. 
" - ) - drop_path_rate = 0.1 dpr = [x for x in np.linspace(0.0, drop_path_rate, sum(depths))] blockwise_num_heads = [1, 2, 5, 8] @@ -95,15 +80,23 @@ def __init__( inputs = utils.parse_model_inputs(input_shape, input_tensor) x = inputs - batch_size = x.shape[0] + batch_size = keras.ops.shape(x)[0] pyramid_level_inputs = [] for i in range(num_stages): - x, H, W = patch_embedding_layers[i](x) + # Compute new height/width after the `proj` + # call in `OverlappingPatchingAndEmbedding` + stride = 4 if i == 0 else 2 + new_height, new_width = ( + int(keras.ops.shape(x)[1] / stride), + int(keras.ops.shape(x)[2] / stride), + ) + + x = patch_embedding_layers[i](x) for blk in transformer_blocks[i]: - x = blk(x, H, W) + x = blk(x) x = layer_norms[i](x) C = x.shape[-1] - x = x.reshape((batch_size, H, W, C)) + x = keras.ops.reshape(x, (batch_size, new_height, new_width, C)) pyramid_level_inputs.append(x) super().__init__( @@ -115,11 +108,7 @@ def __init__( self.channels = embed_dims self.num_stages = num_stages self.output_channels = embed_dims - self.classes = classes - self.include_top = include_top self.pyramid_level_inputs = pyramid_level_inputs - self.pooling = pooling - self.patch_embedding_layers = [] self.transformer_blocks = [] @@ -130,9 +119,6 @@ def get_config(self): "channels": self.channels, "num_stages": self.num_stages, "output_channels": self.output_channels, - "classes": self.classes, - "include_top": self.include_top, - "pooling": self.pooling, } ) return config From 44f01affac4489444e7d2433758816984b7eebc8 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sun, 23 Jul 2023 22:36:44 +0200 Subject: [PATCH 10/53] fix reshaping --- .../hierarchical_transformer_encoder.py | 4 ---- .../mix_transformer_backbone.py | 21 ++++++++++++++++++- 2 files changed, 20 insertions(+), 5 deletions(-) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index bb83c2cefa..e752886c71 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -30,10 +30,6 @@ def __init__( mid_channels=int(project_dim * 4), ) - def build(self, input_shape): - self.H = keras.ops.sqrt(keras.ops.cast(input_shape[1], "float32")) - self.W = keras.ops.sqrt(keras.ops.cast(input_shape[2], "float32")) - def call(self, x): x = x + self.drop_path(self.attn(self.norm1(x))) x = x + self.drop_path(self.mlp(self.norm2(x))) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 389d728ead..4dcf9487a3 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -18,6 +18,8 @@ """ # noqa: E501 +import math + import numpy as np from keras_cv import layers as cv_layers @@ -32,6 +34,8 @@ ) from keras_cv.utils.python_utils import classproperty +# from keras import KerasTensor + @keras.saving.register_keras_serializable(package="keras_cv.models") class MiTBackbone(Backbone): @@ -96,7 +100,7 @@ def __init__( x = blk(x) x = layer_norms[i](x) C = x.shape[-1] - x = keras.ops.reshape(x, (batch_size, new_height, new_width, C)) + x = CustomReshaping(new_height, new_width)(x) pyramid_level_inputs.append(x) super().__init__( @@ -122,3 +126,18 @@ def get_config(self): } ) return config + + +@keras.saving.register_keras_serializable(package="keras_cv") +class CustomReshaping(keras.layers.Layer): + def __init__(self, H, W): + 
super().__init__() + self.H = H + self.W = W + + def call(self, x): + input_shape = keras.ops.shape(x) + x = keras.ops.reshape( + x, (input_shape[0], self.H, self.W, input_shape[-1]) + ) + return x From eb5b5ae874c18bd29e43c0ff528099239d42e5c1 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sun, 23 Jul 2023 22:42:18 +0200 Subject: [PATCH 11/53] comments --- .../layers/efficient_multihead_attention.py | 6 +++++- .../hierarchical_transformer_encoder.py | 4 ++++ .../mix_transformer_backbone.py | 21 ++++++++++++------- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index 11edec72f2..a3e7e2d3b7 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -3,7 +3,11 @@ from keras_cv.backend import keras """ -Based on: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py +Acknowledgement: + +This implementation is based on the PyTorch implementations from: + - NVlabs' official implementation: https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py + - @sithu31296's reimplementation: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py """ diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index e752886c71..bb83c2cefa 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -30,6 +30,10 @@ def __init__( mid_channels=int(project_dim * 4), ) + def build(self, input_shape): + self.H = keras.ops.sqrt(keras.ops.cast(input_shape[1], "float32")) + self.W = keras.ops.sqrt(keras.ops.cast(input_shape[2], "float32")) + def call(self, x): x = x + self.drop_path(self.attn(self.norm1(x))) x = x + self.drop_path(self.mlp(self.norm2(x))) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 4dcf9487a3..57d7cb09d9 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -18,7 +18,7 @@ """ # noqa: E501 -import math +import copy import numpy as np @@ -34,8 +34,6 @@ ) from keras_cv.utils.python_utils import classproperty -# from keras import KerasTensor - @keras.saving.register_keras_serializable(package="keras_cv.models") class MiTBackbone(Backbone): @@ -103,11 +101,7 @@ def __init__( x = CustomReshaping(new_height, new_width)(x) pyramid_level_inputs.append(x) - super().__init__( - inputs=inputs, - outputs=x, - **kwargs, - ) + super().__init__(inputs=inputs, outputs=x, **kwargs) self.channels = embed_dims self.num_stages = num_stages @@ -141,3 +135,14 @@ def call(self, x): x, (input_shape[0], self.H, self.W, input_shape[-1]) ) return x + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy(backbone_presets) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy(backbone_presets_with_weights) From ea0239ff75c08eb77bceb683b7fa26ffa19c1ab3 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sun, 23 Jul 2023 23:09:05 +0200 Subject: [PATCH 12/53] from presets api, keras.ops -> ops --- .../layers/efficient_multihead_attention.py | 41 ++- 
.../hierarchical_transformer_encoder.py | 21 +- .../layers/overlapping_patching_embedding.py | 3 +- .../mix_transformer_aliases.py | 257 ++++++++++++++++++ .../mix_transformer_backbone.py | 48 ++-- .../mix_transformer_backbone_presets.py | 30 +- 6 files changed, 338 insertions(+), 62 deletions(-) create mode 100644 keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index a3e7e2d3b7..2c832ff40f 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -1,6 +1,5 @@ -import math - from keras_cv.backend import keras +from keras_cv.backend import ops """ Acknowledgement: @@ -33,17 +32,17 @@ def __init__(self, project_dim, num_heads, sr_ratio): self.norm = keras.layers.LayerNormalization() def call(self, x): - input_shape = keras.ops.shape(x) - H, W = keras.ops.sqrt( - keras.ops.cast(input_shape[1], "float32") - ), keras.ops.sqrt(keras.ops.cast(input_shape[1], "float32")) - B, C = keras.ops.cast(input_shape[0], "float32"), keras.ops.cast( + input_shape = ops.shape(x) + H, W = ops.sqrt(ops.cast(input_shape[1], "float32")), ops.sqrt( + ops.cast(input_shape[1], "float32") + ) + B, C = ops.cast(input_shape[0], "float32"), ops.cast( input_shape[2], "float32" ) q = self.q(x) - q = keras.ops.reshape( + q = ops.reshape( q, ( input_shape[0], @@ -52,43 +51,43 @@ def call(self, x): input_shape[2] // self.num_heads, ), ) - q = keras.ops.transpose(q, [0, 2, 1, 3]) + q = ops.transpose(q, [0, 2, 1, 3]) if self.sr_ratio > 1: - x = keras.ops.reshape( - keras.ops.transpose(x, [0, 2, 1]), + x = ops.reshape( + ops.transpose(x, [0, 2, 1]), (B, H, W, C), ) x = self.sr(x) - x = keras.ops.reshape(x, [input_shape[0], input_shape[2], -1]) - x = keras.ops.transpose(x, [0, 2, 1]) + x = ops.reshape(x, [input_shape[0], input_shape[2], -1]) + x = ops.transpose(x, [0, 2, 1]) x = self.norm(x) k = self.k(x) v = self.v(x) - k = keras.ops.transpose( - keras.ops.reshape( + k = ops.transpose( + ops.reshape( k, [B, -1, self.num_heads, C // self.num_heads], ), [0, 2, 1, 3], ) - v = keras.ops.transpose( - keras.ops.reshape( + v = ops.transpose( + ops.reshape( v, [B, -1, self.num_heads, C // self.num_heads], ), [0, 2, 1, 3], ) - attn = (q @ keras.ops.transpose(k, [0, 1, 3, 2])) * self.scale - attn = keras.ops.nn.softmax(attn, axis=-1) + attn = (q @ ops.transpose(k, [0, 1, 3, 2])) * self.scale + attn = ops.nn.softmax(attn, axis=-1) attn = attn @ v - attn = keras.ops.reshape( - keras.ops.transpose(attn, [0, 2, 1, 3]), + attn = ops.reshape( + ops.transpose(attn, [0, 2, 1, 3]), [input_shape[0], input_shape[1], input_shape[2]], ) x = self.proj(attn) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index bb83c2cefa..ad22c4261e 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -1,6 +1,7 @@ import math from keras_cv.backend import keras +from keras_cv.backend import ops from keras_cv.layers.efficient_multihead_attention import ( EfficientMultiheadAttention, ) @@ -31,8 +32,8 @@ def __init__( ) def build(self, input_shape): - self.H = keras.ops.sqrt(keras.ops.cast(input_shape[1], "float32")) - self.W = keras.ops.sqrt(keras.ops.cast(input_shape[2], "float32")) + self.H = ops.sqrt(ops.cast(input_shape[1], "float32")) + self.W = ops.sqrt(ops.cast(input_shape[2], "float32")) def call(self, x): x = x + 
self.drop_path(self.attn(self.norm1(x))) @@ -52,17 +53,15 @@ def __init__(self, channels, mid_channels): def call(self, x): x = self.fc1(x) - shape = keras.ops.shape(x) - B, C = keras.ops.cast(shape[0], "float32"), keras.ops.cast( - shape[-1], "float32" + shape = ops.shape(x) + B, C = ops.cast(shape[0], "float32"), ops.cast(shape[-1], "float32") + H, W = ops.sqrt(ops.cast(shape[1], "float32")), ops.sqrt( + ops.cast(shape[1], "float32") ) - H, W = keras.ops.sqrt( - keras.ops.cast(shape[1], "float32") - ), keras.ops.sqrt(keras.ops.cast(shape[1], "float32")) # print(B, C, H, W) - x = keras.ops.reshape(x, (B, H, W, C)) + x = ops.reshape(x, (B, H, W, C)) x = self.dwconv(x) - x = keras.ops.reshape(x, (B, -1, C)) - x = keras.ops.nn.gelu(x) + x = ops.reshape(x, (B, -1, C)) + x = ops.nn.gelu(x) x = self.fc2(x) return x diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index 463507e109..f9f48416b5 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -1,4 +1,5 @@ from keras_cv.backend import keras +from keras_cv.backend import ops @keras.saving.register_keras_serializable(package="keras_cv") @@ -17,6 +18,6 @@ def call(self, x): x = self.proj(x) # B, H, W, C shape = x.shape - x = keras.ops.reshape(x, (-1, shape[1] * shape[2], shape[3])) + x = ops.reshape(x, (-1, shape[1] * shape[2], shape[3])) x = self.norm(x) return x diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py new file mode 100644 index 0000000000..bd326ffa0a --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py @@ -0,0 +1,257 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( + MiTBackbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 + backbone_presets, +) +from keras_cv.utils.python_utils import classproperty + +ALIAS_DOCSTRING = """MiT model. + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning](https://keras.io/guides/transfer_learning/). + + Args: + include_rescaling: bool, whether to rescale the inputs. If set to + True, inputs will be passed through a `Rescaling(scale=1 / 255)` + layer. Defaults to True. + input_shape: optional shape tuple, defaults to (None, None, 3). + input_tensor: optional Keras tensor (i.e., output of `layers.Input()`) + to use as image input for the model. 
+ + Examples: + ```python + input_data = tf.ones(shape=(8, 224, 224, 3)) + + # Randomly initialized backbone + model = {name}Backbone() + output = model(input_data) + ``` +""" # noqa: E501 + + +class MiTB0Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(None, None, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("MiT_B0", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB1Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(None, None, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("MiT_B1", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB2Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(None, None, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("MiT_B2", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB3Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(None, None, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("MiT_B3", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB4Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(None, None, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("MiT_B4", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +class MiTB5Backbone(MiTBackbone): + def __new__( + cls, + include_rescaling=True, + input_shape=(None, None, 3), + input_tensor=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "include_rescaling": include_rescaling, + "input_shape": input_shape, + "input_tensor": input_tensor, + } + ) + return MiTBackbone.from_preset("MiT_B5", **kwargs) + + @classproperty + def presets(cls): + 
"""Dictionary of preset names and configurations.""" + return {} + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations.""" + return {} + + +setattr( + MiTB0Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB0"), +) + +setattr( + MiTB1Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB1"), +) + +setattr( + MiTB2Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB2"), +) + +setattr( + MiTB3Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB3"), +) + +setattr( + MiTB4Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB4"), +) + +setattr( + MiTB5Backbone, + "__doc__", + ALIAS_DOCSTRING.format(name="MiTB5"), +) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 57d7cb09d9..f65fb79be1 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -24,6 +24,7 @@ from keras_cv import layers as cv_layers from keras_cv.backend import keras +from keras_cv.backend import ops from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 @@ -39,7 +40,8 @@ class MiTBackbone(Backbone): def __init__( self, - input_shape=None, + include_rescaling, + input_shape=(None, None, 3), input_tensor=None, embed_dims=None, depths=None, @@ -82,15 +84,17 @@ def __init__( inputs = utils.parse_model_inputs(input_shape, input_tensor) x = inputs - batch_size = keras.ops.shape(x)[0] + if include_rescaling: + x = keras.layers.Rescaling(scale=1 / 255)(x) + pyramid_level_inputs = [] for i in range(num_stages): # Compute new height/width after the `proj` # call in `OverlappingPatchingAndEmbedding` stride = 4 if i == 0 else 2 new_height, new_width = ( - int(keras.ops.shape(x)[1] / stride), - int(keras.ops.shape(x)[2] / stride), + int(ops.shape(x)[1] / stride), + int(ops.shape(x)[2] / stride), ) x = patch_embedding_layers[i](x) @@ -103,12 +107,11 @@ def __init__( super().__init__(inputs=inputs, outputs=x, **kwargs) - self.channels = embed_dims self.num_stages = num_stages self.output_channels = embed_dims - self.pyramid_level_inputs = pyramid_level_inputs - self.patch_embedding_layers = [] - self.transformer_blocks = [] + self.pyramid_level_inputs = { + f"P{i + 1}": name for i, name in enumerate(pyramid_level_inputs) + } def get_config(self): config = super().get_config() @@ -117,25 +120,11 @@ def get_config(self): "channels": self.channels, "num_stages": self.num_stages, "output_channels": self.output_channels, + "pyramid_level_inputs": self.pyramid_level_inputs, } ) return config - -@keras.saving.register_keras_serializable(package="keras_cv") -class CustomReshaping(keras.layers.Layer): - def __init__(self, H, W): - super().__init__() - self.H = H - self.W = W - - def call(self, x): - input_shape = keras.ops.shape(x) - x = keras.ops.reshape( - x, (input_shape[0], self.H, self.W, input_shape[-1]) - ) - return x - @classproperty def presets(cls): """Dictionary of preset names and configurations.""" @@ -146,3 +135,16 @@ def presets_with_weights(cls): """Dictionary of preset names and configurations that include weights.""" return copy.deepcopy(backbone_presets_with_weights) + + +@keras.saving.register_keras_serializable(package="keras_cv") +class CustomReshaping(keras.layers.Layer): + def __init__(self, H, W): + 
super().__init__() + self.H = H + self.W = W + + def call(self, x): + input_shape = ops.shape(x) + x = ops.reshape(x, (input_shape[0], self.H, self.W, input_shape[-1])) + return x diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py index b890282a91..62f3a65d5f 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py @@ -25,8 +25,11 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embedding_dims": [32, 64, 160, 256], + "embed_dims": [32, 64, 160, 256], "depths": [2, 2, 2, 2], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, }, }, "MiT_B1": { @@ -40,8 +43,11 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embedding_dims": [64, 128, 320, 512], + "embed_dims": [64, 128, 320, 512], "depths": [2, 2, 2, 2], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, }, }, "MiT_B2": { @@ -55,8 +61,11 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embedding_dims": [64, 128, 320, 512], + "embed_dims": [64, 128, 320, 512], "depths": [3, 4, 6, 3], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, }, }, "MiT_B3": { @@ -70,8 +79,11 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embedding_dims": [64, 128, 320, 512], + "embed_dims": [64, 128, 320, 512], "depths": [3, 4, 18, 3], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, }, }, "MiT_B4": { @@ -85,8 +97,11 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embedding_dims": [64, 128, 320, 512], + "embed_dims": [64, 128, 320, 512], "depths": [3, 8, 27, 3], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, }, }, "MiT_B5": { @@ -100,8 +115,11 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embedding_dims": [64, 128, 320, 512], + "embed_dims": [64, 128, 320, 512], "depths": [3, 6, 40, 3], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, }, }, } From b6128a523411e2204c3a9b20ba5fb96ddb78cc84 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sun, 23 Jul 2023 23:09:56 +0200 Subject: [PATCH 13/53] embed_dims -> embedding_dims --- .../mix_transformer/mix_transformer_backbone.py | 8 ++++---- .../mix_transformer_backbone_presets.py | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index f65fb79be1..a49bd4e2e2 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -43,7 +43,7 @@ def __init__( include_rescaling, input_shape=(None, None, 3), input_tensor=None, - embed_dims=None, + embedding_dims=None, depths=None, **kwargs, ): @@ -60,7 +60,7 @@ def __init__( for i in range(num_stages): patch_embed_layer = cv_layers.OverlappingPatchingAndEmbedding( - out_channels=embed_dims[0] if i == 0 else embed_dims[i], + out_channels=embedding_dims[0] if i == 0 else embedding_dims[i], patch_size=7 if i == 0 else 3, stride=4 if i == 0 else 2, name=f"patch_and_embed_{i}", @@ -69,7 +69,7 @@ def __init__( transformer_block = [ cv_layers.HierarchicalTransformerEncoder( - 
project_dim=embed_dims[i], + project_dim=embedding_dims[i], num_heads=blockwise_num_heads[i], sr_ratio=blockwise_sr_ratios[i], drop_prob=dpr[cur + k], @@ -108,7 +108,7 @@ def __init__( super().__init__(inputs=inputs, outputs=x, **kwargs) self.num_stages = num_stages - self.output_channels = embed_dims + self.output_channels = embedding_dims self.pyramid_level_inputs = { f"P{i + 1}": name for i, name in enumerate(pyramid_level_inputs) } diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py index 62f3a65d5f..ca94d03472 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py @@ -25,7 +25,7 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embed_dims": [32, 64, 160, 256], + "embedding_dims": [32, 64, 160, 256], "depths": [2, 2, 2, 2], "include_rescaling": True, "input_shape": (224, 224, 3), @@ -43,7 +43,7 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embed_dims": [64, 128, 320, 512], + "embedding_dims": [64, 128, 320, 512], "depths": [2, 2, 2, 2], "include_rescaling": True, "input_shape": (224, 224, 3), @@ -61,7 +61,7 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embed_dims": [64, 128, 320, 512], + "embedding_dims": [64, 128, 320, 512], "depths": [3, 4, 6, 3], "include_rescaling": True, "input_shape": (224, 224, 3), @@ -79,7 +79,7 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embed_dims": [64, 128, 320, 512], + "embedding_dims": [64, 128, 320, 512], "depths": [3, 4, 18, 3], "include_rescaling": True, "input_shape": (224, 224, 3), @@ -97,7 +97,7 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embed_dims": [64, 128, 320, 512], + "embedding_dims": [64, 128, 320, 512], "depths": [3, 8, 27, 3], "include_rescaling": True, "input_shape": (224, 224, 3), @@ -115,7 +115,7 @@ }, "class_name": "keras_cv.models>MiTBackbone", "config": { - "embed_dims": [64, 128, 320, 512], + "embedding_dims": [64, 128, 320, 512], "depths": [3, 6, 40, 3], "include_rescaling": True, "input_shape": (224, 224, 3), From 83221098385592c56a72071b7c5ce6e792b93e0d Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 24 Jul 2023 16:56:22 +0200 Subject: [PATCH 14/53] addressing some PR comments --- .../hierarchical_transformer_encoder.py | 2 +- .../mix_transformer_backbone.py | 21 +++++-------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index ad22c4261e..b1dd5de89c 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -32,6 +32,7 @@ def __init__( ) def build(self, input_shape): + super().build() self.H = ops.sqrt(ops.cast(input_shape[1], "float32")) self.W = ops.sqrt(ops.cast(input_shape[2], "float32")) @@ -58,7 +59,6 @@ def call(self, x): H, W = ops.sqrt(ops.cast(shape[1], "float32")), ops.sqrt( ops.cast(shape[1], "float32") ) - # print(B, C, H, W) x = ops.reshape(x, (B, H, W, C)) x = self.dwconv(x) x = ops.reshape(x, (B, -1, C)) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index a49bd4e2e2..d3d6d05f1b 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ 
b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -15,7 +15,10 @@ """MiT backbone model. References: - + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) + - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/models/classification/mix_transformer/mit_tf.py) + - [Based on the NVlabs' official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) + - [Inspired by @sithu31296's reimplementation](https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py) """ # noqa: E501 import copy @@ -101,8 +104,7 @@ def __init__( for blk in transformer_blocks[i]: x = blk(x) x = layer_norms[i](x) - C = x.shape[-1] - x = CustomReshaping(new_height, new_width)(x) + x = keras.layers.Reshape((new_height, new_width, -1))(x) pyramid_level_inputs.append(x) super().__init__(inputs=inputs, outputs=x, **kwargs) @@ -135,16 +137,3 @@ def presets_with_weights(cls): """Dictionary of preset names and configurations that include weights.""" return copy.deepcopy(backbone_presets_with_weights) - - -@keras.saving.register_keras_serializable(package="keras_cv") -class CustomReshaping(keras.layers.Layer): - def __init__(self, H, W): - super().__init__() - self.H = H - self.W = W - - def call(self, x): - input_shape = ops.shape(x) - x = ops.reshape(x, (input_shape[0], self.H, self.W, input_shape[-1])) - return x From 75bb4a26a840886903aa58504e69ac7f210470dc Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 24 Jul 2023 17:33:03 +0200 Subject: [PATCH 15/53] docstrings, argument update --- .../layers/efficient_multihead_attention.py | 37 +++++++++--- .../hierarchical_transformer_encoder.py | 56 ++++++++++++++++++- .../layers/overlapping_patching_embedding.py | 52 ++++++++++++++++- .../mix_transformer_backbone.py | 2 +- 4 files changed, 135 insertions(+), 12 deletions(-) diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index 2c832ff40f..998e34c795 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -1,18 +1,39 @@ from keras_cv.backend import keras from keras_cv.backend import ops -""" -Acknowledgement: - -This implementation is based on the PyTorch implementations from: - - NVlabs' official implementation: https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py - - @sithu31296's reimplementation: https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py -""" - @keras.saving.register_keras_serializable(package="keras_cv") class EfficientMultiheadAttention(keras.layers.Layer): def __init__(self, project_dim, num_heads, sr_ratio): + """ + Efficient MultiHeadAttention implementation as a Keras layer. + A huge bottleneck in scaling transformers is the self-attention layer with an O(n^2) complexity. + + EfficientMultiHeadAttention performs a sequence reduction (SR) operation with a given ratio, to reduce + the sequence length before performing key and value projections, reducing the O(n^2) complexity to O(n^2/R) where + R is the sequence reduction ratio. 
+ + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) + - [NVlabs' official implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) + - [@sithu31296's reimplementation](https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py) + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/efficient_attention.py) + + Args: + project_dim: the dimensionality of the projection of the `EfficientMultiHeadAttention` layer. + num_heads: the number of heads to use in the attention computation. + sr_ratio: the sequence reduction ratio to perform on the sequence before key and value projections. + + Basic usage: + + ``` + tensor = tf.random.uniform([1, 196, 32]) + output = keras_cv.layers.EfficientMultiheadAttention(project_dim=768, + num_heads=2, + sr_ratio=4)(tensor) + print(output.shape) # (1, 196, 32) + ``` + """ super().__init__() self.num_heads = num_heads self.sr_ratio = sr_ratio diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index b1dd5de89c..e4b17012e9 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -10,6 +10,45 @@ @keras.saving.register_keras_serializable(package="keras_cv") class HierarchicalTransformerEncoder(keras.layers.Layer): + """ + Hierarchical transformer encoder block implementation as a Keras Layer. + The layer uses `EfficientMultiheadAttention` as a `MultiHeadAttention` alternative for + computational efficiency, and is meant to be used within the SegFormer architecture. + + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) + - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) + + Args: + project_dim: the dimensionality of the projection of the encoder, and + output of the `EfficientMultiHeadAttention` layer. Due to the residual addition + the input dimensionality has to be equal to the output dimensionality. + num_heads: the number of heads for the `EfficientMultiHeadAttention` layer + drop_prob: default 0.0, the probability of dropping a random sample using the `DropPath` layer. + layer_norm_epsilon: default 1e-06, the epsilon for `LayerNormalization` + layers + sr_ratio: default 1, the ratio to use within `EfficientMultiHeadAttention`. If set to > 1, + a `Conv2D` layer is used to reduce the length of the sequence. 
+ + Basic usage: + + ``` + project_dim = 1024 + num_heads = 4 + patch_size = 16 + + encoded_patches = keras_cv.layers.OverlappingPatchingAndEmbedding( + project_dim=project_dim, patch_size=patch_size)(img_batch) + + trans_encoded = keras_cv.layers.HierarchicalTransformerEncoder(project_dim=project_dim, + num_heads=num_heads, + sr_ratio=1)(encoded_patches) + + print(trans_encoded.shape) # (1, 3136, 1024) + ``` + """ + def __init__( self, project_dim, @@ -20,6 +59,10 @@ def __init__( **kwargs, ): super().__init__(**kwargs) + self.project_dim = project_dim + self.num_heads = num_heads + self.drop_prop = drop_prob + self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) self.attn = EfficientMultiheadAttention( project_dim, num_heads, sr_ratio @@ -32,7 +75,7 @@ def __init__( ) def build(self, input_shape): - super().build() + super().build(input_shape) self.H = ops.sqrt(ops.cast(input_shape[1], "float32")) self.W = ops.sqrt(ops.cast(input_shape[2], "float32")) @@ -41,6 +84,17 @@ def call(self, x): x = x + self.drop_path(self.mlp(self.norm2(x))) return x + def get_config(self): + config = super().get_config() + config.update( + { + "project_dim": self.project_dim, + "num_heads": self.num_heads, + "drop_prop": self.drop_prop, + } + ) + return config + class MixFFN(keras.layers.Layer): def __init__(self, channels, mid_channels): super().__init__() diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index f9f48416b5..3e13c8a911 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -4,10 +4,47 @@ @keras.saving.register_keras_serializable(package="keras_cv") class OverlappingPatchingAndEmbedding(keras.layers.Layer): - def __init__(self, out_channels=32, patch_size=7, stride=4, **kwargs): + def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): + """ + Overlapping Patching and Embedding layer. Differs from `PatchingAndEmbedding` in that the patch size + does not affect the sequence length. It's fully derived from the `stride` parameter. + Additionally, no positional embedding is done as part of the layer - only a projection using a `Conv2D` layer. + + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) + - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) + + Args: + project_dim: the dimensionality of the projection of the encoder, and + output of the `MultiHeadAttention` + num_heads: the number of heads for the `MultiHeadAttention` layer + drop_prob: default 0.0, the probability of dropping a random sample using the `DropPath` layer. + layer_norm_epsilon: default 1e-06, the epsilon for `LayerNormalization` + layers + sr_ratio: default 1, the ratio to use within `EfficientMultiHeadAttention`. If set to > 1, + a `Conv2D` layer is used to reduce the length of the sequence. 
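A quick arithmetic sketch of that point (editorial illustration, not part of the patch; it assumes a 224x224 input and the default `stride=4`):

```
# With `padding="same"`, the token count depends only on `stride`,
# not on `patch_size`.
import math

input_size, stride = 224, 4
tokens_per_side = math.ceil(input_size / stride)  # 56
seq_len = tokens_per_side ** 2                    # 3136, as in the usage example below
```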
+ + Basic usage: + + ``` + project_dim = 1024 + patch_size = 16 + + encoded_patches = keras_cv.layers.OverlappingPatchingAndEmbedding( + project_dim=project_dim, patch_size=patch_size)(img_batch) + + print(encoded_patches.shape) # (1, 3136, 1024) + ``` + """ super().__init__(**kwargs) + + self.project_dim = project_dim + self.patch_size = patch_size + self.stride = stride + self.proj = keras.layers.Conv2D( - filters=out_channels, + filters=project_dim, kernel_size=patch_size, strides=stride, padding="same", @@ -21,3 +58,14 @@ def call(self, x): x = ops.reshape(x, (-1, shape[1] * shape[2], shape[3])) x = self.norm(x) return x + + def get_config(self): + config = super().get_config() + config.update( + { + "project_dim": self.project_dim, + "patch_size": self.patch_size, + "stride": self.stride, + } + ) + return config diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index d3d6d05f1b..9f4116032c 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -63,7 +63,7 @@ def __init__( for i in range(num_stages): patch_embed_layer = cv_layers.OverlappingPatchingAndEmbedding( - out_channels=embedding_dims[0] if i == 0 else embedding_dims[i], + project_dim=embedding_dims[0] if i == 0 else embedding_dims[i], patch_size=7 if i == 0 else 3, stride=4 if i == 0 else 2, name=f"patch_and_embed_{i}", From 97daf7cc005a1c4aa1886e30790f51d96eb0bbf6 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 24 Jul 2023 17:36:27 +0200 Subject: [PATCH 16/53] depths arg --- .../backbones/mix_transformer/mix_transformer_backbone.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 9f4116032c..9579aa33be 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -44,10 +44,10 @@ class MiTBackbone(Backbone): def __init__( self, include_rescaling, + depths, input_shape=(None, None, 3), input_tensor=None, embedding_dims=None, - depths=None, **kwargs, ): drop_path_rate = 0.1 From 5f9dc0c3a10dd0ccbdf53c695f0c92306dc08e7b Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Mon, 24 Jul 2023 22:43:11 +0200 Subject: [PATCH 17/53] sync --- .../segmentation/segformer/segformer.py | 27 +++++++++++++------ 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 1be42001ad..8245381b1d 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -10,21 +10,32 @@ def __init__( num_classes=None, backbone=None, embed_dim=None, - input_shape=None, - input_tensor=None, **kwargs, ): - inputs = utils.parse_model_inputs(input_shape, input_tensor) - x = inputs - y = backbone(x) + + if not isinstance(backbone, keras.layers.Layer) or not isinstance( + backbone, keras.Model + ): + raise ValueError( + "Argument `backbone` must be a `keras.layers.Layer` instance " + f" or `keras.Model`. Received instead " + f"backbone={backbone} (of type {type(backbone)})." 
+ ) + + inputs = backbone.input + backbone(inputs) + + outputs = backbone.pyramid_level_inputs + y = SegFormerHead( in_dims=backbone.output_channels, embed_dim=embed_dim, num_classes=num_classes, name="segformer_head", - )(y) + )(outputs) + output = keras.layers.Resizing( - height=x.shape[1], width=x.shape[2], interpolation="bilinear" + height=inputs.shape[1], width=inputs.shape[2], interpolation="bilinear" )(y) super().__init__( @@ -69,7 +80,7 @@ def __init__(self, in_dims, embed_dim=256, num_classes=19, **kwargs): self.seg_out = keras.layers.Conv2D(filters=num_classes, kernel_size=1) def call(self, features): - B, H, W, _ = features[0].shape + _, H, W, _ = features[0].shape outs = [] for feature, layer in zip(features, self.linear_layers): From efbbd49d8432bd9c945896f7908e86947ac590de Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Thu, 27 Jul 2023 00:02:04 +0200 Subject: [PATCH 18/53] compute output shapes --- keras_cv/layers/efficient_multihead_attention.py | 16 ++++++++-------- .../layers/hierarchical_transformer_encoder.py | 16 +++++++--------- .../models/segmentation/segformer/segformer.py | 5 +++-- 3 files changed, 18 insertions(+), 19 deletions(-) diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index 998e34c795..092ab562e6 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -1,3 +1,5 @@ +import math + from keras_cv.backend import keras from keras_cv.backend import ops @@ -52,17 +54,15 @@ def __init__(self, project_dim, num_heads, sr_ratio): ) self.norm = keras.layers.LayerNormalization() + def compute_output_shape(self, input_shape): + return input_shape + def call(self, x): - input_shape = ops.shape(x) - H, W = ops.sqrt(ops.cast(input_shape[1], "float32")), ops.sqrt( - ops.cast(input_shape[1], "float32") - ) - B, C = ops.cast(input_shape[0], "float32"), ops.cast( - input_shape[2], "float32" - ) + input_shape = x.shape + H, W = int(math.sqrt(input_shape[1])), int(math.sqrt(input_shape[1])) + B, C = input_shape[0], input_shape[-1] q = self.q(x) - q = ops.reshape( q, ( diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index e4b17012e9..58c170d246 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -76,8 +76,9 @@ def __init__( def build(self, input_shape): super().build(input_shape) - self.H = ops.sqrt(ops.cast(input_shape[1], "float32")) - self.W = ops.sqrt(ops.cast(input_shape[2], "float32")) + + def compute_output_shape(self, input_shape): + return input_shape def call(self, x): x = x + self.drop_path(self.attn(self.norm1(x))) @@ -108,14 +109,11 @@ def __init__(self, channels, mid_channels): def call(self, x): x = self.fc1(x) - shape = ops.shape(x) - B, C = ops.cast(shape[0], "float32"), ops.cast(shape[-1], "float32") - H, W = ops.sqrt(ops.cast(shape[1], "float32")), ops.sqrt( - ops.cast(shape[1], "float32") - ) - x = ops.reshape(x, (B, H, W, C)) + shape = x.shape + H, W = int(math.sqrt(shape[1])), int(math.sqrt(shape[1])) + x = ops.reshape(x, (shape[0], H, W, shape[-1])) x = self.dwconv(x) - x = ops.reshape(x, (B, -1, C)) + x = ops.reshape(x, (shape[0], -1, shape[-1])) x = ops.nn.gelu(x) x = self.fc2(x) return x diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 8245381b1d..59cb28c93e 100644 --- 
a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -12,7 +12,6 @@ def __init__( embed_dim=None, **kwargs, ): - if not isinstance(backbone, keras.layers.Layer) or not isinstance( backbone, keras.Model ): @@ -35,7 +34,9 @@ def __init__( )(outputs) output = keras.layers.Resizing( - height=inputs.shape[1], width=inputs.shape[2], interpolation="bilinear" + height=inputs.shape[1], + width=inputs.shape[2], + interpolation="bilinear", )(y) super().__init__( From d3b43c6e239778370baf6e34f8c4c7f69ef0a16c Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Thu, 27 Jul 2023 00:18:37 +0200 Subject: [PATCH 19/53] segformer progress --- .../mix_transformer/mix_transformer_backbone.py | 4 ++-- .../models/segmentation/segformer/segformer.py | 14 +++++++++++--- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 9579aa33be..f44697981a 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -104,8 +104,8 @@ def __init__( for blk in transformer_blocks[i]: x = blk(x) x = layer_norms[i](x) - x = keras.layers.Reshape((new_height, new_width, -1))(x) - pyramid_level_inputs.append(x) + x = keras.layers.Reshape((new_height, new_width, -1), name=f'output_level_{i}')(x) + pyramid_level_inputs.append(utils.get_tensor_input_name(x)) super().__init__(inputs=inputs, outputs=x, **kwargs) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 59cb28c93e..01eeaab092 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -1,6 +1,7 @@ from keras_cv.backend import keras from keras_cv.models import utils from keras_cv.models.task import Task +from keras_cv.utils.train import get_feature_extractor @keras.utils.register_keras_serializable(package="keras_cv") @@ -22,9 +23,10 @@ def __init__( ) inputs = backbone.input - backbone(inputs) - - outputs = backbone.pyramid_level_inputs + feature_extractor = get_feature_extractor( + backbone, list(backbone.pyramid_level_inputs.values()) + ) + outputs = feature_extractor(inputs) y = SegFormerHead( in_dims=backbone.output_channels, @@ -80,17 +82,23 @@ def __init__(self, in_dims, embed_dim=256, num_classes=19, **kwargs): # Final segmentation output self.seg_out = keras.layers.Conv2D(filters=num_classes, kernel_size=1) + def compute_output_shape(self, input_shape): + return input_shape + def call(self, features): _, H, W, _ = features[0].shape outs = [] + print(features) for feature, layer in zip(features, self.linear_layers): + print(feature, layer) feature = layer(feature) feature = keras.image.resize( feature, size=(H, W), method="bilinear" ) outs.append(feature) + seg = self.linear_fuse(keras.ops.concat(outs[::-1], axis=3)) seg = self.dropout(seg) seg = self.seg_out(seg) From dab4e74b42b3c0a355966ddcfc28107f77ff9239 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 28 Jul 2023 01:02:02 +0200 Subject: [PATCH 20/53] head --- .../mix_transformer_backbone.py | 4 +- .../segmentation/segformer/segformer.py | 93 +++++++------------ 2 files changed, 39 insertions(+), 58 deletions(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py 
b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index f44697981a..e2a15a60fe 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -104,7 +104,9 @@ def __init__( for blk in transformer_blocks[i]: x = blk(x) x = layer_norms[i](x) - x = keras.layers.Reshape((new_height, new_width, -1), name=f'output_level_{i}')(x) + x = keras.layers.Reshape( + (new_height, new_width, -1), name=f"output_level_{i}" + )(x) pyramid_level_inputs.append(utils.get_tensor_input_name(x)) super().__init__(inputs=inputs, outputs=x, **kwargs) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 01eeaab092..3a3ac0eaf9 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -1,5 +1,6 @@ +import tensorflow as tf + from keras_cv.backend import keras -from keras_cv.models import utils from keras_cv.models.task import Task from keras_cv.utils.train import get_feature_extractor @@ -23,23 +24,49 @@ def __init__( ) inputs = backbone.input + feature_extractor = get_feature_extractor( backbone, list(backbone.pyramid_level_inputs.values()) ) - outputs = feature_extractor(inputs) + # Multi-level dictionary + features = list(feature_extractor(inputs).values()) + + # Get H and W of level one output + _, H, W, _ = features[0].shape + # Project all multi-level outputs onto the same dimensionality + # and feature map shape + multi_layer_outs = [] + for feature_dim, feature in zip(backbone.output_channels, features): + out = keras.layers.Dense(embed_dim, name=f"linear_{feature_dim}")( + feature + ) + out = keras.layers.Resizing(H, W, interpolation="bilinear")(out) + multi_layer_outs.append(out) + + # Concat now-equal feature maps + concatenated_outs = keras.layers.Concatenate(axis=3)( + multi_layer_outs[::-1] + ) + + # Fuse multi-channel segmentation map into a single-channel segmentation map + seg = keras.Sequential( + [ + keras.layers.Conv2D( + filters=embed_dim, kernel_size=1, use_bias=False + ), + keras.layers.BatchNormalization(), + keras.layers.Activation("relu"), + ] + )(concatenated_outs) - y = SegFormerHead( - in_dims=backbone.output_channels, - embed_dim=embed_dim, - num_classes=num_classes, - name="segformer_head", - )(outputs) + seg = keras.layers.Dropout(0.1)(seg) + seg = keras.layers.Conv2D(filters=num_classes, kernel_size=1)(seg) output = keras.layers.Resizing( height=inputs.shape[1], width=inputs.shape[2], interpolation="bilinear", - )(y) + )(seg) super().__init__( inputs=inputs, @@ -56,51 +83,3 @@ def get_config(self): "backbone": self.backbone, "embed_dim": self.embed_dim, } - - -class SegFormerHead(keras.layers.Layer): - def __init__(self, in_dims, embed_dim=256, num_classes=19, **kwargs): - super().__init__(**kwargs) - self.linear_layers = [] - - for i in in_dims: - self.linear_layers.append( - keras.layers.Dense(embed_dim, name=f"linear_{i}") - ) - - # To fuse multiple layer outputs into a single feature map using a Conv2d - self.linear_fuse = keras.Sequential( - [ - keras.layers.Conv2D( - filters=embed_dim, kernel_size=1, use_bias=False - ), - keras.layers.BatchNormalization(), - keras.layers.Activation("relu"), - ] - ) - self.dropout = keras.layers.Dropout(0.1) - # Final segmentation output - self.seg_out = keras.layers.Conv2D(filters=num_classes, kernel_size=1) - - def compute_output_shape(self, input_shape): - return input_shape - - def 
call(self, features): - _, H, W, _ = features[0].shape - outs = [] - - print(features) - for feature, layer in zip(features, self.linear_layers): - print(feature, layer) - feature = layer(feature) - feature = keras.image.resize( - feature, size=(H, W), method="bilinear" - ) - outs.append(feature) - - - seg = self.linear_fuse(keras.ops.concat(outs[::-1], axis=3)) - seg = self.dropout(seg) - seg = self.seg_out(seg) - - return seg From 1dba059ddd461285977c24947a1fa5d966b156c6 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 28 Jul 2023 01:51:41 +0200 Subject: [PATCH 21/53] softmax --- keras_cv/models/segmentation/segformer/segformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 3a3ac0eaf9..21d97fd400 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -60,7 +60,7 @@ def __init__( )(concatenated_outs) seg = keras.layers.Dropout(0.1)(seg) - seg = keras.layers.Conv2D(filters=num_classes, kernel_size=1)(seg) + seg = keras.layers.Conv2D(filters=num_classes, kernel_size=1, activation="softmax")(seg) output = keras.layers.Resizing( height=inputs.shape[1], From bdc3687128581de79e1ba6eae6e3eae073452bf6 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 28 Jul 2023 18:54:14 +0200 Subject: [PATCH 22/53] remove softmax --- keras_cv/models/segmentation/segformer/segformer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 21d97fd400..3a3ac0eaf9 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -60,7 +60,7 @@ def __init__( )(concatenated_outs) seg = keras.layers.Dropout(0.1)(seg) - seg = keras.layers.Conv2D(filters=num_classes, kernel_size=1, activation="softmax")(seg) + seg = keras.layers.Conv2D(filters=num_classes, kernel_size=1)(seg) output = keras.layers.Resizing( height=inputs.shape[1], From ddfa31547860e24bfd749fa86a6fe4668bb4bea9 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 28 Jul 2023 19:08:53 +0200 Subject: [PATCH 23/53] undo compute_output_shapes() --- keras_cv/layers/efficient_multihead_attention.py | 13 +++++++------ .../layers/hierarchical_transformer_encoder.py | 16 +++++++++------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index 092ab562e6..e524b20e7f 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -54,13 +54,14 @@ def __init__(self, project_dim, num_heads, sr_ratio): ) self.norm = keras.layers.LayerNormalization() - def compute_output_shape(self, input_shape): - return input_shape - def call(self, x): - input_shape = x.shape - H, W = int(math.sqrt(input_shape[1])), int(math.sqrt(input_shape[1])) - B, C = input_shape[0], input_shape[-1] + input_shape = ops.shape(x) + H, W = ops.sqrt(ops.cast(input_shape[1], "float32")), ops.sqrt( + ops.cast(input_shape[1], "float32") + ) + B, C = ops.cast(input_shape[0], "float32"), ops.cast( + input_shape[2], "float32" + ) q = self.q(x) q = ops.reshape( diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index 58c170d246..e4b17012e9 100644 --- 
a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -76,9 +76,8 @@ def __init__( def build(self, input_shape): super().build(input_shape) - - def compute_output_shape(self, input_shape): - return input_shape + self.H = ops.sqrt(ops.cast(input_shape[1], "float32")) + self.W = ops.sqrt(ops.cast(input_shape[2], "float32")) def call(self, x): x = x + self.drop_path(self.attn(self.norm1(x))) @@ -109,11 +108,14 @@ def __init__(self, channels, mid_channels): def call(self, x): x = self.fc1(x) - shape = x.shape - H, W = int(math.sqrt(shape[1])), int(math.sqrt(shape[1])) - x = ops.reshape(x, (shape[0], H, W, shape[-1])) + shape = ops.shape(x) + B, C = ops.cast(shape[0], "float32"), ops.cast(shape[-1], "float32") + H, W = ops.sqrt(ops.cast(shape[1], "float32")), ops.sqrt( + ops.cast(shape[1], "float32") + ) + x = ops.reshape(x, (B, H, W, C)) x = self.dwconv(x) - x = ops.reshape(x, (shape[0], -1, shape[-1])) + x = ops.reshape(x, (B, -1, C)) x = ops.nn.gelu(x) x = self.fc2(x) return x From 5a091b680ba0fb87ebda97b1f15a9b2ec43a0afc Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sun, 30 Jul 2023 21:01:59 +0200 Subject: [PATCH 24/53] efficientmultiheadattention -> segformermultiheadattention --- keras_cv/layers/__init__.py | 2 +- keras_cv/layers/efficient_multihead_attention.py | 8 ++++---- keras_cv/layers/hierarchical_transformer_encoder.py | 12 ++++++------ keras_cv/layers/overlapping_patching_embedding.py | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index 4a5e236140..2f6b38e606 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -17,7 +17,7 @@ from tensorflow.keras.layers import RandomWidth from keras_cv.layers.efficient_multihead_attention import ( - EfficientMultiheadAttention, + SegFormerMultiheadAttention, ) from keras_cv.layers.feature_pyramid import FeaturePyramid from keras_cv.layers.fusedmbconv import FusedMBConvBlock diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/efficient_multihead_attention.py index e524b20e7f..1327d1df3a 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/efficient_multihead_attention.py @@ -5,13 +5,13 @@ @keras.saving.register_keras_serializable(package="keras_cv") -class EfficientMultiheadAttention(keras.layers.Layer): +class SegFormerMultiheadAttention(keras.layers.Layer): def __init__(self, project_dim, num_heads, sr_ratio): """ Efficient MultiHeadAttention implementation as a Keras layer. A huge bottleneck in scaling transformers is the self-attention layer with an O(n^2) complexity. - EfficientMultiHeadAttention performs a sequence reduction (SR) operation with a given ratio, to reduce + SegFormerMultiheadAttention performs a sequence reduction (SR) operation with a given ratio, to reduce the sequence length before performing key and value projections, reducing the O(n^2) complexity to O(n^2/R) where R is the sequence reduction ratio. @@ -22,7 +22,7 @@ def __init__(self, project_dim, num_heads, sr_ratio): - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/efficient_attention.py) Args: - project_dim: the dimensionality of the projection of the `EfficientMultiHeadAttention` layer. + project_dim: the dimensionality of the projection of the `SegFormerMultiheadAttention` layer. num_heads: the number of heads to use in the attention computation. 
sr_ratio: the sequence reduction ratio to perform on the sequence before key and value projections. @@ -30,7 +30,7 @@ def __init__(self, project_dim, num_heads, sr_ratio): ``` tensor = tf.random.uniform([1, 196, 32]) - output = keras_cv.layers.EfficientMultiheadAttention(project_dim=768, + output = keras_cv.layers.SegFormerMultiheadAttention(project_dim=768, num_heads=2, sr_ratio=4)(tensor) print(output.shape) # (1, 196, 32) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index e4b17012e9..4db69b08df 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -3,7 +3,7 @@ from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.layers.efficient_multihead_attention import ( - EfficientMultiheadAttention, + SegFormerMultiheadAttention, ) from keras_cv.layers.regularization.drop_path import DropPath @@ -12,7 +12,7 @@ class HierarchicalTransformerEncoder(keras.layers.Layer): """ Hierarchical transformer encoder block implementation as a Keras Layer. - The layer uses `EfficientMultiheadAttention` as a `MultiHeadAttention` alternative for + The layer uses `SegFormerMultiheadAttention` as a `MultiHeadAttention` alternative for computational efficiency, and is meant to be used within the SegFormer architecture. References: @@ -22,13 +22,13 @@ class HierarchicalTransformerEncoder(keras.layers.Layer): Args: project_dim: the dimensionality of the projection of the encoder, and - output of the `EfficientMultiHeadAttention` layer. Due to the residual addition + output of the `SegFormerMultiheadAttention` layer. Due to the residual addition the input dimensionality has to be equal to the output dimensionality. - num_heads: the number of heads for the `EfficientMultiHeadAttention` layer + num_heads: the number of heads for the `SegFormerMultiheadAttention` layer drop_prob: default 0.0, the probability of dropping a random sample using the `DropPath` layer. layer_norm_epsilon: default 1e-06, the epsilon for `LayerNormalization` layers - sr_ratio: default 1, the ratio to use within `EfficientMultiHeadAttention`. If set to > 1, + sr_ratio: default 1, the ratio to use within `SegFormerMultiheadAttention`. If set to > 1, a `Conv2D` layer is used to reduce the length of the sequence. Basic usage: @@ -64,7 +64,7 @@ def __init__( self.drop_prop = drop_prob self.norm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon) - self.attn = EfficientMultiheadAttention( + self.attn = SegFormerMultiheadAttention( project_dim, num_heads, sr_ratio ) self.drop_path = DropPath(drop_prob) diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index 3e13c8a911..54754948ef 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -22,7 +22,7 @@ def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): drop_prob: default 0.0, the probability of dropping a random sample using the `DropPath` layer. layer_norm_epsilon: default 1e-06, the epsilon for `LayerNormalization` layers - sr_ratio: default 1, the ratio to use within `EfficientMultiHeadAttention`. If set to > 1, + sr_ratio: default 1, the ratio to use within `SegFormerMultiheadAttention`. If set to > 1, a `Conv2D` layer is used to reduce the length of the sequence. 
Basic usage: From 4e9df1608316653ebf023d676093a37a59dd1d0c Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sun, 30 Jul 2023 21:13:12 +0200 Subject: [PATCH 25/53] docstrings --- .../segmentation/segformer/segformer.py | 71 ++++++++++++++++--- 1 file changed, 61 insertions(+), 10 deletions(-) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 3a3ac0eaf9..161923d026 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -7,13 +7,64 @@ @keras.utils.register_keras_serializable(package="keras_cv") class SegFormer(Task): + """A Keras model implementing the SegFormer architecture for semantic + segmentation. + + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) + - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/segmentation/segformer) + + Args: + backbone: `keras.Model`. The backbone network for the model that is + used as a feature extractor for the SegFormer encoder. It is *intended* + to be used only with the MiT backbone model which was created specifically + for SegFormers. It should either be a `keras_cv.models.backbones.backbone.Backbone` or a + `tf.keras.Model` that implements the `pyramid_level_inputs` + property with keys "P2", "P3", "P4", and "P5" and layer names as + values. + num_classes: int, the number of classes for the detection model. Note + that the `num_classes` doesn't contain the background class, and the + classes from the data should be represented by integers with range + [0, `num_classes`). + projection_filters: int, default 256, number of filters in the convolution layer + projecting the concatenated features into a segmentation map. 
+ + Examples: + + Using the class with a `backbone`: + + ```python + import tensorflow as tf + import keras_cv + + images = np.ones(shape=(1, 96, 96, 3)) + labels = np.zeros(shape=(1, 96, 96, 1)) + backbone = keras_cv.models.MiTBackbone.from_preset("MiT_B0") + model = keras_cv.models.segmentation.SegFormer( + num_classes=1, backbone=backbone, + ) + + # Evaluate model + model(images) + + # Train model + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(from_logits=False), + metrics=["accuracy"], + ) + model.fit(images, labels, epochs=3) + ``` + """ + def __init__( self, - num_classes=None, - backbone=None, - embed_dim=None, + backbone, + num_classes, + projection_filters=None, **kwargs, ): + """ """ if not isinstance(backbone, keras.layers.Layer) or not isinstance( backbone, keras.Model ): @@ -37,9 +88,9 @@ def __init__( # and feature map shape multi_layer_outs = [] for feature_dim, feature in zip(backbone.output_channels, features): - out = keras.layers.Dense(embed_dim, name=f"linear_{feature_dim}")( - feature - ) + out = keras.layers.Dense( + projection_filters, name=f"linear_{feature_dim}" + )(feature) out = keras.layers.Resizing(H, W, interpolation="bilinear")(out) multi_layer_outs.append(out) @@ -48,11 +99,11 @@ def __init__( multi_layer_outs[::-1] ) - # Fuse multi-channel segmentation map into a single-channel segmentation map + # Fuse concatenated features into a segmentation map seg = keras.Sequential( [ keras.layers.Conv2D( - filters=embed_dim, kernel_size=1, use_bias=False + filters=projection_filters, kernel_size=1, use_bias=False ), keras.layers.BatchNormalization(), keras.layers.Activation("relu"), @@ -75,11 +126,11 @@ def __init__( ) self.num_classes = num_classes - self.embed_dim = embed_dim + self.projection_filters = projection_filters def get_config(self): return { "num_classes": self.num_classes, "backbone": self.backbone, - "embed_dim": self.embed_dim, + "projection_filters": self.projection_filters, } From 278875cd1ab89fcb6b895fe3a56d45dc2cee3e65 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sun, 30 Jul 2023 21:14:03 +0200 Subject: [PATCH 26/53] softmax output --- keras_cv/models/segmentation/segformer/segformer.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 161923d026..d1e06543a4 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -111,7 +111,9 @@ def __init__( )(concatenated_outs) seg = keras.layers.Dropout(0.1)(seg) - seg = keras.layers.Conv2D(filters=num_classes, kernel_size=1)(seg) + seg = keras.layers.Conv2D( + filters=num_classes, kernel_size=1, activation="softmax" + )(seg) output = keras.layers.Resizing( height=inputs.shape[1], From 6618a659f03203cb6d64a9c9ea3ac2daadaaa632 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Tue, 1 Aug 2023 22:35:42 +0200 Subject: [PATCH 27/53] segformer presets --- .../mix_transformer_aliases.py | 23 ++-- .../mix_transformer_backbone_presets.py | 35 ++++-- .../segmentation/segformer/segformer.py | 4 +- .../segformer/segformer_backbone_presets.py | 113 ++++++++++++++++++ .../segmentation/segformer/segformer_test.py | 94 +++++++++++++++ 5 files changed, 251 insertions(+), 18 deletions(-) create mode 100644 keras_cv/models/segmentation/segformer/segformer_backbone_presets.py create mode 100644 keras_cv/models/segmentation/segformer/segformer_test.py diff --git 
a/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py index bd326ffa0a..74c9a4ec51 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py @@ -62,17 +62,22 @@ def __new__( "input_tensor": input_tensor, } ) - return MiTBackbone.from_preset("MiT_B0", **kwargs) + return MiTBackbone.from_preset("mit_b0", **kwargs) @classproperty def presets(cls): """Dictionary of preset names and configurations.""" - return {} + return { + "mit_b0_imagenet": copy.deepcopy( + backbone_presets["mit_b0_imagenet"] + ), + } @classproperty def presets_with_weights(cls): - """Dictionary of preset names and configurations.""" - return {} + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets class MiTB1Backbone(MiTBackbone): @@ -91,7 +96,7 @@ def __new__( "input_tensor": input_tensor, } ) - return MiTBackbone.from_preset("MiT_B1", **kwargs) + return MiTBackbone.from_preset("mit_b1", **kwargs) @classproperty def presets(cls): @@ -120,7 +125,7 @@ def __new__( "input_tensor": input_tensor, } ) - return MiTBackbone.from_preset("MiT_B2", **kwargs) + return MiTBackbone.from_preset("mit_b2", **kwargs) @classproperty def presets(cls): @@ -149,7 +154,7 @@ def __new__( "input_tensor": input_tensor, } ) - return MiTBackbone.from_preset("MiT_B3", **kwargs) + return MiTBackbone.from_preset("mit_b3", **kwargs) @classproperty def presets(cls): @@ -178,7 +183,7 @@ def __new__( "input_tensor": input_tensor, } ) - return MiTBackbone.from_preset("MiT_B4", **kwargs) + return MiTBackbone.from_preset("mit_b4", **kwargs) @classproperty def presets(cls): @@ -207,7 +212,7 @@ def __new__( "input_tensor": input_tensor, } ) - return MiTBackbone.from_preset("MiT_B5", **kwargs) + return MiTBackbone.from_preset("mit_b5", **kwargs) @classproperty def presets(cls): diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py index ca94d03472..a7c6985bd6 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py @@ -14,7 +14,7 @@ """MiT model preset configurations.""" backbone_presets_no_weights = { - "MiT_B0": { + "mit_b0": { "metadata": { "description": ( "MiT (MixTransformer) model with 8 transformer blocks." @@ -32,7 +32,7 @@ "input_tensor": None, }, }, - "MiT_B1": { + "mit_b1": { "metadata": { "description": ( "MiT (MixTransformer) model with 8 transformer blocks." @@ -50,7 +50,7 @@ "input_tensor": None, }, }, - "MiT_B2": { + "mit_b2": { "metadata": { "description": ( "MiT (MixTransformer) model with 16 transformer blocks." @@ -68,7 +68,7 @@ "input_tensor": None, }, }, - "MiT_B3": { + "mit_b3": { "metadata": { "description": ( "MiT (MixTransformer) model with 28 transformer blocks." @@ -86,7 +86,7 @@ "input_tensor": None, }, }, - "MiT_B4": { + "mit_b4": { "metadata": { "description": ( "MiT (MixTransformer) model with 41 transformer blocks." @@ -104,7 +104,7 @@ "input_tensor": None, }, }, - "MiT_B5": { + "mit_b5": { "metadata": { "description": ( "MiT (MixTransformer) model with 52 transformer blocks." 
@@ -124,7 +124,28 @@ }, } -backbone_presets_with_weights = {} +backbone_presets_with_weights = { + "mit_b0_imagenet": { + "metadata": { + "description": ( + "MiT (MixTransformer) model with 8 transformer blocks." + ), + "params": 3321962, + "official_name": "MiT", + "path": "mit", + }, + "class_name": "keras_cv.models>MiTBackbone", + "config": { + "embedding_dims": [32, 64, 160, 256], + "depths": [2, 2, 2, 2], + "include_rescaling": True, + "input_shape": (224, 224, 3), + "input_tensor": None, + }, + "weights_url": "https://storage.googleapis.com/keras-cv/models/mitb0/imagenet/classification-v0.h5", # noqa: E501 + "weights_hash": "8e0c416cd330b6fa0bcfb3a5ccc43edcbcabf6a463aee3c2a9b6a1398c207d10", # noqa: E501 + }, +} backbone_presets = { **backbone_presets_no_weights, diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index d1e06543a4..206e3050fd 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -39,7 +39,7 @@ class SegFormer(Task): images = np.ones(shape=(1, 96, 96, 3)) labels = np.zeros(shape=(1, 96, 96, 1)) - backbone = keras_cv.models.MiTBackbone.from_preset("MiT_B0") + backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0_imagenet") model = keras_cv.models.segmentation.SegFormer( num_classes=1, backbone=backbone, ) @@ -61,7 +61,7 @@ def __init__( self, backbone, num_classes, - projection_filters=None, + projection_filters=256, **kwargs, ): """ """ diff --git a/keras_cv/models/segmentation/segformer/segformer_backbone_presets.py b/keras_cv/models/segmentation/segformer/segformer_backbone_presets.py new file mode 100644 index 0000000000..02fb178746 --- /dev/null +++ b/keras_cv/models/segmentation/segformer/segformer_backbone_presets.py @@ -0,0 +1,113 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""SegFormer model preset configurations.""" + +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( + MiTBackbone, +) + +backbone_presets_no_weights = { + "segformer_b0": { + "metadata": { + "description": ( + "SegFormer model with MiTB0 backbone." + ), + "params": 3719027, + "official_name": "SegFormerB0", + "path": "segformer_b0", + }, + "class_name": "keras_cv.models>SegFormer", + "config": { + "backbone": MiTBackbone.from_preset('mit_b0_imagenet'), + }, + }, + "segformer_b1": { + "metadata": { + "description": ( + "SegFormer model with MiTB1 backbone." + ), + "params": 13682643, + "official_name": "SegFormerB1", + "path": "segformer_b1", + }, + "class_name": "keras_cv.models>SegFormer", + "config": { + "backbone": MiTBackbone.from_preset('mit_b1'), + }, + }, + "segformer_b2": { + "metadata": { + "description": ( + "SegFormer model with MiTB2 backbone." 
+ ), + "params": 24727507, + "official_name": "SegFormerB2", + "path": "segformer_b2", + }, + "class_name": "keras_cv.models>SegFormer", + "config": { + "backbone": MiTBackbone.from_preset('mit_b2'), + }, + }, + "segformer_b3": { + "metadata": { + "description": ( + "SegFormer model with MiTB3 backbone." + ), + "params": 44603347, + "official_name": "SegFormerB3", + "path": "segformer_b3", + }, + "class_name": "keras_cv.models>SegFormer", + "config": { + "backbone": MiTBackbone.from_preset('mit_b3'), + }, + }, + "segformer_b4": { + "metadata": { + "description": ( + "SegFormer model with MiTB4 backbone." + ), + "params": 61373907, + "official_name": "SegFormerB4", + "path": "segformer_b4", + }, + "class_name": "keras_cv.models>SegFormer", + "config": { + "backbone": MiTBackbone.from_preset('mit_b4'), + }, + }, + "segformer_b5": { + "metadata": { + "description": ( + "SegFormer model with MiTB5 backbone." + ), + "params": 81974227, + "official_name": "SegFormerB5", + "path": "segformer_b5", + }, + "class_name": "keras_cv.models>SegFormer", + "config": { + "backbone": MiTBackbone.from_preset('mit_b5'), + }, + }, +} + +backbone_presets_with_weights = { +} + +backbone_presets = { + **backbone_presets_no_weights, + **backbone_presets_with_weights, +} diff --git a/keras_cv/models/segmentation/segformer/segformer_test.py b/keras_cv/models/segmentation/segformer/segformer_test.py new file mode 100644 index 0000000000..924c506350 --- /dev/null +++ b/keras_cv/models/segmentation/segformer/segformer_test.py @@ -0,0 +1,94 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
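How the preset registry above is meant to be consumed (editorial sketch, not part of the patch; it assumes the standard KerasCV `from_preset` constructors and the preset names registered above):

```
import numpy as np
import keras_cv

images = np.ones((1, 224, 224, 3))

# Build the encoder from an MiT preset, then wrap it in the SegFormer task...
backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0_imagenet")
model = keras_cv.models.SegFormer(backbone=backbone, num_classes=19)

# ...or construct the whole task from one of the SegFormer presets above.
model = keras_cv.models.SegFormer.from_preset("segformer_b0", num_classes=19)

preds = model(images)  # per-pixel scores with shape (1, 224, 224, 19)
```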
+ +import os + +import pytest +import tensorflow as tf +from absl.testing import parameterized +from tensorflow import keras + +from keras_cv.models import MiTBackbone +from keras_cv.models import SegFormer + + +class SegFormerTest(tf.test.TestCase, parameterized.TestCase): + def test_segformer_construction(self): + backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) + model = SegFormer(backbone=backbone, num_classes=1) + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(), + metrics=["accuracy"], + ) + + @pytest.mark.large + def test_segformer_plus_call(self): + backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) + model = SegFormer(backbone=backbone, num_classes=1) + images = tf.random.uniform((2, 512, 512, 3)) + _ = model(images) + _ = model.predict(images) + + @pytest.mark.large + def test_weights_change(self): + target_size = [512, 512, 3] + + images = tf.ones(shape=[1] + target_size) + labels = tf.zeros(shape=[1] + target_size) + ds = tf.data.Dataset.from_tensor_slices((images, labels)) + ds = ds.repeat(2) + ds = ds.batch(2) + + backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) + model = SegFormer(backbone=backbone, num_classes=1) + + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(), + metrics=["accuracy"], + ) + + original_weights = model.get_weights() + model.fit(ds, epochs=1) + updated_weights = model.get_weights() + + for w1, w2 in zip(original_weights, updated_weights): + self.assertNotAllClose(w1, w2) + self.assertFalse(tf.math.reduce_any(tf.math.is_nan(w2))) + + @parameterized.named_parameters( + ("tf_format", "tf", "model"), + ("keras_format", "keras_v3", "model.keras"), + ) + @pytest.mark.large # Saving is slow, so mark these large. + def test_saved_model(self, save_format, filename): + target_size = [512, 512, 3] + + backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) + model = SegFormer(backbone=backbone, num_classes=1) + + input_batch = tf.ones(shape=[2] + target_size) + model_output = model(input_batch) + + save_path = os.path.join(self.get_temp_dir(), filename) + model.save(save_path, save_format=save_format) + restored_model = keras.models.load_model(save_path) + + # Check we got the real object back. + self.assertIsInstance(restored_model, SegFormer) + + # Check that output matches. 
+ restored_output = restored_model(input_batch) + self.assertAllClose(model_output, restored_output) From 00ecd9201da49b51526b84fb5441954e8696e1f6 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Tue, 1 Aug 2023 22:39:21 +0200 Subject: [PATCH 28/53] updating segformer presets --- keras_cv/layers/__init__.py | 2 +- .../segmentation/segformer/segformer.py | 20 ++++++++ ...ckbone_presets.py => segformer_presets.py} | 49 +++++++------------ 3 files changed, 39 insertions(+), 32 deletions(-) rename keras_cv/models/segmentation/segformer/{segformer_backbone_presets.py => segformer_presets.py} (66%) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index b63a9b7490..0e118941fc 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -16,10 +16,10 @@ from tensorflow.keras.layers import RandomHeight from tensorflow.keras.layers import RandomWidth +from keras_cv.layers.augmenter import Augmenter from keras_cv.layers.efficient_multihead_attention import ( SegFormerMultiheadAttention, ) -from keras_cv.layers.augmenter import Augmenter from keras_cv.layers.feature_pyramid import FeaturePyramid from keras_cv.layers.fusedmbconv import FusedMBConvBlock from keras_cv.layers.hierarchical_transformer_encoder import ( diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 206e3050fd..1073679862 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -1,7 +1,16 @@ +import copy + import tensorflow as tf from keras_cv.backend import keras +from keras_cv.models.segmentation.segformer.segformer_presets import ( # noqa: E501 + presets, +) +from keras_cv.models.segmentation.segformer.segformer_presets import ( # noqa: E501 + presets_with_weights, +) from keras_cv.models.task import Task +from keras_cv.utils.python_utils import classproperty from keras_cv.utils.train import get_feature_extractor @@ -136,3 +145,14 @@ def get_config(self): "backbone": self.backbone, "projection_filters": self.projection_filters, } + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return copy.deepcopy(presets) + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return copy.deepcopy(presets_with_weights) diff --git a/keras_cv/models/segmentation/segformer/segformer_backbone_presets.py b/keras_cv/models/segmentation/segformer/segformer_presets.py similarity index 66% rename from keras_cv/models/segmentation/segformer/segformer_backbone_presets.py rename to keras_cv/models/segmentation/segformer/segformer_presets.py index 02fb178746..2f0fdc1610 100644 --- a/keras_cv/models/segmentation/segformer/segformer_backbone_presets.py +++ b/keras_cv/models/segmentation/segformer/segformer_presets.py @@ -17,97 +17,84 @@ MiTBackbone, ) -backbone_presets_no_weights = { +presets_no_weights = { "segformer_b0": { "metadata": { - "description": ( - "SegFormer model with MiTB0 backbone." - ), + "description": ("SegFormer model with MiTB0 backbone."), "params": 3719027, "official_name": "SegFormerB0", "path": "segformer_b0", }, "class_name": "keras_cv.models>SegFormer", "config": { - "backbone": MiTBackbone.from_preset('mit_b0_imagenet'), + "backbone": MiTBackbone.from_preset("mit_b0_imagenet"), }, }, "segformer_b1": { "metadata": { - "description": ( - "SegFormer model with MiTB1 backbone." 
- ), + "description": ("SegFormer model with MiTB1 backbone."), "params": 13682643, "official_name": "SegFormerB1", "path": "segformer_b1", }, "class_name": "keras_cv.models>SegFormer", "config": { - "backbone": MiTBackbone.from_preset('mit_b1'), + "backbone": MiTBackbone.from_preset("mit_b1"), }, }, "segformer_b2": { "metadata": { - "description": ( - "SegFormer model with MiTB2 backbone." - ), + "description": ("SegFormer model with MiTB2 backbone."), "params": 24727507, "official_name": "SegFormerB2", "path": "segformer_b2", }, "class_name": "keras_cv.models>SegFormer", "config": { - "backbone": MiTBackbone.from_preset('mit_b2'), + "backbone": MiTBackbone.from_preset("mit_b2"), }, }, "segformer_b3": { "metadata": { - "description": ( - "SegFormer model with MiTB3 backbone." - ), + "description": ("SegFormer model with MiTB3 backbone."), "params": 44603347, "official_name": "SegFormerB3", "path": "segformer_b3", }, "class_name": "keras_cv.models>SegFormer", "config": { - "backbone": MiTBackbone.from_preset('mit_b3'), + "backbone": MiTBackbone.from_preset("mit_b3"), }, }, "segformer_b4": { "metadata": { - "description": ( - "SegFormer model with MiTB4 backbone." - ), + "description": ("SegFormer model with MiTB4 backbone."), "params": 61373907, "official_name": "SegFormerB4", "path": "segformer_b4", }, "class_name": "keras_cv.models>SegFormer", "config": { - "backbone": MiTBackbone.from_preset('mit_b4'), + "backbone": MiTBackbone.from_preset("mit_b4"), }, }, - "segformer_b5": { + "segformer_b5": { "metadata": { - "description": ( - "SegFormer model with MiTB5 backbone." - ), + "description": ("SegFormer model with MiTB5 backbone."), "params": 81974227, "official_name": "SegFormerB5", "path": "segformer_b5", }, "class_name": "keras_cv.models>SegFormer", "config": { - "backbone": MiTBackbone.from_preset('mit_b5'), + "backbone": MiTBackbone.from_preset("mit_b5"), }, }, } -backbone_presets_with_weights = { -} +presets_with_weights = {} -backbone_presets = { - **backbone_presets_no_weights, - **backbone_presets_with_weights, +presets = { + **presets_no_weights, + **presets_with_weights, } From 97d9d4a2b75533a1d31a5012dfd1a1f0211366ed Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:28:16 +0200 Subject: [PATCH 29/53] segformer presets --- .../mix_transformer_backbone.py | 17 ++++++---- .../segmentation/segformer/segformer.py | 19 +++++------ .../segformer/segformer_presets.py | 32 ++++++++----------- 3 files changed, 33 insertions(+), 35 deletions(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index e2a15a60fe..7e2e42991e 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -45,7 +45,7 @@ def __init__( self, include_rescaling, depths, - input_shape=(None, None, 3), + input_shape=(224, 224, 3), input_tensor=None, embedding_dims=None, **kwargs, @@ -111,8 +111,10 @@ def __init__( super().__init__(inputs=inputs, outputs=x, **kwargs) - self.num_stages = num_stages - self.output_channels = embedding_dims + self.depths = depths + self.embedding_dims = embedding_dims + self.include_rescaling = include_rescaling + self.input_tensor = input_tensor self.pyramid_level_inputs = { f"P{i + 1}": name for i, name in enumerate(pyramid_level_inputs) } @@ -121,10 +123,11 @@ def get_config(self): config = super().get_config() config.update( { - "channels": self.channels, - 
"num_stages": self.num_stages, - "output_channels": self.output_channels, - "pyramid_level_inputs": self.pyramid_level_inputs, + "depths": self.depths, + "embedding_dims": self.embedding_dims, + "include_rescaling": self.include_rescaling, + "input_shape": self.input_shape[1:], + "input_tensor": self.input_tensor, } ) return config diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 1073679862..c77e6bc726 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -1,7 +1,5 @@ import copy -import tensorflow as tf - from keras_cv.backend import keras from keras_cv.models.segmentation.segformer.segformer_presets import ( # noqa: E501 presets, @@ -73,7 +71,6 @@ def __init__( projection_filters=256, **kwargs, ): - """ """ if not isinstance(backbone, keras.layers.Layer) or not isinstance( backbone, keras.Model ): @@ -96,7 +93,7 @@ def __init__( # Project all multi-level outputs onto the same dimensionality # and feature map shape multi_layer_outs = [] - for feature_dim, feature in zip(backbone.output_channels, features): + for feature_dim, feature in zip(backbone.embedding_dims, features): out = keras.layers.Dense( projection_filters, name=f"linear_{feature_dim}" )(feature) @@ -140,11 +137,15 @@ def __init__( self.projection_filters = projection_filters def get_config(self): - return { - "num_classes": self.num_classes, - "backbone": self.backbone, - "projection_filters": self.projection_filters, - } + config = super().get_config() + config.update( + { + "num_classes": self.num_classes, + "backbone": self.backbone, + "projection_filters": self.projection_filters, + "backbone": keras.saving.serialize_keras_object(self.backbone), + } + ) @classproperty def presets(cls): diff --git a/keras_cv/models/segmentation/segformer/segformer_presets.py b/keras_cv/models/segmentation/segformer/segformer_presets.py index 2f0fdc1610..a3ee50d504 100644 --- a/keras_cv/models/segmentation/segformer/segformer_presets.py +++ b/keras_cv/models/segmentation/segformer/segformer_presets.py @@ -13,21 +13,25 @@ # limitations under the License. """SegFormer model preset configurations.""" -from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( - MiTBackbone, +from keras_cv.backend import keras +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( + backbone_presets, ) presets_no_weights = { "segformer_b0": { "metadata": { - "description": ("SegFormer model with MiTB0 backbone."), + "description": ( + "SegFormer model with a pretrained MiTB0 backbone." 
+ ), "params": 3719027, "official_name": "SegFormerB0", "path": "segformer_b0", }, "class_name": "keras_cv.models>SegFormer", "config": { - "backbone": MiTBackbone.from_preset("mit_b0_imagenet"), + "num_classes": 19, + "backbone": backbone_presets["mit_b0_imagenet"], }, }, "segformer_b1": { @@ -38,9 +42,7 @@ "path": "segformer_b1", }, "class_name": "keras_cv.models>SegFormer", - "config": { - "backbone": MiTBackbone.from_preset("mit_b1"), - }, + "config": {"num_classes": 19, "backbone": backbone_presets["mit_b1"]}, }, "segformer_b2": { "metadata": { @@ -50,9 +52,7 @@ "path": "segformer_b2", }, "class_name": "keras_cv.models>SegFormer", - "config": { - "backbone": MiTBackbone.from_preset("mit_b2"), - }, + "config": {"num_classes": 19, "backbone": backbone_presets["mit_b2"]}, }, "segformer_b3": { "metadata": { @@ -62,9 +62,7 @@ "path": "segformer_b3", }, "class_name": "keras_cv.models>SegFormer", - "config": { - "backbone": MiTBackbone.from_preset("mit_b3"), - }, + "config": {"num_classes": 19, "backbone": backbone_presets["mit_b3"]}, }, "segformer_b4": { "metadata": { @@ -74,9 +72,7 @@ "path": "segformer_b4", }, "class_name": "keras_cv.models>SegFormer", - "config": { - "backbone": MiTBackbone.from_preset("mit_b4"), - }, + "config": {"num_classes": 19, "backbone": backbone_presets["mit_b4"]}, }, "segformer_b5": { "metadata": { @@ -86,9 +82,7 @@ "path": "segformer_b5", }, "class_name": "keras_cv.models>SegFormer", - "config": { - "backbone": MiTBackbone.from_preset("mit_b5"), - }, + "config": {"num_classes": 19, "backbone": backbone_presets["mit_b5"]}, }, } From c10963fa101fa2f485322ecb60ce659fc9f7c25d Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:31:44 +0200 Subject: [PATCH 30/53] import aliases --- keras_cv/models/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index c2fcd8d79b..c990f134f5 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -67,6 +67,24 @@ from keras_cv.models.backbones.efficientnet_v2.efficientnet_v2_aliases import ( EfficientNetV2SBackbone, ) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB0Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB1Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB2Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB3Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB4Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB5Backbone, +) from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( MiTBackbone, ) From ab101365f6331e6a7f1f25ffb181544923d4d76f Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:36:24 +0200 Subject: [PATCH 31/53] refactoring --- keras_cv/layers/hierarchical_transformer_encoder.py | 2 +- keras_cv/layers/overlapping_patching_embedding.py | 2 +- ..._multihead_attention.py => segformer_multihead_attention.py} | 2 +- .../backbones/mix_transformer/mix_transformer_backbone.py | 2 +- keras_cv/models/segmentation/segformer/segformer.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) rename keras_cv/layers/{efficient_multihead_attention.py => segformer_multihead_attention.py} (98%) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py 
b/keras_cv/layers/hierarchical_transformer_encoder.py index 4db69b08df..8ad64eb33b 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -8,7 +8,7 @@ from keras_cv.layers.regularization.drop_path import DropPath -@keras.saving.register_keras_serializable(package="keras_cv") +@keras_cv_export("keras_cv.layers.HierarchicalTransformerEncoder") class HierarchicalTransformerEncoder(keras.layers.Layer): """ Hierarchical transformer encoder block implementation as a Keras Layer. diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index 54754948ef..a10c77f29e 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -2,7 +2,7 @@ from keras_cv.backend import ops -@keras.saving.register_keras_serializable(package="keras_cv") +@keras_cv_export("keras_cv.layers.OverlappingPatchingAndEmbedding") class OverlappingPatchingAndEmbedding(keras.layers.Layer): def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): """ diff --git a/keras_cv/layers/efficient_multihead_attention.py b/keras_cv/layers/segformer_multihead_attention.py similarity index 98% rename from keras_cv/layers/efficient_multihead_attention.py rename to keras_cv/layers/segformer_multihead_attention.py index 1327d1df3a..b976a58112 100644 --- a/keras_cv/layers/efficient_multihead_attention.py +++ b/keras_cv/layers/segformer_multihead_attention.py @@ -4,7 +4,7 @@ from keras_cv.backend import ops -@keras.saving.register_keras_serializable(package="keras_cv") +@keras_cv_export("keras_cv.layers.SegFormerMultiheadAttention") class SegFormerMultiheadAttention(keras.layers.Layer): def __init__(self, project_dim, num_heads, sr_ratio): """ diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 7e2e42991e..5cc29b2f19 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -39,7 +39,7 @@ from keras_cv.utils.python_utils import classproperty -@keras.saving.register_keras_serializable(package="keras_cv.models") +@keras_cv_export("keras_cv.layers.MiTBackbone") class MiTBackbone(Backbone): def __init__( self, diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index c77e6bc726..64640f33e3 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -12,7 +12,7 @@ from keras_cv.utils.train import get_feature_extractor -@keras.utils.register_keras_serializable(package="keras_cv") +@keras_cv_export("keras_cv.layers.SegFormer") class SegFormer(Task): """A Keras model implementing the SegFormer architecture for semantic segmentation. 
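
For a concrete sense of how the renamed attention layer is meant to be called after this refactor, here is a minimal usage sketch. It is not part of the patch: the shapes and hyperparameters are illustrative only, and it assumes the layer as it stands at the end of this series, where the input is a flattened H * W feature map.

```python
import numpy as np

from keras_cv.layers.segformer_multihead_attention import (
    SegFormerMultiheadAttention,
)

# The layer consumes flattened feature maps of shape (batch, H * W, channels),
# so a 32x32 feature map with 64 channels becomes a 1024-token sequence.
features = np.random.uniform(size=(1, 32 * 32, 64)).astype("float32")

# project_dim must match the channel count; sr_ratio > 1 means keys and values
# are computed on a spatially reduced copy of the sequence, which is where the
# complexity reduction described in the layer's docstring comes from.
attention = SegFormerMultiheadAttention(project_dim=64, num_heads=2, sr_ratio=4)

out = attention(features)  # output keeps the input shape: (1, 1024, 64)
```
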
From 094189e5a4197f51902e51bdad9d450eed28186b Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:46:23 +0200 Subject: [PATCH 32/53] pr comments --- keras_cv/layers/hierarchical_transformer_encoder.py | 5 +++-- .../mix_transformer/mix_transformer_backbone_presets.py | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index 8ad64eb33b..55ef6b5ba2 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -2,10 +2,10 @@ from keras_cv.backend import keras from keras_cv.backend import ops -from keras_cv.layers.efficient_multihead_attention import ( +from keras_cv.layers.regularization.drop_path import DropPath +from keras_cv.layers.segformer_multihead_attention import ( SegFormerMultiheadAttention, ) -from keras_cv.layers.regularization.drop_path import DropPath @keras_cv_export("keras_cv.layers.HierarchicalTransformerEncoder") @@ -88,6 +88,7 @@ def get_config(self): config = super().get_config() config.update( { + "mlp": keras.saving.serialize_keras_object(self.mlp), "project_dim": self.project_dim, "num_heads": self.num_heads, "drop_prop": self.drop_prop, diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py index a7c6985bd6..b42b3fa3c9 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py @@ -128,7 +128,7 @@ "mit_b0_imagenet": { "metadata": { "description": ( - "MiT (MixTransformer) model with 8 transformer blocks." + "MiT (MixTransformer) model with 8 transformer blocks. Pre-trained on ImageNet-1K and scores 69% top-1 accuracy on the validation set." ), "params": 3321962, "official_name": "MiT", From a4df0a6bb589733d5604c53a80ea806f8fbf473c Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:48:02 +0200 Subject: [PATCH 33/53] pr comments --- keras_cv/models/segmentation/segformer/segformer.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 64640f33e3..c6fe9763b8 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -29,10 +29,7 @@ class SegFormer(Task): `tf.keras.Model` that implements the `pyramid_level_inputs` property with keys "P2", "P3", "P4", and "P5" and layer names as values. - num_classes: int, the number of classes for the detection model. Note - that the `num_classes` doesn't contain the background class, and the - classes from the data should be represented by integers with range - [0, `num_classes`). + num_classes: int, the number of classes for the detection model, including the background class. projection_filters: int, default 256, number of filters in the convolution layer projecting the concatenated features into a segmentation map. 
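
Since `num_classes` is now documented to include the background class, a short sketch of what that convention means in practice (the constructors are the ones introduced in this PR; the preset name, input shape, and class count are illustrative):

```python
import keras_cv

# Backbone from a MiT preset, as used elsewhere in this series.
backbone = keras_cv.models.MiTBackbone.from_preset(
    "mit_b0", input_shape=(512, 512, 3)
)

# Binary foreground/background segmentation: the background class is counted,
# so this is num_classes=2, not num_classes=1.
model = keras_cv.models.SegFormer(backbone=backbone, num_classes=2)
```
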
From e22a15e3612b2a62e0b10cd098bbe6f76bb9b63d Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:54:08 +0200 Subject: [PATCH 34/53] add aliases --- .../segformer/segformer_aliases.py | 256 ++++++++++++++++++ 1 file changed, 256 insertions(+) create mode 100644 keras_cv/models/segmentation/segformer/segformer_aliases.py diff --git a/keras_cv/models/segmentation/segformer/segformer_aliases.py b/keras_cv/models/segmentation/segformer/segformer_aliases.py new file mode 100644 index 0000000000..06e83cc592 --- /dev/null +++ b/keras_cv/models/segmentation/segformer/segformer_aliases.py @@ -0,0 +1,256 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy + +from keras_cv.models.segmentation.segformer.segformer import SegFormer +from keras_cv.models.segmentation.segformer.segformer_presets import presets +from keras_cv.utils.python_utils import classproperty + +ALIAS_DOCSTRING = """SegFormer model. + + For transfer learning use cases, make sure to read the + [guide to transfer learning & fine-tuning](https://keras.io/guides/transfer_learning/). + + Args: + backbone: a KerasCV backbone for feature extraction. + num_classes: the number of classes for segmentation, including the background class. 
+ + Examples: + ```python + input_data = tf.ones(shape=(8, 224, 224, 3)) + + # Randomly initialized backbone + backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0_imagenet") + segformer = keras_cv.models.SegFormer(backbone=backbone, num_classes=19) + output = model(input_data) + ``` +""" # noqa: E501 + + +class SegFormerB0(SegFormer): + def __new__( + cls, + backbone=None, + num_classes=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "backbone": backbone, + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b0", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b0": copy.deepcopy(presets["segformer_b0"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB1(SegFormer): + def __new__( + cls, + backbone=None, + num_classes=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "backbone": backbone, + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b1", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b1": copy.deepcopy(presets["segformer_b1"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB2(SegFormer): + def __new__( + cls, + backbone=None, + num_classes=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "backbone": backbone, + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b2", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b2": copy.deepcopy(presets["segformer_b2"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB3(SegFormer): + def __new__( + cls, + backbone=None, + num_classes=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "backbone": backbone, + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b3", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b3": copy.deepcopy(presets["segformer_b3"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB4(SegFormer): + def __new__( + cls, + backbone=None, + num_classes=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "backbone": backbone, + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b4", **kwargs) + + @classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b4": copy.deepcopy(presets["segformer_b4"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +class SegFormerB5(SegFormer): + def __new__( + cls, + backbone=None, + num_classes=None, + **kwargs, + ): + # Pack args in kwargs + kwargs.update( + { + "backbone": backbone, + "num_classes": num_classes, + } + ) + return SegFormer.from_preset("segformer_b5", **kwargs) + + 
@classproperty + def presets(cls): + """Dictionary of preset names and configurations.""" + return { + "segformer_b5": copy.deepcopy(presets["segformer_b5"]), + } + + @classproperty + def presets_with_weights(cls): + """Dictionary of preset names and configurations that include + weights.""" + return cls.presets + + +setattr( + SegFormerB0, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB0"), +) + +setattr( + SegFormerB1, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB1"), +) + +setattr( + SegFormerB2, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB2"), +) + +setattr( + SegFormerB3, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB3"), +) + +setattr( + SegFormerB4, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB4"), +) + +setattr( + SegFormerB5, + "__doc__", + ALIAS_DOCSTRING.format(name="SegFormerB5"), +) From 5d63d188c014538a7ddf69173c692399b2c55b64 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:54:58 +0200 Subject: [PATCH 35/53] aliases ot init --- keras_cv/models/__init__.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index 91a4c89f47..1ff5a47e74 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -188,5 +188,11 @@ ) from keras_cv.models.segmentation import DeepLabV3Plus from keras_cv.models.segmentation import SegFormer +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB0 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB1 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB2 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB3 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB4 +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB5 from keras_cv.models.stable_diffusion import StableDiffusion from keras_cv.models.stable_diffusion import StableDiffusionV2 From 03a177ffd333ec7fc37133e10e9bedce65c33340 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:56:41 +0200 Subject: [PATCH 36/53] refactor fix --- keras_cv/layers/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index 0e118941fc..55c2a939a4 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -17,7 +17,7 @@ from tensorflow.keras.layers import RandomWidth from keras_cv.layers.augmenter import Augmenter -from keras_cv.layers.efficient_multihead_attention import ( +from keras_cv.layers.segformer_multihead_attention import ( SegFormerMultiheadAttention, ) from keras_cv.layers.feature_pyramid import FeaturePyramid From d1cdd5d373a4874624e83eded60bcf440127e847 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Fri, 18 Aug 2023 23:58:19 +0200 Subject: [PATCH 37/53] import keras_cv_export --- keras_cv/layers/__init__.py | 6 +++--- keras_cv/layers/hierarchical_transformer_encoder.py | 3 +-- keras_cv/layers/overlapping_patching_embedding.py | 1 + keras_cv/layers/segformer_multihead_attention.py | 1 + .../backbones/mix_transformer/mix_transformer_backbone.py | 1 + keras_cv/models/segmentation/segformer/segformer.py | 1 + 6 files changed, 8 insertions(+), 5 deletions(-) diff --git a/keras_cv/layers/__init__.py b/keras_cv/layers/__init__.py index 55c2a939a4..342a942f64 100644 --- a/keras_cv/layers/__init__.py +++ b/keras_cv/layers/__init__.py @@ -17,9 +17,6 @@ from tensorflow.keras.layers import 
RandomWidth from keras_cv.layers.augmenter import Augmenter -from keras_cv.layers.segformer_multihead_attention import ( - SegFormerMultiheadAttention, -) from keras_cv.layers.feature_pyramid import FeaturePyramid from keras_cv.layers.fusedmbconv import FusedMBConvBlock from keras_cv.layers.hierarchical_transformer_encoder import ( @@ -133,6 +130,9 @@ from keras_cv.layers.regularization.dropblock_2d import DropBlock2D from keras_cv.layers.regularization.squeeze_excite import SqueezeAndExcite2D from keras_cv.layers.regularization.stochastic_depth import StochasticDepth +from keras_cv.layers.segformer_multihead_attention import ( + SegFormerMultiheadAttention, +) from keras_cv.layers.spatial_pyramid import SpatialPyramidPooling from keras_cv.layers.transformer_encoder import TransformerEncoder from keras_cv.layers.vit_layers import PatchingAndEmbedding diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index 55ef6b5ba2..ed757aa1e1 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -1,5 +1,4 @@ -import math - +from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.layers.regularization.drop_path import DropPath diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index a10c77f29e..d2e5c0489b 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -1,3 +1,4 @@ +from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops diff --git a/keras_cv/layers/segformer_multihead_attention.py b/keras_cv/layers/segformer_multihead_attention.py index b976a58112..0ffb62e051 100644 --- a/keras_cv/layers/segformer_multihead_attention.py +++ b/keras_cv/layers/segformer_multihead_attention.py @@ -1,5 +1,6 @@ import math +from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 5cc29b2f19..656df27698 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -26,6 +26,7 @@ import numpy as np from keras_cv import layers as cv_layers +from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.models import utils diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index c6fe9763b8..9a01626682 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -1,5 +1,6 @@ import copy +from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.models.segmentation.segformer.segformer_presets import ( # noqa: E501 presets, From ff32d6304bf4c636f81f4f39eb5c79bfd42f74ca Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sat, 19 Aug 2023 21:12:59 +0200 Subject: [PATCH 38/53] fix presets/aliases and add copyright --- .../hierarchical_transformer_encoder.py | 14 +++++++++++ .../layers/overlapping_patching_embedding.py | 14 +++++++++++ .../layers/segformer_multihead_attention.py | 14 
+++++++++++ keras_cv/models/__init__.py | 4 ++++ .../mix_transformer_backbone.py | 2 +- .../mix_transformer_backbone_presets.py | 14 +++++------ .../segmentation/segformer/segformer.py | 16 ++++++++++++- .../segformer/segformer_aliases.py | 24 +++++-------------- .../segformer/segformer_presets.py | 12 +++++----- 9 files changed, 81 insertions(+), 33 deletions(-) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index ed757aa1e1..3a87d24483 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -1,3 +1,17 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index d2e5c0489b..4e1484257d 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -1,3 +1,17 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops diff --git a/keras_cv/layers/segformer_multihead_attention.py b/keras_cv/layers/segformer_multihead_attention.py index 0ffb62e051..0a745ee297 100644 --- a/keras_cv/layers/segformer_multihead_attention.py +++ b/keras_cv/layers/segformer_multihead_attention.py @@ -1,3 +1,17 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ import math from keras_cv.api_export import keras_cv_export diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index 1ff5a47e74..d45b90f427 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -130,6 +130,9 @@ from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( MiTB5Backbone, ) +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTBackbone, +) from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( MiTBackbone, ) @@ -188,6 +191,7 @@ ) from keras_cv.models.segmentation import DeepLabV3Plus from keras_cv.models.segmentation import SegFormer +from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormer from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB0 from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB1 from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB2 diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 656df27698..85ffabedd4 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -40,7 +40,7 @@ from keras_cv.utils.python_utils import classproperty -@keras_cv_export("keras_cv.layers.MiTBackbone") +@keras_cv_export("keras_cv.models.MiTBackbone") class MiTBackbone(Backbone): def __init__( self, diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py index b42b3fa3c9..d135859071 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py @@ -23,7 +23,7 @@ "official_name": "MiT", "path": "mit", }, - "class_name": "keras_cv.models>MiTBackbone", + "class_name": "keras_cv>MiTBackbone", "config": { "embedding_dims": [32, 64, 160, 256], "depths": [2, 2, 2, 2], @@ -41,7 +41,7 @@ "official_name": "MiT", "path": "mit", }, - "class_name": "keras_cv.models>MiTBackbone", + "class_name": "keras_cv>MiTBackbone", "config": { "embedding_dims": [64, 128, 320, 512], "depths": [2, 2, 2, 2], @@ -59,7 +59,7 @@ "official_name": "MiT", "path": "mit", }, - "class_name": "keras_cv.models>MiTBackbone", + "class_name": "keras_cv>MiTBackbone", "config": { "embedding_dims": [64, 128, 320, 512], "depths": [3, 4, 6, 3], @@ -77,7 +77,7 @@ "official_name": "MiT", "path": "mit", }, - "class_name": "keras_cv.models>MiTBackbone", + "class_name": "keras_cv>MiTBackbone", "config": { "embedding_dims": [64, 128, 320, 512], "depths": [3, 4, 18, 3], @@ -95,7 +95,7 @@ "official_name": "MiT", "path": "mit", }, - "class_name": "keras_cv.models>MiTBackbone", + "class_name": "keras_cv>MiTBackbone", "config": { "embedding_dims": [64, 128, 320, 512], "depths": [3, 8, 27, 3], @@ -113,7 +113,7 @@ "official_name": "MiT", "path": "mit", }, - "class_name": "keras_cv.models>MiTBackbone", + "class_name": "keras_cv>MiTBackbone", "config": { "embedding_dims": [64, 128, 320, 512], "depths": [3, 6, 40, 3], @@ -134,7 +134,7 @@ "official_name": "MiT", "path": "mit", }, - "class_name": "keras_cv.models>MiTBackbone", + "class_name": "keras_cv>MiTBackbone", "config": { "embedding_dims": [32, 64, 160, 256], "depths": [2, 2, 2, 2], diff --git 
a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 9a01626682..11c85b3c41 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -1,3 +1,17 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + import copy from keras_cv.api_export import keras_cv_export @@ -13,7 +27,7 @@ from keras_cv.utils.train import get_feature_extractor -@keras_cv_export("keras_cv.layers.SegFormer") +@keras_cv_export("keras_cv.models.segmentation.SegFormer") class SegFormer(Task): """A Keras model implementing the SegFormer architecture for semantic segmentation. diff --git a/keras_cv/models/segmentation/segformer/segformer_aliases.py b/keras_cv/models/segmentation/segformer/segformer_aliases.py index 06e83cc592..faf9645492 100644 --- a/keras_cv/models/segmentation/segformer/segformer_aliases.py +++ b/keras_cv/models/segmentation/segformer/segformer_aliases.py @@ -42,14 +42,12 @@ class SegFormerB0(SegFormer): def __new__( cls, - backbone=None, - num_classes=None, + num_classes=19, **kwargs, ): # Pack args in kwargs kwargs.update( { - "backbone": backbone, "num_classes": num_classes, } ) @@ -72,14 +70,12 @@ def presets_with_weights(cls): class SegFormerB1(SegFormer): def __new__( cls, - backbone=None, - num_classes=None, + num_classes=19, **kwargs, ): # Pack args in kwargs kwargs.update( { - "backbone": backbone, "num_classes": num_classes, } ) @@ -102,14 +98,12 @@ def presets_with_weights(cls): class SegFormerB2(SegFormer): def __new__( cls, - backbone=None, - num_classes=None, + num_classes=19, **kwargs, ): # Pack args in kwargs kwargs.update( { - "backbone": backbone, "num_classes": num_classes, } ) @@ -132,14 +126,12 @@ def presets_with_weights(cls): class SegFormerB3(SegFormer): def __new__( cls, - backbone=None, - num_classes=None, + num_classes=19, **kwargs, ): # Pack args in kwargs kwargs.update( { - "backbone": backbone, "num_classes": num_classes, } ) @@ -162,14 +154,12 @@ def presets_with_weights(cls): class SegFormerB4(SegFormer): def __new__( cls, - backbone=None, - num_classes=None, + num_classes=19, **kwargs, ): # Pack args in kwargs kwargs.update( { - "backbone": backbone, "num_classes": num_classes, } ) @@ -192,14 +182,12 @@ def presets_with_weights(cls): class SegFormerB5(SegFormer): def __new__( cls, - backbone=None, - num_classes=None, + num_classes=19, **kwargs, ): # Pack args in kwargs kwargs.update( { - "backbone": backbone, "num_classes": num_classes, } ) diff --git a/keras_cv/models/segmentation/segformer/segformer_presets.py b/keras_cv/models/segmentation/segformer/segformer_presets.py index a3ee50d504..8faadf66d7 100644 --- a/keras_cv/models/segmentation/segformer/segformer_presets.py +++ b/keras_cv/models/segmentation/segformer/segformer_presets.py @@ -28,7 +28,7 @@ "official_name": "SegFormerB0", "path": "segformer_b0", }, - "class_name": "keras_cv.models>SegFormer", + "class_name": "keras_cv>SegFormer", "config": { 
"num_classes": 19, "backbone": backbone_presets["mit_b0_imagenet"], @@ -41,7 +41,7 @@ "official_name": "SegFormerB1", "path": "segformer_b1", }, - "class_name": "keras_cv.models>SegFormer", + "class_name": "keras_cv>SegFormer", "config": {"num_classes": 19, "backbone": backbone_presets["mit_b1"]}, }, "segformer_b2": { @@ -51,7 +51,7 @@ "official_name": "SegFormerB2", "path": "segformer_b2", }, - "class_name": "keras_cv.models>SegFormer", + "class_name": "keras_cv>SegFormer", "config": {"num_classes": 19, "backbone": backbone_presets["mit_b2"]}, }, "segformer_b3": { @@ -61,7 +61,7 @@ "official_name": "SegFormerB3", "path": "segformer_b3", }, - "class_name": "keras_cv.models>SegFormer", + "class_name": "keras_cv>SegFormer", "config": {"num_classes": 19, "backbone": backbone_presets["mit_b3"]}, }, "segformer_b4": { @@ -71,7 +71,7 @@ "official_name": "SegFormerB4", "path": "segformer_b4", }, - "class_name": "keras_cv.models>SegFormer", + "class_name": "keras_cv>SegFormer", "config": {"num_classes": 19, "backbone": backbone_presets["mit_b4"]}, }, "segformer_b5": { @@ -81,7 +81,7 @@ "official_name": "SegFormerB5", "path": "segformer_b5", }, - "class_name": "keras_cv.models>SegFormer", + "class_name": "keras_cv>SegFormer", "config": {"num_classes": 19, "backbone": backbone_presets["mit_b5"]}, }, } From 5f3fc222cc175de31897e8a23a006d33a5abc681 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sat, 19 Aug 2023 21:21:17 +0200 Subject: [PATCH 39/53] linter warnings --- .../layers/overlapping_patching_embedding.py | 13 +++++++------ .../layers/segformer_multihead_attention.py | 19 +++++++++---------- keras_cv/models/__init__.py | 4 ---- .../mix_transformer_backbone_presets.py | 2 +- .../segmentation/segformer/segformer.py | 18 ++++++++++-------- 5 files changed, 27 insertions(+), 29 deletions(-) diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index 4e1484257d..ff842731c4 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -21,14 +21,15 @@ class OverlappingPatchingAndEmbedding(keras.layers.Layer): def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): """ - Overlapping Patching and Embedding layer. Differs from `PatchingAndEmbedding` in that the patch size - does not affect the sequence length. It's fully derived from the `stride` parameter. - Additionally, no positional embedding is done as part of the layer - only a projection using a `Conv2D` layer. + Overlapping Patching and Embedding layer. Differs from `PatchingAndEmbedding` + in that the patch size does not affect the sequence length. It's fully derived + from the `stride` parameter. Additionally, no positional embedding is done + as part of the layer - only a projection using a `Conv2D` layer. 
References: - - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) - - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) - - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 + - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 Args: project_dim: the dimensionality of the projection of the encoder, and diff --git a/keras_cv/layers/segformer_multihead_attention.py b/keras_cv/layers/segformer_multihead_attention.py index 0a745ee297..92281bad7d 100644 --- a/keras_cv/layers/segformer_multihead_attention.py +++ b/keras_cv/layers/segformer_multihead_attention.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -import math - from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops @@ -24,17 +22,18 @@ class SegFormerMultiheadAttention(keras.layers.Layer): def __init__(self, project_dim, num_heads, sr_ratio): """ Efficient MultiHeadAttention implementation as a Keras layer. - A huge bottleneck in scaling transformers is the self-attention layer with an O(n^2) complexity. + A huge bottleneck in scaling transformers is the self-attention layer + with an O(n^2) complexity. - SegFormerMultiheadAttention performs a sequence reduction (SR) operation with a given ratio, to reduce - the sequence length before performing key and value projections, reducing the O(n^2) complexity to O(n^2/R) where - R is the sequence reduction ratio. + SegFormerMultiheadAttention performs a sequence reduction (SR) operation + with a given ratio, to reduce the sequence length before performing key and value projections, + reducing the O(n^2) complexity to O(n^2/R) where R is the sequence reduction ratio. 
References: - - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) - - [NVlabs' official implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) - - [@sithu31296's reimplementation](https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py) - - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/efficient_attention.py) + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 + - [NVlabs' official implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 + - [@sithu31296's reimplementation](https://github.com/sithu31296/semantic-segmentation/blob/main/semseg/models/backbones/mit.py) # noqa: E501 + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/efficient_attention.py) # noqa: E501 Args: project_dim: the dimensionality of the projection of the `SegFormerMultiheadAttention` layer. diff --git a/keras_cv/models/__init__.py b/keras_cv/models/__init__.py index d45b90f427..9c83a3891a 100644 --- a/keras_cv/models/__init__.py +++ b/keras_cv/models/__init__.py @@ -133,9 +133,6 @@ from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( MiTBackbone, ) -from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( - MiTBackbone, -) from keras_cv.models.backbones.mobilenet_v3.mobilenet_v3_aliases import ( MobileNetV3LargeBackbone, ) @@ -190,7 +187,6 @@ YOLOV8Detector, ) from keras_cv.models.segmentation import DeepLabV3Plus -from keras_cv.models.segmentation import SegFormer from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormer from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB0 from keras_cv.models.segmentation.segformer.segformer_aliases import SegFormerB1 diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py index d135859071..a4c1c2a3e1 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets.py @@ -128,7 +128,7 @@ "mit_b0_imagenet": { "metadata": { "description": ( - "MiT (MixTransformer) model with 8 transformer blocks. Pre-trained on ImageNet-1K and scores 69% top-1 accuracy on the validation set." + "MiT (MixTransformer) model with 8 transformer blocks. Pre-trained on ImageNet-1K and scores 69% top-1 accuracy on the validation set." # noqa: E501 ), "params": 3321962, "official_name": "MiT", diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 11c85b3c41..6ca0884021 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -33,18 +33,20 @@ class SegFormer(Task): segmentation. 
References: - - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) - - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/segmentation/segformer) + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) # noqa: E501 + - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/segmentation/segformer) # noqa: E501 Args: backbone: `keras.Model`. The backbone network for the model that is - used as a feature extractor for the SegFormer encoder. It is *intended* - to be used only with the MiT backbone model which was created specifically - for SegFormers. It should either be a `keras_cv.models.backbones.backbone.Backbone` or a - `tf.keras.Model` that implements the `pyramid_level_inputs` - property with keys "P2", "P3", "P4", and "P5" and layer names as + used as a feature extractor for the SegFormer encoder. + It is *intended* to be used only with the MiT backbone model which + was created specifically for SegFormers. It should either be a + `keras_cv.models.backbones.backbone.Backbone` or a `tf.keras.Model` + that implements the `pyramid_level_inputs` property with keys + "P2", "P3", "P4", and "P5" and layer names as values. - num_classes: int, the number of classes for the detection model, including the background class. + num_classes: int, the number of classes for the detection model, + including the background class. projection_filters: int, default 256, number of filters in the convolution layer projecting the concatenated features into a segmentation map. From c6b454ff261cde6e7592fc67acc255f70767e293 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sat, 19 Aug 2023 21:27:16 +0200 Subject: [PATCH 40/53] linter errors --- .../hierarchical_transformer_encoder.py | 27 +++++++++++-------- .../segmentation/segformer/segformer.py | 6 ++--- .../segformer/segformer_presets.py | 1 - 3 files changed, 19 insertions(+), 15 deletions(-) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index 3a87d24483..b8de9d1de2 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -25,24 +25,29 @@ class HierarchicalTransformerEncoder(keras.layers.Layer): """ Hierarchical transformer encoder block implementation as a Keras Layer. - The layer uses `SegFormerMultiheadAttention` as a `MultiHeadAttention` alternative for - computational efficiency, and is meant to be used within the SegFormer architecture. + The layer uses `SegFormerMultiheadAttention` as a `MultiHeadAttention` + alternative for computational efficiency, and is meant to be used + within the SegFormer architecture. 
References: - - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) - - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) - - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) (CVPR 2021) # noqa: E501 + - [Official PyTorch implementation](https://github.com/NVlabs/SegFormer/blob/master/mmseg/models/backbones/mix_transformer.py) # noqa: E501 + - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 Args: project_dim: the dimensionality of the projection of the encoder, and - output of the `SegFormerMultiheadAttention` layer. Due to the residual addition - the input dimensionality has to be equal to the output dimensionality. - num_heads: the number of heads for the `SegFormerMultiheadAttention` layer - drop_prob: default 0.0, the probability of dropping a random sample using the `DropPath` layer. + output of the `SegFormerMultiheadAttention` layer. Due to the + residual addition the input dimensionality has to be equal to + the output dimensionality. + num_heads: the number of heads for the `SegFormerMultiheadAttention` + layer + drop_prob: default 0.0, the probability of dropping a random sample + using the `DropPath` layer. layer_norm_epsilon: default 1e-06, the epsilon for `LayerNormalization` layers - sr_ratio: default 1, the ratio to use within `SegFormerMultiheadAttention`. If set to > 1, - a `Conv2D` layer is used to reduce the length of the sequence. + sr_ratio: default 1, the ratio to use within `SegFormerMultiheadAttention`. # noqa: E501 + If set to > 1, a `Conv2D` layer is used to reduce the length of + the sequence. Basic usage: diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 6ca0884021..bbba1cdbab 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -47,8 +47,9 @@ class SegFormer(Task): values. num_classes: int, the number of classes for the detection model, including the background class. - projection_filters: int, default 256, number of filters in the convolution layer - projecting the concatenated features into a segmentation map. + projection_filters: int, default 256, number of filters in the + convolution layer projecting the concatenated features into + a segmentation map. Examples: @@ -155,7 +156,6 @@ def get_config(self): config.update( { "num_classes": self.num_classes, - "backbone": self.backbone, "projection_filters": self.projection_filters, "backbone": keras.saving.serialize_keras_object(self.backbone), } diff --git a/keras_cv/models/segmentation/segformer/segformer_presets.py b/keras_cv/models/segmentation/segformer/segformer_presets.py index 8faadf66d7..2f814437cb 100644 --- a/keras_cv/models/segmentation/segformer/segformer_presets.py +++ b/keras_cv/models/segmentation/segformer/segformer_presets.py @@ -13,7 +13,6 @@ # limitations under the License. 
"""SegFormer model preset configurations.""" -from keras_cv.backend import keras from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( backbone_presets, ) From 5ac7f775ef1fbac728223a55b7ad22b5c2fa437e Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sat, 19 Aug 2023 21:28:54 +0200 Subject: [PATCH 41/53] consistency in presets --- .../segformer/segformer_presets.py | 24 +++++++++++++++---- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/keras_cv/models/segmentation/segformer/segformer_presets.py b/keras_cv/models/segmentation/segformer/segformer_presets.py index 2f814437cb..f75c8233c8 100644 --- a/keras_cv/models/segmentation/segformer/segformer_presets.py +++ b/keras_cv/models/segmentation/segformer/segformer_presets.py @@ -20,9 +20,7 @@ presets_no_weights = { "segformer_b0": { "metadata": { - "description": ( - "SegFormer model with a pretrained MiTB0 backbone." - ), + "description": ("SegFormer model with MiTB0 backbone."), "params": 3719027, "official_name": "SegFormerB0", "path": "segformer_b0", @@ -30,7 +28,7 @@ "class_name": "keras_cv>SegFormer", "config": { "num_classes": 19, - "backbone": backbone_presets["mit_b0_imagenet"], + "backbone": backbone_presets["mit_b0"], }, }, "segformer_b1": { @@ -85,7 +83,23 @@ }, } -presets_with_weights = {} +presets_with_weights = { + "segformer_b0_imagenet": { + "metadata": { + "description": ( + "SegFormer model with a pretrained MiTB0 backbone." + ), + "params": 3719027, + "official_name": "SegFormerB0", + "path": "segformer_b0", + }, + "class_name": "keras_cv>SegFormer", + "config": { + "num_classes": 19, + "backbone": backbone_presets["mit_b0_imagenet"], + }, + }, +} presets = { **presets_no_weights, From b2a76ce0631256a4d189d2d5f3cf4806413d8022 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sat, 19 Aug 2023 21:39:53 +0200 Subject: [PATCH 42/53] return config --- keras_cv/models/segmentation/segformer/segformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index bbba1cdbab..8850dc06e1 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -160,6 +160,7 @@ def get_config(self): "backbone": keras.saving.serialize_keras_object(self.backbone), } ) + return config @classproperty def presets(cls): From 0ad5879fb3b5b6314d690384e0ae741e2838fdf2 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Sat, 19 Aug 2023 21:41:48 +0200 Subject: [PATCH 43/53] fix serialization --- keras_cv/models/segmentation/segformer/segformer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 8850dc06e1..315afe7906 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -150,6 +150,7 @@ def __init__( self.num_classes = num_classes self.projection_filters = projection_filters + self.backbone = backbone def get_config(self): config = super().get_config() From eea5e3c87e156dd8e8dd01426bbd571c3318d179 Mon Sep 17 00:00:00 2001 From: ianjjohnson <3072903+ianstenbit@users.noreply.github.com> Date: Mon, 21 Aug 2023 13:11:53 -0600 Subject: [PATCH 44/53] Some cleanup + more tests --- .../layers/segformer_multihead_attention.py | 11 +- .../mix_transformer_aliases.py | 12 +-- .../mix_transformer_backbone_presets_test.py | 100 ++++++++++++++++++ 
.../mix_transformer_backbone_test.py | 69 ++++++++++++ .../segformer/segformer_presets.py | 2 +- .../segmentation/segformer/segformer_test.py | 34 +++--- 6 files changed, 197 insertions(+), 31 deletions(-) create mode 100644 keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py create mode 100644 keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_test.py diff --git a/keras_cv/layers/segformer_multihead_attention.py b/keras_cv/layers/segformer_multihead_attention.py index 92281bad7d..6ec1ba5e2e 100644 --- a/keras_cv/layers/segformer_multihead_attention.py +++ b/keras_cv/layers/segformer_multihead_attention.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math + from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops @@ -70,12 +72,8 @@ def __init__(self, project_dim, num_heads, sr_ratio): def call(self, x): input_shape = ops.shape(x) - H, W = ops.sqrt(ops.cast(input_shape[1], "float32")), ops.sqrt( - ops.cast(input_shape[1], "float32") - ) - B, C = ops.cast(input_shape[0], "float32"), ops.cast( - input_shape[2], "float32" - ) + H, W = int(math.sqrt(input_shape[1])), int(math.sqrt(input_shape[1])) + B, C = input_shape[0], input_shape[2] q = self.q(x) q = ops.reshape( @@ -126,5 +124,6 @@ def call(self, x): ops.transpose(attn, [0, 2, 1, 3]), [input_shape[0], input_shape[1], input_shape[2]], ) + x = self.proj(attn) return x diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py index 74c9a4ec51..7c7ea6a8b6 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_aliases.py @@ -50,7 +50,7 @@ class MiTB0Backbone(MiTBackbone): def __new__( cls, include_rescaling=True, - input_shape=(None, None, 3), + input_shape=(224, 224, 3), input_tensor=None, **kwargs, ): @@ -84,7 +84,7 @@ class MiTB1Backbone(MiTBackbone): def __new__( cls, include_rescaling=True, - input_shape=(None, None, 3), + input_shape=(224, 224, 3), input_tensor=None, **kwargs, ): @@ -113,7 +113,7 @@ class MiTB2Backbone(MiTBackbone): def __new__( cls, include_rescaling=True, - input_shape=(None, None, 3), + input_shape=(224, 224, 3), input_tensor=None, **kwargs, ): @@ -142,7 +142,7 @@ class MiTB3Backbone(MiTBackbone): def __new__( cls, include_rescaling=True, - input_shape=(None, None, 3), + input_shape=(224, 224, 3), input_tensor=None, **kwargs, ): @@ -171,7 +171,7 @@ class MiTB4Backbone(MiTBackbone): def __new__( cls, include_rescaling=True, - input_shape=(None, None, 3), + input_shape=(224, 224, 3), input_tensor=None, **kwargs, ): @@ -200,7 +200,7 @@ class MiTB5Backbone(MiTBackbone): def __new__( cls, include_rescaling=True, - input_shape=(None, None, 3), + input_shape=(224, 224, 3), input_tensor=None, **kwargs, ): diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py new file mode 100644 index 0000000000..0bc443ee92 --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py @@ -0,0 +1,100 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tests for loading pretrained model presets.""" + +import numpy as np +import pytest + +from keras_cv.backend import ops +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB0Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( + MiTBackbone, +) +from keras_cv.tests.test_case import TestCase + + +@pytest.mark.large +class MixTransformerPresetSmokeTest(TestCase): + """ + A smoke test for MixTransformer presets we run continuously. + This only tests the smallest weights we have available. Run with: + `pytest keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py --run_large` # noqa: E501 + """ + + def setUp(self): + self.input_batch = np.ones(shape=(2, 224, 224, 3)) + + def test_backbone_output(self): + model = MiTBackbone.from_preset("mit_b0") + model(self.input_batch) + + def test_backbone_output_with_weights(self): + model = MiTBackbone.from_preset("mit_b0_imagenet") + + # The forward pass from a preset should be stable! + # This test should catch cases where we unintentionally change our + # network code in a way that would invalidate our preset weights. + # We should only update these numbers if we are updating a weights + # file, or have found a discrepancy with the upstream source. + + outputs = model(np.ones(shape=(1, 224, 224, 3))) + expected = [-0.603472, -0.180627, -1.92137, -0.004339, 2.396384] + # Keep a high tolerance, so we are robust to different hardware. + self.assertAllClose( + ops.convert_to_numpy(outputs[0, 0, 0, :5]), + expected, + atol=0.01, + rtol=0.01, + ) + + def test_applications_model_output(self): + model = MiTB0Backbone() + model(self.input_batch) + + def test_applications_model_output_with_preset(self): + model = MiTB0Backbone.from_preset("mit_b0_imagenet") + model(self.input_batch) + + def test_preset_docstring(self): + """Check we did our docstring formatting correctly.""" + for name in MiTBackbone.presets: + self.assertRegex(MiTBackbone.from_preset.__doc__, name) + + def test_unknown_preset_error(self): + # Not a preset name + with self.assertRaises(ValueError): + MiTBackbone.from_preset("mit_b0_clowntown") + + def test_load_weights_error(self): + # Try to load weights when none available + with self.assertRaises(ValueError): + MiTBackbone.from_preset("mit_b0", load_weights=True) + + +@pytest.mark.extra_large +class MixTransformerPresetFullTest(TestCase): + """ + Test the full enumeration of our preset. + This tests every preset for Mix Transformer and is only run manually. 
+ Run with: + `pytest keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_presets_test.py --run_extra_large` # noqa: E501 + """ + + def test_load_mix_transformer(self): + input_data = np.ones(shape=(2, 224, 224, 3)) + for preset in MiTBackbone.presets: + model = MiTBackbone.from_preset(preset) + model(input_data) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_test.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_test.py new file mode 100644 index 0000000000..f24596bdfe --- /dev/null +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone_test.py @@ -0,0 +1,69 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +import numpy as np +import pytest +from absl.testing import parameterized + +from keras_cv.backend import keras +from keras_cv.backend import ops +from keras_cv.models.backbones.mix_transformer.mix_transformer_aliases import ( + MiTB0Backbone, +) +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone import ( + MiTBackbone, +) +from keras_cv.tests.test_case import TestCase + + +class MixTransformerBackboneTest(TestCase): + def setUp(self): + self.input_batch = np.ones(shape=(2, 224, 224, 3)) + + def test_valid_call(self): + model = MiTB0Backbone() + model(self.input_batch) + + @pytest.mark.large # Saving is slow, so mark these large. + def test_saved_model(self): + model = MiTB0Backbone( + include_rescaling=False, + ) + model_output = model(self.input_batch) + save_path = os.path.join(self.get_temp_dir(), "mit_backbone.keras") + model.save(save_path) + restored_model = keras.models.load_model(save_path) + + # Check we got the real object back. + self.assertIsInstance(restored_model, MiTBackbone) + + # Check that output matches. + restored_output = restored_model(self.input_batch) + self.assertAllClose( + ops.convert_to_numpy(model_output), + ops.convert_to_numpy(restored_output), + ) + + @parameterized.named_parameters( + ("one_channel", 1), + ("four_channels", 4), + ) + def test_application_variable_input_channels(self, num_channels): + model = MiTB0Backbone( + input_shape=(224, 224, num_channels), + include_rescaling=False, + ) + self.assertEqual(model.output_shape, (None, 7, 7, 256)) diff --git a/keras_cv/models/segmentation/segformer/segformer_presets.py b/keras_cv/models/segmentation/segformer/segformer_presets.py index f75c8233c8..3d5bfc568b 100644 --- a/keras_cv/models/segmentation/segformer/segformer_presets.py +++ b/keras_cv/models/segmentation/segformer/segformer_presets.py @@ -13,7 +13,7 @@ # limitations under the License. 
"""SegFormer model preset configurations.""" -from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( +from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 backbone_presets, ) diff --git a/keras_cv/models/segmentation/segformer/segformer_test.py b/keras_cv/models/segmentation/segformer/segformer_test.py index 924c506350..0990e0e88f 100644 --- a/keras_cv/models/segmentation/segformer/segformer_test.py +++ b/keras_cv/models/segmentation/segformer/segformer_test.py @@ -14,16 +14,18 @@ import os +import numpy as np import pytest import tensorflow as tf -from absl.testing import parameterized -from tensorflow import keras +from keras_cv.backend import keras +from keras_cv.backend import ops from keras_cv.models import MiTBackbone from keras_cv.models import SegFormer +from keras_cv.tests.test_case import TestCase -class SegFormerTest(tf.test.TestCase, parameterized.TestCase): +class SegFormerTest(TestCase): def test_segformer_construction(self): backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) model = SegFormer(backbone=backbone, num_classes=1) @@ -34,25 +36,25 @@ def test_segformer_construction(self): ) @pytest.mark.large - def test_segformer_plus_call(self): + def test_segformer_call(self): backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) model = SegFormer(backbone=backbone, num_classes=1) - images = tf.random.uniform((2, 512, 512, 3)) + images = np.random.uniform(size=(2, 512, 512, 3)) _ = model(images) _ = model.predict(images) @pytest.mark.large def test_weights_change(self): - target_size = [512, 512, 3] + target_size = [512, 512, 2] - images = tf.ones(shape=[1] + target_size) + images = tf.ones(shape=[1] + [512, 512, 3]) labels = tf.zeros(shape=[1] + target_size) ds = tf.data.Dataset.from_tensor_slices((images, labels)) ds = ds.repeat(2) ds = ds.batch(2) backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) - model = SegFormer(backbone=backbone, num_classes=1) + model = SegFormer(backbone=backbone, num_classes=2) model.compile( optimizer="adam", @@ -65,25 +67,21 @@ def test_weights_change(self): updated_weights = model.get_weights() for w1, w2 in zip(original_weights, updated_weights): - self.assertNotAllClose(w1, w2) - self.assertFalse(tf.math.reduce_any(tf.math.is_nan(w2))) + self.assertNotAllEqual(w1, w2) + self.assertFalse(ops.any(ops.isnan(w2))) - @parameterized.named_parameters( - ("tf_format", "tf", "model"), - ("keras_format", "keras_v3", "model.keras"), - ) @pytest.mark.large # Saving is slow, so mark these large. - def test_saved_model(self, save_format, filename): + def test_saved_model(self): target_size = [512, 512, 3] backbone = MiTBackbone.from_preset("mit_b0", input_shape=[512, 512, 3]) model = SegFormer(backbone=backbone, num_classes=1) - input_batch = tf.ones(shape=[2] + target_size) + input_batch = np.ones(shape=[2] + target_size) model_output = model(input_batch) - save_path = os.path.join(self.get_temp_dir(), filename) - model.save(save_path, save_format=save_format) + save_path = os.path.join(self.get_temp_dir(), "model.keras") + model.save(save_path, save_format="keras_v3") restored_model = keras.models.load_model(save_path) # Check we got the real object back. 
From 8e62cf6941bf5522499fa3d04bdf2a0f4fd2c4a2 Mon Sep 17 00:00:00 2001 From: ianjjohnson <3072903+ianstenbit@users.noreply.github.com> Date: Mon, 21 Aug 2023 13:22:02 -0600 Subject: [PATCH 45/53] Fix DropPath layer (need to update tests + add shim for tf.keras --- .../layers/hierarchical_transformer_encoder.py | 8 ++++---- keras_cv/layers/regularization/drop_path.py | 17 +++++++++-------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index b8de9d1de2..1917f078c9 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import math + from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops @@ -128,10 +130,8 @@ def __init__(self, channels, mid_channels): def call(self, x): x = self.fc1(x) shape = ops.shape(x) - B, C = ops.cast(shape[0], "float32"), ops.cast(shape[-1], "float32") - H, W = ops.sqrt(ops.cast(shape[1], "float32")), ops.sqrt( - ops.cast(shape[1], "float32") - ) + H, W = int(math.sqrt(shape[1])), int(math.sqrt(shape[1])) + B, C = shape[0], shape[2] x = ops.reshape(x, (B, H, W, C)) x = self.dwconv(x) x = ops.reshape(x, (B, -1, C)) diff --git a/keras_cv/layers/regularization/drop_path.py b/keras_cv/layers/regularization/drop_path.py index e254f29493..2999ea2ddf 100644 --- a/keras_cv/layers/regularization/drop_path.py +++ b/keras_cv/layers/regularization/drop_path.py @@ -12,13 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -from tensorflow import keras - from keras_cv.api_export import keras_cv_export +from keras_cv.backend import keras +from keras_cv.backend import ops @keras_cv_export("keras_cv.layers.DropPath") -class DropPath(keras.__internal__.layers.BaseRandomLayer): +class DropPath(keras.layers.Layer): """ Implements the DropPath layer. DropPath randomly drops samples during training with a probability of `rate`. 
Note that this layer drops individual @@ -47,7 +47,7 @@ class DropPath(keras.__internal__.layers.BaseRandomLayer): """ # noqa: E501 def __init__(self, rate=0.5, seed=None, **kwargs): - super().__init__(seed=seed, **kwargs) + super().__init__(**kwargs) self.rate = rate self.seed = seed @@ -55,12 +55,13 @@ def call(self, x, training=None): if self.rate == 0.0 or not training: return x else: - keep_prob = 1 - self.rate drop_map_shape = (x.shape[0],) + (1,) * (len(x.shape) - 1) - drop_map = keras.backend.random_bernoulli( - drop_map_shape, p=keep_prob, seed=self.seed + drop_map = ops.cast( + keras.random.uniform(drop_map_shape, seed=self.seed) + > self.rate, + x.dtype, ) - x = x / keep_prob + x = x / (1.0 - self.rate) x = x * drop_map return x From b9efeb1fb9c705b62579c0a676733f75948244db Mon Sep 17 00:00:00 2001 From: ianjjohnson <3072903+ianstenbit@users.noreply.github.com> Date: Mon, 21 Aug 2023 13:57:31 -0600 Subject: [PATCH 46/53] Finish DropPath layer --- keras_cv/backend/__init__.py | 1 + keras_cv/backend/random.py | 20 +++++++++++++++++++ keras_cv/layers/regularization/drop_path.py | 7 ++++--- .../layers/regularization/drop_path_test.py | 18 ++++++++++------- .../models/segmentation/segformer/__init__.py | 14 +++++++++++++ 5 files changed, 50 insertions(+), 10 deletions(-) create mode 100644 keras_cv/backend/random.py diff --git a/keras_cv/backend/__init__.py b/keras_cv/backend/__init__.py index da703722b9..7440acbd38 100644 --- a/keras_cv/backend/__init__.py +++ b/keras_cv/backend/__init__.py @@ -76,6 +76,7 @@ from keras_cv.backend import config # noqa: E402 from keras_cv.backend import ops # noqa: E402 +from keras_cv.backend import random # noqa: E402 from keras_cv.backend import tf_ops # noqa: E402 diff --git a/keras_cv/backend/random.py b/keras_cv/backend/random.py new file mode 100644 index 0000000000..21d4b08c7d --- /dev/null +++ b/keras_cv/backend/random.py @@ -0,0 +1,20 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from keras_cv.backend.config import multi_backend + +if multi_backend(): + from keras_core.random import * # noqa: F403, F401 +else: + from keras_core.src.backend.tensorflow.random import * # noqa: F403, F401 diff --git a/keras_cv/layers/regularization/drop_path.py b/keras_cv/layers/regularization/drop_path.py index 2999ea2ddf..4475e2365f 100644 --- a/keras_cv/layers/regularization/drop_path.py +++ b/keras_cv/layers/regularization/drop_path.py @@ -15,6 +15,7 @@ from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras from keras_cv.backend import ops +from keras_cv.backend import random @keras_cv_export("keras_cv.layers.DropPath") @@ -55,10 +56,10 @@ def call(self, x, training=None): if self.rate == 0.0 or not training: return x else: - drop_map_shape = (x.shape[0],) + (1,) * (len(x.shape) - 1) + batch_size = x.shape[0] or ops.shape(x)[0] + drop_map_shape = (batch_size,) + (1,) * (len(x.shape) - 1) drop_map = ops.cast( - keras.random.uniform(drop_map_shape, seed=self.seed) - > self.rate, + random.uniform(drop_map_shape, seed=self.seed) > self.rate, x.dtype, ) x = x / (1.0 - self.rate) diff --git a/keras_cv/layers/regularization/drop_path_test.py b/keras_cv/layers/regularization/drop_path_test.py index 22f63b5223..00b4b790f0 100644 --- a/keras_cv/layers/regularization/drop_path_test.py +++ b/keras_cv/layers/regularization/drop_path_test.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np +import pytest import tensorflow as tf from keras_cv.layers import DropPath @@ -23,7 +25,7 @@ class DropPathTest(TestCase): def test_input_unchanged_in_eval_mode(self): layer = DropPath(rate=0.5, seed=42) - inputs = tf.random.uniform(self.FEATURE_SHAPE) + inputs = np.random.uniform(size=self.FEATURE_SHAPE) outputs = layer(inputs, training=False) @@ -31,7 +33,7 @@ def test_input_unchanged_in_eval_mode(self): def test_input_unchanged_with_rate_equal_to_zero(self): layer = DropPath(rate=0, seed=42) - inputs = tf.random.uniform(self.FEATURE_SHAPE) + inputs = np.random.uniform(size=self.FEATURE_SHAPE) outputs = layer(inputs, training=True) @@ -39,7 +41,7 @@ def test_input_unchanged_with_rate_equal_to_zero(self): def test_input_gets_partially_zeroed_out_in_train_mode(self): layer = DropPath(rate=0.2, seed=42) - inputs = tf.random.uniform(self.FEATURE_SHAPE) + inputs = np.random.uniform(size=self.FEATURE_SHAPE) outputs = layer(inputs, training=True) @@ -48,9 +50,11 @@ def test_input_gets_partially_zeroed_out_in_train_mode(self): self.assertGreaterEqual(non_zeros_inputs, non_zeros_outputs) + # Because randomness is inconsistent across backends, we just test with 1. 
+ @pytest.mark.tf_keras_only def test_strict_input_gets_partially_zeroed_out_in_train_mode(self): - layer = DropPath(rate=0.5, seed=42) - inputs = tf.random.uniform(self.FEATURE_SHAPE) + layer = DropPath(rate=0.5, seed=10) + inputs = np.random.uniform(size=self.FEATURE_SHAPE) total_non_zero_inputs = 0 total_non_zero_outputs = 0 @@ -66,6 +70,6 @@ def test_strict_input_gets_partially_zeroed_out_in_train_mode(self): self.assertAllInRange( total_non_zero_outputs, - int(0.49 * tf.cast(total_non_zero_inputs, tf.float32)), - int(0.51 * tf.cast(total_non_zero_inputs, tf.float32)), + int(0.40 * tf.cast(total_non_zero_inputs, tf.float32)), + int(0.60 * tf.cast(total_non_zero_inputs, tf.float32)), ) diff --git a/keras_cv/models/segmentation/segformer/__init__.py b/keras_cv/models/segmentation/segformer/__init__.py index e76527fdde..59d29582c2 100644 --- a/keras_cv/models/segmentation/segformer/__init__.py +++ b/keras_cv/models/segmentation/segformer/__init__.py @@ -1 +1,15 @@ +# Copyright 2023 The KerasCV Authors +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + from keras_cv.models.segmentation.segformer.segformer import SegFormer From bd5a99f4cf7da767f4491c95911f7366158c699f Mon Sep 17 00:00:00 2001 From: ianjjohnson <3072903+ianstenbit@users.noreply.github.com> Date: Mon, 21 Aug 2023 14:38:21 -0600 Subject: [PATCH 47/53] Use static shape in backbone --- .../backbones/mix_transformer/mix_transformer_backbone.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 85ffabedd4..e29ef4bd35 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -97,8 +97,8 @@ def __init__( # call in `OverlappingPatchingAndEmbedding` stride = 4 if i == 0 else 2 new_height, new_width = ( - int(ops.shape(x)[1] / stride), - int(ops.shape(x)[2] / stride), + int(x.shape[1] / stride), + int(x.shape[2] / stride), ) x = patch_embedding_layers[i](x) From 3d29b0a319a5cc88a82e3b077d8a1dbaa3a220b5 Mon Sep 17 00:00:00 2001 From: ianjjohnson <3072903+ianstenbit@users.noreply.github.com> Date: Mon, 21 Aug 2023 14:41:27 -0600 Subject: [PATCH 48/53] Formatting --- .../models/backbones/mix_transformer/mix_transformer_backbone.py | 1 - 1 file changed, 1 deletion(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index e29ef4bd35..9907f6bd7c 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -28,7 +28,6 @@ from keras_cv import layers as cv_layers from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras -from keras_cv.backend import ops from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from 
keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 From 4e2c4e88b993280e75a79366436d488753a82c5d Mon Sep 17 00:00:00 2001 From: ianjjohnson <3072903+ianstenbit@users.noreply.github.com> Date: Mon, 21 Aug 2023 14:50:55 -0600 Subject: [PATCH 49/53] Switch back to ops.shape --- .../backbones/mix_transformer/mix_transformer_backbone.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 9907f6bd7c..85ffabedd4 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -28,6 +28,7 @@ from keras_cv import layers as cv_layers from keras_cv.api_export import keras_cv_export from keras_cv.backend import keras +from keras_cv.backend import ops from keras_cv.models import utils from keras_cv.models.backbones.backbone import Backbone from keras_cv.models.backbones.mix_transformer.mix_transformer_backbone_presets import ( # noqa: E501 @@ -96,8 +97,8 @@ def __init__( # call in `OverlappingPatchingAndEmbedding` stride = 4 if i == 0 else 2 new_height, new_width = ( - int(x.shape[1] / stride), - int(x.shape[2] / stride), + int(ops.shape(x)[1] / stride), + int(ops.shape(x)[2] / stride), ) x = patch_embedding_layers[i](x) From b32e0cfd4d2e39168d4997dee700b41d862c2010 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Wed, 23 Aug 2023 21:05:22 +0200 Subject: [PATCH 50/53] documentation --- .../hierarchical_transformer_encoder.py | 26 +++++----- .../layers/overlapping_patching_embedding.py | 12 ++--- .../layers/segformer_multihead_attention.py | 9 ++-- .../mix_transformer_backbone.py | 49 +++++++++++++++++++ 4 files changed, 72 insertions(+), 24 deletions(-) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index 1917f078c9..f0dc0aa09f 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -37,19 +37,19 @@ class HierarchicalTransformerEncoder(keras.layers.Layer): - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 Args: - project_dim: the dimensionality of the projection of the encoder, and - output of the `SegFormerMultiheadAttention` layer. Due to the - residual addition the input dimensionality has to be equal to - the output dimensionality. - num_heads: the number of heads for the `SegFormerMultiheadAttention` - layer - drop_prob: default 0.0, the probability of dropping a random sample - using the `DropPath` layer. - layer_norm_epsilon: default 1e-06, the epsilon for `LayerNormalization` - layers - sr_ratio: default 1, the ratio to use within `SegFormerMultiheadAttention`. # noqa: E501 - If set to > 1, a `Conv2D` layer is used to reduce the length of - the sequence. + project_dim: integer, the dimensionality of the projection of the + encoder, and output of the `SegFormerMultiheadAttention` layer. + Due to the residual addition the input dimensionality has to be + equal to the output dimensionality. + num_heads: integer, the number of heads for the + `SegFormerMultiheadAttention` layer + drop_prob: float, default 0.0, the probability of dropping a random + sample using the `DropPath` layer. 
+ layer_norm_epsilon: float, default 1e-06, the epsilon for + `LayerNormalization` layers + sr_ratio: integer, default 1, the ratio to use within + `SegFormerMultiheadAttention`. If set to > 1, a `Conv2D` + layer is used to reduce the length of the sequence. Basic usage: diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index ff842731c4..624c266d18 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -32,14 +32,10 @@ def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 Args: - project_dim: the dimensionality of the projection of the encoder, and - output of the `MultiHeadAttention` - num_heads: the number of heads for the `MultiHeadAttention` layer - drop_prob: default 0.0, the probability of dropping a random sample using the `DropPath` layer. - layer_norm_epsilon: default 1e-06, the epsilon for `LayerNormalization` - layers - sr_ratio: default 1, the ratio to use within `SegFormerMultiheadAttention`. If set to > 1, - a `Conv2D` layer is used to reduce the length of the sequence. + project_dim: integer, default 32, the dimensionality of the projection + patch_size: integer, default 7, the size of the patches to encode + stride: integer, default 4, the stride to use for the patching before + projection Basic usage: diff --git a/keras_cv/layers/segformer_multihead_attention.py b/keras_cv/layers/segformer_multihead_attention.py index 6ec1ba5e2e..203773d4ea 100644 --- a/keras_cv/layers/segformer_multihead_attention.py +++ b/keras_cv/layers/segformer_multihead_attention.py @@ -38,9 +38,12 @@ def __init__(self, project_dim, num_heads, sr_ratio): - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/efficient_attention.py) # noqa: E501 Args: - project_dim: the dimensionality of the projection of the `SegFormerMultiheadAttention` layer. - num_heads: the number of heads to use in the attention computation. - sr_ratio: the sequence reduction ratio to perform on the sequence before key and value projections. + project_dim: integer, the dimensionality of the projection + of the `SegFormerMultiheadAttention` layer. + num_heads: integer, the number of heads to use in the + attention computation. + sr_ratio: integer, the sequence reduction ratio to perform + on the sequence before key and value projections. Basic usage: diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 85ffabedd4..b6824331d9 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -51,6 +51,55 @@ def __init__( embedding_dims=None, **kwargs, ): + """A Keras model implementing the MixTransformer architecture to be + used as a backbone for the SegFormer architecture. 
+ + References: + - [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) # noqa: E501 + - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/classification/mix_transformer) # noqa: E501 + + Args: + backbone: `keras.Model`. The backbone network for the model that is + used as a feature extractor for the SegFormer encoder. + It is *intended* to be used only with the MiT backbone model which + was created specifically for SegFormers. It should either be a + `keras_cv.models.backbones.backbone.Backbone` or a `tf.keras.Model` + that implements the `pyramid_level_inputs` property with keys + "P2", "P3", "P4", and "P5" and layer names as + values. + num_classes: int, the number of classes for the detection model, + including the background class. + projection_filters: int, default 256, number of filters in the + convolution layer projecting the concatenated features into + a segmentation map. + + Examples: + + Using the class with a `backbone`: + + ```python + import tensorflow as tf + import keras_cv + + images = np.ones(shape=(1, 96, 96, 3)) + labels = np.zeros(shape=(1, 96, 96, 1)) + backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0_imagenet") + model = keras_cv.models.segmentation.SegFormer( + num_classes=1, backbone=backbone, + ) + + # Evaluate model + model(images) + + # Train model + model.compile( + optimizer="adam", + loss=keras.losses.BinaryCrossentropy(from_logits=False), + metrics=["accuracy"], + ) + model.fit(images, labels, epochs=3) + ``` + """ drop_path_rate = 0.1 dpr = [x for x in np.linspace(0.0, drop_path_rate, sum(depths))] blockwise_num_heads = [1, 2, 5, 8] From 743a3bb032f1671ea4d75176fa6fdaef23200b24 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Wed, 23 Aug 2023 21:09:21 +0200 Subject: [PATCH 51/53] documentation --- .../layers/hierarchical_transformer_encoder.py | 14 +++++++------- keras_cv/layers/overlapping_patching_embedding.py | 10 ++++++---- .../mix_transformer/mix_transformer_backbone.py | 4 ++-- .../models/segmentation/segformer/segformer.py | 4 ++-- 4 files changed, 17 insertions(+), 15 deletions(-) diff --git a/keras_cv/layers/hierarchical_transformer_encoder.py b/keras_cv/layers/hierarchical_transformer_encoder.py index f0dc0aa09f..ee67a17b56 100644 --- a/keras_cv/layers/hierarchical_transformer_encoder.py +++ b/keras_cv/layers/hierarchical_transformer_encoder.py @@ -42,14 +42,14 @@ class HierarchicalTransformerEncoder(keras.layers.Layer): Due to the residual addition the input dimensionality has to be equal to the output dimensionality. num_heads: integer, the number of heads for the - `SegFormerMultiheadAttention` layer - drop_prob: float, default 0.0, the probability of dropping a random - sample using the `DropPath` layer. - layer_norm_epsilon: float, default 1e-06, the epsilon for - `LayerNormalization` layers - sr_ratio: integer, default 1, the ratio to use within + `SegFormerMultiheadAttention` layer. + drop_prob: float, the probability of dropping a random + sample using the `DropPath` layer. Defaults to `0.0`. + layer_norm_epsilon: float, the epsilon for + `LayerNormalization` layers. Defaults to `1e-06` + sr_ratio: integer, the ratio to use within `SegFormerMultiheadAttention`. If set to > 1, a `Conv2D` - layer is used to reduce the length of the sequence. + layer is used to reduce the length of the sequence. Defaults to `1`. 
Basic usage: diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index 624c266d18..2753e28452 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -32,10 +32,12 @@ def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): - [Ported from the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/blob/main/deepvision/layers/hierarchical_transformer_encoder.py) # noqa: E501 Args: - project_dim: integer, default 32, the dimensionality of the projection - patch_size: integer, default 7, the size of the patches to encode - stride: integer, default 4, the stride to use for the patching before - projection + project_dim: integer, the dimensionality of the projection. + Defaults to `32`. + patch_size: integer, the size of the patches to encode. + Defaults to `7`. + stride: integer, the stride to use for the patching before + projection. Defaults to 5`. Basic usage: diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index b6824331d9..8a0078b193 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -69,9 +69,9 @@ def __init__( values. num_classes: int, the number of classes for the detection model, including the background class. - projection_filters: int, default 256, number of filters in the + projection_filters: int, number of filters in the convolution layer projecting the concatenated features into - a segmentation map. + a segmentation map. Defaults to `256`. Examples: diff --git a/keras_cv/models/segmentation/segformer/segformer.py b/keras_cv/models/segmentation/segformer/segformer.py index 315afe7906..0985b13749 100644 --- a/keras_cv/models/segmentation/segformer/segformer.py +++ b/keras_cv/models/segmentation/segformer/segformer.py @@ -47,9 +47,9 @@ class SegFormer(Task): values. num_classes: int, the number of classes for the detection model, including the background class. - projection_filters: int, default 256, number of filters in the + projection_filters: int, number of filters in the convolution layer projecting the concatenated features into - a segmentation map. + a segmentation map. Defaults to 256`. 
Examples: From c640fc992b84013086184316478431cde10a0ac7 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Wed, 23 Aug 2023 21:12:57 +0200 Subject: [PATCH 52/53] remove default num classes --- .../segmentation/segformer/segformer_aliases.py | 12 ++++++------ .../segmentation/segformer/segformer_presets.py | 12 +++++------- 2 files changed, 11 insertions(+), 13 deletions(-) diff --git a/keras_cv/models/segmentation/segformer/segformer_aliases.py b/keras_cv/models/segmentation/segformer/segformer_aliases.py index faf9645492..03547f60f2 100644 --- a/keras_cv/models/segmentation/segformer/segformer_aliases.py +++ b/keras_cv/models/segmentation/segformer/segformer_aliases.py @@ -42,7 +42,7 @@ class SegFormerB0(SegFormer): def __new__( cls, - num_classes=19, + num_classes, **kwargs, ): # Pack args in kwargs @@ -70,7 +70,7 @@ def presets_with_weights(cls): class SegFormerB1(SegFormer): def __new__( cls, - num_classes=19, + num_classes, **kwargs, ): # Pack args in kwargs @@ -98,7 +98,7 @@ def presets_with_weights(cls): class SegFormerB2(SegFormer): def __new__( cls, - num_classes=19, + num_classes, **kwargs, ): # Pack args in kwargs @@ -126,7 +126,7 @@ def presets_with_weights(cls): class SegFormerB3(SegFormer): def __new__( cls, - num_classes=19, + num_classes, **kwargs, ): # Pack args in kwargs @@ -154,7 +154,7 @@ def presets_with_weights(cls): class SegFormerB4(SegFormer): def __new__( cls, - num_classes=19, + num_classes, **kwargs, ): # Pack args in kwargs @@ -182,7 +182,7 @@ def presets_with_weights(cls): class SegFormerB5(SegFormer): def __new__( cls, - num_classes=19, + num_classes, **kwargs, ): # Pack args in kwargs diff --git a/keras_cv/models/segmentation/segformer/segformer_presets.py b/keras_cv/models/segmentation/segformer/segformer_presets.py index 3d5bfc568b..e19e2ec9ba 100644 --- a/keras_cv/models/segmentation/segformer/segformer_presets.py +++ b/keras_cv/models/segmentation/segformer/segformer_presets.py @@ -27,7 +27,6 @@ }, "class_name": "keras_cv>SegFormer", "config": { - "num_classes": 19, "backbone": backbone_presets["mit_b0"], }, }, @@ -39,7 +38,7 @@ "path": "segformer_b1", }, "class_name": "keras_cv>SegFormer", - "config": {"num_classes": 19, "backbone": backbone_presets["mit_b1"]}, + "config": {"backbone": backbone_presets["mit_b1"]}, }, "segformer_b2": { "metadata": { @@ -49,7 +48,7 @@ "path": "segformer_b2", }, "class_name": "keras_cv>SegFormer", - "config": {"num_classes": 19, "backbone": backbone_presets["mit_b2"]}, + "config": {"backbone": backbone_presets["mit_b2"]}, }, "segformer_b3": { "metadata": { @@ -59,7 +58,7 @@ "path": "segformer_b3", }, "class_name": "keras_cv>SegFormer", - "config": {"num_classes": 19, "backbone": backbone_presets["mit_b3"]}, + "config": {"backbone": backbone_presets["mit_b3"]}, }, "segformer_b4": { "metadata": { @@ -69,7 +68,7 @@ "path": "segformer_b4", }, "class_name": "keras_cv>SegFormer", - "config": {"num_classes": 19, "backbone": backbone_presets["mit_b4"]}, + "config": {"backbone": backbone_presets["mit_b4"]}, }, "segformer_b5": { "metadata": { @@ -79,7 +78,7 @@ "path": "segformer_b5", }, "class_name": "keras_cv>SegFormer", - "config": {"num_classes": 19, "backbone": backbone_presets["mit_b5"]}, + "config": {"backbone": backbone_presets["mit_b5"]}, }, } @@ -95,7 +94,6 @@ }, "class_name": "keras_cv>SegFormer", "config": { - "num_classes": 19, "backbone": backbone_presets["mit_b0_imagenet"], }, }, From f1b5ffaf0a30801dc85003e876e4b7e4239e67e6 Mon Sep 17 00:00:00 2001 From: DavidLandup0 Date: Wed, 23 Aug 2023 21:19:45 +0200 Subject: 
[PATCH 53/53] fix docs --- .../layers/overlapping_patching_embedding.py | 2 +- .../mix_transformer_backbone.py | 26 +++++++------------ 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/keras_cv/layers/overlapping_patching_embedding.py b/keras_cv/layers/overlapping_patching_embedding.py index 2753e28452..69060087ec 100644 --- a/keras_cv/layers/overlapping_patching_embedding.py +++ b/keras_cv/layers/overlapping_patching_embedding.py @@ -37,7 +37,7 @@ def __init__(self, project_dim=32, patch_size=7, stride=4, **kwargs): patch_size: integer, the size of the patches to encode. Defaults to `7`. stride: integer, the stride to use for the patching before - projection. Defaults to 5`. + projection. Defaults to `5`. Basic usage: diff --git a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py index 8a0078b193..bf6a1a6ec2 100644 --- a/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py +++ b/keras_cv/models/backbones/mix_transformer/mix_transformer_backbone.py @@ -59,19 +59,16 @@ def __init__( - [Based on the TensorFlow implementation from DeepVision](https://github.com/DavidLandup0/deepvision/tree/main/deepvision/models/classification/mix_transformer) # noqa: E501 Args: - backbone: `keras.Model`. The backbone network for the model that is - used as a feature extractor for the SegFormer encoder. - It is *intended* to be used only with the MiT backbone model which - was created specifically for SegFormers. It should either be a - `keras_cv.models.backbones.backbone.Backbone` or a `tf.keras.Model` - that implements the `pyramid_level_inputs` property with keys - "P2", "P3", "P4", and "P5" and layer names as - values. - num_classes: int, the number of classes for the detection model, - including the background class. - projection_filters: int, number of filters in the - convolution layer projecting the concatenated features into - a segmentation map. Defaults to `256`. + include_rescaling: bool, whether to rescale the inputs. If set + to `True`, inputs will be passed through a `Rescaling(1/255.0)` + layer. + depths: the number of transformer encoders to be used per stage in the + network + embedding_dims: the embedding dims per hierarchical stage, used as + the levels of the feature pyramid + input_shape: optional shape tuple, defaults to (None, None, 3). + input_tensor: optional Keras tensor (i.e. output of `keras.layers.Input()`) + to use as image input for the model. Examples: @@ -84,9 +81,6 @@ def __init__( images = np.ones(shape=(1, 96, 96, 3)) labels = np.zeros(shape=(1, 96, 96, 1)) backbone = keras_cv.models.MiTBackbone.from_preset("mit_b0_imagenet") - model = keras_cv.models.segmentation.SegFormer( - num_classes=1, backbone=backbone, - ) # Evaluate model model(images)