Revert EANet examples
nkovela1 committed Nov 9, 2023
1 parent 70961b6 commit c9e4e1b
Showing 3 changed files with 224 additions and 115 deletions.
74 changes: 40 additions & 34 deletions examples/vision/eanet.py
@@ -2,7 +2,7 @@
Title: Image classification with EANet (External Attention Transformer)
Author: [ZhiYong Chang](https://github.com/czy00000)
Date created: 2021/10/19
-Last modified: 2023/07/18
+Last modified: 2021/10/19
Description: Image classification with a Transformer that leverages external attention.
Accelerator: GPU
"""
@@ -18,16 +18,25 @@
linear layers and two normalization layers. It conveniently replaces self-attention
as used in existing architectures. External attention has linear complexity, as it only
implicitly considers the correlations between all samples.
+This example requires TensorFlow 2.5 or higher, as well as
+the [TensorFlow Addons](https://www.tensorflow.org/addons/overview) package,
+which can be installed using the following command:
+```python
+pip install -U tensorflow-addons
+```
"""

"""
## Setup
"""

-import keras
-from keras import layers
-from keras import ops
-
+import numpy as np
+import tensorflow as tf
+from tensorflow import keras
+from tensorflow.keras import layers
+import tensorflow_addons as tfa
import matplotlib.pyplot as plt


@@ -53,7 +62,7 @@
label_smoothing = 0.1
validation_split = 0.2
batch_size = 128
-num_epochs = 1 # Recommended num_epochs = 1.
+num_epochs = 50
patch_size = 2 # Size of the patches to be extracted from the input images.
num_patches = (input_shape[0] // patch_size) ** 2 # Number of patches
embedding_dim = 64 # Number of hidden units.
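For CIFAR-100 inputs these settings are easy to check by hand; a quick sanity check, assuming `input_shape` is `(32, 32, 3)` as defined earlier in the file:

```python
patch_size = 2
num_patches = (32 // patch_size) ** 2    # 16 * 16 = 256 patches per image
patch_dim = patch_size * patch_size * 3  # 12 raw values per patch before the Dense projection
```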
@@ -95,11 +104,18 @@ def __init__(self, patch_size, **kwargs):
super().__init__(**kwargs)
self.patch_size = patch_size

-    def call(self, x):
-        B, C = ops.shape(x)[0], ops.shape(x)[-1]
-        x = ops.image.extract_patches(x, self.patch_size)
-        x = ops.reshape(x, (B, -1, self.patch_size * self.patch_size * C))
-        return x
+    def call(self, images):
+        batch_size = tf.shape(images)[0]
+        patches = tf.image.extract_patches(
+            images=images,
+            sizes=(1, self.patch_size, self.patch_size, 1),
+            strides=(1, self.patch_size, self.patch_size, 1),
+            rates=(1, 1, 1, 1),
+            padding="VALID",
+        )
+        patch_dim = patches.shape[-1]
+        patch_num = patches.shape[1]
+        return tf.reshape(patches, (batch_size, patch_num * patch_num, patch_dim))
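A standalone shape check of the reverted implementation, assuming CIFAR-sized inputs and `patch_size = 2` (dummy data, illustrative only):

```python
import tensorflow as tf

images = tf.random.normal((8, 32, 32, 3))  # dummy batch of CIFAR-sized images
patches = tf.image.extract_patches(
    images=images,
    sizes=(1, 2, 2, 1),
    strides=(1, 2, 2, 1),
    rates=(1, 1, 1, 1),
    padding="VALID",
)
print(patches.shape)  # (8, 16, 16, 12): a 16x16 grid of flattened 2x2x3 patches
print(tf.reshape(patches, (8, 16 * 16, 12)).shape)  # (8, 256, 12), as PatchExtract returns
```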


class PatchEmbedding(layers.Layer):
@@ -110,7 +126,7 @@ def __init__(self, num_patch, embed_dim, **kwargs):
self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim)

def call(self, patch):
-        pos = ops.arange(start=0, stop=self.num_patch, step=1)
+        pos = tf.range(start=0, limit=self.num_patch, delta=1)
return self.proj(patch) + self.pos_embed(pos)
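The embedding adds a learned position vector per patch index: `tf.range` enumerates the indices 0 through `num_patch - 1`, and the `Embedding` layer maps each index to a trainable `embed_dim` vector that is summed with the projected patch. A hypothetical smoke test, assuming the full `PatchEmbedding` class from this file is in scope:

```python
import tensorflow as tf

embed = PatchEmbedding(num_patch=256, embed_dim=64)  # class defined above
dummy_patches = tf.random.normal((8, 256, 12))       # (batch, patches, patch_dim)
print(embed(dummy_patches).shape)                    # (8, 256, 64)
```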


@@ -120,37 +136,29 @@ def call(self, patch):


def external_attention(
-    x,
-    dim,
-    num_heads,
-    dim_coefficient=4,
-    attention_dropout=0,
-    projection_dropout=0,
+    x, dim, num_heads, dim_coefficient=4, attention_dropout=0, projection_dropout=0
):
_, num_patch, channel = x.shape
assert dim % num_heads == 0
num_heads = num_heads * dim_coefficient

x = layers.Dense(dim * dim_coefficient)(x)
# create tensor [batch_size, num_patches, num_heads, dim*dim_coefficient//num_heads]
-    x = ops.reshape(x, (-1, num_patch, num_heads, dim * dim_coefficient // num_heads))
-    x = ops.transpose(x, axes=[0, 2, 1, 3])
+    x = tf.reshape(
+        x, shape=(-1, num_patch, num_heads, dim * dim_coefficient // num_heads)
+    )
+    x = tf.transpose(x, perm=[0, 2, 1, 3])
# a linear layer M_k
attn = layers.Dense(dim // dim_coefficient)(x)
# normalize attention map
attn = layers.Softmax(axis=2)(attn)
    # double-normalization
-    attn = layers.Lambda(
-        lambda attn: ops.divide(
-            attn,
-            ops.convert_to_tensor(1e-9) + ops.sum(attn, axis=-1, keepdims=True),
-        )
-    )(attn)
+    attn = attn / (1e-9 + tf.reduce_sum(attn, axis=-1, keepdims=True))
attn = layers.Dropout(attention_dropout)(attn)
# a linear layer M_v
x = layers.Dense(dim * dim_coefficient // num_heads)(attn)
-    x = ops.transpose(x, axes=[0, 2, 1, 3])
-    x = ops.reshape(x, [-1, num_patch, dim * dim_coefficient])
+    x = tf.transpose(x, perm=[0, 2, 1, 3])
+    x = tf.reshape(x, [-1, num_patch, dim * dim_coefficient])
# a linear layer to project original dim
x = layers.Dense(dim)(x)
x = layers.Dropout(projection_dropout)(x)
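The double normalization above is the distinctive step of external attention: a softmax over the patch axis followed by an l1 normalization over the memory-unit axis. A small standalone check of just that step, on a dummy tensor:

```python
import tensorflow as tf

attn = tf.random.uniform((1, 4, 6, 3))  # (batch, heads, patches, memory units)
attn = tf.nn.softmax(attn, axis=2)      # normalize over the patch axis
attn = attn / (1e-9 + tf.reduce_sum(attn, axis=-1, keepdims=True))  # l1 over memory units
print(tf.reduce_sum(attn, axis=-1)[0, 0].numpy())  # every entry is ~1.0
```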
@@ -163,7 +171,7 @@ def external_attention(


def mlp(x, embedding_dim, mlp_dim, drop_rate=0.2):
-    x = layers.Dense(mlp_dim, activation=ops.gelu)(x)
+    x = layers.Dense(mlp_dim, activation=tf.nn.gelu)(x)
x = layers.Dropout(drop_rate)(x)
x = layers.Dense(embedding_dim)(x)
x = layers.Dropout(drop_rate)(x)
@@ -198,9 +206,7 @@ def transformer_encoder(
)
elif attention_type == "self_attention":
x = layers.MultiHeadAttention(
-            num_heads=num_heads,
-            key_dim=embedding_dim,
-            dropout=attention_dropout,
+            num_heads=num_heads, key_dim=embedding_dim, dropout=attention_dropout
)(x, x)
x = layers.add([x, residual_1])
residual_2 = x
@@ -250,7 +256,7 @@ def get_model(attention_type="external_attention"):
attention_type,
)

-    x = layers.GlobalAveragePooling1D()(x)
+    x = layers.GlobalAvgPool1D()(x)
outputs = layers.Dense(num_classes, activation="softmax")(x)
model = keras.Model(inputs=inputs, outputs=outputs)
return model
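A hypothetical smoke test for the builder, assuming the globals defined earlier in the file (`input_shape`, `num_classes`, and the layer definitions above):

```python
model = get_model(attention_type="external_attention")
model.summary()  # roughly 0.3M parameters with these settings, per the conclusion below
```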
@@ -266,7 +272,7 @@ def get_model(attention_type="external_attention"):

model.compile(
loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing),
-    optimizer=keras.optimizers.AdamW(
+    optimizer=tfa.optimizers.AdamW(
learning_rate=learning_rate, weight_decay=weight_decay
),
metrics=[
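The optimizer swap is the substantive change in this last hunk: `keras.optimizers.AdamW` is replaced by the TensorFlow Addons implementation, which applies decoupled weight decay (Loshchilov & Hutter). A minimal standalone use, assuming tensorflow-addons is installed:

```python
import tensorflow_addons as tfa

opt = tfa.optimizers.AdamW(learning_rate=1e-3, weight_decay=1e-4)
```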
78 changes: 40 additions & 38 deletions examples/vision/ipynb/eanet.ipynb
@@ -10,7 +10,7 @@
"\n",
"**Author:** [ZhiYong Chang](https://github.com/czy00000)<br>\n",
"**Date created:** 2021/10/19<br>\n",
"**Last modified:** 2023/07/18<br>\n",
"**Last modified:** 2021/10/19<br>\n",
"**Description:** Image classification with a Transformer that leverages external attention."
]
},
@@ -21,15 +21,20 @@
},
"source": [
"## Introduction\n",
"\n",
"This example implements the [EANet](https://arxiv.org/abs/2105.02358)\n",
"model for image classification, and demonstrates it on the CIFAR-100 dataset.\n",
"EANet introduces a novel attention mechanism\n",
"named ***external attention***, based on two external, small, learnable, and\n",
"shared memories, which can be implemented easily by simply using two cascaded\n",
"linear layers and two normalization layers. It conveniently replaces self-attention\n",
"as used in existing architectures. External attention has linear complexity, as it only\n",
"implicitly considers the correlations between all samples."
"implicitly considers the correlations between all samples.\n",
"This example requires TensorFlow 2.5 or higher, as well as\n",
"[TensorFlow Addons](https://www.tensorflow.org/addons/overview) package,\n",
"which can be installed using the following command:\n",
"```python\n",
"pip install -U tensorflow-addons\n",
"```"
]
},
{
@@ -49,10 +54,11 @@
},
"outputs": [],
"source": [
"import keras\n",
"from keras import layers\n",
"from keras import ops\n",
"\n",
"import numpy as np\n",
"import tensorflow as tf\n",
"from tensorflow import keras\n",
"from tensorflow.keras import layers\n",
"import tensorflow_addons as tfa\n",
"import matplotlib.pyplot as plt\n",
""
]
@@ -106,7 +112,7 @@
"label_smoothing = 0.1\n",
"validation_split = 0.2\n",
"batch_size = 128\n",
"num_epochs = 1 # Recommended num_epochs = 1.\n",
"num_epochs = 50\n",
"patch_size = 2 # Size of the patches to be extracted from the input images.\n",
"num_patches = (input_shape[0] // patch_size) ** 2 # Number of patch\n",
"embedding_dim = 64 # Number of hidden units.\n",
@@ -176,11 +182,18 @@
" super().__init__(**kwargs)\n",
" self.patch_size = patch_size\n",
"\n",
" def call(self, x):\n",
" B, C = ops.shape(x)[0], ops.shape(x)[-1]\n",
" x = ops.image.extract_patches(x, self.patch_size)\n",
" x = ops.reshape(x, (B, -1, self.patch_size * self.patch_size * C))\n",
" return x\n",
" def call(self, images):\n",
" batch_size = tf.shape(images)[0]\n",
" patches = tf.image.extract_patches(\n",
" images=images,\n",
" sizes=(1, self.patch_size, self.patch_size, 1),\n",
" strides=(1, self.patch_size, self.patch_size, 1),\n",
" rates=(1, 1, 1, 1),\n",
" padding=\"VALID\",\n",
" )\n",
" patch_dim = patches.shape[-1]\n",
" patch_num = patches.shape[1]\n",
" return tf.reshape(patches, (batch_size, patch_num * patch_num, patch_dim))\n",
"\n",
"\n",
"class PatchEmbedding(layers.Layer):\n",
@@ -191,7 +204,7 @@
" self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim)\n",
"\n",
" def call(self, patch):\n",
" pos = ops.arange(start=0, stop=self.num_patch, step=1)\n",
" pos = tf.range(start=0, limit=self.num_patch, delta=1)\n",
" return self.proj(patch) + self.pos_embed(pos)\n",
""
]
@@ -215,37 +228,29 @@
"source": [
"\n",
"def external_attention(\n",
" x,\n",
" dim,\n",
" num_heads,\n",
" dim_coefficient=4,\n",
" attention_dropout=0,\n",
" projection_dropout=0,\n",
" x, dim, num_heads, dim_coefficient=4, attention_dropout=0, projection_dropout=0\n",
"):\n",
" _, num_patch, channel = x.shape\n",
" assert dim % num_heads == 0\n",
" num_heads = num_heads * dim_coefficient\n",
"\n",
" x = layers.Dense(dim * dim_coefficient)(x)\n",
" # create tensor [batch_size, num_patches, num_heads, dim*dim_coefficient//num_heads]\n",
" x = ops.reshape(x, (-1, num_patch, num_heads, dim * dim_coefficient // num_heads))\n",
" x = ops.transpose(x, axes=[0, 2, 1, 3])\n",
" x = tf.reshape(\n",
" x, shape=(-1, num_patch, num_heads, dim * dim_coefficient // num_heads)\n",
" )\n",
" x = tf.transpose(x, perm=[0, 2, 1, 3])\n",
" # a linear layer M_k\n",
" attn = layers.Dense(dim // dim_coefficient)(x)\n",
" # normalize attention map\n",
" attn = layers.Softmax(axis=2)(attn)\n",
" # dobule-normalization\n",
" attn = layers.Lambda(\n",
" lambda attn: ops.divide(\n",
" attn,\n",
" ops.convert_to_tensor(1e-9) + ops.sum(attn, axis=-1, keepdims=True),\n",
" )\n",
" )(attn)\n",
" attn = attn / (1e-9 + tf.reduce_sum(attn, axis=-1, keepdims=True))\n",
" attn = layers.Dropout(attention_dropout)(attn)\n",
" # a linear layer M_v\n",
" x = layers.Dense(dim * dim_coefficient // num_heads)(attn)\n",
" x = ops.transpose(x, axes=[0, 2, 1, 3])\n",
" x = ops.reshape(x, [-1, num_patch, dim * dim_coefficient])\n",
" x = tf.transpose(x, perm=[0, 2, 1, 3])\n",
" x = tf.reshape(x, [-1, num_patch, dim * dim_coefficient])\n",
" # a linear layer to project original dim\n",
" x = layers.Dense(dim)(x)\n",
" x = layers.Dropout(projection_dropout)(x)\n",
@@ -272,7 +277,7 @@
"source": [
"\n",
"def mlp(x, embedding_dim, mlp_dim, drop_rate=0.2):\n",
" x = layers.Dense(mlp_dim, activation=ops.gelu)(x)\n",
" x = layers.Dense(mlp_dim, activation=tf.nn.gelu)(x)\n",
" x = layers.Dropout(drop_rate)(x)\n",
" x = layers.Dense(embedding_dim)(x)\n",
" x = layers.Dropout(drop_rate)(x)\n",
@@ -321,9 +326,7 @@
" )\n",
" elif attention_type == \"self_attention\":\n",
" x = layers.MultiHeadAttention(\n",
" num_heads=num_heads,\n",
" key_dim=embedding_dim,\n",
" dropout=attention_dropout,\n",
" num_heads=num_heads, key_dim=embedding_dim, dropout=attention_dropout\n",
" )(x, x)\n",
" x = layers.add([x, residual_1])\n",
" residual_2 = x\n",
@@ -392,7 +395,7 @@
" attention_type,\n",
" )\n",
"\n",
" x = layers.GlobalAveragePooling1D()(x)\n",
" x = layers.GlobalAvgPool1D()(x)\n",
" outputs = layers.Dense(num_classes, activation=\"softmax\")(x)\n",
" model = keras.Model(inputs=inputs, outputs=outputs)\n",
" return model\n",
@@ -421,7 +424,7 @@
"\n",
"model.compile(\n",
" loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing),\n",
" optimizer=keras.optimizers.AdamW(\n",
" optimizer=tfa.optimizers.AdamW(\n",
" learning_rate=learning_rate, weight_decay=weight_decay\n",
" ),\n",
" metrics=[\n",
@@ -501,7 +504,6 @@
"and the same hyperparameters, The EANet model we just trained has just 0.3M parameters,\n",
"and it gets us to ~73% test top-5 accuracy and ~43% top-1 accuracy. This fully demonstrates the\n",
"effectiveness of external attention.\n",
"\n",
"We only show the training\n",
"process of EANet, you can train Vit under the same experimental conditions and observe\n",
"the test results."
@@ -512,7 +514,7 @@
"accelerator": "GPU",
"colab": {
"collapsed_sections": [],
"name": "eanet",
"name": "EANet",
"private_outputs": false,
"provenance": [],
"toc_visible": true