From 84625c991c2152e39c62a99886f05568724ff960 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 26 Feb 2024 21:56:52 +0000 Subject: [PATCH 1/4] enable clip large GPU tests and fix jax broadcast_to error --- .kokoro/github/ubuntu/gpu/build.sh | 2 ++ keras_cv/models/feature_extractor/clip/clip_image_model.py | 4 +++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.kokoro/github/ubuntu/gpu/build.sh b/.kokoro/github/ubuntu/gpu/build.sh index 76ac0631b4..8e1bf5d616 100644 --- a/.kokoro/github/ubuntu/gpu/build.sh +++ b/.kokoro/github/ubuntu/gpu/build.sh @@ -70,6 +70,7 @@ then keras_cv/models/object_detection/yolo_v8 \ keras_cv/models/object_detection_3d \ keras_cv/models/segmentation \ + keras_cv/models/feature_extractor/clip \ keras_cv/models/stable_diffusion else pytest --cache-clear --check_gpu --run_large --durations 0 \ @@ -84,5 +85,6 @@ else keras_cv/models/object_detection/yolo_v8 \ keras_cv/models/object_detection_3d \ keras_cv/models/segmentation \ + keras_cv/models/feature_extractor/clip \ keras_cv/models/stable_diffusion fi \ No newline at end of file diff --git a/keras_cv/models/feature_extractor/clip/clip_image_model.py b/keras_cv/models/feature_extractor/clip/clip_image_model.py index 1718768116..895004bf57 100644 --- a/keras_cv/models/feature_extractor/clip/clip_image_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_image_model.py @@ -12,6 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. +import numpy as np + from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.models.feature_extractor.clip.clip_encoder import CLIPEncoder @@ -72,7 +74,7 @@ def call(self, x): patch_embeddings = ops.reshape( patch_embeddings, (batch_size, self.num_patches, -1) ) - class_embeds = ops.broadcast_to( + class_embeds = np.broadcast_to( self.class_embedding, (batch_size, 1, self.width) ) embeddings = ops.concatenate( From b5d64a0f8a7f27a9184e5d68d26baf6620eee246 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 26 Feb 2024 22:09:56 +0000 Subject: [PATCH 2/4] update to use ops --- keras_cv/models/feature_extractor/clip/clip_image_model.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_image_model.py b/keras_cv/models/feature_extractor/clip/clip_image_model.py index 895004bf57..69c1002f8e 100644 --- a/keras_cv/models/feature_extractor/clip/clip_image_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_image_model.py @@ -12,8 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import numpy as np - from keras_cv.backend import keras from keras_cv.backend import ops from keras_cv.models.feature_extractor.clip.clip_encoder import CLIPEncoder @@ -74,8 +72,8 @@ def call(self, x): patch_embeddings = ops.reshape( patch_embeddings, (batch_size, self.num_patches, -1) ) - class_embeds = np.broadcast_to( - self.class_embedding, (batch_size, 1, self.width) + class_embeds = ops.broadcast_to( + self.class_embedding.value, (batch_size, 1, self.width) ) embeddings = ops.concatenate( [class_embeds, patch_embeddings], axis=1 From d676c35c08ba3b329f183d9101ced7c52dd52fc1 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 11 Mar 2024 11:38:31 +0000 Subject: [PATCH 3/4] update model input format --- .../feature_extractor/clip/clip_model.py | 14 ++++- .../feature_extractor/clip/clip_model_test.py | 63 ++++++++++++------- 2 files changed, 55 insertions(+), 22 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_model.py b/keras_cv/models/feature_extractor/clip/clip_model.py index c3e6d49caf..0cd96643d7 100644 --- a/keras_cv/models/feature_extractor/clip/clip_model.py +++ b/keras_cv/models/feature_extractor/clip/clip_model.py @@ -95,6 +95,13 @@ def __init__( self.transformer_layers = transformer_layers vision_heads = self.vision_width // 64 + self.image_input = keras.layers.Input(shape=(None,), name="image") + self.text_input = keras.layers.Input( + shape=(None, None, self.context_length), name="text" + ) + self.attention_mask_input = keras.layers.Input( + shape=(None, None, self.context_length), name="attention_mask" + ) self.image_encoder = CLIPImageEncoder( input_resolution=self.image_resolution, patch_size=self.vision_patch_size, @@ -133,7 +140,12 @@ def encode_images(self, image): def encode_text(self, text, attention_mask=None): return self.text_encoder(text, attention_mask=attention_mask) - def call(self, image, text, attention_mask=None): + def call(self, inputs): + image, text = inputs["image"], inputs["text"] + if "attention_mask" in inputs: + attention_mask = inputs["attention_mask"] + else: + attention_mask = None self.image_embeddings = self.encode_images(image) self.text_embeddings = self.encode_text( text, attention_mask=attention_mask diff --git a/keras_cv/models/feature_extractor/clip/clip_model_test.py b/keras_cv/models/feature_extractor/clip/clip_model_test.py index 14304b73ef..d114668b2f 100644 --- a/keras_cv/models/feature_extractor/clip/clip_model_test.py +++ b/keras_cv/models/feature_extractor/clip/clip_model_test.py @@ -34,27 +34,24 @@ "https://storage.googleapis.com/keras-cv/models/clip/merges.txt", ) -MODEL_PATH = keras.utils.get_file( - None, - "https://storage.googleapis.com/keras-cv/models/clip/clip-vit-base-patch32.weights.h5", # noqa: E501 -) - class CLIPTest(TestCase): @pytest.mark.large def test_clip_model_golden_values(self): - model = CLIP() - model.load_weights(MODEL_PATH) + model = CLIP.from_preset("clip-vit-base-patch32") processed_image = np.ones(shape=[1, 224, 224, 3]) processed_text = np.ones(shape=[3, 77]) attention_mask = np.ones(shape=[3, 77]) image_logits, text_logits = model( - processed_image, processed_text, attention_mask + { + "image": processed_image, + "text": processed_text, + "attention_mask": attention_mask, + } ) - print(image_logits) - self.assertAllClose(image_logits, [[1.896713, 1.896713, 1.896713]]) + self.assertAllClose(image_logits, [[-0.694048, -0.694048, -0.694048]]) self.assertAllClose( - text_logits, ops.transpose([[1.896713, 1.896713, 1.896713]]) + text_logits, 
ops.transpose([[-0.694048, -0.694048, -0.694048]]) ) def test_clip_preprocessor(self): @@ -83,20 +80,29 @@ def test_presets(self): processed_text = np.ones(shape=[3, 77]) attention_mask = np.ones(shape=[3, 77]) image_logits, text_logits = model( - processed_image, processed_text, attention_mask + { + "image": processed_image, + "text": processed_text, + "attention_mask": attention_mask, + } ) @pytest.mark.large def test_image_encoder_golden_values(self): - model = CLIP() - model.load_weights(MODEL_PATH) + model = CLIP.from_preset("clip-vit-base-patch32") processed_image = np.ones(shape=[1, 224, 224, 3]) processed_text = np.ones(shape=[3, 77]) attention_mask = np.ones(shape=[3, 77]) - model(processed_image, processed_text, attention_mask) + model( + { + "image": processed_image, + "text": processed_text, + "attention_mask": attention_mask, + } + ) self.assertAllClose( model.image_embeddings[:, :5], - [[0.023215, 0.026526, 0.008914, -0.091689, 0.021791]], + [[-0.031356, -0.036849, 0.015929, -0.004443, 0.095277]], ) @pytest.mark.large @@ -105,11 +111,16 @@ def test_text_encoder_golden_values(self): processed_image = np.ones(shape=[1, 224, 224, 3]) processed_text = np.ones(shape=[3, 77]) attention_mask = np.ones(shape=[3, 77]) - model(processed_image, processed_text, attention_mask) - print(model.text_embeddings) + model( + { + "image": processed_image, + "text": processed_text, + "attention_mask": attention_mask, + } + ) self.assertAllClose( model.text_embeddings[0, :3], - [0.007531, -0.038361, -0.035686], + [0.01866, 0.004538, -0.018127], ) @pytest.mark.large # Saving is slow, so mark these large. @@ -118,7 +129,13 @@ def test_saved_model(self): processed_image = np.ones(shape=[1, 224, 224, 3]) processed_text = np.ones(shape=[3, 77]) attention_mask = np.ones(shape=[3, 77]) - model_output, _ = model(processed_image, processed_text, attention_mask) + model_output, _ = model( + { + "image": processed_image, + "text": processed_text, + "attention_mask": attention_mask, + } + ) save_path = os.path.join(self.get_temp_dir(), "model.keras") if keras_3(): model.save(save_path) @@ -130,6 +147,10 @@ def test_saved_model(self): self.assertIsInstance(restored_model, CLIP) # Check that output matches. 
restored_output, _ = restored_model( - processed_image, processed_text, attention_mask + { + "image": processed_image, + "text": processed_text, + "attention_mask": attention_mask, + } ) self.assertAllClose(model_output, restored_output) From 52f3cc5a7f12cdb2bcc0139fac7108882a6a5b9b Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 11 Mar 2024 17:04:21 +0000 Subject: [PATCH 4/4] update golden values --- keras_cv/models/feature_extractor/clip/clip_model_test.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/keras_cv/models/feature_extractor/clip/clip_model_test.py b/keras_cv/models/feature_extractor/clip/clip_model_test.py index d114668b2f..1a657d540c 100644 --- a/keras_cv/models/feature_extractor/clip/clip_model_test.py +++ b/keras_cv/models/feature_extractor/clip/clip_model_test.py @@ -49,9 +49,9 @@ def test_clip_model_golden_values(self): "attention_mask": attention_mask, } ) - self.assertAllClose(image_logits, [[-0.694048, -0.694048, -0.694048]]) + self.assertAllClose(image_logits, [[1.896712, 1.896712, 1.896712]]) self.assertAllClose( - text_logits, ops.transpose([[-0.694048, -0.694048, -0.694048]]) + text_logits, ops.transpose([[1.896712, 1.896712, 1.896712]]) ) def test_clip_preprocessor(self): @@ -102,7 +102,7 @@ def test_image_encoder_golden_values(self): ) self.assertAllClose( model.image_embeddings[:, :5], - [[-0.031356, -0.036849, 0.015929, -0.004443, 0.095277]], + [[0.023215, 0.026526, 0.008914, -0.091689, 0.021791]], ) @pytest.mark.large @@ -120,7 +120,7 @@ def test_text_encoder_golden_values(self): ) self.assertAllClose( model.text_embeddings[0, :3], - [0.01866, 0.004538, -0.018127], + [0.007531, -0.038361, -0.035686], ) @pytest.mark.large # Saving is slow, so mark these large.
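
For reference, below is a minimal usage sketch of the dict-based call signature introduced in patch 3/4. The preset name, input shapes, and output unpacking are taken from the updated tests above; the `keras_cv.models.CLIP` import path is an assumption, and the all-ones arrays are placeholders rather than real preprocessed data.

```python
# Minimal usage sketch of the dict-based CLIP call signature from patch 3/4.
# Assumption: CLIP is importable from keras_cv.models; preset name and input
# shapes mirror the tests in clip_model_test.py.
import numpy as np

from keras_cv.models import CLIP  # import path is an assumption

model = CLIP.from_preset("clip-vit-base-patch32")

processed_image = np.ones(shape=[1, 224, 224, 3])  # preprocessed image batch
processed_text = np.ones(shape=[3, 77])            # tokenized text, context length 77
attention_mask = np.ones(shape=[3, 77])            # optional; omit to default to None

# After this PR, the model takes a single dict instead of positional arguments.
image_logits, text_logits = model(
    {
        "image": processed_image,
        "text": processed_text,
        "attention_mask": attention_mask,
    }
)
```

Packing the inputs into one named dict matches the `keras.layers.Input` layers named "image", "text", and "attention_mask" that patch 3/4 adds to the model, which is presumably why the positional `call(image, text, attention_mask)` signature was replaced.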