diff --git a/keras_cv/models/feature_extractor/clip/clip_presets.py b/keras_cv/models/feature_extractor/clip/clip_presets.py
new file mode 100644
index 0000000000..9f6dda87aa
--- /dev/null
+++ b/keras_cv/models/feature_extractor/clip/clip_presets.py
@@ -0,0 +1,68 @@
+"""CLIP presets."""
+
+clip_presets = {
+    "clip-vit-base-patch16": {
+        "metadata": {
+            "description": (
+                "The model uses a ViT-B/16 Transformer architecture as an "
+                "image encoder and uses a masked self-attention Transformer as "
+                "a text encoder. These encoders are trained to maximize the "
+                "similarity of (image, text) pairs via a contrastive loss. The "
+                "model uses a patch size of 16 and input images of size (224, "
+                "224)"
+            ),
+            "params": 149620737,
+            "official_name": "CLIP",
+            "path": "clip",
+        },
+        "kaggle_handle": "kaggle://keras/yolov8/keras/yolo_v8_m_pascalvoc/2",
+    },
+    "clip-vit-base-patch32": {
+        "metadata": {
+            "description": (
+                "The model uses a ViT-B/32 Transformer architecture as an "
+                "image encoder and uses a masked self-attention Transformer as "
+                "a text encoder. These encoders are trained to maximize the "
+                "similarity of (image, text) pairs via a contrastive loss. The "
+                "model uses a patch size of 32 and input images of size (224, "
+                "224)"
+            ),
+            "params": 151277313,
+            "official_name": "CLIP",
+            "path": "clip",
+        },
+        "kaggle_handle": "kaggle://keras/yolov8/keras/yolo_v8_m_pascalvoc/2",
+    },
+    "clip-vit-large-patch14": {
+        "metadata": {
+            "description": (
+                "The model uses a ViT-L/14 Transformer architecture as an "
+                "image encoder and uses a masked self-attention Transformer as "
+                "a text encoder. These encoders are trained to maximize the "
+                "similarity of (image, text) pairs via a contrastive loss. The "
+                "model uses a patch size of 14 and input images of size (224, "
+                "224)"
+            ),
+            "params": 427616513,
+            "official_name": "CLIP",
+            "path": "clip",
+        },
+        "kaggle_handle": "kaggle://keras/yolov8/keras/yolo_v8_m_pascalvoc/2",
+    },
+    "clip-vit-large-patch14-336": {
+        "metadata": {
+            "description": (
+                "The model uses a ViT-L/14 Transformer architecture as an "
+                "image encoder and uses a masked self-attention Transformer as "
+                "a text encoder. These encoders are trained to maximize the "
+                "similarity of (image, text) pairs via a contrastive loss. The "
+                "model uses a patch size of 14 and input images of size (336, "
+                "336)"
+            ),
+            "params": 427944193,
+            "official_name": "CLIP",
+            "path": "clip",
+        },
+        "kaggle_handle": "",
+    },
+}
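For reference, here is a minimal sketch of how the registry added by this diff could be inspected, assuming `keras_cv` is installed with the file at the path the diff creates; the loop and the printed formatting below are illustrative only and not part of the PR:

```python
# Sketch: enumerate the presets defined in clip_presets.py above.
# Assumes the module is importable at the path added by this diff.
from keras_cv.models.feature_extractor.clip.clip_presets import clip_presets

for name, preset in clip_presets.items():
    meta = preset["metadata"]
    print(
        f"{name}: {meta['params']:,} params "
        f"({meta['official_name']}, path={meta['path']}) "
        f"-> {preset['kaggle_handle'] or '<no kaggle handle yet>'}"
    )
```

Note that three of the four `kaggle_handle` values currently point at a YOLOV8 PascalVOC checkpoint and the fourth is empty, so they read as placeholders to be replaced with the actual CLIP weight handles before these presets are wired into the usual `from_preset` flow.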