add preset file
Divyashree Sreepathihalli committed Feb 2, 2024
1 parent 209e5da commit 91e6ea9
Showing 1 changed file with 68 additions and 0 deletions.
keras_cv/models/feature_extractor/clip/clip_presets.py
@@ -0,0 +1,68 @@
"""CLIP presets."""

# NOTE: the `kaggle_handle` values below appear to be placeholders; they
# currently point at a YOLOv8 preset or are left empty.
clip_presets = {
    "clip-vit-base-patch16": {
        "metadata": {
            "description": (
                "The model uses a ViT-B/16 Transformer architecture as an "
                "image encoder and uses a masked self-attention Transformer as "
                "a text encoder. These encoders are trained to maximize the "
                "similarity of (image, text) pairs via a contrastive loss. The "
                "model uses a patch size of 16 and input images of size (224, "
                "224)"
            ),
            "params": 149620737,
            "official_name": "CLIP",
            "path": "clip",
        },
        "kaggle_handle": "kaggle://keras/yolov8/keras/yolo_v8_m_pascalvoc/2",
    },
"clip-vit-base-patch32": {
"metadata": {
"description": (
"The model uses a ViT-B/32 Transformer architecture as an "
"image encoder and uses a masked self-attention Transformer as "
"a text encoder. These encoders are trained to maximize the "
"similarity of (image, text) pairs via a contrastive loss.The "
"model uses a patch size of 32 and input images of size (224, "
"224)"
),
"params": 151277313,
"official_name": "CLIP",
"path": "clip",
},
"kaggle_handle": "kaggle://keras/yolov8/keras/yolo_v8_m_pascalvoc/2",
},
"clip-vit-large-patch14": {
"metadata": {
"description": (
"The model uses a ViT-L/14 Transformer architecture as an "
"image encoder and uses a masked self-attention Transformer as "
"a text encoder. These encoders are trained to maximize the "
"similarity of (image, text) pairs via a contrastive loss.The "
"model uses a patch size of 14 and input images of size (224, "
"224)"
),
"params": 427616513,
"official_name": "CLIP",
"path": "clip",
},
"kaggle_handle": "kaggle://keras/yolov8/keras/yolo_v8_m_pascalvoc/2",
},
"clip-vit-large-patch14-336": {
"metadata": {
"description": (
"The model uses a ViT-L/14 Transformer architecture as an "
"image encoder and uses a masked self-attention Transformer as "
"a text encoder. These encoders are trained to maximize the "
"similarity of (image, text) pairs via a contrastive loss.The "
"model uses a patch size of 14 and input images of size (336, "
"336)"
),
"params": 427944193,
"official_name": "CLIP",
"path": "clip",
},
"kaggle_handle": "",
},
}
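
For reference, the presets above are plain nested dictionaries, so their metadata can be inspected without loading any weights. The sketch below assumes only the clip_presets dict defined in this file (imported by its path, keras_cv.models.feature_extractor.clip.clip_presets, assuming the surrounding packages are importable) and no loader API; actually instantiating a model from a preset would go through a separate constructor.

    # Minimal sketch: list each CLIP preset with its parameter count and
    # Kaggle handle, using only the registry defined in this file.
    from keras_cv.models.feature_extractor.clip.clip_presets import clip_presets

    for name, preset in clip_presets.items():
        metadata = preset["metadata"]
        handle = preset["kaggle_handle"] or "<not set>"
        print(f"{name}: {metadata['params']:,} params, kaggle_handle={handle}")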
