Skip to content

Commit

Permalink
Merge pull request #599 from artemisp/main
Browse files Browse the repository at this point in the history
X-InstructBLIP Code
  • Loading branch information
henryhungle authored Dec 12, 2023
2 parents 7f00a08 + 018b106 commit ac8fc98
Show file tree
Hide file tree
Showing 560 changed files with 73,960 additions and 182 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -152,5 +152,8 @@ debug*/
*.dat
*.tsv
*.gz
*.csv
*.p
*.pdf

cache/
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,9 @@
# LAVIS - A Library for Language-Vision Intelligence

## What's New: 🎉
* [Model Release] November 2023, released implementation of **X-InstructBLIP** <br>
[Paper](https://arxiv.org/pdf/2311.18799.pdf), [Project Page](https://github.com/salesforce/LAVIS/tree/main/projects/xinstructblip), [Website](https://artemisp.github.io/X-InstructBLIP-page/), [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/salesforce/LAVIS/blob/main/projects/xinstructblip/demo/run_demo.ipynb)
> A simple, yet effective, cross-modality framework built atop frozen LLMs that allows the integration of various modalities (image, video, audio, 3D) without extensive modality-specific customization.
* [Model Release] July 2023, released implementation of **BLIP-Diffusion** <br>
[Paper](https://arxiv.org/abs/2305.14720), [Project Page](https://github.com/salesforce/LAVIS/tree/main/projects/blip-diffusion), [Website](https://dxli94.github.io/BLIP-Diffusion-website/)
> A text-to-image generation model that trains 20x faster than DreamBooth. Also facilitates zero-shot subject-driven generation and editing.
Expand Down
Binary file removed assets/LAVIS_technical_report.pdf
Binary file not shown.
15 changes: 14 additions & 1 deletion lavis/common/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
Copyright (c) 2022, salesforce.com, inc.
Copyright (c) 2023, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
Expand Down Expand Up @@ -440,3 +440,16 @@ def get_file_size(filename):
"""
size_in_mb = os.path.getsize(filename) / float(1024**2)
return size_in_mb

def is_serializable(value):
    """
    Check whether the provided value can be serialized into a JSON string.

    Args:
        value: Any Python object.

    Returns:
        bool: True if ``json.dumps(value)`` succeeds, False if it raises one
        of the serialization errors handled below.
    """
    try:
        json.dumps(value)
    except (TypeError, OverflowError, ValueError, RecursionError):
        # TypeError: unsupported type (e.g. set, arbitrary object).
        # ValueError: circular reference detected by the encoder.
        # RecursionError: nesting too deep to encode.
        return False
    return True

def is_convertible_to_int(value):
    """
    Report whether ``str(value)`` is an optionally negative run of digits.

    Args:
        value: Any object; it is converted with ``str`` before matching.

    Returns:
        bool: True when the text matches ``^-?\\d+$``, False otherwise.
    """
    text = str(value)
    found = re.match(r'^-?\d+$', text)
    return found is not None
52 changes: 52 additions & 0 deletions lavis/configs/datasets/aokvqa/defaults_instruct.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  aok_vqa_instruct:
    # data_dir: ${env.data_dir}/datasets
    data_type: images  # [images|videos|features]

    # Image processors for the train and eval splits (same image_size).
    vis_processor:
      train:
        name: clip_image_train
        image_size: 224
      eval:
        name: clip_image_eval
        image_size: 224

    # Text processors: instruction-style for train, question-style for eval.
    text_processor:
      train:
        name: blip_instruction
        modality: image
        task: qa
      eval:
        name: blip_question

    build_info:
      # Split names (train/val/test) must stay mapping keys: do not prefix
      # them with a minus sign (-), which would turn them into list items.
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_train.json
          storage:
            - aokvqa/annotations/aokvqa_v1p0_train.json
        # val:
        #   url:
        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_val.json
        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
        #   storage:
        #     - aokvqa/annotations/aokvqa_v1p0_val.json
        #     - aokvqa/annotations/specialized_vocab_train_lavis.json
        #     # - aokvqa/annotations/large_vocab_train_lavis.json
        # test:
        #   url:
        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/aokvqa_v1p0_test.json
        #     - https://storage.googleapis.com/sfr-vision-language-research/LAVIS/datasets/aokvqa/specialized_vocab_train.json
        #   storage:
        #     - aokvqa/annotations/aokvqa_v1p0_test.json
        #     - aokvqa/annotations/specialized_vocab_train_lavis.json
      images:
        # storage: /coco/images
        storage: /export/share/datasets/vision/coco/images
49 changes: 49 additions & 0 deletions lavis/configs/datasets/audiocaps/defaults_mm_cap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  audiocaps_mm_caption:  # name of the dataset builder
    # Audio processor settings, identical for train and eval.
    audio_processor:
      train:
        name: beats_audio
        sampling_rate: 16000
      eval:
        name: beats_audio
        sampling_rate: 16000

    # Text processors: instruction-style for train, caption-style for eval.
    text_processor:
      train:
        name: blip_instruction
        modality: audio
        task: caption
      eval:
        name: blip_caption

    data_type: [audio]

    build_info:
      kwargs:
        # IDs are quoted so they are always read as strings.
        missing_ids:
          - '2sh7ZkazyO8'
          - '966jA2-z0mQ'
          - '52RlolYyjAE'
          - 'HVAc9hm4jjk'
          - '8lPjqvYWNyM'
          - 'eXgPnnE3TuQ'
      annotations:
        train:
          url:
            - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/train.csv
          storage:
            - audiocaps/annotations/train.csv

        val:
          url:
            - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/val.csv
          storage:
            - audiocaps/annotations/val.csv

        test:
          url:
            - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/test.csv
          storage:
            - audiocaps/annotations/test.csv

      audio:
        storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio
52 changes: 52 additions & 0 deletions lavis/configs/datasets/audiocaps/defaults_mm_cap_instruct.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  audiocaps_mm_caption_instruct:  # name of the dataset builder
    # Audio processor settings, identical for train and eval.
    audio_processor:
      train:
        name: beats_audio
        sampling_rate: 16000
      eval:
        name: beats_audio
        sampling_rate: 16000

    # Text processors: instruction-style for train, caption-style for eval.
    text_processor:
      train:
        name: "blip_instruction"
        modality: audio
        task: caption
      eval:
        name: "blip_caption"

    data_type: [audio]

    # NOTE(review): the other audiocaps configs declare missing_ids under
    # build_info.kwargs; confirm the builder also reads it at this level.
    missing_ids: [2sh7ZkazyO8, 966jA2-z0mQ, 52RlolYyjAE, HVAc9hm4jjk, 8lPjqvYWNyM, eXgPnnE3TuQ]

    build_info:
      kwargs:
        cached: false
        cached_dir: /export/einstein-vision/audio_datasets/audiocaps/beats_features
      annotations:
        train:
          url:
            - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/train.csv
          storage:
            - audiocaps/annotations/train.csv

        # val:
        #   url:
        #     - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/val.csv
        #   storage:
        #     - audiocaps/annotations/val.csv

        # test:
        #   url:
        #     - https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/test.csv
        #   storage:
        #     - /export/einstein-vision/audio_datasets/audiocaps/dataset/test.csv

      audio:
        storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio
51 changes: 51 additions & 0 deletions lavis/configs/datasets/audiocaps/defaults_mm_qa.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  audiocaps_mm_qa:  # name of the dataset builder
    # Audio processor settings; only the eval processor sets is_eval.
    audio_processor:
      train:
        name: beats_audio
        sampling_rate: 16000
      eval:
        name: beats_audio
        sampling_rate: 16000
        is_eval: true

    # Text processors: instruction-style QA for train, question-style for eval.
    text_processor:
      train:
        name: "blip_instruction"
        modality: audio
        task: qa
      eval:
        name: "blip_question"

    data_type: [audio]

    build_info:
      kwargs:
        cached: false
        # add_binary: true
        cached_dir: /export/einstein-vision/audio_datasets/audiocaps/beats_features
        missing_ids: [2sh7ZkazyO8, 966jA2-z0mQ, 52RlolYyjAE, HVAc9hm4jjk, 8lPjqvYWNyM, eXgPnnE3TuQ]
      annotations:
        train:
          url:
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/audiocaps/audio_qa_final_train.csv
            # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/audio_qa_data/audio_qa_final_train.csv
          storage:
            - audiocaps_qa/annotations/train.csv
            # - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/audio_qa_data/audio_qa_final_train.csv

        # val:
        #   url:
        #     # - https://storage.googleapis.com/sfr-xinstructblip-data-research/data/audiocaps/audio_qa_final_val.csv
        #     - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/audio_qa_data/audio_qa_final_val.csv
        #   storage:
        #     # - audiocaps_qa/annotations/val.csv
        #     - /export/home/LAVIS-xgen_mm/projects/xinstructblip/data_aug/audio_qa_data/audio_qa_final_val.csv

      audio:
        storage: /export/einstein-vision/audio_datasets/audiocaps/AUDIOCAPS_32000Hz/audio
47 changes: 47 additions & 0 deletions lavis/configs/datasets/audioset/defaults_mm_cap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  audioset_mm_caption:  # 14141
    audio_processor:
      train:
        name: beats_audio
        sampling_rate: 16000
      eval:
        name: beats_audio
        sampling_rate: 16000
        # NOTE(review): is_eval is false on the eval processor; confirm this
        # is intentional.
        is_eval: false

    # Text processors: instruction-style for train, caption-style for eval.
    text_processor:
      train:
        name: blip_instruction
        modality: audio
        task: classification
      eval:
        name: blip_caption

    data_type: [audio]

    build_info:
      annotations:
        train:
          # Each url entry pairs by position with the storage entry below.
          url:
            # NOTE(review): the path contains "data//audioset" (double
            # slash); verify the object exists under that exact name.
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data//audioset/balanced_train_clean.csv
            # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv
            # Class-label index is fetched over https rather than http.
            - https://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
          storage:
            - audioset/balanced_train_clean.csv
            # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv
            - audioset/annotations/class_labels_indices.csv

        # val:
        #   url:
        #     - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
        #     - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
        #   storage:
        #     - audioset/annotations/eval_segments.csv
        #     - audioset/annotations/class_labels_indices.csv
      audio:
        storage: /export/einstein-vision/audio_datasets/AudioSet/all_audio
48 changes: 48 additions & 0 deletions lavis/configs/datasets/audioset/defaults_mm_cap_instruct.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Copyright (c) 2023, salesforce.com, inc.
# All rights reserved.
# SPDX-License-Identifier: BSD-3-Clause
# For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause

datasets:
  audioset_mm_caption_instruct:  # 14141
    audio_processor:
      train:
        name: beats_audio
        sampling_rate: 16000
      eval:
        name: beats_audio
        sampling_rate: 16000
        # NOTE(review): is_eval is false on the eval processor; confirm this
        # is intentional.
        is_eval: false

    # Text processors: instruction-style for train, caption-style for eval.
    text_processor:
      train:
        name: blip_instruction
        modality: audio
        task: classification
      eval:
        name: blip_caption

    data_type: [audio]

    build_info:
      annotations:
        train:
          # Each url entry pairs by position with the storage entry below.
          url:
            # Use the hosted CSV; the developer-local path is kept only as a
            # commented-out alternative.
            # NOTE(review): the path contains "data//audioset" (double
            # slash); verify the object exists under that exact name.
            - https://storage.googleapis.com/sfr-xinstructblip-data-research/data//audioset/balanced_train_clean.csv
            # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv
            # Class-label index is fetched over https rather than http.
            - https://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
          storage:
            - audioset/annotations/balanced_train_clean.csv
            # - /export/home/LAVIS-xgen_mm/lavis/configs/datasets/audioset/balanced_train_clean.csv
            - audioset/annotations/class_labels_indices.csv

        # val:
        #   url:
        #     - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv
        #     - http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv
        #   storage:
        #     - audioset/annotations/eval_segments.csv
        #     - audioset/annotations/class_labels_indices.csv

      audio:
        storage: /export/einstein-vision/audio_datasets/AudioSet/all_audio
Loading

0 comments on commit ac8fc98

Please sign in to comment.