diff --git a/camel/prompts/__init__.py b/camel/prompts/__init__.py
index 49f9d6f35..0eef251b4 100644
--- a/camel/prompts/__init__.py
+++ b/camel/prompts/__init__.py
@@ -21,10 +21,12 @@
 from .role_description_prompt_template import RoleDescriptionPromptTemplateDict
 from .task_prompt_template import TaskPromptTemplateDict
 from .prompt_templates import PromptTemplateGenerator
+from .multimodal import MultiModalPrompt

 __all__ = [
     'TextPrompt',
     'CodePrompt',
+    'MultiModalPrompt',
     'TextPromptDict',
     'AISocietyPromptTemplateDict',
     'CodePromptTemplateDict',
diff --git a/camel/prompts/multimodal.py b/camel/prompts/multimodal.py
new file mode 100644
index 000000000..c14f67704
--- /dev/null
+++ b/camel/prompts/multimodal.py
@@ -0,0 +1,90 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+from typing import Any, Callable, Dict, List, Union
+
+from camel.prompts import TextPrompt
+
+MODALITIES = ["CAMEL_IMAGE"]
+
+
+def default_to_model_format(text_prompt, modalities_dict: Dict) -> Dict:
+    r"""The default conversion: returns the text and the multimodal
+    information in a plain dict. Pass a different callable to
+    ``MultiModalPrompt.to_model_format`` to produce a model-specific
+    format.
+
+    Returns:
+        dict: The input format that the multimodal model can understand.
+    """
+    return {"text": text_prompt, "multimodal_information": modalities_dict}
+
+
+class MultiModalPrompt:
+    r"""A prompt that enables information transfer between multimodal
+    agents. It pairs a text prompt with the accompanying multimodal
+    information.
+    """
+
+    def __init__(self, text_prompt: TextPrompt,
+                 modalities: Union[List, Dict]):
+        r"""Initializes the multimodal prompt.
+
+        Args:
+            text_prompt (TextPrompt): The text prompt.
+            modalities (Union[List, Dict]): The supported modalities list
+                or a modality information dict.
+        """
+        # Check that every requested modality is supported.
+        for modality in modalities:
+            assert modality in MODALITIES, \
+                f"modality {modality} not supported."
+
+        self.text_prompt = text_prompt
+        self.modalities = modalities
+
+    def format(self, *args: Any, **kwargs: Any) -> 'MultiModalPrompt':
+        r"""Formats the text prompt and the multimodal information at the
+        same time. Keyword arguments that name one of this prompt's
+        modalities are popped into the multimodal information; the rest
+        are applied to the text prompt.
+
+        Args:
+            *args (Any): Variable length argument list.
+            **kwargs (Any): Arbitrary keyword arguments.
+
+        Returns:
+            MultiModalPrompt: The formatted multimodal prompt.
+        """
+        # Pop the kwargs that carry modality data.
+        multimodal_info = {}
+        for modality in self.modalities:
+            multimodal_info[modality] = kwargs.pop(modality)
+
+        text_prompt = self.text_prompt.format(*args, **kwargs)
+        return MultiModalPrompt(text_prompt, multimodal_info)
+
+    def to_model_format(
+            self,
+            method: Callable = default_to_model_format) -> Any:
+        r"""Converts the prompt to the input format that the multimodal
+        model can understand.
+        Different multimodal models have different input formats. The
+        default returns the text and the multimodal information in a
+        dict; pass a custom ``method`` to produce a model-specific
+        format.
+
+        Returns:
+            dict: The input format that the multimodal model can
+                understand.
+        """
+        return method(self.text_prompt, self.modalities)
+
+
+# TODO: MultiModalPromptDict
diff --git a/docs/get_started/multimodal_prompt.md b/docs/get_started/multimodal_prompt.md
new file mode 100644
index 000000000..2d1984728
--- /dev/null
+++ b/docs/get_started/multimodal_prompt.md
@@ -0,0 +1,159 @@
+# Introduction to the `MultiModalPrompt` Class
+
+## Overview
+
+The `MultiModalPrompt` class streamlines the creation of integrated prompts for multimodal agents. By bringing together text and other modalities, it establishes a unified structure for communication.
+
+## Supported Modalities
+
+Currently, the class recognizes the following modality:
+- `CAMEL_IMAGE`
+
+## Initialization
+
+To initialize a `MultiModalPrompt` instance:
+
+```python
+from camel.prompts import MultiModalPrompt, TextPrompt
+
+vqa_prompt = MultiModalPrompt(
+    text_prompt=TextPrompt("Please answer the following question related to the provided image:\nQuestion: {Question}"),
+    modalities=["CAMEL_IMAGE"]
+)
+```
+
+**Arguments**:
+- `text_prompt` (TextPrompt): The text-based template. It dictates the format of the text segment of the prompt.
+- `modalities` (Union[List, Dict]): Either a list of modality names or a dictionary pairing modality names with their respective data. If the input is a dictionary, it should follow the pattern `{Modality Name: Modality Data}`.
+
+## Methods
+
+### `format(*args, **kwargs) -> 'MultiModalPrompt'`
+
+This method formats both the text and the multimodal components at the same time. The output is a new `MultiModalPrompt` instance.
+
+```python
+vqa_prompt = MultiModalPrompt(
+    text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
+    modalities=["CAMEL_IMAGE"]
+)
+
+question = "What animal is in the picture?"
+image_path = "examples/multimodal/camel.jpg"
+
+vqa_prompt = vqa_prompt.format(Question=question, CAMEL_IMAGE=image_path)
+# The prompt is now a MultiModalPrompt instance carrying all of the
+# multimodal information; calling to_model_format on it generates the
+# model input.
+```
+
+### `to_model_format(method=default_to_model_format) -> Any`
+
+Transforms the prompt into a format understood by the multimodal model.
+
+By default, this method returns the prompt as a dictionary. By passing a different `method` callable, the output can be adapted to the requirements of various multimodal models.
+
+```python
+def default_to_model_format(text_prompt, modalities_dict: Dict) -> Dict:
+    r"""The default conversion: returns the text and the multimodal
+    information in a plain dict.
+
+    Returns:
+        dict: The input format that the multimodal model can understand.
+    """
+    return {"text": text_prompt, "multimodal_information": modalities_dict}
+
+vqa_prompt = MultiModalPrompt(
+    text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
+    modalities=["CAMEL_IMAGE"]
+)
+
+question = "What animal is in the picture?"
+image_path = "examples/multimodal/camel.jpg"
+
+vqa_prompt = vqa_prompt.format(Question=question, CAMEL_IMAGE=image_path)
+
+print(vqa_prompt.to_model_format(default_to_model_format))
+# {'text': 'Please answer the following question about the given image:\nQuestion: What animal is in the picture?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
+```
+
+## Usage Examples
+
+A runnable version of the examples below is provided in `examples/multimodal/formating_example.py`.
+
+### 1. Single-Image VQA (Visual Question Answering)
+
+This example illustrates how to generate prompts for a Visual Question Answering task associated with a single image.
+
+```python
+from camel.prompts import MultiModalPrompt, TextPrompt
+
+# Create a VQA prompt template
+vqa_prompt = MultiModalPrompt(
+    text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
+    modalities=["CAMEL_IMAGE"]
+)
+
+# Define questions and their respective image paths
+question1 = "What animal is in the picture?"
+question2 = "What is the color of the animal?"
+image1_path = "examples/multimodal/camel.jpg"
+image2_path = "examples/multimodal/llama.jpg"
+
+# Format and display the prompts
+vqa_prompt1 = vqa_prompt.format(Question=question1, CAMEL_IMAGE=image1_path)
+vqa_prompt2 = vqa_prompt.format(Question=question2, CAMEL_IMAGE=image2_path)
+
+print("vqa_prompt1:", vqa_prompt1.to_model_format())
+# {'text': 'Please answer the following question about the given image:\nQuestion: What animal is in the picture?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
+print("vqa_prompt2:", vqa_prompt2.to_model_format())
+# {'text': 'Please answer the following question about the given image:\nQuestion: What is the color of the animal?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/llama.jpg'}}
+```
+
+### 2. Multi-Image Question with a Custom Model Input Format
+
+This showcases the creation of a prompt involving multiple images for a single question, and illustrates how to adapt the prompt to a model-specific format.
+
+```python
+from typing import List
+
+def multi_image_input_format(text_prompt, modalities_dict):
+    """
+    Label each image in the prompt with its index.
+    """
+    if not isinstance(modalities_dict["CAMEL_IMAGE"], List):
+        modalities_dict["CAMEL_IMAGE"] = [modalities_dict["CAMEL_IMAGE"]]
+
+    for i, image in enumerate(modalities_dict["CAMEL_IMAGE"]):
+        text_prompt = f"Image {i} is [Image{i}]\n" + text_prompt
+
+    return {"prompt": text_prompt, "image": modalities_dict["CAMEL_IMAGE"]}
+
+# Define the multi-image question and format the prompt
+question3 = "Are the animals from the two images the same?"
+multi_image_prompt = vqa_prompt.format(Question=question3, CAMEL_IMAGE=[image1_path, image2_path])
+
+# Display the multi-image prompt and its corresponding images
+model_input = multi_image_prompt.to_model_format(multi_image_input_format)
+
+print("Prompt:", model_input["prompt"])
+'''
+Image 1 is [Image1]
+Image 0 is [Image0]
+Please answer the following question about the given image:
+Question: Are the animals from the two images the same?
+'''
+
+print("Images:", model_input["image"])
+'''
+['examples/multimodal/camel.jpg', 'examples/multimodal/llama.jpg']
+'''
+```
+
+## Application with Different Multimodal Models
+
+### LLaVA-1.5
+
+- TODO: add examples of how to use the multimodal prompt with different multimodal models on simple VL tasks; a minimal sketch follows below.
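+
+Until those examples land, the sketch below shows one way a custom converter could target a LLaVA-style chat model. It is a minimal illustration, not a verified integration: the `llava_style_format` name is ours, and it assumes the model expects a single conversation string in which an `<image>` placeholder token marks the image position. Check the exact template and token against the model release you are using.
+
+```python
+from typing import Dict
+
+def llava_style_format(text_prompt, modalities_dict: Dict) -> Dict:
+    # Hypothetical converter for a LLaVA-style chat template: wrap the
+    # text in a USER/ASSISTANT turn and mark the image position with an
+    # `<image>` token. The image itself is passed alongside as a path.
+    prompt = f"USER: <image>\n{text_prompt} ASSISTANT:"
+    return {"prompt": prompt, "image": modalities_dict["CAMEL_IMAGE"]}
+
+# Reusing vqa_prompt1 from the single-image example above.
+llava_input = vqa_prompt1.to_model_format(llava_style_format)
+print(llava_input["prompt"])
+# USER: <image>
+# Please answer the following question about the given image:
+# Question: What animal is in the picture? ASSISTANT:
+```
+
+Because `to_model_format` takes the converter as an argument, supporting a new model is a matter of writing one such function; the prompt object itself stays model-agnostic.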
diff --git a/docs/index.rst b/docs/index.rst
index 88a88f92e..0f21bd206 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -19,6 +19,7 @@ Welcome to CAMEL's documentation!
    get_started/text_prompt.md
    get_started/code_prompt.md
    get_started/messages.md
+   get_started/multimodal_prompt.md

 .. toctree::
    :maxdepth: 1
diff --git a/examples/multimodal/camel.jpg b/examples/multimodal/camel.jpg
new file mode 100644
index 000000000..aa5e5ec15
Binary files /dev/null and b/examples/multimodal/camel.jpg differ
diff --git a/examples/multimodal/formating_example.py b/examples/multimodal/formating_example.py
new file mode 100644
index 000000000..195a859ec
--- /dev/null
+++ b/examples/multimodal/formating_example.py
@@ -0,0 +1,72 @@
+from typing import List
+
+from camel.prompts import MultiModalPrompt, TextPrompt
+
+if __name__ == "__main__":
+    # Example prompt for simple one-image VQA, using the default model
+    # input format.
+    vqa_prompt = MultiModalPrompt(
+        text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
+        modalities=["CAMEL_IMAGE"])
+
+    print("Example prompt for simple one-image VQA, using the default model input format:")
+
+    question1 = "What animal is in the picture?"
+    question2 = "What is the color of the animal?"
+
+    image1_path = "examples/multimodal/camel.jpg"
+    image2_path = "examples/multimodal/llama.jpg"
+
+    vqa_prompt1 = vqa_prompt.format(
+        Question=question1,
+        CAMEL_IMAGE=image1_path)
+    vqa_prompt2 = vqa_prompt.format(
+        Question=question2,
+        CAMEL_IMAGE=image2_path)
+
+    print("vqa_prompt1:")
+    print(vqa_prompt1.to_model_format())
+
+    print("vqa_prompt2:")
+    print(vqa_prompt2.to_model_format())
+
+    print("-" * 100)
+
+    # Example prompt for a multi-image question, with a custom model
+    # input format.
+    def multi_image_input_format(text_prompt, modalities_dict):
+        r"""Labels each image at the front of the text prompt with its
+        index. The multi-image indexing format follows MMICL: Empowering
+        Vision-language Model with Multi-Modal In-Context Learning;
+        [Image{i}] in the prompt would be replaced by the visual prompt
+        for the i-th image.
+
+        Returns:
+            dict: The input format that the multimodal model can
+                understand.
+        """
+        if not isinstance(modalities_dict["CAMEL_IMAGE"], List):
+            modalities_dict["CAMEL_IMAGE"] = [modalities_dict["CAMEL_IMAGE"]]
+
+        for i, image in enumerate(modalities_dict["CAMEL_IMAGE"]):
+            text_prompt = f"Image {i} is [Image{i}]\n" + text_prompt
+
+        return {"prompt": text_prompt, "image": modalities_dict["CAMEL_IMAGE"]}
+
+    question3 = "Are the animals from the two images the same?"
+    # A custom input format is easy to apply for different VLM agents.
+    multi_image_prompt = vqa_prompt.format(
+        Question=question3,
+        CAMEL_IMAGE=[
+            image1_path,
+            image2_path])
+
+    print("Example prompt for a multi-image question, with a custom model input format ([Image{i}] is the image visual prompt):")
+    print("multi_image_prompt: \n")
+
+    model_input = multi_image_prompt.to_model_format(multi_image_input_format)
+
+    prompt = model_input["prompt"]
+    print(prompt)
+
+    images = model_input["image"]
+    print("images:")
+    print(images)
diff --git a/examples/multimodal/llama.jpg b/examples/multimodal/llama.jpg
new file mode 100644
index 000000000..9b7800960
Binary files /dev/null and b/examples/multimodal/llama.jpg differ