diff --git a/camel/prompts/__init__.py b/camel/prompts/__init__.py
index 49f9d6f35..0eef251b4 100644
--- a/camel/prompts/__init__.py
+++ b/camel/prompts/__init__.py
@@ -21,10 +21,12 @@
 from .role_description_prompt_template import RoleDescriptionPromptTemplateDict
 from .task_prompt_template import TaskPromptTemplateDict
 from .prompt_templates import PromptTemplateGenerator
+from .multimodal import MultiModalPrompt

 __all__ = [
     'TextPrompt',
     'CodePrompt',
+    'MultiModalPrompt',
     'TextPromptDict',
     'AISocietyPromptTemplateDict',
     'CodePromptTemplateDict',
diff --git a/camel/prompts/multimodal.py b/camel/prompts/multimodal.py
new file mode 100644
index 000000000..c14f67704
--- /dev/null
+++ b/camel/prompts/multimodal.py
@@ -0,0 +1,90 @@
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# =========== Copyright 2023 @ CAMEL-AI.org. All Rights Reserved. ===========
+from typing import Any, Callable, Dict, List, Union
+
+from camel.prompts import TextPrompt
+
+MODALITIES = ["CAMEL_IMAGE"]
+
+
+def default_to_model_format(text_prompt, modalities_dict: Dict) -> Dict:
+    r"""The default conversion: returns the text and the multimodal
+    information in a plain dict. Pass a different callable to
+    ``MultiModalPrompt.to_model_format`` to produce a model-specific
+    format.
+
+    Returns:
+        dict: The input format that the multimodal model can understand.
+    """
+    return {"text": text_prompt, "multimodal_information": modalities_dict}
+
+
+class MultiModalPrompt:
+    r"""A prompt that enables information transfer between multimodal
+    agents. It pairs a text prompt with the accompanying multimodal
+    information.
+    """
+
+    def __init__(self, text_prompt: TextPrompt,
+                 modalities: Union[List, Dict]):
+        r"""Initializes the multimodal prompt.
+
+        Args:
+            text_prompt (TextPrompt): The text prompt.
+            modalities (Union[List, Dict]): The supported modalities list
+                or a modality information dict.
+        """
+        # Check that every requested modality is supported.
+        for modality in modalities:
+            assert modality in MODALITIES, \
+                f"modality {modality} not supported."
+
+        self.text_prompt = text_prompt
+        self.modalities = modalities
+
+    def format(self, *args: Any, **kwargs: Any) -> 'MultiModalPrompt':
+        r"""Formats the text prompt and the multimodal information at the
+        same time. Keyword arguments that name one of this prompt's
+        modalities are popped into the multimodal information; the rest
+        are applied to the text prompt.
+
+        Args:
+            *args (Any): Variable length argument list.
+            **kwargs (Any): Arbitrary keyword arguments.
+
+        Returns:
+            MultiModalPrompt: The formatted multimodal prompt.
+        """
+        # Pop the kwargs that carry modality data.
+        multimodal_info = {}
+        for modality in self.modalities:
+            multimodal_info[modality] = kwargs.pop(modality)
+
+        text_prompt = self.text_prompt.format(*args, **kwargs)
+        return MultiModalPrompt(text_prompt, multimodal_info)
+
+    def to_model_format(
+            self,
+            method: Callable = default_to_model_format) -> Any:
+        r"""Converts the prompt to the input format that the multimodal
+        model can understand.
+        Different multimodal models have different input formats. The
+        default returns the text and the multimodal information in a
+        dict; pass a custom ``method`` to produce a model-specific
+        format.
+
+        Returns:
+            dict: The input format that the multimodal model can
+                understand.
+        """
+        return method(self.text_prompt, self.modalities)
+
+
+# TODO: MultiModalPromptDict
diff --git a/docs/get_started/multimodal_prompt.md b/docs/get_started/multimodal_prompt.md
new file mode 100644
index 000000000..2d1984728
--- /dev/null
+++ b/docs/get_started/multimodal_prompt.md
@@ -0,0 +1,159 @@
+# Introduction to the `MultiModalPrompt` Class
+
+## Overview
+
+The `MultiModalPrompt` class streamlines the creation of integrated prompts for multimodal agents. By bringing together text and other modalities, it establishes a unified structure for communication.
+
+## Supported Modalities
+
+Currently, the class recognizes the following modality:
+- `CAMEL_IMAGE`
+
+## Initialization
+
+To initialize a `MultiModalPrompt` instance:
+
+```python
+from camel.prompts import MultiModalPrompt, TextPrompt
+
+vqa_prompt = MultiModalPrompt(
+    text_prompt=TextPrompt("Please answer the following question related to the provided image:\nQuestion: {Question}"),
+    modalities=["CAMEL_IMAGE"]
+)
+```
+
+**Arguments**:
+- `text_prompt` (TextPrompt): The text-based template. It dictates the format of the text segment of the prompt.
+- `modalities` (Union[List, Dict]): Either a list of modality names or a dictionary pairing modality names with their respective data. If the input is a dictionary, it should follow the pattern `{Modality Name: Modality Data}`.
+
+## Methods
+
+### `format(*args, **kwargs) -> 'MultiModalPrompt'`
+
+This method formats both the text and the multimodal components at the same time. The output is a new `MultiModalPrompt` instance.
+
+```python
+vqa_prompt = MultiModalPrompt(
+    text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
+    modalities=["CAMEL_IMAGE"]
+)
+
+question = "What animal is in the picture?"
+image_path = "examples/multimodal/camel.jpg"
+
+vqa_prompt = vqa_prompt.format(Question=question, CAMEL_IMAGE=image_path)
+# The prompt is now a MultiModalPrompt instance carrying all of the
+# multimodal information; calling to_model_format on it generates the
+# model input.
+```
+
+### `to_model_format(method=default_to_model_format) -> Any`
+
+Transforms the prompt into a format understood by the multimodal model.
+
+By default, this method returns the prompt as a dictionary. By passing a different `method` callable, the output can be adapted to the requirements of various multimodal models.
+
+```python
+def default_to_model_format(text_prompt, modalities_dict: Dict) -> Dict:
+    r"""The default conversion: returns the text and the multimodal
+    information in a plain dict.
+
+    Returns:
+        dict: The input format that the multimodal model can understand.
+    """
+    return {"text": text_prompt, "multimodal_information": modalities_dict}
+
+vqa_prompt = MultiModalPrompt(
+    text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
+    modalities=["CAMEL_IMAGE"]
+)
+
+question = "What animal is in the picture?"
+image_path = "examples/multimodal/camel.jpg"
+
+vqa_prompt = vqa_prompt.format(Question=question, CAMEL_IMAGE=image_path)
+
+print(vqa_prompt.to_model_format(default_to_model_format))
+# {'text': 'Please answer the following question about the given image:\nQuestion: What animal is in the picture?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
+```
+
+## Usage Examples
+
+A runnable version of the examples below is provided in `examples/multimodal/formating_example.py`.
+
+### 1. Single-Image VQA (Visual Question Answering)
+
+This example illustrates how to generate prompts for a Visual Question Answering task associated with a single image.
+
+```python
+from camel.prompts import MultiModalPrompt, TextPrompt
+
+# Create a VQA prompt template
+vqa_prompt = MultiModalPrompt(
+    text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
+    modalities=["CAMEL_IMAGE"]
+)
+
+# Define questions and their respective image paths
+question1 = "What animal is in the picture?"
+question2 = "What is the color of the animal?"
+image1_path = "examples/multimodal/camel.jpg"
+image2_path = "examples/multimodal/llama.jpg"
+
+# Format and display the prompts
+vqa_prompt1 = vqa_prompt.format(Question=question1, CAMEL_IMAGE=image1_path)
+vqa_prompt2 = vqa_prompt.format(Question=question2, CAMEL_IMAGE=image2_path)
+
+print("vqa_prompt1:", vqa_prompt1.to_model_format())
+# {'text': 'Please answer the following question about the given image:\nQuestion: What animal is in the picture?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/camel.jpg'}}
+print("vqa_prompt2:", vqa_prompt2.to_model_format())
+# {'text': 'Please answer the following question about the given image:\nQuestion: What is the color of the animal?', 'multimodal_information': {'CAMEL_IMAGE': 'examples/multimodal/llama.jpg'}}
+```
+
+### 2. Multi-Image Question with a Custom Model Input Format
+
+This showcases the creation of a prompt involving multiple images for a single question, and illustrates how to adapt the prompt to a model-specific format.
+
+```python
+from typing import List
+
+def multi_image_input_format(text_prompt, modalities_dict):
+    """
+    Label each image in the prompt with its index.
+    """
+    if not isinstance(modalities_dict["CAMEL_IMAGE"], List):
+        modalities_dict["CAMEL_IMAGE"] = [modalities_dict["CAMEL_IMAGE"]]
+
+    for i, image in enumerate(modalities_dict["CAMEL_IMAGE"]):
+        text_prompt = f"Image {i} is [Image{i}]\n" + text_prompt
+
+    return {"prompt": text_prompt, "image": modalities_dict["CAMEL_IMAGE"]}
+
+# Define the multi-image question and format the prompt
+question3 = "Are the animals from the two images the same?"
+multi_image_prompt = vqa_prompt.format(Question=question3, CAMEL_IMAGE=[image1_path, image2_path])
+
+# Display the multi-image prompt and its corresponding images
+model_input = multi_image_prompt.to_model_format(multi_image_input_format)
+
+print("Prompt:", model_input["prompt"])
+'''
+Image 1 is [Image1]
+Image 0 is [Image0]
+Please answer the following question about the given image:
+Question: Are the animals from the two images the same?
+'''
+
+print("Images:", model_input["image"])
+'''
+['examples/multimodal/camel.jpg', 'examples/multimodal/llama.jpg']
+'''
+```
+
+## Application with Different Multimodal Models
+
+### LLaVA-1.5
+
+- TODO: add examples of how to use the multimodal prompt with different multimodal models on simple VL tasks; a minimal sketch follows below.
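+
+Until those examples land, the sketch below shows one way a custom converter could target a LLaVA-style chat model. It is a minimal illustration, not a verified integration: the `llava_style_format` name is ours, and it assumes the model expects a single conversation string in which an `<image>` placeholder token marks the image position. Check the exact template and token against the model release you are using.
+
+```python
+from typing import Dict
+
+def llava_style_format(text_prompt, modalities_dict: Dict) -> Dict:
+    # Hypothetical converter for a LLaVA-style chat template: wrap the
+    # text in a USER/ASSISTANT turn and mark the image position with an
+    # `<image>` token. The image itself is passed alongside as a path.
+    prompt = f"USER: <image>\n{text_prompt} ASSISTANT:"
+    return {"prompt": prompt, "image": modalities_dict["CAMEL_IMAGE"]}
+
+# Reusing vqa_prompt1 from the single-image example above.
+llava_input = vqa_prompt1.to_model_format(llava_style_format)
+print(llava_input["prompt"])
+# USER: <image>
+# Please answer the following question about the given image:
+# Question: What animal is in the picture? ASSISTANT:
+```
+
+Because `to_model_format` takes the converter as an argument, supporting a new model is a matter of writing one such function; the prompt object itself stays model-agnostic.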
diff --git a/docs/index.rst b/docs/index.rst
index 88a88f92e..0f21bd206 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -19,6 +19,7 @@ Welcome to CAMEL's documentation!
    get_started/text_prompt.md
    get_started/code_prompt.md
    get_started/messages.md
+   get_started/multimodal_prompt.md

 .. toctree::
    :maxdepth: 1
diff --git a/examples/multimodal/camel.jpg b/examples/multimodal/camel.jpg
new file mode 100644
index 000000000..aa5e5ec15
Binary files /dev/null and b/examples/multimodal/camel.jpg differ
diff --git a/examples/multimodal/formating_example.py b/examples/multimodal/formating_example.py
new file mode 100644
index 000000000..195a859ec
--- /dev/null
+++ b/examples/multimodal/formating_example.py
@@ -0,0 +1,72 @@
+from typing import List
+
+from camel.prompts import MultiModalPrompt, TextPrompt
+
+if __name__ == "__main__":
+    # Example prompt for simple one-image VQA, using the default model
+    # input format.
+    vqa_prompt = MultiModalPrompt(
+        text_prompt=TextPrompt("Please answer the following question about the given image:\nQuestion: {Question}"),
+        modalities=["CAMEL_IMAGE"])
+
+    print("Example prompt for simple one-image VQA, using the default model input format:")
+
+    question1 = "What animal is in the picture?"
+    question2 = "What is the color of the animal?"
+
+    image1_path = "examples/multimodal/camel.jpg"
+    image2_path = "examples/multimodal/llama.jpg"
+
+    vqa_prompt1 = vqa_prompt.format(
+        Question=question1,
+        CAMEL_IMAGE=image1_path)
+    vqa_prompt2 = vqa_prompt.format(
+        Question=question2,
+        CAMEL_IMAGE=image2_path)
+
+    print("vqa_prompt1:")
+    print(vqa_prompt1.to_model_format())
+
+    print("vqa_prompt2:")
+    print(vqa_prompt2.to_model_format())
+
+    print("-" * 100)
+
+    # Example prompt for a multi-image question, with a custom model
+    # input format.
+    def multi_image_input_format(text_prompt, modalities_dict):
+        r"""Labels each image at the front of the text prompt with its
+        index. The multi-image indexing format follows MMICL: Empowering
+        Vision-language Model with Multi-Modal In-Context Learning;
+        [Image{i}] in the prompt would be replaced by the visual prompt
+        for the i-th image.
+
+        Returns:
+            dict: The input format that the multimodal model can
+                understand.
+        """
+        if not isinstance(modalities_dict["CAMEL_IMAGE"], List):
+            modalities_dict["CAMEL_IMAGE"] = [modalities_dict["CAMEL_IMAGE"]]
+
+        for i, image in enumerate(modalities_dict["CAMEL_IMAGE"]):
+            text_prompt = f"Image {i} is [Image{i}]\n" + text_prompt
+
+        return {"prompt": text_prompt, "image": modalities_dict["CAMEL_IMAGE"]}
+
+    question3 = "Are the animals from the two images the same?"
+    # A custom input format is easy to apply for different VLM agents.
+    multi_image_prompt = vqa_prompt.format(
+        Question=question3,
+        CAMEL_IMAGE=[
+            image1_path,
+            image2_path])
+
+    print("Example prompt for a multi-image question, with a custom model input format ([Image{i}] is the image visual prompt):")
+    print("multi_image_prompt: \n")
+
+    model_input = multi_image_prompt.to_model_format(multi_image_input_format)
+
+    prompt = model_input["prompt"]
+    print(prompt)
+
+    images = model_input["image"]
+    print("images:")
+    print(images)
diff --git a/examples/multimodal/llama.jpg b/examples/multimodal/llama.jpg
new file mode 100644
index 000000000..9b7800960
Binary files /dev/null and b/examples/multimodal/llama.jpg differ