From 9f2bc323f36b3cbec7f85aacf11a458388c1c9d3 Mon Sep 17 00:00:00 2001
From: yym68686
Date: Wed, 20 Dec 2023 13:50:06 +0800
Subject: [PATCH] 1. Add support for gpt-4-vision-preview. 2. Add dalle3 to generate images in parallel.

---
 README.md            |  6 ++++-
 bot.py               | 42 ++++++++++++++++++++++++-----
 test/test_dict.py    | 22 +++++++++++++++
 utils/chatgpt2api.py | 64 +++++++++++++++++++++++++++++---------------
 4 files changed, 106 insertions(+), 28 deletions(-)
 create mode 100644 test/test_dict.py

diff --git a/README.md b/README.md
index 1c0f1967..0a8a37ee 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,13 @@ Join the [Telegram Group](https://t.me/+_01cz9tAkUc1YzZl) chat to share your use
 
 [English](./README.md) | [Simplified Chinese](./README.zh-CN.md) | [Traditional Chinese](./README.zh-TW.md)
 
+
+    docker pull 
+
+
 ## ✨ Features
 
-✅ Supports GPT3.5, GPT4/GPT4 Turbo and Claude2.1 API, DALLE 3
+✅ Supports GPT3.5, GPT4/GPT4 Turbo/GPT4 Vision and Claude2.1 API, DALLE 3
 
 ✅ Supports online search using duckduckgo and Google🔍. DuckDuckGo search is provided by default, and the official API for Google search needs to be applied by the user. It can provide real-time information that GPT could not answer before, such as Weibo hot search today, weather in a certain place today, and the progress of a certain person or news.
 
diff --git a/bot.py b/bot.py
index 38268c16..d90daefc 100644
--- a/bot.py
+++ b/bot.py
@@ -34,17 +34,35 @@
 @decorators.GroupAuthorization
 @decorators.Authorization
 async def command_bot(update, context, language=None, prompt=translator_prompt, title="", robot=None, has_command=True):
+    image_url = None
     if update.edited_message:
         message = update.edited_message.text if config.NICK is None else update.edited_message.text[botNicKLength:].strip() if update.edited_message.text[:botNicKLength].lower() == botNick else None
         rawtext = update.edited_message.text
         chatid = update.edited_message.chat_id
         messageid = update.edited_message.message_id
+
+        if update.edited_message.photo:
+            photo = update.edited_message.photo[-1]
+            file_id = photo.file_id
+            photo_file = await context.bot.getFile(file_id)
+            image_url = photo_file.file_path
+
+            message = rawtext = update.edited_message.caption
     else:
         message = update.message.text if config.NICK is None else update.message.text[botNicKLength:].strip() if update.message.text[:botNicKLength].lower() == botNick else None
         rawtext = update.message.text
         chatid = update.message.chat_id
         messageid = update.message.message_id
+
+        if update.message.photo:
+            photo = update.message.photo[-1]
+            file_id = photo.file_id
+            photo_file = await context.bot.getFile(file_id)
+            image_url = photo_file.file_path
+
+            message = rawtext = update.message.caption
     print("\033[32m", update.effective_user.username, update.effective_user.id, rawtext, "\033[0m")
+
     if has_command == False or len(context.args) > 0:
         if has_command:
             message = ' '.join(context.args)
@@ -55,6 +73,16 @@ async def command_bot(update, context, language=None, prompt=translator_prompt,
     if message:
         if "claude" in config.GPT_ENGINE and config.ClaudeAPI:
             robot = config.claudeBot
+        message = [{"type": "text", "text": message}]
+        if image_url and config.GPT_ENGINE == "gpt-4-vision-preview":
+            message.append(
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": image_url
+                    }
+                }
+            )
         await context.bot.send_chat_action(chat_id=chatid, action=ChatAction.TYPING)
         await getChatGPT(update, context, title, robot, message, chatid, messageid)
     else:
@@ -185,7 +213,7 @@ async def image(update, context):
             result += "当前账号余额不足~"
         else:
             result += f"`{e}`"
f"`{e}`" - await context.bot.edit_message_text(chat_id=chatid, message_id=start_messageid, text=result, parse_mode='MarkdownV2', disable_web_page_preview=True) + await context.bot.edit_message_text(chat_id=chatid, message_id=start_messageid, text=escape(result), parse_mode='MarkdownV2', disable_web_page_preview=True) import time async def delete_message(update, context, messageid, delay=10): @@ -216,15 +244,16 @@ async def delete_message(update, context, messageid, delay=10): # InlineKeyboardButton("gpt-4-32k", callback_data="gpt-4-32k"), # InlineKeyboardButton("gpt-4-32k-0314", callback_data="gpt-4-32k-0314"), ], + [ + # InlineKeyboardButton("gpt-4-0613", callback_data="gpt-4-0613"), + # InlineKeyboardButton("gpt-4-32k-0613", callback_data="gpt-4-32k-0613"), + InlineKeyboardButton("gpt-4-vision-preview", callback_data="gpt-4-vision-preview"), + ], [ InlineKeyboardButton("gpt-4", callback_data="gpt-4"), InlineKeyboardButton("gpt-4-32k", callback_data="gpt-4-32k"), # InlineKeyboardButton("gpt-4-0314", callback_data="gpt-4-0314"), ], - # [ - # InlineKeyboardButton("gpt-4-0613", callback_data="gpt-4-0613"), - # InlineKeyboardButton("gpt-4-32k-0613", callback_data="gpt-4-32k-0613"), - # ], [ InlineKeyboardButton("claude-2", callback_data="claude-2"), # InlineKeyboardButton("claude-2-web", callback_data="claude-2-web"), @@ -502,7 +531,7 @@ async def post_init(application: Application) -> None: ) application.add_handler(CommandHandler("start", start)) - application.add_handler(CommandHandler("pic", image)) + application.add_handler(CommandHandler("pic", image, block = False)) application.add_handler(CommandHandler("search", lambda update, context: command_bot(update, context, prompt="search: ", title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.ChatGPTbot, has_command="search"))) # application.add_handler(CommandHandler("search", lambda update, context: search(update, context, title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.ChatGPTbot))) application.add_handler(CallbackQueryHandler(button_press)) @@ -514,6 +543,7 @@ async def post_init(application: Application) -> None: # application.add_handler(CommandHandler("qa", qa)) application.add_handler(MessageHandler(filters.Document.PDF | filters.Document.TXT | filters.Document.DOC, handle_pdf)) application.add_handler(MessageHandler(filters.TEXT & ~filters.COMMAND, lambda update, context: command_bot(update, context, prompt=None, title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.ChatGPTbot, has_command=False))) + application.add_handler(MessageHandler(filters.CAPTION & filters.PHOTO & ~filters.COMMAND, lambda update, context: command_bot(update, context, prompt=None, title=f"`🤖️ {config.GPT_ENGINE}`\n\n", robot=config.ChatGPTbot, has_command=False))) application.add_handler(MessageHandler(filters.COMMAND, unknown)) application.add_error_handler(error) diff --git a/test/test_dict.py b/test/test_dict.py new file mode 100644 index 00000000..5e233dc6 --- /dev/null +++ b/test/test_dict.py @@ -0,0 +1,22 @@ +# # 假设你的列表如下: +# lst = [{"name": "张三", "age": 20}, {"name": "李四", "age": {"url": "wwww"}}, {"name": "王五", "age": 40}] + +# # 使用列表解析和items()方法取出所有值 +# values = [value for dic in lst for value in dic.values()] + +# # 打印结果 +# print(values) + +def extract_values(obj): + if isinstance(obj, dict): + for value in obj.values(): + yield from extract_values(value) + elif isinstance(obj, list): + for item in obj: + yield from extract_values(item) + else: + yield obj + +lst = [{"name": "张三", "age": 20}, {"name": "李四", "age": {"url": "wwww"}}, {"name": "王五", 
"age": 40}] +values = list(extract_values(lst)) +print(values) diff --git a/utils/chatgpt2api.py b/utils/chatgpt2api.py index 12579591..a374cb4e 100644 --- a/utils/chatgpt2api.py +++ b/utils/chatgpt2api.py @@ -51,6 +51,7 @@ def get_filtered_keys_from_object(obj: object, *keys: str) -> Set[str]: "gpt-4-0613", "gpt-4-32k-0613", "gpt-4-1106-preview", + "gpt-4-vision-preview", "claude-2-web", "claude-2", ] @@ -269,7 +270,7 @@ def __init__( self.system_prompt: str = system_prompt self.max_tokens: int = max_tokens or ( 4096 - if "gpt-4-1106-preview" in engine or "gpt-3.5-turbo-1106" in engine + if "gpt-4-1106-preview" in engine or "gpt-3.5-turbo-1106" in engine or self.engine == "gpt-4-vision-preview" else 31000 if "gpt-4-32k" in engine else 7000 @@ -284,7 +285,7 @@ def __init__( self.truncate_limit: int = truncate_limit or ( 16000 # 126500 Control the number of search characters to prevent excessive spending - if "gpt-4-1106-preview" in engine + if "gpt-4-1106-preview" in engine or self.engine == "gpt-4-vision-preview" else 30500 if "gpt-4-32k" in engine else 6500 @@ -342,7 +343,7 @@ def __init__( def add_to_conversation( self, - message: str, + message: list, role: str, convo_id: str = "default", function_name: str = "", @@ -352,9 +353,9 @@ def add_to_conversation( """ if convo_id not in self.conversation: self.reset(convo_id=convo_id) - if function_name == "" and message != "" and message != None: + if function_name == "" and message and message != None: self.conversation[convo_id].append({"role": role, "content": message}) - elif function_name != "" and message != "" and message != None: + elif function_name != "" and message and message != None: self.conversation[convo_id].append({"role": role, "name": function_name, "content": message}) else: print('\033[31m') @@ -392,7 +393,7 @@ def truncate_conversation( while True: json_post = self.get_post_body(prompt, role, convo_id, model, pass_history, **kwargs) url = config.bot_api_url.chat_url - if self.engine == "gpt-4-1106-preview" or self.engine == "claude-2": + if self.engine == "gpt-4-1106-preview" or self.engine == "claude-2" or self.engine == "gpt-4-vision-preview": message_token = { "total": self.get_token_count(convo_id), } @@ -410,6 +411,15 @@ def truncate_conversation( break return json_post, message_token + def extract_values(self, obj): + if isinstance(obj, dict): + for value in obj.values(): + yield from self.extract_values(value) + elif isinstance(obj, list): + for item in obj: + yield from self.extract_values(item) + else: + yield obj # def clear_function_call(self, convo_id: str = "default"): # self.conversation[convo_id] = [item for item in self.conversation[convo_id] if '@Trash@' not in item['content']] # function_call_items = [item for item in self.conversation[convo_id] if 'function' in item['role']] @@ -438,8 +448,10 @@ def get_token_count(self, convo_id: str = "default") -> int: # every message follows {role/name}\n{content}\n num_tokens += 5 for key, value in message.items(): - if value: - num_tokens += len(encoding.encode(value)) + values = list(self.extract_values(value)) + if values: + for value in values: + num_tokens += len(encoding.encode(value)) if key == "name": # if there's a name, the role is omitted num_tokens += 5 # role is always required and always 1 token num_tokens += 5 # every reply is primed with assistant @@ -458,7 +470,7 @@ def get_message_token(self, url, json_post): if response.status_code != 200: json_response = json.loads(response.text) string = json_response["error"]["message"] - # 
+            print(json_response, string)
             string = re.findall(r"\((.*?)\)", string)[0]
             numbers = re.findall(r"\d+\.?\d*", string)
             numbers = [int(i) for i in numbers]
@@ -488,6 +500,9 @@ def get_post_body(
         json_post_body = {
             "model": os.environ.get("MODEL_NAME") or model or self.engine,
             "messages": self.conversation[convo_id] if pass_history else [{"role": "system","content": self.system_prompt},{"role": role, "content": prompt}],
+            "max_tokens": 5000,
+        }
+        body = {
             "stream": True,
             # kwargs
             "temperature": kwargs.get("temperature", self.temperature),
@@ -502,14 +517,15 @@ def get_post_body(
             ),
             "n": kwargs.get("n", self.reply_count),
             "user": role,
-            "max_tokens": 5000,
         }
-        json_post_body.update(copy.deepcopy(function_call_list["base"]))
-        if config.SEARCH_USE_GPT:
-            json_post_body["functions"].append(function_call_list["web_search"])
-        json_post_body["functions"].append(function_call_list["url_fetch"])
-        json_post_body["functions"].append(function_call_list["today"])
-        json_post_body["functions"].append(function_call_list["vresion"])
+        if config.GPT_ENGINE != "gpt-4-vision-preview":
+            json_post_body.update(copy.deepcopy(body))
+            json_post_body.update(copy.deepcopy(function_call_list["base"]))
+            if config.SEARCH_USE_GPT:
+                json_post_body["functions"].append(function_call_list["web_search"])
+            json_post_body["functions"].append(function_call_list["url_fetch"])
+            json_post_body["functions"].append(function_call_list["today"])
+            json_post_body["functions"].append(function_call_list["vresion"])
 
         return json_post_body
 
@@ -522,7 +538,7 @@ def get_max_tokens(self, convo_id: str) -> int:
 
     def ask_stream(
         self,
-        prompt: str,
+        prompt: list,
        role: str = "user",
         convo_id: str = "default",
         model: str = None,
@@ -541,7 +557,7 @@ def ask_stream(
             print(json.dumps(json_post, indent=4, ensure_ascii=False))
             # print(self.conversation[convo_id])
 
-        if self.engine == "gpt-4-1106-preview":
+        if self.engine == "gpt-4-1106-preview" or self.engine == "gpt-4-vision-preview":
             model_max_tokens = kwargs.get("max_tokens", self.max_tokens)
         elif self.engine == "gpt-3.5-turbo-1106":
             model_max_tokens = min(kwargs.get("max_tokens", self.max_tokens), 16385 - message_token["total"])
@@ -571,7 +587,13 @@ def ask_stream(
             if not line:
                 continue
             # Remove "data: "
-            line = line.decode("utf-8")[6:]
+            if line.decode("utf-8")[:6] == "data: ":
+                line = line.decode("utf-8")[6:]
+            else:
+                print(line.decode("utf-8"))
+                full_response = json.loads(line.decode("utf-8"))["choices"][0]["message"]["content"]
+                yield full_response
+                break
             if line == "[DONE]":
                 break
             resp: dict = json.loads(line)
@@ -613,8 +635,8 @@ def ask_stream(
                     # prompt = json.loads(full_response)["prompt"]
                     for index in range(len(self.conversation[convo_id])):
                         if self.conversation[convo_id][-1 - index]["role"] == "user":
-                            self.conversation[convo_id][-1 - index]["content"] = self.conversation[convo_id][-1 - index]["content"].replace("search: ", "")
-                            prompt = self.conversation[convo_id][-1 - index]["content"]
+                            self.conversation[convo_id][-1 - index]["content"][0]["text"] = self.conversation[convo_id][-1 - index]["content"][0]["text"].replace("search: ", "")
+                            prompt = self.conversation[convo_id][-1 - index]["content"][0]["text"]
                     if json.loads(full_response)["prompt"].strip() != prompt:
                         prompt = " ".join([prompt, json.loads(full_response)["prompt"].strip()]).strip()
                     print("\n\nprompt", prompt)
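
Editor's note on the change above: when GPT_ENGINE is gpt-4-vision-preview, bot.py now sends a user turn as a list of typed content parts (a "text" part plus an optional "image_url" part) instead of a plain string, and get_token_count flattens that nested structure with the new extract_values generator before encoding. The following is a minimal, self-contained sketch of that message shape and the flattening step; demo_message, the example image URL, and the cl100k_base encoding are illustrative assumptions, not values taken from the patch.

# Illustrative sketch only, not part of the patch. It shows the content-parts
# shape used for gpt-4-vision-preview and the recursive flattening that the
# token counter relies on. demo_message and the URL are made-up examples.
import tiktoken

def extract_values(obj):
    # Recursively yield every leaf value from nested dicts and lists
    # (same logic as the extract_values method added in utils/chatgpt2api.py).
    if isinstance(obj, dict):
        for value in obj.values():
            yield from extract_values(value)
    elif isinstance(obj, list):
        for item in obj:
            yield from extract_values(item)
    else:
        yield obj

# A user turn as bot.py now builds it: a "text" part plus an optional "image_url" part.
demo_message = [
    {"type": "text", "text": "What is in this picture?"},
    {"type": "image_url", "image_url": {"url": "https://example.com/photo.jpg"}},
]

# Flatten the nested structure and encode the string leaves, roughly the way
# get_token_count walks each message after this patch (the isinstance filter
# here is a simplification added for this sketch).
encoding = tiktoken.get_encoding("cl100k_base")  # encoding choice is an assumption
num_tokens = sum(len(encoding.encode(v)) for v in extract_values(demo_message) if isinstance(v, str))
print(num_tokens)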