Commit 3c90dcd
1. Refactored the plugin function file, cleaning up unused functions.
2. Updated g4f to version 0.1.9.6; fixed a bug with g4f availability.
3. Updated the README file.
yym68686 committed Dec 25, 2023
1 parent ea6c5e6 commit 3c90dcd
Showing 10 changed files with 345 additions and 289 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -55,7 +55,7 @@ To develop plugins, please follow the steps outlined below:
- Initially, you need to add the environment variable for the plugin in the `config.PLUGINS` dictionary located in the `config.py` file. The value can be customized to be either enabled or disabled by default. It is advisable to use uppercase letters for the entire environment variable.
- Subsequently, append the function's name and description in the `utils/function_call.py` file.
- Then, enhance the `ask_stream` function in the `utils/chatgpt2api.py` file with the function's processing logic. You can refer to the existing examples within the `ask_stream` method for guidance on how to write it.
- - Following that, write the function, as mentioned in the `utils/function_call.py` file, in the `utils/agent.py` file.
+ - Following that, write the function, as mentioned in the `utils/function_call.py` file, in the `utils/plugins.py` file.
- Next, in the `bot.py` file, augment the `update_first_buttons_message` function with buttons, enabling users to freely toggle plugins using the `info` command.
- Lastly, don't forget to add the plugin's description in the plugins section of the README.
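
For orientation, here is a minimal sketch of the first two steps above, using a hypothetical plugin named `EXAMPLE`. The entry shapes are assumptions inferred from the surrounding diff, not code from this commit:

```python
# config.py — hypothetical "EXAMPLE" toggle, following the env-var idiom
# this commit uses for USE_G4F (any value except "False" enables it).
import os

PLUGINS = {
    "EXAMPLE": os.environ.get("EXAMPLE", "False") != "False",
}

# utils/function_call.py — assumed shape of a function description entry,
# mirroring the OpenAI function-calling schema; all names are illustrative.
function_call_list = {
    "EXAMPLE": {
        "name": "run_example",
        "description": "One-line description shown to the model.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "The user query."},
            },
            "required": ["query"],
        },
    },
}
```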

8 changes: 4 additions & 4 deletions bot.py
@@ -8,7 +8,7 @@
from utils.chatgpt2api import Chatbot as GPT
from utils.chatgpt2api import claudebot
from telegram.constants import ChatAction
-from utils.agent import docQA, get_doc_from_local, Document_extract, pdfQA, get_encode_image
+from utils.plugins import Document_extract, get_encode_image
from telegram import BotCommand, InlineKeyboardButton, InlineKeyboardMarkup, InlineQueryResultArticle, InputTextMessageContent
from telegram.ext import CommandHandler, MessageHandler, ApplicationBuilder, filters, CallbackQueryHandler, Application, AIORateLimiter, InlineQueryHandler
from config import WEB_HOOK, PORT, BOT_TOKEN
@@ -74,6 +74,9 @@ async def command_bot(update, context, language=None, prompt=translator_prompt,
    if message:
        if "claude" in config.GPT_ENGINE and config.ClaudeAPI:
            robot = config.claudeBot
+       if not config.API or config.PLUGINS["USE_G4F"]:
+           import utils.gpt4free as gpt4free
+           robot = gpt4free
        if image_url:
            robot = config.GPT4visionbot
            title = "`🤖️ gpt-4-vision-preview`\n\n"
@@ -124,9 +127,6 @@ async def getChatGPT(update, context, title, robot, message, chatid, messageid):
    )
    messageid = message.message_id
    get_answer = robot.ask_stream
-   if not config.API or (config.PLUGINS["USE_G4F"] and not config.PLUGINS["SEARCH_USE_GPT"]):
-       import utils.gpt4free as gpt4free
-       get_answer = gpt4free.get_response

    try:
        for data in get_answer(text, convo_id=str(chatid), pass_history=config.PASS_HISTORY):
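Taken together, the two bot.py hunks move the g4f fallback decision out of `getChatGPT` and into `command_bot`, so a single `robot` object is chosen up front. A sketch of the resulting selection order; the default bot name (`ChatGPTbot`) is an assumption, since the default is not shown in this diff:

```python
# Sketch of provider selection after this commit (default name assumed).
def select_robot(config, image_url=None):
    robot = config.ChatGPTbot                      # assumed default bot
    if "claude" in config.GPT_ENGINE and config.ClaudeAPI:
        robot = config.claudeBot                   # Claude engine configured
    if not config.API or config.PLUGINS["USE_G4F"]:
        import utils.gpt4free as gpt4free          # no API key, or g4f forced on
        robot = gpt4free                           # module exposes ask_stream
    if image_url:
        robot = config.GPT4visionbot               # vision requests win last
    return robot
```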
2 changes: 1 addition & 1 deletion config.py
@@ -57,7 +57,7 @@

PLUGINS = {
    "SEARCH_USE_GPT": (os.environ.get('SEARCH_USE_GPT', "True") == "False") == False,
-   "USE_G4F": False,
+   "USE_G4F": (os.environ.get('USE_G4F', "False") == "False") == False,
    "DATE": True,
    "URL": True,
    "VERSION": True,
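The `(os.environ.get('USE_G4F', "False") == "False") == False` idiom enables the flag for any value except the literal string "False". An equivalent, more direct spelling (editorial illustration only, not the commit's code):

```python
import os

# Equivalent to (os.environ.get('USE_G4F', "False") == "False") == False.
# Any value other than the exact string "False" turns the plugin on;
# e.g. USE_G4F=true, USE_G4F=1, even USE_G4F=false (lowercase) all enable it.
USE_G4F = os.environ.get("USE_G4F", "False") != "False"
```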
2 changes: 1 addition & 1 deletion requirements.txt
@@ -16,7 +16,7 @@ duckduckgo-search==4.1.0
langchain==0.0.271
oauth2client==3.0.0
pdfminer.six
-g4f==0.1.8.8
+g4f==0.1.9.6

# plugin
pytz
1 change: 1 addition & 0 deletions test/test_gpt4free.py
@@ -15,6 +15,7 @@ def get_response(message, model="gpt-3.5-turbo"):
if __name__ == "__main__":
    console = Console()
    message = r"""
+Who is Ri Sol-ju?
    """
    answer = ""
    for result in get_response(message, "gpt-4"):
235 changes: 235 additions & 0 deletions test/test_langchain_search_old.py
@@ -0,0 +1,235 @@
import os
import re

import sys
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import config

from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA, RetrievalQAWithSourcesChain, LLMChain
from langchain.prompts import PromptTemplate
from langchain.prompts.chat import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate,
)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.document_loaders import UnstructuredPDFLoader

# NOTE: summary_each_url and get_search_results below also reference
# get_doc_from_url, get_url_text_list, cut_message and ThreadWithReturnValue,
# which this old test file never defines or imports.

def getmd5(string):
    import hashlib
    md5_hash = hashlib.md5()
    md5_hash.update(string.encode('utf-8'))
    md5_hex = md5_hash.hexdigest()
    return md5_hex

from utils.sitemap import SitemapLoader
async def get_doc_from_sitemap(url):
    # https://www.langchain.asia/modules/indexes/document_loaders/examples/sitemap#%E8%BF%87%E6%BB%A4%E7%AB%99%E7%82%B9%E5%9C%B0%E5%9B%BE-url-
    sitemap_loader = SitemapLoader(web_path=url)
    docs = await sitemap_loader.load()
    return docs

async def get_doc_from_local(docpath, doctype="md"):
    from langchain.document_loaders import DirectoryLoader
    # Load every file of the given type from the directory
    loader = DirectoryLoader(docpath, glob='**/*.' + doctype)
    # Convert the files into Document objects; each file becomes one Document
    documents = loader.load()
    return documents

system_template="""Use the following pieces of context to answer the users question.
If you don't know the answer, just say "Hmm..., I'm not sure.", don't try to make up an answer.
ALWAYS return a "Sources" part in your answer.
The "Sources" part should be a reference to the source of the document from which you got your answer.
Example of your response should be:
```
The answer is foo
Sources:
1. abc
2. xyz
```
Begin!
----------------
{summaries}
"""
messages = [
    SystemMessagePromptTemplate.from_template(system_template),
    HumanMessagePromptTemplate.from_template("{question}")
]
prompt = ChatPromptTemplate.from_messages(messages)

def get_chain(store, llm):
    chain_type_kwargs = {"prompt": prompt}
    chain = RetrievalQAWithSourcesChain.from_chain_type(
        llm,
        chain_type="stuff",
        retriever=store.as_retriever(),
        chain_type_kwargs=chain_type_kwargs,
        reduce_k_below_max_tokens=True
    )
    return chain

async def docQA(docpath, query_message, persist_db_path="db", model="gpt-3.5-turbo"):
    chatllm = ChatOpenAI(temperature=0.5, openai_api_base=config.bot_api_url.v1_url, model_name=model, openai_api_key=config.API)
    embeddings = OpenAIEmbeddings(openai_api_base=config.bot_api_url.v1_url, openai_api_key=config.API)

    sitemap = "sitemap.xml"
    match = re.match(r'^(https?|ftp)://[^\s/$.?#].[^\s]*$', docpath)
    if match:
        doc_method = get_doc_from_sitemap
        docpath = os.path.join(docpath, sitemap)
    else:
        doc_method = get_doc_from_local

    persist_db_path = getmd5(docpath)
    if not os.path.exists(persist_db_path):
        documents = await doc_method(docpath)
        # Initialize the text splitter
        text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=50)
        # Split the documents and persist the embeddings
        split_docs = text_splitter.split_documents(documents)
        vector_store = Chroma.from_documents(split_docs, embeddings, persist_directory=persist_db_path)
        vector_store.persist()
    else:
        # Load the persisted vector store
        vector_store = Chroma(persist_directory=persist_db_path, embedding_function=embeddings)

    # Create the QA chain
    qa = get_chain(vector_store, chatllm)
    # qa = RetrievalQA.from_chain_type(llm=chatllm, chain_type="stuff", retriever=vector_store.as_retriever(), return_source_documents=True)
    # Run the query
    result = qa({"question": query_message})
    return result


def persist_embedding_pdf(docurl, persist_db_path):
    embeddings = OpenAIEmbeddings(openai_api_base=config.bot_api_url.v1_url, openai_api_key=os.environ.get('API', None))
    filename = get_doc_from_url(docurl)
    docpath = os.getcwd() + "/" + filename
    loader = UnstructuredPDFLoader(docpath)
    documents = loader.load()
    # Initialize the text splitter
    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=25)
    # Split the loaded documents
    split_docs = text_splitter.split_documents(documents)
    vector_store = Chroma.from_documents(split_docs, embeddings, persist_directory=persist_db_path)
    vector_store.persist()
    os.remove(docpath)
    return vector_store

async def pdfQA(docurl, docpath, query_message, model="gpt-3.5-turbo"):
    chatllm = ChatOpenAI(temperature=0.5, openai_api_base=config.bot_api_url.v1_url, model_name=model, openai_api_key=os.environ.get('API', None))
    embeddings = OpenAIEmbeddings(openai_api_base=config.bot_api_url.v1_url, openai_api_key=os.environ.get('API', None))
    persist_db_path = getmd5(docpath)
    if not os.path.exists(persist_db_path):
        vector_store = persist_embedding_pdf(docurl, persist_db_path)
    else:
        vector_store = Chroma(persist_directory=persist_db_path, embedding_function=embeddings)
    qa = RetrievalQA.from_chain_type(llm=chatllm, chain_type="stuff", retriever=vector_store.as_retriever(), return_source_documents=True)
    result = qa({"query": query_message})
    return result['result']


def pdf_search(docurl, query_message, model="gpt-3.5-turbo"):
    chatllm = ChatOpenAI(temperature=0.5, openai_api_base=config.bot_api_url.v1_url, model_name=model, openai_api_key=os.environ.get('API', None))
    embeddings = OpenAIEmbeddings(openai_api_base=config.bot_api_url.v1_url, openai_api_key=os.environ.get('API', None))
    filename = get_doc_from_url(docurl)
    docpath = os.getcwd() + "/" + filename
    loader = UnstructuredPDFLoader(docpath)
    try:
        documents = loader.load()
    except Exception:
        print("pdf load error! docpath:", docpath)
        return ""
    os.remove(docpath)
    # Initialize the text splitter
    text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=25)
    # Split the loaded documents
    split_docs = text_splitter.split_documents(documents)
    vector_store = Chroma.from_documents(split_docs, embeddings)
    # Create the QA chain
    qa = RetrievalQA.from_chain_type(llm=chatllm, chain_type="stuff", retriever=vector_store.as_retriever(), return_source_documents=True)
    # Run the query
    result = qa({"query": query_message})
    return result['result']

def summary_each_url(threads, chainllm, prompt):
    summary_prompt = PromptTemplate(
        input_variables=["web_summary", "question", "language"],
        template=(
            "You need to respond to the following question: {question}."
            "Your task is to answer the above question in {language} based on the search results provided. Provide a detailed and in-depth response."
            "If there is no relevant content in the search results, just answer None, do not make any explanations."
            "Search results: {web_summary}."
        ),
    )
    summary_threads = []

    for t in threads:
        tmp = t.join()
        print(tmp)
        chain = LLMChain(llm=chainllm, prompt=summary_prompt)
        chain_thread = ThreadWithReturnValue(target=chain.run, args=({"web_summary": tmp, "question": prompt, "language": config.LANGUAGE},))
        chain_thread.start()
        summary_threads.append(chain_thread)

    url_result = ""
    for t in summary_threads:
        tmp = t.join()
        print("summary", tmp)
        if tmp != "None":
            url_result += "\n\n" + tmp
    return url_result

def get_search_results(prompt: str, context_max_tokens: int):
    url_text_list = get_url_text_list(prompt)
    useful_source_text = "\n\n".join(url_text_list)
    # useful_source_text = summary_each_url(threads, chainllm, prompt)

    useful_source_text, search_tokens_len = cut_message(useful_source_text, context_max_tokens)
    print("search tokens len", search_tokens_len, "\n\n")

    return useful_source_text

from typing import Any
from langchain.schema.output import LLMResult
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

class ChainStreamHandler(StreamingStdOutCallbackHandler):
    def __init__(self):
        self.tokens = []
        # Set to True once generation has finished
        self.finish = False
        self.answer = ""

    def on_llm_new_token(self, token: str, **kwargs):
        # print(token)
        self.tokens.append(token)
        # yield ''.join(self.tokens)
        # print(''.join(self.tokens))

    def on_llm_end(self, response: LLMResult, **kwargs: Any) -> None:
        self.finish = True

    def on_llm_error(self, error: Exception, **kwargs: Any) -> None:
        print(str(error))
        self.tokens.append(str(error))

    def generate_tokens(self):
        # Drain queued tokens until generation finishes and the queue is empty.
        # Note: the else branch busy-waits between callback deliveries.
        while not self.finish or self.tokens:
            if self.tokens:
                data = self.tokens.pop(0)
                self.answer += data
                yield data
            else:
                pass
        return self.answer
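
For reference, a minimal driver for the `docQA` helper above — assuming a valid `config.API` key and a local folder of markdown docs; the path and question are made up:

```python
import asyncio

async def main():
    # docQA returns the RetrievalQAWithSourcesChain result dict,
    # which carries "answer" and "sources" keys.
    result = await docQA("./docs", "How do I enable a plugin?")
    print(result["answer"])
    print(result["sources"])

if __name__ == "__main__":
    asyncio.run(main())
```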
7 changes: 5 additions & 2 deletions test/test_url.py
@@ -11,8 +11,11 @@ def extract_date(url):
        match = "1000/01/01"
    else:
        match = "1000/01/01"
-   return datetime.datetime.strptime(match, '%Y/%m/%d')
+   try:
+       return datetime.datetime.strptime(match, '%Y/%m/%d')
+   except:
+       match = "1000/01/01"
+       return datetime.datetime.strptime(match, '%Y/%m/%d')

# Extract the dates and build a list of (date, URL) tuples
date_url_pairs = [(extract_date(url), url) for url in urls]
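Given the `date_url_pairs` context line, the test presumably sorts URLs newest-first; with the `1000/01/01` sentinel, undated URLs sink to the end. A one-line sketch of that step (assumed, not shown in the hunk):

```python
# Newest first; the 1000/01/01 sentinel pushes undated URLs to the back.
sorted_urls = [url for _, url in sorted(date_url_pairs, reverse=True)]
```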
2 changes: 1 addition & 1 deletion utils/chatgpt2api.py
@@ -13,7 +13,7 @@
from typing import Set

import config
-from utils.agent import *
+from utils.plugins import *
from utils.function_call import function_call_list

def get_filtered_keys_from_object(obj: object, *keys: str) -> Set[str]:
30 changes: 26 additions & 4 deletions utils/gpt4free.py
@@ -1,10 +1,32 @@
import re
import g4f
+import os
+import sys
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
import config

-def get_response(message, **kwargs):
+GPT_ENGINE_map = {
+    "gpt-3.5-turbo": "gpt-3.5-turbo",
+    "gpt-3.5-turbo-16k": "gpt-3.5-turbo-16k",
+    "gpt-3.5-turbo-0301": "gpt-3.5-turbo",
+    "gpt-3.5-turbo-0613": "gpt-3.5-turbo-0613",
+    "gpt-3.5-turbo-1106": "gpt-3.5-turbo",
+    "gpt-3.5-turbo-16k-0613": "gpt-3.5-turbo-0613",
+    "gpt-4": "gpt-4",
+    "gpt-4-0314": "gpt-4",
+    "gpt-4-32k": "gpt-4-32k",
+    "gpt-4-32k-0314": "gpt-4",
+    "gpt-4-0613": "gpt-4-0613",
+    "gpt-4-32k-0613": "gpt-4-32k-0613",
+    "gpt-4-1106-preview": "gpt-4-turbo",
+    "gpt-4-vision-preview": "gpt-4",
+    "claude-2-web": "gpt-4",
+    "claude-2": "gpt-4",
+}
+
+def ask_stream(message, **kwargs):
    response = g4f.ChatCompletion.create(
-       model=config.GPT_ENGINE,
+       model=GPT_ENGINE_map[config.GPT_ENGINE],
        messages=[{"role": "user", "content": message}],
        stream=True,
    )
@@ -22,8 +44,8 @@ def bing(response):
if __name__ == "__main__":

    message = rf"""
Why did Lu Xun and Zhou Shuren fight?
"""
    answer = ""
-   for result in get_response(message, "gpt-4"):
+   for result in ask_stream(message, model="gpt-4"):
        print(result, end="")
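
One caveat worth noting: `GPT_ENGINE_map[config.GPT_ENGINE]` raises `KeyError` for any engine name missing from the map. A defensive variant — an editorial suggestion, not part of this commit — would fall back to a default:

```python
# Fall back to gpt-3.5-turbo for engine names the map doesn't cover,
# instead of raising KeyError. Editorial sketch only.
model = GPT_ENGINE_map.get(config.GPT_ENGINE, "gpt-3.5-turbo")
```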
