Skip to content

Commit

Permalink
feat: add support for Llama 3.1 (#91)
Browse files Browse the repository at this point in the history
  • Loading branch information
quitrk authored Aug 12, 2024
1 parent a55125a commit 9a60788
Show file tree
Hide file tree
Showing 12 changed files with 790 additions and 688 deletions.
15 changes: 8 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,20 @@ COPY docker/rootfs/ /
RUN \
apt-dpkg-wrap apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 && \
apt-dpkg-wrap apt-get update && \
apt-dpkg-wrap apt-get install -y wget build-essential python3.11 python3.11-venv && \
apt-cleanup
apt-dpkg-wrap apt-get install -y wget build-essential libcurl4-openssl-dev python3.11 python3.11-venv

RUN \
wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.sh && \
sh cmake.sh --skip-license --prefix=/usr/local && \
rm cmake.sh

ENV LLAMA_CPP_RELEASE=b3070
COPY llama.cpp llama.cpp
RUN \
cd llama.cpp && \
rm -rf build && \
cmake -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUDA=ON -DLLAMA_NATIVE=OFF && \
cmake --build build --target server -j`getconf _NPROCESSORS_ONLN`
cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --target llama-server -j`getconf _NPROCESSORS_ONLN` && \
ldd build/bin/llama-server

COPY requirements.txt /app/

Expand Down Expand Up @@ -67,7 +66,9 @@ RUN \

# Copy virtual environment
COPY --chown=jitsi:jitsi --from=builder /app/.venv /app/.venv
COPY --chown=jitsi:jitsi --from=builder /llama.cpp/build/bin/server /app/llama.cpp/server
COPY --chown=jitsi:jitsi --from=builder /llama.cpp/build/bin /app/llama.cpp

RUN ldd /app/llama.cpp/llama-server

# Copy application files
COPY --chown=jitsi:jitsi /skynet /app/skynet/
Expand All @@ -78,7 +79,7 @@ ENV \
# https://docs.python.org/3/using/cmdline.html#envvar-PYTHONDONTWRITEBYTECODE
PYTHONDONTWRITEBYTECODE=1 \
PYTHONPATH=/app \
LLAMA_PATH="/models/llama-3-8b-instruct-Q8_0.gguf"
LLAMA_PATH="/models/Llama-3.1-8B-Instruct-Q8_0.gguf"

VOLUME [ "/models" ]

Expand Down
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,16 @@ git submodule update --init
# Download the preferred GGUF llama model
mkdir "$HOME/models"

wget -q --show-progress "https://huggingface.co/jitsi/Llama-3-8B-Instruct-GGUF/resolve/main/llama-3-8b-instruct-Q4_K_M.gguf?download=true" -O "$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
wget -q --show-progress "https://huggingface.co/jitsi/Llama-3.1-8B-GGUF/resolve/main/Llama-3.1-8B-Instruct-Q8_0.gguf?download=true" -O "$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"

export LLAMA_PATH="$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
export OPENAI_API_SERVER_PATH="$HOME/skynet/llama.cpp/server"
export OPENAI_API_SERVER_PATH="$HOME/skynet/llama.cpp/llama-server"
export LLAMA_PATH="$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"
# disable authorization (for testing)
export BYPASS_AUTHORIZATION=1

# start Redis
docker run -d --rm -p 6379:6379 redis

# disable authorization (for testing)
export BYPASS_AUTHORIZATION=1

poetry install
./run.sh

Expand All @@ -55,7 +54,7 @@ poetry install
## Testing docker changes
```bash
docker compose -f compose-dev.yaml up --build
docker cp $HOME/models/llama-3-8b-instruct-Q8_0.gguf skynet-web-1:/models
docker cp $HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf skynet-web-1:/models
docker restart skynet-web-1

# localhost:8000 for Skynet APIs
Expand Down
14 changes: 5 additions & 9 deletions docs/summaries_module.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,16 @@ All of the configuration is done via env vars. Check the [Skynet Environment Var
# Download the preferred GGUF llama model
mkdir "$HOME/models"

wget -q --show-progress "https://huggingface.co/jitsi/Llama-3-8B-Instruct-GGUF/resolve/main/llama-3-8b-instruct-Q4_K_M.gguf?download=true" -O "$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
wget -q --show-progress "https://huggingface.co/jitsi/Llama-3.1-8B-GGUF/resolve/main/Llama-3.1-8B-Instruct-Q8_0.gguf?download=true" -O "$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"

export LLAMA_PATH="$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
# Optional for llama-3 since it's the default:
export LLAMA_N_CTX=8192
# Optional for llama-3 since it's auto-detected:
export MODEL_CHAT_FORMAT=llama-3
export OPENAI_API_SERVER_PATH="$HOME/skynet/llama.cpp/llama-server"
export LLAMA_PATH="$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"
# disable authorization (for testing)
export BYPASS_AUTHORIZATION=1

# start Redis
docker run -d --rm -p 6379:6379 redis

# disable authorization (for testing)
export BYPASS_AUTHORIZATION=1

poetry install
./run.sh
```
Expand Down
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 755 files
1,302 changes: 714 additions & 588 deletions poetry.lock

Large diffs are not rendered by default.

61 changes: 31 additions & 30 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,88 +5,89 @@ annotated-types==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
anyio==4.4.0 ; python_version >= "3.11" and python_version < "3.12"
async-lru==2.0.4 ; python_version >= "3.11" and python_version < "3.12"
async-timeout==4.0.3 ; python_version >= "3.11" and python_full_version <= "3.11.2"
attrs==23.2.0 ; python_version >= "3.11" and python_version < "3.12"
attrs==24.2.0 ; python_version >= "3.11" and python_version < "3.12"
av==10.0.0 ; python_version >= "3.11" and python_version < "3.12"
boto3==1.34.113 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.34.113 ; python_version >= "3.11" and python_version < "3.12"
certifi==2024.2.2 ; python_version >= "3.11" and python_version < "3.12"
cffi==1.16.0 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
boto3==1.34.156 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.34.156 ; python_version >= "3.11" and python_version < "3.12"
certifi==2024.7.4 ; python_version >= "3.11" and python_version < "3.12"
cffi==1.17.0 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
charset-normalizer==3.3.2 ; python_version >= "3.11" and python_version < "3.12"
click==8.1.7 ; python_version >= "3.11" and python_version < "3.12"
colorama==0.4.6 ; python_version >= "3.11" and python_version < "3.12" and (sys_platform == "win32" or platform_system == "Windows")
coloredlogs==15.0.1 ; python_version >= "3.11" and python_version < "3.12"
cryptography==42.0.7 ; python_version >= "3.11" and python_version < "3.12"
cryptography==43.0.0 ; python_version >= "3.11" and python_version < "3.12"
ctranslate2==3.24.0 ; python_version >= "3.11" and python_version < "3.12"
dataclasses-json==0.6.6 ; python_version >= "3.11" and python_version < "3.12"
dataclasses-json==0.6.7 ; python_version >= "3.11" and python_version < "3.12"
distro==1.9.0 ; python_version >= "3.11" and python_version < "3.12"
fastapi-versionizer==3.0.4 ; python_version >= "3.11" and python_version < "3.12"
fastapi==0.109.0 ; python_version >= "3.11" and python_version < "3.12"
faster-whisper==0.10.1 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.14.0 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.15.4 ; python_version >= "3.11" and python_version < "3.12"
flatbuffers==24.3.25 ; python_version >= "3.11" and python_version < "3.12"
frozenlist==1.4.1 ; python_version >= "3.11" and python_version < "3.12"
fsspec==2024.5.0 ; python_version >= "3.11" and python_version < "3.12"
greenlet==3.0.3 ; python_version >= "3.11" and python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32")
fsspec==2024.6.1 ; python_version >= "3.11" and python_version < "3.12"
greenlet==3.0.3 ; python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version >= "3.11"
h11==0.14.0 ; python_version >= "3.11" and python_version < "3.12"
httpcore==1.0.5 ; python_version >= "3.11" and python_version < "3.12"
httptools==0.6.1 ; python_version >= "3.11" and python_version < "3.12"
httpx==0.27.0 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.23.2 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.24.5 ; python_version >= "3.11" and python_version < "3.12"
humanfriendly==10.0 ; python_version >= "3.11" and python_version < "3.12"
idna==3.7 ; python_version >= "3.11" and python_version < "3.12"
jinja2==3.1.4 ; python_version >= "3.11" and python_version < "3.12"
jiter==0.5.0 ; python_version >= "3.11" and python_version < "3.12"
jmespath==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
jsonpatch==1.33 ; python_version >= "3.11" and python_version < "3.12"
jsonpointer==2.4 ; python_version >= "3.11" and python_version < "3.12"
jsonpointer==3.0.0 ; python_version >= "3.11" and python_version < "3.12"
langchain-community==0.0.20 ; python_version >= "3.11" and python_version < "3.12"
langchain-core==0.1.23 ; python_version >= "3.11" and python_version < "3.12"
langchain-openai==0.0.6 ; python_version >= "3.11" and python_version < "3.12"
langchain==0.1.7 ; python_version >= "3.11" and python_version < "3.12"
langsmith==0.0.87 ; python_version >= "3.11" and python_version < "3.12"
markupsafe==2.1.5 ; python_version >= "3.11" and python_version < "3.12"
marshmallow==3.21.2 ; python_version >= "3.11" and python_version < "3.12"
marshmallow==3.21.3 ; python_version >= "3.11" and python_version < "3.12"
mpmath==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
multidict==6.0.5 ; python_version >= "3.11" and python_version < "3.12"
mypy-extensions==1.0.0 ; python_version >= "3.11" and python_version < "3.12"
natsort==8.4.0 ; python_version >= "3.11" and python_version < "3.12"
networkx==3.3 ; python_version >= "3.11" and python_version < "3.12"
numpy==1.26.4 ; python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.18.0 ; python_version >= "3.11" and python_version < "3.12"
openai==1.30.3 ; python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.18.1 ; python_version >= "3.11" and python_version < "3.12"
openai==1.40.1 ; python_version >= "3.11" and python_version < "3.12"
packaging==23.2 ; python_version >= "3.11" and python_version < "3.12"
prometheus-client==0.19.0 ; python_version >= "3.11" and python_version < "3.12"
prometheus-fastapi-instrumentator==6.1.0 ; python_version >= "3.11" and python_version < "3.12"
protobuf==5.27.0 ; python_version >= "3.11" and python_version < "3.12"
protobuf==5.27.3 ; python_version >= "3.11" and python_version < "3.12"
pycparser==2.22 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
pydantic-core==2.14.6 ; python_version >= "3.11" and python_version < "3.12"
pydantic==2.5.3 ; python_version >= "3.11" and python_version < "3.12"
pyjwt[crypto]==2.8.0 ; python_version >= "3.11" and python_version < "3.12"
pyjwt[crypto]==2.9.0 ; python_version >= "3.11" and python_version < "3.12"
pyreadline3==3.4.1 ; sys_platform == "win32" and python_version >= "3.11" and python_version < "3.12"
python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "3.12"
python-dotenv==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
pyyaml==6.0.1 ; python_version >= "3.11" and python_version < "3.12"
pyyaml==6.0.2 ; python_version >= "3.11" and python_version < "3.12"
redis==5.0.1 ; python_version >= "3.11" and python_version < "3.12"
regex==2024.5.15 ; python_version >= "3.11" and python_version < "3.12"
requests==2.32.2 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.1 ; python_version >= "3.11" and python_version < "3.12"
setuptools==70.0.0 ; python_version >= "3.11" and python_version < "3.12"
regex==2024.7.24 ; python_version >= "3.11" and python_version < "3.12"
requests==2.32.3 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.2 ; python_version >= "3.11" and python_version < "3.12"
setuptools==72.1.0 ; python_version >= "3.11" and python_version < "3.12"
six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
sniffio==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.30 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.32 ; python_version >= "3.11" and python_version < "3.12"
starlette==0.35.1 ; python_version >= "3.11" and python_version < "3.12"
sympy==1.12 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.3.0 ; python_version >= "3.11" and python_version < "3.12"
sympy==1.13.1 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.5.0 ; python_version >= "3.11" and python_version < "3.12"
tiktoken==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
tokenizers==0.15.2 ; python_version >= "3.11" and python_version < "3.12"
torch==2.0.1 ; python_version >= "3.11" and python_version < "3.12"
torchaudio==2.0.2 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.4 ; python_version >= "3.11" and python_version < "3.12"
typing-extensions==4.12.0 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.5 ; python_version >= "3.11" and python_version < "3.12"
typing-extensions==4.12.2 ; python_version >= "3.11" and python_version < "3.12"
typing-inspect==0.9.0 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.2.1 ; python_version >= "3.11" and python_version < "3.12"
uuid6==2024.1.12 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.2.2 ; python_version >= "3.11" and python_version < "3.12"
uuid6==2024.7.10 ; python_version >= "3.11" and python_version < "3.12"
uvicorn[standard]==0.29.0 ; python_version >= "3.11" and python_version < "3.12"
uvloop==0.19.0 ; (sys_platform != "win32" and sys_platform != "cygwin") and platform_python_implementation != "PyPy" and python_version >= "3.11" and python_version < "3.12"
watchfiles==0.22.0 ; python_version >= "3.11" and python_version < "3.12"
watchfiles==0.23.0 ; python_version >= "3.11" and python_version < "3.12"
websockets==12.0 ; python_version >= "3.11" and python_version < "3.12"
yarl==1.9.4 ; python_version >= "3.11" and python_version < "3.12"
3 changes: 2 additions & 1 deletion run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/sh
cd llama.cpp
make server
make llama-server
cd ..

export LLAMA_N_CTX=32000
poetry run python -m uvicorn skynet.main:app --reload
6 changes: 3 additions & 3 deletions skynet/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ def tobool(val: str | None):

# models
llama_path = os.environ.get('LLAMA_PATH')
llama_n_ctx = int(os.environ.get('LLAMA_N_CTX', 8192))
llama_n_gpu_layers = int(os.environ.get('LLAMA_N_GPU_LAYERS', -1 if is_mac else 40))
llama_n_ctx = int(os.environ.get('LLAMA_N_CTX', 128000))
llama_n_gpu_layers = int(os.environ.get('LLAMA_N_GPU_LAYERS', -1 if is_mac else 99))
llama_n_batch = int(os.environ.get('LLAMA_N_BATCH', 512))

# azure openai api
# latest ga version https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
azure_openai_api_version = os.environ.get('AZURE_OPENAI_API_VERSION', '2024-02-01')

# openai api
openai_api_server_path = os.environ.get('OPENAI_API_SERVER_PATH', '/app/llama.cpp/server')
openai_api_server_path = os.environ.get('OPENAI_API_SERVER_PATH', '/app/llama.cpp/llama-server')
openai_api_server_port = int(os.environ.get('OPENAI_API_SERVER_PORT', 8003))
openai_api_base_url = os.environ.get('OPENAI_API_BASE_URL', f'http://localhost:{openai_api_server_port}/v1')

Expand Down
9 changes: 5 additions & 4 deletions skynet/modules/ttt/openai_api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ def initialize():

proc = subprocess.Popen(
f'{openai_api_server_path} \
-m {llama_path} \
-b {llama_n_batch} \
-c {llama_n_ctx} \
-ngl {llama_n_gpu_layers} \
--batch-size {llama_n_batch} \
--ctx-size {llama_n_ctx} \
--flash-attn \
--model {llama_path} \
--n-gpu-layers {llama_n_gpu_layers} \
--port {openai_api_server_port}'.split(),
shell=False,
)
Expand Down
6 changes: 5 additions & 1 deletion skynet/modules/ttt/summaries/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,12 @@ async def process(payload: DocumentPayload, job_type: JobType, model: ChatOpenAI
chain = load_summarize_chain(current_model, chain_type="map_reduce", combine_prompt=prompt, map_prompt=prompt)

result = await chain.ainvoke(input={"input_documents": docs})
formatted_result = result['output_text'].replace('Response:', '', 1).strip()

return result['output_text'].strip()
log.info(f'input length: {len(system_message.replace("{text}", text))}')
log.info(f'output length: {len(formatted_result)}')

return formatted_result


async def process_open_ai(payload: DocumentPayload, job_type: JobType, api_key: str, model_name=None) -> str:
Expand Down
23 changes: 6 additions & 17 deletions skynet/modules/ttt/summaries/prompts/action_items.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,16 @@
action_items_conversation_prompt = """
## Instructions
You will be provided a text transcript of a conversation which may or may not contain some action items that need to be taken by the conversation participants.
An action item is valid when a participant commits to doing something in the future.
Each action item should be on a separate line.
If there is at least one action item, start your response with "Action_items:".
If nobody has any action items, please write "No action items."
You will be provided a conversation transcript which may or may not contain some action items that need to be taken by the conversation participants.
An action item can be extracted when someone commits to doing something in the future.
If there are no action items, respond just with "N/A".
## Transcript
{text}
## Response
"""


action_items_text_prompt = """
## Instructions
You will be provided a text transcript which may or may not contain some action items that need to be taken by the conversation participants.
An action item is valid when a participant commits to doing something in the future.
Each action item should be on a separate line.
If there is at least one action item, start your response with "Action_items:".
If nobody has any action items, please write "No action items."
An action item can be extracted when someone commits to doing something in the future.
If there are no action items, respond just with "N/A".
## Transcript
{text}
## Response
"""
24 changes: 4 additions & 20 deletions skynet/modules/ttt/summaries/prompts/summary.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,11 @@
summary_conversation_prompt = """
## Instructions
Provide a summary of the given conversation, following the instructions
The summary should be just plain text.
The only formatting allowed is adding a new line between main ideas.
Do not add any other formatting, such as bullet points, numbering, or asterisks.
Start your response with "Summary:".
## Transcript
Summarize the following conversation. Start your response with "Response:".
You should not use any formatting like bullet points, numbering, or asterisks.
{text}
## Response
"""

summary_text_prompt = """
## Instructions
Provide a summary of the given transcript, following the instructions
The summary should be just plain text.
The only formatting allowed is adding a new line between main ideas.
Do not add any other formatting, such as bullet points, numbering, or asterisks.
Start your response with "Summary:".
## Transcript
Summarize the following text. Start your response with "Response:".
You should not use any formatting like bullet points, numbering, or asterisks.
{text}
## Response
"""

0 comments on commit 9a60788

Please sign in to comment.