Skip to content

Commit

Permalink
feat: add support for Llama 3.1 (#91)
Browse files Browse the repository at this point in the history
  • Loading branch information
quitrk authored Aug 12, 2024
1 parent a55125a commit 9a60788
Show file tree
Hide file tree
Showing 12 changed files with 790 additions and 688 deletions.
15 changes: 8 additions & 7 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -15,21 +15,20 @@ COPY docker/rootfs/ /
RUN \
apt-dpkg-wrap apt-key adv --keyserver keyserver.ubuntu.com --recv-keys F23C5A6CF475977595C89F51BA6932366A755776 && \
apt-dpkg-wrap apt-get update && \
apt-dpkg-wrap apt-get install -y wget build-essential python3.11 python3.11-venv && \
apt-cleanup
apt-dpkg-wrap apt-get install -y wget build-essential libcurl4-openssl-dev python3.11 python3.11-venv

RUN \
wget -nv -O cmake.sh https://github.com/Kitware/CMake/releases/download/v3.29.3/cmake-3.29.3-linux-x86_64.sh && \
sh cmake.sh --skip-license --prefix=/usr/local && \
rm cmake.sh

ENV LLAMA_CPP_RELEASE=b3070
COPY llama.cpp llama.cpp
RUN \
cd llama.cpp && \
rm -rf build && \
cmake -B build -DCMAKE_BUILD_TYPE=Release -DLLAMA_CUDA=ON -DLLAMA_NATIVE=OFF && \
cmake --build build --target server -j`getconf _NPROCESSORS_ONLN`
cmake -B build -DCMAKE_BUILD_TYPE=Release -DGGML_CUDA=ON -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF && \
cmake --build build --target llama-server -j`getconf _NPROCESSORS_ONLN` && \
ldd build/bin/llama-server

COPY requirements.txt /app/

Expand Down Expand Up @@ -67,7 +66,9 @@ RUN \

# Copy virtual environment
COPY --chown=jitsi:jitsi --from=builder /app/.venv /app/.venv
COPY --chown=jitsi:jitsi --from=builder /llama.cpp/build/bin/server /app/llama.cpp/server
COPY --chown=jitsi:jitsi --from=builder /llama.cpp/build/bin /app/llama.cpp

RUN ldd /app/llama.cpp/llama-server

# Copy application files
COPY --chown=jitsi:jitsi /skynet /app/skynet/
Expand All @@ -78,7 +79,7 @@ ENV \
# https://docs.python.org/3/using/cmdline.html#envvar-PYTHONDONTWRITEBYTECODE
PYTHONDONTWRITEBYTECODE=1 \
PYTHONPATH=/app \
LLAMA_PATH="/models/llama-3-8b-instruct-Q8_0.gguf"
LLAMA_PATH="/models/Llama-3.1-8B-Instruct-Q8_0.gguf"

VOLUME [ "/models" ]

Expand Down
13 changes: 6 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,17 +22,16 @@ git submodule update --init
# Download the preferred GGUF llama model
mkdir "$HOME/models"

wget -q --show-progress "https://huggingface.co/jitsi/Llama-3-8B-Instruct-GGUF/resolve/main/llama-3-8b-instruct-Q4_K_M.gguf?download=true" -O "$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
wget -q --show-progress "https://huggingface.co/jitsi/Llama-3.1-8B-GGUF/resolve/main/Llama-3.1-8B-Instruct-Q8_0.gguf?download=true" -O "$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"

export LLAMA_PATH="$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
export OPENAI_API_SERVER_PATH="$HOME/skynet/llama.cpp/server"
export OPENAI_API_SERVER_PATH="$HOME/skynet/llama.cpp/llama-server"
export LLAMA_PATH="$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"
# disable authorization (for testing)
export BYPASS_AUTHORIZATION=1

# start Redis
docker run -d --rm -p 6379:6379 redis

# disable authorization (for testing)
export BYPASS_AUTHORIZATION=1

poetry install
./run.sh

Expand All @@ -55,7 +54,7 @@ poetry install
## Testing docker changes
```bash
docker compose -f compose-dev.yaml up --build
docker cp $HOME/models/llama-3-8b-instruct-Q8_0.gguf skynet-web-1:/models
docker cp $HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf skynet-web-1:/models
docker restart skynet-web-1

# localhost:8000 for Skynet APIs
Expand Down
14 changes: 5 additions & 9 deletions docs/summaries_module.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,20 +25,16 @@ All of the configuration is done via env vars. Check the [Skynet Environment Var
# Download the preferred GGUF llama model
mkdir "$HOME/models"

wget -q --show-progress "https://huggingface.co/jitsi/Llama-3-8B-Instruct-GGUF/resolve/main/llama-3-8b-instruct-Q4_K_M.gguf?download=true" -O "$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
wget -q --show-progress "https://huggingface.co/jitsi/Llama-3.1-8B-GGUF/resolve/main/Llama-3.1-8B-Instruct-Q8_0.gguf?download=true" -O "$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"

export LLAMA_PATH="$HOME/models/llama-3-8b-instruct.Q4_K_M.gguf"
# Optional for llama-3 since it's the default:
export LLAMA_N_CTX=8192
# Optional for llama-3 since it's auto-detected:
export MODEL_CHAT_FORMAT=llama-3
export OPENAI_API_SERVER_PATH="$HOME/skynet/llama.cpp/llama-server"
export LLAMA_PATH="$HOME/models/Llama-3.1-8B-Instruct-Q8_0.gguf"
# disable authorization (for testing)
export BYPASS_AUTHORIZATION=1

# start Redis
docker run -d --rm -p 6379:6379 redis

# disable authorization (for testing)
export BYPASS_AUTHORIZATION=1

poetry install
./run.sh
```
Expand Down
2 changes: 1 addition & 1 deletion llama.cpp
Submodule llama.cpp updated 755 files
1,302 changes: 714 additions & 588 deletions poetry.lock

Large diffs are not rendered by default.

61 changes: 31 additions & 30 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,88 +5,89 @@ annotated-types==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
anyio==4.4.0 ; python_version >= "3.11" and python_version < "3.12"
async-lru==2.0.4 ; python_version >= "3.11" and python_version < "3.12"
async-timeout==4.0.3 ; python_version >= "3.11" and python_full_version <= "3.11.2"
attrs==23.2.0 ; python_version >= "3.11" and python_version < "3.12"
attrs==24.2.0 ; python_version >= "3.11" and python_version < "3.12"
av==10.0.0 ; python_version >= "3.11" and python_version < "3.12"
boto3==1.34.113 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.34.113 ; python_version >= "3.11" and python_version < "3.12"
certifi==2024.2.2 ; python_version >= "3.11" and python_version < "3.12"
cffi==1.16.0 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
boto3==1.34.156 ; python_version >= "3.11" and python_version < "3.12"
botocore==1.34.156 ; python_version >= "3.11" and python_version < "3.12"
certifi==2024.7.4 ; python_version >= "3.11" and python_version < "3.12"
cffi==1.17.0 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
charset-normalizer==3.3.2 ; python_version >= "3.11" and python_version < "3.12"
click==8.1.7 ; python_version >= "3.11" and python_version < "3.12"
colorama==0.4.6 ; python_version >= "3.11" and python_version < "3.12" and (sys_platform == "win32" or platform_system == "Windows")
coloredlogs==15.0.1 ; python_version >= "3.11" and python_version < "3.12"
cryptography==42.0.7 ; python_version >= "3.11" and python_version < "3.12"
cryptography==43.0.0 ; python_version >= "3.11" and python_version < "3.12"
ctranslate2==3.24.0 ; python_version >= "3.11" and python_version < "3.12"
dataclasses-json==0.6.6 ; python_version >= "3.11" and python_version < "3.12"
dataclasses-json==0.6.7 ; python_version >= "3.11" and python_version < "3.12"
distro==1.9.0 ; python_version >= "3.11" and python_version < "3.12"
fastapi-versionizer==3.0.4 ; python_version >= "3.11" and python_version < "3.12"
fastapi==0.109.0 ; python_version >= "3.11" and python_version < "3.12"
faster-whisper==0.10.1 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.14.0 ; python_version >= "3.11" and python_version < "3.12"
filelock==3.15.4 ; python_version >= "3.11" and python_version < "3.12"
flatbuffers==24.3.25 ; python_version >= "3.11" and python_version < "3.12"
frozenlist==1.4.1 ; python_version >= "3.11" and python_version < "3.12"
fsspec==2024.5.0 ; python_version >= "3.11" and python_version < "3.12"
greenlet==3.0.3 ; python_version >= "3.11" and python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32")
fsspec==2024.6.1 ; python_version >= "3.11" and python_version < "3.12"
greenlet==3.0.3 ; python_version < "3.12" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") and python_version >= "3.11"
h11==0.14.0 ; python_version >= "3.11" and python_version < "3.12"
httpcore==1.0.5 ; python_version >= "3.11" and python_version < "3.12"
httptools==0.6.1 ; python_version >= "3.11" and python_version < "3.12"
httpx==0.27.0 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.23.2 ; python_version >= "3.11" and python_version < "3.12"
huggingface-hub==0.24.5 ; python_version >= "3.11" and python_version < "3.12"
humanfriendly==10.0 ; python_version >= "3.11" and python_version < "3.12"
idna==3.7 ; python_version >= "3.11" and python_version < "3.12"
jinja2==3.1.4 ; python_version >= "3.11" and python_version < "3.12"
jiter==0.5.0 ; python_version >= "3.11" and python_version < "3.12"
jmespath==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
jsonpatch==1.33 ; python_version >= "3.11" and python_version < "3.12"
jsonpointer==2.4 ; python_version >= "3.11" and python_version < "3.12"
jsonpointer==3.0.0 ; python_version >= "3.11" and python_version < "3.12"
langchain-community==0.0.20 ; python_version >= "3.11" and python_version < "3.12"
langchain-core==0.1.23 ; python_version >= "3.11" and python_version < "3.12"
langchain-openai==0.0.6 ; python_version >= "3.11" and python_version < "3.12"
langchain==0.1.7 ; python_version >= "3.11" and python_version < "3.12"
langsmith==0.0.87 ; python_version >= "3.11" and python_version < "3.12"
markupsafe==2.1.5 ; python_version >= "3.11" and python_version < "3.12"
marshmallow==3.21.2 ; python_version >= "3.11" and python_version < "3.12"
marshmallow==3.21.3 ; python_version >= "3.11" and python_version < "3.12"
mpmath==1.3.0 ; python_version >= "3.11" and python_version < "3.12"
multidict==6.0.5 ; python_version >= "3.11" and python_version < "3.12"
mypy-extensions==1.0.0 ; python_version >= "3.11" and python_version < "3.12"
natsort==8.4.0 ; python_version >= "3.11" and python_version < "3.12"
networkx==3.3 ; python_version >= "3.11" and python_version < "3.12"
numpy==1.26.4 ; python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.18.0 ; python_version >= "3.11" and python_version < "3.12"
openai==1.30.3 ; python_version >= "3.11" and python_version < "3.12"
onnxruntime==1.18.1 ; python_version >= "3.11" and python_version < "3.12"
openai==1.40.1 ; python_version >= "3.11" and python_version < "3.12"
packaging==23.2 ; python_version >= "3.11" and python_version < "3.12"
prometheus-client==0.19.0 ; python_version >= "3.11" and python_version < "3.12"
prometheus-fastapi-instrumentator==6.1.0 ; python_version >= "3.11" and python_version < "3.12"
protobuf==5.27.0 ; python_version >= "3.11" and python_version < "3.12"
protobuf==5.27.3 ; python_version >= "3.11" and python_version < "3.12"
pycparser==2.22 ; python_version >= "3.11" and python_version < "3.12" and platform_python_implementation != "PyPy"
pydantic-core==2.14.6 ; python_version >= "3.11" and python_version < "3.12"
pydantic==2.5.3 ; python_version >= "3.11" and python_version < "3.12"
pyjwt[crypto]==2.8.0 ; python_version >= "3.11" and python_version < "3.12"
pyjwt[crypto]==2.9.0 ; python_version >= "3.11" and python_version < "3.12"
pyreadline3==3.4.1 ; sys_platform == "win32" and python_version >= "3.11" and python_version < "3.12"
python-dateutil==2.9.0.post0 ; python_version >= "3.11" and python_version < "3.12"
python-dotenv==1.0.1 ; python_version >= "3.11" and python_version < "3.12"
pyyaml==6.0.1 ; python_version >= "3.11" and python_version < "3.12"
pyyaml==6.0.2 ; python_version >= "3.11" and python_version < "3.12"
redis==5.0.1 ; python_version >= "3.11" and python_version < "3.12"
regex==2024.5.15 ; python_version >= "3.11" and python_version < "3.12"
requests==2.32.2 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.1 ; python_version >= "3.11" and python_version < "3.12"
setuptools==70.0.0 ; python_version >= "3.11" and python_version < "3.12"
regex==2024.7.24 ; python_version >= "3.11" and python_version < "3.12"
requests==2.32.3 ; python_version >= "3.11" and python_version < "3.12"
s3transfer==0.10.2 ; python_version >= "3.11" and python_version < "3.12"
setuptools==72.1.0 ; python_version >= "3.11" and python_version < "3.12"
six==1.16.0 ; python_version >= "3.11" and python_version < "3.12"
sniffio==1.3.1 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.30 ; python_version >= "3.11" and python_version < "3.12"
sqlalchemy==2.0.32 ; python_version >= "3.11" and python_version < "3.12"
starlette==0.35.1 ; python_version >= "3.11" and python_version < "3.12"
sympy==1.12 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.3.0 ; python_version >= "3.11" and python_version < "3.12"
sympy==1.13.1 ; python_version >= "3.11" and python_version < "3.12"
tenacity==8.5.0 ; python_version >= "3.11" and python_version < "3.12"
tiktoken==0.7.0 ; python_version >= "3.11" and python_version < "3.12"
tokenizers==0.15.2 ; python_version >= "3.11" and python_version < "3.12"
torch==2.0.1 ; python_version >= "3.11" and python_version < "3.12"
torchaudio==2.0.2 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.4 ; python_version >= "3.11" and python_version < "3.12"
typing-extensions==4.12.0 ; python_version >= "3.11" and python_version < "3.12"
tqdm==4.66.5 ; python_version >= "3.11" and python_version < "3.12"
typing-extensions==4.12.2 ; python_version >= "3.11" and python_version < "3.12"
typing-inspect==0.9.0 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.2.1 ; python_version >= "3.11" and python_version < "3.12"
uuid6==2024.1.12 ; python_version >= "3.11" and python_version < "3.12"
urllib3==2.2.2 ; python_version >= "3.11" and python_version < "3.12"
uuid6==2024.7.10 ; python_version >= "3.11" and python_version < "3.12"
uvicorn[standard]==0.29.0 ; python_version >= "3.11" and python_version < "3.12"
uvloop==0.19.0 ; (sys_platform != "win32" and sys_platform != "cygwin") and platform_python_implementation != "PyPy" and python_version >= "3.11" and python_version < "3.12"
watchfiles==0.22.0 ; python_version >= "3.11" and python_version < "3.12"
watchfiles==0.23.0 ; python_version >= "3.11" and python_version < "3.12"
websockets==12.0 ; python_version >= "3.11" and python_version < "3.12"
yarl==1.9.4 ; python_version >= "3.11" and python_version < "3.12"
3 changes: 2 additions & 1 deletion run.sh
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#!/bin/sh
cd llama.cpp
make server
make llama-server
cd ..

export LLAMA_N_CTX=32000
poetry run python -m uvicorn skynet.main:app --reload
6 changes: 3 additions & 3 deletions skynet/env.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,16 +26,16 @@ def tobool(val: str | None):

# models
llama_path = os.environ.get('LLAMA_PATH')
llama_n_ctx = int(os.environ.get('LLAMA_N_CTX', 8192))
llama_n_gpu_layers = int(os.environ.get('LLAMA_N_GPU_LAYERS', -1 if is_mac else 40))
llama_n_ctx = int(os.environ.get('LLAMA_N_CTX', 128000))
llama_n_gpu_layers = int(os.environ.get('LLAMA_N_GPU_LAYERS', -1 if is_mac else 99))
llama_n_batch = int(os.environ.get('LLAMA_N_BATCH', 512))

# azure openai api
# latest ga version https://learn.microsoft.com/en-us/azure/ai-services/openai/api-version-deprecation#latest-ga-api-release
azure_openai_api_version = os.environ.get('AZURE_OPENAI_API_VERSION', '2024-02-01')

# openai api
openai_api_server_path = os.environ.get('OPENAI_API_SERVER_PATH', '/app/llama.cpp/server')
openai_api_server_path = os.environ.get('OPENAI_API_SERVER_PATH', '/app/llama.cpp/llama-server')
openai_api_server_port = int(os.environ.get('OPENAI_API_SERVER_PORT', 8003))
openai_api_base_url = os.environ.get('OPENAI_API_BASE_URL', f'http://localhost:{openai_api_server_port}/v1')

Expand Down
9 changes: 5 additions & 4 deletions skynet/modules/ttt/openai_api/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ def initialize():

proc = subprocess.Popen(
f'{openai_api_server_path} \
-m {llama_path} \
-b {llama_n_batch} \
-c {llama_n_ctx} \
-ngl {llama_n_gpu_layers} \
--batch-size {llama_n_batch} \
--ctx-size {llama_n_ctx} \
--flash-attn \
--model {llama_path} \
--n-gpu-layers {llama_n_gpu_layers} \
--port {openai_api_server_port}'.split(),
shell=False,
)
Expand Down
6 changes: 5 additions & 1 deletion skynet/modules/ttt/summaries/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,12 @@ async def process(payload: DocumentPayload, job_type: JobType, model: ChatOpenAI
chain = load_summarize_chain(current_model, chain_type="map_reduce", combine_prompt=prompt, map_prompt=prompt)

result = await chain.ainvoke(input={"input_documents": docs})
formatted_result = result['output_text'].replace('Response:', '', 1).strip()

return result['output_text'].strip()
log.info(f'input length: {len(system_message.replace("{text}", text))}')
log.info(f'output length: {len(formatted_result)}')

return formatted_result


async def process_open_ai(payload: DocumentPayload, job_type: JobType, api_key: str, model_name=None) -> str:
Expand Down
23 changes: 6 additions & 17 deletions skynet/modules/ttt/summaries/prompts/action_items.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,16 @@
action_items_conversation_prompt = """
## Instructions
You will be provided a text transcript of a conversation which may or may not contain some action items that need to be taken by the conversation participants.
An action item is valid when a participant commits to doing something in the future.
Each action item should be on a separate line.
If there is at least one action item, start your response with "Action_items:".
If nobody has any action items, please write "No action items."
You will be provided a conversation transcript which may or may not contain some action items that need to be taken by the conversation participants.
An action item can be extracted when someone commits to doing something in the future.
If there are no action items, respond just with "N/A".
## Transcript
{text}
## Response
"""


action_items_text_prompt = """
## Instructions
You will be provided a text transcript which may or may not contain some action items that need to be taken by the conversation participants.
An action item is valid when a participant commits to doing something in the future.
Each action item should be on a separate line.
If there is at least one action item, start your response with "Action_items:".
If nobody has any action items, please write "No action items."
An action item can be extracted when someone commits to doing something in the future.
If there are no action items, respond just with "N/A".
## Transcript
{text}
## Response
"""
24 changes: 4 additions & 20 deletions skynet/modules/ttt/summaries/prompts/summary.py
Original file line number Diff line number Diff line change
@@ -1,27 +1,11 @@
summary_conversation_prompt = """
## Instructions
Provide a summary of the given conversation, following the instructions
The summary should be just plain text.
The only formatting allowed is adding a new line between main ideas.
Do not add any other formatting, such as bullet points, numbering, or asterisks.
Start your response with "Summary:".
## Transcript
Summarize the following conversation. Start your response with "Response:".
You should not use any formatting like bullet points, numbering, or asterisks.
{text}
## Response
"""

summary_text_prompt = """
## Instructions
Provide a summary of the given transcript, following the instructions
The summary should be just plain text.
The only formatting allowed is adding a new line between main ideas.
Do not add any other formatting, such as bullet points, numbering, or asterisks.
Start your response with "Summary:".
## Transcript
Summarize the following text. Start your response with "Response:".
You should not use any formatting like bullet points, numbering, or asterisks.
{text}
## Response
"""

0 comments on commit 9a60788

Please sign in to comment.