📈 Add llava llama3 8b performance config
ZackBradshaw committed May 2, 2024
1 parent 35fd8f6 commit 1d53b4a
Showing 1 changed file with 52 additions and 0 deletions.
servers/llava/llava_llama3_8b.yaml (+52, -0)
@@ -0,0 +1,52 @@
envs:
  MODEL_NAME: xtuner/llava-llama-3-8b-v1_1
  HF_TOKEN: null # Required at launch, e.g. `sky serve up ... --env HF_TOKEN=...`; never commit real tokens.
  HF_HUB_ENABLE_HF_TRANSFER: True
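  # huggingface_hub reads this flag case-insensitively, so "True" enables
  # hf_transfer (installed in setup) for faster weight downloads.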

resources:
# accelerators: {L4:4, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} ## Large models
  accelerators: [L4, A10G, A100, A100-80GB, T4, M60] ## Small models
# cpus: 32+
memory: 32+
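  # "32+" means at least 32 GB of host RAM; the accelerators list above is a
  # candidate set, and SkyPilot provisions whichever candidate is available.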
# use_spot: True
# disk_size: 512 # Ensure model checkpoints (~246GB) can fit.
# disk_tier: best
ports: 8080 # Expose to internet traffic.

service:
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
  # Alternative lightweight GET probe; YAML forbids duplicate keys, so only
  # one readiness_probe may be defined at a time.
  # readiness_probe: /v1/models
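  # SkyServe marks a replica ready once the probe above succeeds; the POST
  # variant with max_tokens: 1 exercises the full chat path cheaply.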
replica_policy:
min_replicas: 0
max_replicas: 10
target_qps_per_replica: 5
upscale_delay_seconds: 300
downscale_delay_seconds: 1200
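    # min_replicas: 0 allows scale-to-zero when idle (the first request after
    # an idle period waits for a cold start); the upscale/downscale delays
    # damp thrashing around the 5 QPS-per-replica target.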

setup: |
pip install hf_transfer
# GPU Monitoring with Grafana
  pip install nvidia-ml-py boto3
  # Fetch the gpumon script from the pinned gist revision.
  wget -O gpumon.py https://gist.githubusercontent.com/ZackBradshaw/5a50952f445596d046649e2d38b12176/raw/45135e5f90183bb6971bdddc276ab1e394ccb670/gpumon.py
  # Run the monitor in the background so setup can complete; logs go to gpumon.out.
  nohup python gpumon.py > gpumon.out 2>&1 &
run: |
# Serve With Docker
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=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"\
-p 8080:8080 \
--ipc=host \
openmmlab/lmdeploy:latest \
lmdeploy serve api_server $MODEL_NAME --server-port 8080
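
A minimal usage sketch for this config, assuming SkyPilot's `sky serve` CLI; the service name `llava-llama3` is an arbitrary example, not part of this commit:

    # Deploy the service, passing the required Hugging Face token.
    sky serve up servers/llava/llava_llama3_8b.yaml -n llava-llama3 --env HF_TOKEN=<your-token>

    # Once a replica is READY, query the OpenAI-compatible endpoint.
    ENDPOINT=$(sky serve status --endpoint llava-llama3)
    curl http://$ENDPOINT/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "xtuner/llava-llama-3-8b-v1_1", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 64}'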
