📈 Add llava llama3 8b performance config
Commit 1d53b4a (parent 35fd8f6): 1 changed file with 52 additions and 0 deletions.

envs:
  MODEL_NAME: xtuner/llava-llama-3-8b-v1_1
  HF_HUB_ENABLE_HF_TRANSFER: True
  HF_TOKEN: "" # Pass at launch time (e.g. --env HF_TOKEN=<token>); never commit a real token.
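
The envs block is the SkyPilot pattern for parameterizing a task: each value becomes an environment variable inside setup and run, and can be overridden per launch instead of edited in place. A minimal usage sketch, assuming a recent SkyPilot CLI and that the file is saved as llava-llama3-8b.yaml (the filename is not shown in this commit):

    # Deploy as a SkyServe service, overriding envs at launch time.
    sky serve up llava-llama3-8b.yaml \
      --env HF_TOKEN=<your-token> \
      --env MODEL_NAME=xtuner/llava-llama-3-8b-v1_1
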
resources:
  # accelerators: {L4:4, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} ## Large models
  accelerators: [L4, A10g, A100, A100-80GB, T4, M60] ## Small models (any one of these)
  # cpus: 32+
  memory: 32+
  # use_spot: True
  # disk_size: 512 # Ensure model checkpoints (~246GB) can fit.
  # disk_tier: best
  ports: 8080 # Expose to internet traffic.
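
The bracketed accelerators list lets SkyPilot fall back across candidate GPUs, picking whichever is available, in contrast to the commented-out mapping form that pins specific GPU counts for larger models. To check what a configured cloud actually offers for one of these candidates, the stock CLI can be queried:

    # List offerings and prices for one of the candidate GPUs.
    sky show-gpus A10G
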
service:
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
  # readiness_probe: /v1/models # Simpler alternative; a YAML mapping cannot repeat the readiness_probe key.
  replica_policy:
    min_replicas: 0
    max_replicas: 10
    target_qps_per_replica: 5
    upscale_delay_seconds: 300
    downscale_delay_seconds: 1200
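
The structured readiness_probe makes SkyServe POST a one-token chat completion to each replica, so a replica only counts as ready once the model can actually decode, not merely once the HTTP server is up. Roughly the equivalent request by hand (the endpoint URL here is hypothetical):

    # What the probe sends, expressed as curl against a running replica.
    curl http://<service-endpoint>/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "xtuner/llava-llama-3-8b-v1_1",
           "messages": [{"role": "user", "content": "Hello! What is your name?"}],
           "max_tokens": 1}'

With min_replicas: 0 the service scales to zero when idle, then scales back out once traffic exceeds roughly 5 QPS per replica, subject to the 300 s upscale and 1200 s downscale delays.
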
setup: |
  pip install hf_transfer
  # GPU monitoring with Grafana
  pip install nvidia-ml-py boto3
  # Alternative: git clone --depth=1 https://gist.github.com/ZackBradshaw/5a50952f445596d046649e2d38b12176 gpumon
  wget -O gpumon.py https://gist.githubusercontent.com/ZackBradshaw/5a50952f445596d046649e2d38b12176/raw/45135e5f90183bb6971bdddc276ab1e394ccb670/gpumon.py
  # Run the monitor in the background so setup can finish; logs go to gpumon.out.
  nohup python gpumon.py > gpumon.out 2>&1 &
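
The gist itself is not reproduced in this commit, so what gpumon.py reports is an assumption here; its nvidia-ml-py dependency suggests it polls NVML utilization counters. Those counters can be sanity-checked from the shell alongside the monitor's log:

    # Follow the backgrounded monitor's output.
    tail -f gpumon.out
    # Poll GPU/memory utilization every 5 s via the driver's NVML interface.
    nvidia-smi --query-gpu=utilization.gpu,utilization.memory,memory.used --format=csv -l 5
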
run: |
  # Serve with Docker; the token comes from the HF_TOKEN env var defined above
  # (the original commit hardcoded a HuggingFace token here, which should be rotated).
  docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
    -p 8080:8080 \
    --ipc=host \
    openmmlab/lmdeploy:latest \
    lmdeploy serve api_server $MODEL_NAME --server-port 8080
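
LMDeploy's api_server exposes an OpenAI-compatible API, which is why the readiness probe above can target /v1/chat/completions. A usage sketch against a local replica, assuming the port mapping above and that the server registers the model under its HuggingFace name:

    # Confirm the model is registered, then send one chat request.
    curl http://localhost:8080/v1/models
    curl http://localhost:8080/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "xtuner/llava-llama-3-8b-v1_1",
           "messages": [{"role": "user", "content": "Hello! What is your name?"}]}'
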