feat: llava yaml
ZackBradshaw committed May 1, 2024
1 parent ef4d454 commit bc413a6
Showing 1 changed file with 44 additions and 0 deletions.
44 changes: 44 additions & 0 deletions servers/llava/sky_serve.yaml
@@ -0,0 +1,44 @@
envs:
  MODEL_NAME: xtuner/llava-llama-3-8b-v1_1
  HF_HUB_ENABLE_HF_TRANSFER: True

resources:
  # accelerators: {L4:4, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} ## Large models
  accelerators: [L4, A10g, A100, A100-80GB, T4, M60] ## Small models
  # cpus: 32+
  memory: 32+
  # use_spot: True
  # disk_size: 512 # Ensure model checkpoints (~246GB) can fit.
  # disk_tier: best
  ports: 8080 # Expose to internet traffic.

service:
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
  # readiness_probe: /v1/models  # Simpler GET probe; YAML forbids duplicate keys, so use one form or the other.
  replica_policy:
    min_replicas: 1
    max_replicas: 10
    target_qps_per_replica: 2.5
    upscale_delay_seconds: 300
    downscale_delay_seconds: 1200
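    # Sizing note: with target_qps_per_replica: 2.5, sustained traffic of about
    # 25 QPS would push the autoscaler toward the max_replicas cap (25 / 2.5 = 10).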

setup: |
  pip install hf_transfer

run: |
  # Serve with Docker.
  docker run --runtime nvidia --gpus all \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HUGGING_FACE_HUB_TOKEN=<your-hf-token>" \
    -p 8080:8080 \
    --ipc=host \
    openmmlab/lmdeploy:latest \
    lmdeploy serve api_server $MODEL_NAME --server-port 8080
  # Note: replace <your-hf-token> with your own Hugging Face token;
  # never commit a real token to the repository.
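
For reference, a minimal sketch of how the file above could be deployed and smoke-tested, assuming SkyPilot's sky serve CLI; the service name llava-serve and the <service-endpoint> placeholder are illustrative, not part of this commit:

  # Deploy the YAML as a SkyServe service (service name is hypothetical).
  sky serve up servers/llava/sky_serve.yaml -n llava-serve

  # Once a replica is READY, send the same request the readiness probe posts.
  curl http://<service-endpoint>/v1/chat/completions \
    -H "Content-Type: application/json" \
    -d '{
          "model": "xtuner/llava-llama-3-8b-v1_1",
          "messages": [{"role": "user", "content": "Hello! What is your name?"}],
          "max_tokens": 1
        }'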
