📈 Add llava llama3 8b performance config
ZackBradshaw committed May 2, 2024
1 parent 35fd8f6 commit 1d53b4a
Showing 1 changed file with 52 additions and 0 deletions.
servers/llava/llava_llama3_8b.yaml (+52, -0)
@@ -0,0 +1,52 @@
envs:
  MODEL_NAME: xtuner/llava-llama-3-8b-v1_1
  HF_TOKEN: null # Required at launch, e.g. `sky serve up ... --env HF_TOKEN=...`; never commit real tokens.
  HF_HUB_ENABLE_HF_TRANSFER: True
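  # huggingface_hub reads this flag case-insensitively, so "True" enables
  # hf_transfer (installed in setup) for faster weight downloads.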

resources:
# accelerators: {L4:4, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8} ## Large models
  accelerators: [L4, A10G, A100, A100-80GB, T4, M60] ## Small models
# cpus: 32+
memory: 32+
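  # "32+" means at least 32 GB of host RAM; the accelerators list above is a
  # candidate set, and SkyPilot provisions whichever candidate is available.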
# use_spot: True
# disk_size: 512 # Ensure model checkpoints (~246GB) can fit.
# disk_tier: best
ports: 8080 # Expose to internet traffic.

service:
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
  # Alternative lightweight GET probe; YAML forbids duplicate keys, so only
  # one readiness_probe may be defined at a time.
  # readiness_probe: /v1/models
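  # SkyServe marks a replica ready once the probe above succeeds; the POST
  # variant with max_tokens: 1 exercises the full chat path cheaply.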
replica_policy:
min_replicas: 0
max_replicas: 10
target_qps_per_replica: 5
upscale_delay_seconds: 300
downscale_delay_seconds: 1200
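    # min_replicas: 0 allows scale-to-zero when idle (the first request after
    # an idle period waits for a cold start); the upscale/downscale delays
    # damp thrashing around the 5 QPS-per-replica target.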

setup: |
pip install hf_transfer
# GPU Monitoring with Grafana
  pip install nvidia-ml-py boto3
  # Fetch the gpumon script from the pinned gist revision.
  wget -O gpumon.py https://gist.githubusercontent.com/ZackBradshaw/5a50952f445596d046649e2d38b12176/raw/45135e5f90183bb6971bdddc276ab1e394ccb670/gpumon.py
  # Run the monitor in the background so setup can complete; logs go to gpumon.out.
  nohup python gpumon.py > gpumon.out 2>&1 &
run: |
# Serve With Docker
docker run --runtime nvidia --gpus all \
-v ~/.cache/huggingface:/root/.cache/huggingface \
--env "HUGGING_FACE_HUB_TOKEN=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"\
-p 8080:8080 \
--ipc=host \
openmmlab/lmdeploy:latest \
lmdeploy serve api_server $MODEL_NAME --server-port 8080
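
A minimal usage sketch for this config, assuming SkyPilot's `sky serve` CLI; the service name `llava-llama3` is an arbitrary example, not part of this commit:

    # Deploy the service, passing the required Hugging Face token.
    sky serve up servers/llava/llava_llama3_8b.yaml -n llava-llama3 --env HF_TOKEN=<your-token>

    # Once a replica is READY, query the OpenAI-compatible endpoint.
    ENDPOINT=$(sky serve status --endpoint llava-llama3)
    curl http://$ENDPOINT/v1/chat/completions \
      -H 'Content-Type: application/json' \
      -d '{"model": "xtuner/llava-llama-3-8b-v1_1", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 64}'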
