From 3ba68ee71ec61c6ceafdea1aeab78916bbec7dc6 Mon Sep 17 00:00:00 2001
From: Zack
Date: Thu, 2 May 2024 17:34:03 -0500
Subject: [PATCH] 🧬 Add llava phi 7b genetic algorithm config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 servers/llava/llava_phi_7b.yaml | 51 ++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 servers/llava/llava_phi_7b.yaml

diff --git a/servers/llava/llava_phi_7b.yaml b/servers/llava/llava_phi_7b.yaml
new file mode 100644
index 0000000..2ba69ee
--- /dev/null
+++ b/servers/llava/llava_phi_7b.yaml
@@ -0,0 +1,51 @@
+envs:
+  MODEL_NAME: xtuner/llava-phi-3-mini-hf
+  HF_HUB_ENABLE_HF_TRANSFER: True
+  HF_TOKEN: ""  # Supply at launch time (e.g. --env HF_TOKEN=...); never commit a real token.
+
+resources:
+  # accelerators: {L4:4, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}  ## Large models
+  accelerators: [L4, A10g, A100, A100-80GB, T4, M60]  ## Small models
+  # cpus: 32+
+  memory: 32+
+  # use_spot: True
+  # disk_size: 512  # Ensure model checkpoints (~246GB) can fit.
+  # disk_tier: best
+  ports: 8080  # Expose to internet traffic.
+
+service:
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_tokens: 1
+  # readiness_probe: /v1/models  # Simpler GET probe; use instead of the block above.
+  replica_policy:
+    min_replicas: 1
+    max_replicas: 10
+    target_qps_per_replica: 5
+    upscale_delay_seconds: 300
+    downscale_delay_seconds: 1200
+
+setup: |
+  pip install hf_transfer
+
+  # GPU Monitoring with Grafana
+  pip install nvidia-ml-py boto3
+  # git clone --depth=1 https://gist.github.com/ZackBradshaw/5a50952f445596d046649e2d38b12176 gpumon
+  wget -O gpumon.py https://gist.githubusercontent.com/ZackBradshaw/5a50952f445596d046649e2d38b12176/raw/45135e5f90183bb6971bdddc276ab1e394ccb670/gpumon.py
+  # Run the monitor in the background so setup does not block on it.
+  nohup python gpumon.py > gpumon.out 2>&1 &
+  cat gpumon.out
+
+run: |
+  # Serve With Docker
+  docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    -p 8080:8080 \
+    --ipc=host \
+    openmmlab/lmdeploy:latest \
+    lmdeploy serve api_server $MODEL_NAME --model-name llava-phi --server-port 8080
\ No newline at end of file
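
Post-diff note: a minimal deploy-and-probe sketch for this config, assuming a
SkyPilot installation with the `sky serve` CLI; the service name `llava-phi`,
the `<your-token>` placeholder, and the `--endpoint` flag usage are
illustrative assumptions, not part of the patch.

  # Deploy, passing the Hugging Face token at launch rather than committing it.
  sky serve up servers/llava/llava_phi_7b.yaml -n llava-phi --env HF_TOKEN=<your-token>

  # Once a replica is READY, exercise the same path the readiness probe posts to.
  ENDPOINT=$(sky serve status llava-phi --endpoint)  # assumes this flag exists in your SkyPilot version
  curl "http://${ENDPOINT}/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d '{"model": "llava-phi",
         "messages": [{"role": "user", "content": "Hello! What is your name?"}],
         "max_tokens": 16}'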