From 3ba68ee71ec61c6ceafdea1aeab78916bbec7dc6 Mon Sep 17 00:00:00 2001
From: Zack
Date: Thu, 2 May 2024 17:34:03 -0500
Subject: [PATCH] 🧬 Add llava phi 7b genetic algorithm config
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 servers/llava/llava_phi_7b.yaml | 51 ++++++++++++++++++++++++++++++++++
 1 file changed, 51 insertions(+)
 create mode 100644 servers/llava/llava_phi_7b.yaml

diff --git a/servers/llava/llava_phi_7b.yaml b/servers/llava/llava_phi_7b.yaml
new file mode 100644
index 0000000..2ba69ee
--- /dev/null
+++ b/servers/llava/llava_phi_7b.yaml
@@ -0,0 +1,51 @@
+envs:
+  MODEL_NAME: xtuner/llava-phi-3-mini-hf
+  HF_HUB_ENABLE_HF_TRANSFER: True
+  HF_TOKEN: ""  # Supply at launch time (e.g. --env HF_TOKEN=...); never commit a real token.
+
+resources:
+  # accelerators: {L4:4, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}  ## Large models
+  accelerators: [L4, A10g, A100, A100-80GB, T4, M60]  ## Small models
+  # cpus: 32+
+  memory: 32+
+  # use_spot: True
+  # disk_size: 512  # Ensure model checkpoints (~246GB) can fit.
+  # disk_tier: best
+  ports: 8080  # Expose to internet traffic.
+
+service:
+  readiness_probe:
+    path: /v1/chat/completions
+    post_data:
+      model: $MODEL_NAME
+      messages:
+        - role: user
+          content: Hello! What is your name?
+      max_tokens: 1
+  # readiness_probe: /v1/models  # Simpler GET probe; use instead of the block above.
+  replica_policy:
+    min_replicas: 1
+    max_replicas: 10
+    target_qps_per_replica: 5
+    upscale_delay_seconds: 300
+    downscale_delay_seconds: 1200
+
+setup: |
+  pip install hf_transfer
+
+  # GPU Monitoring with Grafana
+  pip install nvidia-ml-py boto3
+  # git clone --depth=1 https://gist.github.com/ZackBradshaw/5a50952f445596d046649e2d38b12176 gpumon
+  wget -O gpumon.py https://gist.githubusercontent.com/ZackBradshaw/5a50952f445596d046649e2d38b12176/raw/45135e5f90183bb6971bdddc276ab1e394ccb670/gpumon.py
+  # Run the monitor in the background so setup does not block on it.
+  nohup python gpumon.py > gpumon.out 2>&1 &
+  cat gpumon.out
+
+run: |
+  # Serve With Docker
+  docker run --runtime nvidia --gpus all \
+    -v ~/.cache/huggingface:/root/.cache/huggingface \
+    --env "HUGGING_FACE_HUB_TOKEN=$HF_TOKEN" \
+    -p 8080:8080 \
+    --ipc=host \
+    openmmlab/lmdeploy:latest \
+    lmdeploy serve api_server $MODEL_NAME --model-name llava-phi --server-port 8080
\ No newline at end of file
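
Post-diff note: a minimal deploy-and-probe sketch for this config, assuming a
SkyPilot installation with the `sky serve` CLI; the service name `llava-phi`,
the `<your-token>` placeholder, and the `--endpoint` flag usage are
illustrative assumptions, not part of the patch.

  # Deploy, passing the Hugging Face token at launch rather than committing it.
  sky serve up servers/llava/llava_phi_7b.yaml -n llava-phi --env HF_TOKEN=<your-token>

  # Once a replica is READY, exercise the same path the readiness probe posts to.
  ENDPOINT=$(sky serve status llava-phi --endpoint)  # assumes this flag exists in your SkyPilot version
  curl "http://${ENDPOINT}/v1/chat/completions" \
    -H 'Content-Type: application/json' \
    -d '{"model": "llava-phi",
         "messages": [{"role": "user", "content": "Hello! What is your name?"}],
         "max_tokens": 16}'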