forked from skypilot-org/skypilot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
llama-3_1.yaml
109 lines (100 loc) · 3.17 KB
/
llama-3_1.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
# Serving Meta Llama-3.1 on your own infra.
#
# Usage:
#
# # Launch Llama-3.1 8B on a single L4 GPU:
# HF_TOKEN=xxx sky launch llama-3_1.yaml -c llama31 --gpus L4:1 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3.1-8B-Instruct
#
# # Launch Llama-3.1 405B-FP8 on 8x A100-80GB GPUs:
# HF_TOKEN=xxx sky launch llama-3_1.yaml -c llama31 --gpus A100-80GB:8 --env HF_TOKEN --env MODEL_NAME=meta-llama/Meta-Llama-3.1-405B-Instruct-FP8
#
# curl /v1/chat/completions:
#
# ENDPOINT=$(sky status --endpoint 8081 llama31)
#
# curl http://$ENDPOINT/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{
# "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
# "messages": [
# {
# "role": "system",
# "content": "You are a helpful assistant."
# },
# {
# "role": "user",
# "content": "Who are you?"
# }
# ]
# }'
#
# Chat with model with Gradio UI (URLs printed in logs):
#
# Running on local URL: http://127.0.0.1:8811
# Running on public URL: https://<hash>.gradio.live
#
# Scale up with SkyServe:
# HF_TOKEN=xxx sky serve up llama-3_1.yaml -n llama31 --env HF_TOKEN --gpus L4:1 --env MODEL_NAME=meta-llama/Meta-Llama-3.1-8B-Instruct
#
# curl /v1/chat/completions:
#
# ENDPOINT=$(sky serve status --endpoint llama31)
# curl -L $ENDPOINT/v1/models
# curl -L http://$ENDPOINT/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{
# "model": "meta-llama/Meta-Llama-3.1-8B-Instruct",
# "messages": [
# {
# "role": "system",
# "content": "You are a helpful assistant."
# },
# {
# "role": "user",
# "content": "Who are you?"
# }
# ]
# }'
# Environment variables made available to the `setup` and `run` scripts.
# Either value can be overridden on the command line with `--env`.
envs:
  # Hugging Face model ID to serve.
  MODEL_NAME: meta-llama/Meta-Llama-3.1-8B-Instruct
  # Intentionally unset here. TODO: Fill with your own huggingface token, or
  # use --env to pass.
  HF_TOKEN: null
# SkyServe settings; these apply when the file is launched as a service.
service:
  # Readiness is checked with a real chat-completion request rather than a
  # plain GET, and the reply is capped at a single token.
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      max_tokens: 1
      messages:
        - role: user
          content: 'Hello! What is your name?'
  # Number of replicas to run.
  replicas: 2
# Hardware requested for each node.
resources:
  # Port opened to internet traffic; the API server in `run` listens on it.
  ports: 8081
  cpus: 32+
  # Large disk so that the model checkpoints can fit.
  disk_size: 1000
  disk_tier: best
  # Candidate accelerator configurations, written as NAME:COUNT.
  accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  # For the 8B model, cheaper accelerators are enough:
  # accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}
# One-time environment preparation, run before `run`.
setup: |
  # Pin vLLM and its flash-attention build. Both pins use the canonical
  # PEP 440 post-release spelling (`X.Y.Z.postN`).
  pip install vllm==0.5.3.post1
  pip install vllm-flash-attn==2.5.9.post1
  # Install Gradio for web UI.
  pip install gradio openai
# Starts the vLLM OpenAI-compatible API server on port 8081, waits until it
# reports ready, then starts a Gradio chat UI on port 8811 that talks to it.
run: |
  echo 'Starting vllm api server...'

  # Start from an empty log so a stale readiness line left by an earlier run
  # in this directory cannot end the wait loop prematurely.
  : > api_server.log

  # Run the server in the background, mirroring its output to api_server.log
  # so readiness can be detected below.
  vllm serve "$MODEL_NAME" \
    --tensor-parallel-size "$SKYPILOT_NUM_GPUS_PER_NODE" \
    --max-model-len 4096 \
    --port 8081 \
    2>&1 | tee api_server.log &
  # PID of the last command of the background pipeline (tee), which ends once
  # the server's output stream is closed.
  SERVER_PID=$!

  # Poll the log for the startup banner; fail fast instead of waiting forever
  # if the server pipeline has already terminated.
  while ! grep -q 'Uvicorn running on' api_server.log 2>/dev/null; do
    if ! kill -0 "$SERVER_PID" 2>/dev/null; then
      echo 'vllm api server exited before becoming ready; see api_server.log.' >&2
      exit 1
    fi
    echo 'Waiting for vllm api server to start...'
    sleep 5
  done

  echo 'Starting gradio server...'
  # Fetch the example web UI from the vLLM release matching the version pinned
  # in `setup`, so the script path does not drift with upstream's default
  # branch. `|| true` tolerates an existing checkout from a previous run.
  git clone --depth 1 --branch v0.5.3.post1 \
    https://github.com/vllm-project/vllm.git || true
  python vllm/examples/gradio_openai_chatbot_webserver.py \
    -m "$MODEL_NAME" \
    --port 8811 \
    --model-url http://localhost:8081/v1