[FEAT][llama3]
Kye committed Apr 20, 2024
1 parent 4efc9b2 commit 09bf430
Showing 3 changed files with 127 additions and 76 deletions.
2 changes: 1 addition & 1 deletion servers/cogvlm/sky_serve.yaml
@@ -100,7 +100,7 @@ service:

# Fields below describe each replica.
resources:
accelerators: {L4:8, A10g:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
accelerators: {L4:8, A10g:8, A100:4, A100:8, A100-80GB:2}
# cpus: 32+
# memory: 512+
# use_spot: True
125 changes: 125 additions & 0 deletions servers/llama3/sky_serve.yaml
@@ -0,0 +1,125 @@
# Serving Meta Llama-3 on your own infra.
#
# Usage:
#
# HF_TOKEN=xxx sky launch sky_serve.yaml -c llama3 --env HF_TOKEN
#
# curl /v1/chat/completions:
#
# ENDPOINT=$(sky status --endpoint 8081 llama3)
#
# # We need to manually specify the stop_token_ids to make sure the model finishes
# # on <|eot_id|>.
# curl http://$ENDPOINT/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{
# "model": "meta-llama/Meta-Llama-3-8B-Instruct",
# "messages": [
# {
# "role": "system",
# "content": "You are a helpful assistant."
# },
# {
# "role": "user",
# "content": "Who are you?"
# }
# ],
# "stop_token_ids": [128009, 128001]
# }'
#
# Chat with the model via the Gradio UI:
#
# Running on local URL: http://127.0.0.1:8811
# Running on public URL: https://<hash>.gradio.live
#
# Scale up with SkyServe:
# HF_TOKEN=xxx sky serve up sky_serve.yaml -n llama3 --env HF_TOKEN
#
# curl /v1/chat/completions:
#
# ENDPOINT=$(sky serve status --endpoint llama3)
# curl -L $ENDPOINT/v1/models
# curl -L http://$ENDPOINT/v1/chat/completions \
# -H "Content-Type: application/json" \
# -d '{
# "model": "databricks/llama3-instruct",
# "messages": [
# {
# "role": "system",
# "content": "You are a helpful assistant."
# },
# {
# "role": "user",
# "content": "Who are you?"
# }
# ],
# "stop_token_ids": [128009, 128001]
# }'


envs:
  MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
  # MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
  HF_TOKEN: hf_xxx # Change to your own Hugging Face token, or use --env to pass.

service:
  replicas: 2
  # An actual request for readiness probe.
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1

resources:
  # accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB} # We can use cheaper accelerators for the 8B model.
  # cpus: 32+
  # use_spot: True
  # disk_size: 512  # Ensure model checkpoints can fit.
  # disk_tier: best
  ports: 8081  # Expose to internet traffic.

setup: |
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi
  pip install vllm==0.4.0.post1
  # Install Gradio for web UI.
  pip install gradio openai
  pip install flash-attn==2.5.7
run: |
  conda activate vllm
  echo 'Starting vllm api server...'
  # https://github.com/vllm-project/vllm/issues/3098
  export PATH=$PATH:/sbin
  # NOTE: --gpu-memory-utilization 0.95 needed for 4-GPU nodes.
  python -u -m vllm.entrypoints.openai.api_server \
    --port 8081 \
    --model $MODEL_NAME \
    --trust-remote-code --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 64 \
    2>&1 | tee api_server.log &
  while ! grep -q 'Uvicorn running on' api_server.log; do
    echo 'Waiting for vllm api server to start...'
    sleep 5
  done
  echo 'Starting gradio server...'
  git clone https://github.com/vllm-project/vllm.git || true
  python vllm/examples/gradio_openai_chatbot_webserver.py \
    -m $MODEL_NAME \
    --port 8811 \
    --model-url http://localhost:8081/v1 \
    --stop-token-ids 128009,128001
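
For reference, a minimal Python client that mirrors the curl examples in the comments above. This is a sketch, not part of the commit: it assumes the openai package installed in the setup step, and an ENDPOINT environment variable holding the address printed by `sky serve status --endpoint llama3` (the localhost fallback is hypothetical).

    # Sketch: query the vLLM OpenAI-compatible server started by the run section.
    import os

    import openai

    # Hypothetical endpoint; replace with the address from `sky serve status`.
    endpoint = os.environ.get("ENDPOINT", "localhost:8081")

    # vLLM's OpenAI-compatible server does not check the API key.
    client = openai.OpenAI(base_url=f"http://{endpoint}/v1", api_key="EMPTY")

    response = client.chat.completions.create(
        model="meta-llama/Meta-Llama-3-8B-Instruct",  # must match MODEL_NAME
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Who are you?"},
        ],
        # Stop on <|eot_id|> / <|end_of_text|>, as the comments above require;
        # the OpenAI SDK forwards non-standard params via extra_body.
        extra_body={"stop_token_ids": [128009, 128001]},
    )
    print(response.choices[0].message.content)

Passing stop_token_ids through extra_body follows the same requirement noted for the curl examples, since the OpenAI SDK does not expose it as a first-class parameter.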
76 changes: 1 addition & 75 deletions servers/text_to_video/sky_serve.yaml
@@ -18,85 +18,11 @@ service:
upscale_delay_seconds: 300
downscale_delay_seconds: 1200

# # Advanced Kubernetes configurations (optional).
# kubernetes:
# # The networking mode for accessing SSH jump pod (optional).
# #
# # This must be either: 'nodeport' or 'portforward'. If not specified,
# # defaults to 'portforward'.

# #
# # nodeport: Exposes the jump pod SSH service on a static port number on each
# # Node, allowing external access using <NodeIP>:<NodePort>. Using this
# # mode requires opening multiple ports on nodes in the Kubernetes cluster.
# #
# # portforward: Uses `kubectl port-forward` to create a tunnel and directly
# # access the jump pod SSH service in the Kubernetes cluster. Does not
# # require opening ports on the cluster nodes and is more secure. 'portforward'
# # is used as default if 'networking' is not specified.
# networking: portforward

# # The mode to use for opening ports on Kubernetes
# #
# # This must be either: 'ingress' or 'loadbalancer'. If not specified,
# # defaults to 'loadbalancer'.
# #
# # loadbalancer: Creates services of type `LoadBalancer` to expose ports.
# # See https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#loadbalancer-service.
# # This mode is supported out of the box on most cloud managed Kubernetes
# # environments (e.g., GKE, EKS).
# #
# # ingress: Creates an ingress and a ClusterIP service for each port opened.
# # Requires an Nginx ingress controller to be configured on the Kubernetes cluster.
# # Refer to https://skypilot.readthedocs.io/en/latest/reference/kubernetes/kubernetes-setup.html#nginx-ingress
# # for details on deploying the NGINX ingress controller.
# ports: loadbalancer

# # Attach custom metadata to Kubernetes objects created by SkyPilot
# #
# # Uses the same schema as Kubernetes metadata object: https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.26/#objectmeta-v1-meta
# #
# # Since metadata is applied to all objects created by SkyPilot,
# # specifying 'name' and 'namespace' fields here is not allowed.
# # custom_metadata:
# # labels:
# # mylabel: myvalue
# # annotations:
# # myannotation: myvalue

# # Additional fields to override the pod fields used by SkyPilot (optional)
# #
# # Any key:value pairs added here would get added to the pod spec used to
# # create SkyPilot pods. The schema follows the same schema for a Pod object
# # in the Kubernetes API:
# # https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.26/#pod-v1-core
# #
# # # Some example use cases are shown below. All fields are optional.
# # pod_config:
# # spec:
# # runtimeClassName: nvidia # Custom runtimeClassName for GPU pods.
# # containers:
# # - env: # Custom environment variables for the pod, e.g., for proxy
# # - name: HTTP_PROXY
# # value: http://proxy-host:3128
# # volumeMounts: # Custom volume mounts for the pod
# # - mountPath: /foo
# # name: swarms
# # readOnly: true
# # volumes:
# # - name: swarms
# # hostPath:
# # path: /tmp
# # type: Directory
# # - name: swarms # Use this to modify the /dev/shm volume mounted by SkyPilot
# # emptyDir:
# # medium: Memory
# # sizeLimit: 3Gi # Set a size limit for the /dev/shm volume


# Fields below describe each replica.
resources:
accelerators: {L4:8, A10g:8}
accelerators: {L4:8, A10g:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
# cpus: 32+
# memory: 512+
# use_spot: True
