# Serving Meta Llama-3 on your own infra.
#
# Usage:
#
#   HF_TOKEN=xxx sky launch llama3.yaml -c llama3 --env HF_TOKEN
#
# curl /v1/chat/completions:
#
#   ENDPOINT=$(sky status --endpoint 8081 llama3)
#
#   # We need to manually specify the stop_token_ids to make sure the model
#   # finishes on <|eot_id|>.
#   curl http://$ENDPOINT/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#       "model": "meta-llama/Meta-Llama-3-8B-Instruct",
#       "messages": [
#         {
#           "role": "system",
#           "content": "You are a helpful assistant."
#         },
#         {
#           "role": "user",
#           "content": "Who are you?"
#         }
#       ],
#       "stop_token_ids": [128009, 128001]
#     }'
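#
# The same request via the OpenAI Python client -- a minimal sketch, assuming
# `pip install openai` (v1+) and that ENDPOINT is set as above; vLLM's
# OpenAI-compatible server accepts any api_key value:
#
#   import os
#   from openai import OpenAI
#
#   client = OpenAI(base_url=f"http://{os.environ['ENDPOINT']}/v1", api_key="EMPTY")
#   resp = client.chat.completions.create(
#       model="meta-llama/Meta-Llama-3-8B-Instruct",
#       messages=[
#           {"role": "system", "content": "You are a helpful assistant."},
#           {"role": "user", "content": "Who are you?"},
#       ],
#       # Non-standard parameters such as stop_token_ids go through extra_body.
#       extra_body={"stop_token_ids": [128009, 128001]},
#   )
#   print(resp.choices[0].message.content)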
#
# Chat with the model via the Gradio UI (URLs are printed in the logs):
#
#   Running on local URL:  http://127.0.0.1:8811
#   Running on public URL: https://<hash>.gradio.live
#
# Scale up with SkyServe:
#
#   HF_TOKEN=xxx sky serve up llama3.yaml -n llama3 --env HF_TOKEN
#
# curl /v1/chat/completions:
#
#   ENDPOINT=$(sky serve status --endpoint llama3)
#   curl -L $ENDPOINT/v1/models
#   curl -L http://$ENDPOINT/v1/chat/completions \
#     -H "Content-Type: application/json" \
#     -d '{
#       "model": "meta-llama/Meta-Llama-3-8B-Instruct",
#       "messages": [
#         {
#           "role": "system",
#           "content": "You are a helpful assistant."
#         },
#         {
#           "role": "user",
#           "content": "Who are you?"
#         }
#       ]
#     }'
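#
# Check replicas and tear the service down when done (standard SkyServe commands):
#
#   sky serve status llama3
#   sky serve down llama3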
envs:
  MODEL_NAME: meta-llama/Meta-Llama-3-70B-Instruct
  # MODEL_NAME: meta-llama/Meta-Llama-3-8B-Instruct
  HF_TOKEN: # TODO: Change to your own huggingface token, or use --env to pass.
service:
  replicas: 2
  # An actual request for readiness probe.
  readiness_probe:
    path: /v1/chat/completions
    post_data:
      model: $MODEL_NAME
      messages:
        - role: user
          content: Hello! What is your name?
      max_tokens: 1
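  # The probe above amounts to roughly this request against each replica -- a
  # sketch reconstructed from the fields above, not SkyServe's literal output:
  #
  #   curl http://<replica>:8081/v1/chat/completions \
  #     -H 'Content-Type: application/json' \
  #     -d '{"model": "<MODEL_NAME>", "messages": [{"role": "user", "content": "Hello! What is your name?"}], "max_tokens": 1}'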
resources:
  # The default MODEL_NAME above is the 70B model, which needs multiple GPUs.
  accelerators: {L4:8, A10g:8, A10:8, A100:4, A100:8, A100-80GB:2, A100-80GB:4, A100-80GB:8}
  # accelerators: {L4, A10g, A10, L40, A40, A100, A100-80GB}  # Cheaper accelerators suffice for the 8B model.
  # cpus: 32+
  # use_spot: True
  # disk_size: 512  # Ensure model checkpoints can fit.
  # disk_tier: best
  ports: 8081  # Expose to internet traffic.
setup: |
  conda activate vllm
  if [ $? -ne 0 ]; then
    conda create -n vllm python=3.10 -y
    conda activate vllm
  fi
  pip install vllm==0.4.0.post1
  # Install Gradio for web UI.
  pip install gradio openai
  pip install flash-attn==2.5.7
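  # Optional sanity check -- a sketch, not part of the original setup:
  # confirm the pinned vllm wheel imports cleanly on the provisioned node.
  python -c 'import vllm; print(vllm.__version__)'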
run: |
  conda activate vllm
  echo 'Starting vllm api server...'
  # https://github.com/vllm-project/vllm/issues/3098
  export PATH=$PATH:/sbin
  # NOTE: --gpu-memory-utilization 0.95 needed for 4-GPU nodes.
  python -u -m vllm.entrypoints.openai.api_server \
    --port 8081 \
    --model $MODEL_NAME \
    --trust-remote-code --tensor-parallel-size $SKYPILOT_NUM_GPUS_PER_NODE \
    --gpu-memory-utilization 0.95 \
    --max-num-seqs 64 \
    2>&1 | tee api_server.log &
  while ! grep -q 'Uvicorn running on' api_server.log 2>/dev/null; do
    echo 'Waiting for vllm api server to start...'
    sleep 5
  done
  echo 'Starting gradio server...'
  git clone https://github.com/vllm-project/vllm.git || true
  python vllm/examples/gradio_openai_chatbot_webserver.py \
    -m $MODEL_NAME \
    --port 8811 \
    --model-url http://localhost:8081/v1 \
    --stop-token-ids 128009,128001
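# The Gradio UI started in `run` listens on port 8811, which is not opened in
# `ports` above. Besides the public gradio.live link printed in the logs, one
# way to reach it -- a sketch, assuming SkyPilot's default SSH config entry
# for the cluster:
#
#   ssh -L 8811:localhost:8811 llama3
#   # then open http://127.0.0.1:8811 locally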