feat: add image for rayservice #7

Draft · wants to merge 1 commit into `main`
25 changes: 25 additions & 0 deletions ray/serve_config_examples/Dockerfile
@@ -0,0 +1,25 @@
FROM rayproject/ray:2.35.0-py310-cpu

USER root

RUN apt-get update && \
    apt-get install -y --no-install-recommends git git-lfs && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

RUN apt-get update && \
    apt-get install -y --no-install-recommends build-essential && \
    pip install torch --no-cache-dir -c /home/ray/requirements_compiled.txt && \
    pip install transformers --no-cache-dir -c /home/ray/requirements_compiled.txt && \
    apt-get purge -y build-essential && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

USER ray

RUN git clone https://hf-mirror.com/google-t5/t5-small
ADD --chown=ray:users text_ml.py /home/ray/text_ml.py

WORKDIR /home/ray
119 changes: 119 additions & 0 deletions ray/serve_config_examples/README.md
@@ -0,0 +1,119 @@
## serve_config_examples

Official examples from the Ray community:
- Inference service code: [serve_config_examples](https://github.com/ray-project/serve_config_examples)
- How to deploy on Kubernetes: [Deploy on Kubernetes](https://docs.ray.io/en/latest/serve/production-guide/kubernetes.html)

The `text_ml.py` in the official examples is not compatible with newer versions of Ray Serve, so it has been slightly modified here.

### Deploy as RayService

```yaml
# Make sure to increase resource requests and limits before using this example in production.
# For examples with more realistic resource configuration, see
# ray-cluster.complete.large.yaml and
# ray-cluster.autoscaler.large.yaml.
apiVersion: ray.io/v1
kind: RayService
metadata:
  annotations:
    neolink.ai/istio-dashboard-allow-user: "[email protected]"
    neolink.ai/istio-dashboard-unprotected: "true"
    neolink.ai/istio-gateway: "istio-system/ray-ingressgateway"
    neolink.ai/istio-domain: "10.161.0.67.nip.io"
    neolink.ai/istio-gateway-port: "31680"
  name: rayservice-sample
spec:
  # serveConfigV2 takes a yaml multi-line scalar, which should be a Ray Serve multi-application config. See https://docs.ray.io/en/latest/serve/multi-app.html.
  serveConfigV2: |
    applications:
      - name: text_ml_app
        import_path: text_ml.app
        route_prefix: /summarize_translate
        runtime_env:
          env_vars:
            HF_ENDPOINT: "https://hf-mirror.com"
        deployments:
          - name: Translator
            num_replicas: 1
            ray_actor_options:
              num_cpus: 0.2
            user_config:
              language: french
          - name: Summarizer
            num_replicas: 1
            ray_actor_options:
              num_cpus: 0.2
  rayClusterConfig:
    rayVersion: '2.35.0' # should match the Ray version in the image of the containers
    ###################### headGroupSpec #################################
    # Ray head pod template.
    headGroupSpec:
      # The `rayStartParams` are used to configure the `ray start` command.
      # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
      # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
      rayStartParams:
        dashboard-host: '0.0.0.0'
        num-cpus: '0'
      # Pod template
      template:
        metadata:
          annotations:
            sidecar.istio.io/inject: "true"
        spec:
          containers:
            - name: ray-head
              image: ghcr.io/bincherry/ray:text_ml
              imagePullPolicy: Always
              resources:
                limits:
                  cpu: 2
                  memory: 2Gi
                requests:
                  cpu: "0.1"
                  memory: 128Mi
              ports:
                - containerPort: 6379
                  name: gcs-server
                - containerPort: 8265 # Ray dashboard
                  name: dashboard
                - containerPort: 10001
                  name: client
                - containerPort: 8000
                  name: serve
    workerGroupSpecs:
      # The pod replicas in this group are typed as workers.
      - replicas: 1
        minReplicas: 1
        maxReplicas: 5
        # Logical group name; here it is called cpu1, but it can also be functional.
        groupName: cpu1
        # The `rayStartParams` are used to configure the `ray start` command.
        # See https://github.com/ray-project/kuberay/blob/master/docs/guidance/rayStartParams.md for the default settings of `rayStartParams` in KubeRay.
        # See https://docs.ray.io/en/latest/cluster/cli.html#ray-start for all available options in `rayStartParams`.
        rayStartParams:
          num-cpus: "1"
        # Pod template
        template:
          spec:
            containers:
              - name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
                image: ghcr.io/bincherry/ray:text_ml
                lifecycle:
                  preStop:
                    exec:
                      command: ["/bin/sh", "-c", "ray stop"]
                resources:
                  limits:
                    cpu: "1"
                    memory: "2Gi"
                  requests:
                    cpu: "0.1"
                    memory: 128Mi
```
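
Assuming the manifest above is saved locally (the filename `rayservice-sample.yaml` below is just an example) and the KubeRay operator is already installed, a minimal sketch of applying and checking it:

```shell
# Apply the RayService manifest (filename is an assumption)
kubectl apply -f rayservice-sample.yaml

# Watch the RayService and its cluster pods come up
kubectl get rayservice rayservice-sample
kubectl get pods
```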

### Client request

```shell
curl -X POST -H "Content-Type: application/json" 10.109.48.227:8000/summarize_translate -d '"Hello, how are you?"'
```
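
The IP above is specific to the author's cluster. If the Serve endpoint is not directly reachable, one way to test is to port-forward to the Serve service; a sketch that assumes KubeRay's default `<name>-serve-svc` service name:

```shell
# Forward the Serve port locally (service name is an assumption based on KubeRay's <name>-serve-svc convention)
kubectl port-forward svc/rayservice-sample-serve-svc 8000:8000

# In another terminal
curl -X POST -H "Content-Type: application/json" localhost:8000/summarize_translate -d '"Hello, how are you?"'
```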
19 changes: 19 additions & 0 deletions ray/serve_config_examples/build-scripts.sh
@@ -0,0 +1,19 @@
#!/bin/bash

RAY_VERSION=2.35.0
BASE_IMAGE=rayproject/ray:${RAY_VERSION}-py310-cpu

#REGISTRY=images.neolink-ai.com/matrixdc
REGISTRY=ghcr.io/bincherry

export DOCKER_BUILDKIT=1

EXTRA_ARGS="$@"

docker buildx build \
    --platform=linux/amd64 \
    --build-arg BASE_IMAGE=${BASE_IMAGE} \
    $EXTRA_ARGS \
    -t ${REGISTRY}/ray:text_ml \
    -f ./Dockerfile \
    .
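
Since `EXTRA_ARGS` forwards any arguments straight to `docker buildx build`, the image can be pushed in the same step; a usage sketch:

```shell
# Build only
./build-scripts.sh

# Build and push (the flag is forwarded to docker buildx build)
./build-scripts.sh --push
```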
71 changes: 71 additions & 0 deletions ray/serve_config_examples/text_ml.py
@@ -0,0 +1,71 @@
# Get and fix bugs from https://github.com/ray-project/serve_config_examples/blob/master/text_ml.py

from starlette.requests import Request
from typing import Dict

from ray import serve
from ray.serve.handle import DeploymentHandle

from transformers import pipeline


@serve.deployment
class Translator:
    def __init__(self):
        self.language = "french"
        self.model = pipeline("translation_en_to_fr", model="/home/ray/t5-small")

    def translate(self, text: str) -> str:
        model_output = self.model(text)

        translation = model_output[0]["translation_text"]

        return translation

    def reconfigure(self, config: Dict):
        self.language = config.get("language", "french")

        if self.language.lower() == "french":
            self.model = pipeline("translation_en_to_fr", model="t5-small")
        elif self.language.lower() == "german":
            self.model = pipeline("translation_en_to_de", model="t5-small")
        elif self.language.lower() == "romanian":
            self.model = pipeline("translation_en_to_ro", model="t5-small")
        else:
            pass


@serve.deployment
class Summarizer:
    def __init__(self, translator: DeploymentHandle):
        # Load model
        self.model = pipeline("summarization", model="t5-small")
        self.translator = translator
        self.min_length = 5
        self.max_length = 15

    def summarize(self, text: str) -> str:
        # Run inference
        model_output = self.model(
            text, min_length=self.min_length, max_length=self.max_length
        )

        # Post-process output to return only the summary text
        summary = model_output[0]["summary_text"]

        return summary

    async def __call__(self, http_request: Request) -> str:
        english_text: str = await http_request.json()
        summary = self.summarize(english_text)

        translation = await self.translator.translate.remote(summary)

        return translation

    def reconfigure(self, config: Dict):
        self.min_length = config.get("min_length", 5)
        self.max_length = config.get("max_length", 15)


app = Summarizer.bind(Translator.bind())
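
For a quick smoke test without Kubernetes, the application can also be started locally with the Serve CLI; a sketch that assumes Ray Serve, transformers, and torch are installed and that the model path in `Translator.__init__` resolves on the local machine (outside the image it may need to point at a local `t5-small` checkout or the Hub name):

```shell
# Start the app locally from the directory containing text_ml.py
serve run text_ml:app

# In another terminal; without the serveConfigV2 route_prefix, the app is served at /
curl -X POST -H "Content-Type: application/json" localhost:8000/ -d '"Hello, how are you?"'
```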
35 changes: 35 additions & 0 deletions ray/vllm/Dockerfile.cpu
@@ -0,0 +1,35 @@
ARG BASE_IMAGE

FROM ${BASE_IMAGE}

USER root

# install vllm-cpu
# https://docs.vllm.ai/en/latest/getting_started/cpu-installation.html
ARG VLLM_VERSION
RUN apt-get update && \
    apt-get install -y --no-install-recommends cmake gcc-12 g++-12 libnuma-dev && \
    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 && \
    pip --no-cache-dir install "cmake>=3.26" wheel packaging ninja "setuptools-scm>=8" numpy && \
    git clone https://github.com/vllm-project/vllm.git -b ${VLLM_VERSION} --depth 1 && \
    git clone https://github.com/oneapi-src/oneDNN.git -b rls-v3.5 --depth 1 && \
    pip install --no-cache-dir -v -r vllm/requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu && \
    cmake -B ./oneDNN/build -S ./oneDNN -G Ninja -DONEDNN_LIBRARY_TYPE=STATIC \
        -DONEDNN_BUILD_DOC=OFF \
        -DONEDNN_BUILD_EXAMPLES=OFF \
        -DONEDNN_BUILD_TESTS=OFF \
        -DONEDNN_BUILD_GRAPH=OFF \
        -DONEDNN_ENABLE_WORKLOAD=INFERENCE \
        -DONEDNN_ENABLE_PRIMITIVE=MATMUL && \
    cmake --build ./oneDNN/build --target install --config Release && \
    cd vllm && \
    VLLM_TARGET_DEVICE=cpu python setup.py install && \
    rm -rf ./oneDNN ./vllm && \
    apt-get purge -y cmake gcc-12 g++-12 && \
    apt-get autoremove -y && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

USER ray

ADD --chown=ray:users serve.py /home/ray/serve.py
9 changes: 9 additions & 0 deletions ray/vllm/Dockerfile.cuda
@@ -0,0 +1,9 @@
ARG BASE_IMAGE

FROM ${BASE_IMAGE}

# install vllm-cuda
ARG VLLM_VERSION
RUN pip --no-cache-dir install vllm==${VLLM_VERSION}

ADD --chown=ray:users serve.py /home/ray/serve.py
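
Neither vLLM Dockerfile is wired into a build script in this PR; both take `BASE_IMAGE` and `VLLM_VERSION` build args. A sketch of how they might be built, with placeholder base-image tags, vLLM versions, and image names:

```shell
# CPU variant: VLLM_VERSION is used as a git branch/tag (values below are placeholders)
docker buildx build --platform=linux/amd64 \
    --build-arg BASE_IMAGE=rayproject/ray:2.35.0-py310-cpu \
    --build-arg VLLM_VERSION=v0.6.1 \
    -f Dockerfile.cpu -t ghcr.io/bincherry/ray:vllm-cpu .

# CUDA variant: VLLM_VERSION is used as a pip version (values below are placeholders)
docker buildx build --platform=linux/amd64 \
    --build-arg BASE_IMAGE=rayproject/ray:2.35.0-py310-gpu \
    --build-arg VLLM_VERSION=0.6.1 \
    -f Dockerfile.cuda -t ghcr.io/bincherry/ray:vllm-cuda .
```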