From 77a2e8c39097e7e4291aac4285d899c9605a4f4a Mon Sep 17 00:00:00 2001 From: Xinyao Wang Date: Thu, 7 Nov 2024 12:32:08 +0800 Subject: [PATCH] remove vllm-on-ray Signed-off-by: Xinyao Wang --- .../docker_compose/intel/hpu/gaudi/README.md | 17 +- .../intel/hpu/gaudi/compose_vllm_ray.yaml | 172 ---------------- ChatQnA/docker_image_build/build.yaml | 6 - .../tests/test_compose_vllm_ray_on_gaudi.sh | 183 ------------------ docker_images_list.md | 1 - 5 files changed, 2 insertions(+), 377 deletions(-) delete mode 100644 ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml delete mode 100644 ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md index d3237e2fe..a922ec031 100644 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/README.md +++ b/ChatQnA/docker_compose/intel/hpu/gaudi/README.md @@ -26,7 +26,7 @@ To set up environment variables for deploying ChatQnA services, follow these ste export http_proxy="Your_HTTP_Proxy" export https_proxy="Your_HTTPs_Proxy" # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" - export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service,vllm-ray-service,guardrails + export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service,guardrails ``` 3. Set up other environment variables: @@ -227,7 +227,7 @@ For users in China who are unable to download models directly from Huggingface, export http_proxy="Your_HTTP_Proxy" export https_proxy="Your_HTTPs_Proxy" # Example: no_proxy="localhost, 127.0.0.1, 192.168.1.1" - export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service,vllm-ray-service,guardrails + export no_proxy="Your_No_Proxy",chatqna-gaudi-ui-server,chatqna-gaudi-backend-server,dataprep-redis-service,tei-embedding-service,retriever,tei-reranking-service,tgi-service,vllm_service,guardrails ``` 3. Set up other environment variables: @@ -257,12 +257,6 @@ If use vllm for llm backend. docker compose -f compose_vllm.yaml up -d ``` -If use vllm-on-ray for llm backend. - -```bash -docker compose -f compose_vllm_ray.yaml up -d -``` - If you want to enable guardrails microservice in the pipeline, please follow the below command instead: ```bash @@ -351,13 +345,6 @@ For validation details, please refer to [how-to-validate_service](./how_to_valid }' ``` - ```bash - #vLLM-on-Ray Service - curl http://${host_ip}:8006/v1/chat/completions \ - -H "Content-Type: application/json" \ - -d '{"model": "${LLM_MODEL_ID}", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' - ``` - 5. 
MegaService ```bash diff --git a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml b/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml deleted file mode 100644 index a1019c9ac..000000000 --- a/ChatQnA/docker_compose/intel/hpu/gaudi/compose_vllm_ray.yaml +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -services: - redis-vector-db: - image: redis/redis-stack:7.2.0-v9 - container_name: redis-vector-db - ports: - - "6379:6379" - - "8001:8001" - dataprep-redis-service: - image: ${REGISTRY:-opea}/dataprep-redis:${TAG:-latest} - container_name: dataprep-redis-server - depends_on: - - redis-vector-db - - tei-embedding-service - ports: - - "6007:6007" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - REDIS_URL: redis://redis-vector-db:6379 - REDIS_HOST: redis-vector-db - INDEX_NAME: ${INDEX_NAME} - TEI_ENDPOINT: http://tei-embedding-service:80 - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - tei-embedding-service: - image: ghcr.io/huggingface/tei-gaudi:latest - container_name: tei-embedding-gaudi-server - ports: - - "8090:80" - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 - command: --model-id ${EMBEDDING_MODEL_ID} --auto-truncate - retriever: - image: ${REGISTRY:-opea}/retriever-redis:${TAG:-latest} - container_name: retriever-redis-server - depends_on: - - redis-vector-db - ports: - - "7000:7000" - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - REDIS_URL: redis://redis-vector-db:6379 - REDIS_HOST: redis-vector-db - INDEX_NAME: ${INDEX_NAME} - TEI_EMBEDDING_ENDPOINT: http://tei-embedding-service:80 - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - restart: unless-stopped - tei-reranking-service: - image: ghcr.io/huggingface/tei-gaudi:latest - container_name: tei-reranking-gaudi-server - ports: - - "8808:80" - volumes: - - "./data:/data" - runtime: habana - cap_add: - - SYS_NICE - ipc: host - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HF_HUB_DISABLE_PROGRESS_BARS: 1 - HF_HUB_ENABLE_HF_TRANSFER: 0 - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - MAX_WARMUP_SEQUENCE_LENGTH: 512 - command: --model-id ${RERANK_MODEL_ID} --auto-truncate - vllm-ray-service: - image: ${REGISTRY:-opea}/llm-vllm-ray-hpu:${TAG:-latest} - container_name: vllm-ray-gaudi-server - ports: - - "8006:8000" - volumes: - - "./data:/data" - environment: - no_proxy: ${no_proxy} - http_proxy: ${http_proxy} - https_proxy: ${https_proxy} - HUGGINGFACEHUB_API_TOKEN: ${HUGGINGFACEHUB_API_TOKEN} - HABANA_VISIBLE_DEVICES: all - OMPI_MCA_btl_vader_single_copy_mechanism: none - LLM_MODEL_ID: ${LLM_MODEL_ID} - runtime: habana - cap_add: - - SYS_NICE - ipc: host - command: /bin/bash -c "ray start --head && python vllm_ray_openai.py --port_number 8000 --model_id_or_path $LLM_MODEL_ID --tensor_parallel_size 2 --enforce_eager True" - chatqna-gaudi-backend-server: - image: ${REGISTRY:-opea}/chatqna:${TAG:-latest} - container_name: chatqna-gaudi-backend-server - depends_on: - - redis-vector-db - - tei-embedding-service - - retriever - - 
tei-reranking-service - - vllm-ray-service - ports: - - "8888:8888" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - MEGA_SERVICE_HOST_IP=chatqna-gaudi-backend-server - - EMBEDDING_SERVER_HOST_IP=tei-embedding-service - - EMBEDDING_SERVER_PORT=${EMBEDDING_SERVER_PORT:-80} - - RETRIEVER_SERVICE_HOST_IP=retriever - - RERANK_SERVER_HOST_IP=tei-reranking-service - - RERANK_SERVER_PORT=${RERANK_SERVER_PORT:-80} - - LLM_SERVER_HOST_IP=vllm-ray-service - - LLM_SERVER_PORT=${LLM_SERVER_PORT:-8000} - - LLM_MODEL=${LLM_MODEL_ID} - - LOGFLAG=${LOGFLAG} - ipc: host - restart: always - chatqna-gaudi-ui-server: - image: ${REGISTRY:-opea}/chatqna-ui:${TAG:-latest} - container_name: chatqna-gaudi-ui-server - depends_on: - - chatqna-gaudi-backend-server - ports: - - "5173:5173" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - ipc: host - restart: always - chatqna-gaudi-nginx-server: - image: ${REGISTRY:-opea}/nginx:${TAG:-latest} - container_name: chatqna-gaudi-nginx-server - depends_on: - - chatqna-gaudi-backend-server - - chatqna-gaudi-ui-server - ports: - - "${NGINX_PORT:-80}:80" - environment: - - no_proxy=${no_proxy} - - https_proxy=${https_proxy} - - http_proxy=${http_proxy} - - FRONTEND_SERVICE_IP=chatqna-gaudi-ui-server - - FRONTEND_SERVICE_PORT=5173 - - BACKEND_SERVICE_NAME=chatqna - - BACKEND_SERVICE_IP=chatqna-gaudi-backend-server - - BACKEND_SERVICE_PORT=8888 - - DATAPREP_SERVICE_IP=dataprep-redis-service - - DATAPREP_SERVICE_PORT=6007 - ipc: host - restart: always - -networks: - default: - driver: bridge diff --git a/ChatQnA/docker_image_build/build.yaml b/ChatQnA/docker_image_build/build.yaml index ca3139de4..54f357f04 100644 --- a/ChatQnA/docker_image_build/build.yaml +++ b/ChatQnA/docker_image_build/build.yaml @@ -83,12 +83,6 @@ services: dockerfile: comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu extends: chatqna image: ${REGISTRY:-opea}/llm-vllm-hpu:${TAG:-latest} - llm-vllm-ray-hpu: - build: - context: GenAIComps - dockerfile: comps/llms/text-generation/vllm/ray/dependency/Dockerfile - extends: chatqna - image: ${REGISTRY:-opea}/llm-vllm-ray-hpu:${TAG:-latest} dataprep-redis: build: context: GenAIComps diff --git a/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh b/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh deleted file mode 100644 index d7d1dbe6b..000000000 --- a/ChatQnA/tests/test_compose_vllm_ray_on_gaudi.sh +++ /dev/null @@ -1,183 +0,0 @@ -#!/bin/bash -# Copyright (C) 2024 Intel Corporation -# SPDX-License-Identifier: Apache-2.0 - -set -e -IMAGE_REPO=${IMAGE_REPO:-"opea"} -IMAGE_TAG=${IMAGE_TAG:-"latest"} -echo "REGISTRY=IMAGE_REPO=${IMAGE_REPO}" -echo "TAG=IMAGE_TAG=${IMAGE_TAG}" -export REGISTRY=${IMAGE_REPO} -export TAG=${IMAGE_TAG} - -WORKPATH=$(dirname "$PWD") -LOG_PATH="$WORKPATH/tests" -ip_address=$(hostname -I | awk '{print $1}') - -function build_docker_images() { - cd $WORKPATH/docker_image_build - git clone https://github.com/opea-project/GenAIComps.git && cd GenAIComps && git checkout "${opea_branch:-"main"}" && cd ../ - - echo "Build all the images with --no-cache, check docker_image_build.log for details..." 
- service_list="chatqna chatqna-ui dataprep-redis retriever-redis llm-vllm-ray-hpu nginx" - docker compose -f build.yaml build ${service_list} --no-cache > ${LOG_PATH}/docker_image_build.log - - docker pull ghcr.io/huggingface/text-embeddings-inference:cpu-1.5 - docker pull ghcr.io/huggingface/tei-gaudi:latest - docker images && sleep 1s -} - -function start_services() { - - cd $WORKPATH/docker_compose/intel/hpu/gaudi - export EMBEDDING_MODEL_ID="BAAI/bge-base-en-v1.5" - export RERANK_MODEL_ID="BAAI/bge-reranker-base" - export LLM_MODEL_ID="Intel/neural-chat-7b-v3-3" - export INDEX_NAME="rag-redis" - export HUGGINGFACEHUB_API_TOKEN=${HUGGINGFACEHUB_API_TOKEN} - - # Start Docker Containers - docker compose -f compose_vllm_ray.yaml up -d > ${LOG_PATH}/start_services_with_compose.log - n=0 - until [[ "$n" -ge 100 ]]; do - echo "n=$n" - docker logs vllm-ray-gaudi-server > vllm_ray_service_start.log - if grep -q "Warmup finished" vllm_ray_service_start.log; then - break - fi - sleep 5s - n=$((n+1)) - done -} - -function validate_services() { - local URL="$1" - local EXPECTED_RESULT="$2" - local SERVICE_NAME="$3" - local DOCKER_NAME="$4" - local INPUT_DATA="$5" - - local HTTP_STATUS=$(curl -s -o /dev/null -w "%{http_code}" -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL") - if [ "$HTTP_STATUS" -eq 200 ]; then - echo "[ $SERVICE_NAME ] HTTP status is 200. Checking content..." - - local CONTENT=$(curl -s -X POST -d "$INPUT_DATA" -H 'Content-Type: application/json' "$URL" | tee ${LOG_PATH}/${SERVICE_NAME}.log) - - if echo "$CONTENT" | grep -q "$EXPECTED_RESULT"; then - echo "[ $SERVICE_NAME ] Content is as expected." - else - echo "[ $SERVICE_NAME ] Content does not match the expected result: $CONTENT" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - else - echo "[ $SERVICE_NAME ] HTTP status is not 200. Received status was $HTTP_STATUS" - docker logs ${DOCKER_NAME} >> ${LOG_PATH}/${SERVICE_NAME}.log - exit 1 - fi - sleep 1s -} - -function validate_microservices() { - # Check if the microservices are running correctly. 
- - # tei for embedding service - validate_services \ - "${ip_address}:8090/embed" \ - "\[\[" \ - "tei-embedding" \ - "tei-embedding-gaudi-server" \ - '{"inputs":"What is Deep Learning?"}' - - sleep 1m # retrieval can't curl as expected, try to wait for more time - - # retrieval microservice - test_embedding=$(python3 -c "import random; embedding = [random.uniform(-1, 1) for _ in range(768)]; print(embedding)") - validate_services \ - "${ip_address}:7000/v1/retrieval" \ - " " \ - "retrieval" \ - "retriever-redis-server" \ - "{\"text\":\"What is the revenue of Nike in 2023?\",\"embedding\":${test_embedding}}" - - # tei for rerank microservice - validate_services \ - "${ip_address}:8808/rerank" \ - '{"index":1,"score":' \ - "tei-rerank" \ - "tei-reranking-gaudi-server" \ - '{"query":"What is Deep Learning?", "texts": ["Deep Learning is not...", "Deep learning is..."]}' - - # vllm-on-ray for llm service - validate_services \ - "${ip_address}:8006/v1/chat/completions" \ - "content" \ - "vllm-ray-llm" \ - "vllm-ray-gaudi-server" \ - '{"model": "Intel/neural-chat-7b-v3-3", "messages": [{"role": "user", "content": "What is Deep Learning?"}]}' -} - -function validate_megaservice() { - # Curl the Mega Service - validate_services \ - "${ip_address}:8888/v1/chatqna" \ - "data: " \ - "mega-chatqna" \ - "chatqna-gaudi-backend-server" \ - '{"messages": "What is the revenue of Nike in 2023?"}' - -} - -function validate_frontend() { - cd $WORKPATH/ui/svelte - local conda_env_name="OPEA_e2e" - export PATH=${HOME}/miniforge3/bin/:$PATH - if conda info --envs | grep -q "$conda_env_name"; then - echo "$conda_env_name exist!" - else - conda create -n ${conda_env_name} python=3.12 -y - fi - source activate ${conda_env_name} - - sed -i "s/localhost/$ip_address/g" playwright.config.ts - - conda install -c conda-forge nodejs -y - npm install && npm ci && npx playwright install --with-deps - node -v && npm -v && pip list - - exit_status=0 - npx playwright test || exit_status=$? - - if [ $exit_status -ne 0 ]; then - echo "[TEST INFO]: ---------frontend test failed---------" - exit $exit_status - else - echo "[TEST INFO]: ---------frontend test passed---------" - fi -} - -function stop_docker() { - cd $WORKPATH/docker_compose/intel/hpu/gaudi - docker compose -f compose_vllm_ray.yaml down -} - -function main() { - - stop_docker - if [[ "$IMAGE_REPO" == "opea" ]]; then build_docker_images; fi - start_time=$(date +%s) - start_services - end_time=$(date +%s) - duration=$((end_time-start_time)) - echo "Mega service start duration is $duration s" - - validate_microservices - validate_megaservice - # validate_frontend - - stop_docker - echo y | docker system prune - -} - -main diff --git a/docker_images_list.md b/docker_images_list.md index d90ae08dc..c6acc0818 100644 --- a/docker_images_list.md +++ b/docker_images_list.md @@ -80,7 +80,6 @@ Take ChatQnA for example. 
ChatQnA is a chatbot application service based on the | [opea/llm-vllm-hpu](https://hub.docker.com/r/opea/llm-vllm-hpu) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/llms/text-generation/vllm/langchain/dependency/Dockerfile.intel_hpu) | The docker image exposed the OPEA LLM microservice upon vLLM docker image for use by GenAI apps on the Gaudi | | [opea/llm-vllm-llamaindex](https://hub.docker.com/r/opea/llm-vllm-llamaindex) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/llms/text-generation/vllm/llama_index/Dockerfile) | This docker image exposes OPEA LLM microservices to the llamaindex framework's vLLM Docker image for use by GenAI applications | | [opea/llm-vllm-llamaindex-hpu](https://hub.docker.com/r/opea/llm-vllm-llamaindex-hpu) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/llms/text-generation/vllm/llama_index/dependency/Dockerfile.intel_hpu) | This docker image exposes OPEA LLM microservices to the llamaindex framework's vLLM Docker image for use by GenAI applications on the gaudi | -| [opea/llm-vllm-ray-hpu](https://hub.docker.com/r/opea/llm-vllm-ray-hpu) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/llms/text-generation/vllm/ray/dependency/Dockerfile) | The docker image exposes Ray-based OPEA LLM microservices upon the vLLM Docker image for use by GenAI applications on the Gaudi | | [opea/llava-hpu](https://hub.docker.com/r/opea/llava-hpu) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/lvms/llava/dependency/Dockerfile.intel_hpu) | The docker image exposed the OPEA microservice running LLaVA as a large visual model (LVM) service for GenAI application use on the Gaudi | | [opea/lvm-tgi](https://hub.docker.com/r/opea/lvm-tgi) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/lvms/tgi-llava/Dockerfile) | This docker image is designed to build a large visual model (LVM) microservice using the HuggingFace Text Generation Inference(TGI) framework. The microservice accepts document input and generates a answer to question. | | [opea/lvm-llava](https://hub.docker.com/r/opea/lvm-llava) | [Link](https://github.com/opea-project/GenAIComps/blob/main/comps/lvms/llava/dependency/Dockerfile) | The docker image exposed the OPEA microservice running LLaVA as a large visual model (LVM) server for GenAI application use |
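With the vLLM-on-Ray compose file and its test removed, the plain vLLM compose file is the remaining vLLM path on Gaudi. As a minimal sanity check after applying this patch, the sketch below reuses only commands and ports that appear in the README and compose/test files above; it assumes `host_ip` and the model/proxy environment variables are exported as described in the README, and that the images have already been built or pulled.

```bash
# Bring up the ChatQnA stack with vLLM as the LLM backend
# (same compose file referenced in the README hunk above)
cd ChatQnA/docker_compose/intel/hpu/gaudi
docker compose -f compose_vllm.yaml up -d

# Once the containers report ready, query the MegaService on port 8888
# (the chatqna-gaudi-backend-server port published in the compose files)
curl http://${host_ip}:8888/v1/chatqna \
    -H "Content-Type: application/json" \
    -d '{"messages": "What is Deep Learning?"}'
```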