From c8c5f14ad93f8c0ac92b1de2ca4e9143aa60b2fb Mon Sep 17 00:00:00 2001
From: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com>
Date: Mon, 4 Dec 2023 09:14:54 -0800
Subject: [PATCH] Support TRTLLM model in the benchmark script (#442)

* Support TRTLLM model and use vLLM backend

* Align spaces

* Move comment

* Specify shape of input tensors

* Fix pre-commit hooks

* Fix metric error when there is only single response

* Specify backend type for distinguishing input data

* Raise error when unknown backend specified.

* Change to direct string comparison
---
 .../perf_analyzer/docs/examples/profile.py | 136 ++++++++++++++----
 src/c++/perf_analyzer/docs/llm.md          |  40 +++---
 2 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py
index 534dcec95..7b79c9848 100644
--- a/src/c++/perf_analyzer/docs/examples/profile.py
+++ b/src/c++/perf_analyzer/docs/examples/profile.py
@@ -133,11 +133,11 @@ def get_postfix(args, prompt_size):
     """Generate postfix for profile export filename and plot.
 
     e.g.
-    - trtllm-prompt100-maxtokens256
-    - trtllm-prompt100-periodic1_100_1-period32-maxtokens1024
+    - trtllm-ensemble-prompt100-maxtokens256
+    - trtllm-ensemble-prompt100-periodic1_100_1-period32-maxtokens1024
     """
     stream_type = "offline" if args.offline else "online"
-    postfix = f"{args.model}-{stream_type}-prompt{prompt_size}-"
+    postfix = f"{args.backend}-{args.model}-{stream_type}-prompt{prompt_size}-"
     if args.periodic_concurrency_range:
         start, end, step = args.periodic_concurrency_range
         postfix += f"periodic{start}_{end}_{step}-period{args.request_period}-"
@@ -265,11 +265,12 @@ def collect_online_metrics(export_data, output_tokens):
     for r in requests:
         init_request, responses = r["timestamp"], r["response_timestamps"]
         first_token_latency = (responses[0] - init_request) / 1_000_000
-        generation_latency_ms = (responses[-1] - responses[0]) / 1_000_000  # msec
-        generation_latency_s = (responses[-1] - responses[0]) / 1_000_000_000  # sec
         first_token_latencies.append(first_token_latency)
-        generation_latencies.append(generation_latency_ms)
-        generation_throughputs.append(output_tokens / generation_latency_s)
+        if args.max_tokens > 1:
+            generation_latency_ms = (responses[-1] - responses[0]) / 1_000_000  # msec
+            generation_latency_s = (responses[-1] - responses[0]) / 1_000_000_000  # sec
+            generation_latencies.append(generation_latency_ms)
+            generation_throughputs.append(output_tokens / generation_latency_s)
         for prev_res, res in pairwise(responses):
             token_to_token_latencies.append((res - prev_res) / 1_000_000)
     return (
@@ -290,8 +291,6 @@ def calculate_online_metrics(args, profile_result, export_data):
         generation_throughputs,
     ) = latencies
 
-    profile_result.avg_total_t2t_latency = np.mean(token_to_token_latencies)
-
     profile_result.max_first_token_latency = max(first_token_latencies)
     profile_result.min_first_token_latency = min(first_token_latencies)
     profile_result.avg_first_token_latency = np.mean(first_token_latencies)
@@ -309,6 +308,8 @@
     )
 
     if args.max_tokens > 1:
+        profile_result.avg_total_t2t_latency = np.mean(token_to_token_latencies)
+
         profile_result.max_gen_latency = max(generation_latencies)
         profile_result.min_gen_latency = min(generation_latencies)
         profile_result.avg_gen_latency = np.mean(generation_latencies)
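The `max_tokens > 1` guard introduced above matters because a request that returns only a single response has no generation window: `responses[-1]` equals `responses[0]`, so the generation latency is zero and the throughput division is undefined. A minimal sketch of the arithmetic follows; it is not part of the patch, the function and variable names are hypothetical, and it only assumes nanosecond timestamps as used in the latency conversions above.

```python
# Illustrative sketch only; names are hypothetical and not taken from profile.py.
def summarize_request(request_ns, response_ns, output_tokens):
    """Derive per-request latency metrics from raw timestamps (nanoseconds)."""
    metrics = {
        # Time from sending the request to receiving the first response.
        "first_token_latency_ms": (response_ns[0] - request_ns) / 1_000_000,
    }
    if len(response_ns) > 1:  # generation metrics need more than one response
        generation_ns = response_ns[-1] - response_ns[0]
        metrics["generation_latency_ms"] = generation_ns / 1_000_000
        # Tokens produced over the generation window, in tokens/second.
        metrics["generation_throughput"] = output_tokens / (generation_ns / 1_000_000_000)
    return metrics


if __name__ == "__main__":
    # Example: three responses spaced 1 ms apart, 256 output tokens in total.
    print(summarize_request(0, [5_000_000, 6_000_000, 7_000_000], 256))
```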
@@ -420,6 +421,13 @@ def profile(args, export_file):
         f"--input-data={INPUT_FILENAME} "
         f"--profile-export-file={export_file} "
     )
+    if args.backend == "trtllm":
+        command += (
+            "--shape=text_input:1 "
+            "--shape=max_tokens:1 "
+            "--shape=bad_words:1 "
+            "--shape=stop_words:1 "
+        )
     if args.periodic_concurrency_range:
         start, end, step = args.periodic_concurrency_range
         command += (
@@ -449,13 +457,13 @@ def prepare_export_file(args, prompt):
 
 def prepare_input_data(input_data, prompt):
     """Insert the prompt to send into input JSON data."""
-    input_data["data"][0]["PROMPT"] = [prompt]
+    input_data["data"][0]["text_input"] = [prompt]
     save_json_data(input_data, INPUT_FILENAME)
 
 
 def generate_prompts(args, input_data):
     """Generate dummy prompts if not specified by input JSON file."""
-    prompt = input_data["data"][0]["PROMPT"][0]
+    prompt = input_data["data"][0]["text_input"][0]
     if not prompt:  # Generate dummy prompt
         assert args.prompt_size_range, "Must specify --prompt-size-range."
 
@@ -464,28 +472,41 @@
     return [prompt]
 
 
-def construct_input_data(args):
-    """Construct input data that contains input tensors and parameters.
+def construct_vllm_input_data(args):
+    """Construct input data that contains input tensors and parameters for vLLM.
 
     Parse the input JSON file (if exists) to construct the input data.
     When user sets parameters through command line, overwrite the
     parameters set by input JSON file.
     """
-    prompt = ""
-    stream = True
-    sampling_params = {}
+    # Default sampling parameters
+    sampling_params = {
+        "max_tokens": 256,
+        "ignore_eos": False,
+    }
 
     if args.input_data:
-        data = load_json_data(filename=args.input_data)["data"][0]
-        stream = data["STREAM"][0] if "STREAM" in data else stream
-        prompt = data["PROMPT"][0] if "PROMPT" in data else prompt
-        if "SAMPLING_PARAMETERS" in data:
-            sampling_params = json.loads(data["SAMPLING_PARAMETERS"][0])
+        input_data = load_json_data(filename=args.input_data)
+        if "sampling_parameters" in input_data["data"][0]:
+            loaded_params = input_data["data"][0]["sampling_parameters"][0]
+            loaded_params = json.loads(loaded_params or "null")
+            sampling_params = loaded_params if loaded_params else sampling_params
+    else:
+        # Default input JSON
+        input_data = {
+            "data": [
+                {
+                    "text_input": [""],
+                    "stream": [True],
+                    "sampling_parameters": [""],
+                }
+            ]
+        }
 
     # If command line option is specified, overwrite
     if args.offline:
-        stream = False
-    elif not stream:
+        input_data["data"][0]["stream"] = [False]
+    elif not input_data["data"][0]["stream"]:
         args.offline = True
 
     if args.max_tokens:
@@ -496,20 +517,66 @@
         args.max_tokens = 256  # default
         sampling_params["max_tokens"] = args.max_tokens
 
-    if "ignore_eos" not in sampling_params:
+    if args.ignore_eos:
+        sampling_params["ignore_eos"] = args.ignore_eos
+    elif "ignore_eos" in sampling_params:
+        args.ignore_eos = sampling_params["ignore_eos"]
+    else:
+        args.ignore_eos = False  # default
         sampling_params["ignore_eos"] = args.ignore_eos
-    elif args.ignore_eos:
-        sampling_params["ignore_eos"] = True
 
-    input_data = {"data": [{}]}
-    input_data["data"][0]["PROMPT"] = [prompt]
-    input_data["data"][0]["STREAM"] = [stream]
-    input_data["data"][0]["SAMPLING_PARAMETERS"] = [json.dumps(sampling_params)]
+    input_data["data"][0]["sampling_parameters"] = [json.dumps(sampling_params)]
+
     return input_data
 
 
+def construct_trtllm_input_data(args):
+    """Construct input data that contains input tensors and parameters for TRT-LLM.
+
+    Parse the input JSON file (if exists) to construct the input data.
+    When user sets parameters through command line, overwrite the
+    parameters set by input JSON file.
+    """
+    if args.input_data:
+        input_data = load_json_data(filename=args.input_data)
+    else:
+        # Default input JSON
+        input_data = {
+            "data": [
+                {
+                    "text_input": [""],
+                    "stream": [True],
+                    "max_tokens": [256],
+                    "bad_words": [""],
+                    "stop_words": [""],
+                }
+            ]
+        }
+
+    # If command line option is specified, overwrite
+    if args.offline:
+        input_data["data"][0]["stream"] = [False]
+    elif not input_data["data"][0]["stream"]:
+        args.offline = True
+
+    if args.max_tokens:
+        input_data["data"][0]["max_tokens"] = [args.max_tokens]
+    else:
+        args.max_tokens = input_data["data"][0]["max_tokens"][0]
+    return input_data
+
+
 def main(args):
-    input_data = construct_input_data(args)
+    if args.backend == "trtllm":
+        input_data = construct_trtllm_input_data(args)
+    elif args.backend == "vllm":
+        input_data = construct_vllm_input_data(args)
+    else:
+        raise ValueError(
+            "Unknown backend specified. Supported backend types are: 'trtllm' "
+            "and 'vllm'."
+        )
+
     prompts = generate_prompts(args, input_data)
 
     for prompt in prompts:
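The two constructors above build differently shaped payloads: the vLLM backend packs its options into a single `sampling_parameters` JSON string, while the TRT-LLM model takes `max_tokens`, `bad_words`, and `stop_words` as separate input tensors, which is also why the TRT-LLM branch of `profile()` passes the extra `--shape` flags to Perf Analyzer. Below is a rough sketch of the two default payloads, written as plain Python for illustration only; the prompt text is a placeholder, and the authoritative versions are whatever `construct_vllm_input_data()` and `construct_trtllm_input_data()` actually produce.

```python
import json

# Illustrative defaults only; the real values come from the constructors in profile.py.
vllm_input_data = {
    "data": [
        {
            "text_input": ["Hello, my name is"],
            "stream": [True],
            "sampling_parameters": [json.dumps({"max_tokens": 256, "ignore_eos": False})],
        }
    ]
}

trtllm_input_data = {
    "data": [
        {
            "text_input": ["Hello, my name is"],
            "stream": [True],
            "max_tokens": [256],
            "bad_words": [""],
            "stop_words": [""],
        }
    ]
}

# Either dict can be written out as the JSON file that profile.py hands to Perf Analyzer.
with open("input_data.json", "w") as f:
    json.dump(vllm_input_data, f, indent=2)
```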
@@ -531,6 +598,13 @@
         default="vllm",
         help="The name of the model to profile.",
     )
+    parser.add_argument(
+        "-b",
+        "--backend",
+        type=str,
+        default="vllm",
+        help="The name of the backend.",
+    )
     parser.add_argument(
         "--prompt-size-range",
         type=int,
diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md
index 1de686c1b..331e7db55 100644
--- a/src/c++/perf_analyzer/docs/llm.md
+++ b/src/c++/perf_analyzer/docs/llm.md
@@ -33,20 +33,28 @@ The following guide shows the reader how to use Triton to measure and
 characterize the performance behaviors of Large Language Models (LLMs) using
 Triton with [vLLM](https://github.com/vllm-project/vllm).
 
-### Setup: Download and configure Triton Server environment
+### Setup: Download and configure Triton vLLM Backend
 
-From [Step 1 of the Triton vLLM tutorial](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#step-1-build-a-triton-container-image-with-vllm).
+Download the pre-built Triton Server Container with vLLM backend from
+[NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver)
+registry.
 
 ```bash
-git clone https://github.com/triton-inference-server/tutorials
-cd tutorials/Quick_Deploy/vLLM
-docker build -t tritonserver_vllm .
-# wait for command to finish, might take several minutes
+docker pull nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3
 ```
 
-Upon successful build, run the following command to start the Triton Server container:
+Run the Triton Server container with
+[vLLM backend](https://github.com/triton-inference-server/vllm_backend) and
+launch the server.
 
 ```bash
-docker run --gpus all -it --rm -p 8001:8001 --shm-size=1G --ulimit memlock=-1 --ulimit stack=67108864 -v ${PWD}:/work -w /work tritonserver_vllm tritonserver --model-store ./model_repository
+git clone -b r23.10 https://github.com/triton-inference-server/vllm_backend.git
+cd vllm_backend
+
+docker run --gpus all --rm -it --net host \
+    --shm-size=2G --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v $(pwd)/samples/model_repository:/model_repository \
+    nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3 \
+    tritonserver --model-repository /model_repository
 ```
 
 Next run the following command to start the Triton SDK container:
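Once the server container from the setup step is running, it can help to confirm that it is reachable before moving on to the profiling runs below. This optional check is not part of the tutorial; it assumes the gRPC endpoint is exposed on localhost:8001 (the default with `--net host`) and that the `tritonclient[grpc]` Python package is available, as it is inside the SDK container.

```python
# Optional sanity check; assumes gRPC on localhost:8001 and a model named "vllm_model".
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")
print("server live:", client.is_server_live())
print("model ready:", client.is_model_ready("vllm_model"))
```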
@@ -69,7 +77,7 @@ Inside the client container, run the following command to generate dummy
 prompts of size 100, 300, and 500 and receive single token from the model for
 each prompt.
 
 ```bash
-python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
+python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 1
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 100
@@ -91,13 +99,13 @@ python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
 > {
 >   "data": [
 >     {
->       "PROMPT": [
+>       "text_input": [
 >         "Hello, my name is" // user-provided prompt
 >       ],
->       "STREAM": [
+>       "stream": [
 >         true
 >       ],
->       "SAMPLING_PARAMETERS": [
+>       "sampling_parameters": [
 >         "{ \"max_tokens\": 1 }"
 >       ]
 >     }
@@ -105,7 +113,7 @@ python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
 >   ]
 > }
 > ' > input_data.json
 >
-> $ python profile.py -m vllm --input-data input_data.json
+> $ python profile.py -m vllm_model --input-data input_data.json
 > ```
@@ -122,7 +130,7 @@ of size 100, 300, and 500 and receive total 256 tokens from the model for each
 prompts.
 
 ```bash
-python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
+python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 100
@@ -157,7 +165,7 @@ Run the following command inside the client container.
 pip install matplotlib
 
 # Run Perf Analyzer
-python profile.py -m vllm --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
+python profile.py -m vllm_model --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 10
@@ -179,7 +187,7 @@ split them into multiple segments of responses. For instance, assume we ran the
 following benchmark command:
 
 ```bash
-python profile.py -m vllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
+python profile.py -m vllm_model --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
 ```
 
 We start from a single request and increment up to 4 requests one by one for