From c8c5f14ad93f8c0ac92b1de2ca4e9143aa60b2fb Mon Sep 17 00:00:00 2001
From: Hyunjae Woo <107147848+nv-hwoo@users.noreply.github.com>
Date: Mon, 4 Dec 2023 09:14:54 -0800
Subject: [PATCH] Support TRTLLM model in the benchmark script (#442)

* Support TRTLLM model and use vLLM backend

* Align spaces

* Move comment

* Specify shape of input tensors

* Fix pre-commit hooks

* Fix metric error when there is only single response

* Specify backend type for distinguishing input data

* Raise error when unknown backend specified.

* Change to direct string comparison
---
 .../perf_analyzer/docs/examples/profile.py | 136 ++++++++++++++----
 src/c++/perf_analyzer/docs/llm.md          |  40 +++---
 2 files changed, 129 insertions(+), 47 deletions(-)

diff --git a/src/c++/perf_analyzer/docs/examples/profile.py b/src/c++/perf_analyzer/docs/examples/profile.py
index 534dcec95..7b79c9848 100644
--- a/src/c++/perf_analyzer/docs/examples/profile.py
+++ b/src/c++/perf_analyzer/docs/examples/profile.py
@@ -133,11 +133,11 @@ def get_postfix(args, prompt_size):
     """Generate postfix for profile export filename and plot.
 
     e.g.
-    - trtllm-prompt100-maxtokens256
-    - trtllm-prompt100-periodic1_100_1-period32-maxtokens1024
+    - trtllm-ensemble-prompt100-maxtokens256
+    - trtllm-ensemble-prompt100-periodic1_100_1-period32-maxtokens1024
     """
     stream_type = "offline" if args.offline else "online"
-    postfix = f"{args.model}-{stream_type}-prompt{prompt_size}-"
+    postfix = f"{args.backend}-{args.model}-{stream_type}-prompt{prompt_size}-"
     if args.periodic_concurrency_range:
         start, end, step = args.periodic_concurrency_range
         postfix += f"periodic{start}_{end}_{step}-period{args.request_period}-"
@@ -265,11 +265,12 @@ def collect_online_metrics(export_data, output_tokens):
     for r in requests:
         init_request, responses = r["timestamp"], r["response_timestamps"]
         first_token_latency = (responses[0] - init_request) / 1_000_000
-        generation_latency_ms = (responses[-1] - responses[0]) / 1_000_000  # msec
-        generation_latency_s = (responses[-1] - responses[0]) / 1_000_000_000  # sec
         first_token_latencies.append(first_token_latency)
-        generation_latencies.append(generation_latency_ms)
-        generation_throughputs.append(output_tokens / generation_latency_s)
+        if args.max_tokens > 1:
+            generation_latency_ms = (responses[-1] - responses[0]) / 1_000_000  # msec
+            generation_latency_s = (responses[-1] - responses[0]) / 1_000_000_000  # sec
+            generation_latencies.append(generation_latency_ms)
+            generation_throughputs.append(output_tokens / generation_latency_s)
         for prev_res, res in pairwise(responses):
             token_to_token_latencies.append((res - prev_res) / 1_000_000)
     return (
@@ -290,8 +291,6 @@ def calculate_online_metrics(args, profile_result, export_data):
         generation_throughputs,
     ) = latencies
 
-    profile_result.avg_total_t2t_latency = np.mean(token_to_token_latencies)
-
     profile_result.max_first_token_latency = max(first_token_latencies)
     profile_result.min_first_token_latency = min(first_token_latencies)
     profile_result.avg_first_token_latency = np.mean(first_token_latencies)
@@ -309,6 +308,8 @@
     )
 
     if args.max_tokens > 1:
+        profile_result.avg_total_t2t_latency = np.mean(token_to_token_latencies)
+
         profile_result.max_gen_latency = max(generation_latencies)
         profile_result.min_gen_latency = min(generation_latencies)
         profile_result.avg_gen_latency = np.mean(generation_latencies)
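The `max_tokens > 1` guard introduced above matters because a request that returns only a single response has no generation window: `responses[-1]` equals `responses[0]`, so the generation latency is zero and the throughput division is undefined. A minimal sketch of the arithmetic follows; it is not part of the patch, the function and variable names are hypothetical, and it only assumes nanosecond timestamps as used in the latency conversions above.

```python
# Illustrative sketch only; names are hypothetical and not taken from profile.py.
def summarize_request(request_ns, response_ns, output_tokens):
    """Derive per-request latency metrics from raw timestamps (nanoseconds)."""
    metrics = {
        # Time from sending the request to receiving the first response.
        "first_token_latency_ms": (response_ns[0] - request_ns) / 1_000_000,
    }
    if len(response_ns) > 1:  # generation metrics need more than one response
        generation_ns = response_ns[-1] - response_ns[0]
        metrics["generation_latency_ms"] = generation_ns / 1_000_000
        # Tokens produced over the generation window, in tokens/second.
        metrics["generation_throughput"] = output_tokens / (generation_ns / 1_000_000_000)
    return metrics


if __name__ == "__main__":
    # Example: three responses spaced 1 ms apart, 256 output tokens in total.
    print(summarize_request(0, [5_000_000, 6_000_000, 7_000_000], 256))
```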
@@ -420,6 +421,13 @@ def profile(args, export_file):
         f"--input-data={INPUT_FILENAME} "
         f"--profile-export-file={export_file} "
     )
+    if args.backend == "trtllm":
+        command += (
+            "--shape=text_input:1 "
+            "--shape=max_tokens:1 "
+            "--shape=bad_words:1 "
+            "--shape=stop_words:1 "
+        )
     if args.periodic_concurrency_range:
         start, end, step = args.periodic_concurrency_range
         command += (
@@ -449,13 +457,13 @@ def prepare_export_file(args, prompt):
 
 def prepare_input_data(input_data, prompt):
     """Insert the prompt to send into input JSON data."""
-    input_data["data"][0]["PROMPT"] = [prompt]
+    input_data["data"][0]["text_input"] = [prompt]
     save_json_data(input_data, INPUT_FILENAME)
 
 
 def generate_prompts(args, input_data):
     """Generate dummy prompts if not specified by input JSON file."""
-    prompt = input_data["data"][0]["PROMPT"][0]
+    prompt = input_data["data"][0]["text_input"][0]
     if not prompt:  # Generate dummy prompt
         assert args.prompt_size_range, "Must specify --prompt-size-range."
 
@@ -464,28 +472,41 @@
     return [prompt]
 
 
-def construct_input_data(args):
-    """Construct input data that contains input tensors and parameters.
+def construct_vllm_input_data(args):
+    """Construct input data that contains input tensors and parameters for vLLM.
 
     Parse the input JSON file (if exists) to construct the input data.
     When user sets parameters through command line, overwrite the
     parameters set by input JSON file.
     """
-    prompt = ""
-    stream = True
-    sampling_params = {}
+    # Default sampling parameters
+    sampling_params = {
+        "max_tokens": 256,
+        "ignore_eos": False,
+    }
 
     if args.input_data:
-        data = load_json_data(filename=args.input_data)["data"][0]
-        stream = data["STREAM"][0] if "STREAM" in data else stream
-        prompt = data["PROMPT"][0] if "PROMPT" in data else prompt
-        if "SAMPLING_PARAMETERS" in data:
-            sampling_params = json.loads(data["SAMPLING_PARAMETERS"][0])
+        input_data = load_json_data(filename=args.input_data)
+        if "sampling_parameters" in input_data["data"][0]:
+            loaded_params = input_data["data"][0]["sampling_parameters"][0]
+            loaded_params = json.loads(loaded_params or "null")
+            sampling_params = loaded_params if loaded_params else sampling_params
+    else:
+        # Default input JSON
+        input_data = {
+            "data": [
+                {
+                    "text_input": [""],
+                    "stream": [True],
+                    "sampling_parameters": [""],
+                }
+            ]
+        }
 
     # If command line option is specified, overwrite
     if args.offline:
-        stream = False
-    elif not stream:
+        input_data["data"][0]["stream"] = [False]
+    elif not input_data["data"][0]["stream"]:
         args.offline = True
 
     if args.max_tokens:
@@ -496,20 +517,66 @@
         args.max_tokens = 256  # default
         sampling_params["max_tokens"] = args.max_tokens
 
-    if "ignore_eos" not in sampling_params:
+    if args.ignore_eos:
+        sampling_params["ignore_eos"] = args.ignore_eos
+    elif "ignore_eos" in sampling_params:
+        args.ignore_eos = sampling_params["ignore_eos"]
+    else:
+        args.ignore_eos = False  # default
         sampling_params["ignore_eos"] = args.ignore_eos
-    elif args.ignore_eos:
-        sampling_params["ignore_eos"] = True
 
-    input_data = {"data": [{}]}
-    input_data["data"][0]["PROMPT"] = [prompt]
-    input_data["data"][0]["STREAM"] = [stream]
-    input_data["data"][0]["SAMPLING_PARAMETERS"] = [json.dumps(sampling_params)]
+    input_data["data"][0]["sampling_parameters"] = [json.dumps(sampling_params)]
+
     return input_data
 
 
+def construct_trtllm_input_data(args):
+    """Construct input data that contains input tensors and parameters for TRT-LLM.
+
+    Parse the input JSON file (if exists) to construct the input data.
+    When user sets parameters through command line, overwrite the
+    parameters set by input JSON file.
+    """
+    if args.input_data:
+        input_data = load_json_data(filename=args.input_data)
+    else:
+        # Default input JSON
+        input_data = {
+            "data": [
+                {
+                    "text_input": [""],
+                    "stream": [True],
+                    "max_tokens": [256],
+                    "bad_words": [""],
+                    "stop_words": [""],
+                }
+            ]
+        }
+
+    # If command line option is specified, overwrite
+    if args.offline:
+        input_data["data"][0]["stream"] = [False]
+    elif not input_data["data"][0]["stream"]:
+        args.offline = True
+
+    if args.max_tokens:
+        input_data["data"][0]["max_tokens"] = [args.max_tokens]
+    else:
+        args.max_tokens = input_data["data"][0]["max_tokens"][0]
+    return input_data
+
+
 def main(args):
-    input_data = construct_input_data(args)
+    if args.backend == "trtllm":
+        input_data = construct_trtllm_input_data(args)
+    elif args.backend == "vllm":
+        input_data = construct_vllm_input_data(args)
+    else:
+        raise ValueError(
+            "Unknown backend specified. Supported backend types are: 'trtllm' "
+            "and 'vllm'."
+        )
+
     prompts = generate_prompts(args, input_data)
 
     for prompt in prompts:
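The two constructors above build differently shaped payloads: the vLLM backend packs its options into a single `sampling_parameters` JSON string, while the TRT-LLM model takes `max_tokens`, `bad_words`, and `stop_words` as separate input tensors, which is also why the TRT-LLM branch of `profile()` passes the extra `--shape` flags to Perf Analyzer. Below is a rough sketch of the two default payloads, written as plain Python for illustration only; the prompt text is a placeholder, and the authoritative versions are whatever `construct_vllm_input_data()` and `construct_trtllm_input_data()` actually produce.

```python
import json

# Illustrative defaults only; the real values come from the constructors in profile.py.
vllm_input_data = {
    "data": [
        {
            "text_input": ["Hello, my name is"],
            "stream": [True],
            "sampling_parameters": [json.dumps({"max_tokens": 256, "ignore_eos": False})],
        }
    ]
}

trtllm_input_data = {
    "data": [
        {
            "text_input": ["Hello, my name is"],
            "stream": [True],
            "max_tokens": [256],
            "bad_words": [""],
            "stop_words": [""],
        }
    ]
}

# Either dict can be written out as the JSON file that profile.py hands to Perf Analyzer.
with open("input_data.json", "w") as f:
    json.dump(vllm_input_data, f, indent=2)
```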
@@ -531,6 +598,13 @@
         default="vllm",
         help="The name of the model to profile.",
     )
+    parser.add_argument(
+        "-b",
+        "--backend",
+        type=str,
+        default="vllm",
+        help="The name of the backend.",
+    )
     parser.add_argument(
         "--prompt-size-range",
         type=int,
diff --git a/src/c++/perf_analyzer/docs/llm.md b/src/c++/perf_analyzer/docs/llm.md
index 1de686c1b..331e7db55 100644
--- a/src/c++/perf_analyzer/docs/llm.md
+++ b/src/c++/perf_analyzer/docs/llm.md
@@ -33,20 +33,28 @@ The following guide shows the reader how to use Triton to measure and
 characterize the performance behaviors of Large Language Models (LLMs) using
 Triton with [vLLM](https://github.com/vllm-project/vllm).
 
-### Setup: Download and configure Triton Server environment
+### Setup: Download and configure Triton vLLM Backend
 
-From [Step 1 of the Triton vLLM tutorial](https://github.com/triton-inference-server/tutorials/blob/main/Quick_Deploy/vLLM/README.md#step-1-build-a-triton-container-image-with-vllm).
+Download the pre-built Triton Server Container with vLLM backend from
+[NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/tritonserver)
+registry.
 
 ```bash
-git clone https://github.com/triton-inference-server/tutorials
-cd tutorials/Quick_Deploy/vLLM
-docker build -t tritonserver_vllm .
-# wait for command to finish, might take several minutes
+docker pull nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3
 ```
 
-Upon successful build, run the following command to start the Triton Server container:
+Run the Triton Server container with
+[vLLM backend](https://github.com/triton-inference-server/vllm_backend) and
+launch the server.
 
 ```bash
-docker run --gpus all -it --rm -p 8001:8001 --shm-size=1G --ulimit memlock=-1 --ulimit stack=67108864 -v ${PWD}:/work -w /work tritonserver_vllm tritonserver --model-store ./model_repository
+git clone -b r23.10 https://github.com/triton-inference-server/vllm_backend.git
+cd vllm_backend
+
+docker run --gpus all --rm -it --net host \
+    --shm-size=2G --ulimit memlock=-1 --ulimit stack=67108864 \
+    -v $(pwd)/samples/model_repository:/model_repository \
+    nvcr.io/nvidia/tritonserver:23.10-vllm-python-py3 \
+    tritonserver --model-repository /model_repository
 ```
 
 Next run the following command to start the Triton SDK container:
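Once the server container from the setup step is running, it can help to confirm that it is reachable before moving on to the profiling runs below. This optional check is not part of the tutorial; it assumes the gRPC endpoint is exposed on localhost:8001 (the default with `--net host`) and that the `tritonclient[grpc]` Python package is available, as it is inside the SDK container.

```python
# Optional sanity check; assumes gRPC on localhost:8001 and a model named "vllm_model".
import tritonclient.grpc as grpcclient

client = grpcclient.InferenceServerClient(url="localhost:8001")
print("server live:", client.is_server_live())
print("model ready:", client.is_model_ready("vllm_model"))
```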
@@ -69,7 +77,7 @@ Inside the client container, run the following command to generate dummy
 prompts of size 100, 300, and 500 and receive single token from the model for
 each prompt.
 
 ```bash
-python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
+python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 1
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 100
@@ -91,13 +99,13 @@ python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
 > {
 >   "data": [
 >     {
->       "PROMPT": [
+>       "text_input": [
 >         "Hello, my name is" // user-provided prompt
 >       ],
->       "STREAM": [
+>       "stream": [
 >         true
 >       ],
->       "SAMPLING_PARAMETERS": [
+>       "sampling_parameters": [
 >         "{ \"max_tokens\": 1 }"
 >       ]
 >     }
@@ -105,7 +113,7 @@ python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
 >   ]
 > }
 > ' > input_data.json
 >
-> $ python profile.py -m vllm --input-data input_data.json
+> $ python profile.py -m vllm_model --input-data input_data.json
 > ```
@@ -122,7 +130,7 @@ of size 100, 300, and 500 and receive total 256 tokens from the model for each
 prompts.
 
 ```bash
-python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
+python profile.py -m vllm_model --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 100
@@ -157,7 +165,7 @@ Run the following command inside the client container.
 pip install matplotlib
 
 # Run Perf Analyzer
-python profile.py -m vllm --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
+python profile.py -m vllm_model --prompt-size-range 10 10 1 --periodic-concurrency-range 1 100 1 --request-period 32 --max-tokens 1024 --ignore-eos
 
 # [ BENCHMARK SUMMARY ]
 # Prompt size: 10
@@ -179,7 +187,7 @@ split them into multiple segments of responses. For instance, assume we ran the
 following benchmark command:
 
 ```bash
-python profile.py -m vllm --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
+python profile.py -m vllm_model --periodic-concurrency-range 1 4 1 --request-period 32 --max-tokens 1024 --ignore-eos
 ```
 
 We start from a single request and increment up to 4 requests one by one for