Update LLM guide and profile.py to use new Triton+vLLM input names #412

Closed · wants to merge 3 commits
26 changes: 13 additions & 13 deletions src/c++/perf_analyzer/docs/examples/profile.py
@@ -91,11 +91,11 @@ def generate_input_data(args, filename):
"ignore_eos": {"true" if args.ignore_eos else "false"}
}}
"""
input_data = {"data": [{"STREAM": [True]}]}
input_data["data"][0]["SAMPLING_PARAMETERS"] = [request_parameters]
input_data = {"data": [{"stream": [True]}]}
input_data["data"][0]["sampling_parameters"] = [request_parameters]

prompt = ["hi"] * prompt_size # Generate dummy prompt
input_data["data"][0]["PROMPT"] = [" ".join(prompt)]
text_input = ["hi"] * text_input_size # Generate dummy text input
input_data["data"][0]["text_input"] = [" ".join(text_input)]
with open(filename, "w") as f:
json.dump(input_data, f)

@@ -111,12 +111,12 @@ def generate_input_data(args, filename):
help="The name of the model to profile.",
)
parser.add_argument(
"--prompt-size-range",
"--text-input-size-range",
type=int,
nargs=3,
metavar=("START", "END", "STEP"),
default=[10, 10, 1],
help="The range of prompt sizes '<[START, END], STEP>' where END is inclusive.",
help="The range of text input sizes '<[START, END], STEP>' where END is inclusive.",
)
parser.add_argument(
"--max-tokens",
@@ -143,24 +143,24 @@ def generate_input_data(args, filename):
print(f"Using input data file '{args.input_data}' for inference request.\n")
with open(args.input_data) as f:
input_data = json.load(f)
- prompt_size = len(input_data["data"][0]["PROMPT"][0].split())
- args.prompt_size_range = [prompt_size, prompt_size, 1]
+ text_input_size = len(input_data["data"][0]["text_input"][0].split())
+ args.text_input_size_range = [text_input_size, text_input_size, 1]

- start, end, step = args.prompt_size_range
- for prompt_size in range(start, end + 1, step):
+ start, end, step = args.text_input_size_range
+ for text_input_size in range(start, end + 1, step):
if not args.input_data:
generate_input_data(args, TEMP_INPUT_FILE)

profile(args, args.input_data if args.input_data else TEMP_INPUT_FILE)
avg_first_token_latency, avg_token_to_token_latency = calculate_avg_latencies()
results.append(
- (prompt_size, avg_first_token_latency, avg_token_to_token_latency)
+ (text_input_size, avg_first_token_latency, avg_token_to_token_latency)
)

print("\n[ Benchmark Summary ]")
- for prompt_size, avg_first_token_latency, avg_token_to_token_latency in results:
+ for text_input_size, avg_first_token_latency, avg_token_to_token_latency in results:
line = (
f" Prompt size: {prompt_size}, "
f" Text input size: {text_input_size}, "
f"Average first-token latency: {avg_first_token_latency:.4f} sec"
)
line += (
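For reference, a minimal sketch of the input-data JSON that the updated `generate_input_data` writes with the new lowercase names (`text_input`, `stream`, `sampling_parameters`). The text input size and sampling values below are illustrative only, not defaults taken from this PR:

```python
import json

# Illustrative only: mirrors what profile.py's generate_input_data builds after
# this rename, using an assumed text input size of 3 and max_tokens of 1.
text_input_size = 3
request_parameters = json.dumps({"max_tokens": 1, "ignore_eos": False})

input_data = {"data": [{"stream": [True]}]}
input_data["data"][0]["sampling_parameters"] = [request_parameters]
input_data["data"][0]["text_input"] = [" ".join(["hi"] * text_input_size)]

with open("input_data.json", "w") as f:
    json.dump(input_data, f)

# input_data.json then contains (pretty-printed here for readability):
# {
#   "data": [
#     {
#       "stream": [true],
#       "sampling_parameters": ["{\"max_tokens\": 1, \"ignore_eos\": false}"],
#       "text_input": ["hi hi hi"]
#     }
#   ]
# }
```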
55 changes: 28 additions & 27 deletions src/c++/perf_analyzer/docs/llm.md
@@ -58,43 +58,44 @@ docker run --gpus all -it --rm --net host -v ${PWD}:/work -w /work nvcr.io/nvidi

## Benchmark 1: Profiling the Prefill Phase

- In this benchmarking scenario, we want to measure the effect of input prompt
+ In this benchmarking scenario, we want to measure the effect of text input
size on first-token latency. We issue a single request to the server of fixed
input sizes and request the model to compute at most one new token. This
essentially means one pass through the model.

#### Example

- Inside the client container, run the following command to generate dummy prompts
- of size 100, 300, and 500 and receive single token from the model for each prompt.
+ Inside the client container, run the following command to generate dummy text inputs
+ of size 100, 300, and 500 and receive a single token from the model for each text input.

```bash
- python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1
+ python profile.py -m vllm --text-input-size-range 100 500 200 --max-tokens 1

# Sample output
# [ Benchmark Summary ]
- # Prompt size: 100, Average first-token latency: 0.0459 sec
- # Prompt size: 300, Average first-token latency: 0.0415 sec
- # Prompt size: 500, Average first-token latency: 0.0451 sec
+ # Text input size: 100, Average first-token latency: 0.0459 sec
+ # Text input size: 300, Average first-token latency: 0.0415 sec
+ # Text input size: 500, Average first-token latency: 0.0451 sec
```

> **Note**
>
- > In order to provide a specific prompt (instead of the dummy prompt generated by default),
- > the user can provide input data JSON file using `--input-data` option.
- > This will however *ignore* any parameters specified through the command line.
+ > In order to provide a specific text input (instead of the dummy text input
+ > generated by default), the user can provide an input data JSON file using
+ > the `--input-data` option. This will, however, *ignore* any parameters
+ > specified through the command line.
> ```bash
> $ echo '
> {
> "data": [
> {
> "PROMPT": [
> "Hello, my name is" // user-provided prompt
> "text_input": [
> "Hello, my name is" // user-provided text input
> ],
> "STREAM": [
> "stream": [
> true
> ],
> "SAMPLING_PARAMETERS": [
> "sampling_parameters": [
> "{ \"max_tokens\": 1 }"
> ]
> }
@@ -108,23 +108,23 @@ python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 1

## Benchmark 2: Profiling the Generation Phase

- In this benchmarking scenario, we want to measure the effect of input prompt
+ In this benchmarking scenario, we want to measure the effect of text input
size on token-to-token latency. We issue a single request to the server of fixed
input sizes and request the model to compute a fixed amount of tokens.

#### Example

- Inside the client container, run the following command to generate dummy prompts
- of size 100, 300, and 500 and receive total 256 tokens from the model for each prompts.
+ Inside the client container, run the following command to generate dummy text inputs
+ of size 100, 300, and 500 and receive a total of 256 tokens from the model for each text input.

```bash
- python profile.py -m vllm --prompt-size-range 100 500 200 --max-tokens 256 --ignore-eos
+ python profile.py -m vllm --text-input-size-range 100 500 200 --max-tokens 256 --ignore-eos

# Sample output
# [ Benchmark Summary ]
- # Prompt size: 100, Average first-token latency: 0.0388 sec, Average token-token latency: 0.0066 sec
- # Prompt size: 300, Average first-token latency: 0.0431 sec, Average token-token latency: 0.0071 sec
- # Prompt size: 500, Average first-token latency: 0.0400 sec, Average token-token latency: 0.0070 sec
+ # Text input size: 100, Average first-token latency: 0.0388 sec, Average token-token latency: 0.0066 sec
+ # Text input size: 300, Average first-token latency: 0.0431 sec, Average token-token latency: 0.0071 sec
+ # Text input size: 500, Average first-token latency: 0.0400 sec, Average token-token latency: 0.0070 sec
```

## Benchmark 3: Profiling Continuous Batch Size
@@ -134,27 +134,27 @@ batch size on token-to-token latency. We systematically issue requests to the
server of fixed input sizes and request the model to compute a fixed amount of
tokens in order to increase the continuous batching size over time.

- #### 1. Generate prompts input data JSON
+ #### 1. Generate input data JSON

```bash
# open a new shell in the same directory you were in when running the above command
echo '
{
"data": [
{
"PROMPT": [
"text_input": [
"Hello, my name is"
],
"STREAM": [
"stream": [
true
],
"SAMPLING_PARAMETERS": [
"sampling_parameters": [
"{\"max_tokens\":16,\"ignore_eos\":true}"
]
}
]
}
- ' > prompts.json
+ ' > text_inputs.json
```

#### 2. Run Perf Analyzer
@@ -165,7 +165,7 @@ perf_analyzer \
-i grpc \
--async \
--streaming \
- --input-data=prompts.json \
+ --input-data=text_inputs.json \
--profile-export-file=profile_export.json \
--periodic-concurrency-range=1:20:1
--request-period=10
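As an aside (not part of this PR), the Benchmark 3 input file can also be produced from Python instead of the `echo` heredoc shown above. A minimal equivalent sketch, where the `text_inputs.json` name and field values are taken from the example in the diff:

```python
import json

# Writes the same structure as the `echo ... > text_inputs.json` step above:
# one text input, streaming enabled, and the vLLM sampling parameters passed
# through as a JSON-encoded string.
input_data = {
    "data": [
        {
            "text_input": ["Hello, my name is"],
            "stream": [True],
            "sampling_parameters": [
                json.dumps({"max_tokens": 16, "ignore_eos": True})
            ],
        }
    ]
}

with open("text_inputs.json", "w") as f:
    json.dump(input_data, f, indent=2)
```

The resulting file is passed to Perf Analyzer with `--input-data=text_inputs.json`, exactly as in the command above.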