BT-12026 BT-12027 BT-12029 Support parallel function calls, optional calls, and mistral (#1133)

* Update briton server image and template

* Improvements for handling concurrency

* Bump version
bdubayah authored Sep 11, 2024
1 parent 5f0eb48 commit 8544185
Showing 4 changed files with 196 additions and 57 deletions.
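
The commit title advertises parallel and optional function calls, but the client-facing request shape that exercises them is not part of this diff. Purely as a hypothetical illustration, an OpenAI-style chat request with tools might look like the sketch below; the endpoint URL and the tools/tool_choice fields are assumptions about the serving API, not something shown in this commit.

    # Hypothetical client-side sketch; assumes an OpenAI-compatible chat endpoint.
    import requests

    payload = {
        "messages": [{"role": "user", "content": "What's the weather in Paris and Berlin?"}],
        "tools": [{
            "type": "function",
            "function": {
                "name": "get_weather",  # hypothetical tool definition
                "parameters": {
                    "type": "object",
                    "properties": {"city": {"type": "string"}},
                    "required": ["city"],
                },
            },
        }],
        "tool_choice": "auto",  # leave tool use optional rather than forced
        "max_tokens": 256,
    }

    resp = requests.post("https://example.invalid/v1/chat/completions", json=payload, timeout=60)
    print(resp.json())

In the OpenAI format, a parallel function-call result surfaces as multiple tool_calls entries in a single assistant message.
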
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -1,6 +1,6 @@
[tool.poetry]
name = "truss"
version = "0.9.34"
version = "0.9.35rc1"
description = "A seamless bridge from model development to model delivery"
license = "MIT"
readme = "README.md"
2 changes: 1 addition & 1 deletion truss/constants.py
@@ -103,7 +103,7 @@

REGISTRY_BUILD_SECRET_PREFIX = "DOCKER_REGISTRY_"

-TRTLLM_BASE_IMAGE = "baseten/briton-server:5fa9436e_v0.0.8"
+TRTLLM_BASE_IMAGE = "baseten/briton-server:5fa9436e_v0.0.9"
TRTLLM_PYTHON_EXECUTABLE = "/usr/bin/python3"
BASE_TRTLLM_REQUIREMENTS = [
"grpcio==1.62.3",
40 changes: 20 additions & 20 deletions truss/templates/trtllm-briton/packages/briton_pb2.py
@@ -18,7 +18,7 @@


DESCRIPTOR = _descriptor_pool.Default().AddSerializedFile(
b'\n\x0c\x62riton.proto\x12\x06\x62riton"r\n\x06Tensor\x12#\n\x05shape\x18\x01 \x01(\x0b\x32\x14.briton.Tensor.Shape\x12\x1f\n\x05\x64type\x18\x02 \x01(\x0e\x32\x10.briton.DataType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x1a\x14\n\x05Shape\x12\x0b\n\x03\x64im\x18\x01 \x03(\x03"\xb4\x06\n\x10InferenceRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\x03\x12\x12\n\ninput_text\x18\x02 \x01(\t\x12\x11\n\tinput_ids\x18\x03 \x03(\x05\x12\x1f\n\x12request_output_len\x18\x05 \x01(\rH\x00\x88\x01\x01\x12\x13\n\x06\x65nd_id\x18\x06 \x01(\rH\x01\x88\x01\x01\x12\x13\n\x06pad_id\x18\x07 \x01(\rH\x02\x88\x01\x01\x12\x17\n\nbeam_width\x18\n \x01(\rH\x03\x88\x01\x01\x12\x18\n\x0btemperature\x18\x0b \x01(\x02H\x04\x88\x01\x01\x12\x1a\n\rruntime_top_k\x18\x0c \x01(\rH\x05\x88\x01\x01\x12\x1a\n\rruntime_top_p\x18\r \x01(\x02H\x06\x88\x01\x01\x12\x18\n\x0blen_penalty\x18\x0e \x01(\x02H\x07\x88\x01\x01\x12\x1f\n\x12repetition_penalty\x18\x0f \x01(\x02H\x08\x88\x01\x01\x12\x1d\n\x10presence_penalty\x18\x10 \x01(\x02H\t\x88\x01\x01\x12\x11\n\tbad_words\x18\x11 \x03(\t\x12\x12\n\nstop_words\x18\x12 \x03(\t\x12\x19\n\x0clora_task_id\x18\x13 \x01(\x04H\n\x88\x01\x01\x12)\n\x0clora_weights\x18\x14 \x01(\x0b\x32\x0e.briton.TensorH\x0b\x88\x01\x01\x12(\n\x0blora_config\x18\x15 \x01(\x0b\x32\x0e.briton.TensorH\x0c\x88\x01\x01\x12\x18\n\x0brandom_seed\x18\x16 \x01(\x03H\r\x88\x01\x01\x12\x1f\n\x12output_schema_hash\x18\x17 \x01(\tH\x0e\x88\x01\x01\x42\x15\n\x13_request_output_lenB\t\n\x07_end_idB\t\n\x07_pad_idB\r\n\x0b_beam_widthB\x0e\n\x0c_temperatureB\x10\n\x0e_runtime_top_kB\x10\n\x0e_runtime_top_pB\x0e\n\x0c_len_penaltyB\x15\n\x13_repetition_penaltyB\x13\n\x11_presence_penaltyB\x0f\n\r_lora_task_idB\x0f\n\r_lora_weightsB\x0e\n\x0c_lora_configB\x0e\n\x0c_random_seedB\x15\n\x13_output_schema_hash"R\n\x13InferenceAnswerPart\x12\x12\n\nrequest_id\x18\x01 \x01(\x03\x12\x13\n\x0boutput_text\x18\x02 \x01(\t\x12\x12\n\noutput_ids\x18\x03 \x03(\x05"\xa6\x08\n\x0c\x42ritonConfig\x12\x13\n\x0b\x65ngine_path\x18\x01 \x01(\t\x12\x14\n\x0chf_tokenizer\x18\x02 \x01(\t\x12N\n\x16\x62\x61tch_scheduler_policy\x18\x05 \x01(\x0e\x32).briton.BritonConfig.BatchSchedulerPolicyH\x00\x88\x01\x01\x12\x1f\n\x12\x65nable_trt_overlap\x18\x06 \x01(\x08H\x01\x88\x01\x01\x12)\n\x1cmax_tokens_in_paged_kv_cache\x18\n \x01(\x04H\x02\x88\x01\x01\x12+\n\x1ekv_cache_free_gpu_mem_fraction\x18\x0b \x01(\x02H\x03\x88\x01\x01\x12!\n\x14medusa_decoding_mode\x18\x0c \x01(\x08H\x04\x88\x01\x01\x12#\n\x16\x65nable_chunked_context\x18\r \x01(\x08H\x05\x88\x01\x01\x12"\n\x15\x65nable_kv_cache_reuse\x18\x0e \x01(\x08H\x06\x88\x01\x01\x12\'\n\x1akv_cache_host_memory_bytes\x18\x0f \x01(\x04H\x07\x88\x01\x01\x12(\n\x1blora_cache_max_adapter_size\x18\x10 \x01(\x04H\x08\x88\x01\x01\x12,\n\x1flora_cache_optimal_adapter_size\x18\x11 \x01(\x04H\t\x88\x01\x01\x12+\n\x1elora_cache_gpu_memory_fraction\x18\x12 \x01(\x02H\n\x88\x01\x01\x12)\n\x1clora_cache_host_memory_bytes\x18\x13 \x01(\x04H\x0b\x88\x01\x01\x12\x1a\n\rfsm_cache_dir\x18\x14 \x01(\tH\x0c\x88\x01\x01"D\n\x14\x42\x61tchSchedulerPolicy\x12\x13\n\x0fMAX_UTILIZATION\x10\x00\x12\x17\n\x13GUARANTEED_NO_EVICT\x10\x01\x42\x19\n\x17_batch_scheduler_policyB\x15\n\x13_enable_trt_overlapB\x1f\n\x1d_max_tokens_in_paged_kv_cacheB!\n\x1f_kv_cache_free_gpu_mem_fractionB\x17\n\x15_medusa_decoding_modeB\x19\n\x17_enable_chunked_contextB\x18\n\x16_enable_kv_cache_reuseB\x1d\n\x1b_kv_cache_host_memory_bytesB\x1e\n\x1c_lora_cache_max_adapter_sizeB"\n 
_lora_cache_optimal_adapter_sizeB!\n\x1f_lora_cache_gpu_memory_fractionB\x1f\n\x1d_lora_cache_host_memory_bytesB\x10\n\x0e_fsm_cache_dir"\x98\x01\n\x10TokenToNextState\x12K\n\x13token_to_next_state\x18\x01 \x03(\x0b\x32..briton.TokenToNextState.TokenToNextStateEntry\x1a\x37\n\x15TokenToNextStateEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x05:\x02\x38\x01"\xfb\x01\n\x0eStatesToTokens\x12\x44\n\x10states_to_tokens\x18\x01 \x03(\x0b\x32*.briton.StatesToTokens.StatesToTokensEntry\x12\x17\n\nvocab_size\x18\x02 \x01(\x05H\x00\x88\x01\x01\x12\x19\n\x0c\x65os_token_id\x18\x03 \x01(\x05H\x01\x88\x01\x01\x1aO\n\x13StatesToTokensEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.briton.TokenToNextState:\x02\x38\x01\x42\r\n\x0b_vocab_sizeB\x0f\n\r_eos_token_id*\xa8\x01\n\x08\x44\x61taType\x12\x0e\n\nDT_INVALID\x10\x00\x12\x0b\n\x07\x44T_INT4\x10\x01\x12\x0b\n\x07\x44T_INT8\x10\x02\x12\x0c\n\x08\x44T_UINT8\x10\x03\x12\x0c\n\x08\x44T_INT32\x10\x04\x12\x0c\n\x08\x44T_INT64\x10\x05\x12\x0e\n\nDT_FLOAT16\x10\n\x12\x0f\n\x0b\x44T_BFLOAT16\x10\x0b\x12\x0e\n\nDT_FLOAT32\x10\x0c\x12\n\n\x06\x44T_FP8\x10\r\x12\x0b\n\x07\x44T_BOOL\x10\x14\x32L\n\x06\x42riton\x12\x42\n\x05Infer\x12\x18.briton.InferenceRequest\x1a\x1b.briton.InferenceAnswerPart"\x00\x30\x01\x62\x06proto3'
b'\n\x0c\x62riton.proto\x12\x06\x62riton"r\n\x06Tensor\x12#\n\x05shape\x18\x01 \x01(\x0b\x32\x14.briton.Tensor.Shape\x12\x1f\n\x05\x64type\x18\x02 \x01(\x0e\x32\x10.briton.DataType\x12\x0c\n\x04\x64\x61ta\x18\x03 \x01(\x0c\x1a\x14\n\x05Shape\x12\x0b\n\x03\x64im\x18\x01 \x03(\x03"\x82\x07\n\x10InferenceRequest\x12\x12\n\nrequest_id\x18\x01 \x01(\x03\x12\x12\n\ninput_text\x18\x02 \x01(\t\x12\x11\n\tinput_ids\x18\x03 \x03(\x05\x12\x1f\n\x12request_output_len\x18\x05 \x01(\rH\x00\x88\x01\x01\x12\x13\n\x06\x65nd_id\x18\x06 \x01(\rH\x01\x88\x01\x01\x12\x13\n\x06pad_id\x18\x07 \x01(\rH\x02\x88\x01\x01\x12\x17\n\nbeam_width\x18\n \x01(\rH\x03\x88\x01\x01\x12\x18\n\x0btemperature\x18\x0b \x01(\x02H\x04\x88\x01\x01\x12\x1a\n\rruntime_top_k\x18\x0c \x01(\rH\x05\x88\x01\x01\x12\x1a\n\rruntime_top_p\x18\r \x01(\x02H\x06\x88\x01\x01\x12\x18\n\x0blen_penalty\x18\x0e \x01(\x02H\x07\x88\x01\x01\x12\x1f\n\x12repetition_penalty\x18\x0f \x01(\x02H\x08\x88\x01\x01\x12\x1d\n\x10presence_penalty\x18\x10 \x01(\x02H\t\x88\x01\x01\x12\x11\n\tbad_words\x18\x11 \x03(\t\x12\x12\n\nstop_words\x18\x12 \x03(\t\x12\x19\n\x0clora_task_id\x18\x13 \x01(\x04H\n\x88\x01\x01\x12)\n\x0clora_weights\x18\x14 \x01(\x0b\x32\x0e.briton.TensorH\x0b\x88\x01\x01\x12(\n\x0blora_config\x18\x15 \x01(\x0b\x32\x0e.briton.TensorH\x0c\x88\x01\x01\x12\x18\n\x0brandom_seed\x18\x16 \x01(\x03H\r\x88\x01\x01\x12\x1f\n\x12output_schema_hash\x18\x17 \x01(\tH\x0e\x88\x01\x01\x12\x15\n\x08tools_id\x18\x18 \x01(\rH\x0f\x88\x01\x01\x12\x18\n\x0b\x66orce_tools\x18\x19 \x01(\x08H\x10\x88\x01\x01\x42\x15\n\x13_request_output_lenB\t\n\x07_end_idB\t\n\x07_pad_idB\r\n\x0b_beam_widthB\x0e\n\x0c_temperatureB\x10\n\x0e_runtime_top_kB\x10\n\x0e_runtime_top_pB\x0e\n\x0c_len_penaltyB\x15\n\x13_repetition_penaltyB\x13\n\x11_presence_penaltyB\x0f\n\r_lora_task_idB\x0f\n\r_lora_weightsB\x0e\n\x0c_lora_configB\x0e\n\x0c_random_seedB\x15\n\x13_output_schema_hashB\x0b\n\t_tools_idB\x0e\n\x0c_force_tools"R\n\x13InferenceAnswerPart\x12\x12\n\nrequest_id\x18\x01 \x01(\x03\x12\x13\n\x0boutput_text\x18\x02 \x01(\t\x12\x12\n\noutput_ids\x18\x03 \x03(\x05"\xa6\x08\n\x0c\x42ritonConfig\x12\x13\n\x0b\x65ngine_path\x18\x01 \x01(\t\x12\x14\n\x0chf_tokenizer\x18\x02 \x01(\t\x12N\n\x16\x62\x61tch_scheduler_policy\x18\x05 \x01(\x0e\x32).briton.BritonConfig.BatchSchedulerPolicyH\x00\x88\x01\x01\x12\x1f\n\x12\x65nable_trt_overlap\x18\x06 \x01(\x08H\x01\x88\x01\x01\x12)\n\x1cmax_tokens_in_paged_kv_cache\x18\n \x01(\x04H\x02\x88\x01\x01\x12+\n\x1ekv_cache_free_gpu_mem_fraction\x18\x0b \x01(\x02H\x03\x88\x01\x01\x12!\n\x14medusa_decoding_mode\x18\x0c \x01(\x08H\x04\x88\x01\x01\x12#\n\x16\x65nable_chunked_context\x18\r \x01(\x08H\x05\x88\x01\x01\x12"\n\x15\x65nable_kv_cache_reuse\x18\x0e \x01(\x08H\x06\x88\x01\x01\x12\'\n\x1akv_cache_host_memory_bytes\x18\x0f \x01(\x04H\x07\x88\x01\x01\x12(\n\x1blora_cache_max_adapter_size\x18\x10 \x01(\x04H\x08\x88\x01\x01\x12,\n\x1flora_cache_optimal_adapter_size\x18\x11 \x01(\x04H\t\x88\x01\x01\x12+\n\x1elora_cache_gpu_memory_fraction\x18\x12 \x01(\x02H\n\x88\x01\x01\x12)\n\x1clora_cache_host_memory_bytes\x18\x13 \x01(\x04H\x0b\x88\x01\x01\x12\x1a\n\rfsm_cache_dir\x18\x14 
\x01(\tH\x0c\x88\x01\x01"D\n\x14\x42\x61tchSchedulerPolicy\x12\x13\n\x0fMAX_UTILIZATION\x10\x00\x12\x17\n\x13GUARANTEED_NO_EVICT\x10\x01\x42\x19\n\x17_batch_scheduler_policyB\x15\n\x13_enable_trt_overlapB\x1f\n\x1d_max_tokens_in_paged_kv_cacheB!\n\x1f_kv_cache_free_gpu_mem_fractionB\x17\n\x15_medusa_decoding_modeB\x19\n\x17_enable_chunked_contextB\x18\n\x16_enable_kv_cache_reuseB\x1d\n\x1b_kv_cache_host_memory_bytesB\x1e\n\x1c_lora_cache_max_adapter_sizeB"\n _lora_cache_optimal_adapter_sizeB!\n\x1f_lora_cache_gpu_memory_fractionB\x1f\n\x1d_lora_cache_host_memory_bytesB\x10\n\x0e_fsm_cache_dir"\x98\x01\n\x10TokenToNextState\x12K\n\x13token_to_next_state\x18\x01 \x03(\x0b\x32..briton.TokenToNextState.TokenToNextStateEntry\x1a\x37\n\x15TokenToNextStateEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\r\n\x05value\x18\x02 \x01(\x05:\x02\x38\x01"\xfb\x01\n\x0eStatesToTokens\x12\x44\n\x10states_to_tokens\x18\x01 \x03(\x0b\x32*.briton.StatesToTokens.StatesToTokensEntry\x12\x17\n\nvocab_size\x18\x02 \x01(\x05H\x00\x88\x01\x01\x12\x19\n\x0c\x65os_token_id\x18\x03 \x01(\x05H\x01\x88\x01\x01\x1aO\n\x13StatesToTokensEntry\x12\x0b\n\x03key\x18\x01 \x01(\x05\x12\'\n\x05value\x18\x02 \x01(\x0b\x32\x18.briton.TokenToNextState:\x02\x38\x01\x42\r\n\x0b_vocab_sizeB\x0f\n\r_eos_token_id*\xa8\x01\n\x08\x44\x61taType\x12\x0e\n\nDT_INVALID\x10\x00\x12\x0b\n\x07\x44T_INT4\x10\x01\x12\x0b\n\x07\x44T_INT8\x10\x02\x12\x0c\n\x08\x44T_UINT8\x10\x03\x12\x0c\n\x08\x44T_INT32\x10\x04\x12\x0c\n\x08\x44T_INT64\x10\x05\x12\x0e\n\nDT_FLOAT16\x10\n\x12\x0f\n\x0b\x44T_BFLOAT16\x10\x0b\x12\x0e\n\nDT_FLOAT32\x10\x0c\x12\n\n\x06\x44T_FP8\x10\r\x12\x0b\n\x07\x44T_BOOL\x10\x14\x32L\n\x06\x42riton\x12\x42\n\x05Infer\x12\x18.briton.InferenceRequest\x1a\x1b.briton.InferenceAnswerPart"\x00\x30\x01\x62\x06proto3'
)

_globals = globals()
@@ -30,28 +30,28 @@
_globals["_TOKENTONEXTSTATE_TOKENTONEXTSTATEENTRY"]._serialized_options = b"8\001"
_globals["_STATESTOTOKENS_STATESTOTOKENSENTRY"]._options = None
_globals["_STATESTOTOKENS_STATESTOTOKENSENTRY"]._serialized_options = b"8\001"
_globals["_DATATYPE"]._serialized_start = 2522
_globals["_DATATYPE"]._serialized_end = 2690
_globals["_DATATYPE"]._serialized_start = 2600
_globals["_DATATYPE"]._serialized_end = 2768
_globals["_TENSOR"]._serialized_start = 24
_globals["_TENSOR"]._serialized_end = 138
_globals["_TENSOR_SHAPE"]._serialized_start = 118
_globals["_TENSOR_SHAPE"]._serialized_end = 138
_globals["_INFERENCEREQUEST"]._serialized_start = 141
_globals["_INFERENCEREQUEST"]._serialized_end = 961
_globals["_INFERENCEANSWERPART"]._serialized_start = 963
_globals["_INFERENCEANSWERPART"]._serialized_end = 1045
_globals["_BRITONCONFIG"]._serialized_start = 1048
_globals["_BRITONCONFIG"]._serialized_end = 2110
_globals["_BRITONCONFIG_BATCHSCHEDULERPOLICY"]._serialized_start = 1661
_globals["_BRITONCONFIG_BATCHSCHEDULERPOLICY"]._serialized_end = 1729
_globals["_TOKENTONEXTSTATE"]._serialized_start = 2113
_globals["_TOKENTONEXTSTATE"]._serialized_end = 2265
_globals["_TOKENTONEXTSTATE_TOKENTONEXTSTATEENTRY"]._serialized_start = 2210
_globals["_TOKENTONEXTSTATE_TOKENTONEXTSTATEENTRY"]._serialized_end = 2265
_globals["_STATESTOTOKENS"]._serialized_start = 2268
_globals["_STATESTOTOKENS"]._serialized_end = 2519
_globals["_STATESTOTOKENS_STATESTOTOKENSENTRY"]._serialized_start = 2408
_globals["_STATESTOTOKENS_STATESTOTOKENSENTRY"]._serialized_end = 2487
_globals["_BRITON"]._serialized_start = 2692
_globals["_BRITON"]._serialized_end = 2768
_globals["_INFERENCEREQUEST"]._serialized_end = 1039
_globals["_INFERENCEANSWERPART"]._serialized_start = 1041
_globals["_INFERENCEANSWERPART"]._serialized_end = 1123
_globals["_BRITONCONFIG"]._serialized_start = 1126
_globals["_BRITONCONFIG"]._serialized_end = 2188
_globals["_BRITONCONFIG_BATCHSCHEDULERPOLICY"]._serialized_start = 1739
_globals["_BRITONCONFIG_BATCHSCHEDULERPOLICY"]._serialized_end = 1807
_globals["_TOKENTONEXTSTATE"]._serialized_start = 2191
_globals["_TOKENTONEXTSTATE"]._serialized_end = 2343
_globals["_TOKENTONEXTSTATE_TOKENTONEXTSTATEENTRY"]._serialized_start = 2288
_globals["_TOKENTONEXTSTATE_TOKENTONEXTSTATEENTRY"]._serialized_end = 2343
_globals["_STATESTOTOKENS"]._serialized_start = 2346
_globals["_STATESTOTOKENS"]._serialized_end = 2597
_globals["_STATESTOTOKENS_STATESTOTOKENSENTRY"]._serialized_start = 2486
_globals["_STATESTOTOKENS_STATESTOTOKENSENTRY"]._serialized_end = 2565
_globals["_BRITON"]._serialized_start = 2770
_globals["_BRITON"]._serialized_end = 2846
# @@protoc_insertion_point(module_scope)
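
The regenerated descriptor above adds two proto3 optional fields to InferenceRequest: tools_id (field 24, uint32) and force_tools (field 25, bool), alongside the existing output_schema_hash. Their exact semantics are not visible in this diff; presumably they carry the per-request tool-call state behind the optional and forced function calling named in the commit title. A minimal usage sketch, assuming the regenerated briton_pb2 module is on the import path and that the field meanings follow their names:

    # Sketch only: tools_id/force_tools semantics are inferred from their names,
    # not from code in this commit.
    import briton_pb2

    req = briton_pb2.InferenceRequest(
        request_id=1,
        input_text="What is the weather in Paris and Berlin?",
        request_output_len=256,
        tools_id=42,        # new optional uint32 field (number 24)
        force_tools=False,  # new optional bool field (number 25)
    )

    # proto3 optional fields carry explicit presence, so the server can tell
    # "not set" apart from a default value.
    assert req.HasField("tools_id")
    payload = req.SerializeToString()

Because both fields are declared optional, older clients that never set them continue to produce wire-compatible requests against the new server.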