From ce6ccf88e8f106a6a88b1946076139c011a8f5e8 Mon Sep 17 00:00:00 2001 From: Stephen Baione <109226581+stbaione@users.noreply.github.com> Date: Wed, 13 Nov 2024 10:15:31 -0600 Subject: [PATCH 1/3] Shortfin LLM Deviceid Support (#493) # Description Add the ability to specify device_ids that you want Shortfin LLM Server to run with. The setup is essentially 1-1 with how SD server sets device_ids support up. Created a new `shortfin/interop/support/device_setup.py` module and moved the `get_selected_devices` function there to be shared across `managers`. ## Example ```bash python -m shortfin_apps.llm.server --tokenizer_json=/data/llama3.1/8b/tokenizer.json --model_config=./export/edited_config.json --vmfb=./export/model.vmfb --parameters=/data/llama3.1/8b/llama8b_f16.irpa --device=amdgpu --device_ids=0 ``` --- .../shortfin/interop/support/device_setup.py | 26 +++++++++++++++++++ .../shortfin_apps/llm/components/manager.py | 15 ++++++++--- shortfin/python/shortfin_apps/llm/server.py | 21 +++++++++++++-- .../shortfin_apps/sd/components/manager.py | 26 +------------------ 4 files changed, 57 insertions(+), 31 deletions(-) create mode 100644 shortfin/python/shortfin/interop/support/device_setup.py diff --git a/shortfin/python/shortfin/interop/support/device_setup.py b/shortfin/python/shortfin/interop/support/device_setup.py new file mode 100644 index 000000000..afe6ca695 --- /dev/null +++ b/shortfin/python/shortfin/interop/support/device_setup.py @@ -0,0 +1,26 @@ +import shortfin as sf + + +def get_selected_devices(sb: sf.SystemBuilder, device_ids=None): + available = sb.available_devices + selected = [] + if device_ids is not None: + if len(device_ids) > len(available): + raise ValueError( + f"Requested more device ids ({device_ids}) than available ({available})." 
+ ) + for did in device_ids: + if isinstance(did, str): + try: + did = int(did) + except ValueError: + did = did + if did in available: + selected.append(did) + elif isinstance(did, int): + selected.append(available[did]) + else: + raise ValueError(f"Device id {did} could not be parsed.") + else: + selected = available + return selected diff --git a/shortfin/python/shortfin_apps/llm/components/manager.py b/shortfin/python/shortfin_apps/llm/components/manager.py index e3057de22..b44116b39 100644 --- a/shortfin/python/shortfin_apps/llm/components/manager.py +++ b/shortfin/python/shortfin_apps/llm/components/manager.py @@ -8,16 +8,23 @@ import threading import shortfin as sf +from shortfin.interop.support.device_setup import get_selected_devices logger = logging.getLogger(__name__) class SystemManager: - def __init__(self, device="local-task"): - if device == "local-task": + def __init__(self, device="local-task", device_ids=None, async_allocs=True): + if any(x in device for x in ["local-task", "cpu"]): self.ls = sf.host.CPUSystemBuilder().create_system() - elif device == "hip": - self.ls = sf.amdgpu.SystemBuilder().create_system() + elif any(x in device for x in ["hip", "amdgpu"]): + sb = sf.SystemBuilder( + system_type="amdgpu", amdgpu_async_allocations=async_allocs + ) + if device_ids: + sb.visible_devices = sb.available_devices + sb.visible_devices = get_selected_devices(sb, device_ids) + self.ls = sb.create_system() logger.info(f"Created local system with {self.ls.device_names} devices") # TODO: Come up with an easier bootstrap thing than manually # running a thread. diff --git a/shortfin/python/shortfin_apps/llm/server.py b/shortfin/python/shortfin_apps/llm/server.py index 5b51a9a7f..2ab7a1b96 100644 --- a/shortfin/python/shortfin_apps/llm/server.py +++ b/shortfin/python/shortfin_apps/llm/server.py @@ -86,7 +86,11 @@ def get_eos_from_tokenizer_config(json_path): def configure(args) -> SystemManager: # Setup system (configure devices, etc). 
- sysman = SystemManager(device=args.device) + sysman = SystemManager( + device=args.device, + device_ids=args.device_ids, + async_allocs=args.amdgpu_async_allocations, + ) # Setup each service we are hosting. eos_token = get_eos_from_tokenizer_config(args.tokenizer_config_json) @@ -155,9 +159,17 @@ def main(argv, log_config=uvicorn.config.LOGGING_CONFIG): parser.add_argument( "--device", type=str, - default="local-task", + required=True, + choices=["local-task", "hip", "amdgpu"], help="Device to serve on; e.g. local-task, hip. Same options as `iree-run-module --device` ", ) + parser.add_argument( + "--device_ids", + type=str, + nargs="*", + default=None, + help="Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a sf device id like amdgpu:0:0@0", + ) parser.add_argument( "--isolation", type=str, @@ -165,6 +177,11 @@ def main(argv, log_config=uvicorn.config.LOGGING_CONFIG): choices=[isolation.name.lower() for isolation in ProgramIsolation], help="Concurrency control -- How to isolate programs.", ) + parser.add_argument( + "--amdgpu_async_allocations", + action="store_true", + help="Enable asynchronous allocations for amdgpu device contexts.", + ) args = parser.parse_args(argv) if args.tokenizer_config_json is None: diff --git a/shortfin/python/shortfin_apps/sd/components/manager.py b/shortfin/python/shortfin_apps/sd/components/manager.py index 846c4ced6..b44116b39 100644 --- a/shortfin/python/shortfin_apps/sd/components/manager.py +++ b/shortfin/python/shortfin_apps/sd/components/manager.py @@ -8,35 +8,11 @@ import threading import shortfin as sf +from shortfin.interop.support.device_setup import get_selected_devices logger = logging.getLogger(__name__) -def get_selected_devices(sb: sf.SystemBuilder, device_ids=None): - available = sb.available_devices - selected = [] - if device_ids is not None: - if len(device_ids) >= len(available): - raise ValueError( - f"Requested more device ids ({device_ids}) than available 
({available})." - ) - for did in device_ids: - if isinstance(did, str): - try: - did = int(did) - except ValueError: - did = did - if did in available: - selected.append(did) - elif isinstance(did, int): - selected.append(available[did]) - else: - raise ValueError(f"Device id {did} could not be parsed.") - else: - selected = available - return selected - - class SystemManager: def __init__(self, device="local-task", device_ids=None, async_allocs=True): if any(x in device for x in ["local-task", "cpu"]): From 23bed174837e9ebe60e9b9f4ced3b9ccf9932c9c Mon Sep 17 00:00:00 2001 From: Marius Brehler Date: Wed, 13 Nov 2024 17:31:17 +0100 Subject: [PATCH 2/3] Adapt to package rename (#494) The packages were recently renamed from `iree-compiler` and `iree-runtime` to `iree-base-compiler` and `iree-base-runtime`, respectively. --- .github/workflows/ci-sharktank.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-sharktank.yml b/.github/workflows/ci-sharktank.yml index 6f359077a..4c660e6ee 100644 --- a/.github/workflows/ci-sharktank.yml +++ b/.github/workflows/ci-sharktank.yml @@ -63,7 +63,7 @@ jobs: # Update to the latest iree packages. 
pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-compiler iree-runtime --src deps \ + iree-base-compiler iree-base-runtime --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - name: Run sharktank tests From 7bd325388e72a187f7ecb07e95b15da0cf4fb384 Mon Sep 17 00:00:00 2001 From: Andrew Woloszyn Date: Wed, 13 Nov 2024 12:49:12 -0500 Subject: [PATCH 3/3] [shortfin] Fix the f-string for python 3.11 (#499) --- shortfin/python/shortfin_apps/sd/components/service.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/shortfin/python/shortfin_apps/sd/components/service.py b/shortfin/python/shortfin_apps/sd/components/service.py index a64013db0..1ee11569a 100644 --- a/shortfin/python/shortfin_apps/sd/components/service.py +++ b/shortfin/python/shortfin_apps/sd/components/service.py @@ -183,6 +183,8 @@ def __repr__(self): params = [ f" {key} : {value}" for key, value in self.inference_parameters.items() ] + # For python 3.11 since we can't have \ in the f"" expression. + new_line = "\n" return ( f"ServiceManager(" f"\n INFERENCE DEVICES : \n" @@ -193,9 +195,9 @@ def __repr__(self): f" fibers per device : {self.fibers_per_device}\n" f" program isolation mode : {self.prog_isolation}\n" f"\n INFERENCE MODULES : \n" - f"{'\n'.join(modules)}\n" + f"{new_line.join(modules)}\n" f"\n INFERENCE PARAMETERS : \n" - f"{'\n'.join(params)}\n" + f"{new_line.join(params)}\n" f")" )