diff --git a/.github/workflows/ci-sharktank.yml b/.github/workflows/ci-sharktank.yml index 6f359077a..4c660e6ee 100644 --- a/.github/workflows/ci-sharktank.yml +++ b/.github/workflows/ci-sharktank.yml @@ -63,7 +63,7 @@ jobs: # Update to the latest iree packages. pip install -f https://iree.dev/pip-release-links.html --upgrade \ - iree-compiler iree-runtime --src deps \ + iree-base-compiler iree-base-runtime --src deps \ -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine" - name: Run sharktank tests diff --git a/shortfin/python/shortfin/interop/support/device_setup.py b/shortfin/python/shortfin/interop/support/device_setup.py new file mode 100644 index 000000000..afe6ca695 --- /dev/null +++ b/shortfin/python/shortfin/interop/support/device_setup.py @@ -0,0 +1,26 @@ +import shortfin as sf + + +def get_selected_devices(sb: sf.SystemBuilder, device_ids=None): + available = sb.available_devices + selected = [] + if device_ids is not None: + if len(device_ids) > len(available): + raise ValueError( + f"Requested more device ids ({device_ids}) than available ({available})." + ) + for did in device_ids: + if isinstance(did, str): + try: + did = int(did) + except ValueError: + did = did + if did in available: + selected.append(did) + elif isinstance(did, int): + selected.append(available[did]) + else: + raise ValueError(f"Device id {did} could not be parsed.") + else: + selected = available + return selected diff --git a/shortfin/python/shortfin_apps/llm/components/manager.py b/shortfin/python/shortfin_apps/llm/components/manager.py index e3057de22..b44116b39 100644 --- a/shortfin/python/shortfin_apps/llm/components/manager.py +++ b/shortfin/python/shortfin_apps/llm/components/manager.py @@ -8,16 +8,23 @@ import threading import shortfin as sf +from shortfin.interop.support.device_setup import get_selected_devices logger = logging.getLogger(__name__) class SystemManager: - def __init__(self, device="local-task"): - if device == "local-task": + def __init__(self, device="local-task", device_ids=None, async_allocs=True): + if any(x in device for x in ["local-task", "cpu"]): self.ls = sf.host.CPUSystemBuilder().create_system() - elif device == "hip": - self.ls = sf.amdgpu.SystemBuilder().create_system() + elif any(x in device for x in ["hip", "amdgpu"]): + sb = sf.SystemBuilder( + system_type="amdgpu", amdgpu_async_allocations=async_allocs + ) + if device_ids: + sb.visible_devices = sb.available_devices + sb.visible_devices = get_selected_devices(sb, device_ids) + self.ls = sb.create_system() logger.info(f"Created local system with {self.ls.device_names} devices") # TODO: Come up with an easier bootstrap thing than manually # running a thread. diff --git a/shortfin/python/shortfin_apps/llm/server.py b/shortfin/python/shortfin_apps/llm/server.py index 5b51a9a7f..2ab7a1b96 100644 --- a/shortfin/python/shortfin_apps/llm/server.py +++ b/shortfin/python/shortfin_apps/llm/server.py @@ -86,7 +86,11 @@ def get_eos_from_tokenizer_config(json_path): def configure(args) -> SystemManager: # Setup system (configure devices, etc). - sysman = SystemManager(device=args.device) + sysman = SystemManager( + device=args.device, + device_ids=args.device_ids, + async_allocs=args.amdgpu_async_allocations, + ) # Setup each service we are hosting. eos_token = get_eos_from_tokenizer_config(args.tokenizer_config_json) @@ -155,9 +159,17 @@ def main(argv, log_config=uvicorn.config.LOGGING_CONFIG): parser.add_argument( "--device", type=str, - default="local-task", + required=True, + choices=["local-task", "hip", "amdgpu"], help="Device to serve on; e.g. local-task, hip. Same options as `iree-run-module --device` ", ) + parser.add_argument( + "--device_ids", + type=str, + nargs="*", + default=None, + help="Device IDs visible to the system builder. Defaults to None (full visibility). Can be an index or a sf device id like amdgpu:0:0@0", + ) parser.add_argument( "--isolation", type=str, @@ -165,6 +177,11 @@ def main(argv, log_config=uvicorn.config.LOGGING_CONFIG): choices=[isolation.name.lower() for isolation in ProgramIsolation], help="Concurrency control -- How to isolate programs.", ) + parser.add_argument( + "--amdgpu_async_allocations", + action="store_true", + help="Enable asynchronous allocations for amdgpu device contexts.", + ) args = parser.parse_args(argv) if args.tokenizer_config_json is None: diff --git a/shortfin/python/shortfin_apps/sd/components/manager.py b/shortfin/python/shortfin_apps/sd/components/manager.py index 2835f8027..ea29b69a4 100644 --- a/shortfin/python/shortfin_apps/sd/components/manager.py +++ b/shortfin/python/shortfin_apps/sd/components/manager.py @@ -8,35 +8,11 @@ import threading import shortfin as sf +from shortfin.interop.support.device_setup import get_selected_devices logger = logging.getLogger("shortfin-sd.manager") -def get_selected_devices(sb: sf.SystemBuilder, device_ids=None): - available = sb.available_devices - selected = [] - if device_ids is not None: - if len(device_ids) >= len(available): - raise ValueError( - f"Requested more device ids ({device_ids}) than available ({available})." - ) - for did in device_ids: - if isinstance(did, str): - try: - did = int(did) - except ValueError: - did = did - if did in available: - selected.append(did) - elif isinstance(did, int): - selected.append(available[did]) - else: - raise ValueError(f"Device id {did} could not be parsed.") - else: - selected = available - return selected - - class SystemManager: def __init__(self, device="local-task", device_ids=None, async_allocs=True): if any(x in device for x in ["local-task", "cpu"]): diff --git a/shortfin/python/shortfin_apps/sd/components/service.py b/shortfin/python/shortfin_apps/sd/components/service.py index be812a5c6..971bafd57 100644 --- a/shortfin/python/shortfin_apps/sd/components/service.py +++ b/shortfin/python/shortfin_apps/sd/components/service.py @@ -186,24 +186,22 @@ def __repr__(self): params = [ f" {key} : {value}" for key, value in self.inference_parameters.items() ] - mod_string = '\n'.join(modules) - params_string = '\n'.join(params) + # For python 3.11 since we can't have \ in the f"" expression. + new_line = "\n" return ( - f"ServiceManager(" + - "\n INFERENCE DEVICES : \n" + - f" {self.sysman.ls.devices}\n" + - "\n MODEL PARAMS : \n" + - f"{self.model_params}" + - "\n SERVICE PARAMS : \n" + - f" fibers per device : {self.fibers_per_device}" + - "\n" + - f" program isolation mode : {self.prog_isolation}" + - "\n" + - "\n INFERENCE MODULES : \n" + - mod_string + - "\n INFERENCE PARAMETERS : \n" + - params_string + - ")" + f"ServiceManager(" + f"\n INFERENCE DEVICES : \n" + f" {self.sysman.ls.devices}\n" + f"\n MODEL PARAMS : \n" + f"{self.model_params}" + f"\n SERVICE PARAMS : \n" + f" fibers per device : {self.fibers_per_device}\n" + f" program isolation mode : {self.prog_isolation}\n" + f"\n INFERENCE MODULES : \n" + f"{new_line.join(modules)}\n" + f"\n INFERENCE PARAMETERS : \n" + f"{new_line.join(params)}\n" + f")" )