From 1ebf125eff311db4219af9e7b1b1b1f73d0c10ee Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 8 Apr 2024 14:30:18 +0200 Subject: [PATCH 01/39] Problem: dbus call were not async dbus call to systemd were not called asyncronously. Modify all call to asynchronous, this required switching from python-dbus to dbus-fast which offer an asyncio backend. --- packaging/aleph-vm/DEBIAN/control | 2 +- pyproject.toml | 3 +- src/aleph/vm/pool.py | 4 +- src/aleph/vm/systemd.py | 230 ++++++++++++++++++++++++------ 4 files changed, 194 insertions(+), 45 deletions(-) diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control index 45aa6bd65..1f51a0990 100644 --- a/packaging/aleph-vm/DEBIAN/control +++ b/packaging/aleph-vm/DEBIAN/control @@ -3,6 +3,6 @@ Version: 0.1.8 Architecture: all Maintainer: Aleph.im Description: Aleph.im VM execution engine -Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables +Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus-fast,btrfs-progs,nftables Section: aleph-im Priority: Extra diff --git a/pyproject.toml b/pyproject.toml index cd803673e..71013adbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,7 @@ dependencies = [ "packaging==23.2", "jsonschema==4.19.1", "qmp==0.0.1", - "dbus-python==1.3.2", - "systemd-python==235", + "dbus-fast==1.90.1", "systemd-python==235", "superfluid~=0.2.1", "sqlalchemy[asyncio]>=2.0", diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 3e5c5f3ec..938a83726 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -124,7 +124,7 @@ async def create_a_vm( # Start VM and snapshots automatically if execution.persistent: - self.systemd_manager.enable_and_start(execution.controller_service) + await self.systemd_manager.enable_and_start(execution.controller_service) await execution.wait_for_init() if execution.is_program and execution.vm: await execution.vm.load_configuration() @@ -191,7 +191,7 @@ async def stop_vm(self, vm_hash: ItemHash) -> Optional[VmExecution]: async def stop_persistent_execution(self, execution: VmExecution): """Stop persistent VMs in the pool.""" assert execution.persistent, "Execution isn't persistent" - self.systemd_manager.stop_and_disable(execution.controller_service) + await self.systemd_manager.stop_and_disable(execution.controller_service) await execution.stop() def forget_vm(self, vm_hash: ItemHash) -> None: diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index 001c4671d..e72143f42 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -2,75 +2,225 @@ async SystemD Manager implementation. 
""" +import abc +import enum import logging +from typing import Literal -import dbus -from dbus import DBusException, SystemBus -from dbus.proxies import Interface +from dbus_fast import DBusError +from dbus_fast.aio import MessageBus, ProxyInterface, ProxyObject logger = logging.getLogger(__name__) +class UnitFileState(str, enum.Enum): + """This StrEnum class represents the different possible states of a unit file.""" + + ENABLED = "enabled" + """Indicates that a unit file is permanently enabled.""" + + ENABLED_RUNTIME = "enabled-runtime" + """Indicates the unit file is only temporarily enabled and will no longer be enabled after a reboot + (that means, it is enabled via /run/ symlinks, rather than /etc/).""" + + LINKED = "linked" + """Indicates that a unit is linked into /etc/ permanently.""" + + LINKED_RUNTIME = "linked-runtime" + """Indicates that a unit is linked into /run/ temporarily (until the next reboot).""" + + MASKED = "masked" + """Indicates that the unit file is masked permanently.""" + + MASKED_RUNTIME = "masked-runtime" + """Indicates that it is masked in /run/ temporarily (until the next reboot).""" + + STATIC = "static" + """Indicates that the unit is statically enabled, i.e. always enabled and doesn't need to be enabled explicitly.""" + + DISABLED = "disabled" + """Indicates that the unit file is not enabled.""" + + INVALID = "invalid" + """Indicates that it could not be determined whether the unit file is enabled.""" + + +UnitFileStateLiteral = Literal[ + "enabled", + "enabled-runtime", + "linked", + "linked-runtime", + "masked", + "masked-runtime", + "static", + "disabled", + "invalid", +] + + +class Mode(str, enum.Enum): + REPLACE = "replace" + FAIL = "fail" + ISOLATE = "isolate" + IGNORE_DEPENDENCIES = "ignore-dependencies" + IGNORE_REQUIREMENTS = "ignore-requirements" + + +class ActiveState(str, enum.Enum): + """ + ActiveState contains a state value that reflects the unit's current status. + """ + + ACTIVE = "active" + """ + The unit is active. + """ + + RELOADING = "reloading" + """ + The unit is active and reloading its configuration. + """ + + INACTIVE = "inactive" + """ + The unit is inactive, previous run was successful or hasn't yet occurred. + """ + + FAILED = "failed" + """ + The unit is inactive, previous run was unsuccessful. + """ + + ACTIVATING = "activating" + """ + The unit is transitioning from inactive to active state. + """ + + DEACTIVATING = "deactivating" + """ + The unit is in the process of deactivation. + """ + + +ActiveStateLiteral = Literal["active", "reloading", "inactive", "failed", "activating", "deactivating"] + + +class SystemdProxy(ProxyInterface, abc.ABC): + """ABC for typing. + + for description of methodsp + see https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#The%20Manager%20Object""" + + @abc.abstractmethod + async def call_enable_unit_files(self, files: list[str], runtime: bool, force: bool): + pass + + @abc.abstractmethod + async def call_get_unit_file_state(self, service) -> UnitFileStateLiteral: + pass + + @abc.abstractmethod + async def call_start_unit(self, name, mode): + pass + + @abc.abstractmethod + async def call_stop_unit(self, name, mode): + pass + + @abc.abstractmethod + async def call_restart_unit(self, name, mode): + pass + + @abc.abstractmethod + async def call_disable_unit_files(self, files: list[str], runtime: bool): + pass + + @abc.abstractmethod + async def call_get_unit(self, name: str) -> str: + pass + + +class UnitProxy(ProxyInterface, abc.ABC): + """ABC for typing. 
+ + for description of methods see + https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#Service%20Unit%20Objects""" + + @abc.abstractmethod + async def get_active_state(self) -> ActiveStateLiteral: + pass + + class SystemDManager: """SystemD Manager class. Used to manage the systemd services on the host on Linux. """ - bus: SystemBus - manager: Interface + bus: MessageBus + manager: SystemdProxy def __init__(self): - self.bus = dbus.SystemBus() - systemd = self.bus.get_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1") - self.manager = dbus.Interface(systemd, "org.freedesktop.systemd1.Manager") - - def stop_and_disable(self, service: str) -> None: - if self.is_service_active(service): - self.stop(service) - if self.is_service_enabled(service): - self.disable(service) - - def enable(self, service: str) -> None: - self.manager.EnableUnitFiles([service], False, True) + self.bus = MessageBus() + + async def connect(self): + await self.bus.connect() + path = "/org/freedesktop/systemd1" + bus_name = "org.freedesktop.systemd1" + introspect = await self.bus.introspect(bus_name, path) + systemd_proxy: ProxyObject = self.bus.get_proxy_object(bus_name, path, introspection=introspect) + # noinspection PyTypeChecker + self.manager = systemd_proxy.get_interface("org.freedesktop.systemd1.Manager") # type: ignore + + async def enable(self, service: str) -> None: + await self.manager.call_enable_unit_files([service], False, True) logger.debug(f"Enabled {service} service") - def start(self, service: str) -> None: - self.manager.StartUnit(service, "replace") + async def start(self, service: str) -> None: + await self.manager.call_start_unit(service, Mode.REPLACE) logger.debug(f"Started {service} service") - def stop(self, service: str) -> None: - self.manager.StopUnit(service, "replace") + async def stop(self, service: str) -> None: + await self.manager.call_stop_unit(service, Mode.REPLACE) logger.debug(f"Stopped {service} service") - def restart(self, service: str) -> None: - self.manager.RestartUnit(service, "replace") + async def restart(self, service: str) -> None: + await self.manager.call_restart_unit(service, Mode.REPLACE) logger.debug(f"Restarted {service} service") - def disable(self, service: str) -> None: - self.manager.DisableUnitFiles([service], False) + async def disable(self, service: str) -> None: + await self.manager.call_disable_unit_files([service], False) logger.debug(f"Disabled {service} service") - def is_service_enabled(self, service: str) -> bool: + async def is_service_enabled(self, service: str) -> bool: try: - return self.manager.GetUnitFileState(service) == "enabled" - except DBusException as error: + state = await self.manager.call_get_unit_file_state(service) + return state == UnitFileState.ENABLED + except DBusError as error: logger.error(error) return False - def is_service_active(self, service: str) -> bool: + async def is_service_active(self, service: str) -> bool: try: - systemd_service = self.bus.get_object("org.freedesktop.systemd1", object_path=self.manager.GetUnit(service)) - unit = dbus.Interface(systemd_service, "org.freedesktop.systemd1.Unit") - unit_properties = dbus.Interface(unit, "org.freedesktop.DBus.Properties") - active_state = unit_properties.Get("org.freedesktop.systemd1.Unit", "ActiveState") - return active_state == "active" - except DBusException as error: + path = await self.manager.call_get_unit(service) + bus_name = "org.freedesktop.systemd1" + introspect = await self.bus.introspect(bus_name, path) + 
systemd_service = self.bus.get_proxy_object(bus_name, path, introspection=introspect) + unit: UnitProxy = systemd_service.get_interface("org.freedesktop.systemd1.Unit") # type: ignore + active_state = await unit.get_active_state() + return active_state == ActiveState.ACTIVE + except DBusError as error: logger.error(error) return False - def enable_and_start(self, service: str) -> None: - if not self.is_service_enabled(service): - self.enable(service) - if not self.is_service_active(service): - self.start(service) + async def enable_and_start(self, service: str) -> None: + if not await self.is_service_enabled(service): + await self.enable(service) + if not await self.is_service_active(service): + await self.start(service) + + async def stop_and_disable(self, service: str) -> None: + if await self.is_service_active(service): + await self.stop(service) + if await self.is_service_enabled(service): + await self.disable(service) From 13e9e6d3f622d7a57e93a6329a242ac64f3844ae Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 8 Apr 2024 15:55:38 +0200 Subject: [PATCH 02/39] use mypy procotol method for typing --- src/aleph/vm/systemd.py | 49 +++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index e72143f42..e8909b5d4 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -5,7 +5,7 @@ import abc import enum import logging -from typing import Literal +from typing import Literal, Protocol, runtime_checkable from dbus_fast import DBusError from dbus_fast.aio import MessageBus, ProxyInterface, ProxyObject @@ -105,50 +105,37 @@ class ActiveState(str, enum.Enum): ActiveStateLiteral = Literal["active", "reloading", "inactive", "failed", "activating", "deactivating"] -class SystemdProxy(ProxyInterface, abc.ABC): +@runtime_checkable +class SystemdProxy(Protocol): """ABC for typing. for description of methodsp see https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#The%20Manager%20Object""" - @abc.abstractmethod - async def call_enable_unit_files(self, files: list[str], runtime: bool, force: bool): - pass + async def call_enable_unit_files(self, files: list[str], runtime: bool, force: bool): ... - @abc.abstractmethod - async def call_get_unit_file_state(self, service) -> UnitFileStateLiteral: - pass + async def call_get_unit_file_state(self, service) -> UnitFileStateLiteral: ... - @abc.abstractmethod async def call_start_unit(self, name, mode): pass - @abc.abstractmethod - async def call_stop_unit(self, name, mode): - pass + async def call_stop_unit(self, name, mode): ... - @abc.abstractmethod - async def call_restart_unit(self, name, mode): - pass + async def call_restart_unit(self, name, mode): ... - @abc.abstractmethod - async def call_disable_unit_files(self, files: list[str], runtime: bool): - pass + async def call_disable_unit_files(self, files: list[str], runtime: bool): ... - @abc.abstractmethod - async def call_get_unit(self, name: str) -> str: - pass + async def call_get_unit(self, name: str) -> str: ... -class UnitProxy(ProxyInterface, abc.ABC): - """ABC for typing. +@runtime_checkable +class UnitProxy(Protocol): + """for typing. for description of methods see https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#Service%20Unit%20Objects""" - @abc.abstractmethod - async def get_active_state(self) -> ActiveStateLiteral: - pass + async def get_active_state(self) -> ActiveStateLiteral: ... 
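
For readers who have not used dbus-fast before, here is a minimal, self-contained sketch (not part of the patch) of the pattern patches 01 and 02 adopt: connect an asyncio `MessageBus`, obtain the systemd Manager proxy, and type the dynamically generated `call_*` wrappers with a `runtime_checkable` Protocol. The unit name `nginx.service` and the standalone `main()` below are illustrative placeholders only.

```python
import asyncio
from typing import Protocol, runtime_checkable

from dbus_fast import BusType
from dbus_fast.aio import MessageBus


@runtime_checkable
class ManagerLike(Protocol):
    # dbus-fast generates snake_case ``call_<Method>`` coroutines on proxy interfaces.
    async def call_start_unit(self, name: str, mode: str): ...


async def main() -> None:
    # Connect to the system bus asynchronously (no blocking libdbus calls).
    bus = await MessageBus(bus_type=BusType.SYSTEM).connect()
    introspection = await bus.introspect("org.freedesktop.systemd1", "/org/freedesktop/systemd1")
    proxy = bus.get_proxy_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1", introspection)
    manager = proxy.get_interface("org.freedesktop.systemd1.Manager")
    # isinstance() against a runtime_checkable Protocol only checks that the
    # expected methods exist; it does not validate their signatures.
    assert isinstance(manager, ManagerLike)
    await manager.call_start_unit("nginx.service", "replace")  # placeholder unit name


asyncio.run(main())
```
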
class SystemDManager: @@ -169,8 +156,10 @@ async def connect(self): bus_name = "org.freedesktop.systemd1" introspect = await self.bus.introspect(bus_name, path) systemd_proxy: ProxyObject = self.bus.get_proxy_object(bus_name, path, introspection=introspect) - # noinspection PyTypeChecker - self.manager = systemd_proxy.get_interface("org.freedesktop.systemd1.Manager") # type: ignore + interface = systemd_proxy.get_interface("org.freedesktop.systemd1.Manager") + # Check required method are implemented + assert isinstance(interface, SystemdProxy) + self.manager = interface async def enable(self, service: str) -> None: await self.manager.call_enable_unit_files([service], False, True) @@ -206,7 +195,9 @@ async def is_service_active(self, service: str) -> bool: bus_name = "org.freedesktop.systemd1" introspect = await self.bus.introspect(bus_name, path) systemd_service = self.bus.get_proxy_object(bus_name, path, introspection=introspect) - unit: UnitProxy = systemd_service.get_interface("org.freedesktop.systemd1.Unit") # type: ignore + unit = systemd_service.get_interface("org.freedesktop.systemd1.Unit") + # Check required method are implemented + assert isinstance(unit, UnitProxy) active_state = await unit.get_active_state() return active_state == ActiveState.ACTIVE except DBusError as error: From 421515b67b335eb5bb45391dd7f4190eed1b3e37 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 11 Apr 2024 10:39:01 +0200 Subject: [PATCH 03/39] fix CI, specify bus_type --- src/aleph/vm/systemd.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index e8909b5d4..f761723ec 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -2,13 +2,12 @@ async SystemD Manager implementation. 
""" -import abc import enum import logging from typing import Literal, Protocol, runtime_checkable -from dbus_fast import DBusError -from dbus_fast.aio import MessageBus, ProxyInterface, ProxyObject +from dbus_fast import DBusError, BusType +from dbus_fast.aio import MessageBus, ProxyObject logger = logging.getLogger(__name__) @@ -148,7 +147,7 @@ class SystemDManager: manager: SystemdProxy def __init__(self): - self.bus = MessageBus() + self.bus = MessageBus(bus_type=BusType.SYSTEM) async def connect(self): await self.bus.connect() From 0c435477087df4a9d57aa4567de532ff5bf5f70b Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 11 Apr 2024 11:32:30 +0200 Subject: [PATCH 04/39] fix init in async, isort --- src/aleph/vm/systemd.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index f761723ec..0e23f781e 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -4,9 +4,9 @@ import enum import logging -from typing import Literal, Protocol, runtime_checkable +from typing import Literal, Optional, Protocol, runtime_checkable -from dbus_fast import DBusError, BusType +from dbus_fast import BusType, DBusError from dbus_fast.aio import MessageBus, ProxyObject logger = logging.getLogger(__name__) @@ -19,7 +19,7 @@ class UnitFileState(str, enum.Enum): """Indicates that a unit file is permanently enabled.""" ENABLED_RUNTIME = "enabled-runtime" - """Indicates the unit file is only temporarily enabled and will no longer be enabled after a reboot + """Indicates the unit file is only temporarily enabled and will no longer be enabled after a reboot (that means, it is enabled via /run/ symlinks, rather than /etc/).""" LINKED = "linked" @@ -143,13 +143,14 @@ class SystemDManager: Used to manage the systemd services on the host on Linux. 
""" - bus: MessageBus - manager: SystemdProxy + bus: Optional[MessageBus] + manager: Optional[SystemdProxy] def __init__(self): - self.bus = MessageBus(bus_type=BusType.SYSTEM) + pass async def connect(self): + self.bus = MessageBus(bus_type=BusType.SYSTEM) await self.bus.connect() path = "/org/freedesktop/systemd1" bus_name = "org.freedesktop.systemd1" From fc7d3da034922b67663872565f53478a766f1eec Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 11 Apr 2024 12:05:09 +0200 Subject: [PATCH 05/39] mypy --- src/aleph/vm/systemd.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index 0e23f781e..d66345af0 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -162,26 +162,32 @@ async def connect(self): self.manager = interface async def enable(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_enable_unit_files([service], False, True) logger.debug(f"Enabled {service} service") async def start(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_start_unit(service, Mode.REPLACE) logger.debug(f"Started {service} service") async def stop(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_stop_unit(service, Mode.REPLACE) logger.debug(f"Stopped {service} service") async def restart(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_restart_unit(service, Mode.REPLACE) logger.debug(f"Restarted {service} service") async def disable(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_disable_unit_files([service], False) logger.debug(f"Disabled {service} service") async def is_service_enabled(self, service: str) -> bool: + assert self.manager, "connect() not called" try: state = await self.manager.call_get_unit_file_state(service) return state == UnitFileState.ENABLED @@ -190,6 +196,8 @@ async def is_service_enabled(self, service: str) -> bool: return False async def is_service_active(self, service: str) -> bool: + assert self.manager, "connect() not called" + assert self.bus, "connect() not called" try: path = await self.manager.call_get_unit(service) bus_name = "org.freedesktop.systemd1" From f8c93433e969cf69d9284b716d8569dcc312d46f Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 11 Apr 2024 12:36:32 +0200 Subject: [PATCH 06/39] dbus fast is not in debian 11 --- packaging/Makefile | 2 +- packaging/aleph-vm/DEBIAN/control | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/Makefile b/packaging/Makefile index 43d8a0017..c9dd6ae85 100644 --- a/packaging/Makefile +++ b/packaging/Makefile @@ -15,7 +15,7 @@ debian-package-code: cp ../examples/instance_message_from_aleph.json ./aleph-vm/opt/aleph-vm/examples/instance_message_from_aleph.json cp -r ../examples/data ./aleph-vm/opt/aleph-vm/examples/data mkdir -p ./aleph-vm/opt/aleph-vm/examples/volumes - pip3 install --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.4.4' 'jwskate==0.8.0' 'eth-account==0.9.0' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'superfluid==0.2.1' 'sqlalchemy[asyncio]' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' + pip3 install --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.4.4' 'jwskate==0.8.0' 'eth-account==0.9.0' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'superfluid==0.2.1' 'sqlalchemy[asyncio]' 'aiosqlite==0.19.0' 'alembic==1.13.1' 
'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'dbus-fast==1.90.1' python3 -m compileall ./aleph-vm/opt/aleph-vm/ debian-package-resources: firecracker-bins vmlinux download-ipfs-kubo diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control index 1f51a0990..1df0a82f8 100644 --- a/packaging/aleph-vm/DEBIAN/control +++ b/packaging/aleph-vm/DEBIAN/control @@ -3,6 +3,6 @@ Version: 0.1.8 Architecture: all Maintainer: Aleph.im Description: Aleph.im VM execution engine -Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus-fast,btrfs-progs,nftables +Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,btrfs-progs,nftables Section: aleph-im Priority: Extra From afbdd0c784b0f85b8858ecd514113dbed5df4061 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 8 Apr 2024 14:30:18 +0200 Subject: [PATCH 07/39] Problem: dbus call were not async dbus call to systemd were not called asyncronously. Modify all call to asynchronous, this required switching from python-dbus to dbus-fast which offer an asyncio backend. --- packaging/aleph-vm/DEBIAN/control | 2 +- pyproject.toml | 3 +- src/aleph/vm/pool.py | 4 +- src/aleph/vm/systemd.py | 230 ++++++++++++++++++++++++------ 4 files changed, 194 insertions(+), 45 deletions(-) diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control index 45aa6bd65..1f51a0990 100644 --- a/packaging/aleph-vm/DEBIAN/control +++ b/packaging/aleph-vm/DEBIAN/control @@ -3,6 +3,6 @@ Version: 0.1.8 Architecture: all Maintainer: Aleph.im Description: Aleph.im VM execution engine -Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables +Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus-fast,btrfs-progs,nftables Section: aleph-im Priority: Extra diff --git a/pyproject.toml b/pyproject.toml index cd803673e..71013adbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -42,8 +42,7 @@ dependencies = [ "packaging==23.2", "jsonschema==4.19.1", "qmp==0.0.1", - "dbus-python==1.3.2", - "systemd-python==235", + "dbus-fast==1.90.1", "systemd-python==235", "superfluid~=0.2.1", 
"sqlalchemy[asyncio]>=2.0", diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 3e5c5f3ec..938a83726 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -124,7 +124,7 @@ async def create_a_vm( # Start VM and snapshots automatically if execution.persistent: - self.systemd_manager.enable_and_start(execution.controller_service) + await self.systemd_manager.enable_and_start(execution.controller_service) await execution.wait_for_init() if execution.is_program and execution.vm: await execution.vm.load_configuration() @@ -191,7 +191,7 @@ async def stop_vm(self, vm_hash: ItemHash) -> Optional[VmExecution]: async def stop_persistent_execution(self, execution: VmExecution): """Stop persistent VMs in the pool.""" assert execution.persistent, "Execution isn't persistent" - self.systemd_manager.stop_and_disable(execution.controller_service) + await self.systemd_manager.stop_and_disable(execution.controller_service) await execution.stop() def forget_vm(self, vm_hash: ItemHash) -> None: diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index 001c4671d..e72143f42 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -2,75 +2,225 @@ async SystemD Manager implementation. """ +import abc +import enum import logging +from typing import Literal -import dbus -from dbus import DBusException, SystemBus -from dbus.proxies import Interface +from dbus_fast import DBusError +from dbus_fast.aio import MessageBus, ProxyInterface, ProxyObject logger = logging.getLogger(__name__) +class UnitFileState(str, enum.Enum): + """This StrEnum class represents the different possible states of a unit file.""" + + ENABLED = "enabled" + """Indicates that a unit file is permanently enabled.""" + + ENABLED_RUNTIME = "enabled-runtime" + """Indicates the unit file is only temporarily enabled and will no longer be enabled after a reboot + (that means, it is enabled via /run/ symlinks, rather than /etc/).""" + + LINKED = "linked" + """Indicates that a unit is linked into /etc/ permanently.""" + + LINKED_RUNTIME = "linked-runtime" + """Indicates that a unit is linked into /run/ temporarily (until the next reboot).""" + + MASKED = "masked" + """Indicates that the unit file is masked permanently.""" + + MASKED_RUNTIME = "masked-runtime" + """Indicates that it is masked in /run/ temporarily (until the next reboot).""" + + STATIC = "static" + """Indicates that the unit is statically enabled, i.e. always enabled and doesn't need to be enabled explicitly.""" + + DISABLED = "disabled" + """Indicates that the unit file is not enabled.""" + + INVALID = "invalid" + """Indicates that it could not be determined whether the unit file is enabled.""" + + +UnitFileStateLiteral = Literal[ + "enabled", + "enabled-runtime", + "linked", + "linked-runtime", + "masked", + "masked-runtime", + "static", + "disabled", + "invalid", +] + + +class Mode(str, enum.Enum): + REPLACE = "replace" + FAIL = "fail" + ISOLATE = "isolate" + IGNORE_DEPENDENCIES = "ignore-dependencies" + IGNORE_REQUIREMENTS = "ignore-requirements" + + +class ActiveState(str, enum.Enum): + """ + ActiveState contains a state value that reflects the unit's current status. + """ + + ACTIVE = "active" + """ + The unit is active. + """ + + RELOADING = "reloading" + """ + The unit is active and reloading its configuration. + """ + + INACTIVE = "inactive" + """ + The unit is inactive, previous run was successful or hasn't yet occurred. + """ + + FAILED = "failed" + """ + The unit is inactive, previous run was unsuccessful. 
+ """ + + ACTIVATING = "activating" + """ + The unit is transitioning from inactive to active state. + """ + + DEACTIVATING = "deactivating" + """ + The unit is in the process of deactivation. + """ + + +ActiveStateLiteral = Literal["active", "reloading", "inactive", "failed", "activating", "deactivating"] + + +class SystemdProxy(ProxyInterface, abc.ABC): + """ABC for typing. + + for description of methodsp + see https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#The%20Manager%20Object""" + + @abc.abstractmethod + async def call_enable_unit_files(self, files: list[str], runtime: bool, force: bool): + pass + + @abc.abstractmethod + async def call_get_unit_file_state(self, service) -> UnitFileStateLiteral: + pass + + @abc.abstractmethod + async def call_start_unit(self, name, mode): + pass + + @abc.abstractmethod + async def call_stop_unit(self, name, mode): + pass + + @abc.abstractmethod + async def call_restart_unit(self, name, mode): + pass + + @abc.abstractmethod + async def call_disable_unit_files(self, files: list[str], runtime: bool): + pass + + @abc.abstractmethod + async def call_get_unit(self, name: str) -> str: + pass + + +class UnitProxy(ProxyInterface, abc.ABC): + """ABC for typing. + + for description of methods see + https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#Service%20Unit%20Objects""" + + @abc.abstractmethod + async def get_active_state(self) -> ActiveStateLiteral: + pass + + class SystemDManager: """SystemD Manager class. Used to manage the systemd services on the host on Linux. """ - bus: SystemBus - manager: Interface + bus: MessageBus + manager: SystemdProxy def __init__(self): - self.bus = dbus.SystemBus() - systemd = self.bus.get_object("org.freedesktop.systemd1", "/org/freedesktop/systemd1") - self.manager = dbus.Interface(systemd, "org.freedesktop.systemd1.Manager") - - def stop_and_disable(self, service: str) -> None: - if self.is_service_active(service): - self.stop(service) - if self.is_service_enabled(service): - self.disable(service) - - def enable(self, service: str) -> None: - self.manager.EnableUnitFiles([service], False, True) + self.bus = MessageBus() + + async def connect(self): + await self.bus.connect() + path = "/org/freedesktop/systemd1" + bus_name = "org.freedesktop.systemd1" + introspect = await self.bus.introspect(bus_name, path) + systemd_proxy: ProxyObject = self.bus.get_proxy_object(bus_name, path, introspection=introspect) + # noinspection PyTypeChecker + self.manager = systemd_proxy.get_interface("org.freedesktop.systemd1.Manager") # type: ignore + + async def enable(self, service: str) -> None: + await self.manager.call_enable_unit_files([service], False, True) logger.debug(f"Enabled {service} service") - def start(self, service: str) -> None: - self.manager.StartUnit(service, "replace") + async def start(self, service: str) -> None: + await self.manager.call_start_unit(service, Mode.REPLACE) logger.debug(f"Started {service} service") - def stop(self, service: str) -> None: - self.manager.StopUnit(service, "replace") + async def stop(self, service: str) -> None: + await self.manager.call_stop_unit(service, Mode.REPLACE) logger.debug(f"Stopped {service} service") - def restart(self, service: str) -> None: - self.manager.RestartUnit(service, "replace") + async def restart(self, service: str) -> None: + await self.manager.call_restart_unit(service, Mode.REPLACE) logger.debug(f"Restarted {service} service") - def disable(self, service: str) -> None: - 
self.manager.DisableUnitFiles([service], False) + async def disable(self, service: str) -> None: + await self.manager.call_disable_unit_files([service], False) logger.debug(f"Disabled {service} service") - def is_service_enabled(self, service: str) -> bool: + async def is_service_enabled(self, service: str) -> bool: try: - return self.manager.GetUnitFileState(service) == "enabled" - except DBusException as error: + state = await self.manager.call_get_unit_file_state(service) + return state == UnitFileState.ENABLED + except DBusError as error: logger.error(error) return False - def is_service_active(self, service: str) -> bool: + async def is_service_active(self, service: str) -> bool: try: - systemd_service = self.bus.get_object("org.freedesktop.systemd1", object_path=self.manager.GetUnit(service)) - unit = dbus.Interface(systemd_service, "org.freedesktop.systemd1.Unit") - unit_properties = dbus.Interface(unit, "org.freedesktop.DBus.Properties") - active_state = unit_properties.Get("org.freedesktop.systemd1.Unit", "ActiveState") - return active_state == "active" - except DBusException as error: + path = await self.manager.call_get_unit(service) + bus_name = "org.freedesktop.systemd1" + introspect = await self.bus.introspect(bus_name, path) + systemd_service = self.bus.get_proxy_object(bus_name, path, introspection=introspect) + unit: UnitProxy = systemd_service.get_interface("org.freedesktop.systemd1.Unit") # type: ignore + active_state = await unit.get_active_state() + return active_state == ActiveState.ACTIVE + except DBusError as error: logger.error(error) return False - def enable_and_start(self, service: str) -> None: - if not self.is_service_enabled(service): - self.enable(service) - if not self.is_service_active(service): - self.start(service) + async def enable_and_start(self, service: str) -> None: + if not await self.is_service_enabled(service): + await self.enable(service) + if not await self.is_service_active(service): + await self.start(service) + + async def stop_and_disable(self, service: str) -> None: + if await self.is_service_active(service): + await self.stop(service) + if await self.is_service_enabled(service): + await self.disable(service) From 2a50080e1891c4ce79c2c4060e07eeba9e7688b8 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 8 Apr 2024 15:55:38 +0200 Subject: [PATCH 08/39] use mypy procotol method for typing --- src/aleph/vm/systemd.py | 49 +++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 29 deletions(-) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index e72143f42..e8909b5d4 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -5,7 +5,7 @@ import abc import enum import logging -from typing import Literal +from typing import Literal, Protocol, runtime_checkable from dbus_fast import DBusError from dbus_fast.aio import MessageBus, ProxyInterface, ProxyObject @@ -105,50 +105,37 @@ class ActiveState(str, enum.Enum): ActiveStateLiteral = Literal["active", "reloading", "inactive", "failed", "activating", "deactivating"] -class SystemdProxy(ProxyInterface, abc.ABC): +@runtime_checkable +class SystemdProxy(Protocol): """ABC for typing. for description of methodsp see https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#The%20Manager%20Object""" - @abc.abstractmethod - async def call_enable_unit_files(self, files: list[str], runtime: bool, force: bool): - pass + async def call_enable_unit_files(self, files: list[str], runtime: bool, force: bool): ... 
- @abc.abstractmethod - async def call_get_unit_file_state(self, service) -> UnitFileStateLiteral: - pass + async def call_get_unit_file_state(self, service) -> UnitFileStateLiteral: ... - @abc.abstractmethod async def call_start_unit(self, name, mode): pass - @abc.abstractmethod - async def call_stop_unit(self, name, mode): - pass + async def call_stop_unit(self, name, mode): ... - @abc.abstractmethod - async def call_restart_unit(self, name, mode): - pass + async def call_restart_unit(self, name, mode): ... - @abc.abstractmethod - async def call_disable_unit_files(self, files: list[str], runtime: bool): - pass + async def call_disable_unit_files(self, files: list[str], runtime: bool): ... - @abc.abstractmethod - async def call_get_unit(self, name: str) -> str: - pass + async def call_get_unit(self, name: str) -> str: ... -class UnitProxy(ProxyInterface, abc.ABC): - """ABC for typing. +@runtime_checkable +class UnitProxy(Protocol): + """for typing. for description of methods see https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#Service%20Unit%20Objects""" - @abc.abstractmethod - async def get_active_state(self) -> ActiveStateLiteral: - pass + async def get_active_state(self) -> ActiveStateLiteral: ... class SystemDManager: @@ -169,8 +156,10 @@ async def connect(self): bus_name = "org.freedesktop.systemd1" introspect = await self.bus.introspect(bus_name, path) systemd_proxy: ProxyObject = self.bus.get_proxy_object(bus_name, path, introspection=introspect) - # noinspection PyTypeChecker - self.manager = systemd_proxy.get_interface("org.freedesktop.systemd1.Manager") # type: ignore + interface = systemd_proxy.get_interface("org.freedesktop.systemd1.Manager") + # Check required method are implemented + assert isinstance(interface, SystemdProxy) + self.manager = interface async def enable(self, service: str) -> None: await self.manager.call_enable_unit_files([service], False, True) @@ -206,7 +195,9 @@ async def is_service_active(self, service: str) -> bool: bus_name = "org.freedesktop.systemd1" introspect = await self.bus.introspect(bus_name, path) systemd_service = self.bus.get_proxy_object(bus_name, path, introspection=introspect) - unit: UnitProxy = systemd_service.get_interface("org.freedesktop.systemd1.Unit") # type: ignore + unit = systemd_service.get_interface("org.freedesktop.systemd1.Unit") + # Check required method are implemented + assert isinstance(unit, UnitProxy) active_state = await unit.get_active_state() return active_state == ActiveState.ACTIVE except DBusError as error: From a77cc0db53182c0dbc8efe52b6345c44c0f5fe3a Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 11 Apr 2024 10:39:01 +0200 Subject: [PATCH 09/39] fix CI, specify bus_type --- src/aleph/vm/systemd.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index e8909b5d4..f761723ec 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -2,13 +2,12 @@ async SystemD Manager implementation. 
""" -import abc import enum import logging from typing import Literal, Protocol, runtime_checkable -from dbus_fast import DBusError -from dbus_fast.aio import MessageBus, ProxyInterface, ProxyObject +from dbus_fast import DBusError, BusType +from dbus_fast.aio import MessageBus, ProxyObject logger = logging.getLogger(__name__) @@ -148,7 +147,7 @@ class SystemDManager: manager: SystemdProxy def __init__(self): - self.bus = MessageBus() + self.bus = MessageBus(bus_type=BusType.SYSTEM) async def connect(self): await self.bus.connect() From 56f43deb3aef595c37324900e1adac2d2fc53efd Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 11 Apr 2024 11:32:30 +0200 Subject: [PATCH 10/39] fix init in async, isort --- src/aleph/vm/systemd.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index f761723ec..0e23f781e 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -4,9 +4,9 @@ import enum import logging -from typing import Literal, Protocol, runtime_checkable +from typing import Literal, Optional, Protocol, runtime_checkable -from dbus_fast import DBusError, BusType +from dbus_fast import BusType, DBusError from dbus_fast.aio import MessageBus, ProxyObject logger = logging.getLogger(__name__) @@ -19,7 +19,7 @@ class UnitFileState(str, enum.Enum): """Indicates that a unit file is permanently enabled.""" ENABLED_RUNTIME = "enabled-runtime" - """Indicates the unit file is only temporarily enabled and will no longer be enabled after a reboot + """Indicates the unit file is only temporarily enabled and will no longer be enabled after a reboot (that means, it is enabled via /run/ symlinks, rather than /etc/).""" LINKED = "linked" @@ -143,13 +143,14 @@ class SystemDManager: Used to manage the systemd services on the host on Linux. 
""" - bus: MessageBus - manager: SystemdProxy + bus: Optional[MessageBus] + manager: Optional[SystemdProxy] def __init__(self): - self.bus = MessageBus(bus_type=BusType.SYSTEM) + pass async def connect(self): + self.bus = MessageBus(bus_type=BusType.SYSTEM) await self.bus.connect() path = "/org/freedesktop/systemd1" bus_name = "org.freedesktop.systemd1" From ee3cfbb5d3692ed7266a2e1ca87267a14c196ed5 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 11 Apr 2024 12:05:09 +0200 Subject: [PATCH 11/39] mypy --- src/aleph/vm/systemd.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index 0e23f781e..d66345af0 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -162,26 +162,32 @@ async def connect(self): self.manager = interface async def enable(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_enable_unit_files([service], False, True) logger.debug(f"Enabled {service} service") async def start(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_start_unit(service, Mode.REPLACE) logger.debug(f"Started {service} service") async def stop(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_stop_unit(service, Mode.REPLACE) logger.debug(f"Stopped {service} service") async def restart(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_restart_unit(service, Mode.REPLACE) logger.debug(f"Restarted {service} service") async def disable(self, service: str) -> None: + assert self.manager, "connect() not called" await self.manager.call_disable_unit_files([service], False) logger.debug(f"Disabled {service} service") async def is_service_enabled(self, service: str) -> bool: + assert self.manager, "connect() not called" try: state = await self.manager.call_get_unit_file_state(service) return state == UnitFileState.ENABLED @@ -190,6 +196,8 @@ async def is_service_enabled(self, service: str) -> bool: return False async def is_service_active(self, service: str) -> bool: + assert self.manager, "connect() not called" + assert self.bus, "connect() not called" try: path = await self.manager.call_get_unit(service) bus_name = "org.freedesktop.systemd1" From dc961fc1379f054714199d7ada8b401af7e9126d Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 11 Apr 2024 12:36:32 +0200 Subject: [PATCH 12/39] dbus fast is not in debian 11 --- packaging/Makefile | 2 +- packaging/aleph-vm/DEBIAN/control | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/packaging/Makefile b/packaging/Makefile index 43d8a0017..c9dd6ae85 100644 --- a/packaging/Makefile +++ b/packaging/Makefile @@ -15,7 +15,7 @@ debian-package-code: cp ../examples/instance_message_from_aleph.json ./aleph-vm/opt/aleph-vm/examples/instance_message_from_aleph.json cp -r ../examples/data ./aleph-vm/opt/aleph-vm/examples/data mkdir -p ./aleph-vm/opt/aleph-vm/examples/volumes - pip3 install --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.4.4' 'jwskate==0.8.0' 'eth-account==0.9.0' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'superfluid==0.2.1' 'sqlalchemy[asyncio]' 'aiosqlite==0.19.0' 'alembic==1.13.1' 'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' + pip3 install --target ./aleph-vm/opt/aleph-vm/ 'aleph-message==0.4.4' 'jwskate==0.8.0' 'eth-account==0.9.0' 'sentry-sdk==1.31.0' 'qmp==1.1.0' 'superfluid==0.2.1' 'sqlalchemy[asyncio]' 'aiosqlite==0.19.0' 'alembic==1.13.1' 
'aiohttp_cors==0.7.0' 'pyroute2==0.7.12' 'dbus-fast==1.90.1'
 	python3 -m compileall ./aleph-vm/opt/aleph-vm/
 
 debian-package-resources: firecracker-bins vmlinux download-ipfs-kubo
diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control
index 1f51a0990..1df0a82f8 100644
--- a/packaging/aleph-vm/DEBIAN/control
+++ b/packaging/aleph-vm/DEBIAN/control
@@ -3,6 +3,6 @@ Version: 0.1.8
 Architecture: all
 Maintainer: Aleph.im
 Description: Aleph.im VM execution engine
-Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus-fast,btrfs-progs,nftables
+Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,btrfs-progs,nftables
 Section: aleph-im
 Priority: Extra

From 4ac90999699c384226b47f37a9e8d8a2a295539b Mon Sep 17 00:00:00 2001
From: Olivier Le Thanh Duong
Date: Tue, 16 Apr 2024 12:05:47 +0200
Subject: [PATCH 13/39] Problem: Makefiles for publishing examples were not working

'aleph program' now needs an 'upload' argument.

Solution: Update makefiles and documentation
---
 examples/example_http_js/Makefile   | 2 +-
 examples/example_http_rust/Makefile | 2 +-
 tutorials/REQUIREMENTS.md           | 2 +-
 tutorials/SERVER.md                 | 4 ++--
 4 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/examples/example_http_js/Makefile b/examples/example_http_js/Makefile
index 3b2ac89e8..6c43a3f06 100644
--- a/examples/example_http_js/Makefile
+++ b/examples/example_http_js/Makefile
@@ -16,4 +16,4 @@ docker-publish:
 
 publish:
 	chmod +x ./src/run.sh
-	aleph program ./src "run.sh"
+	aleph program upload ./src "run.sh"
diff --git a/examples/example_http_rust/Makefile b/examples/example_http_rust/Makefile
index 0f82bdd02..dbf618cd9 100644
--- a/examples/example_http_rust/Makefile
+++ b/examples/example_http_rust/Makefile
@@ -15,4 +15,4 @@ publish:
 	cargo build --release
 	mkdir -p ./dist
 	cp target/release/example_http_rust ./dist/
-	aleph program ./dist example_http_rust
+	aleph program upload ./dist example_http_rust
diff --git a/tutorials/REQUIREMENTS.md b/tutorials/REQUIREMENTS.md
index 905ddfbbd..6a87fe359 100644
--- a/tutorials/REQUIREMENTS.md
+++ b/tutorials/REQUIREMENTS.md
@@ -89,7 +89,7 @@ aleph pin QmWWX6BaaRkRSr2iNdwH5e29ACPg2nCHHXTRTfuBmVm3Ga
 ## 3. Create your program
 
 ```shell
-aleph program ./my-program main:app
+aleph program upload ./my-program main:app
 ```
 
 Press Enter at the following prompt to use the default runtime:
diff --git a/tutorials/SERVER.md b/tutorials/SERVER.md
index c34548364..2dcff54e5 100644
--- a/tutorials/SERVER.md
+++ b/tutorials/SERVER.md
@@ -88,9 +88,9 @@ cargo build --release
 
 Publish it on Aleph using the same procedure as with the Python example, except the entrypoint refers to the name of the binary to execute.
```shell -aleph program ./target/release/example_http_rust example_http_rust +aleph program upload ./target/release/example_http_rust example_http_rust ``` If your program takes some arguments, pass them in the entrypoint by using quotes: `"example_http_rust --help`. -ℹ️ If you get the error `Invalid zip archive`, you are probably missing the Squashfs user tool `mksquashfs`. In that case, first create the squashfs archive and then upload it using `aleph program ./target/release/example_http_rust.squashfs example_http_rust` +ℹ️ If you get the error `Invalid zip archive`, you are probably missing the Squashfs user tool `mksquashfs`. In that case, first create the squashfs archive and then upload it using `aleph program upload ./target/release/example_http_rust.squashfs example_http_rust` From 18bb56f95d240fb7db68b70914fb02c0c3fa4dc9 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 25 Apr 2024 15:26:37 +0200 Subject: [PATCH 14/39] Problem: could not start Instances from command line (#597) Problem: could not start Instances from command line Problem happened when launching with --run-fake-instance Solution: Adapt to new VMPool API that take a loop Also fix benchmarks function --- src/aleph/vm/orchestrator/cli.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/src/aleph/vm/orchestrator/cli.py b/src/aleph/vm/orchestrator/cli.py index 22bd44147..65b290ba2 100644 --- a/src/aleph/vm/orchestrator/cli.py +++ b/src/aleph/vm/orchestrator/cli.py @@ -17,6 +17,7 @@ from sqlalchemy.ext.asyncio import create_async_engine from aleph.vm.conf import ALLOW_DEVELOPER_SSH_KEYS, make_db_url, settings +from aleph.vm.models import VmExecution from aleph.vm.pool import VmPool from aleph.vm.version import get_version_from_apt, get_version_from_git @@ -187,7 +188,8 @@ async def fake_read() -> bytes: bench: list[float] = [] - pool = VmPool() + loop = asyncio.get_event_loop() + pool = VmPool(loop) pool.setup() # Does not make sense in benchmarks @@ -236,25 +238,24 @@ async def fake_read() -> bytes: print("Event result", result) -async def start_instance(item_hash: ItemHash) -> None: +async def start_instance(item_hash: ItemHash, pubsub: Optional[PubSub], pool) -> VmExecution: """Run an instance from an InstanceMessage.""" - pool = VmPool() + return await start_persistent_vm(item_hash, pubsub, pool) + +async def run_instances(instances: list[ItemHash]) -> None: + """Run instances from a list of message identifiers.""" + logger.info(f"Instances to run: {instances}") + loop = asyncio.get_event_loop() + pool = VmPool(loop) # The main program uses a singleton pubsub instance in order to watch for updates. # We create another instance here since that singleton is not initialized yet. # Watching for updates on this instance will therefore not work. pubsub: Optional[PubSub] = None - await start_persistent_vm(item_hash, pubsub, pool) - - -async def run_instances(instances: list[ItemHash]) -> None: - """Run instances from a list of message identifiers.""" - logger.info(f"Instances to run: {instances}") + await asyncio.gather(*[start_instance(instance_id, pubsub, pool) for instance_id in instances]) - await asyncio.gather(*[start_instance(item_hash=instance_id) for instance_id in instances]) await asyncio.Event().wait() # wait forever - # TODO : should we really wait forever? 
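
As an aside, here is a stripped-down sketch of the start-up pattern this patch moves `run_instances()` to: one pool shared by all instances, every start coroutine gathered concurrently, then the process parks on an `asyncio.Event` that is never set. `FakePool` and `start_one` are stand-ins for illustration, not aleph-vm APIs.

```python
import asyncio


class FakePool:
    """Stand-in for VmPool: created once, bound to the running loop, shared by all starts."""

    def __init__(self, loop: asyncio.AbstractEventLoop) -> None:
        self.loop = loop


async def start_one(pool: FakePool, item_hash: str) -> str:
    await asyncio.sleep(0)  # pretend to boot a VM here
    return item_hash


async def run_all(hashes: list[str]) -> None:
    pool = FakePool(asyncio.get_running_loop())
    await asyncio.gather(*(start_one(pool, h) for h in hashes))
    await asyncio.Event().wait()  # block forever, keeping the started VMs supervised


# asyncio.run(run_all(["hash-1", "hash-2"]))  # never returns by design
```
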
@contextlib.contextmanager From 84614a5af14caf165c9f76136b32471fac504334 Mon Sep 17 00:00:00 2001 From: nesitor Date: Fri, 26 Apr 2024 10:14:18 +0200 Subject: [PATCH 15/39] Solve last CORS issues about duplicated headers (#604) Fix: Solve last CORS errors raised cause by duplication of headers returned. --- src/aleph/vm/orchestrator/resources.py | 4 +++- src/aleph/vm/orchestrator/supervisor.py | 13 ------------- src/aleph/vm/orchestrator/views/__init__.py | 18 +++++------------- .../vm/orchestrator/views/authentication.py | 2 -- 4 files changed, 8 insertions(+), 29 deletions(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 6c042f056..a40c6ff13 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -11,6 +11,7 @@ from pydantic import BaseModel, Field from aleph.vm.conf import settings +from aleph.vm.utils import cors_allow_all class Period(BaseModel): @@ -92,6 +93,7 @@ def get_machine_properties() -> MachineProperties: ) +@cors_allow_all async def about_system_usage(_: web.Request): """Public endpoint to expose information about the system usage.""" period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0) @@ -116,7 +118,7 @@ async def about_system_usage(_: web.Request): ), properties=get_machine_properties(), ) - return web.json_response(text=usage.json(exclude_none=True), headers={"Access-Control-Allow-Origin:": "*"}) + return web.json_response(text=usage.json(exclude_none=True)) class Allocation(BaseModel): diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 9b2c3c1c1..4846104ae 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -69,19 +69,6 @@ async def server_version_middleware( return resp -async def allow_cors_on_endpoint(request: web.Request): - """Allow CORS on endpoints that VM owners use to control their machine.""" - return web.Response( - status=200, - headers={ - "Access-Control-Allow-Headers": "*", - "Access-Control-Allow-Methods": "*", - "Access-Control-Allow-Origin": "*", - "Allow": "POST", - }, - ) - - async def http_not_found(request: web.Request): """Return a 404 error for unknown URLs.""" return web.HTTPNotFound() diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index 7c1fd370e..994476cba 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -214,13 +214,9 @@ async def status_check_fastapi(request: web.Request, vm_id: Optional[ItemHash] = # "ipv6": await status.check_ipv6(session), } - return web.json_response( - result, status=200 if all(result.values()) else 503, headers={"Access-Control-Allow-Origin": "*"} - ) + return web.json_response(result, status=200 if all(result.values()) else 503) except aiohttp.ServerDisconnectedError as error: - return web.json_response( - {"error": f"Server disconnected: {error}"}, status=503, headers={"Access-Control-Allow-Origin": "*"} - ) + return web.json_response({"error": f"Server disconnected: {error}"}, status=503) @cors_allow_all @@ -246,7 +242,7 @@ async def status_check_host(request: web.Request): }, } result_status = 200 if all(result["ipv4"].values()) and all(result["ipv6"].values()) else 503 - return web.json_response(result, status=result_status, headers={"Access-Control-Allow-Origin": "*"}) + return web.json_response(result, status=result_status) @cors_allow_all @@ -260,7 +256,7 @@ async def 
status_check_ipv6(request: web.Request): vm_ipv6 = False result = {"host": await check_host_egress_ipv6(), "vm": vm_ipv6} - return web.json_response(result, headers={"Access-Control-Allow-Origin": "*"}) + return web.json_response(result) @cors_allow_all @@ -283,7 +279,6 @@ async def status_check_version(request: web.Request): return web.Response( status=200, text=f"Up-to-date: version {current} >= {reference}", - headers={"Access-Control-Allow-Origin": "*"}, ) else: return web.HTTPForbidden(text=f"Outdated: version {current} < {reference}") @@ -327,7 +322,6 @@ async def status_public_config(request: web.Request): }, }, dumps=dumps_for_json, - headers={"Access-Control-Allow-Origin": "*"}, ) @@ -436,9 +430,7 @@ async def notify_allocation(request: web.Request): except JSONDecodeError: return web.HTTPBadRequest(reason="Body is not valid JSON") except ValidationError as error: - return web.json_response( - data=error.json(), status=web.HTTPBadRequest.status_code, headers={"Access-Control-Allow-Origin": "*"} - ) + return web.json_response(data=error.json(), status=web.HTTPBadRequest.status_code) pubsub: PubSub = request.app["pubsub"] pool: VmPool = request.app["vm_pool"] diff --git a/src/aleph/vm/orchestrator/views/authentication.py b/src/aleph/vm/orchestrator/views/authentication.py index 84dd96982..d38587015 100644 --- a/src/aleph/vm/orchestrator/views/authentication.py +++ b/src/aleph/vm/orchestrator/views/authentication.py @@ -227,8 +227,6 @@ async def wrapper(request): return web.json_response(data={"error": e.reason}, status=e.status) response = await handler(request, authenticated_sender) - # Allow browser clients to access the body of the response - response.headers.update({"Access-Control-Allow-Origin": request.headers.get("Origin", "")}) return response return wrapper From 5a01c4266dd01c3101f59274eb617c2d818ea558 Mon Sep 17 00:00:00 2001 From: Hugo Herter Date: Fri, 26 Apr 2024 13:00:35 +0200 Subject: [PATCH 16/39] Fix: Diagnostic API was not updated We published multiple changes to the diagnostic VM recently but none of these was released. 
This provides a new diagnostic VM, based on a new runtime [1], with fixes: - Reading messages with the newer SDK - Better handling of IPv6 detection errors - Two different tests for signing messages (local and remote) - aleph-message version was not specified - fetching a single message was not tested --- .github/workflows/test-on-droplets-matrix.yml | 5 +- examples/example_fastapi/README.md | 6 + examples/example_fastapi/main.py | 187 ++++++++++++++---- .../create_disk_image.sh | 2 +- src/aleph/vm/conf.py | 2 +- src/aleph/vm/orchestrator/run.py | 1 + src/aleph/vm/orchestrator/status.py | 58 +++++- src/aleph/vm/orchestrator/views/__init__.py | 4 + 8 files changed, 224 insertions(+), 41 deletions(-) create mode 100644 examples/example_fastapi/README.md diff --git a/.github/workflows/test-on-droplets-matrix.yml b/.github/workflows/test-on-droplets-matrix.yml index c9563ab82..c67c1688f 100644 --- a/.github/workflows/test-on-droplets-matrix.yml +++ b/.github/workflows/test-on-droplets-matrix.yml @@ -134,8 +134,11 @@ jobs: - alias: "runtime-6770" # Old runtime, using Debian 11 item_hash: "67705389842a0a1b95eaa408b009741027964edc805997475e95c505d642edd8" query_params: "?retro-compatibility=true" - - alias: "runtime-3fc0" # New runtime, using Debian 12 + - alias: "runtime-3fc0" # Newer runtime, using Debian 12 but now old SDK item_hash: "3fc0aa9569da840c43e7bd2033c3c580abb46b007527d6d20f2d4e98e867f7af" + query_params: "?retro-compatibility=true" + - alias: "runtime-63fa" # Latest runtime, using Debian 12 and SDK 0.9.0 + item_hash: "63faf8b5db1cf8d965e6a464a0cb8062af8e7df131729e48738342d956f29ace" query_params: "" steps: diff --git a/examples/example_fastapi/README.md b/examples/example_fastapi/README.md new file mode 100644 index 000000000..231ce255b --- /dev/null +++ b/examples/example_fastapi/README.md @@ -0,0 +1,6 @@ +Publish using: + +```shell + aleph program upload ../aleph-vm/examples/example_fastapi main:app \ + --persistent-volume "persistence=host,size_mib=1,mount=/var/lib/example,name=increment-storage,comment=Persistence" +``` diff --git a/examples/example_fastapi/main.py b/examples/example_fastapi/main.py index ebe1a8bd0..81055c723 100644 --- a/examples/example_fastapi/main.py +++ b/examples/example_fastapi/main.py @@ -5,12 +5,19 @@ import socket import subprocess import sys -from datetime import datetime +from datetime import datetime, timezone from os import listdir from pathlib import Path -from typing import List, Optional +from typing import Any, Optional import aiohttp +from aleph_message.models import ( + MessagesResponse, + PostMessage, + ProgramMessage, + StoreMessage, +) +from aleph_message.status import MessageStatus from fastapi import FastAPI from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import PlainTextResponse @@ -18,8 +25,10 @@ from pydantic import BaseModel, HttpUrl from starlette.responses import JSONResponse +from aleph.sdk.chains.ethereum import get_fallback_account from aleph.sdk.chains.remote import RemoteAccount -from aleph.sdk.client import AlephClient, AuthenticatedAlephClient +from aleph.sdk.client import AlephHttpClient, AuthenticatedAlephHttpClient +from aleph.sdk.query.filters import MessageFilter from aleph.sdk.types import StorageEnum from aleph.sdk.vm.app import AlephApp from aleph.sdk.vm.cache import VmCache @@ -42,13 +51,13 @@ @app.on_event("startup") -async def startup_event(): +async def startup_event() -> None: global startup_lifespan_executed startup_lifespan_executed = True @app.get("/") -async def index(): +async def 
index() -> dict[str, Any]: if os.path.exists("/opt/venv"): opt_venv = list(listdir("/opt/venv")) else: @@ -56,16 +65,33 @@ async def index(): return { "Example": "example_fastapi", "endpoints": [ + # Features + "/lifespan", "/environ", - "/messages", + "/state/increment", + "/wait-for/{delay}", + # Local cache + "/cache/get/{key}", + "/cache/set/{key}/{value}", + "/cache/remove/{key}", + "/cache/keys", + # Networking "/dns", - "ip/address", + "/ip/address", "/ip/4", "/ip/6", "/internet", + # Error handling + "/raise", + "/crash", + # Aleph.im + "/messages", + "/get_a_message", "/post_a_message", - "/state/increment", - "/wait-for/{delay}", + "/post_a_message_local_account", + "/post_a_file", + "/sign_a_message", + # Platform properties "/platform/os", "/platform/python", "/platform/pip-freeze", @@ -91,10 +117,11 @@ async def environ() -> dict[str, str]: @app.get("/messages") -async def read_aleph_messages(): +async def read_aleph_messages() -> dict[str, MessagesResponse]: """Read data from Aleph using the Aleph Client library.""" - async with AlephClient() as client: - data = await client.get_messages(hashes=["f246f873c3e0f637a15c566e7a465d2ecbb83eaa024d54ccb8fb566b549a929e"]) + async with AlephHttpClient() as client: + message_filter = MessageFilter(hashes=["f246f873c3e0f637a15c566e7a465d2ecbb83eaa024d54ccb8fb566b549a929e"]) + data = await client.get_messages(message_filter=message_filter) return {"Messages": data} @@ -163,9 +190,13 @@ async def connect_ipv6(): if resp.status != 404: resp.raise_for_status() return {"result": True, "headers": resp.headers} - except aiohttp.ClientTimeout: - logger.warning(f"Session connection for host {ipv6_host} failed") - return {"result": False, "headers": resp.headers} + except TimeoutError: + logger.warning(f"Session connection to host {ipv6_host} timed out") + return {"result": False, "reason": "Timeout"} + except aiohttp.ClientConnectionError as error: + logger.warning(f"Client connection to host {ipv6_host} failed: {error}") + # Get a string that describes the error + return {"result": False, "reason": str(error.args[0])} async def check_url(internet_host: HttpUrl, timeout_seconds: int = 5): @@ -184,7 +215,7 @@ async def check_url(internet_host: HttpUrl, timeout_seconds: int = 5): @app.get("/internet") async def read_internet(): """Check Internet connectivity of the system, requiring IP connectivity, domain resolution and HTTPS/TLS.""" - internet_hosts: List[HttpUrl] = [ + internet_hosts: list[HttpUrl] = [ HttpUrl(url="https://aleph.im/", scheme="https"), HttpUrl(url="https://ethereum.org", scheme="https"), HttpUrl(url="https://ipfs.io/", scheme="https"), @@ -192,7 +223,7 @@ async def read_internet(): timeout_seconds = 5 # Create a list of tasks to check the URLs in parallel - tasks: set[asyncio.Task] = set(asyncio.create_task(check_url(host, timeout_seconds)) for host in internet_hosts) + tasks: set[asyncio.Task] = {asyncio.create_task(check_url(host, timeout_seconds)) for host in internet_hosts} # While no tasks have completed, keep waiting for the next one to finish while tasks: @@ -211,34 +242,121 @@ async def read_internet(): return {"result": False} -@app.get("/post_a_message") -async def post_a_message(): - """Post a message on the Aleph network""" +@app.get("/get_a_message") +async def get_a_message(): + """Get a message from the Aleph.im network""" + item_hash = "3fc0aa9569da840c43e7bd2033c3c580abb46b007527d6d20f2d4e98e867f7af" + async with AlephHttpClient() as client: + message = await client.get_message( + item_hash=item_hash, + 
message_type=ProgramMessage, + ) + return message.dict() - account = await RemoteAccount.from_crypto_host(host="http://localhost", unix_socket="/tmp/socat-socket") + +@app.post("/post_a_message") +async def post_with_remote_account(): + """Post a message on the Aleph.im network using the remote account of the host.""" + try: + account = await RemoteAccount.from_crypto_host(host="http://localhost", unix_socket="/tmp/socat-socket") + + content = { + "date": datetime.now(tz=timezone.utc).isoformat(), + "test": True, + "answer": 42, + "something": "interesting", + } + async with AuthenticatedAlephHttpClient( + account=account, + ) as client: + message: PostMessage + status: MessageStatus + message, status = await client.create_post( + post_content=content, + post_type="test", + ref=None, + channel="TEST", + inline=True, + storage_engine=StorageEnum.storage, + sync=True, + ) + if status != MessageStatus.PROCESSED: + return JSONResponse(status_code=500, content={"error": status}) + return { + "message": message, + } + except aiohttp.client_exceptions.UnixClientConnectorError: + return JSONResponse(status_code=500, content={"error": "Could not connect to the remote account"}) + + +@app.post("/post_a_message_local_account") +async def post_with_local_account(): + """Post a message on the Aleph.im network using a local private key.""" + + account = get_fallback_account() content = { - "date": datetime.utcnow().isoformat(), + "date": datetime.now(tz=timezone.utc).isoformat(), "test": True, "answer": 42, "something": "interesting", } - async with AuthenticatedAlephClient( + async with AuthenticatedAlephHttpClient( account=account, + api_server="https://api2.aleph.im", + allow_unix_sockets=False, ) as client: - response = await client.create_post( + message: PostMessage + status: MessageStatus + message, status = await client.create_post( post_content=content, post_type="test", ref=None, channel="TEST", inline=True, storage_engine=StorageEnum.storage, + sync=True, + ) + if status != MessageStatus.PROCESSED: + return JSONResponse(status_code=500, content={"error": status}) + return { + "message": message, + } + + +@app.post("/post_a_file") +async def post_a_file(): + account = get_fallback_account() + file_path = Path(__file__).absolute() + async with AuthenticatedAlephHttpClient( + account=account, + ) as client: + message: StoreMessage + status: MessageStatus + message, status = await client.create_store( + file_path=file_path, + ref=None, + channel="TEST", + storage_engine=StorageEnum.storage, + sync=True, ) + if status != MessageStatus.PROCESSED: + return JSONResponse(status_code=500, content={"error": status}) return { - "response": response, + "message": message, } +@app.get("/sign_a_message") +async def sign_a_message(): + """Sign a message using a locally managed account within the virtual machine.""" + # FIXME: Broken, fixing this depends on https://github.com/aleph-im/aleph-sdk-python/pull/120 + account = get_fallback_account() + message = {"hello": "world", "chain": "ETH"} + signed_message = await account.sign_message(message) + return {"message": signed_message} + + @app.get("/cache/get/{key}") async def get_from_cache(key: str): """Get data in the VM cache""" @@ -265,7 +383,7 @@ async def keys_from_cache(pattern: str = "*"): @app.get("/state/increment") -async def increment(): +async def increment() -> dict[str, int]: path = "/var/lib/example/storage.json" try: with open(path) as fd: @@ -284,7 +402,7 @@ class Data(BaseModel): @app.post("/post") -async def receive_post(data: Data): +async 
def receive_post(data: Data) -> str: return str(data) @@ -293,13 +411,14 @@ class CustomError(Exception): @app.get("/raise") -def raise_error(): +def raise_error() -> None: """Raises an error to check that the init handles it properly without crashing""" - raise CustomError("Whoops") + error_message = "Whoops" + raise CustomError(error_message) @app.get("/crash") -def crash(): +def crash() -> None: """Crash the entire VM in order to check that the supervisor can handle it""" sys.exit(1) @@ -313,22 +432,22 @@ def crash(): @app.get("/platform/os") -def platform_os(): +def platform_os() -> PlainTextResponse: return PlainTextResponse(content=Path("/etc/os-release").read_text()) @app.get("/platform/python") -def platform_python(): +def platform_python() -> PlainTextResponse: return PlainTextResponse(content=sys.version) @app.get("/platform/pip-freeze") -def platform_pip_freeze(): +def platform_pip_freeze() -> list[str]: return list(freeze()) @app.event(filters=filters) -async def aleph_event(event): +async def aleph_event(event) -> dict[str, str]: print("aleph_event", event) async with aiohttp.ClientSession(connector=aiohttp.TCPConnector()) as session: async with session.get("https://official.aleph.cloud/api/v0/info/public.json") as resp: diff --git a/runtimes/aleph-debian-12-python/create_disk_image.sh b/runtimes/aleph-debian-12-python/create_disk_image.sh index 6a0c2265a..78c96b897 100755 --- a/runtimes/aleph-debian-12-python/create_disk_image.sh +++ b/runtimes/aleph-debian-12-python/create_disk_image.sh @@ -36,7 +36,7 @@ locale-gen en_US.UTF-8 echo "Pip installing aleph-sdk-python" mkdir -p /opt/aleph/libs -pip3 install --target /opt/aleph/libs 'aleph-sdk-python==0.9.0' 'fastapi~=0.109.2' +pip3 install --target /opt/aleph/libs 'aleph-sdk-python==0.9.0' 'aleph-message==0.4.4' 'fastapi~=0.109.2' # Compile Python code to bytecode for faster execution # -o2 is needed to compile with optimization level 2 which is what we launch init1.py (`python -OO`) diff --git a/src/aleph/vm/conf.py b/src/aleph/vm/conf.py index 29a5317f3..e84c58c31 100644 --- a/src/aleph/vm/conf.py +++ b/src/aleph/vm/conf.py @@ -289,7 +289,7 @@ class Settings(BaseSettings): ) FAKE_INSTANCE_MESSAGE = Path(abspath(join(__file__, "../../../../examples/instance_message_from_aleph.json"))) - CHECK_FASTAPI_VM_ID = "3fc0aa9569da840c43e7bd2033c3c580abb46b007527d6d20f2d4e98e867f7af" + CHECK_FASTAPI_VM_ID = "63faf8b5db1cf8d965e6a464a0cb8062af8e7df131729e48738342d956f29ace" LEGACY_CHECK_FASTAPI_VM_ID = "67705389842a0a1b95eaa408b009741027964edc805997475e95c505d642edd8" # Developer options diff --git a/src/aleph/vm/orchestrator/run.py b/src/aleph/vm/orchestrator/run.py index 6e429ff87..8dec7e963 100644 --- a/src/aleph/vm/orchestrator/run.py +++ b/src/aleph/vm/orchestrator/run.py @@ -44,6 +44,7 @@ async def build_asgi_scope(path: str, request: web.Request) -> dict[str, Any]: async def build_event_scope(event) -> dict[str, Any]: + """Build an ASGI scope for an event.""" return { "type": "aleph.message", "body": event, diff --git a/src/aleph/vm/orchestrator/status.py b/src/aleph/vm/orchestrator/status.py index 8c9c8064a..b0d76554d 100644 --- a/src/aleph/vm/orchestrator/status.py +++ b/src/aleph/vm/orchestrator/status.py @@ -15,19 +15,31 @@ logger = logging.getLogger(__name__) -def make_check_vm_url(vm_id: ItemHash) -> str: +def assemble_vm_url(vm_id: ItemHash) -> str: + """Assemble the URL for a VM based on the host and port that the orchestrator is running on and the VM ID.""" return 
f"http://{settings.SUPERVISOR_HOST}:{settings.SUPERVISOR_PORT}/vm/{vm_id}" async def get_json_from_vm(session: ClientSession, vm_id: ItemHash, suffix: str) -> Any: - vm_url = make_check_vm_url(vm_id) + """Get JSON from a VM running locally.""" + vm_url = assemble_vm_url(vm_id) url = f"{vm_url}{suffix}" async with session.get(url) as resp: resp.raise_for_status() return await resp.json() +async def post_to_vm(session: ClientSession, vm_id: ItemHash, suffix: str, data: Any = None) -> Any: + """Post data to a VM running locally.""" + vm_url = assemble_vm_url(vm_id) + url = f"{vm_url}{suffix}" + async with session.post(url, json=data) as resp: + resp.raise_for_status() + return await resp.json() + + async def check_index(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the index page of the VM is working.""" try: result: dict = await get_json_from_vm(session, vm_id, "/") assert result["Example"] == "example_fastapi" @@ -37,6 +49,7 @@ async def check_index(session: ClientSession, vm_id: ItemHash) -> bool: async def check_lifespan(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the lifespan endpoint of the VM is working.""" try: result: dict = await get_json_from_vm(session, vm_id, "/lifespan") return result["Lifespan"] is True @@ -45,6 +58,7 @@ async def check_lifespan(session: ClientSession, vm_id: ItemHash) -> bool: async def check_environ(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the environ endpoint of the VM returns the expected environment variables.""" try: result: dict = await get_json_from_vm(session, vm_id, "/environ") assert "ALEPH_API_HOST" in result @@ -58,6 +72,7 @@ async def check_environ(session: ClientSession, vm_id: ItemHash) -> bool: async def check_messages(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the messages endpoint of the VM returns a list of messages.""" try: result: dict = await get_json_from_vm(session, vm_id, "/messages") assert "Messages" in result @@ -69,6 +84,7 @@ async def check_messages(session: ClientSession, vm_id: ItemHash) -> bool: async def check_dns(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the DNS endpoint of the VM returns both IPv4 and IPv6 results.""" try: result: dict = await get_json_from_vm(session, vm_id, "/dns") assert result["ipv4"] @@ -79,6 +95,7 @@ async def check_dns(session: ClientSession, vm_id: ItemHash) -> bool: async def check_ipv4(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the VM has IPv4 connectivity.""" try: result: dict = await get_json_from_vm(session, vm_id, "/ip/4") assert result["result"] is True @@ -88,6 +105,7 @@ async def check_ipv4(session: ClientSession, vm_id: ItemHash) -> bool: async def check_ipv6(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the VM has IPv6 connectivity.""" try: result: dict = await get_json_from_vm(session, vm_id, "/ip/6") assert result["result"] is True @@ -98,6 +116,7 @@ async def check_ipv6(session: ClientSession, vm_id: ItemHash) -> bool: async def check_internet(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the VM has internet connectivity. 
This requires DNS, IP, HTTP and TLS to work.""" try: result: dict = await get_json_from_vm(session, vm_id, "/internet") assert result["result"] == HTTPOk.status_code @@ -108,6 +127,7 @@ async def check_internet(session: ClientSession, vm_id: ItemHash) -> bool: async def check_cache(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the VM can set and get a value in its cache.""" try: result1: bool = await get_json_from_vm(session, vm_id, "/cache/set/a/42") assert result1 is True @@ -121,6 +141,7 @@ async def check_cache(session: ClientSession, vm_id: ItemHash) -> bool: async def check_persistent_storage(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the VM can set and get a value in its persistent storage.""" try: result: dict = await get_json_from_vm(session, vm_id, "/state/increment") counter = result["counter"] @@ -134,7 +155,8 @@ async def check_persistent_storage(session: ClientSession, vm_id: ItemHash) -> b async def check_error_raised(session: ClientSession, vm_id: ItemHash) -> bool: - vm_url = make_check_vm_url(vm_id) + """Check that the VM can raise an error and return a traceback instead of crashing.""" + vm_url = assemble_vm_url(vm_id) try: async with session.get(f"{vm_url}/raise") as resp: text = await resp.text() @@ -144,8 +166,9 @@ async def check_error_raised(session: ClientSession, vm_id: ItemHash) -> bool: async def check_crash_and_restart(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that a crash in the VM would cause it to restart and work as expected.""" # Crash the VM init. - vm_url = make_check_vm_url(vm_id) + vm_url = assemble_vm_url(vm_id) async with session.get(f"{vm_url}/crash") as resp: if resp.status != HTTPBadGateway.status_code: return False @@ -158,3 +181,30 @@ async def check_crash_and_restart(session: ClientSession, vm_id: ItemHash) -> bo except ClientResponseError: return False + + +async def check_get_a_message(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the VM can get a message from the aleph.im network.""" + try: + result: dict = await get_json_from_vm(session, vm_id, "/get_a_message") + return "item_hash" in result + except ClientResponseError: + return False + + +async def check_post_a_message(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the VM can post a message to the aleph.im network using a remote key present on the host.""" + try: + result: dict = await post_to_vm(session, vm_id, "/post_a_message") + return "item_hash" in result + except ClientResponseError: + return False + + +async def check_sign_a_message(session: ClientSession, vm_id: ItemHash) -> bool: + """Check that the VM can sign a message using a key local to the VM.""" + try: + result: dict = await post_to_vm(session, vm_id, "/sign_a_message") + return "item_hash" in result + except ClientResponseError: + return False diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index 994476cba..177e6a348 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -199,6 +199,9 @@ async def status_check_fastapi(request: web.Request, vm_id: Optional[ItemHash] = "index": await status.check_index(session, fastapi_vm_id), "environ": await status.check_environ(session, fastapi_vm_id), "messages": await status.check_messages(session, fastapi_vm_id), + # Using the remote account currently causes issues + # "post_a_message": await status.check_post_a_message(session, fastapi_vm_id), + # "sign_a_message": await 
status.check_sign_a_message(session, fastapi_vm_id), "dns": await status.check_dns(session, fastapi_vm_id), "ipv4": await status.check_ipv4(session, fastapi_vm_id), "internet": await status.check_internet(session, fastapi_vm_id), @@ -209,6 +212,7 @@ async def status_check_fastapi(request: web.Request, vm_id: Optional[ItemHash] = if not retro_compatibility: # These fields were added in the runtime running Debian 12. result = result | { + "get_a_message": await status.check_get_a_message(session, fastapi_vm_id), "lifespan": await status.check_lifespan(session, fastapi_vm_id), # IPv6 requires extra work from node operators and is not required yet. # "ipv6": await status.check_ipv6(session), From 46819068752bfec69c0228b6eed3994925c8f55d Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 26 Apr 2024 12:46:41 +0200 Subject: [PATCH 17/39] Fix not awaited async call --- src/aleph/vm/orchestrator/views/operator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/views/operator.py b/src/aleph/vm/orchestrator/views/operator.py index 298486b73..5b8bb236d 100644 --- a/src/aleph/vm/orchestrator/views/operator.py +++ b/src/aleph/vm/orchestrator/views/operator.py @@ -174,7 +174,7 @@ async def operate_reboot(request: web.Request, authenticated_sender: str) -> web if execution.is_running: logger.info(f"Rebooting {execution.vm_hash}") if execution.persistent: - pool.systemd_manager.restart(execution.controller_service) + await pool.systemd_manager.restart(execution.controller_service) else: await pool.stop_vm(vm_hash) pool.forget_vm(vm_hash) From 314666bbe419291c082e176a14404bc7e71a2b7c Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 26 Apr 2024 14:08:11 +0200 Subject: [PATCH 18/39] Connect to the bus on demand to avoid having to call setup --- src/aleph/vm/systemd.py | 67 +++++++++++++++++++++++------------------ 1 file changed, 37 insertions(+), 30 deletions(-) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index d66345af0..3b3c15c57 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -143,66 +143,73 @@ class SystemDManager: Used to manage the systemd services on the host on Linux. 
""" - bus: Optional[MessageBus] - manager: Optional[SystemdProxy] + _bus: Optional[MessageBus] = None + _manager: Optional[SystemdProxy] = None def __init__(self): pass - async def connect(self): - self.bus = MessageBus(bus_type=BusType.SYSTEM) - await self.bus.connect() - path = "/org/freedesktop/systemd1" - bus_name = "org.freedesktop.systemd1" - introspect = await self.bus.introspect(bus_name, path) - systemd_proxy: ProxyObject = self.bus.get_proxy_object(bus_name, path, introspection=introspect) - interface = systemd_proxy.get_interface("org.freedesktop.systemd1.Manager") - # Check required method are implemented - assert isinstance(interface, SystemdProxy) - self.manager = interface + async def get_bus(self): + if self._bus is None: + self._bus = MessageBus(bus_type=BusType.SYSTEM) + await self._bus.connect() + return self._bus + + async def get_manager(self): + if self._manager is None: + bus = await self.get_bus() + path = "/org/freedesktop/systemd1" + bus_name = "org.freedesktop.systemd1" + introspect = await bus.introspect(bus_name, path) + systemd_proxy: ProxyObject = bus.get_proxy_object(bus_name, path, introspection=introspect) + interface = systemd_proxy.get_interface("org.freedesktop.systemd1.Manager") + # Check required method are implemented + assert isinstance(interface, SystemdProxy) + self._manager = interface + return self._manager async def enable(self, service: str) -> None: - assert self.manager, "connect() not called" - await self.manager.call_enable_unit_files([service], False, True) + manager = await self.get_manager() + await manager.call_enable_unit_files([service], False, True) logger.debug(f"Enabled {service} service") async def start(self, service: str) -> None: - assert self.manager, "connect() not called" - await self.manager.call_start_unit(service, Mode.REPLACE) + manager = await self.get_manager() + await manager.call_start_unit(service, Mode.REPLACE) logger.debug(f"Started {service} service") async def stop(self, service: str) -> None: - assert self.manager, "connect() not called" - await self.manager.call_stop_unit(service, Mode.REPLACE) + manager = await self.get_manager() + await manager.call_stop_unit(service, Mode.REPLACE) logger.debug(f"Stopped {service} service") async def restart(self, service: str) -> None: - assert self.manager, "connect() not called" - await self.manager.call_restart_unit(service, Mode.REPLACE) + manager = await self.get_manager() + await manager.call_restart_unit(service, Mode.REPLACE) logger.debug(f"Restarted {service} service") async def disable(self, service: str) -> None: - assert self.manager, "connect() not called" - await self.manager.call_disable_unit_files([service], False) + manager = await self.get_manager() + await manager.call_disable_unit_files([service], False) logger.debug(f"Disabled {service} service") async def is_service_enabled(self, service: str) -> bool: - assert self.manager, "connect() not called" + manager = await self.get_manager() try: - state = await self.manager.call_get_unit_file_state(service) + state = await manager.call_get_unit_file_state(service) return state == UnitFileState.ENABLED except DBusError as error: logger.error(error) return False async def is_service_active(self, service: str) -> bool: - assert self.manager, "connect() not called" - assert self.bus, "connect() not called" + manager = await self.get_manager() try: - path = await self.manager.call_get_unit(service) + path = await manager.call_get_unit(service) + bus = await self.get_bus() bus_name = "org.freedesktop.systemd1" - 
introspect = await self.bus.introspect(bus_name, path) - systemd_service = self.bus.get_proxy_object(bus_name, path, introspection=introspect) + introspect = await bus.introspect(bus_name, path) + systemd_service = bus.get_proxy_object(bus_name, path, introspection=introspect) unit = systemd_service.get_interface("org.freedesktop.systemd1.Unit") # Check required method are implemented assert isinstance(unit, UnitProxy) From 0c59d471aa4677e949058a72e2b579428c0b1c21 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 26 Apr 2024 15:35:42 +0200 Subject: [PATCH 19/39] fix is running requiring async --- src/aleph/vm/models.py | 16 ++++----- src/aleph/vm/orchestrator/run.py | 8 ++--- src/aleph/vm/orchestrator/tasks.py | 2 +- src/aleph/vm/orchestrator/views/__init__.py | 11 +++++-- src/aleph/vm/orchestrator/views/operator.py | 4 +-- src/aleph/vm/pool.py | 36 ++++++++++++--------- src/aleph/vm/utils.py | 8 +++-- 7 files changed, 48 insertions(+), 37 deletions(-) diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index 9d38eea97..9282d74d3 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -83,13 +83,11 @@ class VmExecution: persistent: bool = False - @property - def is_running(self) -> bool: - return ( - bool(self.times.starting_at and not self.times.stopping_at) - if not self.persistent - else self.systemd_manager.is_service_active(self.controller_service) - ) + async def check_is_running(self) -> bool: + if not self.persistent: + return bool(self.times.starting_at and not self.times.stopping_at) + else: + return await self.systemd_manager.is_service_active(self.controller_service) @property def is_stopping(self) -> bool: @@ -160,9 +158,9 @@ def __init__( self.systemd_manager = systemd_manager self.persistent = persistent - def to_dict(self) -> dict: + async def to_dict(self) -> dict: return { - "is_running": self.is_running, + "is_running": await self.check_is_running(), **self.__dict__, } diff --git a/src/aleph/vm/orchestrator/run.py b/src/aleph/vm/orchestrator/run.py index 8dec7e963..2508d6267 100644 --- a/src/aleph/vm/orchestrator/run.py +++ b/src/aleph/vm/orchestrator/run.py @@ -118,7 +118,7 @@ async def run_code_on_request(vm_hash: ItemHash, path: str, pool: VmPool, reques Execute the code corresponding to the 'code id' in the path. """ - execution: Optional[VmExecution] = pool.get_running_vm(vm_hash=vm_hash) + execution: Optional[VmExecution] = await pool.get_running_vm(vm_hash=vm_hash) # Prevent execution issues if the execution resources are empty # TODO: Improve expiration process to avoid that kind of issues. @@ -222,7 +222,7 @@ async def run_code_on_event(vm_hash: ItemHash, event, pubsub: PubSub, pool: VmPo Execute code in response to an event. 
""" - execution: Optional[VmExecution] = pool.get_running_vm(vm_hash=vm_hash) + execution: Optional[VmExecution] = await pool.get_running_vm(vm_hash=vm_hash) if not execution: execution = await create_vm_execution_or_raise_http_error(vm_hash=vm_hash, pool=pool) @@ -268,7 +268,7 @@ async def run_code_on_event(vm_hash: ItemHash, event, pubsub: PubSub, pool: VmPo async def start_persistent_vm(vm_hash: ItemHash, pubsub: Optional[PubSub], pool: VmPool) -> VmExecution: - execution: Optional[VmExecution] = pool.get_running_vm(vm_hash=vm_hash) + execution: Optional[VmExecution] = await pool.get_running_vm(vm_hash=vm_hash) if not execution: logger.info(f"Starting persistent virtual machine with id: {vm_hash}") @@ -288,7 +288,7 @@ async def start_persistent_vm(vm_hash: ItemHash, pubsub: Optional[PubSub], pool: async def stop_persistent_vm(vm_hash: ItemHash, pool: VmPool) -> Optional[VmExecution]: logger.info(f"Stopping persistent VM {vm_hash}") - execution = pool.get_running_vm(vm_hash) + execution = await pool.get_running_vm(vm_hash) if execution: await execution.stop() diff --git a/src/aleph/vm/orchestrator/tasks.py b/src/aleph/vm/orchestrator/tasks.py index 4e85f0d91..7bcfd1a04 100644 --- a/src/aleph/vm/orchestrator/tasks.py +++ b/src/aleph/vm/orchestrator/tasks.py @@ -159,7 +159,7 @@ async def monitor_payments(app: web.Application): # required_balance = await compute_required_balance(executions) # Check if the balance held in the wallet is sufficient stream tier resources - for sender, chains in pool.get_executions_by_sender(payment_type=PaymentType.superfluid).items(): + for sender, chains in await pool.get_executions_by_sender(payment_type=PaymentType.superfluid).items(): for chain, executions in chains.items(): stream = await get_stream(sender=sender, receiver=settings.PAYMENT_RECEIVER_ADDRESS, chain=chain) logger.debug( diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index 177e6a348..926c797dc 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -146,7 +146,7 @@ async def list_executions(request: web.Request) -> web.Response: }, } for item_hash, execution in pool.executions.items() - if execution.is_running + if await execution.check_is_running() }, dumps=dumps_for_json, ) @@ -356,8 +356,13 @@ async def update_allocations(request: web.Request): # First free resources from persistent programs and instances that are not scheduled anymore. 
allocations = allocation.persistent_vms | allocation.instances # Make a copy since the pool is modified - for execution in list(pool.get_persistent_executions()): - if execution.vm_hash not in allocations and execution.is_running and not execution.uses_payment_stream: + persistent_executions = list(await pool.get_persistent_executions()) + for execution in persistent_executions: + if ( + execution.vm_hash not in allocations + and not execution.uses_payment_stream + and (await execution.check_is_running()) + ): vm_type = "instance" if execution.is_instance else "persistent program" logger.info("Stopping %s %s", vm_type, execution.vm_hash) await pool.stop_vm(execution.vm_hash) diff --git a/src/aleph/vm/orchestrator/views/operator.py b/src/aleph/vm/orchestrator/views/operator.py index 5b8bb236d..bc2e60f18 100644 --- a/src/aleph/vm/orchestrator/views/operator.py +++ b/src/aleph/vm/orchestrator/views/operator.py @@ -150,7 +150,7 @@ async def operate_stop(request: web.Request, authenticated_sender: str) -> web.R if not is_sender_authorized(authenticated_sender, execution.message): return web.Response(status=403, body="Unauthorized sender") - if execution.is_running: + if await execution.check_is_running(): logger.info(f"Stopping {execution.vm_hash}") await pool.stop_vm(execution.vm_hash) return web.Response(status=200, body=f"Stopped VM with ref {vm_hash}") @@ -171,7 +171,7 @@ async def operate_reboot(request: web.Request, authenticated_sender: str) -> web if not is_sender_authorized(authenticated_sender, execution.message): return web.Response(status=403, body="Unauthorized sender") - if execution.is_running: + if await execution.check_is_running(): logger.info(f"Rebooting {execution.vm_hash}") if execution.persistent: await pool.systemd_manager.restart(execution.controller_service) diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 938a83726..ea2026b60 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -94,7 +94,7 @@ async def create_a_vm( async with self.creation_lock: # Check if an execution is already present for this VM, then return it. # Do not `await` in this section. - current_execution = self.get_running_vm(vm_hash) + current_execution = await self.get_running_vm(vm_hash) if current_execution: return current_execution else: @@ -110,7 +110,7 @@ async def create_a_vm( try: await execution.prepare() - vm_id = self.get_unique_vm_id() + vm_id = await self.get_unique_vm_id() if self.network: vm_type = VmType.from_message_content(message) @@ -140,7 +140,7 @@ async def create_a_vm( return execution - def get_unique_vm_id(self) -> int: + async def get_unique_vm_id(self) -> int: """Get a unique identifier for the VM. This identifier is used to name the network interface and in the IPv4 range @@ -159,7 +159,9 @@ def get_unique_vm_id(self) -> int: # # We therefore recycle vm_id values from executions that are not running # anymore. - currently_used_vm_ids = {execution.vm_id for execution in self.executions.values() if execution.is_running} + currently_used_vm_ids = { + execution.vm_id for execution in self.executions.values() if (await execution.check_is_running()) + } for i in range(settings.START_ID_INDEX, 255**2): if i not in currently_used_vm_ids: return i @@ -167,10 +169,10 @@ def get_unique_vm_id(self) -> int: msg = "No available value for vm_id." raise ValueError(msg) - def get_running_vm(self, vm_hash: ItemHash) -> Optional[VmExecution]: + async def get_running_vm(self, vm_hash: ItemHash) -> Optional[VmExecution]: """Return a running VM or None. 
Disables the VM expiration task.""" execution = self.executions.get(vm_hash) - if execution and execution.is_running and not execution.is_stopping: + if execution and (await execution.check_is_running()) and not execution.is_stopping: execution.cancel_expiration() return execution else: @@ -239,7 +241,7 @@ async def load_persistent_executions(self): persistent=saved_execution.persistent, ) - if execution.is_running: + if await execution.check_is_running(): # TODO: Improve the way that we re-create running execution await execution.prepare() if self.network: @@ -269,31 +271,33 @@ async def load_persistent_executions(self): async def stop(self): """Stop ephemeral VMs in the pool.""" # Stop executions in parallel: - await asyncio.gather(*(execution.stop() for execution in self.get_ephemeral_executions())) + await asyncio.gather(*(execution.stop() for execution in await self.get_ephemeral_executions())) - def get_ephemeral_executions(self) -> Iterable[VmExecution]: + async def get_ephemeral_executions(self) -> Iterable[VmExecution]: executions = ( - execution for _, execution in self.executions.items() if execution.is_running and not execution.persistent + execution + for _, execution in self.executions.items() + if (await execution.check_is_running()) and not execution.persistent ) return executions or [] - def get_persistent_executions(self) -> Iterable[VmExecution]: + async def get_persistent_executions(self) -> Iterable[VmExecution]: executions = ( execution for _vm_hash, execution in self.executions.items() - if execution.is_running and execution.persistent + if (await execution.check_is_running()) and execution.persistent ) return executions or [] - def get_instance_executions(self) -> Iterable[VmExecution]: + async def get_instance_executions(self) -> Iterable[VmExecution]: executions = ( execution for _vm_hash, execution in self.executions.items() - if execution.is_running and execution.is_instance + if (await execution.check_is_running()) and execution.is_instance ) return executions or [] - def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: + async def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: """Return all executions of the given type, grouped by sender and by chain.""" executions_by_sender: dict[str, dict[str, list[VmExecution]]] = {} for vm_hash, execution in self.executions.items(): @@ -301,7 +305,7 @@ def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[ # Ignore Diagnostic VM execution continue - if not execution.is_running: + if not await execution.check_is_running(): # Ignore the execution that is stopping or not running anymore continue if execution.vm_hash == settings.CHECK_FASTAPI_VM_ID: diff --git a/src/aleph/vm/utils.py b/src/aleph/vm/utils.py index 63ce18253..7d464405f 100644 --- a/src/aleph/vm/utils.py +++ b/src/aleph/vm/utils.py @@ -1,6 +1,7 @@ import asyncio import dataclasses import hashlib +import inspect import json import logging import subprocess @@ -69,9 +70,12 @@ async def get_ref_from_dns(domain): return record[0].text -def to_json(o: Any): +async def to_json(o: Any): if hasattr(o, "to_dict"): # default method - return o.to_dict() + if inspect.isawaitable(o.to_dict): + return await o.to_dict() + else: + return o.to_dict() elif hasattr(o, "dict"): # Pydantic return o.dict() elif is_dataclass(o): From 2e2a445cab028916827686bb61c74140fc01113f Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 29 Apr 2024 
14:21:28 +0200 Subject: [PATCH 20/39] working --- src/aleph/vm/orchestrator/supervisor.py | 3 +- src/aleph/vm/orchestrator/tasks.py | 3 +- src/aleph/vm/orchestrator/views/__init__.py | 10 +++--- src/aleph/vm/pool.py | 40 ++++++++++++--------- src/aleph/vm/systemd.py | 7 ++-- 5 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 4846104ae..9f42482a7 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -161,6 +161,7 @@ def run(): try: if settings.WATCH_FOR_MESSAGES: + # FIXME We have a bug because task run on app.on_ don't run on the same loop? app.on_startup.append(start_watch_for_messages_task) app.on_startup.append(start_payment_monitoring_task) app.on_cleanup.append(stop_watch_for_messages_task) @@ -171,7 +172,7 @@ def run(): asyncio.run(pool.load_persistent_executions()) logger.info(f"Starting the web server on http://{settings.SUPERVISOR_HOST}:{settings.SUPERVISOR_PORT}") - web.run_app(app, host=settings.SUPERVISOR_HOST, port=settings.SUPERVISOR_PORT) + web.run_app(app, host=settings.SUPERVISOR_HOST, port=settings.SUPERVISOR_PORT, loop=loop) except OSError as e: if e.errno == 98: logger.error( diff --git a/src/aleph/vm/orchestrator/tasks.py b/src/aleph/vm/orchestrator/tasks.py index 7bcfd1a04..62db59515 100644 --- a/src/aleph/vm/orchestrator/tasks.py +++ b/src/aleph/vm/orchestrator/tasks.py @@ -159,7 +159,8 @@ async def monitor_payments(app: web.Application): # required_balance = await compute_required_balance(executions) # Check if the balance held in the wallet is sufficient stream tier resources - for sender, chains in await pool.get_executions_by_sender(payment_type=PaymentType.superfluid).items(): + user_executions = await pool.get_executions_by_sender(payment_type=PaymentType.superfluid) + for sender, chains in user_executions.items(): for chain, executions in chains.items(): stream = await get_stream(sender=sender, receiver=settings.PAYMENT_RECEIVER_ADDRESS, chain=chain) logger.debug( diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index 926c797dc..453e799f1 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -341,8 +341,8 @@ def authenticate_api_request(request: web.Request) -> bool: async def update_allocations(request: web.Request): - if not authenticate_api_request(request): - return web.HTTPUnauthorized(text="Authentication token received is invalid") + # if not authenticate_api_request(request): + # return web.HTTPUnauthorized(text="Authentication token received is invalid") try: data = await request.json() @@ -356,11 +356,13 @@ async def update_allocations(request: web.Request): # First free resources from persistent programs and instances that are not scheduled anymore. 
allocations = allocation.persistent_vms | allocation.instances # Make a copy since the pool is modified - persistent_executions = list(await pool.get_persistent_executions()) - for execution in persistent_executions: + executions = list(pool.executions.values()) + + for execution in executions: if ( execution.vm_hash not in allocations and not execution.uses_payment_stream + and execution.persistent and (await execution.check_is_running()) ): vm_type = "instance" if execution.is_instance else "persistent program" diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index ea2026b60..e869d6527 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -50,7 +50,6 @@ def __init__(self, loop: asyncio.AbstractEventLoop): self.executions = {} self.message_cache = {} - asyncio.set_event_loop(loop) self.creation_lock = asyncio.Lock() self.network = ( @@ -242,6 +241,7 @@ async def load_persistent_executions(self): ) if await execution.check_is_running(): + logger.info(f"Reloading running persistent execution {execution}") # TODO: Improve the way that we re-create running execution await execution.prepare() if self.network: @@ -274,27 +274,33 @@ async def stop(self): await asyncio.gather(*(execution.stop() for execution in await self.get_ephemeral_executions())) async def get_ephemeral_executions(self) -> Iterable[VmExecution]: - executions = ( - execution - for _, execution in self.executions.items() - if (await execution.check_is_running()) and not execution.persistent - ) + executions = [ + ( + execution + for _, execution in self.executions.items() + if (await execution.check_is_running()) and not execution.persistent + ) + ] return executions or [] async def get_persistent_executions(self) -> Iterable[VmExecution]: - executions = ( - execution - for _vm_hash, execution in self.executions.items() - if (await execution.check_is_running()) and execution.persistent - ) + executions = [ + ( + execution + for _vm_hash, execution in self.executions.items() + if (await execution.check_is_running()) and execution.persistent + ) + ] return executions or [] - async def get_instance_executions(self) -> Iterable[VmExecution]: - executions = ( - execution - for _vm_hash, execution in self.executions.items() - if (await execution.check_is_running()) and execution.is_instance - ) + async def get_instance_executions(self): + executions = [ + ( + execution + for _vm_hash, execution in self.executions.items() + if (await execution.check_is_running()) and execution.is_instance + ) + ] return executions or [] async def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index 3b3c15c57..8fd9c5901 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -150,13 +150,13 @@ def __init__(self): pass async def get_bus(self): - if self._bus is None: + if True or self._bus is None: self._bus = MessageBus(bus_type=BusType.SYSTEM) await self._bus.connect() return self._bus async def get_manager(self): - if self._manager is None: + if True or self._manager is None: bus = await self.get_bus() path = "/org/freedesktop/systemd1" bus_name = "org.freedesktop.systemd1" @@ -170,11 +170,12 @@ async def get_manager(self): async def enable(self, service: str) -> None: manager = await self.get_manager() - await manager.call_enable_unit_files([service], False, True) logger.debug(f"Enabled {service} service") + await manager.call_enable_unit_files([service], False, True) async def start(self, service: 
str) -> None: manager = await self.get_manager() + logger.debug(f"Starting {service} service") await manager.call_start_unit(service, Mode.REPLACE) logger.debug(f"Started {service} service") From 067d6eec8cf8098867e22639b192fb248d24fe9c Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 29 Apr 2024 15:11:56 +0200 Subject: [PATCH 21/39] restore --- src/aleph/vm/systemd.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index 8fd9c5901..ee782436f 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -150,13 +150,13 @@ def __init__(self): pass async def get_bus(self): - if True or self._bus is None: + if self._bus is None: self._bus = MessageBus(bus_type=BusType.SYSTEM) await self._bus.connect() return self._bus async def get_manager(self): - if True or self._manager is None: + if self._manager is None: bus = await self.get_bus() path = "/org/freedesktop/systemd1" bus_name = "org.freedesktop.systemd1" From eacd7da08eb77668a4c7aa5c961acc74b782a9e4 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 29 Apr 2024 15:14:44 +0200 Subject: [PATCH 22/39] revert change to running --- src/aleph/vm/models.py | 16 ++--- src/aleph/vm/orchestrator/run.py | 8 +-- src/aleph/vm/orchestrator/supervisor.py | 2 +- src/aleph/vm/orchestrator/tasks.py | 3 +- src/aleph/vm/orchestrator/views/__init__.py | 17 ++---- src/aleph/vm/orchestrator/views/operator.py | 4 +- src/aleph/vm/pool.py | 66 +++++++++------------ src/aleph/vm/systemd.py | 3 +- src/aleph/vm/utils.py | 8 +-- 9 files changed, 53 insertions(+), 74 deletions(-) diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index 9282d74d3..9d38eea97 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -83,11 +83,13 @@ class VmExecution: persistent: bool = False - async def check_is_running(self) -> bool: - if not self.persistent: - return bool(self.times.starting_at and not self.times.stopping_at) - else: - return await self.systemd_manager.is_service_active(self.controller_service) + @property + def is_running(self) -> bool: + return ( + bool(self.times.starting_at and not self.times.stopping_at) + if not self.persistent + else self.systemd_manager.is_service_active(self.controller_service) + ) @property def is_stopping(self) -> bool: @@ -158,9 +160,9 @@ def __init__( self.systemd_manager = systemd_manager self.persistent = persistent - async def to_dict(self) -> dict: + def to_dict(self) -> dict: return { - "is_running": await self.check_is_running(), + "is_running": self.is_running, **self.__dict__, } diff --git a/src/aleph/vm/orchestrator/run.py b/src/aleph/vm/orchestrator/run.py index 2508d6267..8dec7e963 100644 --- a/src/aleph/vm/orchestrator/run.py +++ b/src/aleph/vm/orchestrator/run.py @@ -118,7 +118,7 @@ async def run_code_on_request(vm_hash: ItemHash, path: str, pool: VmPool, reques Execute the code corresponding to the 'code id' in the path. """ - execution: Optional[VmExecution] = await pool.get_running_vm(vm_hash=vm_hash) + execution: Optional[VmExecution] = pool.get_running_vm(vm_hash=vm_hash) # Prevent execution issues if the execution resources are empty # TODO: Improve expiration process to avoid that kind of issues. @@ -222,7 +222,7 @@ async def run_code_on_event(vm_hash: ItemHash, event, pubsub: PubSub, pool: VmPo Execute code in response to an event. 
""" - execution: Optional[VmExecution] = await pool.get_running_vm(vm_hash=vm_hash) + execution: Optional[VmExecution] = pool.get_running_vm(vm_hash=vm_hash) if not execution: execution = await create_vm_execution_or_raise_http_error(vm_hash=vm_hash, pool=pool) @@ -268,7 +268,7 @@ async def run_code_on_event(vm_hash: ItemHash, event, pubsub: PubSub, pool: VmPo async def start_persistent_vm(vm_hash: ItemHash, pubsub: Optional[PubSub], pool: VmPool) -> VmExecution: - execution: Optional[VmExecution] = await pool.get_running_vm(vm_hash=vm_hash) + execution: Optional[VmExecution] = pool.get_running_vm(vm_hash=vm_hash) if not execution: logger.info(f"Starting persistent virtual machine with id: {vm_hash}") @@ -288,7 +288,7 @@ async def start_persistent_vm(vm_hash: ItemHash, pubsub: Optional[PubSub], pool: async def stop_persistent_vm(vm_hash: ItemHash, pool: VmPool) -> Optional[VmExecution]: logger.info(f"Stopping persistent VM {vm_hash}") - execution = await pool.get_running_vm(vm_hash) + execution = pool.get_running_vm(vm_hash) if execution: await execution.stop() diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 9f42482a7..b9b2a0078 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -172,7 +172,7 @@ def run(): asyncio.run(pool.load_persistent_executions()) logger.info(f"Starting the web server on http://{settings.SUPERVISOR_HOST}:{settings.SUPERVISOR_PORT}") - web.run_app(app, host=settings.SUPERVISOR_HOST, port=settings.SUPERVISOR_PORT, loop=loop) + web.run_app(app, host=settings.SUPERVISOR_HOST, port=settings.SUPERVISOR_PORT) except OSError as e: if e.errno == 98: logger.error( diff --git a/src/aleph/vm/orchestrator/tasks.py b/src/aleph/vm/orchestrator/tasks.py index 62db59515..4e85f0d91 100644 --- a/src/aleph/vm/orchestrator/tasks.py +++ b/src/aleph/vm/orchestrator/tasks.py @@ -159,8 +159,7 @@ async def monitor_payments(app: web.Application): # required_balance = await compute_required_balance(executions) # Check if the balance held in the wallet is sufficient stream tier resources - user_executions = await pool.get_executions_by_sender(payment_type=PaymentType.superfluid) - for sender, chains in user_executions.items(): + for sender, chains in pool.get_executions_by_sender(payment_type=PaymentType.superfluid).items(): for chain, executions in chains.items(): stream = await get_stream(sender=sender, receiver=settings.PAYMENT_RECEIVER_ADDRESS, chain=chain) logger.debug( diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py index 453e799f1..177e6a348 100644 --- a/src/aleph/vm/orchestrator/views/__init__.py +++ b/src/aleph/vm/orchestrator/views/__init__.py @@ -146,7 +146,7 @@ async def list_executions(request: web.Request) -> web.Response: }, } for item_hash, execution in pool.executions.items() - if await execution.check_is_running() + if execution.is_running }, dumps=dumps_for_json, ) @@ -341,8 +341,8 @@ def authenticate_api_request(request: web.Request) -> bool: async def update_allocations(request: web.Request): - # if not authenticate_api_request(request): - # return web.HTTPUnauthorized(text="Authentication token received is invalid") + if not authenticate_api_request(request): + return web.HTTPUnauthorized(text="Authentication token received is invalid") try: data = await request.json() @@ -356,15 +356,8 @@ async def update_allocations(request: web.Request): # First free resources from persistent programs and instances that are not 
scheduled anymore. allocations = allocation.persistent_vms | allocation.instances # Make a copy since the pool is modified - executions = list(pool.executions.values()) - - for execution in executions: - if ( - execution.vm_hash not in allocations - and not execution.uses_payment_stream - and execution.persistent - and (await execution.check_is_running()) - ): + for execution in list(pool.get_persistent_executions()): + if execution.vm_hash not in allocations and execution.is_running and not execution.uses_payment_stream: vm_type = "instance" if execution.is_instance else "persistent program" logger.info("Stopping %s %s", vm_type, execution.vm_hash) await pool.stop_vm(execution.vm_hash) diff --git a/src/aleph/vm/orchestrator/views/operator.py b/src/aleph/vm/orchestrator/views/operator.py index bc2e60f18..5b8bb236d 100644 --- a/src/aleph/vm/orchestrator/views/operator.py +++ b/src/aleph/vm/orchestrator/views/operator.py @@ -150,7 +150,7 @@ async def operate_stop(request: web.Request, authenticated_sender: str) -> web.R if not is_sender_authorized(authenticated_sender, execution.message): return web.Response(status=403, body="Unauthorized sender") - if await execution.check_is_running(): + if execution.is_running: logger.info(f"Stopping {execution.vm_hash}") await pool.stop_vm(execution.vm_hash) return web.Response(status=200, body=f"Stopped VM with ref {vm_hash}") @@ -171,7 +171,7 @@ async def operate_reboot(request: web.Request, authenticated_sender: str) -> web if not is_sender_authorized(authenticated_sender, execution.message): return web.Response(status=403, body="Unauthorized sender") - if await execution.check_is_running(): + if execution.is_running: logger.info(f"Rebooting {execution.vm_hash}") if execution.persistent: await pool.systemd_manager.restart(execution.controller_service) diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index e869d6527..938a83726 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -50,6 +50,7 @@ def __init__(self, loop: asyncio.AbstractEventLoop): self.executions = {} self.message_cache = {} + asyncio.set_event_loop(loop) self.creation_lock = asyncio.Lock() self.network = ( @@ -93,7 +94,7 @@ async def create_a_vm( async with self.creation_lock: # Check if an execution is already present for this VM, then return it. # Do not `await` in this section. - current_execution = await self.get_running_vm(vm_hash) + current_execution = self.get_running_vm(vm_hash) if current_execution: return current_execution else: @@ -109,7 +110,7 @@ async def create_a_vm( try: await execution.prepare() - vm_id = await self.get_unique_vm_id() + vm_id = self.get_unique_vm_id() if self.network: vm_type = VmType.from_message_content(message) @@ -139,7 +140,7 @@ async def create_a_vm( return execution - async def get_unique_vm_id(self) -> int: + def get_unique_vm_id(self) -> int: """Get a unique identifier for the VM. This identifier is used to name the network interface and in the IPv4 range @@ -158,9 +159,7 @@ async def get_unique_vm_id(self) -> int: # # We therefore recycle vm_id values from executions that are not running # anymore. - currently_used_vm_ids = { - execution.vm_id for execution in self.executions.values() if (await execution.check_is_running()) - } + currently_used_vm_ids = {execution.vm_id for execution in self.executions.values() if execution.is_running} for i in range(settings.START_ID_INDEX, 255**2): if i not in currently_used_vm_ids: return i @@ -168,10 +167,10 @@ async def get_unique_vm_id(self) -> int: msg = "No available value for vm_id." 
raise ValueError(msg) - async def get_running_vm(self, vm_hash: ItemHash) -> Optional[VmExecution]: + def get_running_vm(self, vm_hash: ItemHash) -> Optional[VmExecution]: """Return a running VM or None. Disables the VM expiration task.""" execution = self.executions.get(vm_hash) - if execution and (await execution.check_is_running()) and not execution.is_stopping: + if execution and execution.is_running and not execution.is_stopping: execution.cancel_expiration() return execution else: @@ -240,8 +239,7 @@ async def load_persistent_executions(self): persistent=saved_execution.persistent, ) - if await execution.check_is_running(): - logger.info(f"Reloading running persistent execution {execution}") + if execution.is_running: # TODO: Improve the way that we re-create running execution await execution.prepare() if self.network: @@ -271,39 +269,31 @@ async def load_persistent_executions(self): async def stop(self): """Stop ephemeral VMs in the pool.""" # Stop executions in parallel: - await asyncio.gather(*(execution.stop() for execution in await self.get_ephemeral_executions())) - - async def get_ephemeral_executions(self) -> Iterable[VmExecution]: - executions = [ - ( - execution - for _, execution in self.executions.items() - if (await execution.check_is_running()) and not execution.persistent - ) - ] + await asyncio.gather(*(execution.stop() for execution in self.get_ephemeral_executions())) + + def get_ephemeral_executions(self) -> Iterable[VmExecution]: + executions = ( + execution for _, execution in self.executions.items() if execution.is_running and not execution.persistent + ) return executions or [] - async def get_persistent_executions(self) -> Iterable[VmExecution]: - executions = [ - ( - execution - for _vm_hash, execution in self.executions.items() - if (await execution.check_is_running()) and execution.persistent - ) - ] + def get_persistent_executions(self) -> Iterable[VmExecution]: + executions = ( + execution + for _vm_hash, execution in self.executions.items() + if execution.is_running and execution.persistent + ) return executions or [] - async def get_instance_executions(self): - executions = [ - ( - execution - for _vm_hash, execution in self.executions.items() - if (await execution.check_is_running()) and execution.is_instance - ) - ] + def get_instance_executions(self) -> Iterable[VmExecution]: + executions = ( + execution + for _vm_hash, execution in self.executions.items() + if execution.is_running and execution.is_instance + ) return executions or [] - async def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: + def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, dict[str, list[VmExecution]]]: """Return all executions of the given type, grouped by sender and by chain.""" executions_by_sender: dict[str, dict[str, list[VmExecution]]] = {} for vm_hash, execution in self.executions.items(): @@ -311,7 +301,7 @@ async def get_executions_by_sender(self, payment_type: PaymentType) -> dict[str, # Ignore Diagnostic VM execution continue - if not await execution.check_is_running(): + if not execution.is_running: # Ignore the execution that is stopping or not running anymore continue if execution.vm_hash == settings.CHECK_FASTAPI_VM_ID: diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index ee782436f..3b3c15c57 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -170,12 +170,11 @@ async def get_manager(self): async def enable(self, service: str) -> None: manager = await 
self.get_manager() - logger.debug(f"Enabled {service} service") await manager.call_enable_unit_files([service], False, True) + logger.debug(f"Enabled {service} service") async def start(self, service: str) -> None: manager = await self.get_manager() - logger.debug(f"Starting {service} service") await manager.call_start_unit(service, Mode.REPLACE) logger.debug(f"Started {service} service") diff --git a/src/aleph/vm/utils.py b/src/aleph/vm/utils.py index 7d464405f..63ce18253 100644 --- a/src/aleph/vm/utils.py +++ b/src/aleph/vm/utils.py @@ -1,7 +1,6 @@ import asyncio import dataclasses import hashlib -import inspect import json import logging import subprocess @@ -70,12 +69,9 @@ async def get_ref_from_dns(domain): return record[0].text -async def to_json(o: Any): +def to_json(o: Any): if hasattr(o, "to_dict"): # default method - if inspect.isawaitable(o.to_dict): - return await o.to_dict() - else: - return o.to_dict() + return o.to_dict() elif hasattr(o, "dict"): # Pydantic return o.dict() elif is_dataclass(o): From bc496dc6e17555cd7f658d0fd812013749200fd6 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 29 Apr 2024 16:45:01 +0200 Subject: [PATCH 23/39] CI check system usage endpoint --- .github/workflows/test-on-droplets-matrix.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/test-on-droplets-matrix.yml b/.github/workflows/test-on-droplets-matrix.yml index c67c1688f..f99f30a0b 100644 --- a/.github/workflows/test-on-droplets-matrix.yml +++ b/.github/workflows/test-on-droplets-matrix.yml @@ -238,6 +238,14 @@ jobs: -d '{"persistent_vms": [], "instances": ["${{ matrix.check_vm.item_hash }}"]}' \ "http://${DROPLET_IPV4}:4020/control/allocations" + - name: Get system usage + run: | + export DROPLET_IPV4="$(doctl compute droplet get aleph-vm-ci-${{ matrix.os_config.alias }}-${{ matrix.check_vm.alias }} --output json | ./.github/scripts/extract_droplet_ipv4.py)" + curl -X GET -H "Content-Type: application/json" \ + -H "X-Auth-Signature: test" \ + "http://${DROPLET_IPV4}:4020/about/usage/system" + + - name: Export aleph logs if: always() run: | From 12743f6a1f14e6ed4158f2f016990594df7eb807 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 29 Apr 2024 17:42:46 +0200 Subject: [PATCH 24/39] add unit test for system usage --- src/aleph/vm/models.py | 6 +----- src/aleph/vm/orchestrator/resources.py | 4 ++-- src/aleph/vm/pool.py | 5 +++-- tests/supervisor/test_views.py | 13 +++++++++++++ 4 files changed, 19 insertions(+), 9 deletions(-) diff --git a/src/aleph/vm/models.py b/src/aleph/vm/models.py index 9d38eea97..82390cb08 100644 --- a/src/aleph/vm/models.py +++ b/src/aleph/vm/models.py @@ -85,11 +85,7 @@ class VmExecution: @property def is_running(self) -> bool: - return ( - bool(self.times.starting_at and not self.times.stopping_at) - if not self.persistent - else self.systemd_manager.is_service_active(self.controller_service) - ) + return bool(self.times.starting_at and not self.times.stopping_at) @property def is_stopping(self) -> bool: diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index a40c6ff13..b58faaacc 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -87,8 +87,8 @@ def get_machine_properties() -> MachineProperties: cpu_info = cpuinfo.get_cpu_info() # Slow return MachineProperties( cpu=CpuProperties( - architecture=cpu_info["raw_arch_string"], - vendor=cpu_info["vendor_id"], + architecture=cpu_info.get("raw_arch_string", 
cpu_info.get("arch_string_raw")), + vendor=cpu_info["vendor_id_raw"], ), ) diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 938a83726..64572bb47 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -239,8 +239,9 @@ async def load_persistent_executions(self): persistent=saved_execution.persistent, ) - if execution.is_running: - # TODO: Improve the way that we re-create running execution + if await self.systemd_manager.is_service_active( + execution.controller_service + ): # TODO: Improve the way that we re-create running execution await execution.prepare() if self.network: vm_type = VmType.from_message_content(execution.message) diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index 49a6fa91e..73bcfec45 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -24,3 +24,16 @@ async def test_allocation_fails_on_invalid_item_hash(aiohttp_client): "type": "value_error.unknownhash", }, ] + + +@pytest.mark.asyncio +async def test_system_usage(aiohttp_client): + """Test that the allocation endpoint fails when an invalid item_hash is provided.""" + client = await aiohttp_client(app) + settings.ALLOCATION_TOKEN_HASH = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08" # = "test" + response: web.Response = await client.get("/about/usage/system") + assert response.status == 200 + # check if it is valid json + resp = await response.json() + assert "cpu" in resp + assert resp["cpu"]["count"] > 0 From e58b9ad8b2b2e10af7cb660527d703d6043d72af Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Mon, 29 Apr 2024 17:42:46 +0200 Subject: [PATCH 25/39] add unit test for system usage --- tests/supervisor/test_views.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index 49a6fa91e..73bcfec45 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -24,3 +24,16 @@ async def test_allocation_fails_on_invalid_item_hash(aiohttp_client): "type": "value_error.unknownhash", }, ] + + +@pytest.mark.asyncio +async def test_system_usage(aiohttp_client): + """Test that the allocation endpoint fails when an invalid item_hash is provided.""" + client = await aiohttp_client(app) + settings.ALLOCATION_TOKEN_HASH = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08" # = "test" + response: web.Response = await client.get("/about/usage/system") + assert response.status == 200 + # check if it is valid json + resp = await response.json() + assert "cpu" in resp + assert resp["cpu"]["count"] > 0 From 6174c965d04cd947f2ee03769d9ec90f1a702605 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 09:53:13 +0200 Subject: [PATCH 26/39] Set up a fresh web_app for each test as required by aiohttp --- src/aleph/vm/orchestrator/resources.py | 5 +- src/aleph/vm/orchestrator/supervisor.py | 112 ++++++++++++------------ tests/supervisor/test_views.py | 4 +- 3 files changed, 63 insertions(+), 58 deletions(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index a40c6ff13..29e819079 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -87,8 +87,8 @@ def get_machine_properties() -> MachineProperties: cpu_info = cpuinfo.get_cpu_info() # Slow return MachineProperties( cpu=CpuProperties( - architecture=cpu_info["raw_arch_string"], - vendor=cpu_info["vendor_id"], + architecture=cpu_info.get("raw_arch_string"), + 
vendor=cpu_info.get("vendor_id"), ), ) @@ -118,6 +118,7 @@ async def about_system_usage(_: web.Request): ), properties=get_machine_properties(), ) + return web.json_response(text=usage.json(exclude_none=True)) diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 4846104ae..892106ba0 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -74,62 +74,63 @@ async def http_not_found(request: web.Request): return web.HTTPNotFound() -app = web.Application(middlewares=[server_version_middleware]) -cors = setup( - app, - defaults={ - "*": ResourceOptions( - allow_credentials=True, - expose_headers="*", - allow_headers="*", - ) - }, -) +def setup_webapp(): + app = web.Application(middlewares=[server_version_middleware]) + cors = setup( + app, + defaults={ + "*": ResourceOptions( + allow_credentials=True, + expose_headers="*", + allow_headers="*", + ) + }, + ) -# Routes that need CORS enabled -cors_routes = [ - # /about APIs return information about the VM Orchestrator - web.get("/about/login", about_login), - web.get("/about/executions/list", list_executions), - web.get("/about/executions/details", about_executions), - web.get("/about/executions/records", about_execution_records), - web.get("/about/usage/system", about_system_usage), - web.get("/about/config", about_config), - # /control APIs are used to control the VMs and access their logs - web.post("/control/allocation/notify", notify_allocation), - web.get("/control/machine/{ref}/logs", stream_logs), - web.post("/control/machine/{ref}/expire", operate_expire), - web.post("/control/machine/{ref}/stop", operate_stop), - web.post("/control/machine/{ref}/erase", operate_erase), - web.post("/control/machine/{ref}/reboot", operate_reboot), - # /status APIs are used to check that the VM Orchestrator is running properly - web.get("/status/check/fastapi", status_check_fastapi), - web.get("/status/check/fastapi/legacy", status_check_fastapi_legacy), - web.get("/status/check/host", status_check_host), - web.get("/status/check/version", status_check_version), - web.get("/status/check/ipv6", status_check_ipv6), - web.get("/status/config", status_public_config), -] -routes = app.add_routes(cors_routes) -for route in routes: - cors.add(route) - - -# Routes that don't need CORS enabled -other_routes = [ - # /control APIs are used to control the VMs and access their logs - web.post("/control/allocations", update_allocations), - # Raise an HTTP Error 404 if attempting to access an unknown URL within these paths. 
- web.get("/about/{suffix:.*}", http_not_found), - web.get("/control/{suffix:.*}", http_not_found), - web.get("/status/{suffix:.*}", http_not_found), - # /static is used to serve static files - web.static("/static", Path(__file__).parent / "views/static"), - # /vm is used to launch VMs on-demand - web.route("*", "/vm/{ref}{suffix:.*}", run_code_from_path), - web.route("*", "/{suffix:.*}", run_code_from_hostname), -] -app.add_routes(other_routes) + # Routes that need CORS enabled + cors_routes = [ + # /about APIs return information about the VM Orchestrator + web.get("/about/login", about_login), + web.get("/about/executions/list", list_executions), + web.get("/about/executions/details", about_executions), + web.get("/about/executions/records", about_execution_records), + web.get("/about/usage/system", about_system_usage), + web.get("/about/config", about_config), + # /control APIs are used to control the VMs and access their logs + web.post("/control/allocation/notify", notify_allocation), + web.get("/control/machine/{ref}/logs", stream_logs), + web.post("/control/machine/{ref}/expire", operate_expire), + web.post("/control/machine/{ref}/stop", operate_stop), + web.post("/control/machine/{ref}/erase", operate_erase), + web.post("/control/machine/{ref}/reboot", operate_reboot), + # /status APIs are used to check that the VM Orchestrator is running properly + web.get("/status/check/fastapi", status_check_fastapi), + web.get("/status/check/fastapi/legacy", status_check_fastapi_legacy), + web.get("/status/check/host", status_check_host), + web.get("/status/check/version", status_check_version), + web.get("/status/check/ipv6", status_check_ipv6), + web.get("/status/config", status_public_config), + ] + routes = app.add_routes(cors_routes) + for route in routes: + cors.add(route) + + # Routes that don't need CORS enabled + other_routes = [ + # /control APIs are used to control the VMs and access their logs + web.post("/control/allocations", update_allocations), + # Raise an HTTP Error 404 if attempting to access an unknown URL within these paths. + web.get("/about/{suffix:.*}", http_not_found), + web.get("/control/{suffix:.*}", http_not_found), + web.get("/status/{suffix:.*}", http_not_found), + # /static is used to serve static files + web.static("/static", Path(__file__).parent / "views/static"), + # /vm is used to launch VMs on-demand + web.route("*", "/vm/{ref}{suffix:.*}", run_code_from_path), + web.route("*", "/{suffix:.*}", run_code_from_hostname), + ] + app.add_routes(other_routes) + return app async def stop_all_vms(app: web.Application): @@ -153,6 +154,7 @@ def run(): # Require a random token to access /about APIs secret_token = token_urlsafe(nbytes=32) + app = setup_webapp() # Store app singletons. Note that app["pubsub"] will also be created. 
app["secret_token"] = secret_token app["vm_pool"] = pool diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index 73bcfec45..58cad0d69 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -2,12 +2,13 @@ from aiohttp import web from aleph.vm.conf import settings -from aleph.vm.orchestrator.supervisor import app +from aleph.vm.orchestrator.supervisor import setup_webapp @pytest.mark.asyncio async def test_allocation_fails_on_invalid_item_hash(aiohttp_client): """Test that the allocation endpoint fails when an invalid item_hash is provided.""" + app = setup_webapp() client = await aiohttp_client(app) settings.ALLOCATION_TOKEN_HASH = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08" # = "test" response: web.Response = await client.post( @@ -29,6 +30,7 @@ async def test_allocation_fails_on_invalid_item_hash(aiohttp_client): @pytest.mark.asyncio async def test_system_usage(aiohttp_client): """Test that the allocation endpoint fails when an invalid item_hash is provided.""" + app = setup_webapp() client = await aiohttp_client(app) settings.ALLOCATION_TOKEN_HASH = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08" # = "test" response: web.Response = await client.get("/about/usage/system") From f9133980e5388e166bb52a52d2aa01b5c8cf3eba Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 11:42:19 +0200 Subject: [PATCH 27/39] revert local compat change --- src/aleph/vm/orchestrator/resources.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 29e819079..1679c0525 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -87,8 +87,8 @@ def get_machine_properties() -> MachineProperties: cpu_info = cpuinfo.get_cpu_info() # Slow return MachineProperties( cpu=CpuProperties( - architecture=cpu_info.get("raw_arch_string"), - vendor=cpu_info.get("vendor_id"), + architecture=cpu_info["raw_arch_string"], + vendor=cpu_info["vendor_id"], ), ) From fdbc765fd431bacd9f20f8bec4ae75f284d275d5 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 12:57:04 +0200 Subject: [PATCH 28/39] remove force settings the loop which was causing problem with future from different loop in web requests --- src/aleph/vm/pool.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 64572bb47..19c0dc5ce 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -50,7 +50,6 @@ def __init__(self, loop: asyncio.AbstractEventLoop): self.executions = {} self.message_cache = {} - asyncio.set_event_loop(loop) self.creation_lock = asyncio.Lock() self.network = ( From f59cc5fe21379acb41838485ab18f4907ee52bea Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 13:32:43 +0200 Subject: [PATCH 29/39] fix other double loop problems --- src/aleph/vm/orchestrator/supervisor.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index b9b2a0078..484d69e7f 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -139,12 +139,12 @@ async def stop_all_vms(app: web.Application): def run(): """Run the VM Supervisor.""" + loop = asyncio.new_event_loop() settings.check() engine = setup_engine() asyncio.run(create_tables(engine)) - loop = asyncio.new_event_loop() 
     pool = VmPool(loop)
     pool.setup()
 
@@ -169,10 +169,10 @@ def run():
             app.on_cleanup.append(stop_all_vms)
 
             logger.info("Loading existing executions ...")
-            asyncio.run(pool.load_persistent_executions())
+            loop.run_until_complete(pool.load_persistent_executions())
 
             logger.info(f"Starting the web server on http://{settings.SUPERVISOR_HOST}:{settings.SUPERVISOR_PORT}")
-            web.run_app(app, host=settings.SUPERVISOR_HOST, port=settings.SUPERVISOR_PORT)
+            web.run_app(app, host=settings.SUPERVISOR_HOST, port=settings.SUPERVISOR_PORT, loop=loop)
         except OSError as e:
             if e.errno == 98:
                 logger.error(

From 4aa3eb8c7f564520331b3ba4c0a0490b9eba4cc0 Mon Sep 17 00:00:00 2001
From: Olivier Le Thanh Duong
Date: Tue, 30 Apr 2024 13:34:47 +0200
Subject: [PATCH 30/39] Fix inconsistent execution state

This was occasionally causing a crash when trying a second launch.
---
 src/aleph/vm/pool.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py
index 19c0dc5ce..6dcfd4c1a 100644
--- a/src/aleph/vm/pool.py
+++ b/src/aleph/vm/pool.py
@@ -241,6 +241,7 @@ async def load_persistent_executions(self):
             if await self.systemd_manager.is_service_active(
                 execution.controller_service
             ):  # TODO: Improve the way that we re-create running execution
+                logger.debug(("Execution %s is still running in systemd, reconnecting", execution.vm_hash))
                 await execution.prepare()
                 if self.network:
                     vm_type = VmType.from_message_content(execution.message)
@@ -251,16 +252,21 @@ async def load_persistent_executions(self):
                 vm = execution.create(vm_id=vm_id, tap_interface=tap_interface, prepare=False)
                 await vm.start_guest_api()
                 execution.ready_event.set()
+                execution.times.starting_at = execution.times.starting_at or datetime.now(tz=timezone.utc)
                 execution.times.started_at = datetime.now(tz=timezone.utc)
-
+                execution.times.stopping_at = None
+                execution.times.stopped_at = None
                 self._schedule_forget_on_stop(execution)
 
                 # Start the snapshot manager for the VM
                 if vm.support_snapshot and self.snapshot_manager:
                     await self.snapshot_manager.start_for(vm=execution.vm)
+                assert execution.is_running
                 self.executions[vm_hash] = execution
+            else:
+                logger.debug(("Execution %s is not running in systemd, reconnecting", execution.vm_hash))
 
             execution.uuid = saved_execution.uuid
             await execution.record_usage()

From 69ed5552fc8f8d79e92341bcfd2c13d2cd044faf Mon Sep 17 00:00:00 2001
From: Olivier Le Thanh Duong
Date: Tue, 30 Apr 2024 14:21:28 +0200
Subject: [PATCH 31/39] Remove unused loop params

---
 src/aleph/vm/orchestrator/cli.py        | 6 ++----
 src/aleph/vm/orchestrator/supervisor.py | 2 +-
 src/aleph/vm/pool.py                    | 2 +-
 3 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/aleph/vm/orchestrator/cli.py b/src/aleph/vm/orchestrator/cli.py
index 65b290ba2..1c56c232e 100644
--- a/src/aleph/vm/orchestrator/cli.py
+++ b/src/aleph/vm/orchestrator/cli.py
@@ -188,8 +188,7 @@ async def fake_read() -> bytes:
 
     bench: list[float] = []
 
-    loop = asyncio.get_event_loop()
-    pool = VmPool(loop)
+    pool = VmPool()
     pool.setup()
 
     # Does not make sense in benchmarks
@@ -246,8 +245,7 @@ async def start_instance(item_hash: ItemHash, pubsub: Optional[PubSub], pool) ->
 async def run_instances(instances: list[ItemHash]) -> None:
     """Run instances from a list of message identifiers."""
     logger.info(f"Instances to run: {instances}")
-    loop = asyncio.get_event_loop()
-    pool = VmPool(loop)
+    pool = VmPool()
     # The main program uses a singleton pubsub instance in order to watch for updates.
     # We create another instance here since that singleton is not initialized yet.
     # Watching for updates on this instance will therefore not work.
diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py
index 484d69e7f..efb1a12d1 100644
--- a/src/aleph/vm/orchestrator/supervisor.py
+++ b/src/aleph/vm/orchestrator/supervisor.py
@@ -145,7 +145,7 @@ def run():
     engine = setup_engine()
     asyncio.run(create_tables(engine))
 
-    pool = VmPool(loop)
+    pool = VmPool()
     pool.setup()
 
     hostname = settings.DOMAIN_NAME
diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py
index 6dcfd4c1a..051c6f7f9 100644
--- a/src/aleph/vm/pool.py
+++ b/src/aleph/vm/pool.py
@@ -45,7 +45,7 @@ class VmPool:
     systemd_manager: SystemDManager
     creation_lock: asyncio.Lock
 
-    def __init__(self, loop: asyncio.AbstractEventLoop):
+    def __init__(self):
         self.counter = settings.START_ID_INDEX
         self.executions = {}
         self.message_cache = {}

From 6e84b2600880b381223814d9829c3f1d62230045 Mon Sep 17 00:00:00 2001
From: Olivier Le Thanh Duong
Date: Tue, 30 Apr 2024 14:11:52 +0200
Subject: [PATCH 32/39] Apparently CI also doesn't have a matching arch

---
 src/aleph/vm/orchestrator/resources.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py
index 1679c0525..448a822c5 100644
--- a/src/aleph/vm/orchestrator/resources.py
+++ b/src/aleph/vm/orchestrator/resources.py
@@ -87,8 +87,8 @@ def get_machine_properties() -> MachineProperties:
     cpu_info = cpuinfo.get_cpu_info()  # Slow
     return MachineProperties(
         cpu=CpuProperties(
-            architecture=cpu_info["raw_arch_string"],
-            vendor=cpu_info["vendor_id"],
+            architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
+            vendor=cpu_info.get("vendor_id", cpu_info.get("vendor_id_raw")),
         ),
     )

From b80061242900de8590e940e95143a92cff5258a9 Mon Sep 17 00:00:00 2001
From: Olivier Le Thanh Duong
Date: Tue, 30 Apr 2024 15:00:09 +0200
Subject: [PATCH 33/39] Fix test description

---
 tests/supervisor/test_views.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py
index 58cad0d69..abd375be1 100644
--- a/tests/supervisor/test_views.py
+++ b/tests/supervisor/test_views.py
@@ -29,10 +29,9 @@ async def test_allocation_fails_on_invalid_item_hash(aiohttp_client):
 
 @pytest.mark.asyncio
 async def test_system_usage(aiohttp_client):
-    """Test that the allocation endpoint fails when an invalid item_hash is provided."""
+    """Test that the system usage endpoint responds. No auth needed."""
     app = setup_webapp()
     client = await aiohttp_client(app)
-    settings.ALLOCATION_TOKEN_HASH = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08"  # = "test"
     response: web.Response = await client.get("/about/usage/system")
     assert response.status == 200
     # check if it is valid json

From 5fe46ac4e5979f397ffe4b7d875caed8f2235847 Mon Sep 17 00:00:00 2001
From: Olivier Le Thanh Duong
Date: Tue, 30 Apr 2024 15:03:11 +0200
Subject: [PATCH 34/39] Problem: the allocation endpoint was not tested

Solution: Start by adding some simple tests

We don't test the full allocation and deallocation here,
just the auth.
---
 src/aleph/vm/orchestrator/views/__init__.py |  6 +++
 tests/supervisor/test_views.py              | 53 +++++++++++++++++++++
 2 files changed, 59 insertions(+)

diff --git a/src/aleph/vm/orchestrator/views/__init__.py b/src/aleph/vm/orchestrator/views/__init__.py
index 177e6a348..1c614d428 100644
--- a/src/aleph/vm/orchestrator/views/__init__.py
+++ b/src/aleph/vm/orchestrator/views/__init__.py
@@ -341,6 +341,12 @@ def authenticate_api_request(request: web.Request) -> bool:
 
 
 async def update_allocations(request: web.Request):
+    """Main entry point for starting persistent VMs and instances, called by the CCN.
+
+    Auth is via settings.ALLOCATION_TOKEN_HASH, checked against the X-Auth-Signature header.
+    Receives the list of VMs and instances that should be present, then matches that
+    state by stopping and launching VMs.
+    """
     if not authenticate_api_request(request):
         return web.HTTPUnauthorized(text="Authentication token received is invalid")
 
diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py
index abd375be1..6788667fe 100644
--- a/tests/supervisor/test_views.py
+++ b/tests/supervisor/test_views.py
@@ -38,3 +38,56 @@ async def test_system_usage(aiohttp_client):
     resp = await response.json()
     assert "cpu" in resp
     assert resp["cpu"]["count"] > 0
+
+
+@pytest.mark.asyncio
+async def test_allocation_invalid_auth_token(aiohttp_client):
+    """Test that the allocation endpoint fails when an invalid auth token is provided."""
+    settings.ALLOCATION_TOKEN_HASH = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08"  # = "test"
+    app = setup_webapp()
+    client = await aiohttp_client(app)
+    response = await client.post(
+        "/control/allocations",
+        json={"persistent_vms": []},
+        headers={"X-Auth-Signature": "notTest"},
+    )
+    assert response.status == 401
+    assert await response.text() == "Authentication token received is invalid"
+
+
+@pytest.mark.asyncio
+async def test_allocation_missing_auth_token(aiohttp_client):
+    """Test that the allocation endpoint fails when auth token is not provided."""
+    app = setup_webapp()
+    client = await aiohttp_client(app)
+    response: web.Response = await client.post(
+        "/control/allocations",
+        json={"persistent_vms": []},
+    )
+    assert response.status == 401
+    assert await response.text() == "Authentication token is missing"
+
+
+@pytest.mark.asyncio
+async def test_allocation_valid_token(aiohttp_client):
+    """Test that the allocation endpoint succeeds when a valid auth token is provided.
+ + This is a very simple test that don't start or stop any VM so the mock is minimal""" + + class FakeVmPool: + def get_persistent_executions(self): + return [] + + settings.ALLOCATION_TOKEN_HASH = "9f86d081884c7d659a2feaa0c55ad015a3bf4f1b2b0b822cd15d6c15b0f00a08" # = "test" + app = setup_webapp() + app["vm_pool"] = FakeVmPool() + app["pubsub"] = FakeVmPool() + client = await aiohttp_client(app) + + response: web.Response = await client.post( + "/control/allocations", + json={"persistent_vms": []}, + headers={"X-Auth-Signature": "test"}, + ) + assert response.status == 200 + assert await response.json() == {"success": True, "successful": [], "failing": [], "errors": {}} From 5aabea210e1cb8e45c08301b9b660131dc8c29f4 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Tue, 30 Apr 2024 15:17:27 +0200 Subject: [PATCH 35/39] style --- src/aleph/vm/pool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 051c6f7f9..73fbfa48d 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -241,7 +241,7 @@ async def load_persistent_executions(self): if await self.systemd_manager.is_service_active( execution.controller_service ): # TODO: Improve the way that we re-create running execution - logger.debug(("Execution %s is still running in systemd, reconnecting", execution.vm_hash)) + logger.debug("Execution %s is still running in systemd, reconnecting", execution.vm_hash) await execution.prepare() if self.network: vm_type = VmType.from_message_content(execution.message) From c0003508b289bb7d01d26b91a79172589211f3b1 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 2 May 2024 12:50:28 +0200 Subject: [PATCH 36/39] black --- src/aleph/vm/orchestrator/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/cli.py b/src/aleph/vm/orchestrator/cli.py index e253c2930..38a16a3b6 100644 --- a/src/aleph/vm/orchestrator/cli.py +++ b/src/aleph/vm/orchestrator/cli.py @@ -251,7 +251,6 @@ async def run_instances(instances: list[ItemHash]) -> None: # Watching for updates on this instance will therefore not work. pubsub: Optional[PubSub] = None - await asyncio.gather(*[start_instance(instance_id, pubsub, pool) for instance_id in instances]) await asyncio.Event().wait() # wait forever From 975ada625ea82c3ddee503e410e0ac288cd1e1a4 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Thu, 2 May 2024 13:35:28 +0200 Subject: [PATCH 37/39] Fix bug found on debian 11 / python 3.9 droplet --- src/aleph/vm/orchestrator/supervisor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index db622be08..def871d18 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -140,7 +140,11 @@ async def stop_all_vms(app: web.Application): def run(): """Run the VM Supervisor.""" + # Loop creation set here to avoid bug with Future on different loop + loop = asyncio.new_event_loop() + # apparently needed for Python 3.9 / Debian 11 + asyncio.set_event_loop(loop) settings.check() engine = setup_engine() @@ -163,7 +167,6 @@ def run(): try: if settings.WATCH_FOR_MESSAGES: - # FIXME We have a bug because task run on app.on_ don't run on the same loop? 
app.on_startup.append(start_watch_for_messages_task) app.on_startup.append(start_payment_monitoring_task) app.on_cleanup.append(stop_watch_for_messages_task) From 0e2ca01af5208f2a56822c62dfc1280bfc710ad9 Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 3 May 2024 10:35:08 +0200 Subject: [PATCH 38/39] Try another way to fix python 3.9 / Debian 11 --- src/aleph/vm/orchestrator/cli.py | 7 +++++-- src/aleph/vm/orchestrator/supervisor.py | 2 +- src/aleph/vm/pool.py | 4 +++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/aleph/vm/orchestrator/cli.py b/src/aleph/vm/orchestrator/cli.py index 38a16a3b6..65b290ba2 100644 --- a/src/aleph/vm/orchestrator/cli.py +++ b/src/aleph/vm/orchestrator/cli.py @@ -188,7 +188,8 @@ async def fake_read() -> bytes: bench: list[float] = [] - pool = VmPool() + loop = asyncio.get_event_loop() + pool = VmPool(loop) pool.setup() # Does not make sense in benchmarks @@ -245,13 +246,15 @@ async def start_instance(item_hash: ItemHash, pubsub: Optional[PubSub], pool) -> async def run_instances(instances: list[ItemHash]) -> None: """Run instances from a list of message identifiers.""" logger.info(f"Instances to run: {instances}") - pool = VmPool() + loop = asyncio.get_event_loop() + pool = VmPool(loop) # The main program uses a singleton pubsub instance in order to watch for updates. # We create another instance here since that singleton is not initialized yet. # Watching for updates on this instance will therefore not work. pubsub: Optional[PubSub] = None await asyncio.gather(*[start_instance(instance_id, pubsub, pool) for instance_id in instances]) + await asyncio.Event().wait() # wait forever diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index def871d18..04b121d7a 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -150,7 +150,7 @@ def run(): engine = setup_engine() asyncio.run(create_tables(engine)) - pool = VmPool() + pool = VmPool(loop) pool.setup() hostname = settings.DOMAIN_NAME diff --git a/src/aleph/vm/pool.py b/src/aleph/vm/pool.py index 73fbfa48d..c35312129 100644 --- a/src/aleph/vm/pool.py +++ b/src/aleph/vm/pool.py @@ -45,11 +45,13 @@ class VmPool: systemd_manager: SystemDManager creation_lock: asyncio.Lock - def __init__(self): + def __init__(self, loop: asyncio.AbstractEventLoop): self.counter = settings.START_ID_INDEX self.executions = {} self.message_cache = {} + # apparently needed for Python 3.9 / Debian 11 + asyncio.set_event_loop(loop) self.creation_lock = asyncio.Lock() self.network = ( From b02c3c2f4425dde4c63fbc3dc897fe59f95f65be Mon Sep 17 00:00:00 2001 From: Olivier Le Thanh Duong Date: Fri, 17 May 2024 10:54:02 +0200 Subject: [PATCH 39/39] Split the systems Protocol in own module --- src/aleph/vm/systemd.py | 130 +------------------------------ src/aleph/vm/systemd_helpers.py | 131 ++++++++++++++++++++++++++++++++ 2 files changed, 134 insertions(+), 127 deletions(-) create mode 100644 src/aleph/vm/systemd_helpers.py diff --git a/src/aleph/vm/systemd.py b/src/aleph/vm/systemd.py index 3b3c15c57..3bd619bb7 100644 --- a/src/aleph/vm/systemd.py +++ b/src/aleph/vm/systemd.py @@ -2,139 +2,15 @@ async SystemD Manager implementation. 
""" -import enum import logging -from typing import Literal, Optional, Protocol, runtime_checkable +from typing import Optional from dbus_fast import BusType, DBusError from dbus_fast.aio import MessageBus, ProxyObject -logger = logging.getLogger(__name__) - - -class UnitFileState(str, enum.Enum): - """This StrEnum class represents the different possible states of a unit file.""" - - ENABLED = "enabled" - """Indicates that a unit file is permanently enabled.""" - - ENABLED_RUNTIME = "enabled-runtime" - """Indicates the unit file is only temporarily enabled and will no longer be enabled after a reboot - (that means, it is enabled via /run/ symlinks, rather than /etc/).""" - - LINKED = "linked" - """Indicates that a unit is linked into /etc/ permanently.""" - - LINKED_RUNTIME = "linked-runtime" - """Indicates that a unit is linked into /run/ temporarily (until the next reboot).""" - - MASKED = "masked" - """Indicates that the unit file is masked permanently.""" - - MASKED_RUNTIME = "masked-runtime" - """Indicates that it is masked in /run/ temporarily (until the next reboot).""" - - STATIC = "static" - """Indicates that the unit is statically enabled, i.e. always enabled and doesn't need to be enabled explicitly.""" - - DISABLED = "disabled" - """Indicates that the unit file is not enabled.""" - - INVALID = "invalid" - """Indicates that it could not be determined whether the unit file is enabled.""" - - -UnitFileStateLiteral = Literal[ - "enabled", - "enabled-runtime", - "linked", - "linked-runtime", - "masked", - "masked-runtime", - "static", - "disabled", - "invalid", -] +from aleph.vm.systemd_helpers import UnitFileState, Mode, ActiveState, SystemdProxy, UnitProxy - -class Mode(str, enum.Enum): - REPLACE = "replace" - FAIL = "fail" - ISOLATE = "isolate" - IGNORE_DEPENDENCIES = "ignore-dependencies" - IGNORE_REQUIREMENTS = "ignore-requirements" - - -class ActiveState(str, enum.Enum): - """ - ActiveState contains a state value that reflects the unit's current status. - """ - - ACTIVE = "active" - """ - The unit is active. - """ - - RELOADING = "reloading" - """ - The unit is active and reloading its configuration. - """ - - INACTIVE = "inactive" - """ - The unit is inactive, previous run was successful or hasn't yet occurred. - """ - - FAILED = "failed" - """ - The unit is inactive, previous run was unsuccessful. - """ - - ACTIVATING = "activating" - """ - The unit is transitioning from inactive to active state. - """ - - DEACTIVATING = "deactivating" - """ - The unit is in the process of deactivation. - """ - - -ActiveStateLiteral = Literal["active", "reloading", "inactive", "failed", "activating", "deactivating"] - - -@runtime_checkable -class SystemdProxy(Protocol): - """ABC for typing. - - for description of methodsp - see https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#The%20Manager%20Object""" - - async def call_enable_unit_files(self, files: list[str], runtime: bool, force: bool): ... - - async def call_get_unit_file_state(self, service) -> UnitFileStateLiteral: ... - - async def call_start_unit(self, name, mode): - pass - - async def call_stop_unit(self, name, mode): ... - - async def call_restart_unit(self, name, mode): ... - - async def call_disable_unit_files(self, files: list[str], runtime: bool): ... - - async def call_get_unit(self, name: str) -> str: ... - - -@runtime_checkable -class UnitProxy(Protocol): - """for typing. 
- - for description of methods see - https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#Service%20Unit%20Objects""" - - async def get_active_state(self) -> ActiveStateLiteral: ... +logger = logging.getLogger(__name__) class SystemDManager: diff --git a/src/aleph/vm/systemd_helpers.py b/src/aleph/vm/systemd_helpers.py new file mode 100644 index 000000000..2dd6f9f72 --- /dev/null +++ b/src/aleph/vm/systemd_helpers.py @@ -0,0 +1,131 @@ +"""Typing helpers for talking to systemd via dbus + +The proxy object interface are determined at runtimes""" + +import enum +from typing import Literal, runtime_checkable, Protocol + + +class UnitFileState(str, enum.Enum): + """This StrEnum class represents the different possible states of a unit file.""" + + ENABLED = "enabled" + """Indicates that a unit file is permanently enabled.""" + + ENABLED_RUNTIME = "enabled-runtime" + """Indicates the unit file is only temporarily enabled and will no longer be enabled after a reboot + (that means, it is enabled via /run/ symlinks, rather than /etc/).""" + + LINKED = "linked" + """Indicates that a unit is linked into /etc/ permanently.""" + + LINKED_RUNTIME = "linked-runtime" + """Indicates that a unit is linked into /run/ temporarily (until the next reboot).""" + + MASKED = "masked" + """Indicates that the unit file is masked permanently.""" + + MASKED_RUNTIME = "masked-runtime" + """Indicates that it is masked in /run/ temporarily (until the next reboot).""" + + STATIC = "static" + """Indicates that the unit is statically enabled, i.e. always enabled and doesn't need to be enabled explicitly.""" + + DISABLED = "disabled" + """Indicates that the unit file is not enabled.""" + + INVALID = "invalid" + """Indicates that it could not be determined whether the unit file is enabled.""" + + +UnitFileStateLiteral = Literal[ + "enabled", + "enabled-runtime", + "linked", + "linked-runtime", + "masked", + "masked-runtime", + "static", + "disabled", + "invalid", +] + + +class Mode(str, enum.Enum): + REPLACE = "replace" + FAIL = "fail" + ISOLATE = "isolate" + IGNORE_DEPENDENCIES = "ignore-dependencies" + IGNORE_REQUIREMENTS = "ignore-requirements" + + +class ActiveState(str, enum.Enum): + """ + ActiveState contains a state value that reflects the unit's current status. + """ + + ACTIVE = "active" + """ + The unit is active. + """ + + RELOADING = "reloading" + """ + The unit is active and reloading its configuration. + """ + + INACTIVE = "inactive" + """ + The unit is inactive, previous run was successful or hasn't yet occurred. + """ + + FAILED = "failed" + """ + The unit is inactive, previous run was unsuccessful. + """ + + ACTIVATING = "activating" + """ + The unit is transitioning from inactive to active state. + """ + + DEACTIVATING = "deactivating" + """ + The unit is in the process of deactivation. + """ + + +ActiveStateLiteral = Literal["active", "reloading", "inactive", "failed", "activating", "deactivating"] + + +@runtime_checkable +class SystemdProxy(Protocol): + """ABC for typing. + + for description of methods + see https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#The%20Manager%20Object""" + + async def call_enable_unit_files(self, files: list[str], runtime: bool, force: bool): ... + + async def call_get_unit_file_state(self, service) -> UnitFileStateLiteral: ... + + async def call_start_unit(self, name, mode): + pass + + async def call_stop_unit(self, name, mode): ... + + async def call_restart_unit(self, name, mode): ... 
+ + async def call_disable_unit_files(self, files: list[str], runtime: bool): ... + + async def call_get_unit(self, name: str) -> str: ... + + +@runtime_checkable +class UnitProxy(Protocol): + """for typing. + + for description of methods see + https://www.freedesktop.org/software/systemd/man/latest/org.freedesktop.systemd1.html#Service%20Unit%20Objects""" + + async def get_active_state(self) -> ActiveStateLiteral: ...
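
Note on the new typing helpers: the Protocol classes above only describe the snake_case
methods (call_*, get_*) that dbus-fast generates on its proxy interfaces at runtime.
A minimal sketch of how they can be used to query a unit's ActiveState over the system
bus follows; it is not part of the patch series, the unit name example.service is a
placeholder, and error handling (DBusError when the unit is not loaded) is omitted.

import asyncio

from dbus_fast import BusType
from dbus_fast.aio import MessageBus

from aleph.vm.systemd_helpers import SystemdProxy, UnitProxy


async def get_unit_active_state(service: str) -> str:
    # systemd (PID 1) exposes its API on the system bus.
    bus = await MessageBus(bus_type=BusType.SYSTEM).connect()

    # Build a proxy for the Manager object; dbus-fast adds the call_* methods at runtime.
    path = "/org/freedesktop/systemd1"
    introspection = await bus.introspect("org.freedesktop.systemd1", path)
    proxy = bus.get_proxy_object("org.freedesktop.systemd1", path, introspection)
    manager = proxy.get_interface("org.freedesktop.systemd1.Manager")
    assert isinstance(manager, SystemdProxy)  # runtime_checkable Protocol

    # Resolve the unit's object path, then read its ActiveState property.
    unit_path = await manager.call_get_unit(service)
    unit_introspection = await bus.introspect("org.freedesktop.systemd1", unit_path)
    unit_proxy = bus.get_proxy_object("org.freedesktop.systemd1", unit_path, unit_introspection)
    unit = unit_proxy.get_interface("org.freedesktop.systemd1.Unit")
    assert isinstance(unit, UnitProxy)
    return await unit.get_active_state()


if __name__ == "__main__":
    # Placeholder unit name; a real caller would pass e.g. a VM controller service.
    print(asyncio.run(get_unit_active_state("example.service")))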