Skip to content

Commit

Permalink
Solve duplicated network issues for ephemeral VMs (#680)
Browse files Browse the repository at this point in the history
* Fix: Solve the failing test by removing it, because it is not used.

* Problem: If the service restarts, the diagnostic VM fails because of network issues.

Solution: When loading saved executions, skip those that are already loaded and load only the persistent ones.

* Fix: Replaced the interface check with interface removal and re-creation.

* Fix: Ensure the IPv6 address is deleted before trying to delete the interface, in case the interface deletion fails.

* Fix: Also delete the IPv4 address to prevent two interfaces from having the same IPv4 address.

---------

Co-authored-by: Andres D. Molins <[email protected]>
  • Loading branch information
nesitor and Andres D. Molins authored Aug 23, 2024
1 parent 6af63ce commit 4aa662b
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 8 deletions.
15 changes: 15 additions & 0 deletions src/aleph/vm/network/interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,19 @@ def add_ip_address(ipr: IPRoute, device_name: str, ip: Union[IPv4Interface, IPv6
logger.error(f"Unknown exception while adding address {ip} to interface {device_name}: {e}")


def delete_ip_address(ipr: IPRoute, device_name: str, ip: Union[IPv4Interface, IPv6Interface]):
    """Delete an IP address from the given interface.

    A ``NetlinkError`` or ``OSError`` raised while removing the address is
    logged and not re-raised, so the caller can continue with its teardown.

    Args:
        ipr: IPRoute handle used to look up the interface and remove the address.
        device_name: Name of the network interface to remove the address from.
        ip: Interface address (IP plus prefix length) to remove.

    Raises:
        MissingInterfaceError: If no interface named ``device_name`` is found.
    """
    interface_index: list[int] = ipr.link_lookup(ifname=device_name)
    if not interface_index:
        raise MissingInterfaceError(f"Interface {device_name} does not exist, can't delete address {ip} from it.")
    try:
        ipr.addr("del", index=interface_index[0], address=str(ip.ip), mask=ip.network.prefixlen)
    except (NetlinkError, OSError) as e:
        # Both failure kinds are handled identically: report and carry on.
        logger.error(f"Unknown exception while deleting address {ip} from interface {device_name}: {e}")


def set_link_up(ipr: IPRoute, device_name: str):
"""Set the given interface up."""
interface_index: list[int] = ipr.link_lookup(ifname=device_name)
Expand Down Expand Up @@ -154,4 +167,6 @@ async def delete(self) -> None:
if self.ndp_proxy:
await self.ndp_proxy.delete_range(self.device_name)
with IPRoute() as ipr:
delete_ip_address(ipr, self.device_name, self.host_ip)
delete_ip_address(ipr, self.device_name, self.host_ipv6)
delete_tap_interface(ipr, self.device_name)
13 changes: 5 additions & 8 deletions src/aleph/vm/pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,9 @@ async def create_a_vm(
if self.network:
vm_type = VmType.from_message_content(message)
tap_interface = await self.network.prepare_tap(vm_id, vm_hash, vm_type)
# If the network interface already exists, remove it and then re-create it.
if self.network.interface_exists(vm_id):
await tap_interface.delete()
await self.network.create_tap(vm_id, tap_interface)
else:
tap_interface = None
Expand Down Expand Up @@ -163,12 +166,6 @@ def get_unique_vm_id(self) -> int:
# anymore.
currently_used_vm_ids = {execution.vm_id for execution in self.executions.values()}
for i in range(settings.START_ID_INDEX, 255**2):

if self.network:
# Check the network interface don't already exists, otherwise it will cause a crash
if self.network.interface_exists(i):
continue

if i not in currently_used_vm_ids:
return i
else:
Expand Down Expand Up @@ -229,8 +226,8 @@ async def load_persistent_executions(self):
for saved_execution in saved_executions:
vm_hash = ItemHash(saved_execution.vm_hash)

if vm_hash in self.executions:
# The execution is already loaded, skip it
if vm_hash in self.executions or not saved_execution.persistent:
# The execution is already loaded or isn't persistent, skip it
continue

vm_id = saved_execution.vm_id
Expand Down

0 comments on commit 4aa662b

Please sign in to comment.