Skip to content

Commit

Permalink
Fix: Restarting the NDP Proxy sometimes crashed
Browse files Browse the repository at this point in the history
This is likely because it was restarted too quickly.

Solution: Wrap the error in a custom class and retry after 5 seconds.

This is still a work in progress: a better approach would limit the frequency of restarts instead, for example by using an event or a queue.
  • Loading branch information
hoh committed Mar 6, 2024
1 parent c7a2bf9 commit 2340926
Showing 1 changed file with 27 additions and 2 deletions.
29 changes: 27 additions & 2 deletions src/aleph/vm/network/ndp_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,24 @@
and restart the service.
"""

import asyncio
import logging
from dataclasses import dataclass
from ipaddress import IPv6Network
from pathlib import Path
from subprocess import CalledProcessError

from aleph.vm.utils import run_in_subprocess

logger = logging.getLogger(__name__)


class NdpProxyTerminatedError(Exception):
    """Raised when restarting the NDP Proxy fails due to a SIGTERM signal.

    This is a distinct exception type so that callers can tell this
    failure apart from other restart errors and handle it separately,
    for instance by waiting and retrying.
    """


@dataclass
class NdpRule:
address_range: IPv6Network
Expand All @@ -33,15 +41,32 @@ def __init__(self, host_network_interface: str):
@staticmethod
async def _restart_ndppd():
    """Restart the ``ndppd`` service through ``systemctl``.

    Raises:
        NdpProxyTerminatedError: If the restart command was killed by a
            SIGTERM signal, so that callers can wait and retry.
        CalledProcessError: If the restart command failed for any other
            reason; the original error is re-raised unchanged.
    """
    # Local import: keeps this method self-contained without touching the
    # module-level import block.
    import signal

    logger.debug("Restarting ndppd")
    try:
        await run_in_subprocess(["systemctl", "restart", "ndppd"])
    except CalledProcessError as error:
        # A negative return code -N means the child was killed by signal N.
        # Comparing the code is sturdier than matching the text of the
        # exception message, which depends on the repr of signal.Signals.
        if error.returncode == -signal.SIGTERM:
            raise NdpProxyTerminatedError("ndppd was terminated by a SIGTERM signal") from error
        raise

Check warning on line 50 in src/aleph/vm/network/ndp_proxy.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/network/ndp_proxy.py#L50

Added line #L50 was not covered by tests

async def _update_ndppd_conf(self):
    """Rewrite ``/etc/ndppd.conf`` from the current mapping, then restart ndppd.

    One ``rule`` entry is emitted per item of
    ``self.interface_address_range_mapping`` inside a single ``proxy`` section
    for ``self.host_network_interface``.

    The restart is attempted up to three times. When an attempt raises
    ``NdpProxyTerminatedError``, a warning is logged and the next attempt
    starts after a five second pause; the error of the last attempt is
    re-raised to the caller.
    """
    max_attempts = 3
    retry_delay_seconds = 5

    header = f"proxy {self.host_network_interface} {{\n"
    rules = "".join(
        f" rule {address_range} {{\n iface {interface}\n }}\n"
        for interface, address_range in self.interface_address_range_mapping.items()
    )
    Path("/etc/ndppd.conf").write_text(header + rules + "}\n")

    final_attempt = max_attempts - 1
    for attempt in range(max_attempts):
        try:
            await self._restart_ndppd()
        except NdpProxyTerminatedError:
            if attempt >= final_attempt:
                # Out of retries: let the caller see the failure.
                raise
            logger.warning(
                "ndppd was terminated by a SIGTERM signal while restarting. Waiting 5 seconds and retrying."
            )
            await asyncio.sleep(retry_delay_seconds)
        else:
            # Restart succeeded, no further attempt needed.
            break

Check warning on line 69 in src/aleph/vm/network/ndp_proxy.py

View check run for this annotation

Codecov / codecov/patch

src/aleph/vm/network/ndp_proxy.py#L68-L69

Added lines #L68 - L69 were not covered by tests

async def add_range(self, interface: str, address_range: IPv6Network):
logger.debug("Proxying range %s -> %s", address_range, interface)
Expand Down

0 comments on commit 2340926

Please sign in to comment.