Skip to content

Commit

Permalink
Merge pull request #82 from canonical/DPE-4767/HA_test_flaky_still
Browse files Browse the repository at this point in the history
[DPE-4767] HA network cut tests stabilization
  • Loading branch information
juditnovak authored Aug 29, 2024
2 parents 810f6fc + fd515fe commit 98686b2
Show file tree
Hide file tree
Showing 10 changed files with 749 additions and 110 deletions.
571 changes: 559 additions & 12 deletions lib/charms/grafana_agent/v0/cos_agent.py

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions lib/charms/operator_libs_linux/v0/apt.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 13
LIBPATCH = 14


VALID_SOURCE_TYPES = ("deb", "deb-src")
Expand Down Expand Up @@ -837,7 +837,7 @@ def remove_package(

def update() -> None:
"""Update the apt cache via `apt-get update`."""
subprocess.run(["apt-get", "update"], capture_output=True, check=True)
subprocess.run(["apt-get", "update", "--error-on=any"], capture_output=True, check=True)


def import_key(key: str) -> str:
Expand Down
2 changes: 1 addition & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ poetry-plugin-export = "^1.8.0"
ops = "^2.13.0"
poetry-core = "^1.9.0"
# tls_certificates_interface/v3/tls_certificates.py
cryptography = "42.0.8"
cryptography = "^42.0.8"
jsonschema = "*"
# pinning to avoid: https://github.com/canonical/charmcraft/issues/1722
# We should unpin it once we have rustc 1.76+ available at build time
Expand Down
49 changes: 37 additions & 12 deletions src/events/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
generate_csr,
generate_private_key,
)
from ops.charm import ActionEvent, RelationJoinedEvent
from ops.charm import ActionEvent, RelationCreatedEvent
from ops.framework import EventBase, Object

from literals import CERTS_REL_NAME
Expand All @@ -34,7 +34,8 @@ def __init__(self, charm):
self.certificates = TLSCertificatesRequiresV3(self.charm, CERTS_REL_NAME)

self.framework.observe(
getattr(self.charm.on, "certificates_relation_joined"), self._on_certs_relation_joined
getattr(self.charm.on, "certificates_relation_created"),
self._on_certs_relation_created,
)
self.framework.observe(
getattr(self.certificates.on, "certificate_available"), self._on_certificate_available
Expand All @@ -52,25 +53,49 @@ def __init__(self, charm):

self.framework.observe(getattr(self.charm.on, "config_changed"), self._on_config_changed)

def _on_certs_relation_joined(self, event: RelationJoinedEvent) -> None:
"""Handler for `certificates_relation_joined` event."""
# generate unit private key if not already created by action
def _request_certificates(self):
"""Request brand-new certificates."""
if not self.charm.state.unit_server.private_key:
self.charm.state.unit_server.update(
{"private-key": generate_private_key().decode("utf-8")}
)

if self.charm.state.unit_server.tls:
self._remove_certificates()

logger.debug(
"Requesting certificate for: "
f"host {self.charm.state.unit_server.host},"
f"with IP {self.charm.state.unit_server.sans.get('sans_ip', [])},"
f"DNS {self.charm.state.unit_server.sans.get('sans_dns', [])}"
)

csr = generate_csr(
private_key=self.charm.state.unit_server.private_key.encode("utf-8"),
subject=self.charm.state.unit_server.host,
subject=self.charm.state.unit_server.private_ip,
sans_ip=self.charm.state.unit_server.sans.get("sans_ip", []),
sans_dns=self.charm.state.unit_server.sans.get("sans_dns", []),
)

self.charm.state.unit_server.update({"csr": csr.decode("utf-8").strip()})

self.certificates.request_certificate_creation(certificate_signing_request=csr)

def _remove_certificates(self):
"""Cleanup any existing certificates."""
if self.charm.state.cluster.tls:
self.certificates.request_certificate_revocation(
self.charm.state.unit_server.csr.encode("utf-8")
)
self.charm.state.unit_server.update({"csr": "", "certificate": "", "ca-cert": ""})

# remove all existing keystores from the unit so we don't preserve certs
self.charm.tls_manager.remove_cert_files()

def _on_certs_relation_created(self, event: RelationCreatedEvent) -> None:
"""Handler for `certificates_relation_created` event."""
# request certificates; the unit private key is generated first unless an action already set one
self._request_certificates()

def _on_certificate_available(self, event: CertificateAvailableEvent) -> None:
"""Handler for `certificate_available` event after provider updates signed certs."""
# avoid setting tls files and restarting
Expand Down Expand Up @@ -109,14 +134,14 @@ def _on_certificate_expiring(self, _: EventBase) -> None:
def _on_config_changed(self, event: EventBase):
"""If system configuration (such as IP) changes, certs have to be re-issued."""
if self.charm.state.unit_server.tls and not self.charm.tls_manager.certificate_valid():
self._on_certificate_expiring(event)
self._remove_certificates()
self._request_certificates()

def _on_certs_relation_broken(self, _) -> None:
"""Handler for `certificates_relation_broken` event."""
self.charm.state.unit_server.update({"csr": "", "certificate": "", "ca-cert": ""})

# remove all existing keystores from the unit so we don't preserve certs
self.charm.tls_manager.remove_cert_files()
# If the current certificates are still valid, keep them so the service keeps running smoothly
if self.charm.state.unit_server.tls and not self.charm.tls_manager.certificate_valid():
self._remove_certificates()

def _set_tls_private_key(self, event: ActionEvent) -> None:
"""Handler for `set-tls-private-key` event when user manually specifies private-keys for a unit."""
Expand Down
5 changes: 5 additions & 0 deletions src/managers/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,9 @@ def certificate_valid(self) -> bool:
except CalledProcessError as error:
logging.error(f"Checking certificate failed: {error.output}")
return False

logger.debug(f"Response of openssl cert decode: {response}")
logger.debug(
f"Currently recognized IP using 'gethostbyname': {self.state.unit_server.private_ip}"
)
return self.state.unit_server.private_ip in response
39 changes: 24 additions & 15 deletions tests/integration/ha/test_network_cut.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,12 @@
import yaml
from pytest_operator.plugin import OpsTest

from ..helpers import access_all_dashboards, get_address, get_leader_name
from ..helpers import (
access_all_dashboards,
all_dashboards_unavailable,
get_address,
get_leader_name,
)

logger = logging.getLogger(__name__)

Expand All @@ -32,6 +37,7 @@
""",
}
TLS_CERT_APP_NAME = "self-signed-certificates"
ALL_APPS = [APP_NAME, TLS_CERT_APP_NAME, OPENSEARCH_APP_NAME]
APP_AND_TLS = [APP_NAME, TLS_CERT_APP_NAME]
PEER = "dashboard_peers"
SERVER_PORT = 5601
Expand All @@ -44,7 +50,6 @@


@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.group(1)
@pytest.mark.skip_if_deployed
@pytest.mark.abort_on_fail
Expand Down Expand Up @@ -146,6 +151,8 @@ async def network_cut_leader(ops_test: OpsTest, https: bool = False):
assert new_ip != old_ip
logger.info(f"Old IP {old_ip} has changed to {new_ip}...")

await ops_test.model.wait_for_idle(apps=ALL_APPS, wait_for_active=True, timeout=LONG_TIMEOUT)

logger.info("Checking Dashboard access...")
assert await access_all_dashboards(ops_test, https=https)

Expand Down Expand Up @@ -198,6 +205,8 @@ async def network_throttle_leader(ops_test: OpsTest, https: bool = False):
current_ip = await get_address(ops_test, old_leader_name)
assert old_ip == current_ip

await ops_test.model.wait_for_idle(apps=ALL_APPS, wait_for_active=True, timeout=LONG_TIMEOUT)

logger.info("Checking Dashboard access...")
assert await access_all_dashboards(ops_test, https=https)

Expand All @@ -212,7 +221,7 @@ async def network_cut_application(ops_test: OpsTest, https: bool = False):
machine_name = await ha_helpers.get_unit_machine_name(ops_test, unit.name)
ip = await get_address(ops_test, unit.name)

logger.info("Cutting unit {unit.name} from network...")
logger.info(f"Cutting unit {unit.name} from network...")
ha_helpers.cut_unit_network(machine_name)

machines.append(machine_name)
Expand All @@ -239,7 +248,7 @@ async def network_cut_application(ops_test: OpsTest, https: bool = False):
)

logger.info("Checking lack of Dashboard access...")
assert not (await access_all_dashboards(ops_test, https=https))
assert all_dashboards_unavailable(ops_test, https=https)

logger.info("Restoring network...")
for machine_name in machines:
Expand All @@ -259,6 +268,13 @@ async def network_cut_application(ops_test: OpsTest, https: bool = False):
wait_period=LONG_WAIT,
)

for unit, old_ip in unit_ip_map.items():
new_ip = await get_address(ops_test, unit)
assert new_ip != old_ip
logger.info(f"Old IP {old_ip} has changed to {new_ip}...")

await ops_test.model.wait_for_idle(apps=ALL_APPS, wait_for_active=True, timeout=LONG_TIMEOUT)

logger.info("Checking Dashboard access...")
assert await access_all_dashboards(ops_test, https=https)

Expand All @@ -273,7 +289,7 @@ async def network_throttle_application(ops_test: OpsTest, https: bool = False):
machine_name = await ha_helpers.get_unit_machine_name(ops_test, unit.name)
ip = await get_address(ops_test, unit.name)

logger.info("Cutting unit {unit.name} from network...")
logger.info(f"Cutting unit {unit.name} from network...")
ha_helpers.network_throttle(machine_name)

machines.append(machine_name)
Expand All @@ -300,7 +316,7 @@ async def network_throttle_application(ops_test: OpsTest, https: bool = False):
)

logger.info("Checking lack of Dashboard access...")
assert not (await access_all_dashboards(ops_test, https=https))
assert all_dashboards_unavailable(ops_test, https=https)

logger.info("Restoring network...")
for machine_name in machines:
Expand All @@ -323,6 +339,8 @@ async def network_throttle_application(ops_test: OpsTest, https: bool = False):
for unit in unit_ip_map
)

await ops_test.model.wait_for_idle(apps=ALL_APPS, wait_for_active=True, timeout=LONG_TIMEOUT)

logger.info("Checking Dashboard access...")
assert await access_all_dashboards(ops_test, https=https)

Expand All @@ -332,31 +350,27 @@ async def network_throttle_application(ops_test: OpsTest, https: bool = False):
##############################################################################


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_network_cut_ip_change_leader_http(ops_test: OpsTest, request):
await network_cut_leader(ops_test)


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_network_cut_no_ip_change_leader_http(ops_test: OpsTest, request):
await network_throttle_leader(ops_test)


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_network_cut_ip_change_application_http(ops_test: OpsTest, request):
await network_cut_application(ops_test)


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
Expand All @@ -367,7 +381,6 @@ async def test_network_no_ip_change_application_http(ops_test: OpsTest, request)
##############################################################################


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
Expand All @@ -386,31 +399,27 @@ async def test_set_tls(ops_test: OpsTest, request):
##############################################################################


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_network_cut_ip_change_leader_https(ops_test: OpsTest, request):
await network_cut_leader(ops_test, https=True)


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_network_cut_no_ip_change_leader_https(ops_test: OpsTest, request):
await network_throttle_leader(ops_test, https=True)


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
async def test_network_cut_ip_change_application_https(ops_test: OpsTest, request):
await network_cut_application(ops_test, https=True)


@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903")
@pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"])
@pytest.mark.group(1)
@pytest.mark.abort_on_fail
Expand Down
Loading

0 comments on commit 98686b2

Please sign in to comment.