Skip to content

Commit

Permalink
Merge pull request #122 from canonical/DPE-5587-avoid-socket-gethostname
Browse files Browse the repository at this point in the history
[DPE-5587] Fix private IP resolution to use juju spaces
  • Loading branch information
Mehdi-Bendriss authored Oct 14, 2024
2 parents 7568a57 + 8910a3d commit 017d5c5
Show file tree
Hide file tree
Showing 21 changed files with 487 additions and 65 deletions.
21 changes: 20 additions & 1 deletion lib/charms/data_platform_libs/v0/data_interfaces.py
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,7 @@ def _on_topic_requested(self, event: TopicRequestedEvent):

# Increment this PATCH version before using `charmcraft publish-lib` or reset
# to 0 if you are raising the major API version
LIBPATCH = 39
LIBPATCH = 40

PYDEPS = ["ops>=2.0.0"]

Expand Down Expand Up @@ -391,6 +391,10 @@ class IllegalOperationError(DataInterfacesError):
"""To be used when an operation is not allowed to be performed."""


class PrematureDataAccessError(DataInterfacesError):
"""To be raised when the Relation Data may be accessed (written) before protocol init complete."""


##############################################################################
# Global helpers / utilities
##############################################################################
Expand Down Expand Up @@ -1453,6 +1457,8 @@ def _on_relation_changed_event(self, event: RelationChangedEvent) -> None:
class ProviderData(Data):
"""Base provides-side of the data products relation."""

RESOURCE_FIELD = "database"

def __init__(
self,
model: Model,
Expand Down Expand Up @@ -1618,6 +1624,15 @@ def _fetch_my_specific_relation_data(
def _update_relation_data(self, relation: Relation, data: Dict[str, str]) -> None:
"""Set values for fields not caring whether it's a secret or not."""
req_secret_fields = []

keys = set(data.keys())
if self.fetch_relation_field(relation.id, self.RESOURCE_FIELD) is None and (
keys - {"endpoints", "read-only-endpoints", "replset"}
):
raise PrematureDataAccessError(
"Premature access to relation data, update is forbidden before the connection is initialized."
)

if relation.app:
req_secret_fields = get_encoded_list(relation, relation.app, REQ_SECRET_FIELDS)

Expand Down Expand Up @@ -3290,6 +3305,8 @@ class KafkaRequiresEvents(CharmEvents):
class KafkaProviderData(ProviderData):
"""Provider-side of the Kafka relation."""

RESOURCE_FIELD = "topic"

def __init__(self, model: Model, relation_name: str) -> None:
super().__init__(model, relation_name)

Expand Down Expand Up @@ -3539,6 +3556,8 @@ class OpenSearchRequiresEvents(CharmEvents):
class OpenSearchProvidesData(ProviderData):
"""Provider-side of the OpenSearch relation."""

RESOURCE_FIELD = "index"

def __init__(self, model: Model, relation_name: str) -> None:
super().__init__(model, relation_name)

Expand Down
4 changes: 2 additions & 2 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,7 @@ pytest = "^8.2.2"
juju = "~3.5.0"
tenacity = "^8.4.2"
coverage = {extras = ["toml"], version = ">7.0"}
pytest-asyncio = "^0.21.2"
pytest-operator = ">0.20"
pytest-operator-cache = {git = "https://github.com/canonical/data-platform-workflows", tag = "v16.7.0", subdirectory = "python/pytest_plugins/pytest_operator_cache"}
pytest-operator-groups = {git = "https://github.com/canonical/data-platform-workflows", tag = "v16.7.0", subdirectory = "python/pytest_plugins/pytest_operator_groups"}
Expand Down
1 change: 0 additions & 1 deletion src/charm.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,6 @@ def _on_install(self, event: InstallEvent) -> None:

def reconcile(self, event: EventBase) -> None:
"""Generic handler for all 'something changed, update' events across all relations."""

# 1. Block until peer relation is set
if not self.state.peer_relation:
self.unit.status = WaitingStatus(MSG_WAITING_FOR_PEER)
Expand Down
20 changes: 19 additions & 1 deletion src/core/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

"""Collection of global cluster state."""
import logging
from ipaddress import IPv4Address, IPv6Address

from charms.data_platform_libs.v0.data_interfaces import (
DataPeerData,
Expand All @@ -23,6 +24,7 @@
PEER,
PEER_APP_SECRETS,
PEER_UNIT_SECRETS,
SERVER_PORT,
)

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -63,7 +65,7 @@ def opensearch_relation(self) -> Relation | None:

@property
def tls_relation(self) -> Relation | None:
"""The cluster peer relation."""
"""The cluster tls relation."""
return self.model.get_relation(CERTS_REL_NAME)

# --- CORE COMPONENTS---
Expand Down Expand Up @@ -141,6 +143,16 @@ def opensearch_server(self) -> OpensearchServer | None:
local_app=self.cluster.app,
)

@property
def bind_address(self) -> IPv4Address | IPv6Address | str | None:
"""The network binding address from the peer relation."""
bind_address = None
if self.peer_relation:
if binding := self.model.get_binding(self.peer_relation):
bind_address = binding.network.bind_address
# If the relation does not exist, then we get None
return bind_address

# --- CLUSTER INIT ---

@property
Expand All @@ -162,3 +174,9 @@ def stable(self) -> bool:
return False

return True

@property
def url(self) -> str:
"""Service URL."""
scheme = "https" if self.unit_server.tls else "http"
return f"{scheme}://{self.bind_address}:{SERVER_PORT}"
29 changes: 7 additions & 22 deletions src/core/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,12 @@
"""Collection of state objects for relations, apps and units."""
import logging
import socket
import subprocess
from typing import Literal, MutableMapping

from charms.data_platform_libs.v0.data_interfaces import Data, DataDict
from ops.model import Application, Relation, Unit
from typing_extensions import override

from literals import SERVER_PORT

logger = logging.getLogger(__name__)

SUBSTRATES = Literal["vm", "k8s"]
Expand Down Expand Up @@ -178,24 +175,18 @@ def hostname(self) -> str:
@property
def fqdn(self) -> str:
"""The Fully Qualified Domain Name for the unit."""
return socket.getfqdn()
# return socket.getfqdn(self.private_ip)
return socket.getfqdn(self.private_ip)

@property
def private_ip(self) -> str:
"""The IP for the unit."""
"""The IP for the unit recovered using socket."""
return socket.gethostbyname(self.hostname)

@property
def public_ip(self) -> str:
result = subprocess.check_output(
[
"bash",
"-c",
"ip a | grep global | grep -v 'inet 10.' | cut -d' ' -f6 | cut -d'/' -f1",
],
text=True,
)
return result.rstrip()
"""The public IP for the unit."""
return socket.gethostbyname(self.hostname)

@property
def host(self) -> str:
Expand Down Expand Up @@ -245,12 +236,6 @@ def sans(self) -> dict[str, list[str]]:
return {}

return {
"sans_ip": [self.private_ip],
"sans_dns": [self.hostname, self.fqdn],
"sans_ip": [self.private_ip, self.public_ip],
"sans_dns": [dns for dns in {self.hostname, self.fqdn} if dns],
}

@property
def url(self) -> str:
"""Service URL."""
scheme = "https" if self.tls else "http"
return f"{scheme}://{self.private_ip}:{SERVER_PORT}"
16 changes: 10 additions & 6 deletions src/events/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,18 +63,22 @@ def _request_certificates(self):
if self.charm.state.unit_server.tls:
self._remove_certificates()

sans_ip = set(
self.charm.state.unit_server.sans.get("sans_ip", [])
+ [str(self.charm.state.bind_address or "")]
)
sans_dns = set(self.charm.state.unit_server.sans.get("sans_dns", []))

logger.debug(
"Requesting certificate for: "
f"host {self.charm.state.unit_server.host},"
f"with IP {self.charm.state.unit_server.sans.get('sans_ip', [])},"
f"DNS {self.charm.state.unit_server.sans.get('sans_dns', [])}"
f"host {self.charm.state.unit_server.host}, with IP {sans_ip}, DNS {sans_dns}"
)

csr = generate_csr(
private_key=self.charm.state.unit_server.private_key.encode("utf-8"),
subject=self.charm.state.unit_server.private_ip,
sans_ip=self.charm.state.unit_server.sans.get("sans_ip", []),
sans_dns=self.charm.state.unit_server.sans.get("sans_dns", []),
subject=str(self.charm.state.bind_address or self.charm.state.unit_server.private_ip),
sans_ip=list(sans_ip or ""),
sans_dns=list(sans_dns),
)

self.charm.state.unit_server.update({"csr": csr.decode("utf-8").strip()})
Expand Down
1 change: 1 addition & 0 deletions src/literals.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
MSG_STATUS_ERROR = "Service is an error state"
MSG_STATUS_WORKLOAD_DOWN = "Workload is not alive"
MSG_STATUS_UNKNOWN = "Workload status is not known"
MSG_STATUS_APP_REMOVED = "remove-application was requested: leaving..."

MSG_APP_STATUS = [
MSG_STATUS_DB_DOWN,
Expand Down
2 changes: 1 addition & 1 deletion src/managers/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ def request(
if None in [endpoint, method]:
raise ValueError("endpoint or method missing")

full_url = f"{self.state.unit_server.url}/api/{endpoint}"
full_url = f"{self.state.url}/api/{endpoint}"

request_kwargs = {
"verify": self.workload.paths.ca,
Expand Down
3 changes: 2 additions & 1 deletion src/managers/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ def dashboard_properties(self) -> list[str]:

opensearch_ca = self.workload.paths.opensearch_ca if self.state.opensearch_server else ""

properties += [f"server.host: '{self.state.unit_server.private_ip}'"]
# We are using the address exposed by Juju as service address
properties += [f"server.host: '{self.state.bind_address}'"]
properties += (
[
f"opensearch.username: {opensearch_user}",
Expand Down
4 changes: 4 additions & 0 deletions src/managers/health.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from exceptions import OSDAPIError
from literals import (
HEALTH_OPENSEARCH_STATUS_URL,
MSG_STATUS_APP_REMOVED,
MSG_STATUS_DB_DOWN,
MSG_STATUS_DB_MISSING,
MSG_STATUS_ERROR,
Expand Down Expand Up @@ -63,6 +64,9 @@ def status_ok(self) -> tuple[bool, str]:
def opensearch_ok(self) -> tuple[bool, str]:
"""Verify if associated Opensearch service is up and running."""

if not self.state.url:
return False, MSG_STATUS_APP_REMOVED

if not self.state.opensearch_server or not (
os.path.exists(self.workload.paths.opensearch_ca)
and os.path.getsize(self.workload.paths.opensearch_ca) > 0
Expand Down
2 changes: 1 addition & 1 deletion src/managers/tls.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,4 +86,4 @@ def certificate_valid(self) -> bool:
logger.debug(
f"Currently recognized IP using 'gethostbyname': {self.state.unit_server.private_ip}"
)
return self.state.unit_server.private_ip in response
return str(self.state.bind_address) in response
57 changes: 48 additions & 9 deletions tests/integration/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,24 @@
METADATA = yaml.safe_load(Path("./metadata.yaml").read_text())
APP_NAME = METADATA["name"]
OPENSEARCH_APP_NAME = "opensearch"
SERIES = "jammy"

OPENSEARCH_APP_NAME = "opensearch"
OPENSEARCH_RELATION_NAME = "opensearch-client"
OPENSEARCH_CONFIG = {
"logging-config": "<root>=INFO;unit=DEBUG",
"cloudinit-userdata": """postruncmd:
- [ 'sysctl', '-w', 'vm.max_map_count=262144' ]
- [ 'sysctl', '-w', 'fs.file-max=1048576' ]
- [ 'sysctl', '-w', 'vm.swappiness=0' ]
- [ 'sysctl', '-w', 'net.ipv4.tcp_retries2=5' ]
""",
}

TLS_CERTIFICATES_APP_NAME = "self-signed-certificates"
COS_AGENT_APP_NAME = "grafana-agent"
COS_AGENT_RELATION_NAME = "cos-agent"
DB_CLIENT_APP_NAME = "application"


logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -213,7 +231,7 @@ def all_dashboards_unavailable(ops_test: OpsTest, https: bool = False) -> bool:
logger.info(f"Couldn't retrieve host certificate for unit {unit}")
continue

host = get_private_address(ops_test.model.name, unit.name)
host = get_bind_address(ops_test.model.name, unit.name)

# We should retry until a host could be retrieved
if not host:
Expand Down Expand Up @@ -258,7 +276,6 @@ def access_dashboard(
arguments = {"url": url, "headers": headers, "json": data}
if ssl:
arguments["verify"] = "./ca.pem"

response = requests.post(**arguments)
return response.status_code == 200

Expand Down Expand Up @@ -298,7 +315,7 @@ async def access_all_dashboards(
for unit in ops_test.model.applications[APP_NAME].units:
if unit.name in skip:
continue
host = get_private_address(ops_test.model.name, unit.name)
host = get_bind_address(ops_test.model.name, unit.name)
if not host:
logger.error(f"No hostname found for {unit.name}, can't check connection.")
return False
Expand Down Expand Up @@ -371,14 +388,19 @@ async def get_address(ops_test: OpsTest, unit_name: str) -> str:
return address


def get_private_address(model_full_name: str, unit: str):
def get_bind_address(model_full_name: str, unit: str):
try:
private_ip = check_output(
[
"bash",
"-c",
f"JUJU_MODEL={model_full_name} juju ssh {unit} ip a | "
"grep global | grep 'inet 10.*/24' | cut -d' ' -f6 | cut -d'/' -f1",
"juju",
"exec",
f"--model={model_full_name}",
"--unit",
unit,
"--",
"network-get",
OPENSEARCH_RELATION_NAME,
"--bind-address",
],
text=True,
)
Expand Down Expand Up @@ -742,7 +764,7 @@ async def client_run_all_dashboards_request(
return False

for dashboards_unit in ops_test.model.applications[APP_NAME].units:
host = get_private_address(ops_test.model.name, dashboards_unit.name)
host = get_bind_address(ops_test.model.name, dashboards_unit.name)
if not host:
logger.debug(f"No hostname found for {dashboards_unit.name}, can't check connection.")
return False
Expand Down Expand Up @@ -774,3 +796,20 @@ async def destroy_cluster(ops_test, app: str = OPENSEARCH_APP_NAME):
# This case we don't raise an error in the context manager which
# fails to restore the `update-status-hook-interval` value to it's former state.
assert n_apps_after == n_apps_before - 1, "old cluster not destroyed successfully."


async def for_machines(ops_test, machines, state="started"):
for attempt in Retrying(stop=stop_after_attempt(10), wait=wait_fixed(wait=60)):
with attempt:
mach_status = json.loads(
subprocess.check_output(
["juju", "machines", f"--model={ops_test.model.name}", "--format=json"]
)
)["machines"]
for id in machines:
if (
str(id) not in mach_status.keys()
or mach_status[str(id)]["juju-status"]["current"] != state
):
logger.warning(f"machine-{id} either not exist yet or not in {state}")
raise Exception()
2 changes: 2 additions & 0 deletions tests/integration/spaces/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
# Copyright 2024 Canonical Ltd.
# See LICENSE file for licensing details.
Loading

0 comments on commit 017d5c5

Please sign in to comment.