From 851bb47963f28939b7895f009a1f915377484adf Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Fri, 9 Aug 2024 15:58:07 +0200 Subject: [PATCH 01/10] Latest cryptography --- poetry.lock | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/poetry.lock b/poetry.lock index d2158867..fabeb130 100644 --- a/poetry.lock +++ b/poetry.lock @@ -3029,4 +3029,4 @@ test = ["big-O", "importlib-resources", "jaraco.functools", "jaraco.itertools", [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "a62aa34d05150c70ea679ef8e4d7f93fa7049de6ed444453100c5e6204f293f5" +content-hash = "5324eac94783161663f276942e7ee10f5f98c70399cc00ca3f3758952d7a2e48" diff --git a/pyproject.toml b/pyproject.toml index a2568eff..59f516bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -43,7 +43,7 @@ poetry-plugin-export = "^1.8.0" ops = "^2.13.0" poetry-core = "^1.9.0" # tls_certificates_interface/v3/tls_certificates.py -cryptography = "42.0.8" +cryptography = "^42.0.8" jsonschema = "*" # pinning to avoid: https://github.com/canonical/charmcraft/issues/1722 # We should unpin it once we have rustc 1.76+ available at build time From fa43f6e6919e09f03cd35428a7574bfa9e4a251b Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Fri, 9 Aug 2024 16:12:23 +0200 Subject: [PATCH 02/10] Updating libs --- lib/charms/grafana_agent/v0/cos_agent.py | 565 ++++++++++++++++++++++- 1 file changed, 554 insertions(+), 11 deletions(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 76157da0..582b70c0 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -206,25 +206,36 @@ def __init__(self, *args): ``` """ +import enum import json import logging +import socket from collections import namedtuple from itertools import chain from pathlib import Path -from typing import TYPE_CHECKING, Any, Callable, ClassVar, Dict, List, Optional, Set, Tuple, Union - +from typing import ( + TYPE_CHECKING, + Any, + Callable, + ClassVar, + Dict, + List, + Literal, + MutableMapping, + Optional, + Set, + Tuple, + Union, +) + +import pydantic from cosl import GrafanaDashboard, JujuTopology from cosl.rules import AlertRules from ops.charm import RelationChangedEvent from ops.framework import EventBase, EventSource, Object, ObjectEvents -from ops.model import Relation +from ops.model import ModelError, Relation from ops.testing import CharmType -try: - import pydantic.v1 as pydantic -except ImportError: - import pydantic - if TYPE_CHECKING: try: from typing import TypedDict @@ -238,7 +249,7 @@ class _MetricsEndpointDict(TypedDict): LIBID = "dc15fa84cef84ce58155fb84f6c6213a" LIBAPI = 0 -LIBPATCH = 9 +LIBPATCH = 10 PYDEPS = ["cosl", "pydantic"] @@ -253,7 +264,207 @@ class _MetricsEndpointDict(TypedDict): SnapEndpoint = namedtuple("SnapEndpoint", "owner, name") -class CosAgentProviderUnitData(pydantic.BaseModel): +# Note: MutableMapping is imported from the typing module and not collections.abc +# because subscripting collections.abc.MutableMapping was added in python 3.9, but +# most of our charms are based on 20.04, which has python 3.8. + +_RawDatabag = MutableMapping[str, str] + + +class TransportProtocolType(str, enum.Enum): + """Receiver Type.""" + + http = "http" + grpc = "grpc" + + +receiver_protocol_to_transport_protocol = { + "zipkin": TransportProtocolType.http, + "kafka": TransportProtocolType.http, + "tempo_http": TransportProtocolType.http, + "tempo_grpc": TransportProtocolType.grpc, + "otlp_grpc": TransportProtocolType.grpc, + "otlp_http": TransportProtocolType.http, + "jaeger_thrift_http": TransportProtocolType.http, +} + +_tracing_receivers_ports = { + # OTLP receiver: see + # https://github.com/open-telemetry/opentelemetry-collector/tree/v0.96.0/receiver/otlpreceiver + "otlp_http": 4318, + "otlp_grpc": 4317, + # Jaeger receiver: see + # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/v0.96.0/receiver/jaegerreceiver + "jaeger_grpc": 14250, + "jaeger_thrift_http": 14268, + # Zipkin receiver: see + # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/v0.96.0/receiver/zipkinreceiver + "zipkin": 9411, +} + +ReceiverProtocol = Literal["otlp_grpc", "otlp_http", "zipkin", "jaeger_thrift_http", "jaeger_grpc"] + + +class TracingError(Exception): + """Base class for custom errors raised by tracing.""" + + +class NotReadyError(TracingError): + """Raised by the provider wrapper if a requirer hasn't published the required data (yet).""" + + +class ProtocolNotRequestedError(TracingError): + """Raised if the user attempts to obtain an endpoint for a protocol it did not request.""" + + +class DataValidationError(TracingError): + """Raised when data validation fails on IPU relation data.""" + + +class AmbiguousRelationUsageError(TracingError): + """Raised when one wrongly assumes that there can only be one relation on an endpoint.""" + + +# TODO we want to eventually use `DatabagModel` from cosl but it likely needs a move to common package first +if int(pydantic.version.VERSION.split(".")[0]) < 2: # type: ignore + + class DatabagModel(pydantic.BaseModel): # type: ignore + """Base databag model.""" + + class Config: + """Pydantic config.""" + + # ignore any extra fields in the databag + extra = "ignore" + """Ignore any extra fields in the databag.""" + allow_population_by_field_name = True + """Allow instantiating this class by field name (instead of forcing alias).""" + + _NEST_UNDER = None + + @classmethod + def load(cls, databag: MutableMapping): + """Load this model from a Juju databag.""" + if cls._NEST_UNDER: + return cls.parse_obj(json.loads(databag[cls._NEST_UNDER])) + + try: + data = { + k: json.loads(v) + for k, v in databag.items() + # Don't attempt to parse model-external values + if k in {f.alias for f in cls.__fields__.values()} + } + except json.JSONDecodeError as e: + msg = f"invalid databag contents: expecting json. {databag}" + logger.error(msg) + raise DataValidationError(msg) from e + + try: + return cls.parse_raw(json.dumps(data)) # type: ignore + except pydantic.ValidationError as e: + msg = f"failed to validate databag: {databag}" + logger.debug(msg, exc_info=True) + raise DataValidationError(msg) from e + + def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): + """Write the contents of this model to Juju databag. + + :param databag: the databag to write the data to. + :param clear: ensure the databag is cleared before writing it. + """ + if clear and databag: + databag.clear() + + if databag is None: + databag = {} + + if self._NEST_UNDER: + databag[self._NEST_UNDER] = self.json(by_alias=True) + return databag + + dct = self.dict() + for key, field in self.__fields__.items(): # type: ignore + value = dct[key] + databag[field.alias or key] = json.dumps(value) + + return databag + +else: + from pydantic import ConfigDict + + class DatabagModel(pydantic.BaseModel): + """Base databag model.""" + + model_config = ConfigDict( + # ignore any extra fields in the databag + extra="ignore", + # Allow instantiating this class by field name (instead of forcing alias). + populate_by_name=True, + # Custom config key: whether to nest the whole datastructure (as json) + # under a field or spread it out at the toplevel. + _NEST_UNDER=None, # type: ignore + arbitrary_types_allowed=True, + ) + """Pydantic config.""" + + @classmethod + def load(cls, databag: MutableMapping): + """Load this model from a Juju databag.""" + nest_under = cls.model_config.get("_NEST_UNDER") # type: ignore + if nest_under: + return cls.model_validate(json.loads(databag[nest_under])) # type: ignore + + try: + data = { + k: json.loads(v) + for k, v in databag.items() + # Don't attempt to parse model-external values + if k in {(f.alias or n) for n, f in cls.__fields__.items()} + } + except json.JSONDecodeError as e: + msg = f"invalid databag contents: expecting json. {databag}" + logger.error(msg) + raise DataValidationError(msg) from e + + try: + return cls.model_validate_json(json.dumps(data)) # type: ignore + except pydantic.ValidationError as e: + msg = f"failed to validate databag: {databag}" + logger.debug(msg, exc_info=True) + raise DataValidationError(msg) from e + + def dump(self, databag: Optional[MutableMapping] = None, clear: bool = True): + """Write the contents of this model to Juju databag. + + :param databag: the databag to write the data to. + :param clear: ensure the databag is cleared before writing it. + """ + if clear and databag: + databag.clear() + + if databag is None: + databag = {} + nest_under = self.model_config.get("_NEST_UNDER") + if nest_under: + databag[nest_under] = self.model_dump_json( # type: ignore + by_alias=True, + # skip keys whose values are default + exclude_defaults=True, + ) + return databag + + dct = self.model_dump() # type: ignore + for key, field in self.model_fields.items(): # type: ignore + value = dct[key] + if value == field.default: + continue + databag[field.alias or key] = json.dumps(value) + + return databag + + +class CosAgentProviderUnitData(DatabagModel): """Unit databag model for `cos-agent` relation.""" # The following entries are the same for all units of the same principal. @@ -271,13 +482,16 @@ class CosAgentProviderUnitData(pydantic.BaseModel): metrics_scrape_jobs: List[Dict] log_slots: List[str] + # Requested tracing protocols. + tracing_protocols: Optional[List[str]] = None + # when this whole datastructure is dumped into a databag, it will be nested under this key. # while not strictly necessary (we could have it 'flattened out' into the databag), # this simplifies working with the model. KEY: ClassVar[str] = "config" -class CosAgentPeersUnitData(pydantic.BaseModel): +class CosAgentPeersUnitData(DatabagModel): """Unit databag model for `peers` cos-agent machine charm peer relation.""" # We need the principal unit name and relation metadata to be able to render identifiers @@ -308,6 +522,83 @@ def app_name(self) -> str: return self.unit_name.split("/")[0] +if int(pydantic.version.VERSION.split(".")[0]) < 2: # type: ignore + + class ProtocolType(pydantic.BaseModel): # type: ignore + """Protocol Type.""" + + class Config: + """Pydantic config.""" + + use_enum_values = True + """Allow serializing enum values.""" + + name: str = pydantic.Field( + ..., + description="Receiver protocol name. What protocols are supported (and what they are called) " + "may differ per provider.", + examples=["otlp_grpc", "otlp_http", "tempo_http"], + ) + + type: TransportProtocolType = pydantic.Field( + ..., + description="The transport protocol used by this receiver.", + examples=["http", "grpc"], + ) + +else: + + class ProtocolType(pydantic.BaseModel): + """Protocol Type.""" + + model_config = pydantic.ConfigDict( + # Allow serializing enum values. + use_enum_values=True + ) + """Pydantic config.""" + + name: str = pydantic.Field( + ..., + description="Receiver protocol name. What protocols are supported (and what they are called) " + "may differ per provider.", + examples=["otlp_grpc", "otlp_http", "tempo_http"], + ) + + type: TransportProtocolType = pydantic.Field( + ..., + description="The transport protocol used by this receiver.", + examples=["http", "grpc"], + ) + + +class Receiver(pydantic.BaseModel): + """Specification of an active receiver.""" + + protocol: ProtocolType = pydantic.Field(..., description="Receiver protocol name and type.") + url: str = pydantic.Field( + ..., + description="""URL at which the receiver is reachable. If there's an ingress, it would be the external URL. + Otherwise, it would be the service's fqdn or internal IP. + If the protocol type is grpc, the url will not contain a scheme.""", + examples=[ + "http://traefik_address:2331", + "https://traefik_address:2331", + "http://tempo_public_ip:2331", + "https://tempo_public_ip:2331", + "tempo_public_ip:2331", + ], + ) + + +class CosAgentRequirerUnitData(DatabagModel): # noqa: D101 + """Application databag model for the COS-agent requirer.""" + + receivers: List[Receiver] = pydantic.Field( + ..., + description="List of all receivers enabled on the tracing provider.", + ) + + class COSAgentProvider(Object): """Integration endpoint wrapper for the provider side of the cos_agent interface.""" @@ -322,6 +613,7 @@ def __init__( log_slots: Optional[List[str]] = None, dashboard_dirs: Optional[List[str]] = None, refresh_events: Optional[List] = None, + tracing_protocols: Optional[List[str]] = None, *, scrape_configs: Optional[Union[List[dict], Callable]] = None, ): @@ -340,6 +632,7 @@ def __init__( in the form ["snap-name:slot", ...]. dashboard_dirs: Directory where the dashboards are stored. refresh_events: List of events on which to refresh relation data. + tracing_protocols: List of protocols that the charm will be using for sending traces. scrape_configs: List of standard scrape_configs dicts or a callable that returns the list in case the configs need to be generated dynamically. The contents of this list will be merged with the contents of `metrics_endpoints`. @@ -357,6 +650,8 @@ def __init__( self._log_slots = log_slots or [] self._dashboard_dirs = dashboard_dirs self._refresh_events = refresh_events or [self._charm.on.config_changed] + self._tracing_protocols = tracing_protocols + self._is_single_endpoint = charm.meta.relations[relation_name].limit == 1 events = self._charm.on[relation_name] self.framework.observe(events.relation_joined, self._on_refresh) @@ -381,6 +676,7 @@ def _on_refresh(self, event): dashboards=self._dashboards, metrics_scrape_jobs=self._scrape_jobs, log_slots=self._log_slots, + tracing_protocols=self._tracing_protocols, ) relation.data[self._charm.unit][data.KEY] = data.json() except ( @@ -445,6 +741,103 @@ def _dashboards(self) -> List[GrafanaDashboard]: dashboards.append(dashboard) return dashboards + @property + def relations(self) -> List[Relation]: + """The tracing relations associated with this endpoint.""" + return self._charm.model.relations[self._relation_name] + + @property + def _relation(self) -> Optional[Relation]: + """If this wraps a single endpoint, the relation bound to it, if any.""" + if not self._is_single_endpoint: + objname = type(self).__name__ + raise AmbiguousRelationUsageError( + f"This {objname} wraps a {self._relation_name} endpoint that has " + "limit != 1. We can't determine what relation, of the possibly many, you are " + f"referring to. Please pass a relation instance while calling {objname}, " + "or set limit=1 in the charm metadata." + ) + relations = self.relations + return relations[0] if relations else None + + def is_ready(self, relation: Optional[Relation] = None): + """Is this endpoint ready?""" + relation = relation or self._relation + if not relation: + logger.debug(f"no relation on {self._relation_name !r}: tracing not ready") + return False + if relation.data is None: + logger.error(f"relation data is None for {relation}") + return False + if not relation.app: + logger.error(f"{relation} event received but there is no relation.app") + return False + try: + unit = next(iter(relation.units), None) + if not unit: + return False + databag = dict(relation.data[unit]) + CosAgentRequirerUnitData.load(databag) + + except (json.JSONDecodeError, pydantic.ValidationError, DataValidationError): + logger.info(f"failed validating relation data for {relation}") + return False + return True + + def get_all_endpoints( + self, relation: Optional[Relation] = None + ) -> Optional[CosAgentRequirerUnitData]: + """Unmarshalled relation data.""" + relation = relation or self._relation + if not relation or not self.is_ready(relation): + return None + unit = next(iter(relation.units), None) + if not unit: + return None + return CosAgentRequirerUnitData.load(relation.data[unit]) # type: ignore + + def _get_tracing_endpoint( + self, relation: Optional[Relation], protocol: ReceiverProtocol + ) -> Optional[str]: + unit_data = self.get_all_endpoints(relation) + if not unit_data: + return None + receivers: List[Receiver] = [i for i in unit_data.receivers if i.protocol.name == protocol] + if not receivers: + logger.error(f"no receiver found with protocol={protocol!r}") + return None + if len(receivers) > 1: + logger.error( + f"too many receivers with protocol={protocol!r}; using first one. Found: {receivers}" + ) + return None + + receiver = receivers[0] + return receiver.url + + def get_tracing_endpoint( + self, protocol: ReceiverProtocol, relation: Optional[Relation] = None + ) -> Optional[str]: + """Receiver endpoint for the given protocol.""" + endpoint = self._get_tracing_endpoint(relation or self._relation, protocol=protocol) + if not endpoint: + requested_protocols = set() + relations = [relation] if relation else self.relations + for relation in relations: + try: + databag = CosAgentProviderUnitData.load(relation.data[self._charm.unit]) + except DataValidationError: + continue + + if databag.tracing_protocols: + requested_protocols.update(databag.tracing_protocols) + + if protocol not in requested_protocols: + raise ProtocolNotRequestedError(protocol, relation) + + return None + return endpoint + class COSAgentDataChanged(EventBase): """Event emitted by `COSAgentRequirer` when relation data changes.""" @@ -558,6 +951,12 @@ def _on_relation_data_changed(self, event: RelationChangedEvent): if not (provider_data := self._validated_provider_data(raw)): return + # write enabled receivers to cos-agent relation + try: + self.update_tracing_receivers() + except ModelError: + raise + # Copy data from the cos_agent relation to the peer relation, so the leader could # follow up. # Save the originating unit name, so it could be used for topology later on by the leader. @@ -578,6 +977,37 @@ def _on_relation_data_changed(self, event: RelationChangedEvent): # need to emit `on.data_changed`), so we're emitting `on.data_changed` either way. self.on.data_changed.emit() # pyright: ignore + def update_tracing_receivers(self): + """Updates the list of exposed tracing receivers in all relations.""" + try: + for relation in self._charm.model.relations[self._relation_name]: + CosAgentRequirerUnitData( + receivers=[ + Receiver( + url=f"{self._get_tracing_receiver_url(protocol)}", + protocol=ProtocolType( + name=protocol, + type=receiver_protocol_to_transport_protocol[protocol], + ), + ) + for protocol in self.requested_tracing_protocols() + ], + ).dump(relation.data[self._charm.unit]) + + except ModelError as e: + # args are bytes + msg = e.args[0] + if isinstance(msg, bytes): + if msg.startswith( + b"ERROR cannot read relation application settings: permission denied" + ): + logger.error( + f"encountered error {e} while attempting to update_relation_data." + f"The relation must be gone." + ) + return + raise + def _validated_provider_data(self, raw) -> Optional[CosAgentProviderUnitData]: try: return CosAgentProviderUnitData(**json.loads(raw)) @@ -590,6 +1020,55 @@ def trigger_refresh(self, _): # FIXME: Figure out what we should do here self.on.data_changed.emit() # pyright: ignore + def _get_requested_protocols(self, relation: Relation): + # Coherence check + units = relation.units + if len(units) > 1: + # should never happen + raise ValueError( + f"unexpected error: subordinate relation {relation} " + f"should have exactly one unit" + ) + + unit = next(iter(units), None) + + if not unit: + return None + + if not (raw := relation.data[unit].get(CosAgentProviderUnitData.KEY)): + return None + + if not (provider_data := self._validated_provider_data(raw)): + return None + + return provider_data.tracing_protocols + + def requested_tracing_protocols(self): + """All receiver protocols that have been requested by our related apps.""" + requested_protocols = set() + for relation in self._charm.model.relations[self._relation_name]: + try: + protocols = self._get_requested_protocols(relation) + except NotReadyError: + continue + if protocols: + requested_protocols.update(protocols) + return requested_protocols + + def _get_tracing_receiver_url(self, protocol: str): + scheme = "http" + try: + if self._charm.cert.enabled: # type: ignore + scheme = "https" + # not only Grafana Agent can implement cos_agent. If the charm doesn't have the `cert` attribute + # using our cert_handler, it won't have the `enabled` parameter. In this case, we pass and assume http. + except AttributeError: + pass + # the assumption is that a subordinate charm will always be accessible to its principal charm under its fqdn + if receiver_protocol_to_transport_protocol[protocol] == TransportProtocolType.grpc: + return f"{socket.getfqdn()}:{_tracing_receivers_ports[protocol]}" + return f"{scheme}://{socket.getfqdn()}:{_tracing_receivers_ports[protocol]}" + @property def _remote_data(self) -> List[Tuple[CosAgentProviderUnitData, JujuTopology]]: """Return a list of remote data from each of the related units. @@ -819,3 +1298,67 @@ def dashboards(self) -> List[Dict[str, str]]: ) return dashboards + + +def charm_tracing_config( + endpoint_requirer: COSAgentProvider, cert_path: Optional[Union[Path, str]] +) -> Tuple[Optional[str], Optional[str]]: + """Utility function to determine the charm_tracing config you will likely want. + + If no endpoint is provided: + disable charm tracing. + If https endpoint is provided but cert_path is not found on disk: + disable charm tracing. + If https endpoint is provided and cert_path is None: + ERROR + Else: + proceed with charm tracing (with or without tls, as appropriate) + + Usage: + If you are using charm_tracing >= v1.9: + >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm + >>> from lib.charms.tempo_k8s.v0.cos_agent import charm_tracing_config + >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") + >>> class MyCharm(...): + >>> _cert_path = "/path/to/cert/on/charm/container.crt" + >>> def __init__(self, ...): + >>> self.cos_agent = COSAgentProvider(...) + >>> self.my_endpoint, self.cert_path = charm_tracing_config( + ... self.cos_agent, self._cert_path) + + If you are using charm_tracing < v1.9: + >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm + >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config + >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") + >>> class MyCharm(...): + >>> _cert_path = "/path/to/cert/on/charm/container.crt" + >>> def __init__(self, ...): + >>> self.cos_agent = COSAgentProvider(...) + >>> self.my_endpoint, self.cert_path = charm_tracing_config( + ... self.cos_agent, self._cert_path) + >>> @property + >>> def my_endpoint(self): + >>> return self._my_endpoint + >>> @property + >>> def cert_path(self): + >>> return self._cert_path + + """ + if not endpoint_requirer.is_ready(): + return None, None + + endpoint = endpoint_requirer.get_tracing_endpoint("otlp_http") + if not endpoint: + return None, None + + is_https = endpoint.startswith("https://") + + if is_https: + if cert_path is None: + raise TracingError("Cannot send traces to an https endpoint without a certificate.") + if not Path(cert_path).exists(): + # if endpoint is https BUT we don't have a server_cert yet: + # disable charm tracing until we do to prevent tls errors + return None, None + return endpoint, str(cert_path) + return endpoint, None From 0716d6a2dfb6337486a33232a01ddea9db136dce Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Wed, 3 Jul 2024 13:32:04 +0200 Subject: [PATCH 03/10] Enhancments on helpers (more retries, logging, etc) --- tests/integration/ha/test_network_cut.py | 4 +- tests/integration/helpers.py | 127 +++++++++++++---------- 2 files changed, 72 insertions(+), 59 deletions(-) diff --git a/tests/integration/ha/test_network_cut.py b/tests/integration/ha/test_network_cut.py index e2b7f818..a4f08971 100644 --- a/tests/integration/ha/test_network_cut.py +++ b/tests/integration/ha/test_network_cut.py @@ -212,7 +212,7 @@ async def network_cut_application(ops_test: OpsTest, https: bool = False): machine_name = await ha_helpers.get_unit_machine_name(ops_test, unit.name) ip = await get_address(ops_test, unit.name) - logger.info("Cutting unit {unit.name} from network...") + logger.info(f"Cutting unit {unit.name} from network...") ha_helpers.cut_unit_network(machine_name) machines.append(machine_name) @@ -273,7 +273,7 @@ async def network_throttle_application(ops_test: OpsTest, https: bool = False): machine_name = await ha_helpers.get_unit_machine_name(ops_test, unit.name) ip = await get_address(ops_test, unit.name) - logger.info("Cutting unit {unit.name} from network...") + logger.info(f"Cutting unit {unit.name} from network...") ha_helpers.network_throttle(machine_name) machines.append(machine_name) diff --git a/tests/integration/helpers.py b/tests/integration/helpers.py index bad35c3e..435e14e0 100644 --- a/tests/integration/helpers.py +++ b/tests/integration/helpers.py @@ -7,7 +7,7 @@ import socket import subprocess from pathlib import Path -from subprocess import PIPE, check_output +from subprocess import PIPE, CalledProcessError, check_output from typing import Dict, List, Optional import requests @@ -15,8 +15,16 @@ from juju.relation import Relation from juju.unit import Unit from pytest_operator.plugin import OpsTest -from requests.exceptions import ConnectionError -from tenacity import Retrying, retry, stop_after_attempt, wait_fixed +from requests.exceptions import ConnectionError, SSLError +from tenacity import ( + Retrying, + before_sleep_log, + retry, + retry_if_exception_type, + retry_if_result, + stop_after_attempt, + wait_fixed, +) from core.workload import ODPaths @@ -174,13 +182,7 @@ async def access_all_prometheus_exporters(ops_test: OpsTest) -> bool: retry=lambda x: x is False, ) def dashboard_unavailable(host: str, https: bool = False) -> bool: - try: - # Normal IP address - socket.inet_aton(host) - except OSError: - socket.inet_pton(socket.AF_INET6, host) - host = f"[{host}]" - + """A single OSD instance is impossible to contact.""" protocol = "http" if not https else "https" url = f"{protocol}://{host}:5601/auth/login" arguments = {"url": url} @@ -194,17 +196,29 @@ def dashboard_unavailable(host: str, https: bool = False) -> bool: return response.status_code == 503 +@retry( + stop=stop_after_attempt(3), + wait=wait_fixed(15), + retry_error_callback=lambda _: False, + retry=retry_if_result(lambda x: x is False), +) def all_dashboards_unavailable(ops_test: OpsTest, https: bool = False) -> bool: - - if https: - unit = ops_test.model.applications[APP_NAME].units[0].name - if not get_dashboard_ca_cert(ops_test.model.name, unit): - logger.error(f"Couldn't retrieve host certificate for unit {unit}") - return False + """None of the OSD units are possible to contact.""" unavail = True for unit in ops_test.model.applications[APP_NAME].units: + + if https: + if not get_dashboard_ca_cert(ops_test.model.name, unit): + logger.info(f"Couldn't retrieve host certificate for unit {unit}") + continue + host = get_private_address(ops_test.model.name, unit.name) + + # We should retry until a host could be retrieved + if not host: + continue + unavail = unavail and dashboard_unavailable(host, https) if not unavail: logger.error("Host {host} still available") @@ -249,37 +263,19 @@ def access_dashboard( return response.status_code == 200 -def access_dashboard_https(host: str, password: str): - """This function should be rather replaced by a 'requests' call, if we can figure out the source of discrepancy.""" - try: - curl_cmd = check_output( - [ - "bash", - "-c", - 'curl -XPOST -H "Content-Type: application/json" -H "osd-xsrf: true" -H "Accept: application/json" ' - + f"https://{host}:5601/auth/login -d " - + "'" - + '{"username":"kibanaserver","password": "' - + f"{password}" - + '"' - + "}' --cacert ca.pem --verbose", - ], - stderr=subprocess.STDOUT, - text=True, - ) - except subprocess.CalledProcessError as err: - logger.error(f"{err}, {err.output}") - return False - return "roles" in curl_cmd - - +@retry( + stop=stop_after_attempt(3), + wait=wait_fixed(15), + retry_error_callback=lambda _: False, + retry=retry_if_result(lambda x: x is False), +) async def access_all_dashboards( ops_test: OpsTest, relation_id: int | None = None, https: bool = False, skip: list[str] = [] ): """Check if all dashboard instances are accessible.""" if not ops_test.model.applications[APP_NAME].units: - logger.debug(f"No units for application {APP_NAME}") + logger.error(f"No units for application {APP_NAME}") return False if not relation_id: @@ -295,20 +291,19 @@ async def access_all_dashboards( if https: unit = ops_test.model.applications[APP_NAME].units[0].name if unit not in skip and not get_dashboard_ca_cert(ops_test.model.name, unit): - logger.debug(f"Couldn't retrieve host certificate for unit {unit}") + logger.error(f"Couldn't retrieve host certificate for unit {unit}") return False - function = access_dashboard if not https else access_dashboard_https result = True for unit in ops_test.model.applications[APP_NAME].units: if unit.name in skip: continue host = get_private_address(ops_test.model.name, unit.name) if not host: - logger.debug(f"No hostname found for {unit.name}, can't check connection.") + logger.error(f"No hostname found for {unit.name}, can't check connection.") return False - result &= function(host=host, password=dashboard_password) + result &= access_dashboard(host=host, password=dashboard_password, ssl=https) if result: logger.info(f"Host {unit.name}, {host} passed access check") else: @@ -318,9 +313,10 @@ async def access_all_dashboards( @retry( stop=stop_after_attempt(5), - wait=wait_fixed(15), + wait=wait_fixed(30), retry_error_callback=lambda _: False, - retry=lambda x: x is False, + retry=(retry_if_result(lambda x: x is False) | retry_if_exception_type(SSLError)), + before_sleep=before_sleep_log(logger, logging.DEBUG), ) def get_dashboard_ca_cert(model_full_name: str, unit: str): try: @@ -332,6 +328,7 @@ def get_dashboard_ca_cert(model_full_name: str, unit: str): f"ubuntu@{unit}:" "/var/snap/opensearch-dashboards/current/etc/opensearch-dashboards/certificates/ca.pem ./", ], + timeout=30, ) except subprocess.CalledProcessError as err: logger.error(f"{err}") @@ -375,16 +372,32 @@ async def get_address(ops_test: OpsTest, unit_name: str) -> str: def get_private_address(model_full_name: str, unit: str): - private_ip = check_output( - [ - "bash", - "-c", - f"JUJU_MODEL={model_full_name} juju ssh {unit} ip a | " - "grep global | grep 'inet 10.*/24' | cut -d' ' -f6 | cut -d'/' -f1", - ], - text=True, - ) - return private_ip.rstrip() + try: + private_ip = check_output( + [ + "bash", + "-c", + f"JUJU_MODEL={model_full_name} juju ssh {unit} ip a | " + "grep global | grep 'inet 10.*/24' | cut -d' ' -f6 | cut -d'/' -f1", + ], + text=True, + ) + except CalledProcessError as err: + logger.info(f"Couldn't retrieve IP address: {str(err)}") + return "" + + if private_ip.rstrip(): + return private_ip.rstrip() + + try: + info = check_output( + ["bash", "-c", f"JUJU_MODEL={model_full_name} juju ssh {unit} ip a "], + text=True, + ) + logger.info(f"Couldn't retrieve IP, info is: {info}") + except CalledProcessError as err: + logger.info(f"Couldn't retrieve IP address: {str(err)}") + return "" def _get_show_unit_json(model_full_name: str, unit: str) -> Dict: From 76e0cd2a16a6db1e9a02a584bd3b5801946d6ca4 Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Wed, 31 Jul 2024 23:53:02 +0200 Subject: [PATCH 04/10] Changing the TLS workflow --- src/events/tls.py | 39 ++++++++++++++++++------ src/managers/tls.py | 5 +++ tests/integration/ha/test_network_cut.py | 7 ++++- 3 files changed, 40 insertions(+), 11 deletions(-) diff --git a/src/events/tls.py b/src/events/tls.py index 64949e07..78e9228c 100644 --- a/src/events/tls.py +++ b/src/events/tls.py @@ -52,25 +52,46 @@ def __init__(self, charm): self.framework.observe(getattr(self.charm.on, "config_changed"), self._on_config_changed) - def _on_certs_relation_joined(self, event: RelationJoinedEvent) -> None: - """Handler for `certificates_relation_joined` event.""" - # generate unit private key if not already created by action + def _request_certificates(self): + """Request brand-new certificates.""" if not self.charm.state.unit_server.private_key: self.charm.state.unit_server.update( {"private-key": generate_private_key().decode("utf-8")} ) + logger.debug( + "Requesting certificate for: " + f"host {self.charm.state.unit_server.host}," + f"with IP {self.charm.state.unit_server.sans.get('sans_ip', [])}," + f"DNS {self.charm.state.unit_server.sans.get('sans_dns', [])}" + ) + csr = generate_csr( private_key=self.charm.state.unit_server.private_key.encode("utf-8"), - subject=self.charm.state.unit_server.host, + subject=self.charm.state.unit_server.private_ip, sans_ip=self.charm.state.unit_server.sans.get("sans_ip", []), sans_dns=self.charm.state.unit_server.sans.get("sans_dns", []), ) self.charm.state.unit_server.update({"csr": csr.decode("utf-8").strip()}) - self.certificates.request_certificate_creation(certificate_signing_request=csr) + def _remove_certificates(self): + """Cleanup any existing certificates.""" + if self.charm.state.cluster.tls: + self.certificates.request_certificate_revocation( + self.charm.state.unit_server.csr.encode("utf-8") + ) + self.charm.state.unit_server.update({"csr": "", "certificate": "", "ca-cert": ""}) + + # remove all existing keystores from the unit so we don't preserve certs + self.charm.tls_manager.remove_cert_files() + + def _on_certs_relation_joined(self, event: RelationJoinedEvent) -> None: + """Handler for `certificates_relation_joined` event.""" + # generate unit private key if not already created by action + self._request_certificates() + def _on_certificate_available(self, event: CertificateAvailableEvent) -> None: """Handler for `certificates_available` event after provider updates signed certs.""" # avoid setting tls files and restarting @@ -109,14 +130,12 @@ def _on_certificate_expiring(self, _: EventBase) -> None: def _on_config_changed(self, event: EventBase): """If system configuration (such as IP) changes, certs have to be re-issued.""" if self.charm.state.unit_server.tls and not self.charm.tls_manager.certificate_valid(): - self._on_certificate_expiring(event) + self._remove_certificates() + self._request_certificates() def _on_certs_relation_broken(self, _) -> None: """Handler for `certificates_relation_broken` event.""" - self.charm.state.unit_server.update({"csr": "", "certificate": "", "ca-cert": ""}) - - # remove all existing keystores from the unit so we don't preserve certs - self.charm.tls_manager.remove_cert_files() + self._remove_certificates() def _set_tls_private_key(self, event: ActionEvent) -> None: """Handler for `set-tls-privat-key` event when user manually specifies private-keys for a unit.""" diff --git a/src/managers/tls.py b/src/managers/tls.py index 0e904a2a..ca56b210 100644 --- a/src/managers/tls.py +++ b/src/managers/tls.py @@ -81,4 +81,9 @@ def certificate_valid(self) -> bool: except CalledProcessError as error: logging.error(f"Checking certificate failed: {error.output}") return False + + logger.debug(f"Response of openssl cert decode: {response}") + logger.debug( + f"Currently recognized IP using 'gethostbyname': {self.state.unit_server.private_ip}" + ) return self.state.unit_server.private_ip in response diff --git a/tests/integration/ha/test_network_cut.py b/tests/integration/ha/test_network_cut.py index a4f08971..9563bdd2 100644 --- a/tests/integration/ha/test_network_cut.py +++ b/tests/integration/ha/test_network_cut.py @@ -11,7 +11,12 @@ import yaml from pytest_operator.plugin import OpsTest -from ..helpers import access_all_dashboards, get_address, get_leader_name +from ..helpers import ( + access_all_dashboards, + all_dashboards_unavailable, + get_address, + get_leader_name, +) logger = logging.getLogger(__name__) From cc330244b5298d9c0d0e158b9140e1dbfdc21c00 Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Fri, 9 Aug 2024 14:01:28 +0200 Subject: [PATCH 05/10] Unittest adjustments following the fix --- tests/unit/test_tls.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/unit/test_tls.py b/tests/unit/test_tls.py index 561ecd80..e05854ea 100644 --- a/tests/unit/test_tls.py +++ b/tests/unit/test_tls.py @@ -11,6 +11,7 @@ from charm import OpensearchDasboardsCharm from literals import CERTS_REL_NAME, CHARM_KEY, PEER +from src.events.tls import TLSEvents CONFIG = str(yaml.safe_load(Path("./config.yaml").read_text())) ACTIONS = str(yaml.safe_load(Path("./actions.yaml").read_text())) @@ -115,6 +116,7 @@ def test_certificates_broken(harness): remove_cert_files=DEFAULT, ), patch("workload.ODWorkload.configure") as workload_config, + patch("events.tls.TLSCertificatesRequiresV3.request_certificate_revocation"), ): harness.remove_relation(certs_rel_id) From 9959e20eced27b7532587f0d7810ed52ec337849 Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Fri, 12 Jul 2024 11:03:55 +0200 Subject: [PATCH 06/10] Re-enabling HA network cut tests --- tests/integration/ha/test_network_cut.py | 30 ++++++++++++++---------- 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/tests/integration/ha/test_network_cut.py b/tests/integration/ha/test_network_cut.py index 9563bdd2..d15f1136 100644 --- a/tests/integration/ha/test_network_cut.py +++ b/tests/integration/ha/test_network_cut.py @@ -20,6 +20,8 @@ logger = logging.getLogger(__name__) +logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S") + CLIENT_TIMEOUT = 10 RESTART_DELAY = 60 @@ -37,6 +39,7 @@ """, } TLS_CERT_APP_NAME = "self-signed-certificates" +ALL_APPS = [APP_NAME, TLS_CERT_APP_NAME, OPENSEARCH_APP_NAME] APP_AND_TLS = [APP_NAME, TLS_CERT_APP_NAME] PEER = "dashboard_peers" SERVER_PORT = 5601 @@ -49,7 +52,6 @@ @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.group(1) @pytest.mark.skip_if_deployed @pytest.mark.abort_on_fail @@ -151,6 +153,8 @@ async def network_cut_leader(ops_test: OpsTest, https: bool = False): assert new_ip != old_ip logger.info(f"Old IP {old_ip} has changed to {new_ip}...") + await ops_test.model.wait_for_idle(apps=ALL_APPS, wait_for_active=True, timeout=LONG_TIMEOUT) + logger.info("Checking Dashboard access...") assert await access_all_dashboards(ops_test, https=https) @@ -203,6 +207,8 @@ async def network_throttle_leader(ops_test: OpsTest, https: bool = False): current_ip = await get_address(ops_test, old_leader_name) assert old_ip == current_ip + await ops_test.model.wait_for_idle(apps=ALL_APPS, wait_for_active=True, timeout=LONG_TIMEOUT) + logger.info("Checking Dashboard access...") assert await access_all_dashboards(ops_test, https=https) @@ -244,7 +250,7 @@ async def network_cut_application(ops_test: OpsTest, https: bool = False): ) logger.info("Checking lack of Dashboard access...") - assert not (await access_all_dashboards(ops_test, https=https)) + assert all_dashboards_unavailable(ops_test, https=https) logger.info("Restoring network...") for machine_name in machines: @@ -264,6 +270,13 @@ async def network_cut_application(ops_test: OpsTest, https: bool = False): wait_period=LONG_WAIT, ) + for unit, old_ip in unit_ip_map.items(): + new_ip = await get_address(ops_test, unit) + assert new_ip != old_ip + logger.info(f"Old IP {old_ip} has changed to {new_ip}...") + + await ops_test.model.wait_for_idle(apps=ALL_APPS, wait_for_active=True, timeout=LONG_TIMEOUT) + logger.info("Checking Dashboard access...") assert await access_all_dashboards(ops_test, https=https) @@ -305,7 +318,7 @@ async def network_throttle_application(ops_test: OpsTest, https: bool = False): ) logger.info("Checking lack of Dashboard access...") - assert not (await access_all_dashboards(ops_test, https=https)) + assert all_dashboards_unavailable(ops_test, https=https) logger.info("Restoring network...") for machine_name in machines: @@ -328,6 +341,8 @@ async def network_throttle_application(ops_test: OpsTest, https: bool = False): for unit in unit_ip_map ) + await ops_test.model.wait_for_idle(apps=ALL_APPS, wait_for_active=True, timeout=LONG_TIMEOUT) + logger.info("Checking Dashboard access...") assert await access_all_dashboards(ops_test, https=https) @@ -337,7 +352,6 @@ async def network_throttle_application(ops_test: OpsTest, https: bool = False): ############################################################################## -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -345,7 +359,6 @@ async def test_network_cut_ip_change_leader_http(ops_test: OpsTest, request): await network_cut_leader(ops_test) -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -353,7 +366,6 @@ async def test_network_cut_no_ip_change_leader_http(ops_test: OpsTest, request): await network_throttle_leader(ops_test) -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -361,7 +373,6 @@ async def test_network_cut_ip_change_application_http(ops_test: OpsTest, request await network_cut_application(ops_test) -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -372,7 +383,6 @@ async def test_network_no_ip_change_application_http(ops_test: OpsTest, request) ############################################################################## -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -391,7 +401,6 @@ async def test_set_tls(ops_test: OpsTest, request): ############################################################################## -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -399,7 +408,6 @@ async def test_network_cut_ip_change_leader_https(ops_test: OpsTest, request): await network_cut_leader(ops_test, https=True) -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -407,7 +415,6 @@ async def test_network_cut_no_ip_change_leader_https(ops_test: OpsTest, request) await network_throttle_leader(ops_test, https=True) -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail @@ -415,7 +422,6 @@ async def test_network_cut_ip_change_application_https(ops_test: OpsTest, reques await network_cut_application(ops_test, https=True) -@pytest.mark.skip(reason="https://warthogs.atlassian.net/browse/DPE-4903") @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) @pytest.mark.abort_on_fail From 6b8f7b34466bb9f5d6b1553f65599e759799d8b7 Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Fri, 23 Aug 2024 18:58:58 +0200 Subject: [PATCH 07/10] TLS certs NOT to be removed on relation-broken --- src/events/tls.py | 8 +++++++- tests/integration/test_charm.py | 24 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) diff --git a/src/events/tls.py b/src/events/tls.py index 78e9228c..8780f08f 100644 --- a/src/events/tls.py +++ b/src/events/tls.py @@ -59,6 +59,10 @@ def _request_certificates(self): {"private-key": generate_private_key().decode("utf-8")} ) + if self.charm.state.unit_server.tls and self.charm.tls_manager.certificate_valid(): + return + # OR SHOULD WE REMOVE (incl. REVOKE) AND DELETE OLD FILES (i.e. run self._remove_certificates()) + logger.debug( "Requesting certificate for: " f"host {self.charm.state.unit_server.host}," @@ -135,7 +139,9 @@ def _on_config_changed(self, event: EventBase): def _on_certs_relation_broken(self, _) -> None: """Handler for `certificates_relation_broken` event.""" - self._remove_certificates() + # In case we have valid certificats, we keep them for smooth service function + if self.charm.state.unit_server.tls and not self.charm.tls_manager.certificate_valid(): + self._remove_certificates() def _set_tls_private_key(self, event: ActionEvent) -> None: """Handler for `set-tls-privat-key` event when user manually specifies private-keys for a unit.""" diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index a4a35ad3..aac086f6 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -23,6 +23,7 @@ count_lines_with, destroy_cluster, get_address, + get_file_contents, get_relations, get_unit_relation_data, ) @@ -127,6 +128,29 @@ async def test_dashboard_access_https(ops_test: OpsTest): assert await access_all_dashboards(ops_test, opensearch_relation.id, https=True) assert await access_all_prometheus_exporters(ops_test) + # Breaking the relation shouldn't impact service availabilty + # A new certificate is requested when the relation is joined again + await ops_test.juju("remove-relation", "opensearch", "opensearch-dashboards") + await ops_test.model.wait_for_idle( + apps=[APP_NAME, TLS_CERTIFICATES_APP_NAME], status="active", timeout=1000 + ) + assert await access_all_dashboards(ops_test, opensearch_relation.id, https=True) + host_cert = await get_file_contents( + ops_test, "/var/snap/opensearch-dashboards/current/etc/opensearch-dashboards/certificates/server.pem" + ) + + # Restore relation for further tests + await ops_test.model.integrate(APP_NAME, TLS_CERTIFICATES_APP_NAME) + await ops_test.model.wait_for_idle( + apps=[APP_NAME, TLS_CERTIFICATES_APP_NAME], status="active", timeout=1000 + ) + same_host_cert = await get_file_contents( + ops_test, "/var/snap/opensearch-dashboards/current/etc/opensearch-dashboards/certificates/server.pem" + ) + assert host_cert == same_host_cert + # OR THIS IS WHAT WE WANT + # assert host_cert != same_host_cert + @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) @pytest.mark.group(1) From 42cc8063368e583138621ea715aaac676c338d2b Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Sat, 24 Aug 2024 02:42:18 +0200 Subject: [PATCH 08/10] fixup! TLS certs NOT to be removed on relation-broken --- src/events/tls.py | 16 ++++++++-------- tests/integration/test_charm.py | 26 ++++++++++++++++---------- tests/unit/test_tls.py | 28 ++++++++++++++++++---------- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/src/events/tls.py b/src/events/tls.py index 8780f08f..c786eea1 100644 --- a/src/events/tls.py +++ b/src/events/tls.py @@ -14,7 +14,7 @@ generate_csr, generate_private_key, ) -from ops.charm import ActionEvent, RelationJoinedEvent +from ops.charm import ActionEvent, RelationCreatedEvent from ops.framework import EventBase, Object from literals import CERTS_REL_NAME @@ -34,7 +34,8 @@ def __init__(self, charm): self.certificates = TLSCertificatesRequiresV3(self.charm, CERTS_REL_NAME) self.framework.observe( - getattr(self.charm.on, "certificates_relation_joined"), self._on_certs_relation_joined + getattr(self.charm.on, "certificates_relation_created"), + self._on_certs_relation_created, ) self.framework.observe( getattr(self.certificates.on, "certificate_available"), self._on_certificate_available @@ -59,9 +60,8 @@ def _request_certificates(self): {"private-key": generate_private_key().decode("utf-8")} ) - if self.charm.state.unit_server.tls and self.charm.tls_manager.certificate_valid(): - return - # OR SHOULD WE REMOVE (incl. REVOKE) AND DELETE OLD FILES (i.e. run self._remove_certificates()) + if self.charm.state.unit_server.tls: + self._remove_certificates() logger.debug( "Requesting certificate for: " @@ -91,8 +91,8 @@ def _remove_certificates(self): # remove all existing keystores from the unit so we don't preserve certs self.charm.tls_manager.remove_cert_files() - def _on_certs_relation_joined(self, event: RelationJoinedEvent) -> None: - """Handler for `certificates_relation_joined` event.""" + def _on_certs_relation_created(self, event: RelationCreatedEvent) -> None: + """Handler for `certificates_relation_created` event.""" # generate unit private key if not already created by action self._request_certificates() @@ -139,7 +139,7 @@ def _on_config_changed(self, event: EventBase): def _on_certs_relation_broken(self, _) -> None: """Handler for `certificates_relation_broken` event.""" - # In case we have valid certificats, we keep them for smooth service function + # In case we have valid certificates, we keep them for smooth service function if self.charm.state.unit_server.tls and not self.charm.tls_manager.certificate_valid(): self._remove_certificates() diff --git a/tests/integration/test_charm.py b/tests/integration/test_charm.py index aac086f6..d780447d 100644 --- a/tests/integration/test_charm.py +++ b/tests/integration/test_charm.py @@ -9,6 +9,7 @@ from pathlib import Path import pytest +import requests import yaml from pytest_operator.plugin import OpsTest @@ -128,28 +129,33 @@ async def test_dashboard_access_https(ops_test: OpsTest): assert await access_all_dashboards(ops_test, opensearch_relation.id, https=True) assert await access_all_prometheus_exporters(ops_test) - # Breaking the relation shouldn't impact service availabilty + # Breaking the relation shouldn't impact service availability # A new certificate is requested when the relation is joined again - await ops_test.juju("remove-relation", "opensearch", "opensearch-dashboards") + await ops_test.juju("remove-relation", APP_NAME, TLS_CERTIFICATES_APP_NAME) await ops_test.model.wait_for_idle( apps=[APP_NAME, TLS_CERTIFICATES_APP_NAME], status="active", timeout=1000 ) + + # Event thought the TLS connection is not there, we do NOT switch back to HTTP + with pytest.raises(requests.exceptions.ConnectionError): + await access_all_dashboards(ops_test, opensearch_relation.id) + + # Instead, HTTPS works uninterrupted assert await access_all_dashboards(ops_test, opensearch_relation.id, https=True) - host_cert = await get_file_contents( - ops_test, "/var/snap/opensearch-dashboards/current/etc/opensearch-dashboards/certificates/server.pem" + + server_cert = ( + "/var/snap/opensearch-dashboards/current/etc/opensearch-dashboards/certificates/server.pem" ) + unit = ops_test.model.applications[APP_NAME].units[0] + host_cert = get_file_contents(ops_test, unit, server_cert) # Restore relation for further tests await ops_test.model.integrate(APP_NAME, TLS_CERTIFICATES_APP_NAME) await ops_test.model.wait_for_idle( apps=[APP_NAME, TLS_CERTIFICATES_APP_NAME], status="active", timeout=1000 ) - same_host_cert = await get_file_contents( - ops_test, "/var/snap/opensearch-dashboards/current/etc/opensearch-dashboards/certificates/server.pem" - ) - assert host_cert == same_host_cert - # OR THIS IS WHAT WE WANT - # assert host_cert != same_host_cert + new_host_cert = get_file_contents(ops_test, unit, server_cert) + assert host_cert != new_host_cert @pytest.mark.runner(["self-hosted", "linux", "X64", "jammy", "large"]) diff --git a/tests/unit/test_tls.py b/tests/unit/test_tls.py index e05854ea..9ac6a659 100644 --- a/tests/unit/test_tls.py +++ b/tests/unit/test_tls.py @@ -66,13 +66,15 @@ def test_certificates_available_fails_wrong_csr(harness): def test_certificates_available_succeeds(harness): - harness.add_relation(CERTS_REL_NAME, "tls-certificates-operator") + with harness.hooks_disabled(): + harness.add_relation(CERTS_REL_NAME, "tls-certificates-operator") # implicitly tests restart call harness.add_relation(harness.charm.restart.name, "{CHARM_KEY}/0") - harness.update_relation_data( - harness.charm.state.peer_relation.id, f"{CHARM_KEY}/0", {"csr": "not-missing"} + harness.charm.unit.add_secret( + {"csr": "not-missing"}, + label=f"{PEER}.opensearch-dashboards.unit", ) # implicitly tests these method calls @@ -100,7 +102,12 @@ def test_certificates_broken(harness): certs_rel_id = harness.add_relation(CERTS_REL_NAME, "tls-certificates-operator") harness.charm.unit.add_secret( - {"csr": "not-missing", "certificate": "cert", "ca-cert": "exists"}, + { + "csr": "not-missing", + "certificate": "cert", + "ca-cert": "exists", + "private-key": "key", + }, label=f"{PEER}.opensearch-dashboards.unit", ) harness.set_leader(True) @@ -112,8 +119,7 @@ def test_certificates_broken(harness): # implicitly tests these method calls with ( patch.multiple( - "managers.tls.TLSManager", - remove_cert_files=DEFAULT, + "managers.tls.TLSManager", remove_cert_files=DEFAULT, certificate_valid=lambda _: True ), patch("workload.ODWorkload.configure") as workload_config, patch("events.tls.TLSCertificatesRequiresV3.request_certificate_revocation"), @@ -121,11 +127,13 @@ def test_certificates_broken(harness): harness.remove_relation(certs_rel_id) - assert not harness.charm.state.unit_server.certificate - assert not harness.charm.state.unit_server.ca - assert not harness.charm.state.unit_server.csr - assert not harness.charm.state.unit_server.tls + # While the TLS relation is gone assert not harness.charm.state.cluster.tls + # ...we've still preserved certs locally + assert harness.charm.state.unit_server.certificate + assert harness.charm.state.unit_server.ca + assert harness.charm.state.unit_server.csr + assert harness.charm.state.unit_server.tls assert workload_config.assert_called_once From 095a8d08caaa1647a20ff634fbfe86095273053d Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Sat, 24 Aug 2024 02:49:16 +0200 Subject: [PATCH 09/10] Updating more libs --- lib/charms/grafana_agent/v0/cos_agent.py | 8 ++++++-- lib/charms/operator_libs_linux/v0/apt.py | 4 ++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/lib/charms/grafana_agent/v0/cos_agent.py b/lib/charms/grafana_agent/v0/cos_agent.py index 582b70c0..cc4da25a 100644 --- a/lib/charms/grafana_agent/v0/cos_agent.py +++ b/lib/charms/grafana_agent/v0/cos_agent.py @@ -22,7 +22,7 @@ Using the `COSAgentProvider` object only requires instantiating it, typically in the `__init__` method of your charm (the one which sends telemetry). -The constructor of `COSAgentProvider` has only one required and nine optional parameters: +The constructor of `COSAgentProvider` has only one required and ten optional parameters: ```python def __init__( @@ -36,6 +36,7 @@ def __init__( log_slots: Optional[List[str]] = None, dashboard_dirs: Optional[List[str]] = None, refresh_events: Optional[List] = None, + tracing_protocols: Optional[List[str]] = None, scrape_configs: Optional[Union[List[Dict], Callable]] = None, ): ``` @@ -65,6 +66,8 @@ def __init__( - `refresh_events`: List of events on which to refresh relation data. +- `tracing_protocols`: List of requested tracing protocols that the charm requires to send traces. + - `scrape_configs`: List of standard scrape_configs dicts or a callable that returns the list in case the configs need to be generated dynamically. The contents of this list will be merged with the configs from `metrics_endpoints`. @@ -108,6 +111,7 @@ def __init__(self, *args): log_slots=["my-app:slot"], dashboard_dirs=["./src/dashboards_1", "./src/dashboards_2"], refresh_events=["update-status", "upgrade-charm"], + tracing_protocols=["otlp_http", "otlp_grpc"], scrape_configs=[ { "job_name": "custom_job", @@ -249,7 +253,7 @@ class _MetricsEndpointDict(TypedDict): LIBID = "dc15fa84cef84ce58155fb84f6c6213a" LIBAPI = 0 -LIBPATCH = 10 +LIBPATCH = 11 PYDEPS = ["cosl", "pydantic"] diff --git a/lib/charms/operator_libs_linux/v0/apt.py b/lib/charms/operator_libs_linux/v0/apt.py index 1400df7f..b8913c0e 100644 --- a/lib/charms/operator_libs_linux/v0/apt.py +++ b/lib/charms/operator_libs_linux/v0/apt.py @@ -122,7 +122,7 @@ # Increment this PATCH version before using `charmcraft publish-lib` or reset # to 0 if you are raising the major API version -LIBPATCH = 13 +LIBPATCH = 14 VALID_SOURCE_TYPES = ("deb", "deb-src") @@ -837,7 +837,7 @@ def remove_package( def update() -> None: """Update the apt cache via `apt-get update`.""" - subprocess.run(["apt-get", "update"], capture_output=True, check=True) + subprocess.run(["apt-get", "update", "--error-on=any"], capture_output=True, check=True) def import_key(key: str) -> str: From fd515fe0e3f837100390f1a7b977c3a12c90da26 Mon Sep 17 00:00:00 2001 From: Judit Novak Date: Tue, 27 Aug 2024 11:40:20 +0200 Subject: [PATCH 10/10] Change request on PR review --- tests/integration/ha/test_network_cut.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/integration/ha/test_network_cut.py b/tests/integration/ha/test_network_cut.py index d15f1136..3fa00fe8 100644 --- a/tests/integration/ha/test_network_cut.py +++ b/tests/integration/ha/test_network_cut.py @@ -20,8 +20,6 @@ logger = logging.getLogger(__name__) -logging.basicConfig(format="%(asctime)s %(levelname)-8s %(message)s", datefmt="%Y-%m-%d %H:%M:%S") - CLIENT_TIMEOUT = 10 RESTART_DELAY = 60