[DPE-4575][DPE-4886][DPE-4983] Add voting exclusions management #367

Closed
phvalguima wants to merge 44 commits

Commits (44)
52fcb0d  Rebase and add support for voting exclusions (phvalguima, Jul 8, 2024)
8802410  Add integration test for 3->1->3 scaling; plus fixes (phvalguima, Jul 8, 2024)
d360fe9  Add fix for scaling test (phvalguima, Jul 8, 2024)
05c828b  Fix issue with continuous_writes (phvalguima, Jul 8, 2024)
186dadd  Add rudimentary check for shard relocation (phvalguima, Jul 8, 2024)
784dd0f  Move to self.unit_name (phvalguima, Jul 8, 2024)
b4b9fde  Add safeguards at stopping for the app removal case (phvalguima, Jul 9, 2024)
b9becc8  Move the _get_nodes up in _stop_opensearch (phvalguima, Jul 9, 2024)
f22f75b  Focus remove-app test to only remove opensearch (phvalguima, Jul 9, 2024)
ce0c3e6  Add explain call to update status (phvalguima, Jul 10, 2024)
3e59a79  Add explain call to update status (phvalguima, Jul 10, 2024)
b3a9c11  Add explain call to update status (phvalguima, Jul 10, 2024)
1bc6555  Add cluster explain API (phvalguima, Jul 10, 2024)
ee49e33  Fix explain on node_lock (phvalguima, Jul 10, 2024)
5a9e006  Node lock testing -- move logging (phvalguima, Jul 10, 2024)
e633312  Add try/catch for the allocation (phvalguima, Jul 10, 2024)
fc148a5  Extend voting exclusion settling to scenarios: covers outage cases; f… (phvalguima, Jul 12, 2024)
68ca4f0  Add unit tests for exclusions logic (phvalguima, Jul 15, 2024)
8fa6f29  Add fixes to unit tests and removed commented code (phvalguima, Jul 15, 2024)
48c511e  Add fixes for integration tests and unit tests (phvalguima, Jul 15, 2024)
3b16370  Update helper_cluster.py (phvalguima, Jul 17, 2024)
4004ba5  Add retry to the lock (phvalguima, Jul 18, 2024)
6c2137c  Remove fix for dashboards, diff PR; add comments on update_status (phvalguima, Jul 18, 2024)
c45e7e9  Update helper_cluster.py (phvalguima, Jul 22, 2024)
4591f2b  Update helper_cluster.py (phvalguima, Jul 22, 2024)
6ff1ad9  Updates following review (phvalguima, Aug 6, 2024)
9e0e20d  Merge remote-tracking branch 'origin/main' into with-tests-DPE-4057-v… (phvalguima, Aug 6, 2024)
c3b71e7  fix elected_manager to return the Node object instead of the ID strin… (phvalguima, Aug 6, 2024)
3ebadee  Fixes for exclusions, locking and health (phvalguima, Aug 7, 2024)
f0e0fec  Extend to a larger runner (phvalguima, Aug 7, 2024)
bbca393  WIP: updating integration test_ha for 2 nodes (phvalguima, Aug 8, 2024)
adeef52  WIP(2): updating integration test_ha for 2 nodes (phvalguima, Aug 8, 2024)
3d74325  Merge remote-tracking branch 'origin' into with-tests-DPE-4057-voting… (phvalguima, Aug 8, 2024)
e14ae76  Fix scale up and down test (phvalguima, Aug 9, 2024)
b909c16  Update from main branch (phvalguima, Aug 12, 2024)
af9fa25  move from xlarge to large (phvalguima, Aug 12, 2024)
7c1595a  Add more info in the test_restart_db_process_node_with_elected_cm (phvalguima, Aug 12, 2024)
3851b2d  add more logging (phvalguima, Aug 12, 2024)
631b6a8  _on_update_status: remove the is_node_up() from the start and move it… (phvalguima, Aug 13, 2024)
8972bf3  Merge remote-tracking branch 'origin' into with-tests-DPE-4057-voting… (phvalguima, Aug 13, 2024)
ad524e6  Fix node exclusions to also check for service running, plus manually … (phvalguima, Aug 13, 2024)
77b1c67  remove pdb mentions to test_ha (phvalguima, Aug 13, 2024)
d84e1d5  Extend is_active to consider other cases such as an stopped process (phvalguima, Aug 14, 2024)
bd97709  Add missing rstrip() (phvalguima, Aug 14, 2024)
52 changes: 51 additions & 1 deletion lib/charms/opensearch/v0/helper_cluster.py
@@ -139,6 +139,54 @@ def nodes_by_role(nodes: List[Node]) -> Dict[str, List[Node]]:

return result

@staticmethod
def elected_manager(
opensearch: OpenSearchDistribution,
use_localhost: bool,
hosts: Optional[List[str]] = None,
) -> Optional[Node]:
"""Return the elected cluster manager node, or None if it cannot be determined."""
host: Optional[str] = None # defaults to current unit ip
alt_hosts: Optional[List[str]] = hosts
if not use_localhost and hosts:
host = hosts[0]
if len(hosts) >= 2:
alt_hosts = hosts[1:]

if use_localhost or host:
manager_id = opensearch.request(
"GET",
"/_cluster/state/cluster_manager_node",
host=host,
alt_hosts=alt_hosts,
retries=3,
)
if "cluster_manager_node" not in manager_id:
return None

response = opensearch.request(
"GET",
f"/_nodes/{manager_id['cluster_manager_node']}",
host=host,
alt_hosts=alt_hosts,
retries=3,
)
if "nodes" in response:
for id, obj in response["nodes"].items():
if id != manager_id["cluster_manager_node"]:
return None
node = Node(
name=obj["name"],
roles=obj["roles"],
ip=obj["ip"],
app=App(id=obj["attributes"]["app_id"]),
unit_number=int(obj["name"].split(".")[0].split("-")[-1]),
temperature=obj.get("attributes", {}).get("temp"),
)
return node

return None

@staticmethod
def nodes(
opensearch: OpenSearchDistribution,
@@ -149,7 +197,9 @@ def nodes(
host: Optional[str] = None # defaults to current unit ip
alt_hosts: Optional[List[str]] = hosts
if not use_localhost and hosts:
host, alt_hosts = hosts[0], hosts[1:]
host = hosts[0]
if len(hosts) >= 2:
alt_hosts = hosts[1:]

nodes: List[Node] = []
if use_localhost or host:
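
For reference, a minimal usage sketch of the new elected_manager helper. This is an illustration only: it assumes the static methods above belong to the ClusterTopology class in helper_cluster.py, that opensearch is the charm's OpenSearchDistribution client, and that alt_hosts is its usual list of reachable peer hosts; current_manager_name is a made-up name for the example.

from typing import List, Optional

from charms.opensearch.v0.helper_cluster import ClusterTopology


def current_manager_name(opensearch, alt_hosts: Optional[List[str]]) -> Optional[str]:
    """Return the node name of the elected cluster manager, or None if unresolved."""
    manager = ClusterTopology.elected_manager(
        opensearch,
        use_localhost=opensearch.is_node_up(),
        hosts=alt_hosts,
    )
    # elected_manager returns None during an election, an outage, or when the
    # reported manager id cannot be matched to an entry in /_nodes.
    return manager.name if manager else None
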
93 changes: 73 additions & 20 deletions lib/charms/opensearch/v0/opensearch_base_charm.py
@@ -64,7 +64,11 @@
from charms.opensearch.v0.opensearch_health import HealthColors, OpenSearchHealth
from charms.opensearch.v0.opensearch_internal_data import RelationDataStore, Scope
from charms.opensearch.v0.opensearch_locking import OpenSearchNodeLock
from charms.opensearch.v0.opensearch_nodes_exclusions import OpenSearchExclusions
from charms.opensearch.v0.opensearch_nodes_exclusions import (
OpenSearchExclusionError,
OpenSearchExclusionNodeNotRegisteredError,
OpenSearchExclusions,
)
from charms.opensearch.v0.opensearch_peer_clusters import (
OpenSearchPeerClustersManager,
OpenSearchProvidedRolesException,
@@ -436,7 +440,7 @@ def _on_peer_relation_changed(self, event: RelationChangedEvent):
if not (unit_data := event.relation.data.get(event.unit)):
return

self.opensearch_exclusions.cleanup()
self.opensearch_exclusions.allocation_cleanup()

if self.unit.is_leader() and unit_data.get("bootstrap_contributor"):
contributor_count = self.peers_data.get(Scope.APP, "bootstrap_contributors_count", 0)
@@ -517,7 +521,7 @@ def _on_opensearch_data_storage_detaching(self, _: StorageDetachingEvent): # no
# release lock
self.node_lock.release()

def _on_update_status(self, event: UpdateStatusEvent):
def _on_update_status(self, event: UpdateStatusEvent): # noqa: C901
"""On update status event.

We want to periodically check for the following:
@@ -533,23 +537,39 @@ def _on_update_status(self, event: UpdateStatusEvent):
self.status.set(BlockedStatus(" - ".join(missing_sys_reqs)))
return

# if node already shutdown - leave
if not self.opensearch.is_node_up():
return

# if there are exclusions to be removed
if self.unit.is_leader():
self.opensearch_exclusions.cleanup()
Review thread on this line:

Contributor: While before this cleanup was done on every update_status, now it is only done when the health is green. Is this on purpose?

Contributor Author: That is not the case: it is done in every case except HealthColors.UNKNOWN. We do defer the event if health is not green, but I put the call further down because the API needs to be responsive before configuring voting exclusions. If it is not responsive, we will get UNKNOWN anyway and retry later. I will add some comments to clarify that.

Contributor: Is there a reason why the shard allocation exclusion cleanup is postponed until later in the hook? As long as there is connectivity to a host, we should be able to clean up.

Contributor Author: Yes, the health checks below let anything pass unless the cluster is really in a bad state (i.e. UNKNOWN), so I moved the cleanup below these first health checks because it makes more sense there.

A simplified sketch of the resulting update-status ordering appears at the end of this hunk.


if (health := self.health.apply(wait_for_green_first=True)) not in [
HealthColors.GREEN,
HealthColors.IGNORE,
]:
] or not self.opensearch.is_node_up():
# Do not return right now!
# We must first check if we need to remove exclusions
event.defer()

# Unless it is unknown, in this case we can return and wait for the next run
if health == HealthColors.UNKNOWN:
return

# Execute allocations now, as we know the cluster is minimally healthy.
self.opensearch_exclusions.allocation_cleanup()
# Now, review voting exclusions, as we may have lost a unit due to an outage
if not self.is_any_voting_unit_stopping():
try:
self.opensearch_exclusions.settle_voting(
unit_is_stopping=False,
retry=False,
)
except (OpenSearchExclusionError, OpenSearchHttpError):
# Register the issue but do not act on it: let the next update status event
# retry the operation.
# We cannot assume or try to enforce the cluster to be in a healthy state
logger.warning("Failed to settle voting exclusions")

# if node already shutdown - leave
if not self.opensearch.is_node_up():
return

for relation in self.model.relations.get(ClientRelationName, []):
self.opensearch_provider.update_endpoints(relation)
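
To summarize the ordering discussed in the review thread above, here is a small, self-contained sketch of how the update-status flow ends up after this change. It is an approximation for illustration only; the FakeCharm stub below is hypothetical and not part of the charm.

from dataclasses import dataclass, field
from typing import List

GREEN, YELLOW, IGNORE, UNKNOWN = "green", "yellow", "ignore", "unknown"


@dataclass
class FakeCharm:
    """Hypothetical stand-in for the charm, just to make the control flow runnable."""

    health: str = UNKNOWN               # what health.apply(wait_for_green_first=True) reports
    node_up: bool = False               # what opensearch.is_node_up() reports
    voting_unit_stopping: bool = False  # True if any peer flagged "voting_unit_stopping"
    calls: List[str] = field(default_factory=list)

    def on_update_status(self) -> None:
        # 1. Health check first: the API must be responsive before touching exclusions.
        if self.health not in (GREEN, IGNORE) or not self.node_up:
            self.calls.append("defer")      # retry later, but keep going for now
            if self.health == UNKNOWN:      # API unreachable: nothing more to do this round
                return
        # 2. Cluster is minimally healthy: clean shard-allocation exclusions.
        self.calls.append("allocation_cleanup")
        # 3. Settle voting exclusions, unless a voting unit is in the middle of stopping.
        if not self.voting_unit_stopping:
            self.calls.append("settle_voting(unit_is_stopping=False)")
        # 4. Only now bail out if this node is already shut down.
        if not self.node_up:
            return
        self.calls.append("update_endpoints")


charm = FakeCharm(health=YELLOW, node_up=True)
charm.on_update_status()
print(charm.calls)
# ['defer', 'allocation_cleanup', 'settle_voting(unit_is_stopping=False)', 'update_endpoints']
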

@@ -584,6 +604,16 @@ def _on_config_changed(self, event: ConfigChangedEvent): # noqa C901
RelationJoinedEvent(event.handle, PeerRelationName, self.app, self.unit)
)

# Review voting exclusions as our IP has changed: we may be coming back from a network
# outage case.
# In this case, we should retry if the node is not found in the cluster
if not self.is_any_voting_unit_stopping():
try:
self.opensearch_exclusions.settle_voting(unit_is_stopping=False)
except (OpenSearchExclusionNodeNotRegisteredError, OpenSearchHttpError):
event.defer()
return

previous_deployment_desc = self.opensearch_peer_cm.deployment_desc()
if self.unit.is_leader():
# run peer cluster manager processing
@@ -741,6 +771,14 @@ def on_tls_relation_broken(self, _: RelationBrokenEvent):
# Otherwise, we block.
self.status.set(BlockedStatus(TLSRelationBrokenError))

def is_any_voting_unit_stopping(self) -> bool:
"""Check if any voting unit is stopping."""
rel = self.model.get_relation(PeerRelationName)
for unit in all_units(self):
if rel.data[unit].get("voting_unit_stopping") == "True":
return True
return False

def is_every_unit_marked_as_started(self) -> bool:
"""Check if every unit in the cluster is marked as started."""
rel = self.model.get_relation(PeerRelationName)
@@ -914,7 +952,9 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901
self._cleanup_bootstrap_conf_if_applies()

# Remove the exclusions that could not be removed when no units were online
self.opensearch_exclusions.delete_current()
if not self.is_any_voting_unit_stopping():
self.opensearch_exclusions.settle_voting(unit_is_stopping=False)
self.opensearch_exclusions.delete_allocations()

self.node_lock.release()

@@ -1006,34 +1046,47 @@ def _post_start_init(self, event: _StartOpenSearch): # noqa: C901
if self.opensearch_peer_cm.is_provider():
self.peer_cluster_provider.refresh_relation_data(event, can_defer=False)

def _stop_opensearch(self, *, restart=False) -> None:
def _stop_opensearch(self, *, restart=False) -> None: # noqa: C901
"""Stop OpenSearch if possible."""
self.status.set(WaitingStatus(ServiceIsStopping))

if "cluster_manager" in self.opensearch.roles or "voting_only" in self.opensearch.roles:
# Inform peers that this unit is stopping and it has a voting seat.
# This unit must be the only one managing the voting exclusions, as it may have to
# exclude itself from the voting while stopping.
self.peers_data.put(Scope.UNIT, "voting_unit_stopping", True)

nodes = []
if self.opensearch.is_node_up():
try:
nodes = self._get_nodes(True)
nodes = self._get_nodes(self.opensearch.is_node_up())
# do not add exclusions if it's the last unit to stop
# otherwise cluster manager election will be blocked when starting up again
# and re-using storage
if len(nodes) > 1:
# 1. Add current node to the voting + alloc exclusions
self.opensearch_exclusions.add_current(restart=restart)
if len(nodes) > 1 and not restart:
# 1. Add current node to the voting + alloc exclusions if not restarting
self.opensearch_exclusions.add_allocations()
except OpenSearchHttpError:
logger.debug("Failed to get online nodes, voting and alloc exclusions not added")
logger.debug("Failed to get online nodes, alloc exclusion not added")

# block until all primary shards are moved away from the unit that is stopping
self.health.wait_for_shards_relocation()

# 2. stop the service
# We should only run voting settle right before stop. We need to manage voting before
# stopping e.g. in case we are going 1->0 units.
# We MUST run settle_voting, even if other units are stopping as well.
self.opensearch_exclusions.settle_voting(unit_is_stopping=True)
self.opensearch.stop()

self.peers_data.delete(Scope.UNIT, "voting_unit_stopping")
self.peers_data.delete(Scope.UNIT, "started")
self.status.set(WaitingStatus(ServiceStopped))

# 3. Remove the exclusions
if not restart:
try:
self.opensearch_exclusions.delete_current()
self.opensearch_exclusions.delete_allocations()
except Exception:
# It is purposefully broad - as this can fail for HTTP reasons,
# or if the config wasn't set on disk etc. In any way, this operation is on
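
The comments in _stop_opensearch above describe a small coordination rule: the stopping unit sets a voting_unit_stopping flag in its peer databag so that only it touches voting exclusions while shutting down, and the other code paths (update-status, config-changed, post-start) skip settle_voting while any peer has the flag set. Below is a toy model of that flag, with plain dicts standing in for Juju relation databags; this is an assumption for illustration, not the charm's real peer-data API.

from typing import Dict

PeerData = Dict[str, Dict[str, str]]  # unit name -> that unit's peer databag


def mark_voting_unit_stopping(peer_data: PeerData, unit: str) -> None:
    """The stopping unit claims sole ownership of voting-exclusion changes."""
    peer_data.setdefault(unit, {})["voting_unit_stopping"] = "True"


def is_any_voting_unit_stopping(peer_data: PeerData) -> bool:
    """Other units must skip settle_voting() while this returns True."""
    return any(bag.get("voting_unit_stopping") == "True" for bag in peer_data.values())


peers: PeerData = {"opensearch/0": {}, "opensearch/1": {}}
mark_voting_unit_stopping(peers, "opensearch/1")    # unit 1 enters _stop_opensearch
assert is_any_voting_unit_stopping(peers)           # unit 0 now leaves voting exclusions alone
peers["opensearch/1"].pop("voting_unit_stopping")   # cleared once the service has stopped
assert not is_any_voting_unit_stopping(peers)
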
@@ -1175,7 +1228,7 @@ def _remove_data_role_from_dedicated_cm_if_needed( # noqa: C901
self.opensearch_config.remove_temporary_data_role()

# wait until data moves out completely
self.opensearch_exclusions.add_current()
self.opensearch_exclusions.add_allocations()

try:
for attempt in Retrying(stop=stop_after_attempt(3), wait=wait_fixed(0.5)):
@@ -1188,7 +1241,7 @@ def _remove_data_role_from_dedicated_cm_if_needed( # noqa: C901
raise Exception
return True
except RetryError:
self.opensearch_exclusions.delete_current()
self.opensearch_exclusions.delete_allocations()
event.defer()
return False

4 changes: 3 additions & 1 deletion lib/charms/opensearch/v0/opensearch_distro.py
@@ -398,9 +398,11 @@ def roles(self) -> List[str]:
"""Get the list of the roles assigned to this node."""
try:
nodes = self.request("GET", f"/_nodes/{self.node_id}", alt_hosts=self._charm.alt_hosts)
logger.debug(f"Request for node {self.node_id} returned: {nodes}")

return nodes["nodes"][self.node_id]["roles"]
except OpenSearchHttpError:
return self.config.load("opensearch.yml")["node.roles"]
return self.config.load("opensearch.yml").get("node.roles", [])

@property
def host(self) -> str:
32 changes: 27 additions & 5 deletions lib/charms/opensearch/v0/opensearch_locking.py
@@ -254,9 +254,28 @@ def acquired(self) -> bool: # noqa: C901
self._opensearch, use_localhost=host is not None, hosts=alt_hosts
)
)
try:
logger.debug(
"Current shard allocation status: %s",
self._opensearch.request(
"GET",
"/_cluster/allocation/explain?include_yes_decisions=true&include_disk_info=true",
payload={
"index": self.OPENSEARCH_INDEX,
"shard": 0,
"primary": "true",
},
),
)
except Exception:
logger.debug("Current shard allocation status: error to connect with API")
pass

except OpenSearchHttpError:
logger.exception("Error getting OpenSearch nodes")
return False
# If we are trying to acquire the lock at application removal, this condition
# will be eventually hit
return len(self.units) <= 1
logger.debug(f"[Node lock] Opensearch {online_nodes=}")
assert online_nodes > 0
try:
@@ -266,10 +285,10 @@ def acquired(self) -> bool: # noqa: C901
# if the node lock cannot be acquired, fall back to peer databag lock
# this avoids hitting deadlock situations in cases where
# the .charm_node_lock index is not available
if online_nodes <= 1:
if online_nodes <= 1 and self._charm.app.planned_units() > 1:
return self._peer.acquired
else:
return False
return self._charm.app.planned_units() > 1
# If online_nodes == 1, we should acquire the lock via the peer databag.
# If we acquired the lock via OpenSearch and this unit was stopping, we would be unable
# to release the OpenSearch lock. For example, when scaling to 0.
@@ -302,10 +321,13 @@ def acquired(self) -> bool: # noqa: C901
"to acquire lock"
)
return False
else:
elif self._charm.app.planned_units() > 1:
logger.exception("Error creating OpenSearch lock document")
# in this case, try to acquire peer databag lock as fallback
return self._peer.acquired
else:
# There is only one unit or less
return True
else:
# Ensure write was successful on all nodes
# "It is important to note that this setting [`wait_for_active_shards`] greatly
@@ -359,7 +381,7 @@ def acquired(self) -> bool: # noqa: C901
# If return value is True:
# - Lock granted in previous Juju event
# - OR, unit is leader & lock granted in this Juju event
return self._peer.acquired
return self._peer.acquired if self._charm.app.planned_units() > 1 else True

def release(self):
"""Release lock.
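
A recurring pattern in the locking changes above: whenever the .charm_node_lock index cannot be used, the fallback now depends on planned_units(). With one planned unit or fewer (scaling to zero, application removal) there is no peer left to arbitrate and waiting would deadlock, so the lock is granted outright; otherwise the peer-databag lock decides. A condensed sketch of that decision as a pure function follows; the function name and signature are illustrative, not the charm's API.

def fallback_lock_grant(planned_units: int, peer_lock_acquired: bool) -> bool:
    """Decide the node lock when the .charm_node_lock index is unavailable."""
    if planned_units <= 1:
        # Last unit standing (or app removal): waiting on peers would deadlock.
        return True
    # Otherwise defer to the peer-databag lock.
    return peer_lock_acquired


# Scaling the application to zero: the stopping unit still gets the lock.
assert fallback_lock_grant(planned_units=0, peer_lock_acquired=False) is True
# A 3-unit cluster with the lock index temporarily unavailable: peers decide.
assert fallback_lock_grant(planned_units=3, peer_lock_acquired=False) is False
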