Skip to content

Commit

Permalink
Fix the cluster_has_leader service for standby clusters
Browse files Browse the repository at this point in the history
Before this patch, we checked that the standby leader state
was `running` for all versions of Patroni.

With this patch, for:
* Patroni < 3.0.4, standby leaders are in `running` state.
* Patroni >= 3.0.4, standby leaders can be in `streaming` or `in
archive recovery` state. We will raise a warning for the latter.

The tests were modified to account for this.

Co-authored-by: Denis Laxalde <[email protected]>
  • Loading branch information
blogh and dlax committed Dec 18, 2023
1 parent ffc330f commit 46db3e2
Show file tree
Hide file tree
Showing 8 changed files with 246 additions and 23 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,15 @@

* Add the timeline in the `cluster_has_replica` perfstats. (#50)
* Add a mention about shell completion support and shell versions in the doc. (#53)
* Add the leader type and whether it's archiving to the `cluster_has_leader` perfstats. (#58)

### Fixed

* Add compatibility with [requests](https://requests.readthedocs.io)
version 2.25 and higher.
* Fix what `cluster_has_replica` deems a healthy replica. (#50, reported by @mbanck)
* Fix `cluster_has_replica` to display perfstats for replicas whenever it's possible (healthy or not). (#50)
* Fix `cluster_has_leader` to correctly check for standby leaders. (#58, reported by @mbanck)

### Misc

Expand Down
20 changes: 18 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -176,11 +176,27 @@ Usage: check_patroni cluster_has_leader [OPTIONS]
This check applies to any kind of leaders including standby leaders.
A leader is a node with the "leader" role and a "running" state.
A standby leader is a node with a "standby_leader" role and a "streaming" or
"in archive recovery" state. Please note that log shipping could be stuck
because the WAL are not available or applicable. Patroni doesn't provide
information about the origin cluster (timeline or lag), so we cannot check
if there is a problem in that particular case. That's why we issue a warning
when the node is "in archive recovery". We suggest using other supervision
tools to do this (eg. check_pgactivity).
Check:
* `OK`: if there is a leader node.
* `CRITICAL`: otherwise
* `WARNING`: if there is a standby leader in archive recovery.
* `CRITICAL`: otherwise.
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
Perfdata:
* `has_leader` is 1 if there is any kind of leader node, 0 otherwise
* `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
archive recovery", 0 otherwise
* `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
* `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
Options:
--help Show this message and exit.
Expand Down
25 changes: 23 additions & 2 deletions check_patroni/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,17 +285,38 @@ def cluster_has_leader(ctx: click.Context) -> None:
This check applies to any kind of leaders including standby leaders.
A leader is a node with the "leader" role and a "running" state.
A standby leader is a node with a "standby_leader" role and a "streaming"
or "in archive recovery" state. Please note that log shipping could be
stuck because the WAL are not available or applicable. Patroni doesn't
provide information about the origin cluster (timeline or lag), so we
cannot check if there is a problem in that particular case. That's why we
issue a warning when the node is "in archive recovery". We suggest using
other supervision tools to do this (eg. check_pgactivity).
\b
Check:
* `OK`: if there is a leader node.
* `CRITICAL`: otherwise
* `WARNING`: if there is a standby leader in archive recovery.
* `CRITICAL`: otherwise.
\b
Perfdata:
* `has_leader` is 1 if there is any kind of leader node, 0 otherwise
* `is_standby_leader_in_arc_rec` is 1 if the standby leader node is "in
archive recovery", 0 otherwise
* `is_standby_leader` is 1 if there is a standby leader node, 0 otherwise
* `is_leader` is 1 if there is a "classical" leader node, 0 otherwise
Perfdata: `has_leader` is 1 if there is a leader node, 0 otherwise
"""
check = nagiosplugin.Check()
check.add(
ClusterHasLeader(ctx.obj.connection_info),
nagiosplugin.ScalarContext("has_leader", None, "@0:0"),
nagiosplugin.ScalarContext("is_standby_leader_in_arc_rec", "@1:1", None),
nagiosplugin.ScalarContext("is_leader", None, None),
nagiosplugin.ScalarContext("is_standby_leader", None, None),
ClusterHasLeaderSummary(),
)
check.main(verbose=ctx.obj.verbose, timeout=ctx.obj.timeout)
Expand Down
35 changes: 29 additions & 6 deletions check_patroni/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,19 +52,42 @@ def probe(self) -> Iterable[nagiosplugin.Metric]:
item_dict = self.rest_api("cluster")

is_leader_found = False
is_standby_leader_found = False
is_standby_leader_in_arc_rec = False
for member in item_dict["members"]:
if (
member["role"] in ("leader", "standby_leader")
and member["state"] == "running"
):
if member["role"] == "leader" and member["state"] == "running":
is_leader_found = True
break

if member["role"] == "standby_leader":
if member["state"] not in ["streaming", "in archive recovery"]:
# for patroni >= 3.0.4 any state would be wrong
# for patroni < 3.0.4 a state different from running would be wrong
if self.has_detailed_states() or member["state"] != "running":
continue

if member["state"] in ["in archive recovery"]:
is_standby_leader_in_arc_rec = True

is_standby_leader_found = True
break
return [
nagiosplugin.Metric(
"has_leader",
1 if is_leader_found or is_standby_leader_found else 0,
),
nagiosplugin.Metric(
"is_standby_leader_in_arc_rec",
1 if is_standby_leader_in_arc_rec else 0,
),
nagiosplugin.Metric(
"is_standby_leader",
1 if is_standby_leader_found else 0,
),
nagiosplugin.Metric(
"is_leader",
1 if is_leader_found else 0,
)
),
]


Expand All @@ -74,7 +97,7 @@ def ok(self, results: nagiosplugin.Result) -> str:

@handle_unknown
def problem(self, results: nagiosplugin.Result) -> str:
return "The cluster has no running leader."
return "The cluster has no running leader or the standby leader is in archive recovery."


class ClusterHasReplica(PatroniResource):
Expand Down
33 changes: 33 additions & 0 deletions tests/json/cluster_has_leader_ko_standby_leader.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "standby_leader",
"state": "stopped",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "streaming",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "streaming",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}
33 changes: 33 additions & 0 deletions tests/json/cluster_has_leader_ko_standby_leader_archiving.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
{
"members": [
{
"name": "srv1",
"role": "standby_leader",
"state": "in archive recovery",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
"timeline": 51
},
{
"name": "srv2",
"role": "replica",
"state": "streaming",
"api_url": "https://10.20.199.4:8008/patroni",
"host": "10.20.199.4",
"port": 5432,
"timeline": 51,
"lag": 0
},
{
"name": "srv3",
"role": "replica",
"state": "streaming",
"api_url": "https://10.20.199.5:8008/patroni",
"host": "10.20.199.5",
"port": 5432,
"timeline": 51,
"lag": 0
}
]
}
2 changes: 1 addition & 1 deletion tests/json/cluster_has_leader_ok_standby_leader.json
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
{
"name": "srv1",
"role": "standby_leader",
"state": "running",
"state": "streaming",
"api_url": "https://10.20.199.3:8008/patroni",
"host": "10.20.199.3",
"port": 5432,
Expand Down
119 changes: 107 additions & 12 deletions tests/test_cluster_has_leader.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,132 @@
from pathlib import Path
from typing import Iterator, Union

import pytest
from click.testing import CliRunner

from check_patroni.cli import main

from . import PatroniAPI
from . import PatroniAPI, cluster_api_set_replica_running


@pytest.fixture
def cluster_has_leader_ok(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_leader_ok.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None


@pytest.mark.usefixtures("cluster_has_leader_ok")
def test_cluster_has_leader_ok(runner: CliRunner, patroni_api: PatroniAPI) -> None:
with patroni_api.routes({"cluster": "cluster_has_leader_ok.json"}):
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
assert result.exit_code == 0
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
assert (
result.stdout
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=1 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
)
assert result.exit_code == 0


@pytest.fixture
def cluster_has_leader_ok_standby_leader(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_leader_ok_standby_leader.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None


@pytest.mark.usefixtures("cluster_has_leader_ok_standby_leader")
def test_cluster_has_leader_ok_standby_leader(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
with patroni_api.routes({"cluster": "cluster_has_leader_ok_standby_leader.json"}):
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
assert result.exit_code == 0
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
assert (
result.stdout
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0\n"
== "CLUSTERHASLEADER OK - The cluster has a running leader. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=0;@1:1\n"
)
assert result.exit_code == 0


@pytest.fixture
def cluster_has_leader_ko(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_leader_ko.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None


@pytest.mark.usefixtures("cluster_has_leader_ko")
def test_cluster_has_leader_ko(runner: CliRunner, patroni_api: PatroniAPI) -> None:
with patroni_api.routes({"cluster": "cluster_has_leader_ko.json"}):
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
assert (
result.stdout
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
)
assert result.exit_code == 2


@pytest.fixture
def cluster_has_leader_ko_standby_leader(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[str, Path] = "cluster_has_leader_ko_standby_leader.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None


@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader")
def test_cluster_has_leader_ko_standby_leader(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
assert (
result.stdout
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=0;;@0 is_leader=0 is_standby_leader=0 is_standby_leader_in_arc_rec=0;@1:1\n"
)
assert result.exit_code == 2


@pytest.fixture
def cluster_has_leader_ko_standby_leader_archiving(
patroni_api: PatroniAPI, old_replica_state: bool, datadir: Path, tmp_path: Path
) -> Iterator[None]:
cluster_path: Union[
str, Path
] = "cluster_has_leader_ko_standby_leader_archiving.json"
patroni_path = "cluster_has_replica_patroni_verion_3.1.0.json"
if old_replica_state:
cluster_path = cluster_api_set_replica_running(datadir / cluster_path, tmp_path)
patroni_path = "cluster_has_replica_patroni_verion_3.0.0.json"
with patroni_api.routes({"cluster": cluster_path, "patroni": patroni_path}):
yield None


@pytest.mark.usefixtures("cluster_has_leader_ko_standby_leader_archiving")
def test_cluster_has_leader_ko_standby_leader_archiving(
runner: CliRunner, patroni_api: PatroniAPI
) -> None:
result = runner.invoke(main, ["-e", patroni_api.endpoint, "cluster_has_leader"])
assert (
result.stdout
== "CLUSTERHASLEADER CRITICAL - The cluster has no running leader. | has_leader=0;;@0\n"
== "CLUSTERHASLEADER WARNING - The cluster has no running leader or the standby leader is in archive recovery. | has_leader=1;;@0 is_leader=0 is_standby_leader=1 is_standby_leader_in_arc_rec=1;@1:1\n"
)
assert result.exit_code == 1

0 comments on commit 46db3e2

Please sign in to comment.