Skip to content

Commit

Permalink
Add proper startup, liveness and readiness probes (#273)
Browse files Browse the repository at this point in the history
# Description

For #142
It would be very good if you could check the Druid docs to make sure I have not misunderstood the checks, or to see whether you have a better idea.
  • Loading branch information
sbernauer committed Jul 22, 2022
1 parent 3fc42ee commit 1f2e9a9
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 14 deletions.
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,14 @@ All notable changes to this project will be documented in this file.
- BREAKING: HDFS deep storage is now configurable via an HDFS discovery config map instead of a URL to an HDFS name node ([#262]).
- Include chart name when installing with a custom release name ([#263], [#264]).

### Fixed

- Add proper startup, liveness and readiness probes ([#273])

[#262]: https://github.com/stackabletech/druid-operator/pull/262
[#263]: https://github.com/stackabletech/druid-operator/pull/263
[#264]: https://github.com/stackabletech/druid-operator/pull/264
[#273]: https://github.com/stackabletech/druid-operator/pull/273

## [0.6.0] - 2022-06-30

Expand Down
17 changes: 3 additions & 14 deletions rust/operator-binary/src/druid_controller.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
use crate::{
config::{get_jvm_config, get_log4j_config},
discovery::{self, build_discovery_configmaps},
probes::add_probes,
};

use snafu::{OptionExt, ResultExt, Snafu};
Expand All @@ -25,9 +26,7 @@ use stackable_operator::{
k8s_openapi::{
api::{
apps::v1::{StatefulSet, StatefulSetSpec},
core::v1::{
ConfigMap, EnvVar, Probe, Service, ServicePort, ServiceSpec, TCPSocketAction,
},
core::v1::{ConfigMap, EnvVar, Service, ServicePort, ServiceSpec},
},
apimachinery::pkg::{apis::meta::v1::LabelSelector, util::intstr::IntOrString},
},
Expand Down Expand Up @@ -596,17 +595,7 @@ fn build_rolegroup_statefulset(
.build(),
);

// readiness probe
let probe = Probe {
tcp_socket: Some(TCPSocketAction {
port: IntOrString::Int(role.get_http_port().into()),
..Default::default()
}),
initial_delay_seconds: Some(30),
period_seconds: Some(5),
..Default::default()
};
cb.readiness_probe(probe);
add_probes(&mut cb, &role);

let mut container = cb.build();
container.image_pull_policy = Some("IfNotPresent".to_string());
Expand Down
1 change: 1 addition & 0 deletions rust/operator-binary/src/main.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
mod config;
mod discovery;
mod druid_controller;
mod probes;

use std::sync::Arc;

Expand Down
71 changes: 71 additions & 0 deletions rust/operator-binary/src/probes.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
use stackable_druid_crd::DruidRole;
use stackable_operator::{
builder::ContainerBuilder,
k8s_openapi::{
api::core::v1::{HTTPGetAction, Probe},
apimachinery::pkg::util::intstr::IntOrString,
},
};

pub fn add_probes(container_builder: &mut ContainerBuilder, role: &DruidRole) {
// /status/selfDiscovered:
// Indicating whether the node has received a confirmation from the central node discovery mechanism (currently ZooKeeper) of the Druid cluster that the node has been added to the cluster.
// Returns 200 OK response with empty body if the node has discovered itself and 503 SERVICE UNAVAILABLE if the node hasn't discovered itself yet.
// It is recommended to not consider a Druid node "healthy" or "ready" in automated deployment/container management systems until it returns 200 OK response.
// see https://druid.apache.org/docs/latest/operations/api-reference.html#process-information
let startup_probe_path = "/status/selfDiscovered";
let liveness_probe_path = "/status/selfDiscovered";

let readiness_probe_path = match role {
// /druid/broker/v1/readiness:
// Returns if the Broker knows about all segments in the cluster.
// This can be used to know when a Broker process is ready to be queried after a restart.
// see https://druid.apache.org/docs/latest/operations/api-reference.html#broker
DruidRole::Broker => "/druid/broker/v1/readiness",

// /druid/historical/v1/readiness:
// Returns if all segments in the local cache have been loaded.
// This can be used to know when a Historical process is ready to be queried after a restart.
// see https://druid.apache.org/docs/latest/operations/api-reference.html#historical
DruidRole::Historical => "/druid/historical/v1/readiness",

// For the other roles we use the normal discovery-check
DruidRole::Coordinator | DruidRole::MiddleManager | DruidRole::Router => {
"/status/selfDiscovered"
}
};

container_builder.startup_probe(Probe {
failure_threshold: Some(60), // 60 * 10s = 10min time to start up and register itself
period_seconds: Some(10),
timeout_seconds: Some(3),
http_get: Some(HTTPGetAction {
port: IntOrString::Int(role.get_http_port() as i32),
path: Some(startup_probe_path.to_string()),
..HTTPGetAction::default()
}),
..Probe::default()
});
container_builder.liveness_probe(Probe {
failure_threshold: Some(6), // After not being healthy for 6 * 5s = 30s => restart
period_seconds: Some(5),
timeout_seconds: Some(3),
http_get: Some(HTTPGetAction {
port: IntOrString::Int(role.get_http_port() as i32),
path: Some(liveness_probe_path.to_string()),
..HTTPGetAction::default()
}),
..Probe::default()
});
container_builder.readiness_probe(Probe {
failure_threshold: Some(1), // After not being healthy for 1 * 5s = 5s => take it out of the service
period_seconds: Some(5),
timeout_seconds: Some(3),
http_get: Some(HTTPGetAction {
port: IntOrString::Int(role.get_http_port() as i32),
path: Some(readiness_probe_path.to_string()),
..HTTPGetAction::default()
}),
..Probe::default()
});
}

0 comments on commit 1f2e9a9

Please sign in to comment.