Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add retry logic to script probe, similar to poll probe #140

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions config.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ push_system_cpu_sick_above = 0.90
push_system_ram_sick_above = 0.90

script_interval = 300
script_retry = 0
script_retry_delay = 2000

script_parallelism = 2

Expand Down
6 changes: 6 additions & 0 deletions src/config/config.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,12 @@ pub struct ConfigMetrics {
#[serde(default = "defaults::script_parallelism")]
pub script_parallelism: u16,

#[serde(default = "defaults::metrics_script_retry")]
pub script_retry: u16,

#[serde(default = "defaults::metrics_script_retry_delay")]
pub script_retry_delay: u64,

#[serde(default = "defaults::metrics_local_delay_dead")]
pub local_delay_dead: u64,
}
Expand Down
8 changes: 8 additions & 0 deletions src/config/defaults.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,14 @@ pub fn script_parallelism() -> u16 {
2
}

/// Serde default for the `script_retry` metrics setting: no retry of a script probe.
pub fn metrics_script_retry() -> u16 {
    0_u16
}

/// Serde default for the `script_retry_delay` metrics setting, expressed in milliseconds.
pub fn metrics_script_retry_delay() -> u64 {
    2_000
}

/// Serde default for the `local_delay_dead` metrics setting.
pub fn metrics_local_delay_dead() -> u64 {
    40_u64
}
Expand Down
33 changes: 30 additions & 3 deletions src/prober/manager.rs
Original file line number Diff line number Diff line change
Expand Up @@ -517,7 +517,30 @@ fn proceed_replica_probe_poll_http(
(false, None)
}

fn proceed_replica_probe_script(script: &String) -> (Status, Option<Duration>) {
/// Runs the script probe, retrying for as long as it reports `Status::Dead`.
///
/// At most `APP_CONF.metrics.script_retry + 1` attempts are made. The first attempt runs
/// immediately; every retry is preceded by a pause of `APP_CONF.metrics.script_retry_delay`
/// milliseconds. Returns the status of the last attempt together with the duration of that
/// attempt.
fn proceed_replica_probe_script_with_retry(script: &String) -> (Status, Option<Duration>) {
    let (mut status, mut latency) = (Status::Dead, None);

    // Attempt 0 is the initial probe, 1..=script_retry are the retries. An inclusive range
    // (rather than a hand-incremented counter) cannot overflow when `script_retry` is set to
    // `u16::MAX`, and always yields at least one attempt.
    for retry_count in 0..=APP_CONF.metrics.script_retry {
        debug!(
            "will probe script replica: {:?} with retry count: {}",
            script, retry_count
        );

        // Hold only before an actual retry, so that the initial attempt is never delayed
        if retry_count > 0 {
            thread::sleep(Duration::from_millis(APP_CONF.metrics.script_retry_delay));
        }

        let (probe_status, probe_duration) = proceed_replica_probe_script(script);

        status = probe_status;
        latency = Some(probe_duration);

        // Stop as soon as the script reports anything other than dead
        if status != Status::Dead {
            break;
        }
    }

    (status, latency)
}

fn proceed_replica_probe_script(script: &String) -> (Status, Duration) {
let start_time = SystemTime::now();

let status = match run_script::run(script, &Vec::new(), &ScriptOptions::new()) {
Expand All @@ -541,7 +564,11 @@ fn proceed_replica_probe_script(script: &String) -> (Status, Option<Duration>) {
}
};

(status, SystemTime::now().duration_since(start_time).ok())
let probing_duration = SystemTime::now()
.duration_since(start_time)
.unwrap_or(Duration::from_secs(0));

(status, probing_duration)
}

fn proceed_rabbitmq_queue_probe(
Expand Down Expand Up @@ -663,7 +690,7 @@ fn dispatch_replica<'a>(probe_replica: &ProbeReplica) {
node_id = &probe_replica_target.node_id;
replica_id = &probe_replica_target.replica_id;

proceed_replica_probe_script(&probe_replica_script.script)
proceed_replica_probe_script_with_retry(&probe_replica_script.script)
}
};

Expand Down