Skip to content

Commit

Permalink
track started job mtimes and assume jobs that take longer than `SCCAC…
Browse files Browse the repository at this point in the history
…HE_DIST_REQUEST_TIMEOUT` seconds are stale and should be removed
  • Loading branch information
trxcllnt committed Oct 28, 2024
1 parent 6cd9ff3 commit 5f1d50e
Show file tree
Hide file tree
Showing 3 changed files with 47 additions and 15 deletions.
34 changes: 33 additions & 1 deletion src/bin/sccache-dist/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use rand::{rngs::OsRng, RngCore};
use sccache::config::{
scheduler as scheduler_config, server as server_config, INSECURE_DIST_CLIENT_TOKEN,
};
use sccache::dist::http::get_dist_request_timeout;
use sccache::dist::{
self, AllocJobResult, AssignJobResult, BuilderIncoming, CompileCommand, HeartbeatServerResult,
InputsReader, JobAlloc, JobAuthorizer, JobComplete, JobId, JobState, RunJobResult,
Expand Down Expand Up @@ -317,6 +318,7 @@ const UNCLAIMED_READY_TIMEOUT: Duration = Duration::from_secs(60);

#[derive(Copy, Clone)]
struct JobDetail {
mtime: Instant,
server_id: ServerId,
state: JobState,
}
Expand Down Expand Up @@ -494,7 +496,14 @@ impl SchedulerIncoming for Scheduler {
);
}

jobs.insert(job_id, JobDetail { server_id, state });
jobs.insert(
job_id,
JobDetail {
server_id,
state,
mtime: Instant::now(),
},
);

if log_enabled!(log::Level::Trace) {
// LOCKS
Expand Down Expand Up @@ -686,6 +695,27 @@ impl SchedulerIncoming for Scheduler {
}
}

// If the server has jobs assigned that have taken longer than `SCCACHE_DIST_REQUEST_TIMEOUT` seconds,
// either the client retried the compilation (due to a server error), or the client abandoned the job
// after calling `run_job/<job_id>` (for example, when the user kills the build).
//
// In both cases the scheduler moves the job from pending/ready to started, and is expecting to hear
// from the build server that the job has been completed. That message will never arrive, so we track
// the time when jobs are started, and prune them if they take longer than the configured request
// timeout.
//
// Note: this assumes the scheduler's `SCCACHE_DIST_REQUEST_TIMEOUT` is >= the client's value, but
// that should be easy for the user to configure.
for &job_id in details.jobs_assigned.iter() {
if let Some(detail) = jobs.get(&job_id) {
if now.duration_since(detail.mtime) >= get_dist_request_timeout() {
// insert into jobs_unclaimed to avoid the warning below
details.jobs_unclaimed.insert(job_id, detail.mtime);
stale_jobs.push(job_id);
}
}
}

if !stale_jobs.is_empty() {
warn!(
"The following stale jobs will be de-allocated: {:?}",
Expand Down Expand Up @@ -791,10 +821,12 @@ impl SchedulerIncoming for Scheduler {
// Update the job's `last_seen` time to ensure it isn't
// pruned for taking longer than UNCLAIMED_READY_TIMEOUT
server.jobs_unclaimed.entry(job_id).and_modify(|e| *e = now);
job.get_mut().mtime = now;
job.get_mut().state = job_state;
}
(JobState::Ready, JobState::Started) => {
server.jobs_unclaimed.remove(&job_id);
job.get_mut().mtime = now;
job.get_mut().state = job_state;
}
(JobState::Started, JobState::Complete) => {
Expand Down
22 changes: 11 additions & 11 deletions src/dist/http.rs
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,10 @@ use std::env;
use std::time::Duration;

/// Default timeout for connections to an sccache-dist server
const DEFAULT_DIST_CONNECT_TIMEOUT: u64 = 5;
const DEFAULT_DIST_CONNECT_TIMEOUT: u64 = 30;

/// Timeout for connections to an sccache-dist server
pub fn get_connect_timeout() -> Duration {
pub fn get_dist_connect_timeout() -> Duration {
Duration::new(
env::var("SCCACHE_DIST_CONNECT_TIMEOUT")
.ok()
Expand All @@ -41,7 +41,7 @@ pub fn get_connect_timeout() -> Duration {
const DEFAULT_DIST_REQUEST_TIMEOUT: u64 = 600;

/// Timeout for compile requests to an sccache-dist server
pub fn get_request_timeout() -> Duration {
pub fn get_dist_request_timeout() -> Duration {
Duration::new(
env::var("SCCACHE_DIST_REQUEST_TIMEOUT")
.ok()
Expand Down Expand Up @@ -295,7 +295,7 @@ mod server {
AllocJobHttpResponse, HeartbeatServerHttpRequest, JobJwt, ReqwestRequestBuilderExt,
RunJobHttpRequest, ServerCertificateHttpResponse,
};
use super::{get_connect_timeout, get_request_timeout, urls};
use super::{get_dist_connect_timeout, get_dist_request_timeout, urls};
use crate::dist::{
self, AllocJobResult, AssignJobResult, HeartbeatServerResult, InputsReader, JobAuthorizer,
JobId, JobState, RunJobResult, SchedulerStatusResult, ServerId, ServerNonce,
Expand Down Expand Up @@ -776,8 +776,8 @@ mod server {
}
// Finish the client
let new_client = client_builder
.timeout(get_request_timeout())
.connect_timeout(get_connect_timeout())
.timeout(get_dist_request_timeout())
.connect_timeout(get_dist_connect_timeout())
// Disable connection pool to avoid broken connection
// between runtime
.pool_max_idle_per_host(0)
Expand Down Expand Up @@ -1130,7 +1130,7 @@ mod client {
bincode_req_fut, AllocJobHttpResponse, ReqwestRequestBuilderExt, RunJobHttpRequest,
ServerCertificateHttpResponse,
};
use super::{get_connect_timeout, get_request_timeout, urls};
use super::{get_dist_connect_timeout, get_dist_request_timeout, urls};
use crate::errors::*;

pub struct Client {
Expand All @@ -1155,8 +1155,8 @@ mod client {
rewrite_includes_only: bool,
) -> Result<Self> {
let client = reqwest::ClientBuilder::new()
.timeout(get_request_timeout())
.connect_timeout(get_connect_timeout())
.timeout(get_dist_request_timeout())
.connect_timeout(get_dist_connect_timeout())
// Disable connection pool to avoid broken connection
// between runtime
.pool_max_idle_per_host(0)
Expand Down Expand Up @@ -1198,8 +1198,8 @@ mod client {
}
// Finish the client
let new_client_async = client_async_builder
.timeout(get_request_timeout())
.connect_timeout(get_connect_timeout())
.timeout(get_dist_request_timeout())
.connect_timeout(get_dist_connect_timeout())
// Disable keep-alive
.pool_max_idle_per_host(0)
.build()
Expand Down
6 changes: 3 additions & 3 deletions src/util.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
// limitations under the License.

#[cfg(any(feature = "dist-server", feature = "dist-client"))]
use crate::dist::http::{get_connect_timeout, get_request_timeout};
use crate::dist::http::{get_dist_connect_timeout, get_dist_request_timeout};
use crate::mock_command::{CommandChild, RunCommand};
use blake3::Hasher as blake3_Hasher;
use byteorder::{BigEndian, ByteOrder};
Expand Down Expand Up @@ -943,8 +943,8 @@ pub fn daemonize() -> Result<()> {
pub fn new_reqwest_blocking_client() -> reqwest::blocking::Client {
reqwest::blocking::Client::builder()
.pool_max_idle_per_host(0)
.timeout(get_request_timeout())
.connect_timeout(get_connect_timeout())
.timeout(get_dist_request_timeout())
.connect_timeout(get_dist_connect_timeout())
.build()
.expect("http client must build with success")
}
Expand Down

0 comments on commit 5f1d50e

Please sign in to comment.