Ensure client and scheduler are resilient to server autoscaling #2277

Open
wants to merge 6 commits into base: main
2 changes: 1 addition & 1 deletion Cargo.toml
@@ -54,6 +54,7 @@ hyper-util = { version = "0.1.3", optional = true, features = [
"server",
] }
is-terminal = "0.4.12"
itertools = "0.12"
jobserver = "0.1"
jwt = { package = "jsonwebtoken", version = "9", optional = true }
libc = "0.2.153"
@@ -126,7 +127,6 @@ assert_cmd = "2.0.13"
cc = "1.0"
chrono = "0.4.33"
filetime = "0.2"
itertools = "0.12"
predicates = "=3.1.0"
serial_test = "3.1"
temp-env = "0.3.6"
5 changes: 5 additions & 0 deletions docs/Configuration.md
@@ -123,6 +123,11 @@ The latest `cache.XXX` entries may be found here: https://github.com/mozilla/scc
Whatever is set by a file based configuration, it is overruled by the env
configuration variables

### dist
* `SCCACHE_DIST_CONNECT_TIMEOUT` Timeout in seconds for connections to an sccache-dist server. Default is `5`.
* `SCCACHE_DIST_REQUEST_TIMEOUT` Timeout in seconds for compile requests to an sccache-dist server. Default is `600`.
* `SCCACHE_DIST_RETRY_LIMIT` Number of times the client should retry failed distributed compilations. The default is `0` (no retries).

### misc

* `SCCACHE_ALLOW_CORE_DUMPS` to enable core dumps by the server
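The three `SCCACHE_DIST_*` settings documented above are read from the environment, so they can be exercised without editing the config file. The following is a minimal sketch, not sccache's actual client code: it assumes the client reads these variables at startup and shows how a connect/request timeout and a retry limit might be applied. The helper names (`env_number`, `with_retries`) are hypothetical.

```rust
use std::env;
use std::time::Duration;

/// Hypothetical helper: read a numeric setting from the environment,
/// falling back to a default when unset or unparsable.
fn env_number(key: &str, default: u64) -> u64 {
    env::var(key)
        .ok()
        .and_then(|v| v.parse().ok())
        .unwrap_or(default)
}

/// Illustrative retry wrapper: run `attempt` once, then retry up to
/// SCCACHE_DIST_RETRY_LIMIT additional times before giving up.
fn with_retries<T, E: std::fmt::Display>(
    mut attempt: impl FnMut() -> Result<T, E>,
) -> Result<T, E> {
    let retries = env_number("SCCACHE_DIST_RETRY_LIMIT", 0);
    let mut tries = 0;
    loop {
        match attempt() {
            Ok(v) => return Ok(v),
            Err(e) if tries < retries => {
                tries += 1;
                eprintln!("distributed compile failed ({}), retry {}/{}", e, tries, retries);
            }
            Err(e) => return Err(e),
        }
    }
}

fn main() {
    let connect_timeout = Duration::from_secs(env_number("SCCACHE_DIST_CONNECT_TIMEOUT", 5));
    let request_timeout = Duration::from_secs(env_number("SCCACHE_DIST_REQUEST_TIMEOUT", 600));
    println!("connect timeout: {:?}, request timeout: {:?}", connect_timeout, request_timeout);

    // Stand-in for the real request; it always fails here so the retry path runs.
    let result: Result<(), String> = with_retries(|| Err("scheduler went away".to_string()));
    println!("final outcome: {:?}", result);
}
```

With the documented default of `0` retries, `with_retries` makes exactly one attempt, so the sketch matches the stated default behaviour.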
189 changes: 132 additions & 57 deletions src/bin/sccache-dist/build.rs

Large diffs are not rendered by default.

113 changes: 82 additions & 31 deletions src/bin/sccache-dist/build_freebsd.rs
@@ -15,8 +15,8 @@
use anyhow::{bail, Context, Error, Result};
use flate2::read::GzDecoder;
use sccache::dist::{
BuildResult, BuilderIncoming, CompileCommand, InputsReader, OutputData, ProcessOutput, TcCache,
Toolchain,
BuildResult, BuilderIncoming, CompileCommand, InputsReader, JobId, OutputData, ProcessOutput,
TcCache, Toolchain,
};
use sccache::lru_disk_cache::Error as LruError;
use std::collections::{hash_map, HashMap};
@@ -146,7 +146,12 @@ impl PotBuilder {
// If we have a spare running container, claim it and remove it from the available list,
// otherwise try and create a new container (possibly creating the Pot image along
// the way)
fn get_container(&self, tc: &Toolchain, tccache: &Mutex<TcCache>) -> Result<String> {
fn get_container(
&self,
job_id: JobId,
tc: &Toolchain,
tccache: &Mutex<TcCache>,
) -> Result<String> {
let container = {
let mut map = self.container_lists.lock().unwrap();
map.entry(tc.clone()).or_insert_with(Vec::new).pop()
@@ -161,8 +166,9 @@ match map.entry(tc.clone()) {
match map.entry(tc.clone()) {
hash_map::Entry::Occupied(e) => e.get().clone(),
hash_map::Entry::Vacant(e) => {
info!("Creating pot image for {:?} (may block requests)", tc);
info!("[get_container({})]: Creating pot image for {:?} (may block requests)", job_id, tc);
let image = Self::make_image(
job_id,
tc,
tccache,
&self.pot_fs_root,
@@ -201,38 +207,49 @@ impl PotBuilder {
// Failing during cleanup is pretty unexpected, but we can still return the successful compile
// TODO: if too many of these fail, we should mark this builder as faulty
fn finish_container(
job_id: JobId,
container_lists: Arc<Mutex<HashMap<Toolchain, Vec<String>>>>,
tc: Toolchain,
cid: String,
pot_cmd: &PathBuf,
) {
if let Err(e) = Self::clean_container(&cid) {
info!("Failed to clean container {}: {}", cid, e);
info!(
"[finish_container({})]: Failed to clean container {}: {}",
job_id, cid, e
);
if let Err(e) = pot_rm(&cid, pot_cmd) {
warn!(
"Failed to remove container {} after failed clean: {}",
cid, e
"[finish_container({})]: Failed to remove container {} after failed clean: {}",
job_id, cid, e
);
}
return;
}

// Good as new, add it back to the container list
if let Some(entry) = container_lists.lock().unwrap().get_mut(&tc) {
debug!("Reclaimed container {}", cid);
debug!(
"[finish_container({})]: Reclaimed container {}",
job_id, cid
);
entry.push(cid)
} else {
warn!(
"Was ready to reclaim container {} but toolchain went missing",
cid
"[finish_container({})]: Was ready to reclaim container {} but toolchain went missing",
job_id, cid
);
if let Err(e) = pot_rm(&cid, pot_cmd) {
warn!("Failed to remove container {}: {}", cid, e);
warn!(
"[finish_container({})]: Failed to remove container {}: {}",
job_id, cid, e
);
}
}
}

fn make_image(
job_id: JobId,
tc: &Toolchain,
tccache: &Mutex<TcCache>,
pot_fs_root: &Path,
@@ -241,7 +258,11 @@ impl PotBuilder {
pot_clone_args: &[String],
) -> Result<String> {
let imagename = format!("sccache-image-{}", &tc.archive_id);
trace!("Creating toolchain image: {}", imagename);
trace!(
"[make_image({})]: Creating toolchain image: {}",
job_id,
imagename
);
let mut clone_args: Vec<&str> = ["clone", "-p", &imagename, "-P", clone_from].to_vec();
clone_args.append(&mut pot_clone_args.iter().map(|s| s as &str).collect());
Command::new(pot_cmd)
@@ -258,11 +279,14 @@ impl PotBuilder {
Err(e) => return Err(Error::from(e).context("failed to get toolchain from cache")),
};

trace!("Copying in toolchain");
trace!("[make_image({})]: Copying in toolchain", job_id);
tar::Archive::new(GzDecoder::new(toolchain_rdr))
.unpack(pot_fs_root.join("jails").join(&imagename).join("m"))
.or_else(|e| {
warn!("Failed to unpack toolchain: {:?}", e);
warn!(
"[make_image({})]: Failed to unpack toolchain: {:?}",
job_id, e
);
tccache
.remove(tc)
.context("Failed to remove corrupt toolchain")?;
@@ -303,20 +327,26 @@ impl PotBuilder {
}

fn perform_build(
job_id: JobId,
compile_command: CompileCommand,
inputs_rdr: InputsReader,
output_paths: Vec<String>,
cid: &str,
pot_fs_root: &Path,
) -> Result<BuildResult> {
trace!("Compile environment: {:?}", compile_command.env_vars);
trace!(
"Compile command: {:?} {:?}",
"[perform_build({})]: Compile environment: {:?}",
job_id,
compile_command.env_vars
);
trace!(
"[perform_build({})]: Compile command: {:?} {:?}",
job_id,
compile_command.executable,
compile_command.arguments
);

trace!("copying in inputs");
trace!("[perform_build({})]: copying in inputs", job_id);
// not elegant
tar::Archive::new(inputs_rdr)
.unpack(pot_fs_root.join("jails").join(cid).join("m"))
@@ -330,7 +360,7 @@ impl PotBuilder {
} = compile_command;
let cwd = Path::new(&cwd);

trace!("creating output directories");
trace!("[perform_build({})]: creating output directories", job_id);
assert!(!output_paths.is_empty());
let mut cmd = Command::new("jexec");
cmd.args(&[cid, "mkdir", "-p"]).arg(cwd);
@@ -346,14 +376,17 @@ impl PotBuilder {
cmd.check_run()
.context("Failed to create directories required for compile in container")?;

trace!("performing compile");
trace!("[perform_build({})]: performing compile", job_id);
// TODO: likely shouldn't perform the compile as root in the container
let mut cmd = Command::new("jexec");
cmd.arg(cid);
cmd.arg("env");
for (k, v) in env_vars {
if k.contains('=') {
warn!("Skipping environment variable: {:?}", k);
warn!(
"[perform_build({})]: Skipping environment variable: {:?}",
job_id, k
);
continue;
}
let mut env = k;
Expand All @@ -368,10 +401,14 @@ impl PotBuilder {
cmd.arg(executable);
cmd.args(arguments);
let compile_output = cmd.output().context("Failed to start executing compile")?;
trace!("compile_output: {:?}", compile_output);
trace!(
"[perform_build({})]: compile_output: {:?}",
job_id,
compile_output
);

let mut outputs = vec![];
trace!("retrieving {:?}", output_paths);
trace!("[perform_build({})]: retrieving {:?}", job_id, output_paths);
for path in output_paths {
let abspath = cwd.join(&path); // Resolve in case it's relative since we copy it from the root level
// TODO: this isn't great, but cp gives it out as a tar
@@ -385,7 +422,10 @@ impl PotBuilder {
.expect("Failed to read compress output stdout");
outputs.push((path, output))
} else {
debug!("Missing output path {:?}", path)
debug!(
"[perform_build({})]: Missing output path {:?}",
job_id, path
)
}
}

@@ -402,20 +442,31 @@ impl BuilderIncoming for PotBuilder {
// From Server
fn run_build(
&self,
job_id: JobId,
tc: Toolchain,
command: CompileCommand,
outputs: Vec<String>,
inputs_rdr: InputsReader,
tccache: &Mutex<TcCache>,
) -> Result<BuildResult> {
debug!("Finding container");
debug!("[run_build({})]: Finding container", job_id);
let cid = self
.get_container(&tc, tccache)
.get_container(job_id, &tc, tccache)
.context("Failed to get a container for build")?;
debug!("Performing build with container {}", cid);
let res = Self::perform_build(command, inputs_rdr, outputs, &cid, &self.pot_fs_root)
.context("Failed to perform build")?;
debug!("Finishing with container {}", cid);
debug!(
"[run_build({})]: Performing build with container {}",
job_id, cid
);
let res = Self::perform_build(
job_id,
command,
inputs_rdr,
outputs,
&cid,
&self.pot_fs_root,
)
.context("Failed to perform build")?;
debug!("[run_build({})]: Finishing with container {}", job_id, cid);
let cloned = self.clone();
let tc = tc;
while cloned.cleanup_thread_count.fetch_add(1, Ordering::SeqCst)
@@ -425,10 +476,10 @@ impl BuilderIncoming for PotBuilder {
hint::spin_loop();
}
thread::spawn(move || {
Self::finish_container(cloned.container_lists, tc, cid, &cloned.pot_cmd);
Self::finish_container(job_id, cloned.container_lists, tc, cid, &cloned.pot_cmd);
cloned.cleanup_thread_count.fetch_sub(1, Ordering::SeqCst);
});
debug!("Returning result");
debug!("[run_build({})]: Returning result", job_id);
Ok(res)
}
}
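Almost every change in this file follows one pattern: a `JobId` is threaded through the builder methods and each log message gains a `[method(job_id)]` prefix so that lines from concurrent builds can be told apart. Below is a dependency-free sketch of that convention; the `JobId` newtype here is a stand-in for illustration rather than sccache's actual definition, and `println!` stands in for the `log` crate macros (`trace!`, `debug!`, `warn!`) used in the diff.

```rust
use std::fmt;

/// Stand-in newtype for illustration; the real `JobId` is imported from
/// `sccache::dist` in the diff above and may differ.
#[derive(Clone, Copy)]
struct JobId(u64);

impl fmt::Display for JobId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        write!(f, "{}", self.0)
    }
}

fn perform_build(job_id: JobId, output_paths: &[String]) {
    // In the diff this is a `trace!(...)` call; `println!` keeps the
    // sketch dependency-free while showing the same prefix format.
    println!("[perform_build({})]: retrieving {:?}", job_id, output_paths);
}

fn main() {
    perform_build(JobId(42), &["out/main.o".to_string()]);
}
```

The same prefix convention appears in `get_container`, `finish_container`, `make_image`, `perform_build`, and `run_build` in the diff above.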