diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index dcdc388a939b..16fc22ce629b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -794,7 +794,7 @@ jobs: run: shell: sh -eu {0} env: - VM_BUILDER_VERSION: v0.13.1 + VM_BUILDER_VERSION: v0.15.0-alpha1 steps: - name: Checkout diff --git a/Cargo.lock b/Cargo.lock index dbeeb539d44f..c18bddfbcb76 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -740,6 +740,9 @@ name = "cc" version = "1.0.79" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "50d30906286121d95be3d479533b458f87493b30a4b5f79a607db8f5d11aa91f" +dependencies = [ + "jobserver", +] [[package]] name = "cexpr" @@ -907,12 +910,14 @@ dependencies = [ "opentelemetry", "postgres", "regex", + "remote_storage", "reqwest", "serde", "serde_json", "tar", "tokio", "tokio-postgres", + "toml_edit", "tracing", "tracing-opentelemetry", "tracing-subscriber", @@ -920,6 +925,7 @@ dependencies = [ "url", "utils", "workspace_hack", + "zstd", ] [[package]] @@ -980,6 +986,7 @@ dependencies = [ "tar", "thiserror", "toml", + "tracing", "url", "utils", "workspace_hack", @@ -1972,6 +1979,15 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6" +[[package]] +name = "jobserver" +version = "0.1.26" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "936cfd212a0155903bcbc060e316fb6cc7cbf2e1907329391ebadc1fe0ce77c2" +dependencies = [ + "libc", +] + [[package]] name = "js-sys" version = "0.3.63" @@ -3237,6 +3253,7 @@ dependencies = [ "metrics", "once_cell", "pin-project-lite", + "scopeguard", "serde", "serde_json", "tempfile", @@ -5296,6 +5313,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bytes", + "cc", "chrono", "clap", "clap_builder", @@ -5396,3 +5414,33 @@ name = "zeroize" version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2a0956f1ba7c7909bfb66c2e9e4124ab6f6482560f6628b5aaeba39207c9aad9" + +[[package]] +name = "zstd" +version = "0.12.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a27595e173641171fc74a1232b7b1c7a7cb6e18222c11e9dfb9888fa424c53c" +dependencies = [ + "zstd-safe", +] + +[[package]] +name = "zstd-safe" +version = "6.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee98ffd0b48ee95e6c5168188e44a54550b1564d9d530ee21d5f0eaed1069581" +dependencies = [ + "libc", + "zstd-sys", +] + +[[package]] +name = "zstd-sys" +version = "2.0.8+zstd.1.5.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5556e6ee25d32df2586c098bbfa278803692a20d0ab9565e049480d52707ec8c" +dependencies = [ + "cc", + "libc", + "pkg-config", +] diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index b2d096dc431b..6dc6612e93c7 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -816,6 +816,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb # libxml2, libxslt1.1 for xml2 # libzstd1 for zstd # libboost*, libfreetype6, and zlib1g for rdkit +# ca-certificates for communicating with s3 by compute_ctl RUN apt update && \ apt install --no-install-recommends -y \ gdb \ @@ -839,7 +840,8 @@ RUN apt update && \ libcurl4-openssl-dev \ locales \ procps \ - zlib1g && \ + zlib1g \ + ca-certificates && \ rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \ localedef -i en_US -c -f UTF-8 -A 
/usr/share/locale/locale.alias en_US.UTF-8 diff --git a/README.md b/README.md index 1c2452f4350d..57806089495d 100644 --- a/README.md +++ b/README.md @@ -29,13 +29,13 @@ See developer documentation in [SUMMARY.md](/docs/SUMMARY.md) for more informati ```bash apt install build-essential libtool libreadline-dev zlib1g-dev flex bison libseccomp-dev \ libssl-dev clang pkg-config libpq-dev cmake postgresql-client protobuf-compiler \ -libcurl4-openssl-dev +libcurl4-openssl-dev openssl python-poetry ``` * On Fedora, these packages are needed: ```bash dnf install flex bison readline-devel zlib-devel openssl-devel \ libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \ - protobuf-devel libcurl-devel + protobuf-devel libcurl-devel openssl poetry ``` * On Arch based systems, these packages are needed: ```bash @@ -235,6 +235,13 @@ CARGO_BUILD_FLAGS="--features=testing" make ./scripts/pytest ``` +By default, this runs both debug and release modes, and all supported postgres versions. When +testing locally, it is convenient to run just run one set of permutations, like this: + +```sh +DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest +``` + ## Documentation [docs](/docs) Contains a top-level overview of all available markdown documentation. diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index f8f8f729ce6c..08dcc21c7afd 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -32,3 +32,6 @@ url.workspace = true compute_api.workspace = true utils.workspace = true workspace_hack.workspace = true +toml_edit.workspace = true +remote_storage = { version = "0.1", path = "../libs/remote_storage/" } +zstd = "0.12.4" diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 9d15a203e5b3..ad747470c231 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -5,6 +5,8 @@ //! - `compute_ctl` accepts cluster (compute node) specification as a JSON file. //! - Every start is a fresh start, so the data directory is removed and //! initialized again on each run. +//! - If remote_extension_config is provided, it will be used to fetch extensions list +//! and download `shared_preload_libraries` from the remote storage. //! - Next it will put configuration files into the `PGDATA` directory. //! - Sync safekeepers and get commit LSN. //! - Get `basebackup` from pageserver using the returned on the previous step LSN. @@ -27,7 +29,8 @@ //! compute_ctl -D /var/db/postgres/compute \ //! -C 'postgresql://cloud_admin@localhost/postgres' \ //! -S /var/db/postgres/specs/current.json \ -//! -b /usr/local/bin/postgres +//! -b /usr/local/bin/postgres \ +//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"} //! ``` //! 
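For illustration, here is a minimal, self-contained sketch of the JSON value that `-r` / `--remote-ext-config` is expected to carry. The field names are assumed to mirror `RemoteExtJson` in `compute_tools/src/extension_server.rs` further down in this diff; `endpoint` and `prefix` are optional, and the bucket/region values are examples only.

```rust
// Hypothetical sketch, not part of the diff: parse a remote-ext-config value.
use serde::Deserialize;

#[derive(Debug, Deserialize)]
struct RemoteExtConfig {
    bucket: String,
    region: String,
    endpoint: Option<String>,
    prefix: Option<String>,
}

fn main() {
    let arg = r#"{"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}"#;
    let cfg: RemoteExtConfig =
        serde_json::from_str(arg).expect("remote-ext-config must be valid JSON");
    // compute_ctl hands the raw string to extension_server::init_remote_storage(),
    // which builds an S3-backed GenericRemoteStorage from these fields.
    println!("{cfg:?}");
}
```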
use std::collections::HashMap; @@ -35,7 +38,7 @@ use std::fs::File; use std::panic; use std::path::Path; use std::process::exit; -use std::sync::{mpsc, Arc, Condvar, Mutex}; +use std::sync::{mpsc, Arc, Condvar, Mutex, OnceLock, RwLock}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; @@ -48,22 +51,33 @@ use compute_api::responses::ComputeStatus; use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec}; use compute_tools::configurator::launch_configurator; +use compute_tools::extension_server::{get_pg_version, init_remote_storage}; use compute_tools::http::api::launch_http_server; use compute_tools::logger::*; use compute_tools::monitor::launch_monitor; use compute_tools::params::*; use compute_tools::spec::*; -const BUILD_TAG_DEFAULT: &str = "local"; +// this is an arbitrary build tag. Fine as a default / for testing purposes +// in-case of not-set environment var +const BUILD_TAG_DEFAULT: &str = "5670669815"; fn main() -> Result<()> { init_tracing_and_logging(DEFAULT_LOG_LEVEL)?; - let build_tag = option_env!("BUILD_TAG").unwrap_or(BUILD_TAG_DEFAULT); - + let build_tag = option_env!("BUILD_TAG") + .unwrap_or(BUILD_TAG_DEFAULT) + .to_string(); info!("build_tag: {build_tag}"); let matches = cli().get_matches(); + let pgbin_default = String::from("postgres"); + let pgbin = matches.get_one::("pgbin").unwrap_or(&pgbin_default); + + let remote_ext_config = matches.get_one::("remote-ext-config"); + let ext_remote_storage = remote_ext_config.map(|x| { + init_remote_storage(x).expect("cannot initialize remote extension storage from config") + }); let http_port = *matches .get_one::("http-port") @@ -128,9 +142,6 @@ fn main() -> Result<()> { let compute_id = matches.get_one::("compute-id"); let control_plane_uri = matches.get_one::("control-plane-uri"); - // Try to use just 'postgres' if no path is provided - let pgbin = matches.get_one::("pgbin").unwrap(); - let spec; let mut live_config_allowed = false; match spec_json { @@ -168,6 +179,7 @@ fn main() -> Result<()> { let mut new_state = ComputeState::new(); let spec_set; + if let Some(spec) = spec { let pspec = ParsedSpec::try_from(spec).map_err(|msg| anyhow::anyhow!(msg))?; new_state.pspec = Some(pspec); @@ -179,27 +191,37 @@ fn main() -> Result<()> { connstr: Url::parse(connstr).context("cannot parse connstr as a URL")?, pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), + pgversion: get_pg_version(pgbin), live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), + ext_remote_storage, + ext_remote_paths: OnceLock::new(), + ext_download_progress: RwLock::new(HashMap::new()), + library_index: OnceLock::new(), + build_tag, }; let compute = Arc::new(compute_node); + // If this is a pooled VM, prewarm before starting HTTP server and becoming + // available for binding. Prewarming helps postgres start quicker later, + // because QEMU will already have it's memory allocated from the host, and + // the necessary binaries will alreaady be cached. + if !spec_set { + compute.prewarm_postgres()?; + } + // Launch http service first, so we were able to serve control-plane // requests, while configuration is still in progress. let _http_handle = launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread"); + let extension_server_port: u16 = http_port; + if !spec_set { // No spec provided, hang waiting for it. info!("no compute spec provided, waiting"); - // TODO this can stall startups in the unlikely event that we bind - // this compute node while it's busy prewarming. 
It's not too - // bad because it's just 100ms and unlikely, but it's an - // avoidable problem. - compute.prewarm_postgres()?; - let mut state = compute.state.lock().unwrap(); while state.status != ComputeStatus::ConfigurationPending { state = compute.state_changed.wait(state).unwrap(); @@ -236,7 +258,7 @@ fn main() -> Result<()> { // Start Postgres let mut delay_exit = false; let mut exit_code = None; - let pg = match compute.start_compute() { + let pg = match compute.start_compute(extension_server_port) { Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:?}", err); @@ -365,6 +387,12 @@ fn cli() -> clap::Command { .long("control-plane-uri") .value_name("CONTROL_PLANE_API_BASE_URI"), ) + .arg( + Arg::new("remote-ext-config") + .short('r') + .long("remote-ext-config") + .value_name("REMOTE_EXT_CONFIG"), + ) } #[test] diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 254d367a7167..ec2eb02237c9 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1,16 +1,20 @@ +use std::collections::HashMap; use std::fs; use std::io::BufRead; use std::os::unix::fs::PermissionsExt; use std::path::Path; use std::process::{Command, Stdio}; use std::str::FromStr; -use std::sync::{Condvar, Mutex}; +use std::sync::{Condvar, Mutex, OnceLock, RwLock}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; +use futures::future::join_all; use futures::stream::FuturesUnordered; use futures::StreamExt; use postgres::{Client, NoTls}; +use regex::Regex; +use tokio; use tokio_postgres; use tracing::{error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; @@ -20,10 +24,12 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus}; use compute_api::spec::{ComputeMode, ComputeSpec}; use utils::measured_stream::MeasuredReader; -use crate::config; +use remote_storage::{GenericRemoteStorage, RemotePath}; + use crate::pg_helpers::*; use crate::spec::*; use crate::sync_sk::{check_if_synced, ping_safekeeper}; +use crate::{config, extension_server}; /// Compute node info shared across several `compute_ctl` threads. pub struct ComputeNode { @@ -31,6 +37,7 @@ pub struct ComputeNode { pub connstr: url::Url, pub pgdata: String, pub pgbin: String, + pub pgversion: String, /// We should only allow live re- / configuration of the compute node if /// it uses 'pull model', i.e. it can go to control-plane and fetch /// the latest configuration. Otherwise, there could be a case: @@ -50,6 +57,24 @@ pub struct ComputeNode { pub state: Mutex, /// `Condvar` to allow notifying waiters about state changes. pub state_changed: Condvar, + /// the S3 bucket that we search for extensions in + pub ext_remote_storage: Option, + // (key: extension name, value: path to extension archive in remote storage) + pub ext_remote_paths: OnceLock>, + // (key: library name, value: name of extension containing this library) + pub library_index: OnceLock>, + // key: ext_archive_name, value: started download time, download_completed? + pub ext_download_progress: RwLock, bool)>>, + pub build_tag: String, +} + +// store some metrics about download size that might impact startup time +#[derive(Clone, Debug)] +pub struct RemoteExtensionMetrics { + num_ext_downloaded: u64, + largest_ext_size: u64, + total_ext_download_size: u64, + prep_extensions_ms: u64, } #[derive(Clone, Debug)] @@ -473,14 +498,22 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. 
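    /// The new `extension_server_port` argument is forwarded to
    /// `config::write_postgres_conf`, which writes it into `postgresql.conf` as
    /// `neon.extension_server_port`, so the Postgres side can reach the local
    /// `compute_ctl` HTTP endpoint (`POST /extension_server/...`) for on-demand
    /// extension downloads.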
#[instrument(skip_all)] - pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { + pub fn prepare_pgdata( + &self, + compute_state: &ComputeState, + extension_server_port: u16, + ) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); // Remove/create an empty pgdata directory and put configuration there. self.create_pgdata()?; - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &pspec.spec)?; + config::write_postgres_conf( + &pgdata_path.join("postgresql.conf"), + &pspec.spec, + Some(extension_server_port), + )?; // Syncing safekeepers is only safe with primary nodes: if a primary // is already connected it will be kicked out, so a secondary (standby) @@ -670,7 +703,7 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec)?; + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?; let mut client = Client::connect(self.connstr.as_str(), NoTls)?; self.pg_reload_conf(&mut client)?; @@ -700,7 +733,7 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute(&self) -> Result { + pub fn start_compute(&self, extension_server_port: u16) -> Result { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -711,7 +744,31 @@ impl ComputeNode { pspec.timeline_id, ); - self.prepare_pgdata(&compute_state)?; + // This part is sync, because we need to download + // remote shared_preload_libraries before postgres start (if any) + { + let library_load_start_time = Utc::now(); + let remote_ext_metrics = self.prepare_preload_libraries(&compute_state)?; + + let library_load_time = Utc::now() + .signed_duration_since(library_load_start_time) + .to_std() + .unwrap() + .as_millis() as u64; + let mut state = self.state.lock().unwrap(); + state.metrics.load_ext_ms = library_load_time; + state.metrics.num_ext_downloaded = remote_ext_metrics.num_ext_downloaded; + state.metrics.largest_ext_size = remote_ext_metrics.largest_ext_size; + state.metrics.total_ext_download_size = remote_ext_metrics.total_ext_download_size; + state.metrics.prep_extensions_ms = remote_ext_metrics.prep_extensions_ms; + info!( + "Loading shared_preload_libraries took {:?}ms", + library_load_time + ); + info!("{:?}", remote_ext_metrics); + } + + self.prepare_pgdata(&compute_state, extension_server_port)?; let start_time = Utc::now(); let pg = self.start_postgres(pspec.storage_auth_token.clone())?; @@ -859,4 +916,200 @@ LIMIT 100", "{{\"pg_stat_statements\": []}}".to_string() } } + + // If remote extension storage is configured, + // download extension control files + pub async fn prepare_external_extensions(&self, compute_state: &ComputeState) -> Result<()> { + if let Some(ref ext_remote_storage) = self.ext_remote_storage { + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + let spec = &pspec.spec; + let custom_ext = spec.custom_extensions.clone().unwrap_or(Vec::new()); + info!("custom extensions: {:?}", &custom_ext); + let (ext_remote_paths, library_index) = extension_server::get_available_extensions( + ext_remote_storage, + &self.pgbin, + &self.pgversion, + &custom_ext, + &self.build_tag, + ) + .await?; + self.ext_remote_paths + .set(ext_remote_paths) + .expect("this is the only time we set ext_remote_paths"); + self.library_index + .set(library_index) + .expect("this is the 
only time we set library_index"); + } + Ok(()) + } + + // download an archive, unzip and place files in correct locations + pub async fn download_extension(&self, ext_name: &str, is_library: bool) -> Result { + match &self.ext_remote_storage { + None => anyhow::bail!("No remote extension storage"), + Some(remote_storage) => { + let mut real_ext_name = ext_name.to_string(); + if is_library { + // sometimes library names might have a suffix like + // library.so or library.so.3. We strip this off + // because library_index is based on the name without the file extension + let strip_lib_suffix = Regex::new(r"\.so.*").unwrap(); + let lib_raw_name = strip_lib_suffix.replace(&real_ext_name, "").to_string(); + real_ext_name = self + .library_index + .get() + .expect("must have already downloaded the library_index")[&lib_raw_name] + .clone(); + } + + let ext_path = &self + .ext_remote_paths + .get() + .expect("error accessing ext_remote_paths")[&real_ext_name]; + let ext_archive_name = ext_path.object_name().expect("bad path"); + + let mut first_try = false; + if !self + .ext_download_progress + .read() + .expect("lock err") + .contains_key(ext_archive_name) + { + self.ext_download_progress + .write() + .expect("lock err") + .insert(ext_archive_name.to_string(), (Utc::now(), false)); + first_try = true; + } + let (download_start, download_completed) = + self.ext_download_progress.read().expect("lock err")[ext_archive_name]; + let start_time_delta = Utc::now() + .signed_duration_since(download_start) + .to_std() + .unwrap() + .as_millis() as u64; + + // how long to wait for extension download if it was started by another process + const HANG_TIMEOUT: u64 = 3000; // milliseconds + + if download_completed { + info!("extension already downloaded, skipping re-download"); + return Ok(0); + } else if start_time_delta < HANG_TIMEOUT && !first_try { + info!("download {ext_archive_name} already started by another process, hanging untill completion or timeout"); + let mut interval = + tokio::time::interval(tokio::time::Duration::from_millis(500)); + loop { + info!("waiting for download"); + interval.tick().await; + let (_, download_completed_now) = + self.ext_download_progress.read().expect("lock")[ext_archive_name]; + if download_completed_now { + info!("download finished by whoever else downloaded it"); + return Ok(0); + } + } + // NOTE: the above loop will get terminated + // based on the timeout of the download function + } + + // if extension hasn't been downloaded before or the previous + // attempt to download was at least HANG_TIMEOUT ms ago + // then we try to download it here + info!("downloading new extension {ext_archive_name}"); + + let download_size = extension_server::download_extension( + &real_ext_name, + ext_path, + remote_storage, + &self.pgbin, + ) + .await; + self.ext_download_progress + .write() + .expect("bad lock") + .insert(ext_archive_name.to_string(), (download_start, true)); + download_size + } + } + } + + #[tokio::main] + pub async fn prepare_preload_libraries( + &self, + compute_state: &ComputeState, + ) -> Result { + if self.ext_remote_storage.is_none() { + return Ok(RemoteExtensionMetrics { + num_ext_downloaded: 0, + largest_ext_size: 0, + total_ext_download_size: 0, + prep_extensions_ms: 0, + }); + } + let pspec = compute_state.pspec.as_ref().expect("spec must be set"); + let spec = &pspec.spec; + + info!("parse shared_preload_libraries from spec.cluster.settings"); + let mut libs_vec = Vec::new(); + if let Some(libs) = spec.cluster.settings.find("shared_preload_libraries") { 
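            // The GUC value looks like 'lib_a,lib_b'; split on quotes, commas and
            // spaces, drop empty tokens, and skip `neon` itself, which is never
            // fetched from remote extension storage.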
+ libs_vec = libs + .split(&[',', '\'', ' ']) + .filter(|s| *s != "neon" && !s.is_empty()) + .map(str::to_string) + .collect(); + } + info!("parse shared_preload_libraries from provided postgresql.conf"); + // that is used in neon_local and python tests + if let Some(conf) = &spec.cluster.postgresql_conf { + let conf_lines = conf.split('\n').collect::>(); + let mut shared_preload_libraries_line = ""; + for line in conf_lines { + if line.starts_with("shared_preload_libraries") { + shared_preload_libraries_line = line; + } + } + let mut preload_libs_vec = Vec::new(); + if let Some(libs) = shared_preload_libraries_line.split("='").nth(1) { + preload_libs_vec = libs + .split(&[',', '\'', ' ']) + .filter(|s| *s != "neon" && !s.is_empty()) + .map(str::to_string) + .collect(); + } + libs_vec.extend(preload_libs_vec); + } + + info!("Download ext_index.json, find the extension paths"); + let prep_ext_start_time = Utc::now(); + self.prepare_external_extensions(compute_state).await?; + let prep_ext_time_delta = Utc::now() + .signed_duration_since(prep_ext_start_time) + .to_std() + .unwrap() + .as_millis() as u64; + info!("Prepare extensions took {prep_ext_time_delta}ms"); + + info!("Downloading to shared preload libraries: {:?}", &libs_vec); + let mut download_tasks = Vec::new(); + for library in &libs_vec { + download_tasks.push(self.download_extension(library, true)); + } + let results = join_all(download_tasks).await; + + let mut remote_ext_metrics = RemoteExtensionMetrics { + num_ext_downloaded: 0, + largest_ext_size: 0, + total_ext_download_size: 0, + prep_extensions_ms: prep_ext_time_delta, + }; + for result in results { + let download_size = result?; + remote_ext_metrics.num_ext_downloaded += 1; + remote_ext_metrics.largest_ext_size = + std::cmp::max(remote_ext_metrics.largest_ext_size, download_size); + remote_ext_metrics.total_ext_download_size += download_size; + } + Ok(remote_ext_metrics) + } } diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index 68b943eec8a5..2da671a149bf 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -33,7 +33,11 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { } /// Create or completely rewrite configuration file specified by `path` -pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { +pub fn write_postgres_conf( + path: &Path, + spec: &ComputeSpec, + extension_server_port: Option, +) -> Result<()> { // File::create() destroys the file content if it exists. 
let mut file = File::create(path)?; @@ -87,5 +91,9 @@ pub fn write_postgres_conf(path: &Path, spec: &ComputeSpec) -> Result<()> { writeln!(file, "# Managed by compute_ctl: end")?; } + if let Some(port) = extension_server_port { + writeln!(file, "neon.extension_server_port={}", port)?; + } + Ok(()) } diff --git a/compute_tools/src/extension_server.rs b/compute_tools/src/extension_server.rs new file mode 100644 index 000000000000..c8710b057705 --- /dev/null +++ b/compute_tools/src/extension_server.rs @@ -0,0 +1,275 @@ +// Download extension files from the extension store +// and put them in the right place in the postgres directory (share / lib) +/* +The layout of the S3 bucket is as follows: +5615610098 // this is an extension build number +├── v14 +│   ├── extensions +│   │   ├── anon.tar.zst +│   │   └── embedding.tar.zst +│   └── ext_index.json +└── v15 + ├── extensions + │   ├── anon.tar.zst + │   └── embedding.tar.zst + └── ext_index.json +5615261079 +├── v14 +│   ├── extensions +│   │   └── anon.tar.zst +│   └── ext_index.json +└── v15 + ├── extensions + │   └── anon.tar.zst + └── ext_index.json +5623261088 +├── v14 +│   ├── extensions +│   │   └── embedding.tar.zst +│   └── ext_index.json +└── v15 + ├── extensions + │   └── embedding.tar.zst + └── ext_index.json + +Note that build number cannot be part of prefix because we might need extensions +from other build numbers. + +ext_index.json stores the control files and location of extension archives +It also stores a list of public extensions and a library_index + +We don't need to duplicate extension.tar.zst files. +We only need to upload a new one if it is updated. +(Although currently we just upload every time anyways, hopefully will change +this sometime) + +*access* is controlled by spec + +More specifically, here is an example ext_index.json +{ + "public_extensions": [ + "anon", + "pg_buffercache" + ], + "library_index": { + "anon": "anon", + "pg_buffercache": "pg_buffercache" + }, + "extension_data": { + "pg_buffercache": { + "control_data": { + "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true" + }, + "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst" + }, + "anon": { + "control_data": { + "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" + }, + "archive_path": "5670669815/v14/extensions/anon.tar.zst" + } + } +} +*/ +use anyhow::Context; +use anyhow::{self, Result}; +use futures::future::join_all; +use remote_storage::*; +use serde_json; +use std::collections::HashMap; +use std::io::Read; +use std::num::{NonZeroU32, NonZeroUsize}; +use std::path::Path; +use std::str; +use tar::Archive; +use tokio::io::AsyncReadExt; +use tracing::info; +use tracing::log::warn; +use zstd::stream::read::Decoder; + +fn get_pg_config(argument: &str, pgbin: &str) -> String { + // gives the result of `pg_config [argument]` + // where argument is a flag like `--version` or `--sharedir` + let pgconfig = pgbin + .strip_suffix("postgres") + .expect("bad pgbin") + .to_owned() + + "/pg_config"; + let config_output = std::process::Command::new(pgconfig) + .arg(argument) + .output() + .expect("pg_config error"); + std::str::from_utf8(&config_output.stdout) + 
.expect("pg_config error") + .trim() + .to_string() +} + +pub fn get_pg_version(pgbin: &str) -> String { + // pg_config --version returns a (platform specific) human readable string + // such as "PostgreSQL 15.4". We parse this to v14/v15 + let human_version = get_pg_config("--version", pgbin); + if human_version.contains("15") { + return "v15".to_string(); + } else if human_version.contains("14") { + return "v14".to_string(); + } + panic!("Unsuported postgres version {human_version}"); +} + +// download control files for enabled_extensions +// return Hashmaps converting library names to extension names (library_index) +// and specifying the remote path to the archive for each extension name +pub async fn get_available_extensions( + remote_storage: &GenericRemoteStorage, + pgbin: &str, + pg_version: &str, + custom_extensions: &[String], + build_tag: &str, +) -> Result<(HashMap, HashMap)> { + let local_sharedir = Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"); + let index_path = format!("{build_tag}/{pg_version}/ext_index.json"); + let index_path = RemotePath::new(Path::new(&index_path)).context("error forming path")?; + info!("download ext_index.json from: {:?}", &index_path); + + let mut download = remote_storage.download(&index_path).await?; + let mut ext_idx_buffer = Vec::new(); + download + .download_stream + .read_to_end(&mut ext_idx_buffer) + .await?; + info!("ext_index downloaded"); + + #[derive(Debug, serde::Deserialize)] + struct Index { + public_extensions: Vec, + library_index: HashMap, + extension_data: HashMap, + } + + #[derive(Debug, serde::Deserialize)] + struct ExtensionData { + control_data: HashMap, + archive_path: String, + } + + let ext_index_full = serde_json::from_slice::(&ext_idx_buffer)?; + let mut enabled_extensions = ext_index_full.public_extensions; + enabled_extensions.extend_from_slice(custom_extensions); + let library_index = ext_index_full.library_index; + let all_extension_data = ext_index_full.extension_data; + info!("library_index: {:?}", library_index); + + info!("enabled_extensions: {:?}", enabled_extensions); + let mut ext_remote_paths = HashMap::new(); + let mut file_create_tasks = Vec::new(); + for extension in enabled_extensions { + let ext_data = &all_extension_data[&extension]; + for (control_file, control_contents) in &ext_data.control_data { + let extension_name = control_file + .strip_suffix(".control") + .expect("control files must end in .control"); + ext_remote_paths.insert( + extension_name.to_string(), + RemotePath::from_string(&ext_data.archive_path)?, + ); + let control_path = local_sharedir.join(control_file); + info!("writing file {:?}{:?}", control_path, control_contents); + file_create_tasks.push(tokio::fs::write(control_path, control_contents)); + } + } + let results = join_all(file_create_tasks).await; + for result in results { + result?; + } + info!("ext_remote_paths {:?}", ext_remote_paths); + Ok((ext_remote_paths, library_index)) +} + +// download the archive for a given extension, +// unzip it, and place files in the appropriate locations (share/lib) +pub async fn download_extension( + ext_name: &str, + ext_path: &RemotePath, + remote_storage: &GenericRemoteStorage, + pgbin: &str, +) -> Result { + info!("Download extension {:?} from {:?}", ext_name, ext_path); + let mut download = remote_storage.download(ext_path).await?; + let mut download_buffer = Vec::new(); + download + .download_stream + .read_to_end(&mut download_buffer) + .await?; + let download_size = download_buffer.len() as u64; + // it's unclear 
whether it is more performant to decompress into memory or not + // TODO: decompressing into memory can be avoided + let mut decoder = Decoder::new(download_buffer.as_slice())?; + let mut decompress_buffer = Vec::new(); + decoder.read_to_end(&mut decompress_buffer)?; + let mut archive = Archive::new(decompress_buffer.as_slice()); + let unzip_dest = pgbin + .strip_suffix("/bin/postgres") + .expect("bad pgbin") + .to_string() + + "/download_extensions"; + archive.unpack(&unzip_dest)?; + info!("Download + unzip {:?} completed successfully", &ext_path); + + let sharedir_paths = ( + unzip_dest.to_string() + "/share/extension", + Path::new(&get_pg_config("--sharedir", pgbin)).join("extension"), + ); + let libdir_paths = ( + unzip_dest.to_string() + "/lib", + Path::new(&get_pg_config("--libdir", pgbin)).join("postgresql"), + ); + // move contents of the libdir / sharedir in unzipped archive to the correct local paths + for paths in [sharedir_paths, libdir_paths] { + let (zip_dir, real_dir) = paths; + info!("mv {zip_dir:?}/* {real_dir:?}"); + for file in std::fs::read_dir(zip_dir)? { + let old_file = file?.path(); + let new_file = + Path::new(&real_dir).join(old_file.file_name().context("error parsing file")?); + info!("moving {old_file:?} to {new_file:?}"); + + // extension download failed: Directory not empty (os error 39) + match std::fs::rename(old_file, new_file) { + Ok(()) => info!("move succeeded"), + Err(e) => { + warn!("move failed, probably because the extension already exists: {e}") + } + } + } + } + info!("done moving extension {ext_name}"); + Ok(download_size) +} + +// This function initializes the necessary structs to use remote storage +pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result { + #[derive(Debug, serde::Deserialize)] + struct RemoteExtJson { + bucket: String, + region: String, + endpoint: Option, + prefix: Option, + } + let remote_ext_json = serde_json::from_str::(remote_ext_config)?; + + let config = S3Config { + bucket_name: remote_ext_json.bucket, + bucket_region: remote_ext_json.region, + prefix_in_bucket: remote_ext_json.prefix, + endpoint: remote_ext_json.endpoint, + concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"), + max_keys_per_list_response: None, + }; + let config = RemoteStorageConfig { + max_concurrent_syncs: NonZeroUsize::new(100).expect("100 != 0"), + max_sync_errors: NonZeroU32::new(100).expect("100 != 0"), + storage: RemoteStorageKind::AwsS3(config), + }; + GenericRemoteStorage::from_config(&config) +} diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index afd9c2fb5479..af07412b5250 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -121,6 +121,37 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving {:?} POST request", route); + info!("req.uri {:?}", req.uri()); + + let mut is_library = false; + if let Some(params) = req.uri().query() { + info!("serving {:?} POST request with params: {}", route, params); + if params == "is_library=true" { + is_library = true; + } else { + let mut resp = Response::new(Body::from("Wrong request parameters")); + *resp.status_mut() = StatusCode::BAD_REQUEST; + return resp; + } + } + + let filename = route.split('/').last().unwrap().to_string(); + info!("serving /extension_server POST request, filename: {filename:?} is_library: {is_library}"); + + match compute.download_extension(&filename, is_library).await { + Ok(_) => Response::new(Body::from("OK")), + Err(e) => { + error!("extension download failed: {}", e); + 
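                    // Propagate the error text in the 500 response body so the
                    // caller can see why the extension download failed.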
let mut resp = Response::new(Body::from(e.to_string())); + *resp.status_mut() = StatusCode::INTERNAL_SERVER_ERROR; + resp + } + } + } + // Return the `404 Not Found` for any other routes. _ => { let mut not_found = Response::new(Body::from("404 Not Found")); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index 268026975630..dc26cc63eb8d 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -139,6 +139,34 @@ paths: application/json: schema: $ref: "#/components/schemas/GenericError" + /extension_server: + post: + tags: + - Extension + summary: Download extension from S3 to local folder. + description: "" + operationId: downloadExtension + responses: + 200: + description: Extension downloaded + content: + text/plain: + schema: + type: string + description: Error text or 'OK' if download succeeded. + example: "OK" + 400: + description: Request is invalid. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" + 500: + description: Extension download request failed. + content: + application/json: + schema: + $ref: "#/components/schemas/GenericError" components: securitySchemes: diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 1d7b09f095c1..1cd960208910 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -9,6 +9,7 @@ pub mod http; #[macro_use] pub mod logger; pub mod compute; +pub mod extension_server; pub mod monitor; pub mod params; pub mod pg_helpers; diff --git a/compute_tools/src/spec.rs b/compute_tools/src/spec.rs index 575a5332a82d..eff7c93b461a 100644 --- a/compute_tools/src/spec.rs +++ b/compute_tools/src/spec.rs @@ -124,7 +124,7 @@ pub fn get_spec_from_control_plane( pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> { // File `postgresql.conf` is no longer included into `basebackup`, so just // always write all config into it creating new file. - config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec)?; + config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?; update_pg_hba(pgdata_path)?; diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index a341ff0263bb..d2c99c5f364e 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -32,3 +32,4 @@ utils.workspace = true compute_api.workspace = true workspace_hack.workspace = true +tracing.workspace = true diff --git a/control_plane/src/bin/neon_local.rs b/control_plane/src/bin/neon_local.rs index 8995a1856468..8f71cb65e2b2 100644 --- a/control_plane/src/bin/neon_local.rs +++ b/control_plane/src/bin/neon_local.rs @@ -658,6 +658,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( .get_one::("endpoint_id") .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?; + let remote_ext_config = sub_args.get_one::("remote-ext-config"); + // If --safekeepers argument is given, use only the listed safekeeper nodes. 
let safekeepers = if let Some(safekeepers_str) = sub_args.get_one::("safekeepers") { @@ -699,7 +701,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( _ => {} } println!("Starting existing endpoint {endpoint_id}..."); - endpoint.start(&auth_token, safekeepers)?; + endpoint.start(&auth_token, safekeepers, remote_ext_config)?; } else { let branch_name = sub_args .get_one::("branch-name") @@ -743,7 +745,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<( pg_version, mode, )?; - ep.start(&auth_token, safekeepers)?; + ep.start(&auth_token, safekeepers, remote_ext_config)?; } } "stop" => { @@ -1003,6 +1005,12 @@ fn cli() -> Command { .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more") .required(false); + let remote_ext_config_args = Arg::new("remote-ext-config") + .long("remote-ext-config") + .num_args(1) + .help("Configure the S3 bucket that we search for extensions in.") + .required(false); + let lsn_arg = Arg::new("lsn") .long("lsn") .help("Specify Lsn on the timeline to start from. By default, end of the timeline would be used.") @@ -1161,6 +1169,7 @@ fn cli() -> Command { .arg(pg_version_arg) .arg(hot_standby_arg) .arg(safekeepers_arg) + .arg(remote_ext_config_args) ) .subcommand( Command::new("stop") diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 8b5c88bf01e5..60607994588b 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -313,7 +313,7 @@ impl Endpoint { // TODO: use future host field from safekeeper spec // Pass the list of safekeepers to the replica so that it can connect to any of them, - // whichever is availiable. + // whichever is available. let sk_ports = self .env .safekeepers @@ -420,7 +420,12 @@ impl Endpoint { Ok(()) } - pub fn start(&self, auth_token: &Option, safekeepers: Vec) -> Result<()> { + pub fn start( + &self, + auth_token: &Option, + safekeepers: Vec, + remote_ext_config: Option<&String>, + ) -> Result<()> { if self.status() == "running" { anyhow::bail!("The endpoint is already running"); } @@ -488,6 +493,7 @@ impl Endpoint { pageserver_connstring: Some(pageserver_connstring), safekeeper_connstrings, storage_auth_token: auth_token.clone(), + custom_extensions: Some(vec![]), }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; @@ -519,6 +525,11 @@ impl Endpoint { .stdin(std::process::Stdio::null()) .stderr(logfile.try_clone()?) .stdout(logfile); + + if let Some(remote_ext_config) = remote_ext_config { + cmd.args(["--remote-ext-config", remote_ext_config]); + } + let child = cmd.spawn()?; // Write down the pid so we can wait for it when we want to stop diff --git a/docs/rfcs/024-extension-loading.md b/docs/rfcs/024-extension-loading.md new file mode 100644 index 000000000000..26ba4f792700 --- /dev/null +++ b/docs/rfcs/024-extension-loading.md @@ -0,0 +1,236 @@ +# Supporting custom user Extensions (Dynamic Extension Loading) +Created 2023-05-03 + +## Motivation + +There are many extensions in the PostgreSQL ecosystem, and not all extensions +are of a quality that we can confidently support them. 
Additionally, our
+current extension inclusion mechanism has several problems because we build all
+extensions into the primary Compute image: We build the extensions every time
+we build the compute image regardless of whether we actually need to rebuild
+the image, and the inclusion of these extensions in the image adds a hard
+dependency on all supported extensions - thus increasing the image size, and
+with it the time it takes to download that image - increasing first start
+latency.
+
+This RFC proposes a dynamic loading mechanism that solves most of these
+problems.
+
+## Summary
+
+`compute_ctl` is made responsible for loading dynamically loaded extensions
+into the container's file system on demand, and will also make sure that the
+extensions in `shared_preload_libraries` are downloaded before the compute
+node starts.
+
+## Components
+
+compute_ctl, PostgreSQL, neon (extension), Compute Host Node, Extension Store
+
+## Requirements
+
+Compute nodes with no extra extensions should not be negatively impacted by
+the existence of support for many extensions.
+
+Installing an extension into PostgreSQL should be easy.
+
+Non-preloaded extensions shouldn't impact startup latency.
+
+Uninstalled extensions shouldn't impact query latency.
+
+A small latency penalty for dynamically loaded extensions is acceptable in
+the first seconds of compute startup, but not in steady-state operations.
+
+## Proposed implementation
+
+### On-demand, JIT-loading of extensions
+
+Before postgres starts, we download:
+- control files for all extensions available to that compute node;
+- all `shared_preload_libraries`.
+
+After postgres is running, `compute_ctl` listens for requests to load files.
+When PostgreSQL requests a file, `compute_ctl` downloads it.
+
+PostgreSQL requests files in the following cases:
+- When loading a preload library set in `local_preload_libraries`
+- When explicitly loading a library with `LOAD`
+- When creating an extension with `CREATE EXTENSION` (download SQL scripts,
+  (optional) extension data files, and (optional) library files)
+
+#### Summary
+
+Pros:
+ - Startup is only as slow as it takes to load all (shared_)preload_libraries
+ - Supports BYO Extension
+
+Cons:
+ - O(sizeof(extensions)) IO requirement for loading all extensions.
+
+### Alternative solutions
+
+1. Allow users to add their extensions to the base image
+
+   Pros:
+   - Easy to deploy
+
+   Cons:
+   - Doesn't scale - first start time is dependent on image size;
+   - All extensions are shared across all users: it doesn't allow users to
+     bring their own restrictively licensed extensions
+
+2. Bring Your Own compute image
+
+   Pros:
+   - Still easy to deploy
+   - User can bring own patched version of PostgreSQL
+
+   Cons:
+   - First start latency is O(sizeof(extensions image))
+   - Warm instance pool for skipping pod schedule latency is not feasible with
+     O(n) custom images
+   - Support channels are difficult to manage
+
+3. Download all user extensions in bulk on compute start
+
+   Pros:
+   - Easy to deploy
+   - No startup latency issues for "clean" users.
+   - Warm instance pool for skipping pod schedule latency is possible
+
+   Cons:
+   - Downloading all extensions in advance takes a lot of time, thus startup
+     latency issues
+
+4.
Store user's extensions in persistent storage + + Pros: + - Easy to deploy + - No startup latency issues + - Warm instance pool for skipping pod schedule latency is possible + + Cons: + - EC2 instances have only limited number of attachments shared between EBS + volumes, direct-attached NVMe drives, and ENIs. + - Compute instance migration isn't trivially solved for EBS mounts (e.g. + the device is unavailable whilst moving the mount between instances). + - EBS can only mount on one instance at a time (except the expensive IO2 + device type). + +5. Store user's extensions in network drive + + Pros: + - Easy to deploy + - Few startup latency issues + - Warm instance pool for skipping pod schedule latency is possible + + Cons: + - We'd need networked drives, and a lot of them, which would store many + duplicate extensions. + - **UNCHECKED:** Compute instance migration may not work nicely with + networked IOs + + +### Idea extensions + +The extension store does not have to be S3 directly, but could be a Node-local +caching service on top of S3. This would reduce the load on the network for +popular extensions. + +## Extension Storage implementation + +The layout of the S3 bucket is as follows: +``` +5615610098 // this is an extension build number +├── v14 +│   ├── extensions +│   │   ├── anon.tar.zst +│   │   └── embedding.tar.zst +│   └── ext_index.json +└── v15 + ├── extensions + │   ├── anon.tar.zst + │   └── embedding.tar.zst + └── ext_index.json +5615261079 +├── v14 +│   ├── extensions +│   │   └── anon.tar.zst +│   └── ext_index.json +└── v15 + ├── extensions + │   └── anon.tar.zst + └── ext_index.json +5623261088 +├── v14 +│   ├── extensions +│   │   └── embedding.tar.zst +│   └── ext_index.json +└── v15 + ├── extensions + │   └── embedding.tar.zst + └── ext_index.json +``` + +Note that build number cannot be part of prefix because we might need extensions +from other build numbers. + +`ext_index.json` stores the control files and location of extension archives. +It also stores a list of public extensions and a library_index + +We don't need to duplicate `extension.tar.zst`` files. +We only need to upload a new one if it is updated. +(Although currently we just upload every time anyways, hopefully will change +this sometime) + +*access* is controlled by spec + +More specifically, here is an example ext_index.json +``` +{ + "public_extensions": [ + "anon", + "pg_buffercache" + ], + "library_index": { + "anon": "anon", + "pg_buffercache": "pg_buffercache" + // for more complex extensions like postgis + // we might have something like: + // address_standardizer: postgis + // postgis_tiger: postgis + }, + "extension_data": { + "pg_buffercache": { + "control_data": { + "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true" + }, + "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst" + }, + "anon": { + "control_data": { + "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" + }, + "archive_path": "5670669815/v14/extensions/anon.tar.zst" + } + } +} +``` + +### How to add new extension to the Extension Storage? + +Simply upload build artifacts to the S3 bucket. +Implement a CI step for that. 
Splitting it from compute-node-image build. + +### How do we deal with extension versions and updates? + +Currently, we rebuild extensions on every compute-node-image build and store them in the prefix. +This is needed to ensure that `/share` and `/lib` files are in sync. + +For extension updates, we rely on the PostgreSQL extension versioning mechanism (sql update scripts) and extension authors to not break backwards compatibility within one major version of PostgreSQL. + +### Alternatives + +For extensions written on trusted languages we can also adopt +`dbdev` PostgreSQL Package Manager based on `pg_tle` by Supabase. +This will increase the amount supported extensions and decrease the amount of work required to support them. diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index f2865b71ec6d..9522d7138f79 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -76,6 +76,11 @@ pub struct ComputeMetrics { pub start_postgres_ms: u64, pub config_ms: u64, pub total_startup_ms: u64, + pub load_ext_ms: u64, + pub num_ext_downloaded: u64, + pub largest_ext_size: u64, // these are measured in bytes + pub total_ext_download_size: u64, + pub prep_extensions_ms: u64, } /// Response of the `/computes/{compute_id}/spec` control-plane API. diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index b3f0e9ba4350..293f6dc29479 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -60,6 +60,9 @@ pub struct ComputeSpec { /// If set, 'storage_auth_token' is used as the password to authenticate to /// the pageserver and safekeepers. pub storage_auth_token: Option, + + // list of prefixes to search for custom extensions in remote extension storage + pub custom_extensions: Option>, } #[serde_as] diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 3a1b396d63db..418885a1b0e9 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -57,7 +57,7 @@ pub struct Event { pub extra: Extra, } -pub fn idempotency_key(node_id: String) -> String { +pub fn idempotency_key(node_id: &str) -> String { format!( "{}-{}-{:04}", Utc::now(), @@ -71,6 +71,6 @@ pub const CHUNK_SIZE: usize = 1000; // Just a wrapper around a slice of events // to serialize it as `{"events" : [ ] } #[derive(serde::Serialize)] -pub struct EventChunk<'a, T> { - pub events: &'a [T], +pub struct EventChunk<'a, T: Clone> { + pub events: std::borrow::Cow<'a, [T]>, } diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index 0877a38dd9b7..a4adae614698 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -20,6 +20,7 @@ tokio = { workspace = true, features = ["sync", "fs", "io-util"] } tokio-util.workspace = true toml_edit.workspace = true tracing.workspace = true +scopeguard.workspace = true metrics.workspace = true utils.workspace = true pin-project-lite.workspace = true diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 92ef793a345e..1ddd156a087e 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -65,6 +65,10 @@ impl RemotePath { Ok(Self(relative_path.to_path_buf())) } + pub fn from_string(relative_path: &str) -> anyhow::Result { + Self::new(Path::new(relative_path)) + } + pub fn with_base(&self, base_path: &Path) -> PathBuf { base_path.join(&self.0) } @@ -190,6 +194,20 @@ pub enum GenericRemoteStorage { } impl GenericRemoteStorage { + // A function for 
listing all the files in a "directory" + // Example: + // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"] + pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { + match self { + Self::LocalFs(s) => s.list_files(folder).await, + Self::AwsS3(s) => s.list_files(folder).await, + Self::Unreliable(s) => s.list_files(folder).await, + } + } + + // lists common *prefixes*, if any of files + // Example: + // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"] pub async fn list_prefixes( &self, prefix: Option<&RemotePath>, @@ -201,14 +219,6 @@ impl GenericRemoteStorage { } } - pub async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { - match self { - Self::LocalFs(s) => s.list_files(folder).await, - Self::AwsS3(s) => s.list_files(folder).await, - Self::Unreliable(s) => s.list_files(folder).await, - } - } - pub async fn upload( &self, from: impl io::AsyncRead + Unpin + Send + Sync + 'static, diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 37a6bf23e8b5..b79d8566bb04 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -22,6 +22,7 @@ use aws_sdk_s3::{ }; use aws_smithy_http::body::SdkBody; use hyper::Body; +use scopeguard::ScopeGuard; use tokio::{ io::{self, AsyncRead}, sync::Semaphore, @@ -36,82 +37,9 @@ use crate::{ const MAX_DELETE_OBJECTS_REQUEST_SIZE: usize = 1000; -pub(super) mod metrics { - use metrics::{register_int_counter_vec, IntCounterVec}; - use once_cell::sync::Lazy; +pub(super) mod metrics; - static S3_REQUESTS_COUNT: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "remote_storage_s3_requests_count", - "Number of s3 requests of particular type", - &["request_type"], - ) - .expect("failed to define a metric") - }); - - static S3_REQUESTS_FAIL_COUNT: Lazy = Lazy::new(|| { - register_int_counter_vec!( - "remote_storage_s3_failures_count", - "Number of failed s3 requests of particular type", - &["request_type"], - ) - .expect("failed to define a metric") - }); - - pub fn inc_get_object() { - S3_REQUESTS_COUNT.with_label_values(&["get_object"]).inc(); - } - - pub fn inc_get_object_fail() { - S3_REQUESTS_FAIL_COUNT - .with_label_values(&["get_object"]) - .inc(); - } - - pub fn inc_put_object() { - S3_REQUESTS_COUNT.with_label_values(&["put_object"]).inc(); - } - - pub fn inc_put_object_fail() { - S3_REQUESTS_FAIL_COUNT - .with_label_values(&["put_object"]) - .inc(); - } - - pub fn inc_delete_object() { - S3_REQUESTS_COUNT - .with_label_values(&["delete_object"]) - .inc(); - } - - pub fn inc_delete_objects(count: u64) { - S3_REQUESTS_COUNT - .with_label_values(&["delete_object"]) - .inc_by(count); - } - - pub fn inc_delete_object_fail() { - S3_REQUESTS_FAIL_COUNT - .with_label_values(&["delete_object"]) - .inc(); - } - - pub fn inc_delete_objects_fail(count: u64) { - S3_REQUESTS_FAIL_COUNT - .with_label_values(&["delete_object"]) - .inc_by(count); - } - - pub fn inc_list_objects() { - S3_REQUESTS_COUNT.with_label_values(&["list_objects"]).inc(); - } - - pub fn inc_list_objects_fail() { - S3_REQUESTS_FAIL_COUNT - .with_label_values(&["list_objects"]) - .inc(); - } -} +use self::metrics::{AttemptOutcome, RequestKind}; /// AWS S3 storage. 
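/// Every request acquires a permit from the internal concurrency limiter and is
/// timed into `metrics::BUCKET_METRICS` (see `permit` / `owned_permit` and the
/// `start_measuring_requests` / `start_counting_cancelled_wait` helpers below).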
pub struct S3Bucket { @@ -213,17 +141,46 @@ impl S3Bucket { } } - async fn download_object(&self, request: GetObjectRequest) -> Result { + async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> { + let started_at = start_counting_cancelled_wait(kind); + let permit = self + .concurrency_limiter + .acquire() + .await + .expect("semaphore is never closed"); + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .wait_seconds + .observe_elapsed(kind, started_at); + + permit + } + + async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit { + let started_at = start_counting_cancelled_wait(kind); let permit = self .concurrency_limiter .clone() .acquire_owned() .await - .context("Concurrency limiter semaphore got closed during S3 download") - .map_err(DownloadError::Other)?; + .expect("semaphore is never closed"); + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .wait_seconds + .observe_elapsed(kind, started_at); + permit + } + + async fn download_object(&self, request: GetObjectRequest) -> Result { + let kind = RequestKind::Get; + let permit = self.owned_permit(kind).await; metrics::inc_get_object(); + let started_at = start_measuring_requests(kind); + let get_object = self .client .get_object() @@ -233,26 +190,34 @@ impl S3Bucket { .send() .await; + let started_at = ScopeGuard::into_inner(started_at); + + if get_object.is_err() { + metrics::inc_get_object_fail(); + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Err, + started_at, + ); + } + match get_object { Ok(object_output) => { let metadata = object_output.metadata().cloned().map(StorageMetadata); Ok(Download { metadata, - download_stream: Box::pin(io::BufReader::new(RatelimitedAsyncRead::new( - permit, - object_output.body.into_async_read(), + download_stream: Box::pin(io::BufReader::new(TimedDownload::new( + started_at, + RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()), ))), }) } Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => { Err(DownloadError::NotFound) } - Err(e) => { - metrics::inc_get_object_fail(); - Err(DownloadError::Other(anyhow::anyhow!( - "Failed to download S3 object: {e}" - ))) - } + Err(e) => Err(DownloadError::Other( + anyhow::Error::new(e).context("download s3 object"), + )), } } } @@ -283,6 +248,54 @@ impl AsyncRead for RatelimitedAsyncRead { } } +pin_project_lite::pin_project! { + /// Times and tracks the outcome of the request. 
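    /// The outcome starts out as `Cancelled`; it flips to `Ok` once the body has
    /// been read to EOF, or to `Err` on a read error, and whatever value is current
    /// when the stream is dropped is what gets recorded via `PinnedDrop`.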
+ struct TimedDownload { + started_at: std::time::Instant, + outcome: metrics::AttemptOutcome, + #[pin] + inner: S + } + + impl PinnedDrop for TimedDownload { + fn drop(mut this: Pin<&mut Self>) { + metrics::BUCKET_METRICS.req_seconds.observe_elapsed(RequestKind::Get, this.outcome, this.started_at); + } + } +} + +impl TimedDownload { + fn new(started_at: std::time::Instant, inner: S) -> Self { + TimedDownload { + started_at, + outcome: metrics::AttemptOutcome::Cancelled, + inner, + } + } +} + +impl AsyncRead for TimedDownload { + fn poll_read( + self: std::pin::Pin<&mut Self>, + cx: &mut std::task::Context<'_>, + buf: &mut io::ReadBuf<'_>, + ) -> std::task::Poll> { + let this = self.project(); + let before = buf.filled().len(); + let read = std::task::ready!(this.inner.poll_read(cx, buf)); + + let read_eof = buf.filled().len() == before; + + match read { + Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok, + Ok(()) => { /* still in progress */ } + Err(_) => *this.outcome = AttemptOutcome::Err, + } + + std::task::Poll::Ready(read) + } +} + #[async_trait::async_trait] impl RemoteStorage for S3Bucket { /// See the doc for `RemoteStorage::list_prefixes` @@ -291,6 +304,8 @@ impl RemoteStorage for S3Bucket { &self, prefix: Option<&RemotePath>, ) -> Result, DownloadError> { + let kind = RequestKind::List; + // get the passed prefix or if it is not set use prefix_in_bucket value let list_prefix = prefix .map(|p| self.relative_path_to_s3_object(p)) @@ -307,15 +322,11 @@ impl RemoteStorage for S3Bucket { let mut document_keys = Vec::new(); let mut continuation_token = None; - loop { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 list") - .map_err(DownloadError::Other)?; + loop { + let _guard = self.permit(kind).await; metrics::inc_list_objects(); + let started_at = start_measuring_requests(kind); let fetch_response = self .client @@ -332,7 +343,15 @@ impl RemoteStorage for S3Bucket { e }) .context("Failed to list S3 prefixes") - .map_err(DownloadError::Other)?; + .map_err(DownloadError::Other); + + let started_at = ScopeGuard::into_inner(started_at); + + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &fetch_response, started_at); + + let fetch_response = fetch_response?; document_keys.extend( fetch_response @@ -342,10 +361,10 @@ impl RemoteStorage for S3Bucket { .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))), ); - match fetch_response.next_continuation_token { - Some(new_token) => continuation_token = Some(new_token), + continuation_token = match fetch_response.next_continuation_token { + Some(new_token) => Some(new_token), None => break, - } + }; } Ok(document_keys) @@ -353,6 +372,8 @@ impl RemoteStorage for S3Bucket { /// See the doc for `RemoteStorage::list_files` async fn list_files(&self, folder: Option<&RemotePath>) -> anyhow::Result> { + let kind = RequestKind::List; + let folder_name = folder .map(|p| self.relative_path_to_s3_object(p)) .or_else(|| self.prefix_in_bucket.clone()); @@ -361,12 +382,9 @@ impl RemoteStorage for S3Bucket { let mut continuation_token = None; let mut all_files = vec![]; loop { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 list_files")?; + let _guard = self.permit(kind).await; metrics::inc_list_objects(); + let started_at = start_measuring_requests(kind); let response = self .client @@ -381,7 +399,14 @@ impl RemoteStorage for S3Bucket { 
metrics::inc_list_objects_fail(); e }) - .context("Failed to list files in S3 bucket")?; + .context("Failed to list files in S3 bucket"); + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &response, started_at); + + let response = response?; for object in response.contents().unwrap_or_default() { let object_path = object.key().expect("response does not contain a key"); @@ -403,18 +428,17 @@ impl RemoteStorage for S3Bucket { to: &RemotePath, metadata: Option, ) -> anyhow::Result<()> { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 upload")?; + let kind = RequestKind::Put; + let _guard = self.permit(kind).await; metrics::inc_put_object(); + let started_at = start_measuring_requests(kind); let body = Body::wrap_stream(ReaderStream::new(from)); let bytes_stream = ByteStream::new(SdkBody::from(body)); - self.client + let res = self + .client .put_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(to)) @@ -426,7 +450,15 @@ impl RemoteStorage for S3Bucket { .map_err(|e| { metrics::inc_put_object_fail(); e - })?; + }); + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + res?; + Ok(()) } @@ -463,11 +495,8 @@ impl RemoteStorage for S3Bucket { .await } async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 delete")?; + let kind = RequestKind::Delete; + let _guard = self.permit(kind).await; let mut delete_objects = Vec::with_capacity(paths.len()); for path in paths { @@ -479,6 +508,7 @@ impl RemoteStorage for S3Bucket { for chunk in delete_objects.chunks(MAX_DELETE_OBJECTS_REQUEST_SIZE) { metrics::inc_delete_objects(chunk.len() as u64); + let started_at = start_measuring_requests(kind); let resp = self .client @@ -488,6 +518,11 @@ impl RemoteStorage for S3Bucket { .send() .await; + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &resp, started_at); + match resp { Ok(resp) => { if let Some(errors) = resp.errors { @@ -508,15 +543,14 @@ impl RemoteStorage for S3Bucket { } async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> { - let _guard = self - .concurrency_limiter - .acquire() - .await - .context("Concurrency limiter semaphore got closed during S3 delete")?; + let kind = RequestKind::Delete; + let _guard = self.permit(kind).await; metrics::inc_delete_object(); + let started_at = start_measuring_requests(kind); - self.client + let res = self + .client .delete_object() .bucket(self.bucket_name.clone()) .key(self.relative_path_to_s3_object(path)) @@ -525,11 +559,41 @@ impl RemoteStorage for S3Bucket { .map_err(|e| { metrics::inc_delete_object_fail(); e - })?; + }); + + let started_at = ScopeGuard::into_inner(started_at); + metrics::BUCKET_METRICS + .req_seconds + .observe_elapsed(kind, &res, started_at); + + res?; + Ok(()) } } +/// On drop (cancellation) count towards [`metrics::BucketMetrics::cancelled_waits`]. 
+fn start_counting_cancelled_wait( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |_| { + metrics::BUCKET_METRICS.cancelled_waits.get(kind).inc() + }) +} + +/// On drop (cancellation) add time to [`metrics::BucketMetrics::req_seconds`]. +fn start_measuring_requests( + kind: RequestKind, +) -> ScopeGuard { + scopeguard::guard_on_success(std::time::Instant::now(), move |started_at| { + metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Cancelled, + started_at, + ) + }) +} + #[cfg(test)] mod tests { use std::num::NonZeroUsize; diff --git a/libs/remote_storage/src/s3_bucket/metrics.rs b/libs/remote_storage/src/s3_bucket/metrics.rs new file mode 100644 index 000000000000..4b7562867a81 --- /dev/null +++ b/libs/remote_storage/src/s3_bucket/metrics.rs @@ -0,0 +1,243 @@ +use metrics::{register_histogram_vec, register_int_counter_vec, Histogram, IntCounter}; +use once_cell::sync::Lazy; + +pub(super) static BUCKET_METRICS: Lazy = Lazy::new(Default::default); + +#[derive(Clone, Copy, Debug)] +pub(super) enum RequestKind { + Get = 0, + Put = 1, + Delete = 2, + List = 3, +} + +use RequestKind::*; + +impl RequestKind { + const fn as_str(&self) -> &'static str { + match self { + Get => "get_object", + Put => "put_object", + Delete => "delete_object", + List => "list_objects", + } + } + const fn as_index(&self) -> usize { + *self as usize + } +} + +pub(super) struct RequestTyped([C; 4]); + +impl RequestTyped { + pub(super) fn get(&self, kind: RequestKind) -> &C { + &self.0[kind.as_index()] + } + + fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self { + use RequestKind::*; + let mut it = [Get, Put, Delete, List].into_iter(); + let arr = std::array::from_fn::(|index| { + let next = it.next().unwrap(); + assert_eq!(index, next.as_index()); + f(next) + }); + + if let Some(next) = it.next() { + panic!("unexpected {next:?}"); + } + + RequestTyped(arr) + } +} + +impl RequestTyped { + pub(super) fn observe_elapsed(&self, kind: RequestKind, started_at: std::time::Instant) { + self.get(kind).observe(started_at.elapsed().as_secs_f64()) + } +} + +pub(super) struct PassFailCancelledRequestTyped { + success: RequestTyped, + fail: RequestTyped, + cancelled: RequestTyped, +} + +#[derive(Debug, Clone, Copy)] +pub(super) enum AttemptOutcome { + Ok, + Err, + Cancelled, +} + +impl From<&Result> for AttemptOutcome { + fn from(value: &Result) -> Self { + match value { + Ok(_) => AttemptOutcome::Ok, + Err(_) => AttemptOutcome::Err, + } + } +} + +impl AttemptOutcome { + pub(super) fn as_str(&self) -> &'static str { + match self { + AttemptOutcome::Ok => "ok", + AttemptOutcome::Err => "err", + AttemptOutcome::Cancelled => "cancelled", + } + } +} + +impl PassFailCancelledRequestTyped { + pub(super) fn get(&self, kind: RequestKind, outcome: AttemptOutcome) -> &C { + let target = match outcome { + AttemptOutcome::Ok => &self.success, + AttemptOutcome::Err => &self.fail, + AttemptOutcome::Cancelled => &self.cancelled, + }; + target.get(kind) + } + + fn build_with(mut f: impl FnMut(RequestKind, AttemptOutcome) -> C) -> Self { + let success = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Ok)); + let fail = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Err)); + let cancelled = RequestTyped::build_with(|kind| f(kind, AttemptOutcome::Cancelled)); + + PassFailCancelledRequestTyped { + success, + fail, + cancelled, + } + } +} + +impl PassFailCancelledRequestTyped { + pub(super) fn observe_elapsed( + &self, + kind: RequestKind, + 
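The new `s3_bucket/metrics.rs` module above stores one metric per `RequestKind` in a fixed-size array indexed by the enum discriminant, built once with `std::array::from_fn`. A reduced sketch of that enum-indexed-array shape, using plain values instead of prometheus counter types (all names here are illustrative):

```rust
// Illustrative sketch (not part of the patch): an enum-indexed array in the
// spirit of `RequestTyped<C>` above, with strings instead of counters.
#[derive(Clone, Copy, Debug)]
enum RequestKind {
    Get = 0,
    Put = 1,
    Delete = 2,
    List = 3,
}

impl RequestKind {
    const fn as_index(self) -> usize {
        self as usize
    }
}

struct RequestTyped<C>([C; 4]);

impl<C> RequestTyped<C> {
    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
        // `from_fn` hands us indices 0..4 in order; map each back to its kind.
        let kinds = [Get, Put, Delete, List];
        RequestTyped(std::array::from_fn(|i| f(kinds[i])))
    }

    fn get(&self, kind: RequestKind) -> &C {
        &self.0[kind.as_index()]
    }
}

fn main() {
    let names = RequestTyped::build_with(|kind| format!("{kind:?}"));
    assert_eq!(names.get(RequestKind::Delete).as_str(), "Delete");
}
```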
outcome: impl Into, + started_at: std::time::Instant, + ) { + self.get(kind, outcome.into()) + .observe(started_at.elapsed().as_secs_f64()) + } +} + +pub(super) struct BucketMetrics { + /// Total requests attempted + // TODO: remove after next release and migrate dashboards to `sum by (result) (remote_storage_s3_requests_count)` + requests: RequestTyped, + /// Subset of attempted requests failed + // TODO: remove after next release and migrate dashboards to `remote_storage_s3_requests_count{result="err"}` + failed: RequestTyped, + + pub(super) req_seconds: PassFailCancelledRequestTyped, + pub(super) wait_seconds: RequestTyped, + + /// Track how many semaphore awaits were cancelled per request type. + /// + /// This is in case cancellations are happening more than expected. + pub(super) cancelled_waits: RequestTyped, +} + +impl Default for BucketMetrics { + fn default() -> Self { + let requests = register_int_counter_vec!( + "remote_storage_s3_requests_count", + "Number of s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric"); + let requests = + RequestTyped::build_with(|kind| requests.with_label_values(&[kind.as_str()])); + + let failed = register_int_counter_vec!( + "remote_storage_s3_failures_count", + "Number of failed s3 requests of particular type", + &["request_type"], + ) + .expect("failed to define a metric"); + let failed = RequestTyped::build_with(|kind| failed.with_label_values(&[kind.as_str()])); + + let buckets = [0.01, 0.10, 0.5, 1.0, 5.0, 10.0, 50.0, 100.0]; + + let req_seconds = register_histogram_vec!( + "remote_storage_s3_request_seconds", + "Seconds to complete a request", + &["request_type", "result"], + buckets.to_vec(), + ) + .unwrap(); + let req_seconds = PassFailCancelledRequestTyped::build_with(|kind, outcome| { + req_seconds.with_label_values(&[kind.as_str(), outcome.as_str()]) + }); + + let wait_seconds = register_histogram_vec!( + "remote_storage_s3_wait_seconds", + "Seconds rate limited", + &["request_type"], + buckets.to_vec(), + ) + .unwrap(); + let wait_seconds = + RequestTyped::build_with(|kind| wait_seconds.with_label_values(&[kind.as_str()])); + + let cancelled_waits = register_int_counter_vec!( + "remote_storage_s3_cancelled_waits_total", + "Times a semaphore wait has been cancelled per request type", + &["request_type"], + ) + .unwrap(); + let cancelled_waits = + RequestTyped::build_with(|kind| cancelled_waits.with_label_values(&[kind.as_str()])); + + Self { + requests, + failed, + req_seconds, + wait_seconds, + cancelled_waits, + } + } +} + +pub fn inc_get_object() { + BUCKET_METRICS.requests.get(Get).inc() +} + +pub fn inc_get_object_fail() { + BUCKET_METRICS.failed.get(Get).inc() +} + +pub fn inc_put_object() { + BUCKET_METRICS.requests.get(Put).inc() +} + +pub fn inc_put_object_fail() { + BUCKET_METRICS.failed.get(Put).inc() +} + +pub fn inc_delete_object() { + BUCKET_METRICS.requests.get(Delete).inc() +} + +pub fn inc_delete_objects(count: u64) { + BUCKET_METRICS.requests.get(Delete).inc_by(count) +} + +pub fn inc_delete_object_fail() { + BUCKET_METRICS.failed.get(Delete).inc() +} + +pub fn inc_delete_objects_fail(count: u64) { + BUCKET_METRICS.failed.get(Delete).inc_by(count) +} + +pub fn inc_list_objects() { + BUCKET_METRICS.requests.get(List).inc() +} + +pub fn inc_list_objects_fail() { + BUCKET_METRICS.failed.get(List).inc() +} diff --git a/pageserver/ctl/src/draw_timeline_dir.rs b/pageserver/ctl/src/draw_timeline_dir.rs index 568078808fc6..0e77ef056383 100644 --- 
a/pageserver/ctl/src/draw_timeline_dir.rs +++ b/pageserver/ctl/src/draw_timeline_dir.rs @@ -23,6 +23,7 @@ //! use anyhow::Result; use pageserver::repository::Key; +use pageserver::METADATA_FILE_NAME; use std::cmp::Ordering; use std::io::{self, BufRead}; use std::path::PathBuf; @@ -71,6 +72,10 @@ pub fn main() -> Result<()> { let line = PathBuf::from_str(&line).unwrap(); let filename = line.file_name().unwrap(); let filename = filename.to_str().unwrap(); + if filename == METADATA_FILE_NAME { + // Don't try and parse "metadata" like a key-lsn range + continue; + } let range = parse_filename(filename); ranges.push(range); } diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index da83c061c071..9d9a1fa15110 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -107,23 +107,25 @@ async fn get_holes(path: &Path, max_holes: usize) -> Result> { // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); let mut prev_key: Option = None; - tree_reader.visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |key, _value| { - let curr = Key::from_slice(&key[..KEY_SIZE]); - if let Some(prev) = prev_key { - if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH { - heap.push(Hole(prev..curr)); - if heap.len() > max_holes { - heap.pop(); // remove smallest hole + tree_reader + .visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, _value| { + let curr = Key::from_slice(&key[..KEY_SIZE]); + if let Some(prev) = prev_key { + if curr.to_i128() - prev.to_i128() >= MIN_HOLE_LENGTH { + heap.push(Hole(prev..curr)); + if heap.len() > max_holes { + heap.pop(); // remove smallest hole + } } } - } - prev_key = Some(curr.next()); - true - }, - )?; + prev_key = Some(curr.next()); + true + }, + ) + .await?; let mut holes = heap.into_vec(); holes.sort_by_key(|hole| hole.0.start); Ok(holes) diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index d81303d53328..e3ddefc66197 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,16 +59,18 @@ async fn read_delta_file(path: impl AsRef) -> Result<()> { ); // TODO(chi): dedup w/ `delta_layer.rs` by exposing the API. 
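`get_holes` above keeps only the widest gaps between consecutive keys by bounding a heap at `max_holes + 1` entries and evicting the smallest hole whenever it overflows. A standalone sketch of the same bounded-heap idea over plain integers rather than the pageserver `Key`/`Hole` types:

```rust
// Illustrative sketch (not part of the patch): keep the `max_holes` widest
// gaps between consecutive sorted keys, mirroring the bounded heap used in
// `get_holes` above.
use std::cmp::Reverse;
use std::collections::BinaryHeap;
use std::ops::Range;

fn largest_gaps(sorted_keys: &[u64], max_holes: usize) -> Vec<Range<u64>> {
    // Min-heap by gap length (via `Reverse`), so the smallest kept gap is
    // cheap to evict once more than `max_holes` gaps have been collected.
    let mut heap: BinaryHeap<Reverse<(u64, u64, u64)>> =
        BinaryHeap::with_capacity(max_holes + 1);

    // Assumes `sorted_keys` is sorted ascending.
    for pair in sorted_keys.windows(2) {
        let (prev, curr) = (pair[0], pair[1]);
        let gap = curr - prev;
        if gap > 1 {
            heap.push(Reverse((gap, prev + 1, curr)));
            if heap.len() > max_holes {
                heap.pop(); // drop the smallest gap seen so far
            }
        }
    }

    let mut holes: Vec<Range<u64>> = heap
        .into_iter()
        .map(|Reverse((_, start, end))| start..end)
        .collect();
    holes.sort_by_key(|r| r.start);
    holes
}

fn main() {
    let keys = [1, 2, 10, 11, 40, 41, 45];
    // The two widest holes are 3..10 and 12..40; the narrower 42..45 is evicted.
    assert_eq!(largest_gaps(&keys, 2), vec![3..10, 12..40]);
}
```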
let mut all = vec![]; - tree_reader.visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |key, value_offset| { - let curr = Key::from_slice(&key[..KEY_SIZE]); - all.push((curr, BlobRef(value_offset))); - true - }, - )?; - let mut cursor = BlockCursor::new(&file); + tree_reader + .visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value_offset| { + let curr = Key::from_slice(&key[..KEY_SIZE]); + all.push((curr, BlobRef(value_offset))); + true + }, + ) + .await?; + let cursor = BlockCursor::new(&file); for (k, v) in all { let value = cursor.read_blob(v.pos())?; println!("key:{} value_len:{}", k, value.len()); diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index b247fdf0ab25..35690431010e 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -38,8 +38,6 @@ const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ #[cfg(feature = "testing")] "testing", - #[cfg(feature = "fail/failpoints")] - "fail/failpoints", ]; fn version() -> String { diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index 45b4be470be7..e4284b9e9cc2 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -14,14 +14,16 @@ use reqwest::Url; use serde::Serialize; use serde_with::{serde_as, DisplayFromStr}; use std::collections::HashMap; -use std::time::Duration; +use std::sync::Arc; +use std::time::{Duration, SystemTime}; use tracing::*; use utils::id::{NodeId, TenantId, TimelineId}; +use utils::lsn::Lsn; const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); #[serde_as] -#[derive(Serialize, Debug)] +#[derive(Serialize, Debug, Clone, Copy)] struct Ids { #[serde_as(as = "DisplayFromStr")] tenant_id: TenantId, @@ -32,13 +34,13 @@ struct Ids { /// Key that uniquely identifies the object, this metric describes. #[derive(Debug, Clone, PartialEq, Eq, Hash)] -pub struct PageserverConsumptionMetricsKey { - pub tenant_id: TenantId, - pub timeline_id: Option, - pub metric: &'static str, +struct MetricsKey { + tenant_id: TenantId, + timeline_id: Option, + metric: &'static str, } -impl PageserverConsumptionMetricsKey { +impl MetricsKey { const fn absolute_values(self) -> AbsoluteValueFactory { AbsoluteValueFactory(self) } @@ -48,18 +50,17 @@ impl PageserverConsumptionMetricsKey { } /// Helper type which each individual metric kind can return to produce only absolute values. -struct AbsoluteValueFactory(PageserverConsumptionMetricsKey); +struct AbsoluteValueFactory(MetricsKey); impl AbsoluteValueFactory { - fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) { + fn at(self, time: DateTime, val: u64) -> (MetricsKey, (EventType, u64)) { let key = self.0; - let time = Utc::now(); (key, (EventType::Absolute { time }, val)) } } /// Helper type which each individual metric kind can return to produce only incremental values. 
-struct IncrementalValueFactory(PageserverConsumptionMetricsKey); +struct IncrementalValueFactory(MetricsKey); impl IncrementalValueFactory { #[allow(clippy::wrong_self_convention)] @@ -68,7 +69,7 @@ impl IncrementalValueFactory { prev_end: DateTime, up_to: DateTime, val: u64, - ) -> (PageserverConsumptionMetricsKey, (EventType, u64)) { + ) -> (MetricsKey, (EventType, u64)) { let key = self.0; // cannot assert prev_end < up_to because these are realtime clock based ( @@ -83,15 +84,18 @@ impl IncrementalValueFactory { ) } - fn key(&self) -> &PageserverConsumptionMetricsKey { + fn key(&self) -> &MetricsKey { &self.0 } } -// the static part of a PageserverConsumptionMetricsKey -impl PageserverConsumptionMetricsKey { +// the static part of a MetricsKey +impl MetricsKey { + /// Absolute value of [`Timeline::get_last_record_lsn`]. + /// + /// [`Timeline::get_last_record_lsn`]: crate::tenant::Timeline::get_last_record_lsn const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory { - PageserverConsumptionMetricsKey { + MetricsKey { tenant_id, timeline_id: Some(timeline_id), metric: "written_size", @@ -99,25 +103,31 @@ impl PageserverConsumptionMetricsKey { .absolute_values() } - /// Values will be the difference of the latest written_size (last_record_lsn) to what we - /// previously sent. + /// Values will be the difference of the latest [`MetricsKey::written_size`] to what we + /// previously sent, starting from the previously sent incremental time range ending at the + /// latest absolute measurement. const fn written_size_delta( tenant_id: TenantId, timeline_id: TimelineId, ) -> IncrementalValueFactory { - PageserverConsumptionMetricsKey { + MetricsKey { tenant_id, timeline_id: Some(timeline_id), - metric: "written_size_bytes_delta", + // the name here is correctly about data not size, because that is what is wanted by + // downstream pipeline + metric: "written_data_bytes_delta", } .incremental_values() } + /// Exact [`Timeline::get_current_logical_size`]. + /// + /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size const fn timeline_logical_size( tenant_id: TenantId, timeline_id: TimelineId, ) -> AbsoluteValueFactory { - PageserverConsumptionMetricsKey { + MetricsKey { tenant_id, timeline_id: Some(timeline_id), metric: "timeline_logical_size", @@ -125,8 +135,11 @@ impl PageserverConsumptionMetricsKey { .absolute_values() } + /// [`Tenant::remote_size`] + /// + /// [`Tenant::remote_size`]: crate::tenant::Tenant::remote_size const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory { - PageserverConsumptionMetricsKey { + MetricsKey { tenant_id, timeline_id: None, metric: "remote_storage_size", @@ -134,8 +147,11 @@ impl PageserverConsumptionMetricsKey { .absolute_values() } + /// Sum of [`Timeline::resident_physical_size`] for each `Tenant`. + /// + /// [`Timeline::resident_physical_size`]: crate::tenant::Timeline::resident_physical_size const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { - PageserverConsumptionMetricsKey { + MetricsKey { tenant_id, timeline_id: None, metric: "resident_size", @@ -143,8 +159,11 @@ impl PageserverConsumptionMetricsKey { .absolute_values() } + /// [`Tenant::cached_synthetic_size`] as refreshed by [`calculate_synthetic_size_worker`]. 
+ /// + /// [`Tenant::cached_synthetic_size`]: crate::tenant::Tenant::cached_synthetic_size const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory { - PageserverConsumptionMetricsKey { + MetricsKey { tenant_id, timeline_id: None, metric: "synthetic_storage_size", @@ -228,15 +247,15 @@ pub async fn collect_metrics( /// /// TODO /// - refactor this function (chunking+sending part) to reuse it in proxy module; -pub async fn collect_metrics_iteration( +async fn collect_metrics_iteration( client: &reqwest::Client, - cached_metrics: &mut HashMap, + cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, ctx: &RequestContext, send_cached: bool, ) { - let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new(); + let mut current_metrics: Vec<(MetricsKey, (EventType, u64))> = Vec::new(); trace!( "starting collect_metrics_iteration. metric_collection_endpoint: {}", metric_collection_endpoint @@ -270,130 +289,48 @@ pub async fn collect_metrics_iteration( let mut tenant_resident_size = 0; // iterate through list of timelines in tenant - for timeline in tenant.list_timelines().iter() { + for timeline in tenant.list_timelines() { // collect per-timeline metrics only for active timelines - if timeline.is_active() { - let timeline_written_size = u64::from(timeline.get_last_record_lsn()); - - let (key, written_size_now) = - PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id) - .now(timeline_written_size); - - // last_record_lsn can only go up, right now at least, TODO: #2592 or related - // features might change this. - - let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta( - tenant_id, - timeline.timeline_id, - ); - // use this when available, because in a stream of incremental values, it will be - // accurate where as when last_record_lsn stops moving, we will only cache the last - // one of those. - let last_stop_time = - cached_metrics - .get(written_size_delta_key.key()) - .map(|(until, _val)| { - until - .incremental_timerange() - .expect("never create EventType::Absolute for written_size_delta") - .end - }); - - // by default, use the last sent written_size as the basis for - // calculating the delta. if we don't yet have one, use the load time value. - let prev = cached_metrics - .get(&key) - .map(|(prev_at, prev)| { - // use the prev time from our last incremental update, or default to latest - // absolute update on the first round. - let prev_at = prev_at - .absolute_time() - .expect("never create EventType::Incremental for written_size"); - let prev_at = last_stop_time.unwrap_or(prev_at); - (*prev_at, *prev) - }) - .unwrap_or_else(|| { - // if we don't have a previous point of comparison, compare to the load time - // lsn. 
- let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at; - (DateTime::from(*loaded_at), disk_consistent_lsn.0) - }); - - // written_size_delta_bytes - current_metrics.extend( - if let Some(delta) = written_size_now.1.checked_sub(prev.1) { - let up_to = written_size_now - .0 - .absolute_time() - .expect("never create EventType::Incremental for written_size"); - let key_value = - written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta); - Some(key_value) - } else { - None - }, - ); - - // written_size - current_metrics.push((key, written_size_now)); - - let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id); - match span.in_scope(|| timeline.get_current_logical_size(ctx)) { - // Only send timeline logical size when it is fully calculated. - Ok((size, is_exact)) if is_exact => { - current_metrics.push( - PageserverConsumptionMetricsKey::timeline_logical_size( - tenant_id, - timeline.timeline_id, - ) - .now(size), - ); - } - Ok((_, _)) => {} - Err(err) => { - error!( - "failed to get current logical size for timeline {}: {err:?}", - timeline.timeline_id - ); - continue; - } - }; + let timeline_id = timeline.timeline_id; + + match TimelineSnapshot::collect(&timeline, ctx) { + Ok(Some(snap)) => { + snap.to_metrics( + tenant_id, + timeline_id, + Utc::now(), + &mut current_metrics, + cached_metrics, + ); + } + Ok(None) => {} + Err(e) => { + error!( + "failed to get metrics values for tenant {tenant_id} timeline {}: {e:#?}", + timeline.timeline_id + ); + continue; + } } - let timeline_resident_size = timeline.get_resident_physical_size(); - tenant_resident_size += timeline_resident_size; + tenant_resident_size += timeline.resident_physical_size(); } - match tenant.get_remote_size().await { - Ok(tenant_remote_size) => { - current_metrics.push( - PageserverConsumptionMetricsKey::remote_storage_size(tenant_id) - .now(tenant_remote_size), - ); - } - Err(err) => { - error!( - "failed to get remote size for tenant {}: {err:?}", - tenant_id - ); - } - } + current_metrics + .push(MetricsKey::remote_storage_size(tenant_id).at(Utc::now(), tenant.remote_size())); - current_metrics.push( - PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size), - ); + current_metrics + .push(MetricsKey::resident_size(tenant_id).at(Utc::now(), tenant_resident_size)); // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one - let tenant_synthetic_size = tenant.get_cached_synthetic_size(); + let synthetic_size = tenant.cached_synthetic_size(); - if tenant_synthetic_size != 0 { + if synthetic_size != 0 { // only send non-zeroes because otherwise these show up as errors in logs - current_metrics.push( - PageserverConsumptionMetricsKey::synthetic_size(tenant_id) - .now(tenant_synthetic_size), - ); + current_metrics + .push(MetricsKey::synthetic_size(tenant_id).at(Utc::now(), synthetic_size)); } } @@ -425,6 +362,8 @@ pub async fn collect_metrics_iteration( let mut chunk_to_send: Vec> = Vec::with_capacity(CHUNK_SIZE); + let node_id = node_id.to_string(); + for chunk in chunks { chunk_to_send.clear(); @@ -432,7 +371,7 @@ pub async fn collect_metrics_iteration( chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event { kind: *when, metric: curr_key.metric, - idempotency_key: idempotency_key(node_id.to_string()), + idempotency_key: idempotency_key(&node_id), value: *curr_val, extra: Ids { tenant_id: curr_key.tenant_id, @@ -440,17 
+379,14 @@ pub async fn collect_metrics_iteration( }, })); - let chunk_json = serde_json::value::to_raw_value(&EventChunk { - events: &chunk_to_send, - }) - .expect("PageserverConsumptionMetric should not fail serialization"); - const MAX_RETRIES: u32 = 3; for attempt in 0..MAX_RETRIES { let res = client .post(metric_collection_endpoint.clone()) - .json(&chunk_json) + .json(&EventChunk { + events: (&chunk_to_send).into(), + }) .send() .await; @@ -486,6 +422,130 @@ pub async fn collect_metrics_iteration( } } +/// Internal type to make timeline metric production testable. +/// +/// As this value type contains all of the information needed from a timeline to produce the +/// metrics, it can easily be created with different values in test. +struct TimelineSnapshot { + loaded_at: (Lsn, SystemTime), + last_record_lsn: Lsn, + current_exact_logical_size: Option, +} + +impl TimelineSnapshot { + /// Collect the metrics from an actual timeline. + /// + /// Fails currently only when [`Timeline::get_current_logical_size`] fails. + /// + /// [`Timeline::get_current_logical_size`]: crate::tenant::Timeline::get_current_logical_size + fn collect( + t: &Arc, + ctx: &RequestContext, + ) -> anyhow::Result> { + use anyhow::Context; + + if !t.is_active() { + // no collection for broken or stopping needed, we will still keep the cached values + // though at the caller. + Ok(None) + } else { + let loaded_at = t.loaded_at; + let last_record_lsn = t.get_last_record_lsn(); + + let current_exact_logical_size = { + let span = info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id); + let res = span + .in_scope(|| t.get_current_logical_size(ctx)) + .context("get_current_logical_size"); + match res? { + // Only send timeline logical size when it is fully calculated. + (size, is_exact) if is_exact => Some(size), + (_, _) => None, + } + }; + + Ok(Some(TimelineSnapshot { + loaded_at, + last_record_lsn, + current_exact_logical_size, + })) + } + } + + /// Produce the timeline consumption metrics into the `metrics` argument. + fn to_metrics( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + now: DateTime, + metrics: &mut Vec<(MetricsKey, (EventType, u64))>, + cache: &HashMap, + ) { + let timeline_written_size = u64::from(self.last_record_lsn); + + let (key, written_size_now) = + MetricsKey::written_size(tenant_id, timeline_id).at(now, timeline_written_size); + + // last_record_lsn can only go up, right now at least, TODO: #2592 or related + // features might change this. + + let written_size_delta_key = MetricsKey::written_size_delta(tenant_id, timeline_id); + + // use this when available, because in a stream of incremental values, it will be + // accurate where as when last_record_lsn stops moving, we will only cache the last + // one of those. + let last_stop_time = cache + .get(written_size_delta_key.key()) + .map(|(until, _val)| { + until + .incremental_timerange() + .expect("never create EventType::Absolute for written_size_delta") + .end + }); + + // by default, use the last sent written_size as the basis for + // calculating the delta. if we don't yet have one, use the load time value. + let prev = cache + .get(&key) + .map(|(prev_at, prev)| { + // use the prev time from our last incremental update, or default to latest + // absolute update on the first round. 
+ let prev_at = prev_at + .absolute_time() + .expect("never create EventType::Incremental for written_size"); + let prev_at = last_stop_time.unwrap_or(prev_at); + (*prev_at, *prev) + }) + .unwrap_or_else(|| { + // if we don't have a previous point of comparison, compare to the load time + // lsn. + let (disk_consistent_lsn, loaded_at) = &self.loaded_at; + (DateTime::from(*loaded_at), disk_consistent_lsn.0) + }); + + // written_size_bytes_delta + metrics.extend( + if let Some(delta) = written_size_now.1.checked_sub(prev.1) { + let up_to = written_size_now + .0 + .absolute_time() + .expect("never create EventType::Incremental for written_size"); + let key_value = written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta); + Some(key_value) + } else { + None + }, + ); + + // written_size + metrics.push((key, written_size_now)); + + if let Some(size) = self.current_exact_logical_size { + metrics.push(MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, size)); + } + } +} + /// Caclculate synthetic size for each active tenant pub async fn calculate_synthetic_size_worker( synthetic_size_calculation_interval: Duration, @@ -500,7 +560,7 @@ pub async fn calculate_synthetic_size_worker( _ = task_mgr::shutdown_watcher() => { return Ok(()); }, - tick_at = ticker.tick() => { + tick_at = ticker.tick() => { let tenants = match mgr::list_tenants().await { Ok(tenants) => tenants, @@ -536,3 +596,149 @@ pub async fn calculate_synthetic_size_worker( } } } + +#[cfg(test)] +mod tests { + use std::collections::HashMap; + + use std::time::SystemTime; + use utils::{ + id::{TenantId, TimelineId}, + lsn::Lsn, + }; + + use crate::consumption_metrics::MetricsKey; + + use super::TimelineSnapshot; + use chrono::{DateTime, Utc}; + + #[test] + fn startup_collected_timeline_metrics_before_advancing() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let mut metrics = Vec::new(); + let cache = HashMap::new(); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, SystemTime::now()), + last_record_lsn: disk_consistent_lsn, + current_exact_logical_size: Some(0x42000), + }; + + let now = DateTime::::from(SystemTime::now()); + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to( + snap.loaded_at.1.into(), + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + ] + ); + } + + #[test] + fn startup_collected_timeline_metrics_second_round() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [now, before, init] = time_backwards(); + + let now = DateTime::::from(now); + let before = DateTime::::from(before); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let mut metrics = Vec::new(); + let cache = HashMap::from([ + MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0) + ]); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, init), + last_record_lsn: disk_consistent_lsn, + current_exact_logical_size: Some(0x42000), + }; + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id) + 
.from_previous_up_to(before, now, 0), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + ] + ); + } + + #[test] + fn startup_collected_timeline_metrics_nth_round_at_same_lsn() { + let tenant_id = TenantId::generate(); + let timeline_id = TimelineId::generate(); + + let [now, just_before, before, init] = time_backwards(); + + let now = DateTime::::from(now); + let just_before = DateTime::::from(just_before); + let before = DateTime::::from(before); + + let initdb_lsn = Lsn(0x10000); + let disk_consistent_lsn = Lsn(initdb_lsn.0 * 2); + + let mut metrics = Vec::new(); + let cache = HashMap::from([ + // at t=before was the last time the last_record_lsn changed + MetricsKey::written_size(tenant_id, timeline_id).at(before, disk_consistent_lsn.0), + // end time of this event is used for the next ones + MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to( + before, + just_before, + 0, + ), + ]); + + let snap = TimelineSnapshot { + loaded_at: (disk_consistent_lsn, init), + last_record_lsn: disk_consistent_lsn, + current_exact_logical_size: Some(0x42000), + }; + + snap.to_metrics(tenant_id, timeline_id, now, &mut metrics, &cache); + + assert_eq!( + metrics, + &[ + MetricsKey::written_size_delta(tenant_id, timeline_id).from_previous_up_to( + just_before, + now, + 0 + ), + MetricsKey::written_size(tenant_id, timeline_id).at(now, disk_consistent_lsn.0), + MetricsKey::timeline_logical_size(tenant_id, timeline_id).at(now, 0x42000) + ] + ); + } + + fn time_backwards() -> [std::time::SystemTime; N] { + let mut times = [std::time::SystemTime::UNIX_EPOCH; N]; + times[0] = std::time::SystemTime::now(); + for behind in 1..N { + times[behind] = times[0] - std::time::Duration::from_secs(behind as u64); + } + + times + } +} diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index f43651e93166..1b1d0acaee22 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -47,24 +47,50 @@ pub use crate::metrics::preinitialize_metrics; #[tracing::instrument] pub async fn shutdown_pageserver(exit_code: i32) { + use std::time::Duration; // Shut down the libpq endpoint task. This prevents new connections from // being accepted. - task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None).await; + timed( + task_mgr::shutdown_tasks(Some(TaskKind::LibpqEndpointListener), None, None), + "shutdown LibpqEndpointListener", + Duration::from_secs(1), + ) + .await; // Shut down any page service tasks. - task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None).await; + timed( + task_mgr::shutdown_tasks(Some(TaskKind::PageRequestHandler), None, None), + "shutdown PageRequestHandlers", + Duration::from_secs(1), + ) + .await; // Shut down all the tenants. This flushes everything to disk and kills // the checkpoint and GC tasks. - tenant::mgr::shutdown_all_tenants().await; + timed( + tenant::mgr::shutdown_all_tenants(), + "shutdown all tenants", + Duration::from_secs(5), + ) + .await; // Shut down the HTTP endpoint last, so that you can still check the server's // status while it's shutting down. // FIXME: We should probably stop accepting commands like attach/detach earlier. 
- task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None).await; + timed( + task_mgr::shutdown_tasks(Some(TaskKind::HttpEndpointListener), None, None), + "shutdown http", + Duration::from_secs(1), + ) + .await; // There should be nothing left, but let's be sure - task_mgr::shutdown_tasks(None, None, None).await; + timed( + task_mgr::shutdown_tasks(None, None, None), + "shutdown leftovers", + Duration::from_secs(1), + ) + .await; info!("Shut down successfully completed"); std::process::exit(exit_code); } @@ -172,6 +198,45 @@ pub struct InitializationOrder { pub background_jobs_can_start: utils::completion::Barrier, } +/// Time the future with a warning when it exceeds a threshold. +async fn timed( + fut: Fut, + name: &str, + warn_at: std::time::Duration, +) -> ::Output { + let started = std::time::Instant::now(); + + let mut fut = std::pin::pin!(fut); + + match tokio::time::timeout(warn_at, &mut fut).await { + Ok(ret) => { + tracing::info!( + task = name, + elapsed_ms = started.elapsed().as_millis(), + "completed" + ); + ret + } + Err(_) => { + tracing::info!( + task = name, + elapsed_ms = started.elapsed().as_millis(), + "still waiting, taking longer than expected..." + ); + + let ret = fut.await; + + tracing::warn!( + task = name, + elapsed_ms = started.elapsed().as_millis(), + "completed, took longer than expected" + ); + + ret + } + } +} + #[cfg(test)] mod backoff_defaults_tests { use super::*; @@ -202,3 +267,36 @@ mod backoff_defaults_tests { ); } } + +#[cfg(test)] +mod timed_tests { + use super::timed; + use std::time::Duration; + + #[tokio::test] + async fn timed_completes_when_inner_future_completes() { + // A future that completes on time should have its result returned + let r1 = timed( + async move { + tokio::time::sleep(Duration::from_millis(10)).await; + 123 + }, + "test 1", + Duration::from_millis(50), + ) + .await; + assert_eq!(r1, 123); + + // A future that completes too slowly should also have its result returned + let r1 = timed( + async move { + tokio::time::sleep(Duration::from_millis(50)).await; + 456 + }, + "test 1", + Duration::from_millis(10), + ) + .await; + assert_eq!(r1, 456); + } +} diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 1b287ee10f7e..9fd60053303b 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -644,20 +644,19 @@ impl Tenant { Ok(()) } - /// get size of all remote timelines + /// Get sum of all remote timelines sizes /// /// This function relies on the index_part instead of listing the remote storage - /// - pub async fn get_remote_size(&self) -> anyhow::Result { + pub fn remote_size(&self) -> u64 { let mut size = 0; - for timeline in self.list_timelines().iter() { + for timeline in self.list_timelines() { if let Some(remote_client) = &timeline.remote_client { size += remote_client.get_remote_physical_size(); } } - Ok(size) + size } #[instrument(skip_all, fields(timeline_id=%timeline_id))] @@ -2889,7 +2888,7 @@ impl Tenant { .set(size); } - pub fn get_cached_synthetic_size(&self) -> u64 { + pub fn cached_synthetic_size(&self) -> u64 { self.cached_synthetic_tenant_size.load(Ordering::Relaxed) } } diff --git a/pageserver/src/tenant/blob_io.rs b/pageserver/src/tenant/blob_io.rs index 4dcf7fe5feeb..7dd53407e73f 100644 --- a/pageserver/src/tenant/blob_io.rs +++ b/pageserver/src/tenant/blob_io.rs @@ -21,7 +21,7 @@ where R: BlockReader, { /// Read a blob into a new buffer. 
- pub fn read_blob(&mut self, offset: u64) -> Result, std::io::Error> { + pub fn read_blob(&self, offset: u64) -> Result, std::io::Error> { let mut buf = Vec::new(); self.read_blob_into_buf(offset, &mut buf)?; Ok(buf) @@ -29,7 +29,7 @@ where /// Read blob into the given buffer. Any previous contents in the buffer /// are overwritten. pub fn read_blob_into_buf( - &mut self, + &self, offset: u64, dstbuf: &mut Vec, ) -> Result<(), std::io::Error> { diff --git a/pageserver/src/tenant/block_io.rs b/pageserver/src/tenant/block_io.rs index 10de34e3f6d7..e6ebebe594d6 100644 --- a/pageserver/src/tenant/block_io.rs +++ b/pageserver/src/tenant/block_io.rs @@ -80,7 +80,7 @@ where BlockCursor { reader } } - pub fn read_blk(&mut self, blknum: u32) -> Result { + pub fn read_blk(&self, blknum: u32) -> Result { self.reader.read_blk(blknum) } } diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 518286ddb015..b163d27b421e 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -230,14 +230,15 @@ where /// /// Read the value for given key. Returns the value, or None if it doesn't exist. /// - pub fn get(&self, search_key: &[u8; L]) -> Result> { + pub async fn get(&self, search_key: &[u8; L]) -> Result> { let mut result: Option = None; self.visit(search_key, VisitDirection::Forwards, |key, value| { if key == search_key { result = Some(value); } false - })?; + }) + .await?; Ok(result) } @@ -246,7 +247,7 @@ where /// will be called for every key >= 'search_key' (or <= 'search_key', if scanning /// backwards) /// - pub fn visit( + pub async fn visit( &self, search_key: &[u8; L], dir: VisitDirection, @@ -269,23 +270,9 @@ where V: FnMut(&[u8], u64) -> bool, { // Locate the node. - let blk = self.reader.read_blk(self.start_blk + node_blknum)?; - - // Search all entries on this node - self.search_node(blk.as_ref(), search_key, dir, visitor) - } + let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?; - fn search_node( - &self, - node_buf: &[u8], - search_key: &[u8; L], - dir: VisitDirection, - visitor: &mut V, - ) -> Result - where - V: FnMut(&[u8], u64) -> bool, - { - let node = OnDiskNode::deparse(node_buf)?; + let node = OnDiskNode::deparse(node_buf.as_ref())?; let prefix_len = node.prefix_len as usize; let suffix_len = node.suffix_len as usize; @@ -782,12 +769,12 @@ mod tests { // Test the `get` function on all the keys. 
for (key, val) in all_data.iter() { - assert_eq!(reader.get(key)?, Some(*val)); + assert_eq!(reader.get(key).await?, Some(*val)); } // And on some keys that don't exist - assert_eq!(reader.get(b"aaaaaa")?, None); - assert_eq!(reader.get(b"zzzzzz")?, None); - assert_eq!(reader.get(b"xaaabx")?, None); + assert_eq!(reader.get(b"aaaaaa").await?, None); + assert_eq!(reader.get(b"zzzzzz").await?, None); + assert_eq!(reader.get(b"xaaabx").await?, None); // Test search with `visit` function let search_key = b"xabaaa"; @@ -798,10 +785,12 @@ mod tests { .collect(); let mut data = Vec::new(); - reader.visit(search_key, VisitDirection::Forwards, |key, value| { - data.push((key.to_vec(), value)); - true - })?; + reader + .visit(search_key, VisitDirection::Forwards, |key, value| { + data.push((key.to_vec(), value)); + true + }) + .await?; assert_eq!(data, expected); // Test a backwards scan @@ -812,16 +801,20 @@ mod tests { .collect(); expected.reverse(); let mut data = Vec::new(); - reader.visit(search_key, VisitDirection::Backwards, |key, value| { - data.push((key.to_vec(), value)); - true - })?; + reader + .visit(search_key, VisitDirection::Backwards, |key, value| { + data.push((key.to_vec(), value)); + true + }) + .await?; assert_eq!(data, expected); // Backward scan where nothing matches - reader.visit(b"aaaaaa", VisitDirection::Backwards, |key, value| { - panic!("found unexpected key {}: {}", hex::encode(key), value); - })?; + reader + .visit(b"aaaaaa", VisitDirection::Backwards, |key, value| { + panic!("found unexpected key {}: {}", hex::encode(key), value); + }) + .await?; // Full scan let expected: Vec<(Vec, u64)> = all_data @@ -829,10 +822,12 @@ mod tests { .map(|(key, value)| (key.to_vec(), *value)) .collect(); let mut data = Vec::new(); - reader.visit(&[0u8; 6], VisitDirection::Forwards, |key, value| { - data.push((key.to_vec(), value)); - true - })?; + reader + .visit(&[0u8; 6], VisitDirection::Forwards, |key, value| { + data.push((key.to_vec(), value)); + true + }) + .await?; assert_eq!(data, expected); Ok(()) @@ -880,13 +875,15 @@ mod tests { for search_key_int in 0..(NUM_KEYS * 2 + 10) { let search_key = u64::to_be_bytes(search_key_int); assert_eq!( - reader.get(&search_key)?, + reader.get(&search_key).await?, all_data.get(&search_key_int).cloned() ); // Test a forward scan starting with this key result.lock().unwrap().clear(); - reader.visit(&search_key, VisitDirection::Forwards, take_ten)?; + reader + .visit(&search_key, VisitDirection::Forwards, take_ten) + .await?; let expected = all_data .range(search_key_int..) 
.take(10) @@ -896,7 +893,9 @@ mod tests { // And a backwards scan result.lock().unwrap().clear(); - reader.visit(&search_key, VisitDirection::Backwards, take_ten)?; + reader + .visit(&search_key, VisitDirection::Backwards, take_ten) + .await?; let expected = all_data .range(..=search_key_int) .rev() @@ -910,7 +909,9 @@ mod tests { let search_key = u64::to_be_bytes(0); limit.store(usize::MAX, Ordering::Relaxed); result.lock().unwrap().clear(); - reader.visit(&search_key, VisitDirection::Forwards, take_ten)?; + reader + .visit(&search_key, VisitDirection::Forwards, take_ten) + .await?; let expected = all_data .iter() .map(|(&key, &val)| (key, val)) @@ -921,7 +922,9 @@ mod tests { let search_key = u64::to_be_bytes(u64::MAX); limit.store(usize::MAX, Ordering::Relaxed); result.lock().unwrap().clear(); - reader.visit(&search_key, VisitDirection::Backwards, take_ten)?; + reader + .visit(&search_key, VisitDirection::Backwards, take_ten) + .await?; let expected = all_data .iter() .rev() @@ -932,8 +935,8 @@ mod tests { Ok(()) } - #[test] - fn random_data() -> Result<()> { + #[tokio::test] + async fn random_data() -> Result<()> { // Generate random keys with exponential distribution, to // exercise the prefix compression const NUM_KEYS: usize = 100000; @@ -960,19 +963,23 @@ mod tests { // Test get() operation on all the keys for (&key, &val) in all_data.iter() { let search_key = u128::to_be_bytes(key); - assert_eq!(reader.get(&search_key)?, Some(val)); + assert_eq!(reader.get(&search_key).await?, Some(val)); } // Test get() operations on random keys, most of which will not exist for _ in 0..100000 { let key_int = rand::thread_rng().gen::(); let search_key = u128::to_be_bytes(key_int); - assert!(reader.get(&search_key)? == all_data.get(&key_int).cloned()); + assert!(reader.get(&search_key).await? == all_data.get(&key_int).cloned()); } // Test boundary cases - assert!(reader.get(&u128::to_be_bytes(u128::MIN))? == all_data.get(&u128::MIN).cloned()); - assert!(reader.get(&u128::to_be_bytes(u128::MAX))? == all_data.get(&u128::MAX).cloned()); + assert!( + reader.get(&u128::to_be_bytes(u128::MIN)).await? == all_data.get(&u128::MIN).cloned() + ); + assert!( + reader.get(&u128::to_be_bytes(u128::MAX)).await? == all_data.get(&u128::MAX).cloned() + ); Ok(()) } @@ -1014,15 +1021,17 @@ mod tests { // Test get() operation on all the keys for (key, val) in disk_btree_test_data::TEST_DATA { - assert_eq!(reader.get(&key)?, Some(val)); + assert_eq!(reader.get(&key).await?, Some(val)); } // Test full scan let mut count = 0; - reader.visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| { - count += 1; - true - })?; + reader + .visit(&[0u8; 26], VisitDirection::Forwards, |_key, _value| { + count += 1; + true + }) + .await?; assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); reader.dump().await?; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index f44534249c66..b088a3b6028e 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -266,11 +266,17 @@ impl Drop for EphemeralFile { // unlink the file let res = std::fs::remove_file(&self.file.path); if let Err(e) = res { - warn!( - "could not remove ephemeral file '{}': {}", - self.file.path.display(), - e - ); + if e.kind() != std::io::ErrorKind::NotFound { + // just never log the not found errors, we cannot do anything for them; on detach + // the tenant directory is already gone. 
+ // + // not found files might also be related to https://github.com/neondatabase/neon/issues/2442 + error!( + "could not remove ephemeral file '{}': {}", + self.file.path.display(), + e + ); + } } } } @@ -420,7 +426,7 @@ mod tests { blobs.push((pos, data)); } - let mut cursor = BlockCursor::new(&file); + let cursor = BlockCursor::new(&file); for (pos, expected) in blobs { let actual = cursor.read_blob(pos)?; assert_eq!(actual, expected); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 25c5e3f2e0e7..2635953e6dab 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -266,43 +266,25 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock) { } }; + let started_at = std::time::Instant::now(); let mut join_set = JoinSet::new(); for (tenant_id, tenant) in tenants_to_shut_down { join_set.spawn( async move { - // ordering shouldn't matter for this, either we store true right away or never - let ordering = std::sync::atomic::Ordering::Relaxed; - let joined_other = std::sync::atomic::AtomicBool::new(false); + let freeze_and_flush = true; - let mut shutdown = std::pin::pin!(async { - let freeze_and_flush = true; + let res = { + let (_guard, shutdown_progress) = completion::channel(); + tenant.shutdown(shutdown_progress, freeze_and_flush).await + }; - let res = { - let (_guard, shutdown_progress) = completion::channel(); - tenant.shutdown(shutdown_progress, freeze_and_flush).await - }; + if let Err(other_progress) = res { + // join the another shutdown in progress + other_progress.wait().await; + } - if let Err(other_progress) = res { - // join the another shutdown in progress - joined_other.store(true, ordering); - other_progress.wait().await; - } - }); - - // in practice we might not have a lot time to go, since systemd is going to - // SIGKILL us at 10s, but we can try. delete tenant might take a while, so put out - // a warning. - let warning = std::time::Duration::from_secs(5); - let mut warning = std::pin::pin!(tokio::time::sleep(warning)); - - tokio::select! { - _ = &mut shutdown => {}, - _ = &mut warning => { - let joined_other = joined_other.load(ordering); - warn!(%joined_other, "waiting for the shutdown to complete"); - shutdown.await; - } - }; + // we cannot afford per tenant logging here, because if s3 is degraded, we are + // going to log too many lines debug!("tenant successfully stopped"); } @@ -310,27 +292,51 @@ async fn shutdown_all_tenants0(tenants: &tokio::sync::RwLock) { ); } + let total = join_set.len(); let mut panicked = 0; + let mut buffering = true; + const BUFFER_FOR: std::time::Duration = std::time::Duration::from_millis(500); + let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR)); - while let Some(res) = join_set.join_next().await { - match res { - Ok(()) => {} - Err(join_error) if join_error.is_cancelled() => { - unreachable!("we are not cancelling any of the futures"); - } - Err(join_error) if join_error.is_panic() => { - // cannot really do anything, as this panic is likely a bug - panicked += 1; - } - Err(join_error) => { - warn!("unknown kind of JoinError: {join_error}"); + while !join_set.is_empty() { + tokio::select! 
{ + Some(joined) = join_set.join_next() => { + match joined { + Ok(()) => {} + Err(join_error) if join_error.is_cancelled() => { + unreachable!("we are not cancelling any of the futures"); + } + Err(join_error) if join_error.is_panic() => { + // cannot really do anything, as this panic is likely a bug + panicked += 1; + } + Err(join_error) => { + warn!("unknown kind of JoinError: {join_error}"); + } + } + if !buffering { + // buffer so that every 500ms since the first update (or starting) we'll log + // how far away we are; this is because we will get SIGKILL'd at 10s, and we + // are not able to log *then*. + buffering = true; + buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR); + } + }, + _ = &mut buffered, if buffering => { + buffering = false; + info!(remaining = join_set.len(), total, elapsed_ms = started_at.elapsed().as_millis(), "waiting for tenants to shutdown"); } } } if panicked > 0 { - warn!(panicked, "observed panicks while shutting down tenants"); + warn!( + panicked, + total, "observed panicks while shutting down tenants" + ); } + + // caller will log how long we took } pub async fn create_tenant( diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs index 05381cb56de7..b4db7e2f08cb 100644 --- a/pageserver/src/tenant/storage_layer.rs +++ b/pageserver/src/tenant/storage_layer.rs @@ -9,7 +9,7 @@ mod remote_layer; use crate::config::PageServerConf; use crate::context::RequestContext; -use crate::repository::{Key, Value}; +use crate::repository::Key; use crate::task_mgr::TaskKind; use crate::walrecord::NeonWalRecord; use anyhow::Result; @@ -34,7 +34,7 @@ use utils::{ lsn::Lsn, }; -pub use delta_layer::{DeltaLayer, DeltaLayerWriter}; +pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef}; pub use filename::{DeltaFileName, ImageFileName, LayerFileName}; pub use image_layer::{ImageLayer, ImageLayerWriter}; pub use inmemory_layer::InMemoryLayer; @@ -381,12 +381,6 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync + 'static { async fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>; } -/// Returned by [`PersistentLayer::iter`] -pub type LayerIter<'i> = Box> + 'i + Send>; - -/// Returned by [`PersistentLayer::key_iter`] -pub type LayerKeyIter<'i> = Box + 'i + Send>; - /// Get a layer descriptor from a layer. pub trait AsLayerDesc { /// Get the layer descriptor. @@ -427,15 +421,6 @@ pub trait PersistentLayer: Layer + AsLayerDesc { // `None` for `RemoteLayer`. fn local_path(&self) -> Option; - /// Iterate through all keys and values stored in the layer - fn iter(&self, ctx: &RequestContext) -> Result>; - - /// Iterate through all keys stored in the layer. Returns key, lsn and value size - /// It is used only for compaction and so is currently implemented only for DeltaLayer - fn key_iter(&self, _ctx: &RequestContext) -> Result> { - panic!("Not implemented") - } - /// Permanently remove this layer from disk. 
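The reworked `shutdown_all_tenants0` above drains a `JoinSet` while printing a progress line at most every 500ms, re-arming a single `tokio::time::sleep` after each completion. A self-contained sketch of that drain-with-buffered-logging loop, with dummy tasks standing in for tenant shutdowns:

```rust
// Illustrative sketch (not part of the patch): drain a JoinSet while logging
// progress at most every 500ms, in the spirit of the shutdown loop above.
use std::time::Duration;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let started_at = std::time::Instant::now();
    let mut join_set = JoinSet::new();
    for i in 0..8u64 {
        join_set.spawn(async move {
            tokio::time::sleep(Duration::from_millis(100 * i)).await;
        });
    }

    let total = join_set.len();
    const BUFFER_FOR: Duration = Duration::from_millis(500);
    let mut buffering = true;
    let mut buffered = std::pin::pin!(tokio::time::sleep(BUFFER_FOR));

    while !join_set.is_empty() {
        tokio::select! {
            Some(res) = join_set.join_next() => {
                if let Err(e) = res {
                    eprintln!("task failed: {e}");
                }
                if !buffering {
                    // restart the quiet period after each completion
                    buffering = true;
                    buffered.as_mut().reset(tokio::time::Instant::now() + BUFFER_FOR);
                }
            },
            _ = &mut buffered, if buffering => {
                buffering = false;
                println!(
                    "waiting: {} of {} remaining, {}ms elapsed",
                    join_set.len(),
                    total,
                    started_at.elapsed().as_millis()
                );
            }
        }
    }
}
```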
fn delete_resident_layer_file(&self) -> Result<()>; diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 7574554b4e67..fd002ef39866 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -41,7 +41,6 @@ use crate::virtual_file::VirtualFile; use crate::{walrecord, TEMP_FILE_SUFFIX}; use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION}; use anyhow::{bail, ensure, Context, Result}; -use once_cell::sync::OnceCell; use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -52,6 +51,7 @@ use std::ops::Range; use std::os::unix::fs::FileExt; use std::path::{Path, PathBuf}; use std::sync::Arc; +use tokio::sync::OnceCell; use tracing::*; use utils::{ @@ -61,8 +61,8 @@ use utils::{ }; use super::{ - AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, LayerIter, - LayerKeyIter, PathOrConf, PersistentLayerDesc, + AsLayerDesc, DeltaFileName, Layer, LayerAccessStats, LayerAccessStatsReset, PathOrConf, + PersistentLayerDesc, }; /// @@ -189,7 +189,7 @@ pub struct DeltaLayer { access_stats: LayerAccessStats, - inner: OnceCell, + inner: OnceCell>, } impl std::fmt::Debug for DeltaLayer { @@ -242,7 +242,7 @@ impl Layer for DeltaLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx)?; + let inner = self.load(LayerAccessKind::Dump, ctx).await?; println!( "index_start_blk: {}, root {}", @@ -258,10 +258,10 @@ impl Layer for DeltaLayer { tree_reader.dump().await?; - let mut cursor = file.block_cursor(); + let cursor = file.block_cursor(); // A subroutine to dump a single blob - let mut dump_blob = |blob_ref: BlobRef| -> anyhow::Result { + let dump_blob = |blob_ref: BlobRef| -> anyhow::Result { let buf = cursor.read_blob(blob_ref.pos())?; let val = Value::des(&buf)?; let desc = match val { @@ -281,22 +281,24 @@ impl Layer for DeltaLayer { Ok(desc) }; - tree_reader.visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |delta_key, val| { - let blob_ref = BlobRef(val); - let key = DeltaKey::extract_key_from_buf(delta_key); - let lsn = DeltaKey::extract_lsn_from_buf(delta_key); - - let desc = match dump_blob(blob_ref) { - Ok(desc) => desc, - Err(err) => format!("ERROR: {}", err), - }; - println!(" key {} at {}: {}", key, lsn, desc); - true - }, - )?; + tree_reader + .visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |delta_key, val| { + let blob_ref = BlobRef(val); + let key = DeltaKey::extract_key_from_buf(delta_key); + let lsn = DeltaKey::extract_lsn_from_buf(delta_key); + + let desc = match dump_blob(blob_ref) { + Ok(desc) => desc, + Err(err) => format!("ERROR: {}", err), + }; + println!(" key {} at {}: {}", key, lsn, desc); + true + }, + ) + .await?; Ok(()) } @@ -315,7 +317,9 @@ impl Layer for DeltaLayer { { // Open the file and lock the metadata in memory - let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?; + let inner = self + .load(LayerAccessKind::GetValueReconstructData, ctx) + .await?; // Scan the page versions backwards, starting from `lsn`. 
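`DeltaLayer::load` now initializes its inner state lazily through `tokio::sync::OnceCell::get_or_try_init`, so the first caller performs the async load and later callers share the cached `Arc`. A minimal sketch of that pattern, with a dummy payload standing in for `DeltaLayerInner` and the real file parsing:

```rust
// Illustrative sketch (not part of the patch): load-once-on-first-access with
// `tokio::sync::OnceCell`. `Payload` and `expensive_load` are stand-ins for
// `DeltaLayerInner` and the real file open / summary read.
use std::sync::Arc;

use anyhow::Context;
use tokio::sync::OnceCell;

struct Payload {
    summary: String,
}

struct LazyLayer {
    path: String,
    inner: OnceCell<Arc<Payload>>,
}

impl LazyLayer {
    fn new(path: impl Into<String>) -> Self {
        LazyLayer {
            path: path.into(),
            inner: OnceCell::new(),
        }
    }

    async fn load(&self) -> anyhow::Result<&Arc<Payload>> {
        // Concurrent callers race; only one `expensive_load` runs, the rest
        // wait and then share the same cached `Arc<Payload>`.
        self.inner
            .get_or_try_init(|| self.expensive_load())
            .await
            .with_context(|| format!("failed to load layer {}", self.path))
    }

    async fn expensive_load(&self) -> anyhow::Result<Arc<Payload>> {
        // Pretend to read and parse the on-disk file.
        tokio::task::yield_now().await;
        Ok(Arc::new(Payload {
            summary: format!("summary of {}", self.path),
        }))
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let layer = LazyLayer::new("layer-000001");
    let a = layer.load().await?.clone();
    let b = layer.load().await?.clone();
    assert!(Arc::ptr_eq(&a, &b)); // loaded exactly once
    println!("{}", a.summary);
    Ok(())
}
```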
let file = &inner.file; @@ -328,22 +332,24 @@ impl Layer for DeltaLayer { let mut offsets: Vec<(Lsn, u64)> = Vec::new(); - tree_reader.visit(&search_key.0, VisitDirection::Backwards, |key, value| { - let blob_ref = BlobRef(value); - if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { - return false; - } - let entry_lsn = DeltaKey::extract_lsn_from_buf(key); - if entry_lsn < lsn_range.start { - return false; - } - offsets.push((entry_lsn, blob_ref.pos())); + tree_reader + .visit(&search_key.0, VisitDirection::Backwards, |key, value| { + let blob_ref = BlobRef(value); + if key[..KEY_SIZE] != search_key.0[..KEY_SIZE] { + return false; + } + let entry_lsn = DeltaKey::extract_lsn_from_buf(key); + if entry_lsn < lsn_range.start { + return false; + } + offsets.push((entry_lsn, blob_ref.pos())); - !blob_ref.will_init() - })?; + !blob_ref.will_init() + }) + .await?; // Ok, 'offsets' now contains the offsets of all the entries we need to read - let mut cursor = file.block_cursor(); + let cursor = file.block_cursor(); let mut buf = Vec::new(); for (entry_lsn, pos) in offsets { cursor.read_blob_into_buf(pos, &mut buf).with_context(|| { @@ -424,23 +430,6 @@ impl PersistentLayer for DeltaLayer { Some(self.path()) } - fn iter(&self, ctx: &RequestContext) -> Result> { - let inner = self - .load(LayerAccessKind::KeyIter, ctx) - .context("load delta layer")?; - Ok(match DeltaValueIter::new(inner) { - Ok(iter) => Box::new(iter), - Err(err) => Box::new(std::iter::once(Err(err))), - }) - } - - fn key_iter(&self, ctx: &RequestContext) -> Result> { - let inner = self.load(LayerAccessKind::KeyIter, ctx)?; - Ok(Box::new( - DeltaKeyIter::new(inner).context("Layer index is corrupted")?, - )) - } - fn delete_resident_layer_file(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; @@ -510,16 +499,21 @@ impl DeltaLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. /// - fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&DeltaLayerInner> { + async fn load( + &self, + access_kind: LayerAccessKind, + ctx: &RequestContext, + ) -> Result<&Arc> { self.access_stats .record_access(access_kind, ctx.task_kind()); // Quick exit if already loaded self.inner .get_or_try_init(|| self.load_inner()) + .await .with_context(|| format!("Failed to load delta layer {}", self.path().display())) } - fn load_inner(&self) -> Result { + async fn load_inner(&self) -> Result> { let path = self.path(); let file = VirtualFile::open(&path) @@ -554,11 +548,11 @@ impl DeltaLayer { debug!("loaded from {}", &path.display()); - Ok(DeltaLayerInner { + Ok(Arc::new(DeltaLayerInner { file, index_start_blk: actual_summary.index_start_blk, index_root_blk: actual_summary.index_root_blk, - }) + })) } /// Create a DeltaLayer struct representing an existing file on disk. @@ -580,7 +574,7 @@ impl DeltaLayer { file_size, ), access_stats, - inner: once_cell::sync::OnceCell::new(), + inner: OnceCell::new(), } } @@ -607,7 +601,7 @@ impl DeltaLayer { metadata.len(), ), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), - inner: once_cell::sync::OnceCell::new(), + inner: OnceCell::new(), }) } @@ -623,6 +617,30 @@ impl DeltaLayer { &self.layer_name(), ) } + + /// Obtains all keys and value references stored in the layer + /// + /// The value can be obtained via the [`ValueRef::load`] function. 
+ pub async fn load_val_refs(&self, ctx: &RequestContext) -> Result> { + let inner = self + .load(LayerAccessKind::KeyIter, ctx) + .await + .context("load delta layer")?; + DeltaLayerInner::load_val_refs(inner) + .await + .context("Layer index is corrupted") + } + + /// Loads all keys stored in the layer. Returns key, lsn and value size. + pub async fn load_keys(&self, ctx: &RequestContext) -> Result> { + let inner = self + .load(LayerAccessKind::KeyIter, ctx) + .await + .context("load delta layer keys")?; + DeltaLayerInner::load_keys(inner) + .await + .context("Layer index is corrupted") + } } /// A builder object for constructing a new delta layer. @@ -771,7 +789,7 @@ impl DeltaLayerWriterInner { metadata.len(), ), access_stats: LayerAccessStats::empty_will_record_residence_event_later(), - inner: once_cell::sync::OnceCell::new(), + inner: OnceCell::new(), }; // fsync the file @@ -893,168 +911,94 @@ impl Drop for DeltaLayerWriter { } } -/// -/// Iterator over all key-value pairse stored in a delta layer -/// -/// FIXME: This creates a Vector to hold the offsets of all key value pairs. -/// That takes up quite a lot of memory. Should do this in a more streaming -/// fashion. -/// -struct DeltaValueIter<'a> { - all_offsets: Vec<(DeltaKey, BlobRef)>, - next_idx: usize, - reader: BlockCursor>, -} - -struct Adapter<'a>(&'a DeltaLayerInner); - -impl<'a> BlockReader for Adapter<'a> { - type BlockLease = PageReadGuard<'static>; - - fn read_blk(&self, blknum: u32) -> Result { - self.0.file.read_blk(blknum) - } -} - -impl<'a> Iterator for DeltaValueIter<'a> { - type Item = Result<(Key, Lsn, Value)>; - - fn next(&mut self) -> Option { - self.next_res().transpose() - } -} - -impl<'a> DeltaValueIter<'a> { - fn new(inner: &'a DeltaLayerInner) -> Result { - let file = &inner.file; +impl DeltaLayerInner { + async fn load_val_refs(this: &Arc) -> Result> { + let file = &this.file; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - inner.index_start_blk, - inner.index_root_blk, + this.index_start_blk, + this.index_root_blk, file, ); - let mut all_offsets: Vec<(DeltaKey, BlobRef)> = Vec::new(); - tree_reader.visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |key, value| { - all_offsets.push((DeltaKey::from_slice(key), BlobRef(value))); - true - }, - )?; - - let iter = DeltaValueIter { - all_offsets, - next_idx: 0, - reader: BlockCursor::new(Adapter(inner)), - }; - - Ok(iter) - } - - fn next_res(&mut self) -> Result> { - if self.next_idx < self.all_offsets.len() { - let (delta_key, blob_ref) = &self.all_offsets[self.next_idx]; - - let key = delta_key.key(); - let lsn = delta_key.lsn(); - - let buf = self.reader.read_blob(blob_ref.pos())?; - let val = Value::des(&buf)?; - self.next_idx += 1; - Ok(Some((key, lsn, val))) - } else { - Ok(None) - } - } -} -/// -/// Iterator over all keys stored in a delta layer -/// -/// FIXME: This creates a Vector to hold all keys. -/// That takes up quite a lot of memory. Should do this in a more streaming -/// fashion. 
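With the synchronous `iter()` / `key_iter()` iterators removed above, callers now fetch the whole layer index in one async call and read individual values lazily through `ValueRef::load`. A rough usage sketch; the function name, printing, and error handling are illustrative and not part of the patch:

```rust
// Sketch only: consuming the new bulk-loading API in place of the removed
// iterators. `DeltaLayer` and `RequestContext` are the patch's real types;
// this helper itself is hypothetical.
async fn dump_delta_layer(layer: &DeltaLayer, ctx: &RequestContext) -> anyhow::Result<()> {
    // Roughly one entry per key: (key, first lsn, total size of that key's values).
    for (key, lsn, size) in layer.load_keys(ctx).await? {
        println!("key {key} at {lsn}: {size} bytes");
    }
    // One entry per (key, lsn); the value bytes are only read when load() is called.
    for (key, lsn, value_ref) in layer.load_val_refs(ctx).await? {
        let value = value_ref.load()?;
        println!("key {key} at {lsn}: {value:?}");
    }
    Ok(())
}
```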
-/// -struct DeltaKeyIter { - all_keys: Vec<(DeltaKey, u64)>, - next_idx: usize, -} - -impl Iterator for DeltaKeyIter { - type Item = (Key, Lsn, u64); - - fn next(&mut self) -> Option { - if self.next_idx < self.all_keys.len() { - let (delta_key, size) = &self.all_keys[self.next_idx]; - - let key = delta_key.key(); - let lsn = delta_key.lsn(); - - self.next_idx += 1; - Some((key, lsn, *size)) - } else { - None - } - } -} - -impl<'a> DeltaKeyIter { - fn new(inner: &'a DeltaLayerInner) -> Result { - let file = &inner.file; + let mut all_offsets = Vec::<(Key, Lsn, ValueRef)>::new(); + tree_reader + .visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + let delta_key = DeltaKey::from_slice(key); + let val_ref = ValueRef { + blob_ref: BlobRef(value), + reader: BlockCursor::new(Adapter(this.clone())), + }; + all_offsets.push((delta_key.key(), delta_key.lsn(), val_ref)); + true + }, + ) + .await?; + + Ok(all_offsets) + } + async fn load_keys(&self) -> Result> { + let file = &self.file; let tree_reader = DiskBtreeReader::<_, DELTA_KEY_SIZE>::new( - inner.index_start_blk, - inner.index_root_blk, + self.index_start_blk, + self.index_root_blk, file, ); - let mut all_keys: Vec<(DeltaKey, u64)> = Vec::new(); - tree_reader.visit( - &[0u8; DELTA_KEY_SIZE], - VisitDirection::Forwards, - |key, value| { - let delta_key = DeltaKey::from_slice(key); - let pos = BlobRef(value).pos(); - if let Some(last) = all_keys.last_mut() { - if last.0.key() == delta_key.key() { - return true; - } else { - // subtract offset of new key BLOB and first blob of this key - // to get total size if values associated with this key - let first_pos = last.1; - last.1 = pos - first_pos; + let mut all_keys: Vec<(Key, Lsn, u64)> = Vec::new(); + tree_reader + .visit( + &[0u8; DELTA_KEY_SIZE], + VisitDirection::Forwards, + |key, value| { + let delta_key = DeltaKey::from_slice(key); + let pos = BlobRef(value).pos(); + if let Some(last) = all_keys.last_mut() { + if last.0 == delta_key.key() { + return true; + } else { + // subtract offset of new key BLOB and first blob of this key + // to get total size if values associated with this key + let first_pos = last.2; + last.2 = pos - first_pos; + } } - } - all_keys.push((delta_key, pos)); - true - }, - )?; + all_keys.push((delta_key.key(), delta_key.lsn(), pos)); + true + }, + ) + .await?; if let Some(last) = all_keys.last_mut() { // Last key occupies all space till end of layer - last.1 = std::fs::metadata(&file.file.path)?.len() - last.1; + last.2 = std::fs::metadata(&file.file.path)?.len() - last.2; } - let iter = DeltaKeyIter { - all_keys, - next_idx: 0, - }; + Ok(all_keys) + } +} + +/// Reference to an on-disk value +pub struct ValueRef { + blob_ref: BlobRef, + reader: BlockCursor, +} - Ok(iter) +impl ValueRef { + /// Loads the value from disk + pub fn load(&self) -> Result { + let buf = self.reader.read_blob(self.blob_ref.pos())?; + let val = Value::des(&buf)?; + Ok(val) } } -#[cfg(test)] -mod test { - use super::DeltaKeyIter; - use super::DeltaLayer; - use super::DeltaValueIter; - - // We will soon need the iters to be send in the compaction code. 
- // Cf https://github.com/neondatabase/neon/pull/4462#issuecomment-1587398883 - // Cf https://github.com/neondatabase/neon/issues/4471 - #[test] - fn is_send() { - fn assert_send() {} - assert_send::(); - assert_send::(); - assert_send::(); +struct Adapter(Arc); + +impl BlockReader for Adapter { + type BlockLease = PageReadGuard<'static>; + + fn read_blk(&self, blknum: u32) -> Result { + self.0.file.read_blk(blknum) } } diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index a5b9d8834e0a..6c19d0a871d2 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,7 +38,6 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; -use once_cell::sync::OnceCell; use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -48,6 +47,7 @@ use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; +use tokio::sync::OnceCell; use tracing::*; use utils::{ @@ -57,9 +57,7 @@ use utils::{ }; use super::filename::ImageFileName; -use super::{ - AsLayerDesc, Layer, LayerAccessStatsReset, LayerIter, PathOrConf, PersistentLayerDesc, -}; +use super::{AsLayerDesc, Layer, LayerAccessStatsReset, PathOrConf, PersistentLayerDesc}; /// /// Header stored in the beginning of the file @@ -170,17 +168,19 @@ impl Layer for ImageLayer { return Ok(()); } - let inner = self.load(LayerAccessKind::Dump, ctx)?; + let inner = self.load(LayerAccessKind::Dump, ctx).await?; let file = &inner.file; let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file); tree_reader.dump().await?; - tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| { - println!("key: {} offset {}", hex::encode(key), value); - true - })?; + tree_reader + .visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| { + println!("key: {} offset {}", hex::encode(key), value); + true + }) + .await?; Ok(()) } @@ -197,14 +197,16 @@ impl Layer for ImageLayer { assert!(lsn_range.start >= self.lsn); assert!(lsn_range.end >= self.lsn); - let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?; + let inner = self + .load(LayerAccessKind::GetValueReconstructData, ctx) + .await?; let file = &inner.file; let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; key.write_to_byte_slice(&mut keybuf); - if let Some(offset) = tree_reader.get(&keybuf)? { + if let Some(offset) = tree_reader.get(&keybuf).await? { let blob = file.block_cursor().read_blob(offset).with_context(|| { format!( "failed to read value from data file {} at offset {}", @@ -255,10 +257,6 @@ impl PersistentLayer for ImageLayer { Some(self.path()) } - fn iter(&self, _ctx: &RequestContext) -> Result> { - unimplemented!(); - } - fn delete_resident_layer_file(&self) -> Result<()> { // delete underlying file fs::remove_file(self.path())?; @@ -318,7 +316,11 @@ impl ImageLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. 
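Both the delta and the image layer switch from `once_cell::sync::OnceCell` to `tokio::sync::OnceCell`, so the first accessor can run the file-opening initializer as a future via `get_or_try_init(..).await` instead of blocking an executor thread. A minimal sketch of that pattern with placeholder types (not the real structs):

```rust
use std::sync::Arc;
use tokio::sync::OnceCell;

/// Hypothetical stand-in for the lazily loaded on-disk state.
struct Inner {
    index_start_blk: u32,
}

struct Layer {
    inner: OnceCell<Arc<Inner>>,
}

impl Layer {
    /// The first caller runs the async initializer; later callers get the cached Arc.
    async fn load(&self) -> anyhow::Result<&Arc<Inner>> {
        self.inner
            .get_or_try_init(|| async {
                // e.g. open the file and read the summary block here
                Ok::<_, anyhow::Error>(Arc::new(Inner { index_start_blk: 1 }))
            })
            .await
    }
}
```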
/// - fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> { + async fn load( + &self, + access_kind: LayerAccessKind, + ctx: &RequestContext, + ) -> Result<&ImageLayerInner> { self.access_stats .record_access(access_kind, ctx.task_kind()); loop { @@ -327,11 +329,12 @@ impl ImageLayer { } self.inner .get_or_try_init(|| self.load_inner()) + .await .with_context(|| format!("Failed to load image layer {}", self.path().display()))?; } } - fn load_inner(&self) -> Result { + async fn load_inner(&self) -> Result { let path = self.path(); // Open the file if it's not open already. diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index 31d0b5997a5c..3d222fcb1ea3 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -151,7 +151,7 @@ impl Layer for InMemoryLayer { return Ok(()); } - let mut cursor = inner.file.block_cursor(); + let cursor = inner.file.block_cursor(); let mut buf = Vec::new(); for (key, vec_map) in inner.index.iter() { for (lsn, pos) in vec_map.as_slice() { @@ -196,7 +196,7 @@ impl Layer for InMemoryLayer { let inner = self.inner.read().unwrap(); - let mut reader = inner.file.block_cursor(); + let reader = inner.file.block_cursor(); // Scan the page versions backwards, starting from `lsn`. if let Some(vec_map) = inner.index.get(&key) { @@ -354,7 +354,7 @@ impl InMemoryLayer { let mut buf = Vec::new(); - let mut cursor = inner.file.block_cursor(); + let cursor = inner.file.block_cursor(); let mut keys: Vec<(&Key, &VecMap)> = inner.index.iter().collect(); keys.sort_by_key(|k| k.0); diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs index e5511d6051cb..36a6593779f5 100644 --- a/pageserver/src/tenant/storage_layer/remote_layer.rs +++ b/pageserver/src/tenant/storage_layer/remote_layer.rs @@ -20,8 +20,8 @@ use utils::{ use super::filename::{DeltaFileName, ImageFileName}; use super::{ - AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, LayerIter, - LayerKeyIter, LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, + AsLayerDesc, DeltaLayer, ImageLayer, LayerAccessStats, LayerAccessStatsReset, + LayerResidenceStatus, PersistentLayer, PersistentLayerDesc, }; /// RemoteLayer is a not yet downloaded [`ImageLayer`] or @@ -129,14 +129,6 @@ impl PersistentLayer for RemoteLayer { None } - fn iter(&self, _ctx: &RequestContext) -> Result> { - bail!("cannot iterate a remote layer"); - } - - fn key_iter(&self, _ctx: &RequestContext) -> Result> { - bail!("cannot iterate a remote layer"); - } - fn delete_resident_layer_file(&self) -> Result<()> { bail!("remote layer has no layer file"); } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index a758d4da23ec..c067a844713c 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -73,17 +73,13 @@ pub fn start_background_loops( /// async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { let wait_duration = Duration::from_secs(2); - info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download); let mut first = true; loop { - trace!("waking up"); - tokio::select! 
{ _ = cancel.cancelled() => { - info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { @@ -126,15 +122,12 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { .await .is_ok() { - info!("received cancellation request during idling"); break; } } } .await; TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - - trace!("compaction loop stopped."); } /// @@ -142,7 +135,6 @@ async fn compaction_loop(tenant: Arc, cancel: CancellationToken) { /// async fn gc_loop(tenant: Arc, cancel: CancellationToken) { let wait_duration = Duration::from_secs(2); - info!("starting"); TENANT_TASK_EVENTS.with_label_values(&["start"]).inc(); async { // GC might require downloading, to find the cutoff LSN that corresponds to the @@ -151,11 +143,8 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download); let mut first = true; loop { - trace!("waking up"); - tokio::select! { _ = cancel.cancelled() => { - info!("received cancellation request"); return; }, tenant_wait_result = wait_for_active_tenant(&tenant) => match tenant_wait_result { @@ -200,14 +189,12 @@ async fn gc_loop(tenant: Arc, cancel: CancellationToken) { .await .is_ok() { - info!("received cancellation request during idling"); break; } } } .await; TENANT_TASK_EVENTS.with_label_values(&["stop"]).inc(); - trace!("GC loop stopped."); } async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { @@ -232,7 +219,6 @@ async fn wait_for_active_tenant(tenant: &Arc) -> ControlFlow<()> { } } Err(_sender_dropped_error) => { - info!("Tenant dropped the state updates sender, quitting waiting for tenant and the task loop"); return ControlFlow::Break(()); } } diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 34211fb7142d..0e715998ab09 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -19,6 +19,7 @@ use pageserver_api::models::{ use remote_storage::GenericRemoteStorage; use serde_with::serde_as; use storage_broker::BrokerClientChannel; +use tokio::runtime::Handle; use tokio::sync::{oneshot, watch, TryAcquireError}; use tokio_util::sync::CancellationToken; use tracing::*; @@ -528,7 +529,7 @@ impl Timeline { size } - pub fn get_resident_physical_size(&self) -> u64 { + pub fn resident_physical_size(&self) -> u64 { self.metrics.resident_physical_size_gauge.get() } @@ -697,6 +698,9 @@ impl Timeline { Err(CompactionError::DownloadRequired(rls)) => { anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len()) } + Err(CompactionError::ShuttingDown) => { + return Ok(()); + } Err(CompactionError::Other(e)) => { return Err(e); } @@ -778,7 +782,8 @@ impl Timeline { let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await); // Is the timeline being deleted? if self.is_stopping() { - return Err(anyhow::anyhow!("timeline is Stopping").into()); + trace!("Dropping out of compaction on timeline shutdown"); + return Err(CompactionError::ShuttingDown); } let target_file_size = self.get_checkpoint_distance(); @@ -3235,6 +3240,8 @@ enum CompactionError { /// This should not happen repeatedly, but will be retried once by top-level /// `Timeline::compact`. DownloadRequired(Vec>), + /// The timeline or pageserver is shutting down + ShuttingDown, /// Compaction cannot be done right now; page reconstruction and so on. 
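The timeline.rs hunks that follow replace the itertools k-merge over per-layer `iter()`/`key_iter()` with eagerly collected vectors: each delta layer's keys and value references are loaded through the new async API, bridged from the still-synchronous compaction code via `Handle::current().block_on`, and then sorted by `(key, lsn)`. A reduced sketch of that bridging shape, assuming the caller runs on a blocking thread (e.g. under `spawn_blocking` or `block_in_place`):

```rust
use tokio::runtime::Handle;

/// Hypothetical async loader standing in for `load_keys` / `load_val_refs`.
async fn load_index() -> anyhow::Result<Vec<u64>> {
    Ok(vec![3, 1, 2])
}

/// Synchronous code that still needs the async loader: grab the runtime
/// handle and block on the future. `Handle::block_on` panics if called from
/// inside an async task, so this only works from a blocking section.
fn collect_all() -> anyhow::Result<Vec<u64>> {
    let mut all = Vec::new();
    all.extend(Handle::current().block_on(load_index())?);
    // Sorting the concatenation of already-sorted per-layer runs replaces the
    // old k-way merge; the stable sort exploits the presorted sub-ranges.
    all.sort();
    Ok(all)
}
```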
Other(anyhow::Error), } @@ -3512,10 +3519,41 @@ impl Timeline { // min-heap (reserve space for one more element added before eviction) let mut heap: BinaryHeap = BinaryHeap::with_capacity(max_holes + 1); let mut prev: Option = None; - for (next_key, _next_lsn, _size) in itertools::process_results( - deltas_to_compact.iter().map(|l| l.key_iter(ctx)), - |iter_iter| iter_iter.kmerge_by(|a, b| a.0 < b.0), - )? { + + let mut all_value_refs = Vec::new(); + for l in deltas_to_compact.iter() { + // TODO: replace this with an await once we fully go async + all_value_refs.extend( + Handle::current().block_on( + l.clone() + .downcast_delta_layer() + .expect("delta layer") + .load_val_refs(ctx), + )?, + ); + } + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_value_refs.sort_by_key(|(key, lsn, _value_ref)| (*key, *lsn)); + + let mut all_keys = Vec::new(); + for l in deltas_to_compact.iter() { + // TODO: replace this with an await once we fully go async + all_keys.extend( + Handle::current().block_on( + l.clone() + .downcast_delta_layer() + .expect("delta layer") + .load_keys(ctx), + )?, + ); + } + // The current stdlib sorting implementation is designed in a way where it is + // particularly fast where the slice is made up of sorted sub-ranges. + all_keys.sort_by_key(|(key, lsn, _size)| (*key, *lsn)); + + for (next_key, _next_lsn, _size) in all_keys.iter() { + let next_key = *next_key; if let Some(prev_key) = prev { // just first fast filter if next_key.to_i128() - prev_key.to_i128() >= min_hole_range { @@ -3548,34 +3586,10 @@ impl Timeline { // This iterator walks through all key-value pairs from all the layers // we're compacting, in key, LSN order. - let all_values_iter = itertools::process_results( - deltas_to_compact.iter().map(|l| l.iter(ctx)), - |iter_iter| { - iter_iter.kmerge_by(|a, b| { - if let Ok((a_key, a_lsn, _)) = a { - if let Ok((b_key, b_lsn, _)) = b { - (a_key, a_lsn) < (b_key, b_lsn) - } else { - false - } - } else { - true - } - }) - }, - )?; + let all_values_iter = all_value_refs.into_iter(); // This iterator walks through all keys and is needed to calculate size used by each key - let mut all_keys_iter = itertools::process_results( - deltas_to_compact.iter().map(|l| l.key_iter(ctx)), - |iter_iter| { - iter_iter.kmerge_by(|a, b| { - let (a_key, a_lsn, _) = a; - let (b_key, b_lsn, _) = b; - (a_key, a_lsn) < (b_key, b_lsn) - }) - }, - )?; + let mut all_keys_iter = all_keys.into_iter(); stats.prepare_iterators_micros = stats.read_lock_drop_micros.till_now(); @@ -3629,8 +3643,8 @@ impl Timeline { let mut key_values_total_size = 0u64; let mut dup_start_lsn: Lsn = Lsn::INVALID; // start LSN of layer containing values of the single key let mut dup_end_lsn: Lsn = Lsn::INVALID; // end LSN of layer containing values of the single key - for x in all_values_iter { - let (key, lsn, value) = x?; + for (key, lsn, value_ref) in all_values_iter { + let value = value_ref.load()?; let same_key = prev_key.map_or(false, |prev_key| prev_key == key); // We need to check key boundaries once we reach next key or end of layer with the same key if !same_key || lsn == dup_end_lsn { diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs index 5485cc42b485..3e407dda572d 100644 --- a/pageserver/src/tenant/timeline/eviction_task.rs +++ b/pageserver/src/tenant/timeline/eviction_task.rs @@ -78,9 +78,6 @@ impl Timeline { #[instrument(skip_all, 
fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))] async fn eviction_task(self: Arc, cancel: CancellationToken) { - scopeguard::defer! { - info!("eviction task finishing"); - } use crate::tenant::tasks::random_init_delay; { let policy = self.get_eviction_policy(); @@ -308,8 +305,13 @@ impl Timeline { ctx: &RequestContext, ) -> ControlFlow<()> { let mut state = self.eviction_task_timeline_state.lock().await; + + // Only do the imitate_layer accesses approximately as often as the threshold. A little + // more frequently, to avoid this period racing with the threshold/period-th eviction iteration. + let inter_imitate_period = p.threshold.checked_sub(p.period).unwrap_or(p.threshold); + match state.last_layer_access_imitation { - Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ } + Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { self.imitate_timeline_cached_layer_accesses(cancel, ctx) .await; @@ -332,7 +334,7 @@ impl Timeline { }; let mut state = tenant.eviction_task_tenant_state.lock().await; match state.last_layer_access_imitation { - Some(ts) if ts.elapsed() < p.threshold => { /* no need to run */ } + Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ } _ => { self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel) .await; diff --git a/pgxn/neon/Makefile b/pgxn/neon/Makefile index 194802347233..53917d8bc48b 100644 --- a/pgxn/neon/Makefile +++ b/pgxn/neon/Makefile @@ -4,6 +4,7 @@ MODULE_big = neon OBJS = \ $(WIN32RES) \ + extension_server.o \ file_cache.o \ libpagestore.o \ libpqwalproposer.o \ diff --git a/pgxn/neon/extension_server.c b/pgxn/neon/extension_server.c new file mode 100644 index 000000000000..6053425de0e5 --- /dev/null +++ b/pgxn/neon/extension_server.c @@ -0,0 +1,103 @@ + +/*------------------------------------------------------------------------- + * + * extension_server.c + * Request compute_ctl to download extension files. + * + * IDENTIFICATION + * contrib/neon/extension_server.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" +#include "tcop/pquery.h" +#include "tcop/utility.h" +#include "access/xact.h" +#include "utils/hsearch.h" +#include "utils/memutils.h" +#include "commands/defrem.h" +#include "miscadmin.h" +#include "utils/acl.h" +#include "fmgr.h" +#include "utils/guc.h" +#include "port.h" +#include "fmgr.h" + +#include + +static int extension_server_port = 0; + +static download_extension_file_hook_type prev_download_extension_file_hook = NULL; + +// to download all SQL (and data) files for an extension: +// curl -X POST http://localhost:8080/extension_server/postgis +// it covers two possible extension files layouts: +// 1. extension_name--version--platform.sql +// 2. extension_name/extension_name--version.sql +// extension_name/extra_files.csv +// +// to download specific library file: +// curl -X POST http://localhost:8080/extension_server/postgis-3.so?is_library=true +static bool +neon_download_extension_file_http(const char *filename, bool is_library) +{ + CURL *curl; + CURLcode res; + char *compute_ctl_url; + char *postdata; + bool ret = false; + + if ((curl = curl_easy_init()) == NULL) + { + elog(ERROR, "Failed to initialize curl handle"); + } + + compute_ctl_url = psprintf("http://localhost:%d/extension_server/%s%s", + extension_server_port, filename, is_library ? 
"?is_library=true" : ""); + + elog(LOG, "Sending request to compute_ctl: %s", compute_ctl_url); + + curl_easy_setopt(curl, CURLOPT_CUSTOMREQUEST, "POST"); + curl_easy_setopt(curl, CURLOPT_URL, compute_ctl_url); + curl_easy_setopt(curl, CURLOPT_TIMEOUT, 3L /* seconds */); + + if (curl) + { + /* Perform the request, res will get the return code */ + res = curl_easy_perform(curl); + /* Check for errors */ + if (res == CURLE_OK) + { + ret = true; + } + else + { + // Don't error here because postgres will try to find the file + // and will fail with some proper error message if it's not found. + elog(WARNING, "neon_download_extension_file_http failed: %s\n", curl_easy_strerror(res)); + } + + /* always cleanup */ + curl_easy_cleanup(curl); + } + + return ret; +} + +void pg_init_extension_server() +{ + // Port to connect to compute_ctl on localhost + // to request extension files. + DefineCustomIntVariable("neon.extension_server_port", + "connection string to the compute_ctl", + NULL, + &extension_server_port, + 0, 0, INT_MAX, + PGC_POSTMASTER, + 0, /* no flags required */ + NULL, NULL, NULL); + + // set download_extension_file_hook + prev_download_extension_file_hook = download_extension_file_hook; + download_extension_file_hook = neon_download_extension_file_http; +} diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 02795bc8b8c7..262814c81811 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -172,7 +172,7 @@ lfc_change_limit_hook(int newval, void *extra) { lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); if (lfc_desc < 0) { - elog(LOG, "Failed to open file cache %s: %m", lfc_path); + elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path); lfc_size_limit = 0; /* disable file cache */ return; } @@ -557,7 +557,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, Assert(victim->access_count == 0); entry->offset = victim->offset; /* grab victim's chunk */ hash_search(lfc_hash, &victim->key, HASH_REMOVE, NULL); - elog(LOG, "Swap file cache page"); + elog(DEBUG2, "Swap file cache page"); } else { @@ -574,7 +574,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, { lfc_desc = BasicOpenFile(lfc_path, O_RDWR|O_CREAT); if (lfc_desc < 0) { - elog(LOG, "Failed to open file cache %s: %m", lfc_path); + elog(WARNING, "Failed to open file cache %s: %m, disabling file cache", lfc_path); lfc_size_limit = 0; /* disable file cache */ } } @@ -583,7 +583,7 @@ lfc_write(RelFileNode rnode, ForkNumber forkNum, BlockNumber blkno, rc = pwrite(lfc_desc, buffer, BLCKSZ, ((off_t)entry->offset*BLOCKS_PER_CHUNK + chunk_offs)*BLCKSZ); if (rc != BLCKSZ) { - elog(INFO, "Failed to write file cache: %m"); + elog(WARNING, "Failed to write file cache: %m, disabling file cache"); lfc_size_limit = 0; /* disable file cache */ } } diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index b45d7cfc32f4..c7211ea05ac9 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -35,8 +35,11 @@ _PG_init(void) { pg_init_libpagestore(); pg_init_walproposer(); + InitControlPlaneConnector(); + pg_init_extension_server(); + // Important: This must happen after other parts of the extension // are loaded, otherwise any settings to GUCs that were set before // the extension was loaded will be removed. 
diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 60d321a9451f..2610da431128 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -21,6 +21,8 @@ extern char *neon_tenant; extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); +extern void pg_init_extension_server(void); + /* * Returns true if we shouldn't do REDO on that block in record indicated by * block_id; false otherwise. diff --git a/poetry.lock b/poetry.lock index 74cec84a4049..63e756a4c06b 100644 --- a/poetry.lock +++ b/poetry.lock @@ -887,34 +887,34 @@ files = [ [[package]] name = "cryptography" -version = "41.0.2" +version = "41.0.3" description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers." optional = false python-versions = ">=3.7" files = [ - {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:01f1d9e537f9a15b037d5d9ee442b8c22e3ae11ce65ea1f3316a41c78756b711"}, - {file = "cryptography-41.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:079347de771f9282fbfe0e0236c716686950c19dee1b76240ab09ce1624d76d7"}, - {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:439c3cc4c0d42fa999b83ded80a9a1fb54d53c58d6e59234cfe97f241e6c781d"}, - {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f14ad275364c8b4e525d018f6716537ae7b6d369c094805cae45300847e0894f"}, - {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:84609ade00a6ec59a89729e87a503c6e36af98ddcd566d5f3be52e29ba993182"}, - {file = "cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:49c3222bb8f8e800aead2e376cbef687bc9e3cb9b58b29a261210456a7783d83"}, - {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:d73f419a56d74fef257955f51b18d046f3506270a5fd2ac5febbfa259d6c0fa5"}, - {file = "cryptography-41.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:2a034bf7d9ca894720f2ec1d8b7b5832d7e363571828037f9e0c4f18c1b58a58"}, - {file = "cryptography-41.0.2-cp37-abi3-win32.whl", hash = "sha256:d124682c7a23c9764e54ca9ab5b308b14b18eba02722b8659fb238546de83a76"}, - {file = "cryptography-41.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:9c3fe6534d59d071ee82081ca3d71eed3210f76ebd0361798c74abc2bcf347d4"}, - {file = "cryptography-41.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:a719399b99377b218dac6cf547b6ec54e6ef20207b6165126a280b0ce97e0d2a"}, - {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:182be4171f9332b6741ee818ec27daff9fb00349f706629f5cbf417bd50e66fd"}, - {file = "cryptography-41.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7a9a3bced53b7f09da251685224d6a260c3cb291768f54954e28f03ef14e3766"}, - {file = "cryptography-41.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f0dc40e6f7aa37af01aba07277d3d64d5a03dc66d682097541ec4da03cc140ee"}, - {file = "cryptography-41.0.2-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:674b669d5daa64206c38e507808aae49904c988fa0a71c935e7006a3e1e83831"}, - {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7af244b012711a26196450d34f483357e42aeddb04128885d95a69bd8b14b69b"}, - {file = "cryptography-41.0.2-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9b6d717393dbae53d4e52684ef4f022444fc1cce3c48c38cb74fca29e1f08eaa"}, - {file = "cryptography-41.0.2-pp38-pypy38_pp73-win_amd64.whl", hash = 
"sha256:192255f539d7a89f2102d07d7375b1e0a81f7478925b3bc2e0549ebf739dae0e"}, - {file = "cryptography-41.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:f772610fe364372de33d76edcd313636a25684edb94cee53fd790195f5989d14"}, - {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:b332cba64d99a70c1e0836902720887fb4529ea49ea7f5462cf6640e095e11d2"}, - {file = "cryptography-41.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:9a6673c1828db6270b76b22cc696f40cde9043eb90373da5c2f8f2158957f42f"}, - {file = "cryptography-41.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:342f3767e25876751e14f8459ad85e77e660537ca0a066e10e75df9c9e9099f0"}, - {file = "cryptography-41.0.2.tar.gz", hash = "sha256:7d230bf856164de164ecb615ccc14c7fc6de6906ddd5b491f3af90d3514c925c"}, + {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:652627a055cb52a84f8c448185922241dd5217443ca194d5739b44612c5e6507"}, + {file = "cryptography-41.0.3-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:8f09daa483aedea50d249ef98ed500569841d6498aa9c9f4b0531b9964658922"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:4fd871184321100fb400d759ad0cddddf284c4b696568204d281c902fc7b0d81"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:84537453d57f55a50a5b6835622ee405816999a7113267739a1b4581f83535bd"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:3fb248989b6363906827284cd20cca63bb1a757e0a2864d4c1682a985e3dca47"}, + {file = "cryptography-41.0.3-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:42cb413e01a5d36da9929baa9d70ca90d90b969269e5a12d39c1e0d475010116"}, + {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:aeb57c421b34af8f9fe830e1955bf493a86a7996cc1338fe41b30047d16e962c"}, + {file = "cryptography-41.0.3-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:6af1c6387c531cd364b72c28daa29232162010d952ceb7e5ca8e2827526aceae"}, + {file = "cryptography-41.0.3-cp37-abi3-win32.whl", hash = "sha256:0d09fb5356f975974dbcb595ad2d178305e5050656affb7890a1583f5e02a306"}, + {file = "cryptography-41.0.3-cp37-abi3-win_amd64.whl", hash = "sha256:a983e441a00a9d57a4d7c91b3116a37ae602907a7618b882c8013b5762e80574"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:5259cb659aa43005eb55a0e4ff2c825ca111a0da1814202c64d28a985d33b087"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:67e120e9a577c64fe1f611e53b30b3e69744e5910ff3b6e97e935aeb96005858"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:7efe8041897fe7a50863e51b77789b657a133c75c3b094e51b5e4b5cec7bf906"}, + {file = "cryptography-41.0.3-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:ce785cf81a7bdade534297ef9e490ddff800d956625020ab2ec2780a556c313e"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:57a51b89f954f216a81c9d057bf1a24e2f36e764a1ca9a501a6964eb4a6800dd"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:4c2f0d35703d61002a2bbdcf15548ebb701cfdd83cdc12471d2bae80878a4207"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:23c2d778cf829f7d0ae180600b17e9fceea3c2ef8b31a99e3c694cbbf3a24b84"}, + {file = "cryptography-41.0.3-pp38-pypy38_pp73-win_amd64.whl", hash = 
"sha256:95dd7f261bb76948b52a5330ba5202b91a26fbac13ad0e9fc8a3ac04752058c7"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:41d7aa7cdfded09b3d73a47f429c298e80796c8e825ddfadc84c8a7f12df212d"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d0d651aa754ef58d75cec6edfbd21259d93810b73f6ec246436a21b7841908de"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ab8de0d091acbf778f74286f4989cf3d1528336af1b59f3e5d2ebca8b5fe49e1"}, + {file = "cryptography-41.0.3-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:a74fbcdb2a0d46fe00504f571a2a540532f4c188e6ccf26f1f178480117b33c4"}, + {file = "cryptography-41.0.3.tar.gz", hash = "sha256:6d192741113ef5e30d89dcb5b956ef4e1578f304708701b8b73d38e3e1461f34"}, ] [package.dependencies] diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 5ed0f747c274..d8ee1e8b64d7 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -10,7 +10,7 @@ use crate::{ stream::PqStream, }; use tokio::io::{AsyncRead, AsyncWrite}; -use tracing::info; +use tracing::{error, info, warn}; pub(super) async fn authenticate( api: &impl console::Api, @@ -55,11 +55,17 @@ pub(super) async fn authenticate( let mut num_retries = 0; let mut node = loop { let wake_res = api.wake_compute(extra, creds).await; - match handle_try_wake(wake_res, num_retries)? { - ControlFlow::Continue(_) => num_retries += 1, - ControlFlow::Break(n) => break n, + match handle_try_wake(wake_res, num_retries) { + Err(e) => { + error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + return Err(e.into()); + } + Ok(ControlFlow::Continue(e)) => { + warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + num_retries += 1; + } + Ok(ControlFlow::Break(n)) => break n, } - info!(num_retries, "retrying wake compute"); }; if let Some(keys) = scram_keys { use tokio_postgres::config::AuthKeys; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b1cf2a855928..e96b79ed924a 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -230,7 +230,8 @@ pub struct PostgresConnection { } impl ConnCfg { - async fn do_connect( + /// Connect to a corresponding compute node. + pub async fn connect( &self, allow_self_signed_compute: bool, timeout: Duration, @@ -270,20 +271,6 @@ impl ConnCfg { Ok(connection) } - - /// Connect to a corresponding compute node. - pub async fn connect( - &self, - allow_self_signed_compute: bool, - timeout: Duration, - ) -> Result { - self.do_connect(allow_self_signed_compute, timeout) - .inspect_err(|err| { - // Immediately log the error we have at our disposal. - error!("couldn't connect to compute node: {err}"); - }) - .await - } } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs index c4be7e1f08f0..d63f902ac1f2 100644 --- a/proxy/src/metrics.rs +++ b/proxy/src/metrics.rs @@ -11,7 +11,6 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client"; const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); -/// /// Key that uniquely identifies the object, this metric describes. /// Currently, endpoint_id is enough, but this may change later, /// so keep it in a named struct. 
@@ -19,8 +18,7 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); /// Both the proxy and the ingestion endpoint will live in the same region (or cell) /// so while the project-id is unique across regions the whole pipeline will work correctly /// because we enrich the event with project_id in the control-plane endpoint. -/// -#[derive(Eq, Hash, PartialEq, Serialize, Debug)] +#[derive(Eq, Hash, PartialEq, Serialize, Debug, Clone)] pub struct Ids { pub endpoint_id: String, pub branch_id: String, @@ -149,7 +147,7 @@ async fn collect_metrics_iteration( stop_time: *curr_time, }, metric: PROXY_IO_BYTES_PER_CLIENT, - idempotency_key: idempotency_key(hostname.to_owned()), + idempotency_key: idempotency_key(hostname), value, extra: Ids { endpoint_id: curr_key.endpoint_id.clone(), @@ -167,12 +165,11 @@ async fn collect_metrics_iteration( // Send metrics. // Split into chunks of 1000 metrics to avoid exceeding the max request size for chunk in metrics_to_send.chunks(CHUNK_SIZE) { - let chunk_json = serde_json::value::to_raw_value(&EventChunk { events: chunk }) - .expect("ProxyConsumptionMetric should not fail serialization"); - let res = client .post(metric_collection_endpoint.clone()) - .json(&chunk_json) + .json(&EventChunk { + events: chunk.into(), + }) .send() .await; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 5f59957b2dac..e6fcf901d974 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -23,7 +23,7 @@ use tokio::{ time, }; use tokio_util::sync::CancellationToken; -use tracing::{error, info, warn}; +use tracing::{error, info, info_span, warn, Instrument}; use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. @@ -101,21 +101,20 @@ pub async fn task_main( tokio::select! { accept_result = listener.accept() => { let (socket, peer_addr) = accept_result?; - info!("accepted postgres client connection from {peer_addr}"); let session_id = uuid::Uuid::new_v4(); let cancel_map = Arc::clone(&cancel_map); connections.spawn( async move { - info!("spawned a task for {peer_addr}"); + info!("accepted postgres client connection"); socket .set_nodelay(true) .context("failed to set socket option")?; - handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp) - .await + handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await } + .instrument(info_span!("handle_client", ?session_id, %peer_addr)) .unwrap_or_else(move |e| { // Acknowledge that the task has finished with an error. error!(?session_id, "per-client task finished with an error: {e:#}"); @@ -183,7 +182,6 @@ impl ClientMode { } } -#[tracing::instrument(fields(session_id = ?session_id), skip_all)] pub async fn handle_client( config: &'static ProxyConfig, cancel_map: &CancelMap, @@ -425,11 +423,17 @@ where auth::BackendType::Test(x) => x.wake_compute(), }; - match handle_try_wake(wake_res, num_retries)? 
{ + match handle_try_wake(wake_res, num_retries) { + Err(e) => { + error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node"); + return Err(e.into()); + } // failed to wake up but we can continue to retry - ControlFlow::Continue(_) => {} + Ok(ControlFlow::Continue(e)) => { + warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node"); + } // successfully woke up a compute node and can break the wakeup loop - ControlFlow::Break(mut node_info) => { + Ok(ControlFlow::Break(mut node_info)) => { node_info.config.reuse_password(&config); mechanism.update_connect_config(&mut node_info.config); break node_info; @@ -440,7 +444,6 @@ where num_retries += 1; time::sleep(wait_duration).await; - info!(num_retries, "retrying wake compute"); }; // now that we have a new node, try connect to it repeatedly. @@ -451,10 +454,12 @@ where match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { Ok(res) => return Ok(res), Err(e) => { - error!(error = ?e, "could not connect to compute node"); - if !e.should_retry(num_retries) { + let retriable = e.should_retry(num_retries); + if !retriable { + error!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); return Err(e.into()); } + warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node"); } } @@ -462,7 +467,6 @@ where num_retries += 1; time::sleep(wait_duration).await; - info!(num_retries, "retrying connect_once"); } } diff --git a/scripts/combine_control_files.py b/scripts/combine_control_files.py index 0350e4721d6f..b1fe73881e66 100644 --- a/scripts/combine_control_files.py +++ b/scripts/combine_control_files.py @@ -15,16 +15,12 @@ ], "library_index": { "anon": "anon", - "kq_imcx": "kq_imcx" - // would be more complicated for something like postgis where multiple library names all map to postgis + // for more complex extensions like postgis + // we might have something like: + // address_standardizer: postgis + // postgis_tiger: postgis }, "extension_data": { - "kq_imcx": { - "control_data": { - "kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. 
Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n" - }, - "archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst" - }, "anon": { "control_data": { "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" diff --git a/test_runner/fixtures/broker.py b/test_runner/fixtures/broker.py new file mode 100644 index 000000000000..fa8b816e69f4 --- /dev/null +++ b/test_runner/fixtures/broker.py @@ -0,0 +1,60 @@ +import subprocess +import time +from dataclasses import dataclass +from pathlib import Path +from typing import Any, Optional + +from fixtures.log_helper import log + + +@dataclass +class NeonBroker: + """An object managing storage_broker instance""" + + logfile: Path + port: int + neon_binpath: Path + handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon + + def listen_addr(self): + return f"127.0.0.1:{self.port}" + + def client_url(self): + return f"http://{self.listen_addr()}" + + def check_status(self): + return True # TODO + + def try_start(self): + if self.handle is not None: + log.debug(f"storage_broker is already running on port {self.port}") + return + + listen_addr = self.listen_addr() + log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') + with open(self.logfile, "wb") as logfile: + args = [ + str(self.neon_binpath / "storage_broker"), + f"--listen-addr={listen_addr}", + ] + self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) + + # wait for start + started_at = time.time() + while True: + try: + self.check_status() + except Exception as e: + elapsed = time.time() - started_at + if elapsed > 5: + raise RuntimeError( + f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}" + ) from e + time.sleep(0.5) + else: + break # success + + def stop(self): + if self.handle is not None: + self.handle.terminate() + self.handle.wait() diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index fb78d69d6762..cdda8c414e44 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -2,13 +2,11 @@ import abc import asyncio -import enum import filecmp import json import os import re import shutil -import socket import subprocess import tempfile import textwrap @@ -17,12 +15,11 @@ from contextlib import closing, contextmanager from dataclasses import dataclass, field from datetime import datetime -from enum import Flag, auto from functools import cached_property from itertools import chain, product from pathlib import Path from types import TracebackType -from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, Union, cast +from typing import Any, Dict, Iterator, List, Optional, Tuple, Type, cast from urllib.parse import urlparse import asyncpg @@ -42,10 +39,21 @@ from psycopg2.extensions import make_dsn, parse_dsn from typing_extensions import Literal +from fixtures.broker import NeonBroker from fixtures.log_helper import log from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor +from fixtures.remote_storage import ( + LocalFsStorage, + MockS3Server, + RemoteStorage, + RemoteStorageKind, + RemoteStorageUsers, + 
S3Storage, + remote_storage_to_toml_inline_table, +) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( ATTACHMENT_NAME_REGEX, @@ -78,19 +86,6 @@ DEFAULT_BRANCH_NAME: str = "main" BASE_PORT: int = 15000 -WORKER_PORT_NUM: int = 1000 - - -def pytest_configure(config: Config): - """ - Check that we do not overflow available ports range. - """ - - numprocesses = config.getoption("numprocesses") - if ( - numprocesses is not None and BASE_PORT + numprocesses * WORKER_PORT_NUM > 32768 - ): # do not use ephemeral ports - raise Exception("Too many workers configured. Cannot distribute ports for services.") @pytest.fixture(scope="session") @@ -192,6 +187,11 @@ def myfixture(...) return scope +@pytest.fixture(scope="session") +def worker_port_num(): + return (32768 - BASE_PORT) // int(os.environ.get("PYTEST_XDIST_WORKER_COUNT", "1")) + + @pytest.fixture(scope="session") def worker_seq_no(worker_id: str) -> int: # worker_id is a pytest-xdist fixture @@ -204,10 +204,10 @@ def worker_seq_no(worker_id: str) -> int: @pytest.fixture(scope="session") -def worker_base_port(worker_seq_no: int) -> int: - # so we divide ports in ranges of 100 ports +def worker_base_port(worker_seq_no: int, worker_port_num: int) -> int: + # so we divide ports in ranges of ports # so workers have disjoint set of ports for services - return BASE_PORT + worker_seq_no * WORKER_PORT_NUM + return BASE_PORT + worker_seq_no * worker_port_num def get_dir_size(path: str) -> int: @@ -220,80 +220,9 @@ def get_dir_size(path: str) -> int: return totalbytes -def can_bind(host: str, port: int) -> bool: - """ - Check whether a host:port is available to bind for listening - - Inspired by the can_bind() perl function used in Postgres tests, in - vendor/postgres-v14/src/test/perl/PostgresNode.pm - """ - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: - # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the - # moment. If that changes, we should use start using SO_REUSEADDR here - # too, to allow reusing ports more quickly. - # See https://github.com/neondatabase/neon/issues/801 - # sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) - - try: - sock.bind((host, port)) - sock.listen() - return True - except socket.error: - log.info(f"Port {port} is in use, skipping") - return False - finally: - sock.close() - - -class PortDistributor: - def __init__(self, base_port: int, port_number: int): - self.iterator = iter(range(base_port, base_port + port_number)) - self.port_map: Dict[int, int] = {} - - def get_port(self) -> int: - for port in self.iterator: - if can_bind("localhost", port): - return port - raise RuntimeError( - "port range configured for test is exhausted, consider enlarging the range" - ) - - def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]: - """ - Returns a new port for a port number in a string (like "localhost:1234") or int. - Replacements are memorised, so a substitution for the same port is always the same. 
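To make the port-range arithmetic in the `worker_port_num` / `worker_base_port` fixtures above concrete (the numbers are an example, not from the patch): with `BASE_PORT = 15000` and `PYTEST_XDIST_WORKER_COUNT = 8`, each worker owns `(32768 - 15000) // 8 = 2221` ports, so worker 3 binds only within `[15000 + 3 * 2221, 15000 + 4 * 2221) = [21663, 23884)`. The ranges stay disjoint without the old fixed 1000-port blocks, and the upper bound still avoids the ephemeral port range.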
- """ - - # TODO: replace with structural pattern matching for Python >= 3.10 - if isinstance(value, int): - return self._replace_port_int(value) - - if isinstance(value, str): - return self._replace_port_str(value) - - raise TypeError(f"unsupported type {type(value)} of {value=}") - - def _replace_port_int(self, value: int) -> int: - known_port = self.port_map.get(value) - if known_port is None: - known_port = self.port_map[value] = self.get_port() - - return known_port - - def _replace_port_str(self, value: str) -> str: - # Use regex to find port in a string - # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432" - # See https://bugs.python.org/issue27657 - ports = re.findall(r":(\d+)(?:/|$)", value) - assert len(ports) == 1, f"can't find port in {value}" - port_int = int(ports[0]) - - return value.replace(f":{port_int}", f":{self._replace_port_int(port_int)}") - - @pytest.fixture(scope="session") -def port_distributor(worker_base_port: int) -> PortDistributor: - return PortDistributor(base_port=worker_base_port, port_number=WORKER_PORT_NUM) +def port_distributor(worker_base_port: int, worker_port_num: int) -> PortDistributor: + return PortDistributor(base_port=worker_base_port, port_number=worker_port_num) @pytest.fixture(scope="session") @@ -464,120 +393,6 @@ def generate_tenant_token(self, tenant_id: TenantId) -> str: return self.generate_token(scope="tenant", tenant_id=str(tenant_id)) -class MockS3Server: - """ - Starts a mock S3 server for testing on a port given, errors if the server fails to start or exits prematurely. - Relies that `poetry` and `moto` server are installed, since it's the way the tests are run. - - Also provides a set of methods to derive the connection properties from and the method to kill the underlying server. - """ - - def __init__( - self, - port: int, - ): - self.port = port - - # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. - # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux - # if a process is started from the shell process. - self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) - error = None - try: - return_code = self.subprocess.poll() - if return_code is not None: - error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" - except Exception as e: - error = f"expected mock s3 server to start but it failed with exception: {e}. 
stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" - if error is not None: - log.error(error) - self.kill() - raise RuntimeError("failed to start s3 mock server") - - def endpoint(self) -> str: - return f"http://127.0.0.1:{self.port}" - - def region(self) -> str: - return "us-east-1" - - def access_key(self) -> str: - return "test" - - def secret_key(self) -> str: - return "test" - - def kill(self): - self.subprocess.kill() - - -@enum.unique -class RemoteStorageKind(str, enum.Enum): - LOCAL_FS = "local_fs" - MOCK_S3 = "mock_s3" - REAL_S3 = "real_s3" - # Pass to tests that are generic to remote storage - # to ensure the test pass with or without the remote storage - NOOP = "noop" - - -def available_remote_storages() -> List[RemoteStorageKind]: - remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3] - if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None: - remote_storages.append(RemoteStorageKind.REAL_S3) - log.info("Enabling real s3 storage for tests") - else: - log.info("Using mock implementations to test remote storage") - return remote_storages - - -@dataclass -class LocalFsStorage: - root: Path - - -@dataclass -class S3Storage: - bucket_name: str - bucket_region: str - access_key: str - secret_key: str - endpoint: Optional[str] = None - prefix_in_bucket: Optional[str] = "" - - def access_env_vars(self) -> Dict[str, str]: - return { - "AWS_ACCESS_KEY_ID": self.access_key, - "AWS_SECRET_ACCESS_KEY": self.secret_key, - } - - -RemoteStorage = Union[LocalFsStorage, S3Storage] - - -# serialize as toml inline table -def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: - if isinstance(remote_storage, LocalFsStorage): - remote_storage_config = f"local_path='{remote_storage.root}'" - elif isinstance(remote_storage, S3Storage): - remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\ - bucket_region='{remote_storage.bucket_region}'" - - if remote_storage.prefix_in_bucket is not None: - remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'" - - if remote_storage.endpoint is not None: - remote_storage_config += f",endpoint='{remote_storage.endpoint}'" - else: - raise Exception("invalid remote storage type") - - return f"{{{remote_storage_config}}}" - - -class RemoteStorageUsers(Flag): - PAGESERVER = auto() - SAFEKEEPER = auto() - - class NeonEnvBuilder: """ Builder object to create a Neon runtime environment @@ -616,10 +431,12 @@ def __init__( self.rust_log_override = rust_log_override self.port_distributor = port_distributor self.remote_storage = remote_storage + self.ext_remote_storage: Optional[S3Storage] = None + self.remote_storage_client: Optional[Any] = None self.remote_storage_users = remote_storage_users self.broker = broker self.run_id = run_id - self.mock_s3_server = mock_s3_server + self.mock_s3_server: MockS3Server = mock_s3_server self.pageserver_config_override = pageserver_config_override self.num_safekeepers = num_safekeepers self.safekeepers_id_start = safekeepers_id_start @@ -651,7 +468,7 @@ def init_start(self, initial_tenant_conf: Optional[Dict[str, str]] = None) -> Ne # Prepare the default branch to start the postgres on later. # Pageserver itself does not create tenants and timelines, until started first and asked via HTTP API. 
- log.info( + log.debug( f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" ) initial_tenant, initial_timeline = env.neon_cli.create_tenant( @@ -667,15 +484,24 @@ def enable_remote_storage( remote_storage_kind: RemoteStorageKind, test_name: str, force_enable: bool = True, + enable_remote_extensions: bool = False, ): if remote_storage_kind == RemoteStorageKind.NOOP: return elif remote_storage_kind == RemoteStorageKind.LOCAL_FS: self.enable_local_fs_remote_storage(force_enable=force_enable) elif remote_storage_kind == RemoteStorageKind.MOCK_S3: - self.enable_mock_s3_remote_storage(bucket_name=test_name, force_enable=force_enable) + self.enable_mock_s3_remote_storage( + bucket_name=test_name, + force_enable=force_enable, + enable_remote_extensions=enable_remote_extensions, + ) elif remote_storage_kind == RemoteStorageKind.REAL_S3: - self.enable_real_s3_remote_storage(test_name=test_name, force_enable=force_enable) + self.enable_real_s3_remote_storage( + test_name=test_name, + force_enable=force_enable, + enable_remote_extensions=enable_remote_extensions, + ) else: raise RuntimeError(f"Unknown storage type: {remote_storage_kind}") @@ -689,11 +515,18 @@ def enable_local_fs_remote_storage(self, force_enable: bool = True): assert force_enable or self.remote_storage is None, "remote storage is enabled already" self.remote_storage = LocalFsStorage(Path(self.repo_dir / "local_fs_remote_storage")) - def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = True): + def enable_mock_s3_remote_storage( + self, + bucket_name: str, + force_enable: bool = True, + enable_remote_extensions: bool = False, + ): """ Sets up the pageserver to use the S3 mock server, creates the bucket, if it's not present already. Starts up the mock server, if that does not run yet. Errors, if the pageserver has some remote storage configuration already, unless `force_enable` is not set to `True`. 
+ + Also creates the bucket for extensions, self.ext_remote_storage bucket """ assert force_enable or self.remote_storage is None, "remote storage is enabled already" mock_endpoint = self.mock_s3_server.endpoint() @@ -714,9 +547,25 @@ def enable_mock_s3_remote_storage(self, bucket_name: str, force_enable: bool = T bucket_region=mock_region, access_key=self.mock_s3_server.access_key(), secret_key=self.mock_s3_server.secret_key(), + prefix_in_bucket="pageserver", ) - def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = True): + if enable_remote_extensions: + self.ext_remote_storage = S3Storage( + bucket_name=bucket_name, + endpoint=mock_endpoint, + bucket_region=mock_region, + access_key=self.mock_s3_server.access_key(), + secret_key=self.mock_s3_server.secret_key(), + prefix_in_bucket="ext", + ) + + def enable_real_s3_remote_storage( + self, + test_name: str, + force_enable: bool = True, + enable_remote_extensions: bool = False, + ): """ Sets up configuration to use real s3 endpoint without mock server """ @@ -756,6 +605,15 @@ def enable_real_s3_remote_storage(self, test_name: str, force_enable: bool = Tru prefix_in_bucket=self.remote_storage_prefix, ) + if enable_remote_extensions: + self.ext_remote_storage = S3Storage( + bucket_name="neon-dev-extensions-eu-central-1", + bucket_region="eu-central-1", + access_key=access_key, + secret_key=secret_key, + prefix_in_bucket=None, + ) + def cleanup_local_storage(self): if self.preserve_database_files: return @@ -789,6 +647,7 @@ def cleanup_remote_storage(self): # `self.remote_storage_prefix` is coupled with `S3Storage` storage type, # so this line effectively a no-op assert isinstance(self.remote_storage, S3Storage) + assert self.remote_storage_client is not None if self.keep_remote_storage_contents: log.info("keep_remote_storage_contents skipping remote storage cleanup") @@ -918,6 +777,8 @@ def __init__(self, config: NeonEnvBuilder): self.neon_binpath = config.neon_binpath self.pg_distrib_dir = config.pg_distrib_dir self.endpoint_counter = 0 + self.remote_storage_client = config.remote_storage_client + self.ext_remote_storage = config.ext_remote_storage # generate initial tenant ID here instead of letting 'neon init' generate it, # so that we don't need to dig it out of the config file afterwards. 
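
The builder wiring above is easiest to see end to end from a test's point of view. Below is a minimal sketch (not part of this patch) of how a test opts in to the new extensions bucket, using only names introduced by this change; `"my_test"` is a placeholder test/bucket name:

```python
from fixtures.neon_fixtures import NeonEnvBuilder
from fixtures.remote_storage import RemoteStorageKind


def test_uses_remote_extensions(neon_env_builder: NeonEnvBuilder):
    # Opt in to the extensions bucket alongside the regular pageserver bucket.
    neon_env_builder.enable_remote_storage(
        remote_storage_kind=RemoteStorageKind.MOCK_S3,
        test_name="my_test",
        enable_remote_extensions=True,
    )
    env = neon_env_builder.init_start()

    # With MOCK_S3 both storages point at the same moto server, distinguished
    # only by prefix: "pageserver" for layer files, "ext" for extensions.
    assert env.ext_remote_storage is not None
    assert env.ext_remote_storage.prefix_in_bucket == "ext"
```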
@@ -1505,6 +1366,7 @@ def endpoint_start( tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, branch_name: Optional[str] = None, + remote_ext_config: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1514,6 +1376,8 @@ def endpoint_start( "--pg-version", self.env.pg_version, ] + if remote_ext_config is not None: + args.extend(["--remote-ext-config", remote_ext_config]) if lsn is not None: args.append(f"--lsn={lsn}") args.extend(["--pg-port", str(pg_port)]) @@ -1526,7 +1390,11 @@ def endpoint_start( if endpoint_id is not None: args.append(endpoint_id) - res = self.raw_cli(args) + s3_env_vars = None + if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): + s3_env_vars = self.env.remote_storage.access_env_vars() + + res = self.raw_cli(args, extra_env_vars=s3_env_vars) res.check_returncode() return res @@ -1628,9 +1496,6 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*Error processing HTTP request: Forbidden", # intentional failpoints ".*failpoint ", - # FIXME: there is a race condition between GC and detach, see - # https://github.com/neondatabase/neon/issues/2442 - ".*could not remove ephemeral file.*No such file or directory.*", # FIXME: These need investigation ".*manual_gc.*is_shutdown_requested\\(\\) called in an unexpected task or thread.*", ".*tenant_list: timeline is not found in remote index while it is present in the tenants registry.*", @@ -1653,6 +1518,7 @@ def __init__(self, env: NeonEnv, port: PageserverPort, config_override: Optional ".*Compaction failed, retrying in .*: queue is in state Stopped.*", # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally ".*Error processing HTTP request: NotFound: Timeline .* was not found", + ".*took more than expected to complete.*", ] def start( @@ -2375,7 +2241,7 @@ def create( return self - def start(self) -> "Endpoint": + def start(self, remote_ext_config: Optional[str] = None) -> "Endpoint": """ Start the Postgres instance. Returns self. @@ -2391,6 +2257,7 @@ def start(self) -> "Endpoint": http_port=self.http_port, tenant_id=self.tenant_id, safekeepers=self.active_safekeepers, + remote_ext_config=remote_ext_config, ) self.running = True @@ -2480,6 +2347,7 @@ def create_start( hot_standby: bool = False, lsn: Optional[Lsn] = None, config_lines: Optional[List[str]] = None, + remote_ext_config: Optional[str] = None, ) -> "Endpoint": """ Create an endpoint, apply config, and start Postgres. 
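
For the endpoint side, the value handed to `--remote-ext-config` is the JSON produced by `S3Storage.to_string()` (added in `fixtures/remote_storage.py` later in this patch), matching the `-r` argument `compute_ctl` now accepts. A sketch of what the CLI invocation ends up carrying; the bucket and region are the real-S3 defaults configured above:

```python
import json

# What S3Storage.to_string() serialises for the real-S3 extensions bucket
# (endpoint and prefix stay None in that configuration).
remote_ext_config = json.dumps(
    {
        "bucket": "neon-dev-extensions-eu-central-1",
        "region": "eu-central-1",
        "endpoint": None,
        "prefix": None,
    }
)

# endpoint_start() then forwards it to `neon endpoint start` roughly as
#   args.extend(["--remote-ext-config", remote_ext_config])
# and exports AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY via
# S3Storage.access_env_vars() so compute_ctl can reach the bucket.
```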
@@ -2494,7 +2362,7 @@ def create_start( config_lines=config_lines, hot_standby=hot_standby, lsn=lsn, - ).start() + ).start(remote_ext_config=remote_ext_config) log.info(f"Postgres startup took {time.time() - started_at} seconds") @@ -2528,6 +2396,7 @@ def create_start( lsn: Optional[Lsn] = None, hot_standby: bool = False, config_lines: Optional[List[str]] = None, + remote_ext_config: Optional[str] = None, ) -> Endpoint: ep = Endpoint( self.env, @@ -2544,6 +2413,7 @@ def create_start( hot_standby=hot_standby, config_lines=config_lines, lsn=lsn, + remote_ext_config=remote_ext_config, ) def create( @@ -2822,59 +2692,6 @@ def get_metrics(self) -> SafekeeperMetrics: return metrics -@dataclass -class NeonBroker: - """An object managing storage_broker instance""" - - logfile: Path - port: int - neon_binpath: Path - handle: Optional[subprocess.Popen[Any]] = None # handle of running daemon - - def listen_addr(self): - return f"127.0.0.1:{self.port}" - - def client_url(self): - return f"http://{self.listen_addr()}" - - def check_status(self): - return True # TODO - - def try_start(self): - if self.handle is not None: - log.debug(f"storage_broker is already running on port {self.port}") - return - - listen_addr = self.listen_addr() - log.info(f'starting storage_broker to listen incoming connections at "{listen_addr}"') - with open(self.logfile, "wb") as logfile: - args = [ - str(self.neon_binpath / "storage_broker"), - f"--listen-addr={listen_addr}", - ] - self.handle = subprocess.Popen(args, stdout=logfile, stderr=logfile) - - # wait for start - started_at = time.time() - while True: - try: - self.check_status() - except Exception as e: - elapsed = time.time() - started_at - if elapsed > 5: - raise RuntimeError( - f"timed out waiting {elapsed:.0f}s for storage_broker start: {e}" - ) from e - time.sleep(0.5) - else: - break # success - - def stop(self): - if self.handle is not None: - self.handle.terminate() - self.handle.wait() - - def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path: """Compute the working directory for an individual test.""" test_name = request.node.name diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index f8a4423ffa3b..119c99bb9684 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -197,10 +197,9 @@ def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, - wait_longer: bool = False, + iterations: int, ): last_exc = None - iterations = 10 if wait_longer else 2 for _ in range(iterations): time.sleep(0.250) try: @@ -220,8 +219,8 @@ def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, - wait_longer: bool = False, # Use when running with RemoteStorageKind.REAL_S3 + iterations: int = 20, **delete_args, ): pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) - wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer) + wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations) diff --git a/test_runner/fixtures/port_distributor.py b/test_runner/fixtures/port_distributor.py new file mode 100644 index 000000000000..fd808d7a5f7d --- /dev/null +++ b/test_runner/fixtures/port_distributor.py @@ -0,0 +1,77 @@ +import re +import socket +from contextlib import closing +from typing import Dict, Union + +from fixtures.log_helper import log + + +def can_bind(host: str, 
port: int) -> bool: + """ + Check whether a host:port is available to bind for listening + + Inspired by the can_bind() perl function used in Postgres tests, in + vendor/postgres-v14/src/test/perl/PostgresNode.pm + """ + with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock: + # TODO: The pageserver and safekeepers don't use SO_REUSEADDR at the + # moment. If that changes, we should use start using SO_REUSEADDR here + # too, to allow reusing ports more quickly. + # See https://github.com/neondatabase/neon/issues/801 + # sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) + + try: + sock.bind((host, port)) + sock.listen() + return True + except socket.error: + log.info(f"Port {port} is in use, skipping") + return False + finally: + sock.close() + + +class PortDistributor: + def __init__(self, base_port: int, port_number: int): + self.iterator = iter(range(base_port, base_port + port_number)) + self.port_map: Dict[int, int] = {} + + def get_port(self) -> int: + for port in self.iterator: + if can_bind("localhost", port): + return port + raise RuntimeError( + "port range configured for test is exhausted, consider enlarging the range" + ) + + def replace_with_new_port(self, value: Union[int, str]) -> Union[int, str]: + """ + Returns a new port for a port number in a string (like "localhost:1234") or int. + Replacements are memorised, so a substitution for the same port is always the same. + """ + + # TODO: replace with structural pattern matching for Python >= 3.10 + if isinstance(value, int): + return self._replace_port_int(value) + + if isinstance(value, str): + return self._replace_port_str(value) + + raise TypeError(f"unsupported type {type(value)} of {value=}") + + def _replace_port_int(self, value: int) -> int: + known_port = self.port_map.get(value) + if known_port is None: + known_port = self.port_map[value] = self.get_port() + + return known_port + + def _replace_port_str(self, value: str) -> str: + # Use regex to find port in a string + # urllib.parse.urlparse produces inconvenient results for cases without scheme like "localhost:5432" + # See https://bugs.python.org/issue27657 + ports = re.findall(r":(\d+)(?:/|$)", value) + assert len(ports) == 1, f"can't find port in {value}" + port_int = int(ports[0]) + + return value.replace(f":{port_int}", f":{self._replace_port_int(port_int)}") diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py new file mode 100644 index 000000000000..1b8047337738 --- /dev/null +++ b/test_runner/fixtures/remote_storage.py @@ -0,0 +1,143 @@ +import enum +import json +import os +import subprocess +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List, Optional, Union + +from fixtures.log_helper import log + + +class MockS3Server: + """ + Starts a mock S3 server for testing on a port given, errors if the server fails to start or exits prematurely. + Relies that `poetry` and `moto` server are installed, since it's the way the tests are run. + + Also provides a set of methods to derive the connection properties from and the method to kill the underlying server. + """ + + def __init__( + self, + port: int, + ): + self.port = port + + # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. + # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux + # if a process is started from the shell process. 
+ self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", "s3", f"-p{port}"]) + error = None + try: + return_code = self.subprocess.poll() + if return_code is not None: + error = f"expected mock s3 server to run but it exited with code {return_code}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" + except Exception as e: + error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" + if error is not None: + log.error(error) + self.kill() + raise RuntimeError("failed to start s3 mock server") + + def endpoint(self) -> str: + return f"http://127.0.0.1:{self.port}" + + def region(self) -> str: + return "us-east-1" + + def access_key(self) -> str: + return "test" + + def secret_key(self) -> str: + return "test" + + def kill(self): + self.subprocess.kill() + + +@enum.unique +class RemoteStorageKind(str, enum.Enum): + LOCAL_FS = "local_fs" + MOCK_S3 = "mock_s3" + REAL_S3 = "real_s3" + # Pass to tests that are generic to remote storage + # to ensure the test pass with or without the remote storage + NOOP = "noop" + + +def available_remote_storages() -> List[RemoteStorageKind]: + remote_storages = [RemoteStorageKind.LOCAL_FS, RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None: + remote_storages.append(RemoteStorageKind.REAL_S3) + log.info("Enabling real s3 storage for tests") + else: + log.info("Using mock implementations to test remote storage") + return remote_storages + + +def available_s3_storages() -> List[RemoteStorageKind]: + remote_storages = [RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE") is not None: + remote_storages.append(RemoteStorageKind.REAL_S3) + log.info("Enabling real s3 storage for tests") + else: + log.info("Using mock implementations to test remote storage") + return remote_storages + + +@dataclass +class LocalFsStorage: + root: Path + + +@dataclass +class S3Storage: + bucket_name: str + bucket_region: str + access_key: str + secret_key: str + endpoint: Optional[str] = None + prefix_in_bucket: Optional[str] = "" + + def access_env_vars(self) -> Dict[str, str]: + return { + "AWS_ACCESS_KEY_ID": self.access_key, + "AWS_SECRET_ACCESS_KEY": self.secret_key, + } + + def to_string(self) -> str: + return json.dumps( + { + "bucket": self.bucket_name, + "region": self.bucket_region, + "endpoint": self.endpoint, + "prefix": self.prefix_in_bucket, + } + ) + + +RemoteStorage = Union[LocalFsStorage, S3Storage] + + +# serialize as toml inline table +def remote_storage_to_toml_inline_table(remote_storage: RemoteStorage) -> str: + if isinstance(remote_storage, LocalFsStorage): + remote_storage_config = f"local_path='{remote_storage.root}'" + elif isinstance(remote_storage, S3Storage): + remote_storage_config = f"bucket_name='{remote_storage.bucket_name}',\ + bucket_region='{remote_storage.bucket_region}'" + + if remote_storage.prefix_in_bucket is not None: + remote_storage_config += f",prefix_in_bucket='{remote_storage.prefix_in_bucket}'" + + if remote_storage.endpoint is not None: + remote_storage_config += f",endpoint='{remote_storage.endpoint}'" + else: + raise Exception("invalid remote storage type") + + return f"{{{remote_storage_config}}}" + + +class RemoteStorageUsers(enum.Flag): + PAGESERVER = enum.auto() + SAFEKEEPER = enum.auto() diff --git a/test_runner/fixtures/types.py b/test_runner/fixtures/types.py index 7d179cc7fb38..ef88e09de4ba 100644 --- a/test_runner/fixtures/types.py +++ 
b/test_runner/fixtures/types.py @@ -89,6 +89,9 @@ class TenantId(Id): def __repr__(self) -> str: return f'`TenantId("{self.id.hex()}")' + def __str__(self) -> str: + return self.id.hex() + class TimelineId(Id): def __repr__(self) -> str: diff --git a/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json new file mode 100644 index 000000000000..af49dfa0c006 --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v14/ext_index.json @@ -0,0 +1,24 @@ +{ + "public_extensions": [ + "anon", + "pg_buffercache" + ], + "library_index": { + "anon": "anon", + "pg_buffercache": "pg_buffercache" + }, + "extension_data": { + "pg_buffercache": { + "control_data": { + "pg_buffercache.control": "# pg_buffercache extension \ncomment = 'examine the shared buffer cache' \ndefault_version = '1.3' \nmodule_pathname = '$libdir/pg_buffercache' \nrelocatable = true \ntrusted=true" + }, + "archive_path": "5670669815/v14/extensions/pg_buffercache.tar.zst" + }, + "anon": { + "control_data": { + "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" + }, + "archive_path": "5670669815/v14/extensions/anon.tar.zst" + } + } +} \ No newline at end of file diff --git a/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst b/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst new file mode 100644 index 000000000000..5c17630109e6 Binary files /dev/null and b/test_runner/regress/data/extension_test/5670669815/v14/extensions/anon.tar.zst differ diff --git a/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst b/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst new file mode 100644 index 000000000000..69648a2f1a03 Binary files /dev/null and b/test_runner/regress/data/extension_test/5670669815/v14/extensions/pg_buffercache.tar.zst differ diff --git a/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json b/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json new file mode 100644 index 000000000000..fd0d1edc3cb8 --- /dev/null +++ b/test_runner/regress/data/extension_test/5670669815/v15/ext_index.json @@ -0,0 +1,17 @@ +{ + "public_extensions": [ + "anon" + ], + "library_index": { + "anon": "anon" + }, + "extension_data": { + "anon": { + "control_data": { + "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" + }, + "archive_path": "5670669815/v15/extensions/anon.tar.zst" + } + } +} + diff --git a/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst b/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst new file mode 100644 index 000000000000..ea7034578f06 Binary files /dev/null and b/test_runner/regress/data/extension_test/5670669815/v15/extensions/anon.tar.zst differ diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index b6b46f940173..bc6afa84a1ab 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ 
b/test_runner/regress/test_attach_tenant_config.py @@ -3,12 +3,11 @@ import pytest from fixtures.neon_fixtures import ( - LocalFsStorage, NeonEnv, NeonEnvBuilder, - RemoteStorageKind, ) from fixtures.pageserver.http import PageserverApiException, TenantConfig +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import TenantId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 754d23a01a27..fa386ee75a04 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -13,7 +13,6 @@ NeonCli, NeonEnvBuilder, PgBin, - PortDistributor, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -22,6 +21,7 @@ wait_for_upload, ) from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor from fixtures.types import Lsn from pytest import FixtureRequest @@ -337,6 +337,7 @@ def check_neon_works( config.pg_version = pg_version config.initial_tenant = snapshot_config["default_tenant_id"] config.pg_distrib_dir = pg_distrib_dir + config.remote_storage = None # Use the "target" binaries to launch the storage nodes config_target = config diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index fc2d29c2c102..1e9b130b1c22 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -7,15 +7,14 @@ import toml from fixtures.log_helper import log from fixtures.neon_fixtures import ( - LocalFsStorage, NeonEnv, NeonEnvBuilder, PgBin, - RemoteStorageKind, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import wait_for_upload_queue_empty +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py new file mode 100644 index 000000000000..c4bee1a1ea9d --- /dev/null +++ b/test_runner/regress/test_download_extensions.py @@ -0,0 +1,323 @@ +import os +import shutil +import threading +from contextlib import closing +from pathlib import Path + +import pytest +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, +) +from fixtures.pg_version import PgVersion +from fixtures.remote_storage import RemoteStorageKind, available_s3_storages + + +# Cleaning up downloaded files is important for local tests +# or else one test could reuse the files from another test or another test run +def cleanup(pg_version): + PGDIR = Path(f"pg_install/v{pg_version}") + + LIB_DIR = PGDIR / Path("lib/postgresql") + cleanup_lib_globs = ["anon*", "postgis*", "pg_buffercache*"] + cleanup_lib_glob_paths = [LIB_DIR.glob(x) for x in cleanup_lib_globs] + + SHARE_DIR = PGDIR / Path("share/postgresql/extension") + cleanup_ext_globs = [ + "anon*", + "address_standardizer*", + "postgis*", + "pageinspect*", + "pg_buffercache*", + "pgrouting*", + ] + cleanup_ext_glob_paths = [SHARE_DIR.glob(x) for x in cleanup_ext_globs] + + all_glob_paths = cleanup_lib_glob_paths + cleanup_ext_glob_paths + all_cleanup_files = [] + for file_glob in all_glob_paths: + for file in file_glob: + all_cleanup_files.append(file) + + for file in all_cleanup_files: + try: + os.remove(file) + log.info(f"removed 
file {file}") + except Exception as err: + log.info( + f"skipping remove of file {file} because it doesn't exist.\ + this may be expected or unexpected depending on the test {err}" + ) + + cleanup_folders = [SHARE_DIR / Path("anon"), PGDIR / Path("download_extensions")] + for folder in cleanup_folders: + try: + shutil.rmtree(folder) + log.info(f"removed folder {folder}") + except Exception as err: + log.info( + f"skipping remove of folder {folder} because it doesn't exist.\ + this may be expected or unexpected depending on the test {err}" + ) + + +def upload_files(env): + log.info("Uploading test files to mock bucket") + os.chdir("test_runner/regress/data/extension_test") + for path in os.walk("."): + prefix, _, files = path + for file in files: + # the [2:] is to remove the leading "./" + full_path = os.path.join(prefix, file)[2:] + + with open(full_path, "rb") as f: + log.info(f"UPLOAD {full_path} to ext/{full_path}") + env.remote_storage_client.upload_fileobj( + f, + env.ext_remote_storage.bucket_name, + f"ext/{full_path}", + ) + os.chdir("../../../..") + + +# Test downloading remote extension. +@pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) +def test_remote_extensions( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + pg_version: PgVersion, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_remote_extensions", + enable_remote_extensions=True, + ) + env = neon_env_builder.init_start() + tenant_id, _ = env.neon_cli.create_tenant() + env.neon_cli.create_timeline("test_remote_extensions", tenant_id=tenant_id) + + assert env.ext_remote_storage is not None # satisfy mypy + assert env.remote_storage_client is not None # satisfy mypy + + # For MOCK_S3 we upload test files. + # For REAL_S3 we use the files already in the bucket + if remote_storage_kind == RemoteStorageKind.MOCK_S3: + upload_files(env) + + # Start a compute node and check that it can download the extensions + # and use them to CREATE EXTENSION and LOAD + endpoint = env.endpoints.create_start( + "test_remote_extensions", + tenant_id=tenant_id, + remote_ext_config=env.ext_remote_storage.to_string(), + # config_lines=["log_min_messages=debug3"], + ) + try: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + # Check that appropriate control files were downloaded + cur.execute("SELECT * FROM pg_available_extensions") + all_extensions = [x[0] for x in cur.fetchall()] + log.info(all_extensions) + assert "anon" in all_extensions + + # postgis is on real s3 but not mock s3. 
+ # it's kind of a big file, would rather not upload to github + if remote_storage_kind == RemoteStorageKind.REAL_S3: + assert "postgis" in all_extensions + # this may fail locally if dependency is missing + # we don't really care about the error, + # we just want to make sure it downloaded + try: + cur.execute("CREATE EXTENSION postgis") + except Exception as err: + log.info(f"(expected) error creating postgis extension: {err}") + # we do not check the error, so this is basically a NO-OP + # however checking the log you can make sure that it worked + # and also get valuable information about how long loading the extension took + + # this is expected to fail on my computer because I don't have the pgcrypto extension + try: + cur.execute("CREATE EXTENSION anon") + except Exception as err: + log.info("error creating anon extension") + assert "pgcrypto" in str(err), "unexpected error creating anon extension" + finally: + cleanup(pg_version) + + +# Test downloading remote library. +@pytest.mark.parametrize("remote_storage_kind", available_s3_storages()) +def test_remote_library( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + pg_version: PgVersion, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=remote_storage_kind, + test_name="test_remote_library", + enable_remote_extensions=True, + ) + env = neon_env_builder.init_start() + tenant_id, _ = env.neon_cli.create_tenant() + env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id) + + assert env.ext_remote_storage is not None # satisfy mypy + assert env.remote_storage_client is not None # satisfy mypy + + # For MOCK_S3 we upload test files. + # For REAL_S3 we use the files already in the bucket + if remote_storage_kind == RemoteStorageKind.MOCK_S3: + upload_files(env) + + # and use them to run LOAD library + endpoint = env.endpoints.create_start( + "test_remote_library", + tenant_id=tenant_id, + remote_ext_config=env.ext_remote_storage.to_string(), + # config_lines=["log_min_messages=debug3"], + ) + try: + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + # try to load library + try: + cur.execute("LOAD 'anon'") + except Exception as err: + log.info(f"error loading anon library: {err}") + raise AssertionError("unexpected error loading anon library") from err + + # test library which name is different from extension name + # this may fail locally if dependency is missing + # however, it does successfully download the postgis archive + if remote_storage_kind == RemoteStorageKind.REAL_S3: + try: + cur.execute("LOAD 'postgis_topology-3'") + except Exception as err: + log.info("error loading postgis_topology-3") + assert "No such file or directory" in str( + err + ), "unexpected error loading postgis_topology-3" + finally: + cleanup(pg_version) + + +# Here we test a complex extension +# which has multiple extensions in one archive +# using postgis as an example +@pytest.mark.skipif( + RemoteStorageKind.REAL_S3 not in available_s3_storages(), + reason="skipping test because real s3 not enabled", +) +def test_multiple_extensions_one_archive( + neon_env_builder: NeonEnvBuilder, + pg_version: PgVersion, +): + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.REAL_S3, + test_name="test_multiple_extensions_one_archive", + enable_remote_extensions=True, + ) + env = neon_env_builder.init_start() + tenant_id, _ = env.neon_cli.create_tenant() + env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id) + + assert 
env.ext_remote_storage is not None # satisfy mypy + assert env.remote_storage_client is not None # satisfy mypy + + endpoint = env.endpoints.create_start( + "test_multiple_extensions_one_archive", + tenant_id=tenant_id, + remote_ext_config=env.ext_remote_storage.to_string(), + ) + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE EXTENSION address_standardizer;") + cur.execute("CREATE EXTENSION address_standardizer_data_us;") + # execute query to ensure that it works + cur.execute( + "SELECT house_num, name, suftype, city, country, state, unit \ + FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \ + 'One Rust Place, Boston, MA 02109');" + ) + res = cur.fetchall() + log.info(res) + assert len(res) > 0 + + cleanup(pg_version) + + +# Test that extension is downloaded after endpoint restart, +# when the library is used in the query. +# +# Run the test with mutliple simultaneous connections to an endpoint. +# to ensure that the extension is downloaded only once. +# +def test_extension_download_after_restart( + neon_env_builder: NeonEnvBuilder, + pg_version: PgVersion, +): + if "15" in pg_version: # SKIP v15 for now because test set only has extension built for v14 + return None + + neon_env_builder.enable_remote_storage( + remote_storage_kind=RemoteStorageKind.MOCK_S3, + test_name="test_extension_download_after_restart", + enable_remote_extensions=True, + ) + env = neon_env_builder.init_start() + tenant_id, _ = env.neon_cli.create_tenant() + env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id) + + assert env.ext_remote_storage is not None # satisfy mypy + assert env.remote_storage_client is not None # satisfy mypy + + # For MOCK_S3 we upload test files. + upload_files(env) + + endpoint = env.endpoints.create_start( + "test_extension_download_after_restart", + tenant_id=tenant_id, + remote_ext_config=env.ext_remote_storage.to_string(), + config_lines=["log_min_messages=debug3"], + ) + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("CREATE extension pg_buffercache;") + cur.execute("SELECT * from pg_buffercache;") + res = cur.fetchall() + assert len(res) > 0 + log.info(res) + + # shutdown compute node + endpoint.stop() + # remove extension files locally + cleanup(pg_version) + + # spin up compute node again (there are no extension files available, because compute is stateless) + endpoint = env.endpoints.create_start( + "test_extension_download_after_restart", + tenant_id=tenant_id, + remote_ext_config=env.ext_remote_storage.to_string(), + config_lines=["log_min_messages=debug3"], + ) + + # connect to compute node and run the query + # that will trigger the download of the extension + def run_query(endpoint, thread_id: int): + log.info("thread_id {%d} starting", thread_id) + with closing(endpoint.connect()) as conn: + with conn.cursor() as cur: + cur.execute("SELECT * from pg_buffercache;") + res = cur.fetchall() + assert len(res) > 0 + log.info("thread_id {%d}, res = %s", thread_id, res) + + threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)] + + for thread in threads: + thread.start() + for thread in threads: + thread.join() + + cleanup(pg_version) diff --git a/test_runner/regress/test_duplicate_layers.py b/test_runner/regress/test_duplicate_layers.py index c1832a2063ad..7f76a8e04287 100644 --- a/test_runner/regress/test_duplicate_layers.py +++ b/test_runner/regress/test_duplicate_layers.py @@ -33,4 +33,4 @@ def 
test_duplicate_layers(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): time.sleep(10) # let compaction to be performed assert env.pageserver.log_contains("compact-level0-phase1-return-same") - pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T500", "-Mprepared", connstr]) + pg_bin.run_capture(["pgbench", "-P1", "-N", "-c5", "-T200", "-Mprepared", connstr]) diff --git a/test_runner/regress/test_fullbackup.py b/test_runner/regress/test_fullbackup.py index ece9dccf931e..214f1f33a836 100644 --- a/test_runner/regress/test_fullbackup.py +++ b/test_runner/regress/test_fullbackup.py @@ -5,9 +5,9 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - PortDistributor, VanillaPostgres, ) +from fixtures.port_distributor import PortDistributor from fixtures.types import Lsn, TimelineId from fixtures.utils import query_scalar, subprocess_capture diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index e26b1a980ef1..53fa70903fe1 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -8,9 +8,9 @@ Endpoint, NeonEnv, NeonEnvBuilder, - RemoteStorageKind, wait_for_last_flush_lsn, ) +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index d4f0c94a29c2..1269210d0d89 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -4,10 +4,10 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, - RemoteStorageKind, wait_for_last_flush_lsn, ) from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar diff --git a/test_runner/regress/test_metric_collection.py b/test_runner/regress/test_metric_collection.py index 61c8f800f867..80ffe5126d52 100644 --- a/test_runner/regress/test_metric_collection.py +++ b/test_runner/regress/test_metric_collection.py @@ -13,11 +13,11 @@ PSQL, NeonEnvBuilder, NeonProxy, - PortDistributor, - RemoteStorageKind, VanillaPostgres, wait_for_last_flush_lsn, ) +from fixtures.port_distributor import PortDistributor +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import query_scalar from pytest_httpserver import HTTPServer diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 6dd47de8cf01..becdd9ff80ca 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -1,4 +1,5 @@ -from fixtures.neon_fixtures import NeonEnvBuilder, PortDistributor +from fixtures.neon_fixtures import NeonEnvBuilder +from fixtures.port_distributor import PortDistributor # Test that neon cli is able to start and stop all processes with the user defaults. 
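
Most of the surrounding churn is mechanical: `PortDistributor`, `NeonBroker`, and the remote-storage helpers moved out of `neon_fixtures` into dedicated modules, and imports were updated accordingly. For reference, a short sketch (not part of this patch) of how the relocated `PortDistributor` behaves, based on the implementation added in `fixtures/port_distributor.py` above:

```python
from fixtures.port_distributor import PortDistributor

# Hand out ports from a fixed range, skipping any that are already bound.
dist = PortDistributor(base_port=15000, port_number=100)
pg_port = dist.get_port()

# Port substitutions are memoised: rewriting "localhost:5432" and asking for a
# replacement of 5432 directly yield the same target port.
new_addr = dist.replace_with_new_port("localhost:5432")
new_port = dist.replace_with_new_port(5432)
assert new_addr == f"localhost:{new_port}"
```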
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index ca43df350ded..17a63535cfc7 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -10,8 +10,6 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnvBuilder, - RemoteStorageKind, - available_remote_storages, last_flush_lsn_upload, wait_for_last_flush_lsn, ) @@ -23,6 +21,7 @@ wait_for_upload_queue_empty, wait_until_tenant_state, ) +from fixtures.remote_storage import RemoteStorageKind, available_remote_storages from fixtures.types import Lsn from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 7c04ed901781..68116985d3f0 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -12,10 +12,7 @@ import pytest from fixtures.log_helper import log from fixtures.neon_fixtures import ( - LocalFsStorage, NeonEnvBuilder, - RemoteStorageKind, - available_remote_storages, wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient @@ -26,6 +23,11 @@ wait_until_tenant_active, wait_until_tenant_state, ) +from fixtures.remote_storage import ( + LocalFsStorage, + RemoteStorageKind, + available_remote_storages, +) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import print_gc_result, query_scalar, wait_until from requests import ReadTimeout diff --git a/test_runner/regress/test_sni_router.py b/test_runner/regress/test_sni_router.py index 9b78e8287ecf..4336e6551d05 100644 --- a/test_runner/regress/test_sni_router.py +++ b/test_runner/regress/test_sni_router.py @@ -6,7 +6,8 @@ import backoff from fixtures.log_helper import log -from fixtures.neon_fixtures import PgProtocol, PortDistributor, VanillaPostgres +from fixtures.neon_fixtures import PgProtocol, VanillaPostgres +from fixtures.port_distributor import PortDistributor def generate_tls_cert(cn, certout, keyout): diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 7c80d86863d5..60ec532db497 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -4,11 +4,10 @@ import psycopg2.extras from fixtures.log_helper import log from fixtures.neon_fixtures import ( - LocalFsStorage, NeonEnvBuilder, - RemoteStorageKind, ) from fixtures.pageserver.utils import assert_tenant_state, wait_for_upload +from fixtures.remote_storage import LocalFsStorage, RemoteStorageKind from fixtures.types import Lsn from fixtures.utils import wait_until diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 77f93dbd92b9..932d8219543e 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -11,8 +11,6 @@ Endpoint, NeonEnv, NeonEnvBuilder, - RemoteStorageKind, - available_remote_storages, ) from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient from fixtures.pageserver.utils import ( @@ -20,6 +18,7 @@ wait_for_upload, wait_until_tenant_state, ) +from fixtures.remote_storage import RemoteStorageKind, available_remote_storages from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until from prometheus_client.samples import Sample diff --git a/test_runner/regress/test_tenant_relocation.py 
b/test_runner/regress/test_tenant_relocation.py index 2805d56c9896..eb020c101f4c 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -7,15 +7,12 @@ from typing import Any, Dict, Optional, Tuple import pytest +from fixtures.broker import NeonBroker from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, - NeonBroker, NeonEnv, NeonEnvBuilder, - PortDistributor, - RemoteStorageKind, - available_remote_storages, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -24,6 +21,8 @@ wait_for_last_record_lsn, wait_for_upload, ) +from fixtures.port_distributor import PortDistributor +from fixtures.remote_storage import RemoteStorageKind, available_remote_storages from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import ( query_scalar, diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index e327910138e2..19bc3ed37c8f 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -18,10 +18,9 @@ from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, - RemoteStorageKind, - available_remote_storages, ) from fixtures.pageserver.utils import timeline_delete_wait_completed +from fixtures.remote_storage import RemoteStorageKind, available_remote_storages from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import wait_until from prometheus_client.samples import Sample diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 498563325b8c..79a3b353d458 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -16,11 +16,8 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, - LocalFsStorage, NeonEnv, NeonEnvBuilder, - RemoteStorageKind, - available_remote_storages, last_flush_lsn_upload, ) from fixtures.pageserver.utils import ( @@ -28,6 +25,11 @@ wait_for_last_record_lsn, wait_for_upload, ) +from fixtures.remote_storage import ( + LocalFsStorage, + RemoteStorageKind, + available_remote_storages, +) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index 4d8c87e09e37..b1bc9623ce6a 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -6,10 +6,10 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - RemoteStorageKind, last_flush_lsn_upload, ) from fixtures.pageserver.http import LayerMapInfo +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import TimelineId from pytest_httpserver import HTTPServer diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 9226ca21d2fe..1e8dd3620618 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -13,9 +13,6 @@ NeonEnv, NeonEnvBuilder, PgBin, - RemoteStorageKind, - S3Storage, - available_remote_storages, last_flush_lsn_upload, wait_for_last_flush_lsn, ) @@ -28,6 +25,11 @@ wait_until_tenant_active, wait_until_timeline_state, ) +from fixtures.remote_storage import ( + RemoteStorageKind, + S3Storage, + available_remote_storages, +) from fixtures.types 
import Lsn, TenantId, TimelineId from fixtures.utils import query_scalar, wait_until @@ -229,6 +231,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http.configure_failpoints((failpoint, "return")) + iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4 + # These failpoints are earlier than background task is spawned. # so they result in api request failure. if failpoint in ( @@ -245,7 +249,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=2, # effectively try immediately and retry once in one second + iterations=iterations, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -254,29 +258,44 @@ def test_delete_timeline_exercise_crash_safety_failpoints( # failpoint may not be the only error in the stack assert reason.endswith(f"failpoint: {failpoint}"), reason - wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3 if check is Check.RETRY_WITH_RESTART: env.pageserver.stop() env.pageserver.start() - if failpoint == "timeline-delete-before-index-deleted-at": - # We crashed before persisting this to remote storage, need to retry delete request - # Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading) - wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2) + wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations) + if failpoint == "timeline-delete-before-index-deleted-at": + # We crashed before persisting this to remote storage, need to retry delete request timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) else: # Pageserver should've resumed deletion after restart. wait_timeline_detail_404( - ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer + ps_http, env.initial_tenant, timeline_id, iterations=iterations ) + + if failpoint == "timeline-delete-after-index-delete": + m = ps_http.get_metrics() + assert ( + m.query_one( + "remote_storage_s3_request_seconds_count", + filter={"request_type": "get_object", "result": "err"}, + ).value + == 1 + ) + assert ( + m.query_one( + "remote_storage_s3_request_seconds_count", + filter={"request_type": "get_object", "result": "ok"}, + ).value + == 1 + ) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state ps_http.configure_failpoints((failpoint, "off")) timeline_delete_wait_completed( - ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer + ps_http, env.initial_tenant, timeline_id, iterations=iterations ) # Check remote is impty @@ -404,6 +423,7 @@ def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] assert isinstance(neon_env_builder.remote_storage, S3Storage) # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive. 
+ assert neon_env_builder.remote_storage_client is not None response = neon_env_builder.remote_storage_client.list_objects_v2( Bucket=neon_env_builder.remote_storage.bucket_name, Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "", @@ -569,7 +589,7 @@ def first_call(result_queue): try: log.info("first call start") timeline_delete_wait_completed( - ps_http, env.initial_tenant, child_timeline_id, timeout=10 + ps_http, env.initial_tenant, child_timeline_id, timeout=20 ) log.info("first call success") result_queue.put("success") @@ -683,7 +703,7 @@ def first_request_finished(): wait_until(50, 0.1, first_request_finished) # check that the timeline is gone - wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id) + wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=2) @pytest.mark.parametrize( @@ -758,7 +778,7 @@ def test_timeline_delete_works_for_remote_smoke( ) # for some reason the check above doesnt immediately take effect for the below. - # Assume it is mock server incosistency and check twice. + # Assume it is mock server inconsistency and check twice. wait_until( 2, 0.5, diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index cb993c93d235..f6e4a667a489 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -16,8 +16,6 @@ NeonEnv, NeonEnvBuilder, PgBin, - PortDistributor, - RemoteStorageKind, VanillaPostgres, wait_for_last_flush_lsn, ) @@ -29,6 +27,8 @@ wait_until_tenant_active, ) from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor +from fixtures.remote_storage import RemoteStorageKind from fixtures.types import TenantId, TimelineId from fixtures.utils import get_timeline_dir_size, wait_until diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 24b32ad7e7a5..87f076d23692 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -15,22 +15,18 @@ import psycopg2 import pytest +from fixtures.broker import NeonBroker from fixtures.log_helper import log from fixtures.neon_fixtures import ( Endpoint, - NeonBroker, NeonEnv, NeonEnvBuilder, NeonPageserver, PgBin, PgProtocol, - PortDistributor, - RemoteStorageKind, - RemoteStorageUsers, Safekeeper, SafekeeperHttpClient, SafekeeperPort, - available_remote_storages, ) from fixtures.pageserver.utils import ( timeline_delete_wait_completed, @@ -38,6 +34,12 @@ wait_for_upload, ) from fixtures.pg_version import PgVersion +from fixtures.port_distributor import PortDistributor +from fixtures.remote_storage import ( + RemoteStorageKind, + RemoteStorageUsers, + available_remote_storages, +) from fixtures.types import Lsn, TenantId, TimelineId from fixtures.utils import get_dir_size, query_scalar, start_in_background diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py index bb8ee8f52c57..cfc131a3aa21 100644 --- a/test_runner/regress/test_wal_acceptor_async.py +++ b/test_runner/regress/test_wal_acceptor_async.py @@ -245,7 +245,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder): # we try to simulate large (flush_lsn - truncate_lsn) lag, to test that WAL segments # are not removed before broadcasted to all safekeepers, with the help of replication slot asyncio.run( - run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=5) + 
run_restarts_under_load(env, endpoint, env.safekeepers, period_time=15, iterations=4) ) diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py index f3d3a84c20d1..c97c69db2360 100644 --- a/test_runner/regress/test_wal_restore.py +++ b/test_runner/regress/test_wal_restore.py @@ -5,9 +5,9 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, - PortDistributor, VanillaPostgres, ) +from fixtures.port_distributor import PortDistributor from fixtures.types import TenantId, TimelineId diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index da3885c34db3..28bf5ccfa2fd 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit da3885c34db312afd555802be2ce985fafd1d8ad +Subproject commit 28bf5ccfa2fda9677566a25abd450e714d9ed055 diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index 770c6dffc5ef..553f2d3618a6 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit 770c6dffc5ef6aac05bf049693877fb377eea6fc +Subproject commit 553f2d3618a6d4893bde67f1c065926ee8a3a118 diff --git a/vendor/revisions.json b/vendor/revisions.json index 8579b5abaaec..80d161938eff 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,4 @@ { - "postgres-v15": "770c6dffc5ef6aac05bf049693877fb377eea6fc", - "postgres-v14": "da3885c34db312afd555802be2ce985fafd1d8ad" + "postgres-v15": "553f2d3618a6d4893bde67f1c065926ee8a3a118", + "postgres-v14": "28bf5ccfa2fda9677566a25abd450e714d9ed055" } diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index 3f47ef062f93..d79c7a4104fc 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -60,6 +60,7 @@ url = { version = "2", features = ["serde"] } [build-dependencies] anyhow = { version = "1", features = ["backtrace"] } bytes = { version = "1", features = ["serde"] } +cc = { version = "1", default-features = false, features = ["parallel"] } either = { version = "1" } itertools = { version = "0.10" } libc = { version = "0.2", features = ["extra_traits"] }
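
One last note on the retry changes earlier in the patch: the `wait_longer` flag on the timeline-deletion helpers is replaced by an explicit `iterations` count, which tests scale to the storage backend. A sketch (illustrative only) of the resulting call pattern:

```python
from fixtures.pageserver.utils import timeline_delete_wait_completed
from fixtures.remote_storage import RemoteStorageKind


def delete_timeline(ps_http, tenant_id, timeline_id, remote_storage_kind):
    # Real S3 is slower than the mock server, so poll for the 404 longer there;
    # timeline_delete_wait_completed() forwards the count to wait_timeline_detail_404().
    iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4
    timeline_delete_wait_completed(ps_http, tenant_id, timeline_id, iterations=iterations)
```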