diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 8cb49d5d769c..a759efb56c1b 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -341,7 +341,7 @@ jobs: PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}" TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}" PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring - SYNC_AFTER_EACH_TEST: true + SYNC_BETWEEN_TESTS: true # XXX: no coverage data handling here, since benchmarks are run on release builds, # while coverage is currently collected for the debug ones diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index cad97645327b..5c5423e252c9 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -102,12 +102,17 @@ jobs: # Default set of platforms to run e2e tests on platforms='["docker", "k8s"]' - # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or compute/Dockerfile.compute-node, add k8s-neonvm to the list of platforms. + # If a PR changes anything that affects computes, add k8s-neonvm to the list of platforms. # If the workflow run is not a pull request, add k8s-neonvm to the list. if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do case "$f" in - vendor/*|pgxn/*|libs/vm_monitor/*|compute/Dockerfile.compute-node) + # List of directories that contain code which affect compute images. + # + # This isn't exhaustive, just the paths that are most directly compute-related. + # For example, compute_ctl also depends on libs/utils, but we don't trigger + # an e2e run on that. + vendor/*|pgxn/*|compute_tools/*|libs/vm_monitor/*|compute/Dockerfile.compute-node) platforms=$(echo "${platforms}" | jq --compact-output '. 
+= ["k8s-neonvm"] | unique') ;; *) diff --git a/Cargo.lock b/Cargo.lock index 528ee3319307..6ae5aac12797 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -673,8 +673,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3b829e4e32b91e643de6eafe82b1d90675f5874230191a4ffbc1b336dec4d6bf" dependencies = [ "async-trait", - "axum-core", - "base64 0.21.1", + "axum-core 0.3.4", "bitflags 1.3.2", "bytes", "futures-util", @@ -689,16 +688,47 @@ dependencies = [ "pin-project-lite", "rustversion", "serde", + "sync_wrapper 0.1.2", + "tower", + "tower-layer", + "tower-service", +] + +[[package]] +name = "axum" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3a6c9af12842a67734c9a2e355436e5d03b22383ed60cf13cd0c18fbfe3dcbcf" +dependencies = [ + "async-trait", + "axum-core 0.4.5", + "base64 0.21.1", + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "hyper 1.4.1", + "hyper-util", + "itoa", + "matchit 0.7.0", + "memchr", + "mime", + "percent-encoding", + "pin-project-lite", + "rustversion", + "serde", "serde_json", "serde_path_to_error", "serde_urlencoded", "sha1", - "sync_wrapper", + "sync_wrapper 1.0.1", "tokio", "tokio-tungstenite", "tower", "tower-layer", "tower-service", + "tracing", ] [[package]] @@ -718,6 +748,27 @@ dependencies = [ "tower-service", ] +[[package]] +name = "axum-core" +version = "0.4.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09f2bd6146b97ae3359fa0cc6d6b376d9539582c7b4220f041a33ec24c226199" +dependencies = [ + "async-trait", + "bytes", + "futures-util", + "http 1.1.0", + "http-body 1.0.0", + "http-body-util", + "mime", + "pin-project-lite", + "rustversion", + "sync_wrapper 1.0.1", + "tower-layer", + "tower-service", + "tracing", +] + [[package]] name = "azure_core" version = "0.19.0" @@ -1214,6 +1265,7 @@ version = "0.1.0" dependencies = [ "anyhow", "bytes", + "camino", "cfg-if", "chrono", "clap", @@ -2039,7 +2091,7 @@ dependencies = [ "futures-core", "futures-sink", "http-body-util", - "hyper 1.2.0", + "hyper 1.4.1", "hyper-util", "pin-project", "rand 0.8.5", @@ -2402,6 +2454,15 @@ dependencies = [ "digest", ] +[[package]] +name = "home" +version = "0.5.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e3d1354bf6b7235cb4a0576c2619fd4ed18183f689b12b006a0ee7329eeff9a5" +dependencies = [ + "windows-sys 0.52.0", +] + [[package]] name = "hostname" version = "0.4.0" @@ -2458,9 +2519,9 @@ dependencies = [ [[package]] name = "http-body-util" -version = "0.1.0" +version = "0.1.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "41cb79eb393015dadd30fc252023adb0b2400a0caee0fa2a077e6e21a551e840" +checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", @@ -2543,9 +2604,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.2.0" +version = "1.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "186548d73ac615b32a73aafe38fb4f56c0d340e110e5a200bcadbaf2e199263a" +checksum = "50dfd22e0e76d0f662d429a5f80fcaf3855009297eab6a0a9f8543834744ba05" dependencies = [ "bytes", "futures-channel", @@ -2585,7 +2646,7 @@ checksum = "a0bea761b46ae2b24eb4aef630d8d1c398157b6fc29e6350ecf090a0b70c952c" dependencies = [ "futures-util", "http 1.1.0", - "hyper 1.2.0", + "hyper 1.4.1", "hyper-util", "rustls 0.22.4", "rustls-pki-types", @@ -2608,16 +2669,16 @@ dependencies = [ [[package]] name = "hyper-util" -version = 
"0.1.3" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ca38ef113da30126bbff9cd1705f9273e15d45498615d138b0c20279ac7a76aa" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" dependencies = [ "bytes", "futures-channel", "futures-util", "http 1.1.0", "http-body 1.0.0", - "hyper 1.2.0", + "hyper 1.4.1", "pin-project-lite", "socket2", "tokio", @@ -3423,7 +3484,7 @@ dependencies = [ "opentelemetry", "opentelemetry_sdk", "prost 0.13.3", - "tonic 0.12.2", + "tonic 0.12.3", ] [[package]] @@ -4312,7 +4373,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper 0.14.30", - "hyper 1.2.0", + "hyper 1.4.1", "hyper-util", "indexmap 2.0.1", "ipnet", @@ -4678,7 +4739,7 @@ dependencies = [ "scopeguard", "serde", "serde_json", - "sync_wrapper", + "sync_wrapper 0.1.2", "test-context", "tokio", "tokio-stream", @@ -4743,7 +4804,7 @@ dependencies = [ "http 1.1.0", "http-body 1.0.0", "http-body-util", - "hyper 1.2.0", + "hyper 1.4.1", "hyper-rustls 0.26.0", "hyper-util", "ipnet", @@ -4759,7 +4820,7 @@ dependencies = [ "serde", "serde_json", "serde_urlencoded", - "sync_wrapper", + "sync_wrapper 0.1.2", "tokio", "tokio-rustls 0.25.0", "tokio-util", @@ -4800,7 +4861,7 @@ dependencies = [ "futures", "getrandom 0.2.11", "http 1.1.0", - "hyper 1.2.0", + "hyper 1.4.1", "parking_lot 0.11.2", "reqwest 0.12.4", "reqwest-middleware", @@ -5897,6 +5958,12 @@ dependencies = [ "futures-core", ] +[[package]] +name = "sync_wrapper" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" + [[package]] name = "synstructure" version = "0.12.6" @@ -6241,9 +6308,9 @@ dependencies = [ [[package]] name = "tokio-stream" -version = "0.1.14" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "397c988d37662c7dda6d2208364a706264bf3d6138b11d436cbac0ad38832842" +checksum = "4f4e6ce100d0eb49a2734f8c0812bcd324cf357d21810932c5df6b96ef2b86f1" dependencies = [ "futures-core", "pin-project-lite", @@ -6267,9 +6334,9 @@ dependencies = [ [[package]] name = "tokio-tungstenite" -version = "0.20.0" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b2dbec703c26b00d74844519606ef15d09a7d6857860f84ad223dec002ddea2" +checksum = "c83b561d025642014097b66e6c1bb422783339e0909e4429cde4749d1990bc38" dependencies = [ "futures-util", "log", @@ -6336,7 +6403,7 @@ checksum = "3082666a3a6433f7f511c7192923fa1fe07c69332d3c6a2e6bb040b569199d5a" dependencies = [ "async-stream", "async-trait", - "axum", + "axum 0.6.20", "base64 0.21.1", "bytes", "futures-core", @@ -6362,9 +6429,9 @@ dependencies = [ [[package]] name = "tonic" -version = "0.12.2" +version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6f6ba989e4b2c58ae83d862d3a3e27690b6e3ae630d0deb59f3697f32aa88ad" +checksum = "877c5b330756d856ffcc4553ab34a5684481ade925ecc54bcd1bf02b1d0d4d52" dependencies = [ "async-trait", "base64 0.22.1", @@ -6562,14 +6629,14 @@ checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" [[package]] name = "tungstenite" -version = "0.20.1" +version = "0.21.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e3dac10fd62eaf6617d3a904ae222845979aec67c615d1c842b4002c7666fb9" +checksum = "9ef1a641ea34f399a848dea702823bbecfb4c486f911735368f1f137cb8257e1" dependencies = [ "byteorder", "bytes", "data-encoding", - "http 0.2.9", + 
"http 1.1.0", "httparse", "log", "rand 0.8.5", @@ -6797,7 +6864,7 @@ name = "vm_monitor" version = "0.1.0" dependencies = [ "anyhow", - "axum", + "axum 0.7.5", "cgroups-rs", "clap", "futures", @@ -7030,13 +7097,14 @@ dependencies = [ [[package]] name = "which" -version = "4.4.0" +version = "4.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2441c784c52b289a054b7201fc93253e288f094e2f4be9058343127c4226a269" +checksum = "87ba24419a2078cd2b0f2ede2691b6c66d8e47836da3b6db8265ebad47afbfc7" dependencies = [ "either", - "libc", + "home", "once_cell", + "rustix", ] [[package]] @@ -7267,15 +7335,9 @@ version = "0.1.0" dependencies = [ "ahash", "anyhow", - "aws-config", - "aws-runtime", - "aws-sigv4", - "aws-smithy-async", - "aws-smithy-http", - "aws-smithy-types", - "axum", "base64 0.21.1", "base64ct", + "bitflags 2.4.1", "bytes", "camino", "cc", @@ -7300,6 +7362,8 @@ dependencies = [ "hex", "hmac", "hyper 0.14.30", + "hyper 1.4.1", + "hyper-util", "indexmap 1.9.3", "itertools 0.10.5", "itertools 0.12.1", @@ -7332,7 +7396,7 @@ dependencies = [ "subtle", "syn 1.0.109", "syn 2.0.52", - "sync_wrapper", + "sync_wrapper 0.1.2", "tikv-jemalloc-sys", "time", "time-macros", @@ -7344,7 +7408,6 @@ dependencies = [ "tracing", "tracing-core", "url", - "uuid", "zeroize", "zstd", "zstd-safe", diff --git a/Cargo.toml b/Cargo.toml index 7997d34c3354..ed7dda235a0f 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -53,7 +53,7 @@ azure_storage_blobs = { version = "0.19", default-features = false, features = [ flate2 = "1.0.26" async-stream = "0.3" async-trait = "0.1" -aws-config = { version = "1.5", default-features = false, features=["rustls"] } +aws-config = { version = "1.5", default-features = false, features=["rustls", "sso"] } aws-sdk-s3 = "1.52" aws-sdk-iam = "1.46.0" aws-smithy-async = { version = "1.2.1", default-features = false, features=["rt-tokio"] } @@ -61,7 +61,7 @@ aws-smithy-types = "1.2" aws-credential-types = "1.2.0" aws-sigv4 = { version = "1.2", features = ["sign-http"] } aws-types = "1.3" -axum = { version = "0.6.20", features = ["ws"] } +axum = { version = "0.7.5", features = ["ws"] } base64 = "0.13.0" bincode = "1.3" bindgen = "0.70" @@ -96,10 +96,13 @@ hmac = "0.12.1" hostname = "0.4" http = {version = "1.1.0", features = ["std"]} http-types = { version = "2", default-features = false } +http-body-util = "0.1.2" humantime = "2.1" humantime-serde = "1.1.1" -hyper = "0.14" -tokio-tungstenite = "0.20.0" +hyper0 = { package = "hyper", version = "0.14" } +hyper = "1.4" +hyper-util = "0.1" +tokio-tungstenite = "0.21.0" indexmap = "2" indoc = "2" ipnet = "2.9.0" diff --git a/README.md b/README.md index b54956f773d8..cfc63b47087c 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh 1. 
Install XCode and dependencies ``` xcode-select --install -brew install protobuf openssl flex bison icu4c pkg-config +brew install protobuf openssl flex bison icu4c pkg-config m4 # add openssl to PATH, required for ed25519 keys generation in neon_local echo 'export PATH="$(brew --prefix openssl)/bin:$PATH"' >> ~/.zshrc diff --git a/compute/Dockerfile.compute-node b/compute/Dockerfile.compute-node index eb4682445c1a..5332b9ca1fc5 100644 --- a/compute/Dockerfile.compute-node +++ b/compute/Dockerfile.compute-node @@ -27,8 +27,8 @@ RUN case $DEBIAN_FLAVOR in \ ;; \ esac && \ apt update && \ - apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ - zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ + apt install --no-install-recommends -y git autoconf automake libtool build-essential bison flex libreadline-dev \ + zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget ca-certificates pkg-config libssl-dev \ libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd \ $VERSION_INSTALLS @@ -104,7 +104,7 @@ FROM build-deps AS postgis-build ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt update && \ - apt install -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ + apt install --no-install-recommends -y gdal-bin libboost-dev libboost-thread-dev libboost-filesystem-dev \ libboost-system-dev libboost-iostreams-dev libboost-program-options-dev libboost-timer-dev \ libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \ protobuf-c-compiler xsltproc @@ -182,7 +182,7 @@ RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ apt update && \ - apt install -y ninja-build python3-dev libncurses5 binutils clang + apt install --no-install-recommends -y ninja-build python3-dev libncurses5 binutils clang RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ @@ -587,7 +587,7 @@ RUN case "${PG_VERSION}" in "v17") \ echo "v17 extensions are not supported yet. Quit" && exit 0;; \ esac && \ apt-get update && \ - apt-get install -y \ + apt-get install --no-install-recommends -y \ libboost-iostreams1.74-dev \ libboost-regex1.74-dev \ libboost-serialization1.74-dev \ @@ -752,7 +752,7 @@ ARG PG_VERSION COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ RUN apt-get update && \ - apt-get install -y curl libclang-dev && \ + apt-get install --no-install-recommends -y curl libclang-dev && \ useradd -ms /bin/bash nonroot -b /home ENV HOME=/home/nonroot @@ -880,9 +880,6 @@ RUN case "${PG_VERSION}" in "v17") \ mkdir pg_session_jwt-src && cd pg_session_jwt-src && tar xzf ../pg_session_jwt.tar.gz --strip-components=1 -C . 
&& \ sed -i 's/pgrx = "=0.11.3"/pgrx = { version = "=0.11.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \ cargo pgrx install --release - # it's needed to enable extension because it uses untrusted C language - # sed -i 's/superuser = false/superuser = true/g' /usr/local/pgsql/share/extension/pg_session_jwt.control && \ - # echo "trusted = true" >> /usr/local/pgsql/share/extension/pg_session_jwt.control ######################################################################################### # @@ -1058,9 +1055,12 @@ FROM debian:$DEBIAN_FLAVOR AS pgbouncer ARG DEBIAN_FLAVOR RUN set -e \ && apt-get update \ - && apt-get install -y \ + && apt-get install --no-install-recommends -y \ build-essential \ git \ + ca-certificates \ + autoconf \ + automake \ libevent-dev \ libtool \ pkg-config @@ -1075,6 +1075,20 @@ RUN set -e \ && make -j $(nproc) dist_man_MANS= \ && make install dist_man_MANS= +######################################################################################### +# +# Compile the Neon-specific `local_proxy` binary +# +######################################################################################### +FROM $REPOSITORY/$IMAGE:$TAG AS local_proxy +ARG BUILD_TAG +ENV BUILD_TAG=$BUILD_TAG + +USER nonroot +# Copy entire project to get Cargo.* files with proper dependencies for the whole project +COPY --chown=nonroot . . +RUN mold -run cargo build --locked --profile release-line-debug-size-lto --bin local_proxy + ######################################################################################### # # Layers "postgres-exporter" and "sql-exporter" @@ -1213,6 +1227,10 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb COPY --from=pgbouncer /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer COPY --chmod=0666 --chown=postgres compute/etc/pgbouncer.ini /etc/pgbouncer.ini +# local_proxy and its config +COPY --from=local_proxy --chown=postgres /home/nonroot/target/release-line-debug-size-lto/local_proxy /usr/local/bin/local_proxy +RUN mkdir -p /etc/local_proxy && chown postgres:postgres /etc/local_proxy + # Metrics exporter binaries and configuration files COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter COPY --from=sql-exporter /bin/sql_exporter /bin/sql_exporter diff --git a/compute/etc/neon_collector.yml b/compute/etc/neon_collector.yml index 0a05acbbe674..92da0cdbdd72 100644 --- a/compute/etc/neon_collector.yml +++ b/compute/etc/neon_collector.yml @@ -148,7 +148,7 @@ metrics: values: [pageserver_send_flushes_total] query_ref: neon_perf_counters -- metric_name: getpage_wait_seconds_buckets +- metric_name: getpage_wait_seconds_bucket type: counter help: 'Histogram buckets of getpage request latency' key_labels: diff --git a/compute/vm-image-spec.yaml b/compute/vm-image-spec.yaml index 50fcd62e4f38..43e57a4ed598 100644 --- a/compute/vm-image-spec.yaml +++ b/compute/vm-image-spec.yaml @@ -19,6 +19,10 @@ commands: user: postgres sysvInitAction: respawn shell: '/usr/local/bin/pgbouncer /etc/pgbouncer.ini' + - name: local_proxy + user: postgres + sysvInitAction: respawn + shell: '/usr/local/bin/local_proxy --config-path /etc/local_proxy/config.json --pid-path /etc/local_proxy/pid --http 0.0.0.0:10432' - name: postgres-exporter user: nobody sysvInitAction: respawn diff --git a/compute_tools/Cargo.toml b/compute_tools/Cargo.toml index b6d84d7eff60..91e0b9d5b87c 100644 --- a/compute_tools/Cargo.toml +++ b/compute_tools/Cargo.toml @@ -11,12 +11,13 @@ testing = [] [dependencies] anyhow.workspace = true 
+camino.workspace = true chrono.workspace = true cfg-if.workspace = true clap.workspace = true flate2.workspace = true futures.workspace = true -hyper = { workspace = true, features = ["full"] } +hyper0 = { workspace = true, features = ["full"] } nix.workspace = true notify.workspace = true num_cpus.workspace = true diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 2f6e2bdb2cf0..ba7b4f37df0f 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -34,6 +34,7 @@ use nix::sys::signal::{kill, Signal}; use remote_storage::{DownloadError, RemotePath}; use crate::checker::create_availability_check_data; +use crate::local_proxy; use crate::logger::inlinify; use crate::pg_helpers::*; use crate::spec::*; @@ -886,6 +887,11 @@ impl ComputeNode { // 'Close' connection drop(client); + if let Some(ref local_proxy) = spec.local_proxy_config { + info!("configuring local_proxy"); + local_proxy::configure(local_proxy).context("apply_config local_proxy")?; + } + // Run migrations separately to not hold up cold starts thread::spawn(move || { let mut connstr = connstr.clone(); @@ -936,6 +942,19 @@ impl ComputeNode { }); } + if let Some(ref local_proxy) = spec.local_proxy_config { + info!("configuring local_proxy"); + + // Spawn a thread to do the configuration, + // so that we don't block the main thread that starts Postgres. + let local_proxy = local_proxy.clone(); + let _handle = Some(thread::spawn(move || { + if let Err(err) = local_proxy::configure(&local_proxy) { + error!("error while configuring local_proxy: {err:?}"); + } + })); + } + // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); @@ -1023,6 +1042,19 @@ impl ComputeNode { }); } + if let Some(local_proxy) = &pspec.spec.local_proxy_config { + info!("configuring local_proxy"); + + // Spawn a thread to do the configuration, + // so that we don't block the main thread that starts Postgres. + let local_proxy = local_proxy.clone(); + let _handle = thread::spawn(move || { + if let Err(err) = local_proxy::configure(&local_proxy) { + error!("error while configuring local_proxy: {err:?}"); + } + }); + } + info!( "start_compute spec.remote_extensions {:?}", pspec.spec.remote_extensions diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index c5b4ca632ca4..477f423aa232 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -2,6 +2,9 @@ //! configuration. #![deny(unsafe_code)] #![deny(clippy::undocumented_unsafe_blocks)] + +extern crate hyper0 as hyper; + pub mod checker; pub mod config; pub mod configurator; @@ -12,6 +15,7 @@ pub mod catalog; pub mod compute; pub mod disk_quota; pub mod extension_server; +pub mod local_proxy; pub mod lsn_lease; mod migration; pub mod monitor; diff --git a/compute_tools/src/local_proxy.rs b/compute_tools/src/local_proxy.rs new file mode 100644 index 000000000000..3de3c58786b4 --- /dev/null +++ b/compute_tools/src/local_proxy.rs @@ -0,0 +1,56 @@ +//! Local Proxy is a feature of our BaaS Neon Authorize project. +//! +//! Local Proxy validates JWTs and manages the pg_session_jwt extension. +//! It also maintains a connection pool to postgres. 
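Before the new `local_proxy` module continues below, a note on the hyper changes in this hunk: the workspace `Cargo.toml` now declares `hyper0 = { package = "hyper", version = "0.14" }` next to `hyper = "1.4"`, and crates that still use the 0.14 API (here `compute_tools`, later `utils` and `pageserver`) add `extern crate hyper0 as hyper;` so their existing `hyper::` paths keep resolving to the old version. A minimal sketch of the pattern, assuming a crate that only depends on the renamed `hyper0`:

```rust
// lib.rs of a crate still on the hyper 0.14 API.
//
// Assumed Cargo.toml entry (the renaming used by this PR's workspace file):
//   hyper0 = { package = "hyper", version = "0.14", features = ["full"] }
//
// Re-bind the renamed dependency under its old name so pre-existing code that
// says `hyper::...` keeps compiling against 0.14 without any edits.
extern crate hyper0 as hyper;

/// Legacy code, unchanged: still sees the 0.14 `Body` type.
pub fn empty_body() -> hyper::Body {
    hyper::Body::empty()
}
```

Each crate then opts into exactly one of the two names, which keeps the 0.14 → 1.x upgrade incremental per crate instead of one repository-wide switch.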
+
+use anyhow::{Context, Result};
+use camino::Utf8Path;
+use compute_api::spec::LocalProxySpec;
+use nix::sys::signal::Signal;
+use utils::pid_file::{self, PidFileRead};
+
+pub fn configure(local_proxy: &LocalProxySpec) -> Result<()> {
+    write_local_proxy_conf("/etc/local_proxy/config.json".as_ref(), local_proxy)?;
+    notify_local_proxy("/etc/local_proxy/pid".as_ref())?;
+
+    Ok(())
+}
+
+/// Create or completely rewrite configuration file specified by `path`
+fn write_local_proxy_conf(path: &Utf8Path, local_proxy: &LocalProxySpec) -> Result<()> {
+    let config =
+        serde_json::to_string_pretty(local_proxy).context("serializing LocalProxySpec to json")?;
+    std::fs::write(path, config).with_context(|| format!("writing {path}"))?;
+
+    Ok(())
+}
+
+/// Notify local proxy about a new config file.
+fn notify_local_proxy(path: &Utf8Path) -> Result<()> {
+    match pid_file::read(path)? {
+        // if the file doesn't exist, or isn't locked, local_proxy isn't running
+        // and will naturally pick up our config later
+        PidFileRead::NotExist | PidFileRead::NotHeldByAnyProcess(_) => {}
+        PidFileRead::LockedByOtherProcess(pid) => {
+            // From the pid_file docs:
+            //
+            // > 1. The other process might exit at any time, turning the given PID stale.
+            // > 2. There is a small window in which `claim_for_current_process` has already
+            // >    locked the file but not yet updates its contents. [`read`] will return
+            // >    this variant here, but with the old file contents, i.e., a stale PID.
+            // >
+            // > The kernel is free to recycle PID once it has been `wait(2)`ed upon by
+            // > its creator. Thus, acting upon a stale PID, e.g., by issuing a `kill`
+            // > system call on it, bears the risk of killing an unrelated process.
+            // > This is an inherent limitation of using pidfiles.
+            // > The only race-free solution is to have a supervisor-process with a lifetime
+            // > that exceeds that of all of its child-processes (e.g., `runit`, `supervisord`).
+            //
+            // This is an ok risk as we only send a SIGHUP which likely won't actually
+            // kill the process, only reload config.
+ nix::sys::signal::kill(pid, Signal::SIGHUP).context("sending signal to local_proxy")?; + } + } + + Ok(()) +} diff --git a/control_plane/Cargo.toml b/control_plane/Cargo.toml index 355eca0fe5a7..f71810284725 100644 --- a/control_plane/Cargo.toml +++ b/control_plane/Cargo.toml @@ -14,7 +14,7 @@ humantime.workspace = true nix.workspace = true once_cell.workspace = true humantime-serde.workspace = true -hyper.workspace = true +hyper0.workspace = true regex.workspace = true reqwest = { workspace = true, features = ["blocking", "json"] } scopeguard.workspace = true diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 18f396b88615..7cdf6217373b 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -599,6 +599,7 @@ impl Endpoint { remote_extensions, pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), + local_proxy_config: None, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/control_plane/src/local_env.rs b/control_plane/src/local_env.rs index d616154af6b1..9dc2a0c36b62 100644 --- a/control_plane/src/local_env.rs +++ b/control_plane/src/local_env.rs @@ -168,6 +168,9 @@ pub struct NeonStorageControllerConf { #[serde(with = "humantime_serde")] pub heartbeat_interval: Duration, + + #[serde(with = "humantime_serde")] + pub long_reconcile_threshold: Option, } impl NeonStorageControllerConf { @@ -190,6 +193,7 @@ impl Default for NeonStorageControllerConf { split_threshold: None, max_secondary_lag_bytes: None, heartbeat_interval: Self::DEFAULT_HEARTBEAT_INTERVAL, + long_reconcile_threshold: None, } } } diff --git a/control_plane/src/storage_controller.rs b/control_plane/src/storage_controller.rs index 36e5e04c86dd..43c63e7ef415 100644 --- a/control_plane/src/storage_controller.rs +++ b/control_plane/src/storage_controller.rs @@ -3,7 +3,7 @@ use crate::{ local_env::{LocalEnv, NeonStorageControllerConf}, }; use camino::{Utf8Path, Utf8PathBuf}; -use hyper::Uri; +use hyper0::Uri; use nix::unistd::Pid; use pageserver_api::{ controller_api::{ @@ -517,6 +517,13 @@ impl StorageController { args.push(format!("--max-secondary-lag-bytes={lag}")) } + if let Some(threshold) = self.config.long_reconcile_threshold { + args.push(format!( + "--long-reconcile-threshold={}", + humantime::Duration::from(threshold) + )) + } + args.push(format!( "--neon-local-repo-dir={}", self.env.base_data_dir.display() diff --git a/deny.toml b/deny.toml index 30eb90e6cfbd..327ac58db722 100644 --- a/deny.toml +++ b/deny.toml @@ -27,10 +27,6 @@ yanked = "warn" id = "RUSTSEC-2023-0071" reason = "the marvin attack only affects private key decryption, not public key signature verification" -[[advisories.ignore]] -id = "RUSTSEC-2024-0376" -reason = "gRPC endpoints in Neon are not exposed externally" - # This section is considered when running `cargo deny check licenses` # More documentation for the licenses section can be found here: # https://embarkstudios.github.io/cargo-deny/checks/licenses/cfg.html diff --git a/docs/docker.md b/docs/docker.md index ce806c4e6cfd..d16311c27bd6 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -5,7 +5,7 @@ Currently we build two main images: - [neondatabase/neon](https://hub.docker.com/repository/docker/neondatabase/neon) — image with pre-built `pageserver`, `safekeeper` and `proxy` binaries and all the required runtime dependencies. Built from [/Dockerfile](/Dockerfile). 
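For the `notify_local_proxy` helper above: compute_ctl only rewrites `/etc/local_proxy/config.json` and sends SIGHUP to the PID recorded in the locked pid file; the proxy process is expected to re-read the file on that signal. A rough sketch of what such a receiver could look like using tokio's Unix signal support — this side is an assumption about local_proxy's behaviour, not code from this PR:

```rust
use tokio::signal::unix::{signal, SignalKind};

// Hypothetical reload loop on the local_proxy side (requires tokio's "signal"
// and "fs" features): re-read the config whenever compute_ctl sends SIGHUP
// after rewriting /etc/local_proxy/config.json.
async fn reload_on_sighup(config_path: &str) -> anyhow::Result<()> {
    let mut hup = signal(SignalKind::hangup())?;
    loop {
        hup.recv().await;
        let raw = tokio::fs::read_to_string(config_path).await?;
        // Parse `raw` into the LocalProxySpec that write_local_proxy_conf
        // serialized, and swap it into the running configuration here.
        tracing::info!(bytes = raw.len(), "reloaded local_proxy config");
    }
}
```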
-- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. +- [neondatabase/compute-node-v16](https://hub.docker.com/repository/docker/neondatabase/compute-node-v16) — compute node image with pre-built Postgres binaries from [neondatabase/postgres](https://github.com/neondatabase/postgres). Similar images exist for v15 and v14. Built from [/compute-node/Dockerfile](/compute/Dockerfile.compute-node). And additional intermediate image: diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 83515a00a013..5903db70550d 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -106,6 +106,10 @@ pub struct ComputeSpec { // Stripe size for pageserver sharding, in pages #[serde(default)] pub shard_stripe_size: Option, + + /// Local Proxy configuration used for JWT authentication + #[serde(default)] + pub local_proxy_config: Option, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -278,11 +282,13 @@ pub struct GenericOption { /// declare a `trait` on it. pub type GenericOptions = Option>; -/// Configured the local-proxy application with the relevant JWKS and roles it should +/// Configured the local_proxy application with the relevant JWKS and roles it should /// use for authorizing connect requests using JWT. #[derive(Clone, Debug, Deserialize, Serialize)] pub struct LocalProxySpec { - pub jwks: Vec, + #[serde(default)] + #[serde(skip_serializing_if = "Option::is_none")] + pub jwks: Option>, } #[derive(Clone, Debug, Deserialize, Serialize)] diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 95310fdbac2b..105c8a50d3d2 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -296,7 +296,14 @@ pub mod defaults { pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100; - pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 128 * 1024; // 128 KiB + /// Soft limit for the maximum size of a vectored read. + /// + /// This is determined by the largest NeonWalRecord that can exist (minus dbdir and reldir keys + /// which are bounded by the blob io limits only). As of this writing, that is a `NeonWalRecord::ClogSetCommitted` record, + /// with 32k xids. That's the max number of XIDS on a single CLOG page. The size of such a record + /// is `sizeof(Transactionid) * 32768 + (some fixed overhead from 'timestamp`, the Vec length and whatever extra serde serialization adds)`. + /// That is, slightly above 128 kB. + pub const DEFAULT_MAX_VECTORED_READ_BYTES: usize = 130 * 1024; // 130 KiB pub const DEFAULT_IMAGE_COMPRESSION: ImageCompressionAlgorithm = ImageCompressionAlgorithm::Zstd { level: Some(1) }; diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 4a776709c953..b3fcaae62f3f 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -748,6 +748,16 @@ impl Key { self.field1 == 0x00 && self.field4 != 0 && self.field6 != 0xffffffff } + #[inline(always)] + pub fn is_rel_dir_key(&self) -> bool { + self.field1 == 0x00 + && self.field2 != 0 + && self.field3 != 0 + && self.field4 == 0 + && self.field5 == 0 + && self.field6 == 1 + } + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. 
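To make the sizing argument in the `DEFAULT_MAX_VECTORED_READ_BYTES` comment above concrete: a CLOG page covers 32768 transaction ids, so a `NeonWalRecord::ClogSetCommitted` that carries every xid on one page holds 4 bytes × 32768 = 131072 bytes = exactly 128 KiB of xids, plus a small fixed amount of framing, which is why the soft limit moves from 128 KiB to 130 KiB. A back-of-the-envelope check (the exact serialized overhead is an assumption):

```rust
// Rough estimate behind the 130 KiB soft limit (illustrative only).
const XIDS_PER_CLOG_PAGE: usize = 32 * 1024; // 32768 xids fit on one CLOG page
const XID_SIZE: usize = std::mem::size_of::<u32>(); // TransactionId is 4 bytes

fn main() {
    let xid_payload = XIDS_PER_CLOG_PAGE * XID_SIZE; // 131072 bytes = 128 KiB
    let assumed_overhead = 64; // timestamp, Vec length, serde framing (assumed)
    let worst_case = xid_payload + assumed_overhead;
    assert!(worst_case <= 130 * 1024, "fits under the new 130 KiB limit");
    println!("worst-case ClogSetCommitted ≈ {worst_case} bytes");
}
```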
#[inline(always)] pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { diff --git a/libs/remote_storage/Cargo.toml b/libs/remote_storage/Cargo.toml index f48f1801a4d2..be4d61f00925 100644 --- a/libs/remote_storage/Cargo.toml +++ b/libs/remote_storage/Cargo.toml @@ -16,7 +16,7 @@ aws-sdk-s3.workspace = true bytes.workspace = true camino = { workspace = true, features = ["serde1"] } humantime-serde.workspace = true -hyper = { workspace = true, features = ["stream"] } +hyper0 = { workspace = true, features = ["stream"] } futures.workspace = true serde.workspace = true serde_json.workspace = true diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index cb7479f6cdcc..e113a987a53e 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -14,7 +14,7 @@ use std::time::SystemTime; use super::REMOTE_STORAGE_PREFIX_SEPARATOR; use anyhow::Result; -use azure_core::request_options::{MaxResults, Metadata, Range}; +use azure_core::request_options::{IfMatchCondition, MaxResults, Metadata, Range}; use azure_core::{Continuable, RetryOptions}; use azure_identity::DefaultAzureCredential; use azure_storage::StorageCredentials; @@ -33,10 +33,10 @@ use tracing::debug; use utils::backoff; use crate::metrics::{start_measuring_requests, AttemptOutcome, RequestKind}; -use crate::ListingObject; use crate::{ - config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, Listing, - ListingMode, RemotePath, RemoteStorage, StorageMetadata, TimeTravelError, TimeoutOrCancel, + config::AzureConfig, error::Cancelled, ConcurrencyLimiter, Download, DownloadError, + DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, RemoteStorage, StorageMetadata, + TimeTravelError, TimeoutOrCancel, }; pub struct AzureBlobStorage { @@ -259,6 +259,7 @@ fn to_download_error(error: azure_core::Error) -> DownloadError { if let Some(http_err) = error.as_http_error() { match http_err.status() { StatusCode::NotFound => DownloadError::NotFound, + StatusCode::NotModified => DownloadError::Unmodified, StatusCode::BadRequest => DownloadError::BadInput(anyhow::Error::new(error)), _ => DownloadError::Other(anyhow::Error::new(error)), } @@ -484,11 +485,16 @@ impl RemoteStorage for AzureBlobStorage { async fn download( &self, from: &RemotePath, + opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { let blob_client = self.client.blob_client(self.relative_path_to_name(from)); - let builder = blob_client.get(); + let mut builder = blob_client.get(); + + if let Some(ref etag) = opts.etag { + builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string())) + } self.download_for_builder(builder, cancel).await } diff --git a/libs/remote_storage/src/error.rs b/libs/remote_storage/src/error.rs index 5fd0eaabc7ef..17790e9f70df 100644 --- a/libs/remote_storage/src/error.rs +++ b/libs/remote_storage/src/error.rs @@ -5,6 +5,8 @@ pub enum DownloadError { BadInput(anyhow::Error), /// The file was not found in the remote storage. NotFound, + /// The caller provided an ETag, and the file was not modified. + Unmodified, /// A cancellation token aborted the download, typically during /// tenant detach or process shutdown. 
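The `DownloadOpts` struct and the `DownloadError::Unmodified` variant introduced in this part of the diff turn the old download-then-compare-ETag pattern into a conditional GET (If-None-Match on S3 and Azure, an ETag comparison in the local-fs backend). A sketch of how a caller uses it, following the trait signature added by this PR; the caching wrapper itself is hypothetical:

```rust
use remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage, RemotePath};
use tokio_util::sync::CancellationToken;

/// Hypothetical cached fetch: pass the previously seen ETag so an unchanged
/// object costs only a NotModified round trip instead of a full body download.
async fn fetch_if_changed(
    storage: &GenericRemoteStorage,
    path: &RemotePath,
    prev_etag: Option<Etag>,
    cancel: &CancellationToken,
) -> Result<Option<Etag>, DownloadError> {
    let opts = DownloadOpts { etag: prev_etag };
    match storage.download(path, &opts, cancel).await {
        Ok(download) => {
            // Consume download.download_stream here, then remember the new tag.
            Ok(Some(download.etag))
        }
        Err(DownloadError::Unmodified) => Ok(None), // cached copy is still current
        Err(err) => Err(err),
    }
}
```

The secondary-mode heatmap downloader later in this diff is the first user of exactly this shape, replacing its earlier "download, then ignore the body if the ETag matches" logic.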
Cancelled, @@ -24,6 +26,7 @@ impl std::fmt::Display for DownloadError { write!(f, "Failed to download a remote file due to user input: {e}") } DownloadError::NotFound => write!(f, "No file found for the remote object id given"), + DownloadError::Unmodified => write!(f, "File was not modified"), DownloadError::Cancelled => write!(f, "Cancelled, shutting down"), DownloadError::Timeout => write!(f, "timeout"), DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"), @@ -38,7 +41,7 @@ impl DownloadError { pub fn is_permanent(&self) -> bool { use DownloadError::*; match self { - BadInput(_) | NotFound | Cancelled => true, + BadInput(_) | NotFound | Unmodified | Cancelled => true, Timeout | Other(_) => false, } } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 45267ccda992..0ff0f1c87822 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -161,6 +161,14 @@ pub struct Listing { pub keys: Vec, } +/// Options for downloads. The default value is a plain GET. +#[derive(Default)] +pub struct DownloadOpts { + /// If given, returns [`DownloadError::Unmodified`] if the object still has + /// the same ETag (using If-None-Match). + pub etag: Option, +} + /// Storage (potentially remote) API to manage its state. /// This storage tries to be unaware of any layered repository context, /// providing basic CRUD operations for storage files. @@ -245,6 +253,7 @@ pub trait RemoteStorage: Send + Sync + 'static { async fn download( &self, from: &RemotePath, + opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result; @@ -401,16 +410,18 @@ impl GenericRemoteStorage> { } } + /// See [`RemoteStorage::download`] pub async fn download( &self, from: &RemotePath, + opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { match self { - Self::LocalFs(s) => s.download(from, cancel).await, - Self::AwsS3(s) => s.download(from, cancel).await, - Self::AzureBlob(s) => s.download(from, cancel).await, - Self::Unreliable(s) => s.download(from, cancel).await, + Self::LocalFs(s) => s.download(from, opts, cancel).await, + Self::AwsS3(s) => s.download(from, opts, cancel).await, + Self::AzureBlob(s) => s.download(from, opts, cancel).await, + Self::Unreliable(s) => s.download(from, opts, cancel).await, } } @@ -572,7 +583,7 @@ impl GenericRemoteStorage { ) -> Result { match byte_range { Some((start, end)) => self.download_byte_range(from, start, end, cancel).await, - None => self.download(from, cancel).await, + None => self.download(from, &DownloadOpts::default(), cancel).await, } } diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index c3ef18cab1e8..d912b94c7493 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -23,8 +23,8 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken}; use utils::crashsafe::path_with_suffix_extension; use crate::{ - Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, TimeTravelError, - TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, + Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, RemotePath, + TimeTravelError, TimeoutOrCancel, REMOTE_STORAGE_PREFIX_SEPARATOR, }; use super::{RemoteStorage, StorageMetadata}; @@ -494,11 +494,17 @@ impl RemoteStorage for LocalFs { async fn download( &self, from: &RemotePath, + opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { let target_path = from.with_base(&self.storage_root); let file_metadata = 
file_metadata(&target_path).await?; + let etag = mock_etag(&file_metadata); + + if opts.etag.as_ref() == Some(&etag) { + return Err(DownloadError::Unmodified); + } let source = ReaderStream::new( fs::OpenOptions::new() @@ -519,7 +525,6 @@ impl RemoteStorage for LocalFs { let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let source = crate::support::DownloadStream::new(cancel_or_timeout, source); - let etag = mock_etag(&file_metadata); Ok(Download { metadata, last_modified: file_metadata @@ -692,7 +697,7 @@ mod fs_tests { ) -> anyhow::Result { let cancel = CancellationToken::new(); let download = storage - .download(remote_storage_path, &cancel) + .download(remote_storage_path, &DownloadOpts::default(), &cancel) .await .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?; ensure!( @@ -773,8 +778,8 @@ mod fs_tests { "We should upload and download the same contents" ); - let non_existing_path = "somewhere/else"; - match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await { + let non_existing_path = RemotePath::new(Utf8Path::new("somewhere/else"))?; + match storage.download(&non_existing_path, &DownloadOpts::default(), &cancel).await { Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"), } @@ -1101,7 +1106,13 @@ mod fs_tests { storage.upload(body, len, &path, None, &cancel).await?; } - let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + let read = aggregate( + storage + .download(&path, &DownloadOpts::default(), &cancel) + .await? + .download_stream, + ) + .await?; assert_eq!(body, read); let shorter = Bytes::from_static(b"shorter body"); @@ -1112,7 +1123,13 @@ mod fs_tests { storage.upload(body, len, &path, None, &cancel).await?; } - let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + let read = aggregate( + storage + .download(&path, &DownloadOpts::default(), &cancel) + .await? + .download_stream, + ) + .await?; assert_eq!(shorter, read); Ok(()) } @@ -1145,7 +1162,13 @@ mod fs_tests { storage.upload(body, len, &path, None, &cancel).await?; } - let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?; + let read = aggregate( + storage + .download(&path, &DownloadOpts::default(), &cancel) + .await? 
+ .download_stream, + ) + .await?; assert_eq!(body, read); Ok(()) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 11f6598cbf9d..ec7c0475651d 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -28,12 +28,13 @@ use aws_sdk_s3::{ Client, }; use aws_smithy_async::rt::sleep::TokioSleep; +use http_types::StatusCode; use aws_smithy_types::{body::SdkBody, DateTime}; use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError}; use bytes::Bytes; use futures::stream::Stream; -use hyper::Body; +use hyper0::Body; use scopeguard::ScopeGuard; use tokio_util::sync::CancellationToken; use utils::backoff; @@ -44,8 +45,8 @@ use crate::{ error::Cancelled, metrics::{start_counting_cancelled_wait, start_measuring_requests}, support::PermitCarrying, - ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, ListingObject, RemotePath, - RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, + ConcurrencyLimiter, Download, DownloadError, DownloadOpts, Listing, ListingMode, ListingObject, + RemotePath, RemoteStorage, TimeTravelError, TimeoutOrCancel, MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR, }; @@ -67,6 +68,7 @@ pub struct S3Bucket { struct GetObjectRequest { bucket: String, key: String, + etag: Option, range: Option, } impl S3Bucket { @@ -248,13 +250,18 @@ impl S3Bucket { let started_at = start_measuring_requests(kind); - let get_object = self + let mut builder = self .client .get_object() .bucket(request.bucket) .key(request.key) - .set_range(request.range) - .send(); + .set_range(request.range); + + if let Some(etag) = request.etag { + builder = builder.if_none_match(etag); + } + + let get_object = builder.send(); let get_object = tokio::select! { res = get_object => res, @@ -277,6 +284,20 @@ impl S3Bucket { ); return Err(DownloadError::NotFound); } + Err(SdkError::ServiceError(e)) + // aws_smithy_runtime_api::http::response::StatusCode isn't + // re-exported by any aws crates, so just check the numeric + // status against http_types::StatusCode instead of pulling it. + if e.raw().status().as_u16() == StatusCode::NotModified => + { + // Count an unmodified file as a success. 
+ crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( + kind, + AttemptOutcome::Ok, + started_at, + ); + return Err(DownloadError::Unmodified); + } Err(e) => { crate::metrics::BUCKET_METRICS.req_seconds.observe_elapsed( kind, @@ -773,6 +794,7 @@ impl RemoteStorage for S3Bucket { async fn download( &self, from: &RemotePath, + opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { // if prefix is not none then download file `prefix/from` @@ -781,6 +803,7 @@ impl RemoteStorage for S3Bucket { GetObjectRequest { bucket: self.bucket_name.clone(), key: self.relative_path_to_s3_object(from), + etag: opts.etag.as_ref().map(|e| e.to_string()), range: None, }, cancel, @@ -807,6 +830,7 @@ impl RemoteStorage for S3Bucket { GetObjectRequest { bucket: self.bucket_name.clone(), key: self.relative_path_to_s3_object(from), + etag: None, range, }, cancel, diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index c7eb634af3a7..05f82b5a5a8f 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -12,8 +12,8 @@ use std::{collections::hash_map::Entry, sync::Arc}; use tokio_util::sync::CancellationToken; use crate::{ - Download, DownloadError, GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorage, - StorageMetadata, TimeTravelError, + Download, DownloadError, DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath, + RemoteStorage, StorageMetadata, TimeTravelError, }; pub struct UnreliableWrapper { @@ -167,11 +167,12 @@ impl RemoteStorage for UnreliableWrapper { async fn download( &self, from: &RemotePath, + opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { self.attempt(RemoteOp::Download(from.clone())) .map_err(DownloadError::Other)?; - self.inner.download(from, cancel).await + self.inner.download(from, opts, cancel).await } async fn download_byte_range( diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index 86c55872c174..e38cfb3ef09f 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -1,8 +1,7 @@ use anyhow::Context; use camino::Utf8Path; use futures::StreamExt; -use remote_storage::ListingMode; -use remote_storage::RemotePath; +use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath}; use std::sync::Arc; use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; @@ -284,7 +283,10 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< ctx.client.upload(data, len, &path, None, &cancel).await?; // Normal download request - let dl = ctx.client.download(&path, &cancel).await?; + let dl = ctx + .client + .download(&path, &DownloadOpts::default(), &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); @@ -337,6 +339,54 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< Ok(()) } +/// Tests that conditional downloads work properly, by returning +/// DownloadError::Unmodified when the object ETag matches the given ETag. +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn download_conditional(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return Ok(()); + }; + let cancel = CancellationToken::new(); + + // Create a file. 
+ let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))?; + let data = bytes::Bytes::from_static("foo".as_bytes()); + let (stream, len) = wrap_stream(data); + ctx.client.upload(stream, len, &path, None, &cancel).await?; + + // Download it to obtain its etag. + let mut opts = DownloadOpts::default(); + let download = ctx.client.download(&path, &opts, &cancel).await?; + + // Download with the etag yields DownloadError::Unmodified. + opts.etag = Some(download.etag); + let result = ctx.client.download(&path, &opts, &cancel).await; + assert!( + matches!(result, Err(DownloadError::Unmodified)), + "expected DownloadError::Unmodified, got {result:?}" + ); + + // Replace the file contents. + let data = bytes::Bytes::from_static("bar".as_bytes()); + let (stream, len) = wrap_stream(data); + ctx.client.upload(stream, len, &path, None, &cancel).await?; + + // A download with the old etag should yield the new file. + let download = ctx.client.download(&path, &opts, &cancel).await?; + assert_ne!(download.etag, opts.etag.unwrap(), "ETag did not change"); + + // A download with the new etag should yield Unmodified again. + opts.etag = Some(download.etag); + let result = ctx.client.download(&path, &opts, &cancel).await; + assert!( + matches!(result, Err(DownloadError::Unmodified)), + "expected DownloadError::Unmodified, got {result:?}" + ); + + Ok(()) +} + #[test_context(MaybeEnabledStorage)] #[tokio::test] async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { @@ -364,7 +414,10 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { // Normal download request ctx.client.copy_object(&path, &path_dest, &cancel).await?; - let dl = ctx.client.download(&path_dest, &cancel).await?; + let dl = ctx + .client + .download(&path_dest, &DownloadOpts::default(), &cancel) + .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); @@ -376,3 +429,56 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { Ok(()) } + +/// Tests that head_object works properly. +#[test_context(MaybeEnabledStorage)] +#[tokio::test] +async fn head_object(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> { + let MaybeEnabledStorage::Enabled(ctx) = ctx else { + return Ok(()); + }; + let cancel = CancellationToken::new(); + + let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))?; + + // Errors on missing file. + let result = ctx.client.head_object(&path, &cancel).await; + assert!( + matches!(result, Err(DownloadError::NotFound)), + "expected NotFound, got {result:?}" + ); + + // Create the file. + let data = bytes::Bytes::from_static("foo".as_bytes()); + let (stream, len) = wrap_stream(data); + ctx.client.upload(stream, len, &path, None, &cancel).await?; + + // Fetch the head metadata. + let object = ctx.client.head_object(&path, &cancel).await?; + assert_eq!( + object, + ListingObject { + key: path.clone(), + last_modified: object.last_modified, // ignore + size: 3 + } + ); + + // Wait for a couple of seconds, and then update the file to check the last + // modified timestamp. + tokio::time::sleep(std::time::Duration::from_secs(2)).await; + + let data = bytes::Bytes::from_static("bar".as_bytes()); + let (stream, len) = wrap_stream(data); + ctx.client.upload(stream, len, &path, None, &cancel).await?; + let new = ctx.client.head_object(&path, &cancel).await?; + + assert!( + !new.last_modified + .duration_since(object.last_modified)? 
+ .is_zero(), + "last_modified did not advance" + ); + + Ok(()) +} diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index b893beeebdf7..3e99a65fac0b 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -12,8 +12,8 @@ use anyhow::Context; use camino::Utf8Path; use futures_util::StreamExt; use remote_storage::{ - DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig, - RemoteStorageKind, S3Config, + DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, + RemoteStorageConfig, RemoteStorageKind, S3Config, }; use test_context::test_context; use test_context::AsyncTestContext; @@ -121,7 +121,8 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: // A little check to ensure that our clock is not too far off from the S3 clock { - let dl = retry(|| ctx.client.download(&path2, &cancel)).await?; + let opts = DownloadOpts::default(); + let dl = retry(|| ctx.client.download(&path2, &opts, &cancel)).await?; let last_modified = dl.last_modified; let half_wt = WAIT_TIME.mul_f32(0.5); let t0_hwt = t0 + half_wt; @@ -159,7 +160,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let t2_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t2: {t2_files_recovered:?}"); assert_eq!(t2_files, t2_files_recovered); - let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; + let path2_recovered_t2 = download_to_vec( + ctx.client + .download(&path2, &DownloadOpts::default(), &cancel) + .await?, + ) + .await?; assert_eq!(path2_recovered_t2, new_data.as_bytes()); // after recovery to t1: path1 is back, path2 has the old content @@ -170,7 +176,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow: let t1_files_recovered = list_files(&ctx.client, &cancel).await?; println!("after recovery to t1: {t1_files_recovered:?}"); assert_eq!(t1_files, t1_files_recovered); - let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?; + let path2_recovered_t1 = download_to_vec( + ctx.client + .download(&path2, &DownloadOpts::default(), &cancel) + .await?, + ) + .await?; assert_eq!(path2_recovered_t1, old_data.as_bytes()); // after recovery to t0: everything is gone except for path1 @@ -416,7 +427,7 @@ async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) { let started_at = std::time::Instant::now(); let mut stream = ctx .client - .download(&path, &cancel) + .download(&path, &DownloadOpts::default(), &cancel) .await .expect("download succeeds") .download_stream; @@ -491,7 +502,7 @@ async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) { { let stream = ctx .client - .download(&path, &cancel) + .download(&path, &DownloadOpts::default(), &cancel) .await .expect("download succeeds") .download_stream; diff --git a/libs/tracing-utils/Cargo.toml b/libs/tracing-utils/Cargo.toml index 66f21cd1ef41..60637d5b241b 100644 --- a/libs/tracing-utils/Cargo.toml +++ b/libs/tracing-utils/Cargo.toml @@ -5,7 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] -hyper.workspace = true +hyper0.workspace = true opentelemetry = { workspace = true, features = ["trace"] } opentelemetry_sdk = { workspace = true, features = ["rt-tokio"] } opentelemetry-otlp = { workspace = true, default-features = false, features = ["http-proto", "trace", "http", "reqwest-client"] } diff --git 
a/libs/tracing-utils/src/http.rs b/libs/tracing-utils/src/http.rs index f5ab267ff307..e6fdf9be45ff 100644 --- a/libs/tracing-utils/src/http.rs +++ b/libs/tracing-utils/src/http.rs @@ -1,7 +1,7 @@ //! Tracing wrapper for Hyper HTTP server -use hyper::HeaderMap; -use hyper::{Body, Request, Response}; +use hyper0::HeaderMap; +use hyper0::{Body, Request, Response}; use std::future::Future; use tracing::Instrument; use tracing_opentelemetry::OpenTelemetrySpanExt; diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 7d284a6fc567..545317f95871 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -22,7 +22,7 @@ chrono.workspace = true git-version.workspace = true hex = { workspace = true, features = ["serde"] } humantime.workspace = true -hyper = { workspace = true, features = ["full"] } +hyper0 = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} jsonwebtoken.workspace = true diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index aacc1e1dd5e8..d9b82b20da51 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -2,6 +2,8 @@ //! between other crates in this repository. #![deny(clippy::undocumented_unsafe_blocks)] +extern crate hyper0 as hyper; + pub mod backoff; /// `Lsn` type implements common tasks on Log Sequence Numbers diff --git a/libs/vm_monitor/src/lib.rs b/libs/vm_monitor/src/lib.rs index 89ca91fdd742..1b13c8e0b23d 100644 --- a/libs/vm_monitor/src/lib.rs +++ b/libs/vm_monitor/src/lib.rs @@ -7,11 +7,13 @@ use axum::{ extract::{ws::WebSocket, State, WebSocketUpgrade}, response::Response, }; -use axum::{routing::get, Router, Server}; +use axum::{routing::get, Router}; use clap::Parser; use futures::Future; +use std::net::SocketAddr; use std::{fmt::Debug, time::Duration}; use sysinfo::{RefreshKind, System, SystemExt}; +use tokio::net::TcpListener; use tokio::{sync::broadcast, task::JoinHandle}; use tokio_util::sync::CancellationToken; use tracing::{error, info}; @@ -132,14 +134,14 @@ pub async fn start(args: &'static Args, token: CancellationToken) -> anyhow::Res args, }); - let addr = args.addr(); - let bound = Server::try_bind(&addr.parse().expect("parsing address should not fail")) - .with_context(|| format!("failed to bind to {addr}"))?; - - info!(addr, "server bound"); + let addr_str = args.addr(); + let addr: SocketAddr = addr_str.parse().expect("parsing address should not fail"); - bound - .serve(app.into_make_service()) + let listener = TcpListener::bind(&addr) + .await + .with_context(|| format!("failed to bind to {addr}"))?; + info!(addr_str, "server bound"); + axum::serve(listener, app.into_make_service()) .await .context("server exited")?; diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index f1fc3a86fe4b..2531abc7a187 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -30,7 +30,7 @@ futures.workspace = true hex.workspace = true humantime.workspace = true humantime-serde.workspace = true -hyper.workspace = true +hyper0.workspace = true itertools.workspace = true md5.workspace = true nix.workspace = true diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index e9e52acee67c..593ca6db2d5a 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -575,7 +575,7 @@ fn start_pageserver( .build() .map_err(|err| anyhow!(err))?; let service = utils::http::RouterService::new(router).unwrap(); - let server = hyper::Server::from_tcp(http_listener)? + let server = hyper0::Server::from_tcp(http_listener)? 
.serve(service) .with_graceful_shutdown({ let cancel = cancel.clone(); diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 1cc5502bd602..94375e62b691 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -1742,6 +1742,10 @@ async fn timeline_compact_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + + if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { + flags |= CompactFlags::ForceL0Compaction; + } if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } @@ -1788,6 +1792,9 @@ async fn timeline_checkpoint_handler( let state = get_state(&request); let mut flags = EnumSet::empty(); + if Some(true) == parse_query_param::<_, bool>(&request, "force_l0_compaction")? { + flags |= CompactFlags::ForceL0Compaction; + } if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? { flags |= CompactFlags::ForceRepartition; } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 08abfbd64740..d51931c768aa 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -13,6 +13,8 @@ pub mod http; pub mod import_datadir; pub mod l0_flush; +extern crate hyper0 as hyper; + use futures::{stream::FuturesUnordered, StreamExt}; pub use pageserver_api::keyspace; use tokio_util::sync::CancellationToken; diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index db88303f7bdb..29f682c62a76 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -97,6 +97,7 @@ use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; use crate::tenant::remote_timeline_client::INITDB_PATH; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; +use crate::walingest::WalLagCooldown; use crate::walredo; use crate::InitializationOrder; use std::collections::hash_map::Entry; @@ -319,6 +320,9 @@ pub struct Tenant { /// background warmup. pub(crate) activate_now_sem: tokio::sync::Semaphore, + /// Time it took for the tenant to activate. Zero if not active yet. + attach_wal_lag_cooldown: Arc>, + // Cancellation token fires when we have entered shutdown(). This is a parent of // Timelines' cancellation token. pub(crate) cancel: CancellationToken, @@ -1000,11 +1004,15 @@ impl Tenant { // Remote preload is complete. 
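The `attach_wal_lag_cooldown` field added above is an `Arc<OnceLock<WalLagCooldown>>` (the type parameters were lost in formatting here), and the hunk just below fills it exactly once with the measured attach start time and duration so that WAL ingest can consult it afterwards. A small sketch of the write-once pattern being relied on; the struct is a stand-in for pageserver's `WalLagCooldown`, whose real constructor is `WalLagCooldown::new(attach_start, attach_duration)`:

```rust
use std::sync::{Arc, OnceLock};
use std::time::{Duration, Instant};

// Stand-in for pageserver's WalLagCooldown: remembers when attach started and
// how long it took.
struct WalLagCooldown {
    attach_start: Instant,
    attach_duration: Duration,
}

fn main() {
    let cell: Arc<OnceLock<WalLagCooldown>> = Arc::new(OnceLock::new());

    // Writer (the attach task): set() succeeds only for the first caller and
    // is a no-op afterwards, which is why the result is discarded with `_ =`.
    let attach_start = Instant::now();
    // ... attach work happens here ...
    let _ = cell.set(WalLagCooldown {
        attach_start,
        attach_duration: attach_start.elapsed(),
    });

    // Reader (e.g. WAL ingest): get() returns None until attach has finished.
    if let Some(cooldown) = cell.get() {
        println!(
            "attach started {:?} ago and took {:?}",
            cooldown.attach_start.elapsed(),
            cooldown.attach_duration
        );
    }
}
```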
drop(remote_load_completion); + // We will time the duration of the attach phase unless this is a creation (attach will do no work) + let attach_start = std::time::Instant::now(); let attached = { let _attach_timer = Some(TENANT.attach.start_timer()); tenant_clone.attach(preload, &ctx).await }; + let attach_duration = attach_start.elapsed(); + _ = tenant_clone.attach_wal_lag_cooldown.set(WalLagCooldown::new(attach_start, attach_duration)); match attached { Ok(()) => { @@ -2754,6 +2762,7 @@ impl Tenant { pg_version, state, last_aux_file_policy, + self.attach_wal_lag_cooldown.clone(), self.cancel.child_token(), ); @@ -2860,6 +2869,7 @@ impl Tenant { Some(Duration::from_secs(3600 * 24)), )), activate_now_sem: tokio::sync::Semaphore::new(0), + attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), cancel: CancellationToken::default(), gate: Gate::default(), timeline_get_throttle: Arc::new(throttle::Throttle::new( diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 97506b7e9a3d..692e4d3096de 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -27,7 +27,7 @@ use crate::tenant::Generation; use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; -use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode, RemotePath}; +use remote_storage::{DownloadError, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath}; use utils::crashsafe::path_with_suffix_extension; use utils::id::{TenantId, TimelineId}; use utils::pausable_failpoint; @@ -153,7 +153,9 @@ async fn download_object<'a>( .with_context(|| format!("create a destination file for layer '{dst_path}'")) .map_err(DownloadError::Other)?; - let download = storage.download(src_path, cancel).await?; + let download = storage + .download(src_path, &DownloadOpts::default(), cancel) + .await?; pausable_failpoint!("before-downloading-layer-stream-pausable"); @@ -204,7 +206,9 @@ async fn download_object<'a>( .with_context(|| format!("create a destination file for layer '{dst_path}'")) .map_err(DownloadError::Other)?; - let mut download = storage.download(src_path, cancel).await?; + let mut download = storage + .download(src_path, &DownloadOpts::default(), cancel) + .await?; pausable_failpoint!("before-downloading-layer-stream-pausable"); @@ -344,7 +348,9 @@ async fn do_download_index_part( let index_part_bytes = download_retry_forever( || async { - let download = storage.download(&remote_path, cancel).await?; + let download = storage + .download(&remote_path, &DownloadOpts::default(), cancel) + .await?; let mut bytes = Vec::new(); @@ -526,10 +532,15 @@ pub(crate) async fn download_initdb_tar_zst( .with_context(|| format!("tempfile creation {temp_path}")) .map_err(DownloadError::Other)?; - let download = match storage.download(&remote_path, cancel).await { + let download = match storage + .download(&remote_path, &DownloadOpts::default(), cancel) + .await + { Ok(dl) => dl, Err(DownloadError::NotFound) => { - storage.download(&remote_preserved_path, cancel).await? + storage + .download(&remote_preserved_path, &DownloadOpts::default(), cancel) + .await? 
} Err(other) => Err(other)?, }; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 90e1c01dbd31..9f7447a9ac5c 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -49,7 +49,7 @@ use futures::Future; use metrics::UIntGauge; use pageserver_api::models::SecondaryProgress; use pageserver_api::shard::TenantShardId; -use remote_storage::{DownloadError, Etag, GenericRemoteStorage}; +use remote_storage::{DownloadError, DownloadOpts, Etag, GenericRemoteStorage}; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, warn, Instrument}; @@ -944,36 +944,34 @@ impl<'a> TenantDownloader<'a> { ) -> Result { debug_assert_current_span_has_tenant_id(); let tenant_shard_id = self.secondary_state.get_tenant_shard_id(); - // TODO: pull up etag check into the request, to do a conditional GET rather than - // issuing a GET and then maybe ignoring the response body - // (https://github.com/neondatabase/neon/issues/6199) tracing::debug!("Downloading heatmap for secondary tenant",); let heatmap_path = remote_heatmap_path(tenant_shard_id); let cancel = &self.secondary_state.cancel; + let opts = DownloadOpts { + etag: prev_etag.cloned(), + }; backoff::retry( || async { - let download = self + let download = match self .remote_storage - .download(&heatmap_path, cancel) + .download(&heatmap_path, &opts, cancel) .await - .map_err(UpdateError::from)?; - - SECONDARY_MODE.download_heatmap.inc(); + { + Ok(download) => download, + Err(DownloadError::Unmodified) => return Ok(HeatMapDownload::Unmodified), + Err(err) => return Err(err.into()), + }; - if Some(&download.etag) == prev_etag { - Ok(HeatMapDownload::Unmodified) - } else { - let mut heatmap_bytes = Vec::new(); - let mut body = tokio_util::io::StreamReader::new(download.download_stream); - let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; - Ok(HeatMapDownload::Modified(HeatMapModified { - etag: download.etag, - last_modified: download.last_modified, - bytes: heatmap_bytes, - })) - } + let mut heatmap_bytes = Vec::new(); + let mut body = tokio_util::io::StreamReader::new(download.download_stream); + let _size = tokio::io::copy_buf(&mut body, &mut heatmap_bytes).await?; + Ok(HeatMapDownload::Modified(HeatMapModified { + etag: download.etag, + last_modified: download.last_modified, + bytes: heatmap_bytes, + })) }, |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled), FAILED_DOWNLOAD_WARN_THRESHOLD, @@ -984,6 +982,7 @@ impl<'a> TenantDownloader<'a> { .await .ok_or_else(|| UpdateError::Cancelled) .and_then(|x| x) + .inspect(|_| SECONDARY_MODE.download_heatmap.inc()) } /// Download heatmap layers that are not present on local disk, or update their diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 6f9eda85f57b..2acad666b8d0 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -53,6 +53,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use futures::StreamExt; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::DBDIR_KEY; use pageserver_api::keyspace::KeySpace; use pageserver_api::models::ImageCompressionAlgorithm; use pageserver_api::shard::TenantShardId; @@ -963,14 +964,25 @@ impl DeltaLayerInner { .blobs_at .as_slice() .iter() - .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .filter_map(|(_, 
blob_meta)| { + if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + // The size of values for these keys is unbounded and can + // grow very large in pathological cases. + None + } else { + Some(format!("{}@{}", blob_meta.key, blob_meta.lsn)) + } + }) .join(", "); - tracing::warn!( - "Oversized vectored read ({} > {}) for keys {}", - largest_read_size, - read_size_soft_max, - offenders - ); + + if !offenders.is_empty() { + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + largest_read_size, + read_size_soft_max, + offenders + ); + } } largest_read_size diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 3dcd7bc96271..9b53fa9e18cd 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -49,6 +49,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use hex; use itertools::Itertools; use pageserver_api::config::MaxVectoredReadBytes; +use pageserver_api::key::DBDIR_KEY; use pageserver_api::keyspace::KeySpace; use pageserver_api::shard::{ShardIdentity, TenantShardId}; use rand::{distributions::Alphanumeric, Rng}; @@ -587,14 +588,25 @@ impl ImageLayerInner { .blobs_at .as_slice() .iter() - .map(|(_, blob_meta)| format!("{}@{}", blob_meta.key, blob_meta.lsn)) + .filter_map(|(_, blob_meta)| { + if blob_meta.key.is_rel_dir_key() || blob_meta.key == DBDIR_KEY { + // The size of values for these keys is unbounded and can + // grow very large in pathological cases. + None + } else { + Some(format!("{}@{}", blob_meta.key, blob_meta.lsn)) + } + }) .join(", "); - tracing::warn!( - "Oversized vectored read ({} > {}) for keys {}", - buf_size, - max_vectored_read_bytes, - offenders - ); + + if !offenders.is_empty() { + tracing::warn!( + "Oversized vectored read ({} > {}) for keys {}", + buf_size, + max_vectored_read_bytes, + offenders + ); + } } let buf = BytesMut::with_capacity(buf_size); diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index f0e2ca5c8332..bbb21b180ed5 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -442,11 +442,13 @@ impl Layer { // Visibility was modified to Visible: maybe log about this match ctx.task_kind() { TaskKind::CalculateSyntheticSize + | TaskKind::OndemandLogicalSizeCalculation | TaskKind::GarbageCollector | TaskKind::MgmtRequest => { // This situation is expected in code paths do binary searches of the LSN space to resolve // an LSN to a timestamp, which happens during GC, during GC cutoff calculations in synthetic size, - // and on-demand for certain HTTP API requests. + // and on-demand for certain HTTP API requests. On-demand logical size calculation is also included + // because it is run as a sub-task of synthetic size. } _ => { // In all other contexts, it is unusual to do I/O involving layers which are not visible at @@ -456,8 +458,8 @@ impl Layer { // This case is legal in brief time windows: for example an in-flight getpage request can hold on to a layer object // which was covered by a concurrent compaction. tracing::info!( - "Layer {} became visible as a result of access", - self.0.desc.key() + layer=%self, + "became visible as a result of access", ); } } @@ -686,7 +688,9 @@ impl Drop for LayerInner { // and we could be delaying shutdown for nothing. 
} - if let Some(timeline) = self.timeline.upgrade() { + let timeline = self.timeline.upgrade(); + + if let Some(timeline) = timeline.as_ref() { // Only need to decrement metrics if the timeline still exists: otherwise // it will have already de-registered these metrics via TimelineMetrics::shutdown if self.desc.is_delta() { @@ -717,7 +721,6 @@ let path = std::mem::take(&mut self.path); let file_name = self.layer_desc().layer_name(); let file_size = self.layer_desc().file_size; - let timeline = self.timeline.clone(); let meta = self.metadata(); let status = self.status.take(); @@ -727,7 +730,7 @@ // carry this until we are finished for [`Layer::wait_drop`] support let _status = status; - let Some(timeline) = timeline.upgrade() else { + let Some(timeline) = timeline else { // no need to nag that timeline is gone: under normal situation on // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped. LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2113a1d72600..1d79b2b74b03 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -48,7 +48,6 @@ use utils::{ sync::gate::{Gate, GateGuard}, }; -use std::pin::pin; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; use std::time::{Duration, Instant, SystemTime}; @@ -62,6 +61,7 @@ use std::{ collections::btree_map::Entry, ops::{Deref, Range}, }; +use std::{pin::pin, sync::OnceLock}; use crate::{ aux_file::AuxFileSizeEstimator, @@ -71,6 +71,7 @@ use crate::{ metadata::TimelineMetadata, storage_layer::{inmemory_layer::IndexEntry, PersistentLayerDesc}, }, + walingest::WalLagCooldown, walredo, }; use crate::{ @@ -429,6 +430,8 @@ pub struct Timeline { pub(crate) l0_flush_global_state: L0FlushGlobalState, pub(crate) handles: handle::PerTimelineState, + + pub(crate) attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>, } pub struct WalReceiverInfo { @@ -737,6 +740,7 @@ pub enum GetLogicalSizePriority { pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, + ForceL0Compaction, EnhancedGcBottomMostCompaction, DryRun, } @@ -2130,6 +2134,7 @@ impl Timeline { pg_version: u32, state: TimelineState, aux_file_policy: Option<AuxFilePolicy>, + attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>, cancel: CancellationToken, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); @@ -2271,6 +2276,8 @@ impl Timeline { l0_flush_global_state: resources.l0_flush_global_state, handles: Default::default(), + + attach_wal_lag_cooldown, }; if aux_file_policy == Some(AuxFilePolicy::V1) { diff --git a/pageserver/src/tenant/timeline/analysis.rs b/pageserver/src/tenant/timeline/analysis.rs index cd61418f3d31..6009b0b79a1d 100644 --- a/pageserver/src/tenant/timeline/analysis.rs +++ b/pageserver/src/tenant/timeline/analysis.rs @@ -11,6 +11,7 @@ pub(crate) struct RangeAnalysis { has_image: bool, num_of_deltas_above_image: usize, total_num_of_deltas: usize, + num_of_l0: usize, } impl Timeline { @@ -20,8 +21,10 @@ let mut delta_ranges = Vec::new(); let mut image_ranges = Vec::new(); + let num_of_l0; let all_layer_files = { let guard = self.layers.read().await; + num_of_l0 = guard.layer_map().unwrap().level0_deltas().len(); guard.all_persistent_layers() }; let lsn = self.get_last_record_lsn(); @@ -82,6 +85,7 @@ has_image: image_layer.is_some(), num_of_deltas_above_image: maybe_delta_layers.len(), total_num_of_deltas:
pitr_delta_layers.len(), + num_of_l0, }); } diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 3de386a2d58f..9f64471432e3 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -353,7 +353,13 @@ impl Timeline { // 2. Compact let timer = self.metrics.compact_time_histo.start_timer(); - let fully_compacted = self.compact_level0(target_file_size, ctx).await?; + let fully_compacted = self + .compact_level0( + target_file_size, + flags.contains(CompactFlags::ForceL0Compaction), + ctx, + ) + .await?; timer.stop_and_record(); let mut partitioning = dense_partitioning; @@ -658,6 +664,7 @@ impl Timeline { async fn compact_level0( self: &Arc, target_file_size: u64, + force_compaction_ignore_threshold: bool, ctx: &RequestContext, ) -> Result { let CompactLevel0Phase1Result { @@ -679,9 +686,15 @@ impl Timeline { let now = tokio::time::Instant::now(); stats.read_lock_acquisition_micros = DurationRecorder::Recorded(RecordedDuration(now - begin), now); - self.compact_level0_phase1(phase1_layers_locked, stats, target_file_size, &ctx) - .instrument(phase1_span) - .await? + self.compact_level0_phase1( + phase1_layers_locked, + stats, + target_file_size, + force_compaction_ignore_threshold, + &ctx, + ) + .instrument(phase1_span) + .await? }; if new_layers.is_empty() && deltas_to_compact.is_empty() { @@ -700,6 +713,7 @@ impl Timeline { guard: tokio::sync::RwLockReadGuard<'a, LayerManager>, mut stats: CompactLevel0Phase1StatsBuilder, target_file_size: u64, + force_compaction_ignore_threshold: bool, ctx: &RequestContext, ) -> Result { stats.read_lock_held_spawn_blocking_startup_micros = @@ -711,11 +725,26 @@ impl Timeline { // Only compact if enough layers have accumulated. let threshold = self.get_compaction_threshold(); if level0_deltas.is_empty() || level0_deltas.len() < threshold { - debug!( - level0_deltas = level0_deltas.len(), - threshold, "too few deltas to compact" - ); - return Ok(CompactLevel0Phase1Result::default()); + if force_compaction_ignore_threshold { + if !level0_deltas.is_empty() { + info!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact, but forcing compaction" + ); + } else { + info!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact, cannot force compaction" + ); + return Ok(CompactLevel0Phase1Result::default()); + } + } else { + debug!( + level0_deltas = level0_deltas.len(), + threshold, "too few deltas to compact" + ); + return Ok(CompactLevel0Phase1Result::default()); + } } let mut level0_deltas = level0_deltas diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 229c01a6817e..95d1f769205c 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -21,7 +21,10 @@ //! redo Postgres process, but some records it can handle directly with //! bespoken Rust code. +use std::sync::Arc; +use std::sync::OnceLock; use std::time::Duration; +use std::time::Instant; use std::time::SystemTime; use pageserver_api::shard::ShardIdentity; @@ -69,7 +72,29 @@ impl CheckPoint { } } +/// Temporary limitation of WAL lag warnings after attach +/// +/// After tenant attach, we want to limit WAL lag warnings because +/// we don't look at the WAL until the attach is complete, which +/// might take a while. +pub struct WalLagCooldown { + /// Until when should this limitation apply at all + active_until: std::time::Instant, + /// The maximum lag to suppress. 
Lags above this limit get reported anyway. + max_lag: Duration, +} + +impl WalLagCooldown { + pub fn new(attach_start: Instant, attach_duration: Duration) -> Self { + Self { + active_until: attach_start + attach_duration * 3 + Duration::from_secs(120), + max_lag: attach_duration * 2 + Duration::from_secs(60), + } + } +} + pub struct WalIngest { + attach_wal_lag_cooldown: Arc<OnceLock<WalLagCooldown>>, shard: ShardIdentity, checkpoint: CheckPoint, checkpoint_modified: bool, @@ -103,6 +128,7 @@ impl WalIngest { shard: *timeline.get_shard_identity(), checkpoint, checkpoint_modified: false, + attach_wal_lag_cooldown: timeline.attach_wal_lag_cooldown.clone(), warn_ingest_lag: WarnIngestLag { lag_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), future_lsn_msg_ratelimit: RateLimit::new(std::time::Duration::from_secs(10)), @@ -1429,6 +1455,13 @@ impl WalIngest { Ok(lag) => { if lag > conf.wait_lsn_timeout { rate_limits.lag_msg_ratelimit.call2(|rate_limit_stats| { + if let Some(cooldown) = self.attach_wal_lag_cooldown.get() { + if std::time::Instant::now() < cooldown.active_until && lag <= cooldown.max_lag { + return; + } + } else { + // Still loading? We shouldn't be here + } let lag = humantime::format_duration(lag); warn!(%rate_limit_stats, %lag, "ingesting record with timestamp lagging more than wait_lsn_timeout"); }) diff --git a/pgxn/neon/file_cache.c b/pgxn/neon/file_cache.c index 2b461c86419a..892a27225221 100644 --- a/pgxn/neon/file_cache.c +++ b/pgxn/neon/file_cache.c @@ -42,6 +42,7 @@ #include "hll.h" #include "bitmap.h" +#include "neon.h" #define CriticalAssert(cond) do if (!(cond)) elog(PANIC, "Assertion %s failed at %s:%d: ", #cond, __FILE__, __LINE__); while (0) @@ -173,7 +174,9 @@ lfc_disable(char const *op) * If the reason of error is ENOSPC, then truncation of file may * help to reclaim some space */ + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_TRUNCATE); int rc = ftruncate(lfc_desc, 0); + pgstat_report_wait_end(); if (rc < 0) elog(WARNING, "Failed to truncate local file cache %s: %m", lfc_path); @@ -769,8 +772,10 @@ lfc_readv_select(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, if (iteration_hits != 0) { + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_READ); rc = preadv(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + pgstat_report_wait_end(); if (rc != (BLCKSZ * blocks_in_chunk)) { @@ -944,8 +949,11 @@ lfc_writev(NRelFileInfo rinfo, ForkNumber forkNum, BlockNumber blkno, lfc_ctl->writes += blocks_in_chunk; LWLockRelease(lfc_lock); + pgstat_report_wait_start(WAIT_EVENT_NEON_LFC_WRITE); rc = pwritev(lfc_desc, iov, blocks_in_chunk, ((off_t) entry_offset * BLOCKS_PER_CHUNK + chunk_offs) * BLCKSZ); + pgstat_report_wait_end(); + if (rc != BLCKSZ * blocks_in_chunk) { lfc_disable("write"); diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c index 6c6489277df6..0ca8a70d6ddb 100644 --- a/pgxn/neon/libpagestore.c +++ b/pgxn/neon/libpagestore.c @@ -490,7 +490,7 @@ pageserver_connect(shardno_t shard_no, int elevel) WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_READABLE, PQsocket(shard->conn), 0, - PG_WAIT_EXTENSION); + WAIT_EVENT_NEON_PS_STARTING); elog(DEBUG5, "PGRES_POLLING_READING=>%d", rc); if (rc & WL_LATCH_SET) { @@ -512,7 +512,7 @@ pageserver_connect(shardno_t shard_no, int elevel) WL_EXIT_ON_PM_DEATH | WL_LATCH_SET | WL_SOCKET_WRITEABLE, PQsocket(shard->conn), 0, - PG_WAIT_EXTENSION); + WAIT_EVENT_NEON_PS_STARTING); elog(DEBUG5, "PGRES_POLLING_WRITING=>%d", rc); if (rc & WL_LATCH_SET) { @@ -608,7 +608,8 @@
pageserver_connect(shardno_t shard_no, int elevel) WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, + WAIT_EVENT_NEON_PS_CONFIGURING); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); @@ -656,7 +657,8 @@ static int call_PQgetCopyData(shardno_t shard_no, char **buffer) { int ret; - PGconn *pageserver_conn = page_servers[shard_no].conn; + PageServer *shard = &page_servers[shard_no]; + PGconn *pageserver_conn = shard->conn; retry: ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ ); @@ -666,7 +668,8 @@ call_PQgetCopyData(shardno_t shard_no, char **buffer) WaitEvent event; /* Sleep until there's something to do */ - (void) WaitEventSetWait(page_servers[shard_no].wes_read, -1L, &event, 1, PG_WAIT_EXTENSION); + (void) WaitEventSetWait(shard->wes_read, -1L, &event, 1, + WAIT_EVENT_NEON_PS_READ); ResetLatch(MyLatch); CHECK_FOR_INTERRUPTS(); diff --git a/pgxn/neon/neon.c b/pgxn/neon/neon.c index fe8e276d1c57..c3ed96710a6d 100644 --- a/pgxn/neon/neon.c +++ b/pgxn/neon/neon.c @@ -41,6 +41,9 @@ #include "pagestore_client.h" #include "control_plane_connector.h" #include "walsender_hooks.h" +#if PG_MAJORVERSION_NUM >= 16 +#include "storage/ipc.h" +#endif PG_MODULE_MAGIC; void _PG_init(void); @@ -49,6 +52,23 @@ static int logical_replication_max_snap_files = 300; static int running_xacts_overflow_policy; +#if PG_MAJORVERSION_NUM >= 16 +static shmem_startup_hook_type prev_shmem_startup_hook; + +static void neon_shmem_startup_hook(void); +#endif +#if PG_MAJORVERSION_NUM >= 17 +uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; +uint32 WAIT_EVENT_NEON_LFC_READ; +uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; +uint32 WAIT_EVENT_NEON_LFC_WRITE; +uint32 WAIT_EVENT_NEON_PS_STARTING; +uint32 WAIT_EVENT_NEON_PS_CONFIGURING; +uint32 WAIT_EVENT_NEON_PS_SEND; +uint32 WAIT_EVENT_NEON_PS_READ; +uint32 WAIT_EVENT_NEON_WAL_DL; +#endif + enum RunningXactsOverflowPolicies { OP_IGNORE, OP_SKIP, @@ -635,6 +655,9 @@ _PG_init(void) */ #if PG_VERSION_NUM >= 160000 load_file("$libdir/neon_rmgr", false); + + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = neon_shmem_startup_hook; #endif pg_init_libpagestore(); @@ -721,3 +744,25 @@ backpressure_throttling_time(PG_FUNCTION_ARGS) { PG_RETURN_UINT64(BackpressureThrottlingTime()); } + +#if PG_MAJORVERSION_NUM >= 16 +static void +neon_shmem_startup_hook(void) +{ + /* Initialize */ + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + +#if PG_MAJORVERSION_NUM >= 17 + WAIT_EVENT_NEON_LFC_MAINTENANCE = WaitEventExtensionNew("Neon/FileCache_Maintenance"); + WAIT_EVENT_NEON_LFC_READ = WaitEventExtensionNew("Neon/FileCache_Read"); + WAIT_EVENT_NEON_LFC_TRUNCATE = WaitEventExtensionNew("Neon/FileCache_Truncate"); + WAIT_EVENT_NEON_LFC_WRITE = WaitEventExtensionNew("Neon/FileCache_Write"); + WAIT_EVENT_NEON_PS_STARTING = WaitEventExtensionNew("Neon/PS_Starting"); + WAIT_EVENT_NEON_PS_CONFIGURING = WaitEventExtensionNew("Neon/PS_Configuring"); + WAIT_EVENT_NEON_PS_SEND = WaitEventExtensionNew("Neon/PS_SendIO"); + WAIT_EVENT_NEON_PS_READ = WaitEventExtensionNew("Neon/PS_ReadIO"); + WAIT_EVENT_NEON_WAL_DL = WaitEventExtensionNew("Neon/WAL_Download"); +#endif +} +#endif diff --git a/pgxn/neon/neon.h b/pgxn/neon/neon.h index 5c653fc6c6cc..79aa88b8d36b 100644 --- a/pgxn/neon/neon.h +++ b/pgxn/neon/neon.h @@ -12,6 +12,7 @@ #ifndef NEON_H #define NEON_H #include "access/xlogreader.h" +#include "utils/wait_event.h" /* GUCs */ extern char
*neon_auth_token; @@ -22,6 +23,28 @@ extern char *wal_acceptors_list; extern int wal_acceptor_reconnect_timeout; extern int wal_acceptor_connection_timeout; +#if PG_MAJORVERSION_NUM >= 17 +extern uint32 WAIT_EVENT_NEON_LFC_MAINTENANCE; +extern uint32 WAIT_EVENT_NEON_LFC_READ; +extern uint32 WAIT_EVENT_NEON_LFC_TRUNCATE; +extern uint32 WAIT_EVENT_NEON_LFC_WRITE; +extern uint32 WAIT_EVENT_NEON_PS_STARTING; +extern uint32 WAIT_EVENT_NEON_PS_CONFIGURING; +extern uint32 WAIT_EVENT_NEON_PS_SEND; +extern uint32 WAIT_EVENT_NEON_PS_READ; +extern uint32 WAIT_EVENT_NEON_WAL_DL; +#else +#define WAIT_EVENT_NEON_LFC_MAINTENANCE PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_LFC_READ WAIT_EVENT_BUFFILE_READ +#define WAIT_EVENT_NEON_LFC_TRUNCATE WAIT_EVENT_BUFFILE_TRUNCATE +#define WAIT_EVENT_NEON_LFC_WRITE WAIT_EVENT_BUFFILE_WRITE +#define WAIT_EVENT_NEON_PS_STARTING PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_PS_CONFIGURING PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_PS_SEND PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_PS_READ PG_WAIT_EXTENSION +#define WAIT_EVENT_NEON_WAL_DL WAIT_EVENT_WAL_READ +#endif + extern void pg_init_libpagestore(void); extern void pg_init_walproposer(void); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index c1914421ecf1..78402a29d5dd 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -213,7 +213,7 @@ WalProposerPoll(WalProposer *wp) rc = wp->api.wait_event_set(wp, timeout, &sk, &events); /* Exit loop if latch is set (we got new WAL) */ - if ((rc == 1 && events & WL_LATCH_SET)) + if (rc == 1 && (events & WL_LATCH_SET)) break; /* diff --git a/pgxn/neon/walproposer_pg.c b/pgxn/neon/walproposer_pg.c index 89d4cb061f7a..01f88a5ab320 100644 --- a/pgxn/neon/walproposer_pg.c +++ b/pgxn/neon/walproposer_pg.c @@ -1814,7 +1814,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 * If wait is terminated by latch set (walsenders' latch is set on each * wal flush). (no need for pm death check due to WL_EXIT_ON_PM_DEATH) */ - if ((rc == 1 && event.events & WL_LATCH_SET) || late_cv_trigger) + if ((rc == 1 && (event.events & WL_LATCH_SET)) || late_cv_trigger) { /* Reset our latch */ ResetLatch(MyLatch); @@ -1826,7 +1826,7 @@ walprop_pg_wait_event_set(WalProposer *wp, long timeout, Safekeeper **sk, uint32 * If the event contains something about the socket, it means we got an * event from a safekeeper socket. 
*/ - if (rc == 1 && (event.events & (WL_SOCKET_MASK))) + if (rc == 1 && (event.events & WL_SOCKET_MASK)) { *sk = (Safekeeper *) event.user_data; *events = event.events; diff --git a/pgxn/neon/walsender_hooks.c b/pgxn/neon/walsender_hooks.c index bd3856e9d9bb..a0fe3822cc6d 100644 --- a/pgxn/neon/walsender_hooks.c +++ b/pgxn/neon/walsender_hooks.c @@ -160,7 +160,7 @@ NeonWALPageRead( WL_LATCH_SET | WL_EXIT_ON_PM_DEATH | reader_events, sock, timeout_ms, - WAIT_EVENT_WAL_SENDER_MAIN); + WAIT_EVENT_NEON_WAL_DL); } } } @@ -191,13 +191,14 @@ NeonOnDemandXLogReaderRoutines(XLogReaderRoutine *xlr) if (!wal_reader) { - XLogRecPtr epochStartLsn = pg_atomic_read_u64(&GetWalpropShmemState()->propEpochStartLsn); + XLogRecPtr basebackupLsn = GetRedoStartLsn(); - if (epochStartLsn == 0) + /* should never happen */ + if (basebackupLsn == 0) { - elog(ERROR, "Unable to start walsender when propEpochStartLsn is 0!"); + elog(ERROR, "unable to start walsender when basebackupLsn is 0"); } - wal_reader = NeonWALReaderAllocate(wal_segment_size, epochStartLsn, "[walsender] "); + wal_reader = NeonWALReaderAllocate(wal_segment_size, basebackupLsn, "[walsender] "); } xlr->page_read = NeonWALPageRead; xlr->segment_open = NeonWALReadSegmentOpen; diff --git a/poetry.lock b/poetry.lock index 48943a73e9c0..07f30d10e75f 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2064,73 +2064,80 @@ test = ["enum34", "ipaddress", "mock", "pywin32", "wmi"] [[package]] name = "psycopg2-binary" -version = "2.9.6" +version = "2.9.9" description = "psycopg2 - Python-PostgreSQL Database Adapter" optional = false -python-versions = ">=3.6" +python-versions = ">=3.7" files = [ - {file = "psycopg2-binary-2.9.6.tar.gz", hash = "sha256:1f64dcfb8f6e0c014c7f55e51c9759f024f70ea572fbdef123f85318c297947c"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:d26e0342183c762de3276cca7a530d574d4e25121ca7d6e4a98e4f05cb8e4df7"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c48d8f2db17f27d41fb0e2ecd703ea41984ee19362cbce52c097963b3a1b4365"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ffe9dc0a884a8848075e576c1de0290d85a533a9f6e9c4e564f19adf8f6e54a7"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a76e027f87753f9bd1ab5f7c9cb8c7628d1077ef927f5e2446477153a602f2c"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6460c7a99fc939b849431f1e73e013d54aa54293f30f1109019c56a0b2b2ec2f"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ae102a98c547ee2288637af07393dd33f440c25e5cd79556b04e3fca13325e5f"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:9972aad21f965599ed0106f65334230ce826e5ae69fda7cbd688d24fa922415e"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:7a40c00dbe17c0af5bdd55aafd6ff6679f94a9be9513a4c7e071baf3d7d22a70"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:cacbdc5839bdff804dfebc058fe25684cae322987f7a38b0168bc1b2df703fb1"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:7f0438fa20fb6c7e202863e0d5ab02c246d35efb1d164e052f2f3bfe2b152bd0"}, - {file = "psycopg2_binary-2.9.6-cp310-cp310-win32.whl", hash = "sha256:b6c8288bb8a84b47e07013bb4850f50538aa913d487579e1921724631d02ea1b"}, - {file 
= "psycopg2_binary-2.9.6-cp310-cp310-win_amd64.whl", hash = "sha256:61b047a0537bbc3afae10f134dc6393823882eb263088c271331602b672e52e9"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:964b4dfb7c1c1965ac4c1978b0f755cc4bd698e8aa2b7667c575fb5f04ebe06b"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:afe64e9b8ea66866a771996f6ff14447e8082ea26e675a295ad3bdbffdd72afb"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e2ee79e7cf29582ef770de7dab3d286431b01c3bb598f8e05e09601b890081"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dfa74c903a3c1f0d9b1c7e7b53ed2d929a4910e272add6700c38f365a6002820"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b83456c2d4979e08ff56180a76429263ea254c3f6552cd14ada95cff1dec9bb8"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0645376d399bfd64da57148694d78e1f431b1e1ee1054872a5713125681cf1be"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e99e34c82309dd78959ba3c1590975b5d3c862d6f279f843d47d26ff89d7d7e1"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:4ea29fc3ad9d91162c52b578f211ff1c931d8a38e1f58e684c45aa470adf19e2"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:4ac30da8b4f57187dbf449294d23b808f8f53cad6b1fc3623fa8a6c11d176dd0"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:e78e6e2a00c223e164c417628572a90093c031ed724492c763721c2e0bc2a8df"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-win32.whl", hash = "sha256:1876843d8e31c89c399e31b97d4b9725a3575bb9c2af92038464231ec40f9edb"}, - {file = "psycopg2_binary-2.9.6-cp311-cp311-win_amd64.whl", hash = "sha256:b4b24f75d16a89cc6b4cdff0eb6a910a966ecd476d1e73f7ce5985ff1328e9a6"}, - {file = "psycopg2_binary-2.9.6-cp36-cp36m-win32.whl", hash = "sha256:498807b927ca2510baea1b05cc91d7da4718a0f53cb766c154c417a39f1820a0"}, - {file = "psycopg2_binary-2.9.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0d236c2825fa656a2d98bbb0e52370a2e852e5a0ec45fc4f402977313329174d"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:34b9ccdf210cbbb1303c7c4db2905fa0319391bd5904d32689e6dd5c963d2ea8"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:84d2222e61f313c4848ff05353653bf5f5cf6ce34df540e4274516880d9c3763"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:30637a20623e2a2eacc420059be11527f4458ef54352d870b8181a4c3020ae6b"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8122cfc7cae0da9a3077216528b8bb3629c43b25053284cc868744bfe71eb141"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:38601cbbfe600362c43714482f43b7c110b20cb0f8172422c616b09b85a750c5"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:c7e62ab8b332147a7593a385d4f368874d5fe4ad4e341770d4983442d89603e3"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:2ab652e729ff4ad76d400df2624d223d6e265ef81bb8aa17fbd63607878ecbee"}, - {file = 
"psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:c83a74b68270028dc8ee74d38ecfaf9c90eed23c8959fca95bd703d25b82c88e"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:d4e6036decf4b72d6425d5b29bbd3e8f0ff1059cda7ac7b96d6ac5ed34ffbacd"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-win32.whl", hash = "sha256:a8c28fd40a4226b4a84bdf2d2b5b37d2c7bd49486b5adcc200e8c7ec991dfa7e"}, - {file = "psycopg2_binary-2.9.6-cp37-cp37m-win_amd64.whl", hash = "sha256:51537e3d299be0db9137b321dfb6a5022caaab275775680e0c3d281feefaca6b"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:cf4499e0a83b7b7edcb8dabecbd8501d0d3a5ef66457200f77bde3d210d5debb"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:7e13a5a2c01151f1208d5207e42f33ba86d561b7a89fca67c700b9486a06d0e2"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:0e0f754d27fddcfd74006455b6e04e6705d6c31a612ec69ddc040a5468e44b4e"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d57c3fd55d9058645d26ae37d76e61156a27722097229d32a9e73ed54819982a"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71f14375d6f73b62800530b581aed3ada394039877818b2d5f7fc77e3bb6894d"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:441cc2f8869a4f0f4bb408475e5ae0ee1f3b55b33f350406150277f7f35384fc"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:65bee1e49fa6f9cf327ce0e01c4c10f39165ee76d35c846ade7cb0ec6683e303"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:af335bac6b666cc6aea16f11d486c3b794029d9df029967f9938a4bed59b6a19"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:cfec476887aa231b8548ece2e06d28edc87c1397ebd83922299af2e051cf2827"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:65c07febd1936d63bfde78948b76cd4c2a411572a44ac50719ead41947d0f26b"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-win32.whl", hash = "sha256:4dfb4be774c4436a4526d0c554af0cc2e02082c38303852a36f6456ece7b3503"}, - {file = "psycopg2_binary-2.9.6-cp38-cp38-win_amd64.whl", hash = "sha256:02c6e3cf3439e213e4ee930308dc122d6fb4d4bea9aef4a12535fbd605d1a2fe"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:e9182eb20f41417ea1dd8e8f7888c4d7c6e805f8a7c98c1081778a3da2bee3e4"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:8a6979cf527e2603d349a91060f428bcb135aea2be3201dff794813256c274f1"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8338a271cb71d8da40b023a35d9c1e919eba6cbd8fa20a54b748a332c355d896"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e3ed340d2b858d6e6fb5083f87c09996506af483227735de6964a6100b4e6a54"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f81e65376e52f03422e1fb475c9514185669943798ed019ac50410fb4c4df232"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:bfb13af3c5dd3a9588000910178de17010ebcccd37b4f9794b00595e3a8ddad3"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = 
"sha256:4c727b597c6444a16e9119386b59388f8a424223302d0c06c676ec8b4bc1f963"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:4d67fbdaf177da06374473ef6f7ed8cc0a9dc640b01abfe9e8a2ccb1b1402c1f"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:0892ef645c2fabb0c75ec32d79f4252542d0caec1d5d949630e7d242ca4681a3"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:02c0f3757a4300cf379eb49f543fb7ac527fb00144d39246ee40e1df684ab514"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-win32.whl", hash = "sha256:c3dba7dab16709a33a847e5cd756767271697041fbe3fe97c215b1fc1f5c9848"}, - {file = "psycopg2_binary-2.9.6-cp39-cp39-win_amd64.whl", hash = "sha256:f6a88f384335bb27812293fdb11ac6aee2ca3f51d3c7820fe03de0a304ab6249"}, + {file = "psycopg2-binary-2.9.9.tar.gz", hash = "sha256:7f01846810177d829c7692f1f5ada8096762d9172af1b1a28d4ab5b77c923c1c"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c2470da5418b76232f02a2fcd2229537bb2d5a7096674ce61859c3229f2eb202"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:c6af2a6d4b7ee9615cbb162b0738f6e1fd1f5c3eda7e5da17861eacf4c717ea7"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:75723c3c0fbbf34350b46a3199eb50638ab22a0228f93fb472ef4d9becc2382b"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:83791a65b51ad6ee6cf0845634859d69a038ea9b03d7b26e703f94c7e93dbcf9"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:0ef4854e82c09e84cc63084a9e4ccd6d9b154f1dbdd283efb92ecd0b5e2b8c84"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ed1184ab8f113e8d660ce49a56390ca181f2981066acc27cf637d5c1e10ce46e"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:d2997c458c690ec2bc6b0b7ecbafd02b029b7b4283078d3b32a852a7ce3ddd98"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:b58b4710c7f4161b5e9dcbe73bb7c62d65670a87df7bcce9e1faaad43e715245"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:0c009475ee389757e6e34611d75f6e4f05f0cf5ebb76c6037508318e1a1e0d7e"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8dbf6d1bc73f1d04ec1734bae3b4fb0ee3cb2a493d35ede9badbeb901fb40f6f"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-win32.whl", hash = "sha256:3f78fd71c4f43a13d342be74ebbc0666fe1f555b8837eb113cb7416856c79682"}, + {file = "psycopg2_binary-2.9.9-cp310-cp310-win_amd64.whl", hash = "sha256:876801744b0dee379e4e3c38b76fc89f88834bb15bf92ee07d94acd06ec890a0"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ee825e70b1a209475622f7f7b776785bd68f34af6e7a46e2e42f27b659b5bc26"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:1ea665f8ce695bcc37a90ee52de7a7980be5161375d42a0b6c6abedbf0d81f0f"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:143072318f793f53819048fdfe30c321890af0c3ec7cb1dfc9cc87aa88241de2"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c332c8d69fb64979ebf76613c66b985414927a40f8defa16cf1bc028b7b0a7b0"}, + {file = 
"psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:f7fc5a5acafb7d6ccca13bfa8c90f8c51f13d8fb87d95656d3950f0158d3ce53"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:977646e05232579d2e7b9c59e21dbe5261f403a88417f6a6512e70d3f8a046be"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:b6356793b84728d9d50ead16ab43c187673831e9d4019013f1402c41b1db9b27"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:bc7bb56d04601d443f24094e9e31ae6deec9ccb23581f75343feebaf30423359"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:77853062a2c45be16fd6b8d6de2a99278ee1d985a7bd8b103e97e41c034006d2"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:78151aa3ec21dccd5cdef6c74c3e73386dcdfaf19bced944169697d7ac7482fc"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, + {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6e6f98446430fdf41bd36d4faa6cb409f5140c1c2cf58ce0bbdaf16af7d3f119"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:c77e3d1862452565875eb31bdb45ac62502feabbd53429fdc39a1cc341d681ba"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8359bf4791968c5a78c56103702000105501adb557f3cf772b2c207284273984"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:275ff571376626195ab95a746e6a04c7df8ea34638b99fc11160de91f2fef503"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:f9b5571d33660d5009a8b3c25dc1db560206e2d2f89d3df1cb32d72c0d117d52"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:420f9bbf47a02616e8554e825208cb947969451978dceb77f95ad09c37791dae"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:4154ad09dac630a0f13f37b583eae260c6aa885d67dfbccb5b02c33f31a6d420"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:a148c5d507bb9b4f2030a2025c545fccb0e1ef317393eaba42e7eabd28eb6041"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win32.whl", hash = "sha256:68fc1f1ba168724771e38bee37d940d2865cb0f562380a1fb1ffb428b75cb692"}, + {file = "psycopg2_binary-2.9.9-cp37-cp37m-win_amd64.whl", hash = "sha256:281309265596e388ef483250db3640e5f414168c5a67e9c665cafce9492eda2f"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:60989127da422b74a04345096c10d416c2b41bd7bf2a380eb541059e4e999980"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:246b123cc54bb5361588acc54218c8c9fb73068bf227a4a531d8ed56fa3ca7d6"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:34eccd14566f8fe14b2b95bb13b11572f7c7d5c36da61caf414d23b91fcc5d94"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:18d0ef97766055fec15b5de2c06dd8e7654705ce3e5e5eed3b6651a1d2a9a152"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d3f82c171b4ccd83bbaf35aa05e44e690113bd4f3b7b6cc54d2219b132f3ae55"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:ead20f7913a9c1e894aebe47cccf9dc834e1618b7aa96155d2091a626e59c972"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:ca49a8119c6cbd77375ae303b0cfd8c11f011abbbd64601167ecca18a87e7cdd"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:323ba25b92454adb36fa425dc5cf6f8f19f78948cbad2e7bc6cdf7b0d7982e59"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:1236ed0952fbd919c100bc839eaa4a39ebc397ed1c08a97fc45fee2a595aa1b3"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:729177eaf0aefca0994ce4cffe96ad3c75e377c7b6f4efa59ebf003b6d398716"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win32.whl", hash = "sha256:804d99b24ad523a1fe18cc707bf741670332f7c7412e9d49cb5eab67e886b9b5"}, + {file = "psycopg2_binary-2.9.9-cp38-cp38-win_amd64.whl", hash = "sha256:a6cdcc3ede532f4a4b96000b6362099591ab4a3e913d70bcbac2b56c872446f7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:72dffbd8b4194858d0941062a9766f8297e8868e1dd07a7b36212aaa90f49472"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:30dcc86377618a4c8f3b72418df92e77be4254d8f89f14b8e8f57d6d43603c0f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:31a34c508c003a4347d389a9e6fcc2307cc2150eb516462a7a17512130de109e"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:15208be1c50b99203fe88d15695f22a5bed95ab3f84354c494bcb1d08557df67"}, + {file = 
"psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:1873aade94b74715be2246321c8650cabf5a0d098a95bab81145ffffa4c13876"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3a58c98a7e9c021f357348867f537017057c2ed7f77337fd914d0bedb35dace7"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:4686818798f9194d03c9129a4d9a702d9e113a89cb03bffe08c6cf799e053291"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:ebdc36bea43063116f0486869652cb2ed7032dbc59fbcb4445c4862b5c1ecf7f"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:ca08decd2697fdea0aea364b370b1249d47336aec935f87b8bbfd7da5b2ee9c1"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:ac05fb791acf5e1a3e39402641827780fe44d27e72567a000412c648a85ba860"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win32.whl", hash = "sha256:9dba73be7305b399924709b91682299794887cbbd88e38226ed9f6712eabee90"}, + {file = "psycopg2_binary-2.9.9-cp39-cp39-win_amd64.whl", hash = "sha256:f7ae5d65ccfbebdfa761585228eb4d0df3a8b15cfb53bd953e713e09fbb12957"}, ] [[package]] @@ -2577,7 +2584,6 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, - {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2702,13 +2708,13 @@ files = [ [[package]] name = "requests" -version = "2.32.0" +version = "2.32.3" description = "Python HTTP for Humans." 
optional = false python-versions = ">=3.8" files = [ - {file = "requests-2.32.0-py3-none-any.whl", hash = "sha256:f2c3881dddb70d056c5bd7600a4fae312b2a300e39be6a118d30b90bd27262b5"}, - {file = "requests-2.32.0.tar.gz", hash = "sha256:fa5490319474c82ef1d2c9bc459d3652e3ae4ef4c4ebdd18a21145a47ca4b6b8"}, + {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"}, + {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"}, ] [package.dependencies] @@ -3131,16 +3137,6 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, - {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, - {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, - {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, - {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, - {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, @@ -3378,4 +3374,4 @@ cffi = ["cffi (>=1.11)"] [metadata] lock-version = "2.0" python-versions = "^3.9" -content-hash = "c09bcb333ab550958b33dbf4fec968c500d8e701fd4c96402cddbd9bb8048055" +content-hash = "9055b73352f1534f664cd8af6ebf8d93cf3bf857f115756f312ff2e3ae1bbbc1" diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index bfeb8455835c..ae9b2531aa72 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -38,7 +38,7 @@ hostname.workspace = true http.workspace = true humantime.workspace = true humantime-serde.workspace = true -hyper.workspace = true 
+hyper0.workspace = true hyper1 = { package = "hyper", version = "1.2", features = ["server"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs index 7c408f817ceb..13639af3aa6e 100644 --- a/proxy/src/auth.rs +++ b/proxy/src/auth.rs @@ -73,6 +73,9 @@ pub(crate) enum AuthErrorImpl { #[error("Authentication timed out")] UserTimeout(Elapsed), + + #[error("Disconnected due to inactivity after {0}.")] + ConfirmationTimeout(humantime::Duration), } #[derive(Debug, Error)] @@ -103,6 +106,10 @@ impl AuthError { pub(crate) fn user_timeout(elapsed: Elapsed) -> Self { AuthErrorImpl::UserTimeout(elapsed).into() } + + pub(crate) fn confirmation_timeout(timeout: humantime::Duration) -> Self { + AuthErrorImpl::ConfirmationTimeout(timeout).into() + } } impl> From for AuthError { @@ -125,6 +132,7 @@ impl UserFacingError for AuthError { AuthErrorImpl::IpAddressNotAllowed(_) => self.to_string(), AuthErrorImpl::TooManyConnections => self.to_string(), AuthErrorImpl::UserTimeout(_) => self.to_string(), + AuthErrorImpl::ConfirmationTimeout(_) => self.to_string(), } } } @@ -143,6 +151,7 @@ impl ReportableError for AuthError { AuthErrorImpl::IpAddressNotAllowed(_) => crate::error::ErrorKind::User, AuthErrorImpl::TooManyConnections => crate::error::ErrorKind::RateLimit, AuthErrorImpl::UserTimeout(_) => crate::error::ErrorKind::User, + AuthErrorImpl::ConfirmationTimeout(_) => crate::error::ErrorKind::User, } } } diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 52ddfd90fb70..0eeed27fb20c 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -620,6 +620,7 @@ mod tests { ip_allowlist_check_enabled: true, is_auth_broker: false, accept_jwts: false, + webauth_confirmation_timeout: std::time::Duration::from_secs(5), }); async fn read_message(r: &mut (impl AsyncRead + Unpin), b: &mut BytesMut) -> PgMessage { diff --git a/proxy/src/auth/backend/web.rs b/proxy/src/auth/backend/web.rs index 05f437355e64..45710d244d07 100644 --- a/proxy/src/auth/backend/web.rs +++ b/proxy/src/auth/backend/web.rs @@ -89,7 +89,12 @@ pub(super) async fn authenticate( // Wait for web console response (see `mgmt`). info!(parent: &span, "waiting for console's reply..."); - let db_info = waiter.await.map_err(WebAuthError::from)?; + let db_info = tokio::time::timeout(auth_config.webauth_confirmation_timeout, waiter) + .await + .map_err(|_elapsed| { + auth::AuthError::confirmation_timeout(auth_config.webauth_confirmation_timeout.into()) + })? 
+ .map_err(WebAuthError::from)?; if auth_config.ip_allowlist_check_enabled { if let Some(allowed_ips) = &db_info.allowed_ips { diff --git a/proxy/src/bin/local_proxy.rs b/proxy/src/bin/local_proxy.rs index 49887576c732..d5ce1e9273e0 100644 --- a/proxy/src/bin/local_proxy.rs +++ b/proxy/src/bin/local_proxy.rs @@ -77,10 +77,10 @@ struct LocalProxyCliArgs { #[clap(long, default_value = "127.0.0.1:5432")] compute: SocketAddr, /// Path of the local proxy config file - #[clap(long, default_value = "./localproxy.json")] + #[clap(long, default_value = "./local_proxy.json")] config_path: Utf8PathBuf, /// Path of the local proxy PID file - #[clap(long, default_value = "./localproxy.pid")] + #[clap(long, default_value = "./local_proxy.pid")] pid_path: Utf8PathBuf, } @@ -109,7 +109,7 @@ struct SqlOverHttpArgs { #[tokio::main] async fn main() -> anyhow::Result<()> { - let _logging_guard = proxy::logging::init().await?; + let _logging_guard = proxy::logging::init_local_proxy()?; let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook(); let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]); @@ -138,7 +138,7 @@ async fn main() -> anyhow::Result<()> { // in order to trigger the appropriate SIGHUP on config change. // // This also claims a "lock" that makes sure only one instance - // of local-proxy runs at a time. + // of local_proxy runs at a time. let _process_guard = loop { match pid_file::claim_for_current_process(&args.pid_path) { Ok(guard) => break guard, @@ -164,12 +164,6 @@ async fn main() -> anyhow::Result<()> { 16, )); - // write the process ID to a file so that compute-ctl can find our process later - // in order to trigger the appropriate SIGHUP on config change. - let pid = std::process::id(); - info!("process running in PID {pid}"); - std::fs::write(args.pid_path, format!("{pid}\n")).context("writing PID to file")?; - let mut maintenance_tasks = JoinSet::new(); let refresh_config_notify = Arc::new(Notify::new()); @@ -182,9 +176,9 @@ async fn main() -> anyhow::Result<()> { // trigger the first config load **after** setting up the signal hook // to avoid the race condition where: - // 1. No config file registered when local-proxy starts up + // 1. No config file registered when local_proxy starts up // 2. The config file is written but the signal hook is not yet received - // 3. local-proxy completes startup but has no config loaded, despite there being a registerd config. + // 3. local_proxy completes startup but has no config loaded, despite there being a registered config.
refresh_config_notify.notify_one(); tokio::spawn(refresh_config_loop(args.config_path, refresh_config_notify)); @@ -279,6 +273,7 @@ fn build_config(args: &LocalProxyCliArgs) -> anyhow::Result<&'static ProxyConfig ip_allowlist_check_enabled: true, is_auth_broker: false, accept_jwts: true, + webauth_confirmation_timeout: Duration::ZERO, }, proxy_protocol_v2: config::ProxyProtocolV2::Rejected, handshake_timeout: Duration::from_secs(10), @@ -310,7 +305,7 @@ async fn refresh_config_inner(path: &Utf8Path) -> anyhow::Result<()> { let mut jwks_set = vec![]; - for jwks in data.jwks { + for jwks in data.jwks.into_iter().flatten() { let mut jwks_url = url::Url::from_str(&jwks.jwks_url).context("parsing JWKS url")?; ensure!( diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index fa4fb264f203..0585902c3b7d 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -236,6 +236,10 @@ struct ProxyCliArgs { // TODO(conradludgate): switch default to rejected or required once we've updated all deployments #[clap(value_enum, long, default_value_t = ProxyProtocolV2::Supported)] proxy_protocol_v2: ProxyProtocolV2, + + /// Time the proxy waits for the webauth session to be confirmed by the control plane. + #[clap(long, default_value = "2m", value_parser = humantime::parse_duration)] + webauth_confirmation_timeout: std::time::Duration, } #[derive(clap::Args, Clone, Copy, Debug)] @@ -719,6 +723,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> { ip_allowlist_check_enabled: !args.is_private_access_proxy, is_auth_broker: args.is_auth_broker, accept_jwts: args.is_auth_broker, + webauth_confirmation_timeout: args.webauth_confirmation_timeout, }; let config = Box::leak(Box::new(ProxyConfig { diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 8d3cb8ee3cff..e3d9ae530a89 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -20,7 +20,7 @@ use tokio_postgres::tls::MakeTlsConnect; use tokio_postgres_rustls::MakeRustlsConnect; use tracing::{error, info, warn}; -const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; +pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] pub(crate) enum ConnectionError { diff --git a/proxy/src/config.rs b/proxy/src/config.rs index 7d86ef434867..e0d666adf76f 100644 --- a/proxy/src/config.rs +++ b/proxy/src/config.rs @@ -84,6 +84,7 @@ pub struct AuthenticationConfig { pub jwks_cache: JwkCache, pub is_auth_broker: bool, pub accept_jwts: bool, + pub webauth_confirmation_timeout: tokio::time::Duration, } impl TlsConfig { diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index ea0a9beced71..92faab61672c 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -90,6 +90,8 @@ use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use tracing::warn; +extern crate hyper0 as hyper; + pub mod auth; pub mod cache; pub mod cancellation; diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs index 2e773fabb3b0..a34eb820f81b 100644 --- a/proxy/src/logging.rs +++ b/proxy/src/logging.rs @@ -1,6 +1,13 @@ +use tracing::Subscriber; use tracing_subscriber::{ filter::{EnvFilter, LevelFilter}, + fmt::{ + format::{Format, Full}, + time::SystemTime, + FormatEvent, FormatFields, + }, prelude::*, + registry::LookupSpan, }; /// Initialize logging and OpenTelemetry tracing and exporter. @@ -33,6 +40,45 @@ pub async fn init() -> anyhow::Result { Ok(LoggingGuard) } +/// Initialize logging for local_proxy with log prefix and no opentelemetry. 
+/// +/// Logging can be configured using `RUST_LOG` environment variable. +pub fn init_local_proxy() -> anyhow::Result { + let env_filter = EnvFilter::builder() + .with_default_directive(LevelFilter::INFO.into()) + .from_env_lossy(); + + let fmt_layer = tracing_subscriber::fmt::layer() + .with_ansi(false) + .with_writer(std::io::stderr) + .event_format(LocalProxyFormatter(Format::default().with_target(false))); + + tracing_subscriber::registry() + .with(env_filter) + .with(fmt_layer) + .try_init()?; + + Ok(LoggingGuard) +} + +pub struct LocalProxyFormatter(Format); + +impl FormatEvent for LocalProxyFormatter +where + S: Subscriber + for<'a> LookupSpan<'a>, + N: for<'a> FormatFields<'a> + 'static, +{ + fn format_event( + &self, + ctx: &tracing_subscriber::fmt::FmtContext<'_, S, N>, + mut writer: tracing_subscriber::fmt::format::Writer<'_>, + event: &tracing::Event<'_>, + ) -> std::fmt::Result { + writer.write_str("[local_proxy] ")?; + self.0.format_event(ctx, writer, event) + } +} + pub struct LoggingGuard; impl Drop for LoggingGuard { diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 613548d4a07b..3b6c467589da 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -1,5 +1,6 @@ use crate::{ auth::backend::ComputeCredentialKeys, + compute::COULD_NOT_CONNECT, compute::{self, PostgresConnection}, config::RetryConfig, console::{self, errors::WakeComputeError, locks::ApiLocks, CachedNodeInfo, NodeInfo}, @@ -15,7 +16,7 @@ use crate::{ use async_trait::async_trait; use pq_proto::StartupMessageParams; use tokio::time; -use tracing::{error, info, warn}; +use tracing::{debug, info, warn}; use super::retry::ShouldRetryWakeCompute; @@ -116,7 +117,6 @@ where node_info.set_keys(user_info.get_keys()); node_info.allow_self_signed_compute = allow_self_signed_compute; - // let mut node_info = credentials.get_node_info(ctx, user_info).await?; mechanism.update_connect_config(&mut node_info.config); let retry_type = RetryType::ConnectToCompute; @@ -139,10 +139,10 @@ where Err(e) => e, }; - error!(error = ?err, "could not connect to compute node"); + debug!(error = ?err, COULD_NOT_CONNECT); let node_info = if !node_info.cached() || !err.should_retry_wake_compute() { - // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry. + // If we just recieved this from cplane and didn't get it from cache, we shouldn't retry. // Do not need to retrieve a new node_info, just return the old one. 
if should_retry(&err, num_retries, connect_to_compute_retry_config) { Metrics::get().proxy.retries_metric.observe( @@ -191,7 +191,7 @@ where } Err(e) => { if !should_retry(&e, num_retries, connect_to_compute_retry_config) { - error!(error = ?e, num_retries, retriable = false, "couldn't connect to compute node"); + // Don't log an error here, caller will print the error Metrics::get().proxy.retries_metric.observe( RetriesMetricGroup { outcome: ConnectOutcome::Failed, @@ -202,7 +202,7 @@ where return Err(e.into()); } - warn!(error = ?e, num_retries, retriable = true, "couldn't connect to compute node"); + warn!(error = ?e, num_retries, retriable = true, COULD_NOT_CONNECT); } }; diff --git a/pyproject.toml b/pyproject.toml index ad3961ef559c..556edf558926 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,10 +6,10 @@ package-mode = false [tool.poetry.dependencies] python = "^3.9" pytest = "^7.4.4" -psycopg2-binary = "^2.9.6" +psycopg2-binary = "^2.9.9" typing-extensions = "^4.6.1" PyJWT = {version = "^2.1.0", extras = ["crypto"]} -requests = "^2.32.0" +requests = "^2.32.3" pytest-xdist = "^3.3.1" asyncpg = "^0.29.0" aiopg = "^1.4.0" diff --git a/safekeeper/Cargo.toml b/safekeeper/Cargo.toml index 67f32b3cc08b..78a3129abacb 100644 --- a/safekeeper/Cargo.toml +++ b/safekeeper/Cargo.toml @@ -23,7 +23,7 @@ crc32c.workspace = true fail.workspace = true hex.workspace = true humantime.workspace = true -hyper.workspace = true +hyper0.workspace = true futures.workspace = true once_cell.workspace = true parking_lot.workspace = true diff --git a/safekeeper/src/debug_dump.rs b/safekeeper/src/debug_dump.rs index 589536c7a861..125f5af7f315 100644 --- a/safekeeper/src/debug_dump.rs +++ b/safekeeper/src/debug_dump.rs @@ -253,6 +253,13 @@ pub async fn build(args: Args) -> Result { }); } + // Tokio forbids to drop runtime in async context, so this is a stupid way + // to drop it in non async context. + tokio::task::spawn_blocking(move || { + let _r = runtime; + }) + .await?; + Ok(Response { start_time, finish_time: Utc::now(), diff --git a/safekeeper/src/lib.rs b/safekeeper/src/lib.rs index 3116d88dff48..277becb96b30 100644 --- a/safekeeper/src/lib.rs +++ b/safekeeper/src/lib.rs @@ -1,4 +1,7 @@ #![deny(clippy::undocumented_unsafe_blocks)] + +extern crate hyper0 as hyper; + use camino::Utf8PathBuf; use once_cell::sync::Lazy; use remote_storage::RemoteStorageConfig; diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs index 16239d847ba4..df3ba9eb087b 100644 --- a/safekeeper/src/remove_wal.rs +++ b/safekeeper/src/remove_wal.rs @@ -2,21 +2,29 @@ use utils::lsn::Lsn; use crate::timeline_manager::StateSnapshot; -/// Get oldest LSN we still need to keep. We hold WAL till it is consumed -/// by all of 1) pageserver (remote_consistent_lsn) 2) peers 3) s3 -/// offloading. -/// While it is safe to use inmem values for determining horizon, -/// we use persistent to make possible normal states less surprising. -/// All segments covering LSNs before horizon_lsn can be removed. +/// Get oldest LSN we still need to keep. +/// +/// We hold WAL till it is consumed by +/// 1) pageserver (remote_consistent_lsn) +/// 2) s3 offloading. +/// 3) Additionally we must store WAL since last local commit_lsn because +/// that's where we start looking for last WAL record on start. +/// +/// If some peer safekeeper misses data it will fetch it from the remote +/// storage. While it is safe to use inmem values for determining horizon, we +/// use persistent to make possible normal states less surprising. 
All segments +/// covering LSNs before horizon_lsn can be removed. pub(crate) fn calc_horizon_lsn(state: &StateSnapshot, extra_horizon_lsn: Option) -> Lsn { use std::cmp::min; - let mut horizon_lsn = min( - state.cfile_remote_consistent_lsn, - state.cfile_peer_horizon_lsn, - ); + let mut horizon_lsn = state.cfile_remote_consistent_lsn; // we don't want to remove WAL that is not yet offloaded to s3 horizon_lsn = min(horizon_lsn, state.cfile_backup_lsn); + // Min by local commit_lsn to be able to begin reading WAL from somewhere on + // sk start. Technically we don't allow local commit_lsn to be higher than + // flush_lsn, but let's be double safe by including it as well. + horizon_lsn = min(horizon_lsn, state.cfile_commit_lsn); + horizon_lsn = min(horizon_lsn, state.flush_lsn); if let Some(extra_horizon_lsn) = extra_horizon_lsn { horizon_lsn = min(horizon_lsn, extra_horizon_lsn); } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index 6be75479db7f..f5535c0ceaa4 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -47,7 +47,7 @@ pub(crate) struct StateSnapshot { pub(crate) remote_consistent_lsn: Lsn, // persistent control file values - pub(crate) cfile_peer_horizon_lsn: Lsn, + pub(crate) cfile_commit_lsn: Lsn, pub(crate) cfile_remote_consistent_lsn: Lsn, pub(crate) cfile_backup_lsn: Lsn, @@ -70,7 +70,7 @@ impl StateSnapshot { commit_lsn: state.inmem.commit_lsn, backup_lsn: state.inmem.backup_lsn, remote_consistent_lsn: state.inmem.remote_consistent_lsn, - cfile_peer_horizon_lsn: state.peer_horizon_lsn, + cfile_commit_lsn: state.commit_lsn, cfile_remote_consistent_lsn: state.remote_consistent_lsn, cfile_backup_lsn: state.backup_lsn, flush_lsn: read_guard.sk.flush_lsn(), diff --git a/safekeeper/tests/walproposer_sim/safekeeper.rs b/safekeeper/tests/walproposer_sim/safekeeper.rs index 771d905c90ad..a05c2d4559c2 100644 --- a/safekeeper/tests/walproposer_sim/safekeeper.rs +++ b/safekeeper/tests/walproposer_sim/safekeeper.rs @@ -13,7 +13,7 @@ use desim::{ node_os::NodeOs, proto::{AnyMessage, NetEvent, NodeEvent}, }; -use hyper::Uri; +use hyper0::Uri; use safekeeper::{ safekeeper::{ProposerAcceptorMessage, SafeKeeper, ServerInfo, UNKNOWN_SERVER_VERSION}, state::{TimelinePersistentState, TimelineState}, diff --git a/storage_broker/Cargo.toml b/storage_broker/Cargo.toml index 5359f586e49d..849707fbc425 100644 --- a/storage_broker/Cargo.toml +++ b/storage_broker/Cargo.toml @@ -16,7 +16,7 @@ futures.workspace = true futures-core.workspace = true futures-util.workspace = true humantime.workspace = true -hyper = { workspace = true, features = ["full"] } +hyper0 = { workspace = true, features = ["full"] } once_cell.workspace = true parking_lot.workspace = true prost.workspace = true diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs index 15acd0e49ceb..9c56e9fab546 100644 --- a/storage_broker/src/bin/storage_broker.rs +++ b/storage_broker/src/bin/storage_broker.rs @@ -10,6 +10,9 @@ //! //! Only safekeeper message is supported, but it is not hard to add something //! else with generics. 
+ +extern crate hyper0 as hyper; + use clap::{command, Parser}; use futures_core::Stream; use futures_util::StreamExt; diff --git a/storage_broker/src/lib.rs b/storage_broker/src/lib.rs index aa5d0bad5fee..447591f89876 100644 --- a/storage_broker/src/lib.rs +++ b/storage_broker/src/lib.rs @@ -1,3 +1,5 @@ +extern crate hyper0 as hyper; + use hyper::body::HttpBody; use std::pin::Pin; use std::task::{Context, Poll}; diff --git a/storage_controller/Cargo.toml b/storage_controller/Cargo.toml index 9ed0501026dc..2f5d266567e7 100644 --- a/storage_controller/Cargo.toml +++ b/storage_controller/Cargo.toml @@ -21,7 +21,7 @@ clap.workspace = true fail.workspace = true futures.workspace = true hex.workspace = true -hyper.workspace = true +hyper0.workspace = true humantime.workspace = true itertools.workspace = true lasso.workspace = true diff --git a/storage_controller/src/lib.rs b/storage_controller/src/lib.rs index 60e613bb5cf8..f5823935e1e0 100644 --- a/storage_controller/src/lib.rs +++ b/storage_controller/src/lib.rs @@ -1,6 +1,8 @@ use serde::Serialize; use utils::seqwait::MonotonicCounter; +extern crate hyper0 as hyper; + mod auth; mod background_node_operations; mod compute_hook; diff --git a/storage_controller/src/main.rs b/storage_controller/src/main.rs index 00e90f44676c..801409d61292 100644 --- a/storage_controller/src/main.rs +++ b/storage_controller/src/main.rs @@ -1,6 +1,6 @@ use anyhow::{anyhow, Context}; use clap::Parser; -use hyper::Uri; +use hyper0::Uri; use metrics::launch_timestamp::LaunchTimestamp; use metrics::BuildInfo; use std::path::PathBuf; @@ -11,8 +11,8 @@ use storage_controller::metrics::preinitialize_metrics; use storage_controller::persistence::Persistence; use storage_controller::service::chaos_injector::ChaosInjector; use storage_controller::service::{ - Config, Service, HEARTBEAT_INTERVAL_DEFAULT, MAX_OFFLINE_INTERVAL_DEFAULT, - MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, + Config, Service, HEARTBEAT_INTERVAL_DEFAULT, LONG_RECONCILE_THRESHOLD_DEFAULT, + MAX_OFFLINE_INTERVAL_DEFAULT, MAX_WARMING_UP_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT, }; use tokio::signal::unix::SignalKind; use tokio_util::sync::CancellationToken; @@ -108,6 +108,9 @@ struct Cli { // Period with which to send heartbeats to registered nodes #[arg(long)] heartbeat_interval: Option, + + #[arg(long)] + long_reconcile_threshold: Option, } enum StrictMode { @@ -293,6 +296,10 @@ async fn async_main() -> anyhow::Result<()> { .heartbeat_interval .map(humantime::Duration::into) .unwrap_or(HEARTBEAT_INTERVAL_DEFAULT), + long_reconcile_threshold: args + .long_reconcile_threshold + .map(humantime::Duration::into) + .unwrap_or(LONG_RECONCILE_THRESHOLD_DEFAULT), address_for_peers: args.address_for_peers, start_as_candidate: args.start_as_candidate, http_service_port: args.listen.port() as i32, @@ -317,7 +324,7 @@ async fn async_main() -> anyhow::Result<()> { // Start HTTP server let server_shutdown = CancellationToken::new(); - let server = hyper::Server::from_tcp(http_listener)? + let server = hyper0::Server::from_tcp(http_listener)? 
.serve(router_service) .with_graceful_shutdown({ let server_shutdown = server_shutdown.clone(); diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index 5cfcfb4b1f14..5989aeba91fc 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -87,6 +87,10 @@ pub(crate) struct StorageControllerMetricGroup { measured::HistogramVec, pub(crate) storage_controller_leadership_status: measured::GaugeVec, + + /// HTTP request status counters for handled requests + pub(crate) storage_controller_reconcile_long_running: + measured::CounterVec, } impl StorageControllerMetrics { @@ -168,6 +172,17 @@ pub(crate) struct LeadershipStatusGroup { pub(crate) status: LeadershipStatus, } +#[derive(measured::LabelGroup, Clone)] +#[label(set = ReconcileLongRunningLabelGroupSet)] +pub(crate) struct ReconcileLongRunningLabelGroup<'a> { + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) tenant_id: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) shard_number: &'a str, + #[label(dynamic_with = lasso::ThreadedRodeo, default)] + pub(crate) sequence: &'a str, +} + #[derive(FixedCardinalityLabel, Clone, Copy)] pub(crate) enum ReconcileOutcome { #[label(rename = "ok")] diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 1e7d7adffe63..4864a021fe36 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -454,7 +454,7 @@ impl Reconciler { Ok(l) => l, Err(e) => { tracing::info!("🕑 Can't get LSNs on node {node} yet, waiting ({e})",); - std::thread::sleep(Duration::from_millis(500)); + tokio::time::sleep(Duration::from_millis(500)).await; continue; } }; @@ -469,10 +469,7 @@ impl Reconciler { } } None => { - // Expected timeline isn't yet visible on migration destination. - // (IRL we would have to account for timeline deletion, but this - // is just test helper) - any_behind = true; + // Timeline was deleted in the meantime - ignore it } } } @@ -481,7 +478,7 @@ impl Reconciler { tracing::info!("✅ LSN caught up. Proceeding..."); break; } else { - std::thread::sleep(Duration::from_millis(500)); + tokio::time::sleep(Duration::from_millis(500)).await; } } @@ -562,6 +559,8 @@ impl Reconciler { self.location_config(&dest_ps, dest_conf, None, false) .await?; + pausable_failpoint!("reconciler-live-migrate-pre-await-lsn"); + if let Some(baseline) = baseline_lsns { tracing::info!("🕑 Waiting for LSN to catch up..."); self.await_lsn(self.tenant_shard_id, &dest_ps, baseline) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 851db97310b6..180ab5f0c530 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -128,6 +128,9 @@ pub const MAX_WARMING_UP_INTERVAL_DEFAULT: Duration = Duration::from_secs(300); /// How often to send heartbeats to registered nodes? pub const HEARTBEAT_INTERVAL_DEFAULT: Duration = Duration::from_secs(5); +/// How long is too long for a reconciliation? 
+pub const LONG_RECONCILE_THRESHOLD_DEFAULT: Duration = Duration::from_secs(120); + #[derive(Clone, strum_macros::Display)] enum TenantOperations { Create, @@ -348,6 +351,8 @@ pub struct Config { pub start_as_candidate: bool, pub http_service_port: i32, + + pub long_reconcile_threshold: Duration, } impl From for ApiError { @@ -521,6 +526,21 @@ pub(crate) enum ReconcileResultRequest { Stop, } +#[derive(Clone)] +struct MutationLocation { + node: Node, + generation: Generation, +} + +#[derive(Clone)] +struct ShardMutationLocations { + latest: MutationLocation, + other: Vec, +} + +#[derive(Default, Clone)] +struct TenantMutationLocations(BTreeMap); + impl Service { pub fn get_config(&self) -> &Config { &self.config @@ -2982,38 +3002,83 @@ impl Service { failpoint_support::sleep_millis_async!("tenant-create-timeline-shared-lock"); self.tenant_remote_mutation(tenant_id, move |mut targets| async move { - if targets.is_empty() { + if targets.0.is_empty() { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), )); }; - let shard_zero = targets.remove(0); + + let (shard_zero_tid, shard_zero_locations) = + targets.0.pop_first().expect("Must have at least one shard"); + assert!(shard_zero_tid.is_shard_zero()); async fn create_one( tenant_shard_id: TenantShardId, - node: Node, + locations: ShardMutationLocations, jwt: Option, create_req: TimelineCreateRequest, ) -> Result { + let latest = locations.latest.node; + tracing::info!( - "Creating timeline on shard {}/{}, attached to node {node}", + "Creating timeline on shard {}/{}, attached to node {latest} in generation {:?}", tenant_shard_id, create_req.new_timeline_id, + locations.latest.generation ); - let client = PageserverClient::new(node.get_id(), node.base_url(), jwt.as_deref()); - client + let client = + PageserverClient::new(latest.get_id(), latest.base_url(), jwt.as_deref()); + + let timeline_info = client .timeline_create(tenant_shard_id, &create_req) .await - .map_err(|e| passthrough_api_error(&node, e)) + .map_err(|e| passthrough_api_error(&latest, e))?; + + // We propagate timeline creations to all attached locations such that a compute + // for the new timeline is able to start regardless of the current state of the + // tenant shard reconciliation. + for location in locations.other { + tracing::info!( + "Creating timeline on shard {}/{}, stale attached to node {} in generation {:?}", + tenant_shard_id, + create_req.new_timeline_id, + location.node, + location.generation + ); + + let client = PageserverClient::new( + location.node.get_id(), + location.node.base_url(), + jwt.as_deref(), + ); + + let res = client + .timeline_create(tenant_shard_id, &create_req) + .await; + + if let Err(e) = res { + match e { + mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _) => { + // Tenant might have been detached from the stale location, + // so ignore 404s. + }, + _ => { + return Err(passthrough_api_error(&location.node, e)); + } + } + } + } + + Ok(timeline_info) } // Because the caller might not provide an explicit LSN, we must do the creation first on a single shard, and then // use whatever LSN that shard picked when creating on subsequent shards. We arbitrarily use shard zero as the shard // that will get the first creation request, and propagate the LSN to all the >0 shards. 
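A hypothetical, heavily simplified model of that ordering: Lsn, CreateReq and create_one below are stand-ins for the real types and the per-shard pageserver call, and the sketch only illustrates how the LSN picked by shard zero is reused for every other shard:

#[derive(Clone, Copy, Debug, PartialEq)]
struct Lsn(u64);

#[derive(Clone, Debug)]
struct CreateReq {
    ancestor_start_lsn: Option<Lsn>,
}

// Stand-in for the per-shard creation request; the real code issues an HTTP call.
async fn create_one(shard: u32, req: CreateReq) -> Lsn {
    // If no LSN was requested, pretend the shard picked one on its own.
    req.ancestor_start_lsn.unwrap_or(Lsn(0x1000 + u64::from(shard)))
}

#[tokio::main]
async fn main() {
    let mut req = CreateReq { ancestor_start_lsn: None };

    // Shard zero goes first; whatever LSN it ends up with becomes the request for the rest.
    let lsn = create_one(0, req.clone()).await;
    req.ancestor_start_lsn = Some(lsn);

    // The remaining shards reuse that LSN, so they all agree even though the caller
    // never specified one explicitly.
    for shard in 1..4 {
        assert_eq!(create_one(shard, req.clone()).await, lsn);
    }
    println!("all shards created the timeline at {lsn:?}");
}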
let timeline_info = create_one( - shard_zero.0, - shard_zero.1, + shard_zero_tid, + shard_zero_locations, self.config.jwt_token.clone(), create_req.clone(), ) @@ -3026,14 +3091,24 @@ impl Service { } // Create timeline on remaining shards with number >0 - if !targets.is_empty() { + if !targets.0.is_empty() { // If we had multiple shards, issue requests for the remainder now. let jwt = &self.config.jwt_token; self.tenant_for_shards( - targets.iter().map(|t| (t.0, t.1.clone())).collect(), - |tenant_shard_id: TenantShardId, node: Node| { + targets + .0 + .iter() + .map(|t| (*t.0, t.1.latest.node.clone())) + .collect(), + |tenant_shard_id: TenantShardId, _node: Node| { let create_req = create_req.clone(); - Box::pin(create_one(tenant_shard_id, node, jwt.clone(), create_req)) + let mutation_locations = targets.0.remove(&tenant_shard_id).unwrap(); + Box::pin(create_one( + tenant_shard_id, + mutation_locations, + jwt.clone(), + create_req, + )) }, ) .await?; @@ -3063,7 +3138,7 @@ impl Service { .await; self.tenant_remote_mutation(tenant_id, move |targets| async move { - if targets.is_empty() { + if targets.0.is_empty() { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), )); @@ -3094,8 +3169,9 @@ impl Service { // no shard needs to go first/last; the operation should be idempotent // TODO: it would be great to ensure that all shards return the same error + let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect(); let results = self - .tenant_for_shards(targets, |tenant_shard_id, node| { + .tenant_for_shards(locations, |tenant_shard_id, node| { futures::FutureExt::boxed(config_one( tenant_shard_id, timeline_id, @@ -3126,7 +3202,7 @@ impl Service { .await; self.tenant_remote_mutation(tenant_id, move |targets| async move { - if targets.is_empty() { + if targets.0.is_empty() { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), )); @@ -3174,8 +3250,9 @@ impl Service { } // no shard needs to go first/last; the operation should be idempotent + let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect(); let mut results = self - .tenant_for_shards(targets, |tenant_shard_id, node| { + .tenant_for_shards(locations, |tenant_shard_id, node| { futures::FutureExt::boxed(detach_one( tenant_shard_id, timeline_id, @@ -3222,7 +3299,7 @@ impl Service { .await; self.tenant_remote_mutation(tenant_id, move |targets| async move { - if targets.is_empty() { + if targets.0.is_empty() { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), )); @@ -3244,7 +3321,12 @@ impl Service { } // no shard needs to go first/last; the operation should be idempotent - self.tenant_for_shards(targets, |tenant_shard_id, node| { + let locations = targets + .0 + .iter() + .map(|t| (*t.0, t.1.latest.node.clone())) + .collect(); + self.tenant_for_shards(locations, |tenant_shard_id, node| { futures::FutureExt::boxed(do_one( tenant_shard_id, timeline_id, @@ -3339,11 +3421,11 @@ impl Service { op: O, ) -> Result where - O: FnOnce(Vec<(TenantShardId, Node)>) -> F, + O: FnOnce(TenantMutationLocations) -> F, F: std::future::Future, { - let target_gens = { - let mut targets = Vec::new(); + let mutation_locations = { + let mut locations = TenantMutationLocations::default(); // Load the currently attached pageservers for the latest generation of each shard. 
This can // run concurrently with reconciliations, and it is not guaranteed that the node we find here @@ -3394,14 +3476,50 @@ impl Service { .ok_or(ApiError::Conflict(format!( "Raced with removal of node {node_id}" )))?; - targets.push((tenant_shard_id, node.clone(), generation)); + let generation = generation.expect("Checked above"); + + let tenant = locked.tenants.get(&tenant_shard_id); + + // TODO(vlad): Abstract the logic that finds stale attached locations + // from observed state into a [`Service`] method. + let other_locations = match tenant { + Some(tenant) => { + let mut other = tenant.attached_locations(); + let latest_location_index = + other.iter().position(|&l| l == (node.get_id(), generation)); + if let Some(idx) = latest_location_index { + other.remove(idx); + } + + other + } + None => Vec::default(), + }; + + let location = ShardMutationLocations { + latest: MutationLocation { + node: node.clone(), + generation, + }, + other: other_locations + .into_iter() + .filter_map(|(node_id, generation)| { + let node = locked.nodes.get(&node_id)?; + + Some(MutationLocation { + node: node.clone(), + generation, + }) + }) + .collect(), + }; + locations.0.insert(tenant_shard_id, location); } - targets + locations }; - let targets = target_gens.iter().map(|t| (t.0, t.1.clone())).collect(); - let result = op(targets).await; + let result = op(mutation_locations.clone()).await; // Post-check: are all the generations of all the shards the same as they were initially? This proves that // our remote operation executed on the latest generation and is therefore persistent. @@ -3417,9 +3535,10 @@ impl Service { }| (tenant_shard_id, generation), ) .collect::>() - != target_gens + != mutation_locations + .0 .into_iter() - .map(|i| (i.0, i.2)) + .map(|i| (i.0, Some(i.1.latest.generation))) .collect::>() { // We raced with something that incremented the generation, and therefore cannot be @@ -3449,12 +3568,14 @@ impl Service { .await; self.tenant_remote_mutation(tenant_id, move |mut targets| async move { - if targets.is_empty() { + if targets.0.is_empty() { return Err(ApiError::NotFound( anyhow::anyhow!("Tenant not found").into(), )); } - let shard_zero = targets.remove(0); + + let (shard_zero_tid, shard_zero_locations) = targets.0.pop_first().expect("Must have at least one shard"); + assert!(shard_zero_tid.is_shard_zero()); async fn delete_one( tenant_shard_id: TenantShardId, @@ -3477,8 +3598,9 @@ impl Service { }) } + let locations = targets.0.iter().map(|t| (*t.0, t.1.latest.node.clone())).collect(); let statuses = self - .tenant_for_shards(targets, |tenant_shard_id: TenantShardId, node: Node| { + .tenant_for_shards(locations, |tenant_shard_id: TenantShardId, node: Node| { Box::pin(delete_one( tenant_shard_id, timeline_id, @@ -3496,9 +3618,9 @@ impl Service { // Delete shard zero last: this is not strictly necessary, but since a caller's GET on a timeline will be routed // to shard zero, it gives a more obvious behavior that a GET returns 404 once the deletion is done. 
let shard_zero_status = delete_one( - shard_zero.0, + shard_zero_tid, timeline_id, - shard_zero.1, + shard_zero_locations.latest.node, self.config.jwt_token.clone(), ) .await?; diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index afc89eae0073..2e85580e0865 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -5,7 +5,9 @@ use std::{ }; use crate::{ - metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome}, + metrics::{ + self, ReconcileCompleteLabelGroup, ReconcileLongRunningLabelGroup, ReconcileOutcome, + }, persistence::TenantShardPersistence, reconciler::{ReconcileUnits, ReconcilerConfig}, scheduler::{ @@ -14,6 +16,8 @@ use crate::{ }, service::ReconcileResultRequest, }; +use futures::future::{self, Either}; +use itertools::Itertools; use pageserver_api::controller_api::{ AvailabilityZone, NodeSchedulingPolicy, PlacementPolicy, ShardSchedulingPolicy, }; @@ -1083,6 +1087,47 @@ impl TenantShard { } } + async fn reconcile( + sequence: Sequence, + mut reconciler: Reconciler, + must_notify: bool, + ) -> ReconcileResult { + // Attempt to make observed state match intent state + let result = reconciler.reconcile().await; + + // If we know we had a pending compute notification from some previous action, send a notification irrespective + // of whether the above reconcile() did any work + if result.is_ok() && must_notify { + // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] + reconciler.compute_notify().await.ok(); + } + + // Update result counter + let outcome_label = match &result { + Ok(_) => ReconcileOutcome::Success, + Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, + Err(_) => ReconcileOutcome::Error, + }; + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_complete + .inc(ReconcileCompleteLabelGroup { + status: outcome_label, + }); + + // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might + // try and schedule more work in response to our result. 
+ ReconcileResult { + sequence, + result, + tenant_shard_id: reconciler.tenant_shard_id, + generation: reconciler.generation, + observed: reconciler.observed, + pending_compute_notification: reconciler.compute_notify_failure, + } + } + #[allow(clippy::too_many_arguments)] #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))] pub(crate) fn spawn_reconciler( @@ -1122,7 +1167,7 @@ impl TenantShard { let reconciler_cancel = cancel.child_token(); let reconciler_intent = TargetState::from_intent(pageservers, &self.intent); - let mut reconciler = Reconciler { + let reconciler = Reconciler { tenant_shard_id: self.tenant_shard_id, shard: self.shard, placement_policy: self.policy.clone(), @@ -1142,6 +1187,7 @@ impl TenantShard { }; let reconcile_seq = self.sequence; + let long_reconcile_threshold = service_config.long_reconcile_threshold; tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence); let must_notify = self.pending_compute_notification; @@ -1178,41 +1224,55 @@ impl TenantShard { return; } - // Attempt to make observed state match intent state - let result = reconciler.reconcile().await; - - // If we know we had a pending compute notification from some previous action, send a notification irrespective - // of whether the above reconcile() did any work - if result.is_ok() && must_notify { - // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`] - reconciler.compute_notify().await.ok(); - } + let (tenant_id_label, shard_number_label, sequence_label) = { + ( + reconciler.tenant_shard_id.tenant_id.to_string(), + reconciler.tenant_shard_id.shard_number.0.to_string(), + reconcile_seq.to_string(), + ) + }; - // Update result counter - let outcome_label = match &result { - Ok(_) => ReconcileOutcome::Success, - Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel, - Err(_) => ReconcileOutcome::Error, + let label_group = ReconcileLongRunningLabelGroup { + tenant_id: &tenant_id_label, + shard_number: &shard_number_label, + sequence: &sequence_label, }; - metrics::METRICS_REGISTRY - .metrics_group - .storage_controller_reconcile_complete - .inc(ReconcileCompleteLabelGroup { - status: outcome_label, - }); + let reconcile_fut = Self::reconcile(reconcile_seq, reconciler, must_notify); + let long_reconcile_fut = { + let label_group = label_group.clone(); + async move { + tokio::time::sleep(long_reconcile_threshold).await; + + tracing::warn!("Reconcile passed the long running threshold of {long_reconcile_threshold:?}"); - // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might - // try and schedule more work in response to our result. 
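The surrounding change races the reconcile future against a "long running" watchdog: if the watchdog fires first, the event is recorded and the reconcile is still awaited to completion. Roughly the following pattern, sketched here with the tokio and futures crates (tokio "time", "macros" and "rt-multi-thread" features assumed) and with the metric increment replaced by a print:

use std::time::Duration;
use futures::future::{self, Either};

// Stand-in for the real reconcile work.
async fn do_work() -> &'static str {
    tokio::time::sleep(Duration::from_millis(250)).await;
    "done"
}

#[tokio::main]
async fn main() {
    let threshold = Duration::from_millis(100);

    let work = std::pin::pin!(do_work());
    let watchdog = std::pin::pin!(async {
        tokio::time::sleep(threshold).await;
        // The real code warns and increments storage_controller_reconcile_long_running here.
        println!("work passed the long running threshold of {threshold:?}");
    });

    // Race the work against the watchdog; if the watchdog wins, keep waiting for the work.
    let (was_long, result) = match future::select(work, watchdog).await {
        Either::Left((result, _watchdog)) => (false, result),
        Either::Right(((), work)) => (true, work.await),
    };

    println!("was_long={was_long}, result={result}");
}

Pinning both futures with std::pin::pin! keeps them on the stack and lets the still-pending work future be returned from select and awaited afterwards, which is exactly how the change continues a reconcile that outlived the threshold.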
- let result = ReconcileResult { - sequence: reconcile_seq, - result, - tenant_shard_id: reconciler.tenant_shard_id, - generation: reconciler.generation, - observed: reconciler.observed, - pending_compute_notification: reconciler.compute_notify_failure, + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_long_running + .inc(label_group); + } }; + let reconcile_fut = std::pin::pin!(reconcile_fut); + let long_reconcile_fut = std::pin::pin!(long_reconcile_fut); + + let (was_long, result) = + match future::select(reconcile_fut, long_reconcile_fut).await { + Either::Left((reconcile_result, _)) => (false, reconcile_result), + Either::Right((_, reconcile_fut)) => (true, reconcile_fut.await), + }; + + if was_long { + let id = metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_long_running + .with_labels(label_group); + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_reconcile_long_running + .remove_metric(id); + } + result_tx .send(ReconcileResultRequest::ReconcileResult(result)) .ok(); @@ -1351,6 +1411,32 @@ impl TenantShard { pub(crate) fn set_preferred_az(&mut self, preferred_az_id: AvailabilityZone) { self.preferred_az_id = Some(preferred_az_id); } + + /// Returns all the nodes to which this tenant shard is attached according to the + /// observed state and the generations. Return vector is sorted from latest generation + /// to earliest. + pub(crate) fn attached_locations(&self) -> Vec<(NodeId, Generation)> { + self.observed + .locations + .iter() + .filter_map(|(node_id, observed)| { + use LocationConfigMode::{AttachedMulti, AttachedSingle, AttachedStale}; + + let conf = observed.conf.as_ref()?; + + match (conf.generation, conf.mode) { + (Some(gen), AttachedMulti | AttachedSingle | AttachedStale) => { + Some((*node_id, gen)) + } + _ => None, + } + }) + .sorted_by(|(_lhs_node_id, lhs_gen), (_rhs_node_id, rhs_gen)| { + lhs_gen.cmp(rhs_gen).reverse() + }) + .map(|(node_id, gen)| (node_id, Generation::new(gen))) + .collect() + } } #[cfg(test)] diff --git a/storage_scrubber/Cargo.toml b/storage_scrubber/Cargo.toml index a1b5b0b12f19..609f3bf00918 100644 --- a/storage_scrubber/Cargo.toml +++ b/storage_scrubber/Cargo.toml @@ -5,6 +5,7 @@ edition.workspace = true license.workspace = true [dependencies] +aws-config.workspace = true aws-sdk-s3.workspace = true either.workspace = true anyhow.workspace = true @@ -31,7 +32,6 @@ storage_controller_client.workspace = true tokio = { workspace = true, features = ["macros", "rt-multi-thread"] } chrono = { workspace = true, default-features = false, features = ["clock", "serde"] } reqwest = { workspace = true, default-features = false, features = ["rustls-tls", "json"] } -aws-config = { workspace = true, default-features = false, features = ["rustls", "sso"] } pageserver = { path = "../pageserver" } pageserver_api = { path = "../libs/pageserver_api" } diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 3f08cddf50f9..de0857cb5f0b 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -28,8 +28,9 @@ use pageserver::tenant::remote_timeline_client::{remote_tenant_path, remote_time use pageserver::tenant::TENANTS_SEGMENT_NAME; use pageserver_api::shard::TenantShardId; use remote_storage::{ - GenericRemoteStorage, Listing, ListingMode, RemotePath, RemoteStorageConfig, RemoteStorageKind, - S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, + DownloadOpts, GenericRemoteStorage, Listing, ListingMode, RemotePath, 
RemoteStorageConfig, + RemoteStorageKind, S3Config, DEFAULT_MAX_KEYS_PER_LIST_RESPONSE, + DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT, }; use reqwest::Url; use serde::{Deserialize, Serialize}; @@ -488,7 +489,10 @@ async fn download_object_with_retries( let cancel = CancellationToken::new(); for trial in 0..MAX_RETRIES { let mut buf = Vec::new(); - let download = match remote_client.download(key, &cancel).await { + let download = match remote_client + .download(key, &DownloadOpts::default(), &cancel) + .await + { Ok(response) => response, Err(e) => { error!("Failed to download object for key {key}: {e}"); diff --git a/test_runner/fixtures/compare_fixtures.py b/test_runner/fixtures/compare_fixtures.py index 770b32b11e01..fb9c2d2b86eb 100644 --- a/test_runner/fixtures/compare_fixtures.py +++ b/test_runner/fixtures/compare_fixtures.py @@ -340,23 +340,27 @@ def neon_with_baseline(request: FixtureRequest) -> PgCompare: @pytest.fixture(scope="function", autouse=True) -def sync_after_each_test(): - # The fixture calls `sync(2)` after each test if `SYNC_AFTER_EACH_TEST` env var is `true` +def sync_between_tests(): + # The fixture calls `sync(2)` after each test if `SYNC_BETWEEN_TESTS` env var is `true` # - # In CI, `SYNC_AFTER_EACH_TEST` is set to `true` only for benchmarks (`test_runner/performance`) + # In CI, `SYNC_BETWEEN_TESTS` is set to `true` only for benchmarks (`test_runner/performance`) # that are run on self-hosted runners because some of these tests are pretty write-heavy # and create issues to start the processes within 10s - key = "SYNC_AFTER_EACH_TEST" + key = "SYNC_BETWEEN_TESTS" enabled = os.environ.get(key) == "true" - yield + if enabled: + start = time.time() + # we only run benches on unices, the method might not exist on windows + os.sync() + elapsed = time.time() - start + log.info(f"called sync before test {elapsed=}") - if not enabled: - # regress test, or running locally - return + yield - start = time.time() - # we only run benches on unices, the method might not exist on windows - os.sync() - elapsed = time.time() - start - log.info(f"called sync after test {elapsed=}") + if enabled: + start = time.time() + # we only run benches on unices, the method might not exist on windows + os.sync() + elapsed = time.time() - start + log.info(f"called sync after test {elapsed=}") diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py new file mode 100644 index 000000000000..c27d22620eff --- /dev/null +++ b/test_runner/fixtures/neon_cli.py @@ -0,0 +1,662 @@ +from __future__ import annotations + +import abc +import json +import os +import re +import subprocess +import tempfile +import textwrap +from itertools import chain, product +from pathlib import Path +from typing import ( + Any, + Dict, + List, + Optional, + Tuple, + TypeVar, + cast, +) + +import toml + +from fixtures.common_types import Lsn, TenantId, TimelineId +from fixtures.log_helper import log +from fixtures.pageserver.common_types import IndexPartDump +from fixtures.pg_version import PgVersion +from fixtures.utils import AuxFileStore + +T = TypeVar("T") + + +class AbstractNeonCli(abc.ABC): + """ + A typed wrapper around an arbitrary Neon CLI tool. + Supports a way to run arbitrary command directly via CLI. + Do not use directly, use specific subclasses instead. + """ + + def __init__(self, extra_env: Optional[Dict[str, str]], binpath: Path): + self.extra_env = extra_env + self.binpath = binpath + + COMMAND: str = cast(str, None) # To be overwritten by the derived class. 
+ + def raw_cli( + self, + arguments: List[str], + extra_env_vars: Optional[Dict[str, str]] = None, + check_return_code=True, + timeout=None, + ) -> "subprocess.CompletedProcess[str]": + """ + Run the command with the specified arguments. + + Arguments must be in list form, e.g. ['endpoint', 'create'] + + Return both stdout and stderr, which can be accessed as + + >>> result = env.neon_cli.raw_cli(...) + >>> assert result.stderr == "" + >>> log.info(result.stdout) + + If `check_return_code`, on non-zero exit code logs failure and raises. + """ + + assert isinstance(arguments, list) + assert isinstance(self.COMMAND, str) + + command_path = str(self.binpath / self.COMMAND) + + args = [command_path] + arguments + log.info('Running command "{}"'.format(" ".join(args))) + + env_vars = os.environ.copy() + + # extra env + for extra_env_key, extra_env_value in (self.extra_env or {}).items(): + env_vars[extra_env_key] = extra_env_value + for extra_env_key, extra_env_value in (extra_env_vars or {}).items(): + env_vars[extra_env_key] = extra_env_value + + # Pass through coverage settings + var = "LLVM_PROFILE_FILE" + val = os.environ.get(var) + if val: + env_vars[var] = val + + # Intercept CalledProcessError and print more info + try: + res = subprocess.run( + args, + env=env_vars, + check=False, + universal_newlines=True, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + timeout=timeout, + ) + except subprocess.TimeoutExpired as e: + if e.stderr: + stderr = e.stderr.decode(errors="replace") + else: + stderr = "" + + if e.stdout: + stdout = e.stdout.decode(errors="replace") + else: + stdout = "" + + log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}") + raise + + indent = " " + if not res.returncode: + stripped = res.stdout.strip() + lines = stripped.splitlines() + if len(lines) < 2: + log.debug(f"Run {res.args} success: {stripped}") + else: + log.debug("Run %s success:\n%s" % (res.args, textwrap.indent(stripped, indent))) + elif check_return_code: + # this way command output will be in recorded and shown in CI in failure message + indent = indent * 2 + msg = textwrap.dedent( + """\ + Run %s failed: + stdout: + %s + stderr: + %s + """ + ) + msg = msg % ( + res.args, + textwrap.indent(res.stdout.strip(), indent), + textwrap.indent(res.stderr.strip(), indent), + ) + log.info(msg) + raise RuntimeError(msg) from subprocess.CalledProcessError( + res.returncode, res.args, res.stdout, res.stderr + ) + return res + + +class NeonLocalCli(AbstractNeonCli): + """A typed wrapper around the `neon_local` CLI tool. + Supports main commands via typed methods and a way to run arbitrary command directly via CLI. + + Note: The methods in this class are supposed to be faithful wrappers of the underlying + 'neon_local' commands. If you're tempted to add any logic here, please consider putting it + in the caller instead! + + There are a few exceptions where these wrapper methods intentionally differ from the + underlying commands, however: + - Many 'neon_local' commands take an optional 'tenant_id' argument and use the default from + the config file if it's omitted. The corresponding wrappers require an explicit 'tenant_id' + argument. The idea is that we don't want to rely on the config file's default in tests, + because NeonEnv has its own 'initial_tenant'. They are currently always the same, but we + want to rely on the Neonenv's default instead of the config file default in tests. + + - Similarly, --pg_version argument is always required in the wrappers, even when it's + optional in the 'neon_local' command. 
The default in 'neon_local' is a specific + hardcoded version, but in tests, we never want to accidentally rely on that;, we + always want to use the version from the test fixtures. + + - Wrappers for commands that create a new tenant or timeline ID require the new tenant + or timeline ID to be passed by the caller, while the 'neon_local' commands will + generate a random ID if it's not specified. This is because we don't want to have to + parse the ID from the 'neon_local' output. Making it required ensures that the + caller has to generate it. + """ + + COMMAND = "neon_local" + + def __init__( + self, + extra_env: Optional[Dict[str, str]], + binpath: Path, + repo_dir: Path, + pg_distrib_dir: Path, + ): + if extra_env is None: + env_vars = {} + else: + env_vars = extra_env.copy() + env_vars["NEON_REPO_DIR"] = str(repo_dir) + env_vars["POSTGRES_DISTRIB_DIR"] = str(pg_distrib_dir) + + super().__init__(env_vars, binpath) + + def raw_cli(self, *args, **kwargs) -> subprocess.CompletedProcess[str]: + return super().raw_cli(*args, **kwargs) + + def tenant_create( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: PgVersion, + conf: Optional[Dict[str, Any]] = None, + shard_count: Optional[int] = None, + shard_stripe_size: Optional[int] = None, + placement_policy: Optional[str] = None, + set_default: bool = False, + aux_file_policy: Optional[AuxFileStore] = None, + ): + """ + Creates a new tenant, returns its id and its initial timeline's id. + """ + args = [ + "tenant", + "create", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--pg-version", + pg_version, + ] + if conf is not None: + args.extend( + chain.from_iterable( + product(["-c"], (f"{key}:{value}" for key, value in conf.items())) + ) + ) + + if aux_file_policy is AuxFileStore.V2: + args.extend(["-c", "switch_aux_file_policy:v2"]) + elif aux_file_policy is AuxFileStore.V1: + args.extend(["-c", "switch_aux_file_policy:v1"]) + elif aux_file_policy is AuxFileStore.CrossValidation: + args.extend(["-c", "switch_aux_file_policy:cross-validation"]) + + if set_default: + args.append("--set-default") + + if shard_count is not None: + args.extend(["--shard-count", str(shard_count)]) + + if shard_stripe_size is not None: + args.extend(["--shard-stripe-size", str(shard_stripe_size)]) + + if placement_policy is not None: + args.extend(["--placement-policy", str(placement_policy)]) + + res = self.raw_cli(args) + res.check_returncode() + + def tenant_import(self, tenant_id: TenantId): + args = ["tenant", "import", "--tenant-id", str(tenant_id)] + res = self.raw_cli(args) + res.check_returncode() + + def tenant_set_default(self, tenant_id: TenantId): + """ + Update default tenant for future operations that require tenant_id. + """ + res = self.raw_cli(["tenant", "set-default", "--tenant-id", str(tenant_id)]) + res.check_returncode() + + def tenant_config(self, tenant_id: TenantId, conf: Dict[str, str]): + """ + Update tenant config. 
+ """ + + args = ["tenant", "config", "--tenant-id", str(tenant_id)] + if conf is not None: + args.extend( + chain.from_iterable( + product(["-c"], (f"{key}:{value}" for key, value in conf.items())) + ) + ) + + res = self.raw_cli(args) + res.check_returncode() + + def tenant_list(self) -> "subprocess.CompletedProcess[str]": + res = self.raw_cli(["tenant", "list"]) + res.check_returncode() + return res + + def timeline_create( + self, + new_branch_name: str, + tenant_id: TenantId, + timeline_id: TimelineId, + pg_version: PgVersion, + ) -> TimelineId: + if timeline_id is None: + timeline_id = TimelineId.generate() + + cmd = [ + "timeline", + "create", + "--branch-name", + new_branch_name, + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--pg-version", + pg_version, + ] + + res = self.raw_cli(cmd) + res.check_returncode() + + return timeline_id + + def timeline_branch( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + new_branch_name, + ancestor_branch_name: Optional[str] = None, + ancestor_start_lsn: Optional[Lsn] = None, + ): + cmd = [ + "timeline", + "branch", + "--branch-name", + new_branch_name, + "--timeline-id", + str(timeline_id), + "--tenant-id", + str(tenant_id), + ] + if ancestor_branch_name is not None: + cmd.extend(["--ancestor-branch-name", ancestor_branch_name]) + if ancestor_start_lsn is not None: + cmd.extend(["--ancestor-start-lsn", str(ancestor_start_lsn)]) + + res = self.raw_cli(cmd) + res.check_returncode() + + def timeline_import( + self, + tenant_id: TenantId, + timeline_id: TimelineId, + new_branch_name: str, + base_lsn: Lsn, + base_tarfile: Path, + pg_version: PgVersion, + end_lsn: Optional[Lsn] = None, + wal_tarfile: Optional[Path] = None, + ): + cmd = [ + "timeline", + "import", + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + "--pg-version", + pg_version, + "--branch-name", + new_branch_name, + "--base-lsn", + str(base_lsn), + "--base-tarfile", + str(base_tarfile), + ] + if end_lsn is not None: + cmd.extend(["--end-lsn", str(end_lsn)]) + if wal_tarfile is not None: + cmd.extend(["--wal-tarfile", str(wal_tarfile)]) + + res = self.raw_cli(cmd) + res.check_returncode() + + def timeline_list(self, tenant_id: TenantId) -> List[Tuple[str, TimelineId]]: + """ + Returns a list of (branch_name, timeline_id) tuples out of parsed `neon timeline list` CLI output. 
+ """ + + # main [b49f7954224a0ad25cc0013ea107b54b] + # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540] + TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg] + r"\s?(?P[^\s]+)\s\[(?P[^\]]+)\]", re.MULTILINE + ) + res = self.raw_cli(["timeline", "list", "--tenant-id", str(tenant_id)]) + timelines_cli = sorted( + map( + lambda branch_and_id: (branch_and_id[0], TimelineId(branch_and_id[1])), + TIMELINE_DATA_EXTRACTOR.findall(res.stdout), + ) + ) + return timelines_cli + + def init( + self, + init_config: Dict[str, Any], + force: Optional[str] = None, + ) -> "subprocess.CompletedProcess[str]": + with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile: + init_config_tmpfile.write(toml.dumps(init_config)) + init_config_tmpfile.flush() + + cmd = [ + "init", + f"--config={init_config_tmpfile.name}", + ] + + if force is not None: + cmd.extend(["--force", force]) + + res = self.raw_cli(cmd) + res.check_returncode() + return res + + def storage_controller_start( + self, + timeout_in_seconds: Optional[int] = None, + instance_id: Optional[int] = None, + base_port: Optional[int] = None, + ): + cmd = ["storage_controller", "start"] + if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") + if instance_id is not None: + cmd.append(f"--instance-id={instance_id}") + if base_port is not None: + cmd.append(f"--base-port={base_port}") + return self.raw_cli(cmd) + + def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None): + cmd = ["storage_controller", "stop"] + if immediate: + cmd.extend(["-m", "immediate"]) + if instance_id is not None: + cmd.append(f"--instance-id={instance_id}") + return self.raw_cli(cmd) + + def pageserver_start( + self, + id: int, + extra_env_vars: Optional[Dict[str, str]] = None, + timeout_in_seconds: Optional[int] = None, + ) -> "subprocess.CompletedProcess[str]": + start_args = ["pageserver", "start", f"--id={id}"] + if timeout_in_seconds is not None: + start_args.append(f"--start-timeout={timeout_in_seconds}s") + return self.raw_cli(start_args, extra_env_vars=extra_env_vars) + + def pageserver_stop(self, id: int, immediate=False) -> "subprocess.CompletedProcess[str]": + cmd = ["pageserver", "stop", f"--id={id}"] + if immediate: + cmd.extend(["-m", "immediate"]) + + log.info(f"Stopping pageserver with {cmd}") + return self.raw_cli(cmd) + + def safekeeper_start( + self, + id: int, + extra_opts: Optional[List[str]] = None, + extra_env_vars: Optional[Dict[str, str]] = None, + timeout_in_seconds: Optional[int] = None, + ) -> "subprocess.CompletedProcess[str]": + if extra_opts is not None: + extra_opts = [f"-e={opt}" for opt in extra_opts] + else: + extra_opts = [] + if timeout_in_seconds is not None: + extra_opts.append(f"--start-timeout={timeout_in_seconds}s") + return self.raw_cli( + ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=extra_env_vars + ) + + def safekeeper_stop( + self, id: Optional[int] = None, immediate=False + ) -> "subprocess.CompletedProcess[str]": + args = ["safekeeper", "stop"] + if id is not None: + args.append(str(id)) + if immediate: + args.extend(["-m", "immediate"]) + return self.raw_cli(args) + + def storage_broker_start( + self, timeout_in_seconds: Optional[int] = None + ) -> "subprocess.CompletedProcess[str]": + cmd = ["storage_broker", "start"] + if timeout_in_seconds is not None: + cmd.append(f"--start-timeout={timeout_in_seconds}s") + return self.raw_cli(cmd) + + def storage_broker_stop(self) -> 
"subprocess.CompletedProcess[str]": + cmd = ["storage_broker", "stop"] + return self.raw_cli(cmd) + + def endpoint_create( + self, + branch_name: str, + pg_port: int, + http_port: int, + tenant_id: TenantId, + pg_version: PgVersion, + endpoint_id: Optional[str] = None, + hot_standby: bool = False, + lsn: Optional[Lsn] = None, + pageserver_id: Optional[int] = None, + allow_multiple=False, + ) -> "subprocess.CompletedProcess[str]": + args = [ + "endpoint", + "create", + "--tenant-id", + str(tenant_id), + "--branch-name", + branch_name, + "--pg-version", + pg_version, + ] + if lsn is not None: + args.extend(["--lsn", str(lsn)]) + if pg_port is not None: + args.extend(["--pg-port", str(pg_port)]) + if http_port is not None: + args.extend(["--http-port", str(http_port)]) + if endpoint_id is not None: + args.append(endpoint_id) + if hot_standby: + args.extend(["--hot-standby", "true"]) + if pageserver_id is not None: + args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) + + res = self.raw_cli(args) + res.check_returncode() + return res + + def endpoint_start( + self, + endpoint_id: str, + safekeepers: Optional[List[int]] = None, + remote_ext_config: Optional[str] = None, + pageserver_id: Optional[int] = None, + allow_multiple=False, + basebackup_request_tries: Optional[int] = None, + ) -> "subprocess.CompletedProcess[str]": + args = [ + "endpoint", + "start", + ] + extra_env_vars = {} + if basebackup_request_tries is not None: + extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) + if remote_ext_config is not None: + args.extend(["--remote-ext-config", remote_ext_config]) + + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) + if endpoint_id is not None: + args.append(endpoint_id) + if pageserver_id is not None: + args.extend(["--pageserver-id", str(pageserver_id)]) + if allow_multiple: + args.extend(["--allow-multiple"]) + + res = self.raw_cli(args, extra_env_vars) + res.check_returncode() + return res + + def endpoint_reconfigure( + self, + endpoint_id: str, + tenant_id: Optional[TenantId] = None, + pageserver_id: Optional[int] = None, + safekeepers: Optional[List[int]] = None, + check_return_code=True, + ) -> "subprocess.CompletedProcess[str]": + args = ["endpoint", "reconfigure", endpoint_id] + if tenant_id is not None: + args.extend(["--tenant-id", str(tenant_id)]) + if pageserver_id is not None: + args.extend(["--pageserver-id", str(pageserver_id)]) + if safekeepers is not None: + args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) + return self.raw_cli(args, check_return_code=check_return_code) + + def endpoint_stop( + self, + endpoint_id: str, + destroy=False, + check_return_code=True, + mode: Optional[str] = None, + ) -> "subprocess.CompletedProcess[str]": + args = [ + "endpoint", + "stop", + ] + if destroy: + args.append("--destroy") + if mode is not None: + args.append(f"--mode={mode}") + if endpoint_id is not None: + args.append(endpoint_id) + + return self.raw_cli(args, check_return_code=check_return_code) + + def mappings_map_branch( + self, name: str, tenant_id: TenantId, timeline_id: TimelineId + ) -> "subprocess.CompletedProcess[str]": + """ + Map tenant id and timeline id to a neon_local branch name. They do not have to exist. + Usually needed when creating branches via PageserverHttpClient and not neon_local. + + After creating a name mapping, you can use EndpointFactory.create_start + with this registered branch name. 
+ """ + args = [ + "mappings", + "map", + "--branch-name", + name, + "--tenant-id", + str(tenant_id), + "--timeline-id", + str(timeline_id), + ] + + return self.raw_cli(args, check_return_code=True) + + def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": + return self.raw_cli(["start"], check_return_code=check_return_code) + + def stop(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": + return self.raw_cli(["stop"], check_return_code=check_return_code) + + +class WalCraft(AbstractNeonCli): + """ + A typed wrapper around the `wal_craft` CLI tool. + Supports main commands via typed methods and a way to run arbitrary command directly via CLI. + """ + + COMMAND = "wal_craft" + + def postgres_config(self) -> List[str]: + res = self.raw_cli(["print-postgres-config"]) + res.check_returncode() + return res.stdout.split("\n") + + def in_existing(self, type: str, connection: str) -> None: + res = self.raw_cli(["in-existing", type, connection]) + res.check_returncode() + + +class Pagectl(AbstractNeonCli): + """ + A typed wrapper around the `pagectl` utility CLI tool. + """ + + COMMAND = "pagectl" + + def dump_index_part(self, path: Path) -> IndexPartDump: + res = self.raw_cli(["index-part", "dump", str(path)]) + res.check_returncode() + parsed = json.loads(res.stdout) + return IndexPartDump.from_json(parsed) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6a53a34bc93d..df88af88ed16 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -9,8 +9,6 @@ import re import shutil import subprocess -import tempfile -import textwrap import threading import time import uuid @@ -21,7 +19,6 @@ from enum import Enum from fcntl import LOCK_EX, LOCK_UN, flock from functools import cached_property -from itertools import chain, product from pathlib import Path from types import TracebackType from typing import ( @@ -64,11 +61,12 @@ from fixtures.endpoint.http import EndpointHttpClient from fixtures.log_helper import log from fixtures.metrics import Metrics, MetricsGetter, parse_metrics +from fixtures.neon_cli import NeonLocalCli, Pagectl from fixtures.pageserver.allowed_errors import ( DEFAULT_PAGESERVER_ALLOWED_ERRORS, DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS, ) -from fixtures.pageserver.common_types import IndexPartDump, LayerName, parse_layer_file_name +from fixtures.pageserver.common_types import LayerName, parse_layer_file_name from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( wait_for_last_record_lsn, @@ -491,7 +489,7 @@ def init_start( log.debug( f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline" ) - initial_tenant, initial_timeline = env.neon_cli.create_tenant( + initial_tenant, initial_timeline = env.create_tenant( tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline, @@ -952,10 +950,16 @@ class NeonEnv: initial_tenant - tenant ID of the initial tenant created in the repository - neon_cli - can be used to run the 'neon' CLI tool + neon_cli - can be used to run the 'neon_local' CLI tool - create_tenant() - initializes a new tenant in the page server, returns - the tenant id + create_tenant() - initializes a new tenant and an initial empty timeline on it, + returns the tenant and timeline id + + create_branch() - branch a new timeline from an existing one, returns + the new timeline id + + create_timeline() - initializes a new timeline by running 
initdb, returns + the new timeline id """ BASE_PAGESERVER_ID = 1 @@ -966,8 +970,6 @@ def __init__(self, config: NeonEnvBuilder): self.rust_log_override = config.rust_log_override self.port_distributor = config.port_distributor self.s3_mock_server = config.mock_s3_server - self.neon_cli = NeonCli(env=self) - self.pagectl = Pagectl(env=self) self.endpoints = EndpointFactory(self) self.safekeepers: List[Safekeeper] = [] self.pageservers: List[NeonPageserver] = [] @@ -987,6 +989,21 @@ def __init__(self, config: NeonEnvBuilder): self.initial_tenant = config.initial_tenant self.initial_timeline = config.initial_timeline + neon_local_env_vars = {} + if self.rust_log_override is not None: + neon_local_env_vars["RUST_LOG"] = self.rust_log_override + self.neon_cli = NeonLocalCli( + extra_env=neon_local_env_vars, + binpath=self.neon_local_binpath, + repo_dir=self.repo_dir, + pg_distrib_dir=self.pg_distrib_dir, + ) + + pagectl_env_vars = {} + if self.rust_log_override is not None: + pagectl_env_vars["RUST_LOG"] = self.rust_log_override + self.pagectl = Pagectl(extra_env=pagectl_env_vars, binpath=self.neon_binpath) + # The URL for the pageserver to use as its control_plane_api config if config.storage_controller_port_override is not None: log.info( @@ -1310,6 +1327,74 @@ def generate_endpoint_id(self) -> str: self.endpoint_counter += 1 return "ep-" + str(self.endpoint_counter) + def create_tenant( + self, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + conf: Optional[Dict[str, Any]] = None, + shard_count: Optional[int] = None, + shard_stripe_size: Optional[int] = None, + placement_policy: Optional[str] = None, + set_default: bool = False, + aux_file_policy: Optional[AuxFileStore] = None, + ) -> Tuple[TenantId, TimelineId]: + """ + Creates a new tenant, returns its id and its initial timeline's id. + """ + tenant_id = tenant_id or TenantId.generate() + timeline_id = timeline_id or TimelineId.generate() + + self.neon_cli.tenant_create( + tenant_id=tenant_id, + timeline_id=timeline_id, + pg_version=self.pg_version, + conf=conf, + shard_count=shard_count, + shard_stripe_size=shard_stripe_size, + placement_policy=placement_policy, + set_default=set_default, + aux_file_policy=aux_file_policy, + ) + + return tenant_id, timeline_id + + def config_tenant(self, tenant_id: Optional[TenantId], conf: Dict[str, str]): + """ + Update tenant config. 
+ """ + tenant_id = tenant_id or self.initial_tenant + self.neon_cli.tenant_config(tenant_id, conf) + + def create_branch( + self, + new_branch_name: str = DEFAULT_BRANCH_NAME, + tenant_id: Optional[TenantId] = None, + ancestor_branch_name: Optional[str] = None, + ancestor_start_lsn: Optional[Lsn] = None, + new_timeline_id: Optional[TimelineId] = None, + ) -> TimelineId: + new_timeline_id = new_timeline_id or TimelineId.generate() + tenant_id = tenant_id or self.initial_tenant + + self.neon_cli.timeline_branch( + tenant_id, new_timeline_id, new_branch_name, ancestor_branch_name, ancestor_start_lsn + ) + + return new_timeline_id + + def create_timeline( + self, + new_branch_name: str, + tenant_id: Optional[TenantId] = None, + timeline_id: Optional[TimelineId] = None, + ) -> TimelineId: + timeline_id = timeline_id or TimelineId.generate() + tenant_id = tenant_id or self.initial_tenant + + self.neon_cli.timeline_create(new_branch_name, tenant_id, timeline_id, self.pg_version) + + return timeline_id + @pytest.fixture(scope="function") def neon_simple_env( @@ -1425,597 +1510,6 @@ class PageserverPort: http: int -class AbstractNeonCli(abc.ABC): - """ - A typed wrapper around an arbitrary Neon CLI tool. - Supports a way to run arbitrary command directly via CLI. - Do not use directly, use specific subclasses instead. - """ - - def __init__(self, env: NeonEnv): - self.env = env - - COMMAND: str = cast(str, None) # To be overwritten by the derived class. - - def raw_cli( - self, - arguments: List[str], - extra_env_vars: Optional[Dict[str, str]] = None, - check_return_code=True, - timeout=None, - local_binpath=False, - ) -> "subprocess.CompletedProcess[str]": - """ - Run the command with the specified arguments. - - Arguments must be in list form, e.g. ['pg', 'create'] - - Return both stdout and stderr, which can be accessed as - - >>> result = env.neon_cli.raw_cli(...) - >>> assert result.stderr == "" - >>> log.info(result.stdout) - - If `check_return_code`, on non-zero exit code logs failure and raises. 
- - If `local_binpath` is true, then we are invoking a test utility - """ - - assert isinstance(arguments, list) - assert isinstance(self.COMMAND, str) - - if local_binpath: - # Test utility - bin_neon = str(self.env.neon_local_binpath / self.COMMAND) - else: - # Normal binary - bin_neon = str(self.env.neon_binpath / self.COMMAND) - - args = [bin_neon] + arguments - log.info('Running command "{}"'.format(" ".join(args))) - - env_vars = os.environ.copy() - env_vars["NEON_REPO_DIR"] = str(self.env.repo_dir) - env_vars["POSTGRES_DISTRIB_DIR"] = str(self.env.pg_distrib_dir) - if self.env.rust_log_override is not None: - env_vars["RUST_LOG"] = self.env.rust_log_override - for extra_env_key, extra_env_value in (extra_env_vars or {}).items(): - env_vars[extra_env_key] = extra_env_value - - # Pass coverage settings - var = "LLVM_PROFILE_FILE" - val = os.environ.get(var) - if val: - env_vars[var] = val - - # Intercept CalledProcessError and print more info - try: - res = subprocess.run( - args, - env=env_vars, - check=False, - universal_newlines=True, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, - timeout=timeout, - ) - except subprocess.TimeoutExpired as e: - if e.stderr: - stderr = e.stderr.decode(errors="replace") - else: - stderr = "" - - if e.stdout: - stdout = e.stdout.decode(errors="replace") - else: - stdout = "" - - log.warn(f"CLI timeout: stderr={stderr}, stdout={stdout}") - raise - - indent = " " - if not res.returncode: - stripped = res.stdout.strip() - lines = stripped.splitlines() - if len(lines) < 2: - log.debug(f"Run {res.args} success: {stripped}") - else: - log.debug("Run %s success:\n%s" % (res.args, textwrap.indent(stripped, indent))) - elif check_return_code: - # this way command output will be in recorded and shown in CI in failure message - indent = indent * 2 - msg = textwrap.dedent( - """\ - Run %s failed: - stdout: - %s - stderr: - %s - """ - ) - msg = msg % ( - res.args, - textwrap.indent(res.stdout.strip(), indent), - textwrap.indent(res.stderr.strip(), indent), - ) - log.info(msg) - raise RuntimeError(msg) from subprocess.CalledProcessError( - res.returncode, res.args, res.stdout, res.stderr - ) - return res - - -class NeonCli(AbstractNeonCli): - """ - A typed wrapper around the `neon` CLI tool. - Supports main commands via typed methods and a way to run arbitrary command directly via CLI. - """ - - COMMAND = "neon_local" - - def raw_cli(self, *args, **kwargs) -> subprocess.CompletedProcess[str]: - kwargs["local_binpath"] = True - return super().raw_cli(*args, **kwargs) - - def create_tenant( - self, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - conf: Optional[Dict[str, Any]] = None, - shard_count: Optional[int] = None, - shard_stripe_size: Optional[int] = None, - placement_policy: Optional[str] = None, - set_default: bool = False, - aux_file_policy: Optional[AuxFileStore] = None, - ) -> Tuple[TenantId, TimelineId]: - """ - Creates a new tenant, returns its id and its initial timeline's id. 
- """ - tenant_id = tenant_id or TenantId.generate() - timeline_id = timeline_id or TimelineId.generate() - - args = [ - "tenant", - "create", - "--tenant-id", - str(tenant_id), - "--timeline-id", - str(timeline_id), - "--pg-version", - self.env.pg_version, - ] - if conf is not None: - args.extend( - chain.from_iterable( - product(["-c"], (f"{key}:{value}" for key, value in conf.items())) - ) - ) - - if aux_file_policy is AuxFileStore.V2: - args.extend(["-c", "switch_aux_file_policy:v2"]) - elif aux_file_policy is AuxFileStore.V1: - args.extend(["-c", "switch_aux_file_policy:v1"]) - elif aux_file_policy is AuxFileStore.CrossValidation: - args.extend(["-c", "switch_aux_file_policy:cross-validation"]) - - if set_default: - args.append("--set-default") - - if shard_count is not None: - args.extend(["--shard-count", str(shard_count)]) - - if shard_stripe_size is not None: - args.extend(["--shard-stripe-size", str(shard_stripe_size)]) - - if placement_policy is not None: - args.extend(["--placement-policy", str(placement_policy)]) - - res = self.raw_cli(args) - res.check_returncode() - return tenant_id, timeline_id - - def import_tenant(self, tenant_id: TenantId): - args = ["tenant", "import", "--tenant-id", str(tenant_id)] - res = self.raw_cli(args) - res.check_returncode() - - def set_default(self, tenant_id: TenantId): - """ - Update default tenant for future operations that require tenant_id. - """ - res = self.raw_cli(["tenant", "set-default", "--tenant-id", str(tenant_id)]) - res.check_returncode() - - def config_tenant(self, tenant_id: TenantId, conf: Dict[str, str]): - """ - Update tenant config. - """ - - args = ["tenant", "config", "--tenant-id", str(tenant_id)] - if conf is not None: - args.extend( - chain.from_iterable( - product(["-c"], (f"{key}:{value}" for key, value in conf.items())) - ) - ) - - res = self.raw_cli(args) - res.check_returncode() - - def list_tenants(self) -> "subprocess.CompletedProcess[str]": - res = self.raw_cli(["tenant", "list"]) - res.check_returncode() - return res - - def create_timeline( - self, - new_branch_name: str, - tenant_id: Optional[TenantId] = None, - timeline_id: Optional[TimelineId] = None, - ) -> TimelineId: - if timeline_id is None: - timeline_id = TimelineId.generate() - - cmd = [ - "timeline", - "create", - "--branch-name", - new_branch_name, - "--tenant-id", - str(tenant_id or self.env.initial_tenant), - "--timeline-id", - str(timeline_id), - "--pg-version", - self.env.pg_version, - ] - - res = self.raw_cli(cmd) - res.check_returncode() - - return timeline_id - - def create_branch( - self, - new_branch_name: str = DEFAULT_BRANCH_NAME, - ancestor_branch_name: Optional[str] = None, - tenant_id: Optional[TenantId] = None, - ancestor_start_lsn: Optional[Lsn] = None, - new_timeline_id: Optional[TimelineId] = None, - ) -> TimelineId: - if new_timeline_id is None: - new_timeline_id = TimelineId.generate() - cmd = [ - "timeline", - "branch", - "--branch-name", - new_branch_name, - "--timeline-id", - str(new_timeline_id), - "--tenant-id", - str(tenant_id or self.env.initial_tenant), - ] - if ancestor_branch_name is not None: - cmd.extend(["--ancestor-branch-name", ancestor_branch_name]) - if ancestor_start_lsn is not None: - cmd.extend(["--ancestor-start-lsn", str(ancestor_start_lsn)]) - - res = self.raw_cli(cmd) - res.check_returncode() - - return TimelineId(str(new_timeline_id)) - - def list_timelines(self, tenant_id: Optional[TenantId] = None) -> List[Tuple[str, TimelineId]]: - """ - Returns a list of (branch_name, timeline_id) tuples out of 
parsed `neon timeline list` CLI output.
- """
-
- # main [b49f7954224a0ad25cc0013ea107b54b]
- # ┣━ @0/16B5A50: test_cli_branch_list_main [20f98c79111b9015d84452258b7d5540]
- TIMELINE_DATA_EXTRACTOR: re.Pattern = re.compile( # type: ignore[type-arg]
- r"\s?(?P<branch_name>[^\s]+)\s\[(?P<timeline_id>[^\]]+)\]", re.MULTILINE
- )
- res = self.raw_cli(
- ["timeline", "list", "--tenant-id", str(tenant_id or self.env.initial_tenant)]
- )
- timelines_cli = sorted(
- map(
- lambda branch_and_id: (branch_and_id[0], TimelineId(branch_and_id[1])),
- TIMELINE_DATA_EXTRACTOR.findall(res.stdout),
- )
- )
- return timelines_cli
-
- def init(
- self,
- init_config: Dict[str, Any],
- force: Optional[str] = None,
- ) -> "subprocess.CompletedProcess[str]":
- with tempfile.NamedTemporaryFile(mode="w+") as init_config_tmpfile:
- init_config_tmpfile.write(toml.dumps(init_config))
- init_config_tmpfile.flush()
-
- cmd = [
- "init",
- f"--config={init_config_tmpfile.name}",
- ]
-
- if force is not None:
- cmd.extend(["--force", force])
-
- res = self.raw_cli(cmd)
- res.check_returncode()
- return res
-
- def storage_controller_start(
- self,
- timeout_in_seconds: Optional[int] = None,
- instance_id: Optional[int] = None,
- base_port: Optional[int] = None,
- ):
- cmd = ["storage_controller", "start"]
- if timeout_in_seconds is not None:
- cmd.append(f"--start-timeout={timeout_in_seconds}s")
- if instance_id is not None:
- cmd.append(f"--instance-id={instance_id}")
- if base_port is not None:
- cmd.append(f"--base-port={base_port}")
- return self.raw_cli(cmd)
-
- def storage_controller_stop(self, immediate: bool, instance_id: Optional[int] = None):
- cmd = ["storage_controller", "stop"]
- if immediate:
- cmd.extend(["-m", "immediate"])
- if instance_id is not None:
- cmd.append(f"--instance-id={instance_id}")
- return self.raw_cli(cmd)
-
- def pageserver_start(
- self,
- id: int,
- extra_env_vars: Optional[Dict[str, str]] = None,
- timeout_in_seconds: Optional[int] = None,
- ) -> "subprocess.CompletedProcess[str]":
- start_args = ["pageserver", "start", f"--id={id}"]
- if timeout_in_seconds is not None:
- start_args.append(f"--start-timeout={timeout_in_seconds}s")
- storage = self.env.pageserver_remote_storage
-
- if isinstance(storage, S3Storage):
- s3_env_vars = storage.access_env_vars()
- extra_env_vars = (extra_env_vars or {}) | s3_env_vars
-
- return self.raw_cli(start_args, extra_env_vars=extra_env_vars)
-
- def pageserver_stop(self, id: int, immediate=False) -> "subprocess.CompletedProcess[str]":
- cmd = ["pageserver", "stop", f"--id={id}"]
- if immediate:
- cmd.extend(["-m", "immediate"])
-
- log.info(f"Stopping pageserver with {cmd}")
- return self.raw_cli(cmd)
-
- def safekeeper_start(
- self,
- id: int,
- extra_opts: Optional[List[str]] = None,
- timeout_in_seconds: Optional[int] = None,
- ) -> "subprocess.CompletedProcess[str]":
- s3_env_vars = None
- if isinstance(self.env.safekeepers_remote_storage, S3Storage):
- s3_env_vars = self.env.safekeepers_remote_storage.access_env_vars()
-
- if extra_opts is not None:
- extra_opts = [f"-e={opt}" for opt in extra_opts]
- else:
- extra_opts = []
- if timeout_in_seconds is not None:
- extra_opts.append(f"--start-timeout={timeout_in_seconds}s")
- return self.raw_cli(
- ["safekeeper", "start", str(id), *extra_opts], extra_env_vars=s3_env_vars
- )
-
- def safekeeper_stop(
- self, id: Optional[int] = None, immediate=False
- ) -> "subprocess.CompletedProcess[str]":
- args = ["safekeeper", "stop"]
- if id is not None:
- args.append(str(id))
- if immediate:
- args.extend(["-m", "immediate"])
-
return self.raw_cli(args) - - def broker_start( - self, timeout_in_seconds: Optional[int] = None - ) -> "subprocess.CompletedProcess[str]": - cmd = ["storage_broker", "start"] - if timeout_in_seconds is not None: - cmd.append(f"--start-timeout={timeout_in_seconds}s") - return self.raw_cli(cmd) - - def broker_stop(self) -> "subprocess.CompletedProcess[str]": - cmd = ["storage_broker", "stop"] - return self.raw_cli(cmd) - - def endpoint_create( - self, - branch_name: str, - pg_port: int, - http_port: int, - endpoint_id: Optional[str] = None, - tenant_id: Optional[TenantId] = None, - hot_standby: bool = False, - lsn: Optional[Lsn] = None, - pageserver_id: Optional[int] = None, - allow_multiple=False, - ) -> "subprocess.CompletedProcess[str]": - args = [ - "endpoint", - "create", - "--tenant-id", - str(tenant_id or self.env.initial_tenant), - "--branch-name", - branch_name, - "--pg-version", - self.env.pg_version, - ] - if lsn is not None: - args.extend(["--lsn", str(lsn)]) - if pg_port is not None: - args.extend(["--pg-port", str(pg_port)]) - if http_port is not None: - args.extend(["--http-port", str(http_port)]) - if endpoint_id is not None: - args.append(endpoint_id) - if hot_standby: - args.extend(["--hot-standby", "true"]) - if pageserver_id is not None: - args.extend(["--pageserver-id", str(pageserver_id)]) - if allow_multiple: - args.extend(["--allow-multiple"]) - - res = self.raw_cli(args) - res.check_returncode() - return res - - def endpoint_start( - self, - endpoint_id: str, - safekeepers: Optional[List[int]] = None, - remote_ext_config: Optional[str] = None, - pageserver_id: Optional[int] = None, - allow_multiple=False, - basebackup_request_tries: Optional[int] = None, - ) -> "subprocess.CompletedProcess[str]": - args = [ - "endpoint", - "start", - ] - extra_env_vars = {} - if basebackup_request_tries is not None: - extra_env_vars["NEON_COMPUTE_TESTING_BASEBACKUP_TRIES"] = str(basebackup_request_tries) - if remote_ext_config is not None: - args.extend(["--remote-ext-config", remote_ext_config]) - - if safekeepers is not None: - args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) - if endpoint_id is not None: - args.append(endpoint_id) - if pageserver_id is not None: - args.extend(["--pageserver-id", str(pageserver_id)]) - if allow_multiple: - args.extend(["--allow-multiple"]) - - res = self.raw_cli(args, extra_env_vars) - res.check_returncode() - return res - - def endpoint_reconfigure( - self, - endpoint_id: str, - tenant_id: Optional[TenantId] = None, - pageserver_id: Optional[int] = None, - safekeepers: Optional[List[int]] = None, - check_return_code=True, - ) -> "subprocess.CompletedProcess[str]": - args = ["endpoint", "reconfigure", endpoint_id] - if tenant_id is not None: - args.extend(["--tenant-id", str(tenant_id)]) - if pageserver_id is not None: - args.extend(["--pageserver-id", str(pageserver_id)]) - if safekeepers is not None: - args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) - return self.raw_cli(args, check_return_code=check_return_code) - - def endpoint_stop( - self, - endpoint_id: str, - destroy=False, - check_return_code=True, - mode: Optional[str] = None, - ) -> "subprocess.CompletedProcess[str]": - args = [ - "endpoint", - "stop", - ] - if destroy: - args.append("--destroy") - if mode is not None: - args.append(f"--mode={mode}") - if endpoint_id is not None: - args.append(endpoint_id) - - return self.raw_cli(args, check_return_code=check_return_code) - - def map_branch( - self, name: str, tenant_id: TenantId, timeline_id: 
TimelineId - ) -> "subprocess.CompletedProcess[str]": - """ - Map tenant id and timeline id to a neon_local branch name. They do not have to exist. - Usually needed when creating branches via PageserverHttpClient and not neon_local. - - After creating a name mapping, you can use EndpointFactory.create_start - with this registered branch name. - """ - args = [ - "mappings", - "map", - "--branch-name", - name, - "--tenant-id", - str(tenant_id), - "--timeline-id", - str(timeline_id), - ] - - return self.raw_cli(args, check_return_code=True) - - def start(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": - return self.raw_cli(["start"], check_return_code=check_return_code) - - def stop(self, check_return_code=True) -> "subprocess.CompletedProcess[str]": - return self.raw_cli(["stop"], check_return_code=check_return_code) - - -class WalCraft(AbstractNeonCli): - """ - A typed wrapper around the `wal_craft` CLI tool. - Supports main commands via typed methods and a way to run arbitrary command directly via CLI. - """ - - COMMAND = "wal_craft" - - def postgres_config(self) -> List[str]: - res = self.raw_cli(["print-postgres-config"]) - res.check_returncode() - return res.stdout.split("\n") - - def in_existing(self, type: str, connection: str) -> None: - res = self.raw_cli(["in-existing", type, connection]) - res.check_returncode() - - -class ComputeCtl(AbstractNeonCli): - """ - A typed wrapper around the `compute_ctl` CLI tool. - """ - - COMMAND = "compute_ctl" - - -class Pagectl(AbstractNeonCli): - """ - A typed wrapper around the `pagectl` utility CLI tool. - """ - - COMMAND = "pagectl" - - def dump_index_part(self, path: Path) -> IndexPartDump: - res = self.raw_cli(["index-part", "dump", str(path)]) - res.check_returncode() - parsed = json.loads(res.stdout) - return IndexPartDump.from_json(parsed) - - class LogUtils: """ A mixin class which provides utilities for inspecting the logs of a service. @@ -2933,6 +2427,10 @@ def start( """ assert self.running is False + storage = self.env.pageserver_remote_storage + if isinstance(storage, S3Storage): + s3_env_vars = storage.access_env_vars() + extra_env_vars = (extra_env_vars or {}) | s3_env_vars self.env.neon_cli.pageserver_start( self.id, extra_env_vars=extra_env_vars, timeout_in_seconds=timeout_in_seconds ) @@ -3953,6 +3451,7 @@ def create( hot_standby=hot_standby, pg_port=self.pg_port, http_port=self.http_port, + pg_version=self.env.pg_version, pageserver_id=pageserver_id, allow_multiple=allow_multiple, ) @@ -4395,8 +3894,16 @@ def start( extra_opts = self.extra_opts assert self.running is False + + s3_env_vars = None + if isinstance(self.env.safekeepers_remote_storage, S3Storage): + s3_env_vars = self.env.safekeepers_remote_storage.access_env_vars() + self.env.neon_cli.safekeeper_start( - self.id, extra_opts=extra_opts, timeout_in_seconds=timeout_in_seconds + self.id, + extra_opts=extra_opts, + timeout_in_seconds=timeout_in_seconds, + extra_env_vars=s3_env_vars, ) self.running = True # wait for wal acceptor start by checking its status @@ -4542,7 +4049,7 @@ def checkpoint_up_to( 1) wait for remote_consistent_lsn and wal_backup_lsn on safekeeper to reach it. 2) checkpoint timeline on safekeeper, which should remove WAL before this LSN; optionally wait for that. 
""" - cli = self.http_client() + client = self.http_client() target_segment_file = lsn.segment_name() @@ -4554,7 +4061,7 @@ def are_segments_removed(): assert all(target_segment_file <= s for s in segments) def are_lsns_advanced(): - stat = cli.timeline_status(tenant_id, timeline_id) + stat = client.timeline_status(tenant_id, timeline_id) log.info( f"waiting for remote_consistent_lsn and backup_lsn on sk {self.id} to reach {lsn}, currently remote_consistent_lsn={stat.remote_consistent_lsn}, backup_lsn={stat.backup_lsn}" ) @@ -4563,7 +4070,7 @@ def are_lsns_advanced(): # xxx: max wait is long because we might be waiting for reconnection from # pageserver to this safekeeper wait_until(30, 1, are_lsns_advanced) - cli.checkpoint(tenant_id, timeline_id) + client.checkpoint(tenant_id, timeline_id) if wait_wal_removal: wait_until(30, 1, are_segments_removed) @@ -4591,13 +4098,13 @@ def start( timeout_in_seconds: Optional[int] = None, ): assert not self.running - self.env.neon_cli.broker_start(timeout_in_seconds) + self.env.neon_cli.storage_broker_start(timeout_in_seconds) self.running = True return self def stop(self): if self.running: - self.env.neon_cli.broker_stop() + self.env.neon_cli.storage_broker_stop() self.running = False return self @@ -5226,10 +4733,10 @@ def flush_ep_to_pageserver( commit_lsn: Lsn = Lsn(0) # In principle in the absense of failures polling single sk would be enough. for sk in env.safekeepers: - cli = sk.http_client() + client = sk.http_client() # wait until compute connections are gone - wait_walreceivers_absent(cli, tenant, timeline) - commit_lsn = max(cli.get_commit_lsn(tenant, timeline), commit_lsn) + wait_walreceivers_absent(client, tenant, timeline) + commit_lsn = max(client.get_commit_lsn(tenant, timeline), commit_lsn) # Note: depending on WAL filtering implementation, probably most shards # won't be able to reach commit_lsn (unless gaps are also ack'ed), so this @@ -5282,7 +4789,12 @@ def fork_at_current_lsn( the WAL up to that LSN to arrive in the pageserver before creating the branch. 
""" current_lsn = endpoint.safe_psql("SELECT pg_current_wal_lsn()")[0][0] - return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) + return env.create_branch( + new_branch_name=new_branch_name, + tenant_id=tenant_id, + ancestor_branch_name=ancestor_branch_name, + ancestor_start_lsn=current_lsn, + ) def import_timeline_from_vanilla_postgres( @@ -5301,9 +4813,9 @@ def import_timeline_from_vanilla_postgres( """ # Take backup of the existing PostgreSQL server with pg_basebackup - basebackup_dir = os.path.join(test_output_dir, "basebackup") - base_tar = os.path.join(basebackup_dir, "base.tar") - wal_tar = os.path.join(basebackup_dir, "pg_wal.tar") + basebackup_dir = test_output_dir / "basebackup" + base_tar = basebackup_dir / "base.tar" + wal_tar = basebackup_dir / "pg_wal.tar" os.mkdir(basebackup_dir) pg_bin.run( [ @@ -5313,40 +4825,28 @@ def import_timeline_from_vanilla_postgres( "-d", vanilla_pg_connstr, "-D", - basebackup_dir, + str(basebackup_dir), ] ) # Extract start_lsn and end_lsn form the backup manifest file with open(os.path.join(basebackup_dir, "backup_manifest")) as f: manifest = json.load(f) - start_lsn = manifest["WAL-Ranges"][0]["Start-LSN"] - end_lsn = manifest["WAL-Ranges"][0]["End-LSN"] + start_lsn = Lsn(manifest["WAL-Ranges"][0]["Start-LSN"]) + end_lsn = Lsn(manifest["WAL-Ranges"][0]["End-LSN"]) # Import the backup tarballs into the pageserver - env.neon_cli.raw_cli( - [ - "timeline", - "import", - "--tenant-id", - str(tenant_id), - "--timeline-id", - str(timeline_id), - "--branch-name", - branch_name, - "--base-lsn", - start_lsn, - "--base-tarfile", - base_tar, - "--end-lsn", - end_lsn, - "--wal-tarfile", - wal_tar, - "--pg-version", - env.pg_version, - ] + env.neon_cli.timeline_import( + tenant_id=tenant_id, + timeline_id=timeline_id, + new_branch_name=branch_name, + base_lsn=start_lsn, + base_tarfile=base_tar, + end_lsn=end_lsn, + wal_tarfile=wal_tar, + pg_version=env.pg_version, ) - wait_for_last_record_lsn(env.pageserver.http_client(), tenant_id, timeline_id, Lsn(end_lsn)) + wait_for_last_record_lsn(env.pageserver.http_client(), tenant_id, timeline_id, end_lsn) def last_flush_lsn_upload( diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 0dd557c59f28..49ad54d456fc 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -586,6 +586,7 @@ def timeline_compact( timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + force_l0_compaction=False, wait_until_uploaded=False, enhanced_gc_bottom_most_compaction=False, ): @@ -595,6 +596,8 @@ def timeline_compact( query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if force_l0_compaction: + query["force_l0_compaction"] = "true" if wait_until_uploaded: query["wait_until_uploaded"] = "true" if enhanced_gc_bottom_most_compaction: @@ -701,6 +704,7 @@ def timeline_checkpoint( timeline_id: TimelineId, force_repartition=False, force_image_layer_creation=False, + force_l0_compaction=False, wait_until_uploaded=False, compact: Optional[bool] = None, **kwargs, @@ -711,6 +715,8 @@ def timeline_checkpoint( query["force_repartition"] = "true" if force_image_layer_creation: query["force_image_layer_creation"] = "true" + if force_l0_compaction: + query["force_l0_compaction"] = "true" if wait_until_uploaded: query["wait_until_uploaded"] = "true" diff --git a/test_runner/fixtures/pageserver/remote_storage.py 
b/test_runner/fixtures/pageserver/remote_storage.py index 0c3612716a50..bc54fc4c8da7 100644 --- a/test_runner/fixtures/pageserver/remote_storage.py +++ b/test_runner/fixtures/pageserver/remote_storage.py @@ -7,7 +7,7 @@ from typing import Any, List, Tuple from fixtures.common_types import TenantId, TimelineId -from fixtures.neon_fixtures import NeonEnv, Pagectl +from fixtures.neon_fixtures import NeonEnv from fixtures.pageserver.common_types import ( InvalidFileName, parse_layer_file_name, @@ -35,7 +35,7 @@ def duplicate_one_tenant(env: NeonEnv, template_tenant: TenantId, new_tenant: Te for file in tl.iterdir(): shutil.copy2(file, dst_tl_dir) if "__" in file.name: - Pagectl(env).raw_cli( + env.pagectl.raw_cli( [ "layer", "rewrite-summary", diff --git a/test_runner/fixtures/workload.py b/test_runner/fixtures/workload.py index 065a78bf9be6..1ea0267e8726 100644 --- a/test_runner/fixtures/workload.py +++ b/test_runner/fixtures/workload.py @@ -175,7 +175,9 @@ def churn_rows(self, n, pageserver_id: Optional[int] = None, upload=True, ingest if upload: # Wait for written data to be uploaded to S3 (force a checkpoint to trigger upload) ps_http.timeline_checkpoint( - tenant_shard_id, self.timeline_id, wait_until_uploaded=True + tenant_shard_id, + self.timeline_id, + wait_until_uploaded=True, ) log.info(f"Churn: waiting for remote LSN {last_flush_lsn}") else: diff --git a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py index 8d781c1609f8..0a5a2c10d6eb 100644 --- a/test_runner/performance/pageserver/interactive/test_many_small_tenants.py +++ b/test_runner/performance/pageserver/interactive/test_many_small_tenants.py @@ -53,7 +53,7 @@ def setup_template(env: NeonEnv): "checkpoint_distance": 268435456, "image_creation_threshold": 3, } - template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + template_tenant, template_timeline = env.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) env.pageserver.tenant_attach(template_tenant, config) ep = env.endpoints.create_start("main", tenant_id=template_tenant) diff --git a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py index 8b934057e47a..c3ba5afc2493 100644 --- a/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py +++ b/test_runner/performance/pageserver/pagebench/test_large_slru_basebackup.py @@ -81,7 +81,7 @@ def setup_tenant_template(env: NeonEnv, n_txns: int): "image_creation_threshold": 3, } - template_tenant, template_timeline = env.neon_cli.create_tenant(set_default=True) + template_tenant, template_timeline = env.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) env.pageserver.tenant_attach(template_tenant, config) diff --git a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py index 949813c984f9..97eed8847331 100644 --- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py +++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py @@ -162,7 +162,7 @@ def setup_tenant_template(env: NeonEnv, pg_bin: PgBin, scale: int): "checkpoint_distance": 268435456, "image_creation_threshold": 3, } - template_tenant, 
template_timeline = env.neon_cli.create_tenant(set_default=True) + template_tenant, template_timeline = env.create_tenant(set_default=True) env.pageserver.tenant_detach(template_tenant) env.pageserver.tenant_attach(template_tenant, config) ps_http = env.pageserver.http_client() diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index f1ab7876f924..1fdb06785b78 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -41,7 +41,7 @@ def test_branch_creation_heavy_write(neon_compare: NeonCompare, n_branches: int) pg_bin = neon_compare.pg_bin # Use aggressive GC and checkpoint settings, so GC and compaction happen more often during the test - tenant, _ = env.neon_cli.create_tenant( + tenant, _ = env.create_tenant( conf={ "gc_period": "5 s", "gc_horizon": f"{4 * 1024 ** 2}", @@ -64,7 +64,7 @@ def run_pgbench(branch: str): endpoint.stop() - env.neon_cli.create_branch("b0", tenant_id=tenant) + env.create_branch("b0", tenant_id=tenant) threads: List[threading.Thread] = [] threads.append(threading.Thread(target=run_pgbench, args=("b0",), daemon=True)) @@ -78,7 +78,7 @@ def run_pgbench(branch: str): p = random.randint(0, i) timer = timeit.default_timer() - env.neon_cli.create_branch(f"b{i + 1}", f"b{p}", tenant_id=tenant) + env.create_branch(f"b{i + 1}", ancestor_branch_name=f"b{p}", tenant_id=tenant) dur = timeit.default_timer() - timer log.info(f"Creating branch b{i+1} took {dur}s") @@ -104,7 +104,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # seed the prng so we will measure the same structure every time rng = random.Random("2024-02-29") - env.neon_cli.create_branch("b0") + env.create_branch("b0") endpoint = env.endpoints.create_start("b0") neon_compare.pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", "-s10", endpoint.connstr()]) @@ -121,7 +121,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: timer = timeit.default_timer() # each of these uploads to remote storage before completion - env.neon_cli.create_branch(f"b{i + 1}", parent) + env.create_branch(f"b{i + 1}", ancestor_branch_name=parent) dur = timeit.default_timer() - timer branch_creation_durations.append(dur) @@ -222,7 +222,7 @@ def metrics_are_filled() -> List[Sample]: def test_branch_creation_many_relations(neon_compare: NeonCompare): env = neon_compare.env - timeline_id = env.neon_cli.create_branch("root") + timeline_id = env.create_branch("root") endpoint = env.endpoints.create_start("root") with closing(endpoint.connect()) as conn: @@ -238,7 +238,7 @@ def test_branch_creation_many_relations(neon_compare: NeonCompare): ) with neon_compare.record_duration("create_branch_time_not_busy_root"): - env.neon_cli.create_branch("child_not_busy", "root") + env.create_branch("child_not_busy", ancestor_branch_name="root") # run a concurrent insertion to make the ancestor "busy" during the branch creation thread = threading.Thread( @@ -247,6 +247,6 @@ def test_branch_creation_many_relations(neon_compare: NeonCompare): thread.start() with neon_compare.record_duration("create_branch_time_busy_root"): - env.neon_cli.create_branch("child_busy", "root") + env.create_branch("child_busy", ancestor_branch_name="root") thread.join() diff --git a/test_runner/performance/test_branching.py b/test_runner/performance/test_branching.py index f8d39487f26e..36c821795a24 100644 --- a/test_runner/performance/test_branching.py +++ 
b/test_runner/performance/test_branching.py @@ -41,7 +41,7 @@ def run_pgbench_on_branch(branch: str, cmd: List[str]): ) neon_compare.zenbenchmark.record_pg_bench_result(branch, res) - env.neon_cli.create_branch("root") + env.create_branch("root") endpoint_root = env.endpoints.create_start("root") pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", endpoint_root.connstr(), "-s10"]) @@ -55,14 +55,14 @@ def run_pgbench_on_branch(branch: str, cmd: List[str]): def test_compare_child_and_root_write_perf(neon_compare: NeonCompare): env = neon_compare.env - env.neon_cli.create_branch("root") + env.create_branch("root") endpoint_root = env.endpoints.create_start("root") endpoint_root.safe_psql( "CREATE TABLE foo(key serial primary key, t text default 'foooooooooooooooooooooooooooooooooooooooooooooooooooo')", ) - env.neon_cli.create_branch("child", "root") + env.create_branch("child", ancestor_branch_name="root") endpoint_child = env.endpoints.create_start("child") with neon_compare.record_duration("root_run_duration"): @@ -73,7 +73,7 @@ def test_compare_child_and_root_write_perf(neon_compare: NeonCompare): def test_compare_child_and_root_read_perf(neon_compare: NeonCompare): env = neon_compare.env - env.neon_cli.create_branch("root") + env.create_branch("root") endpoint_root = env.endpoints.create_start("root") endpoint_root.safe_psql_many( @@ -83,7 +83,7 @@ def test_compare_child_and_root_read_perf(neon_compare: NeonCompare): ] ) - env.neon_cli.create_branch("child", "root") + env.create_branch("child", ancestor_branch_name="root") endpoint_child = env.endpoints.create_start("child") with neon_compare.record_duration("root_run_duration"): diff --git a/test_runner/performance/test_bulk_tenant_create.py b/test_runner/performance/test_bulk_tenant_create.py index 9b05903cfaca..188ff5e3ad83 100644 --- a/test_runner/performance/test_bulk_tenant_create.py +++ b/test_runner/performance/test_bulk_tenant_create.py @@ -26,10 +26,8 @@ def test_bulk_tenant_create( for i in range(tenants_count): start = timeit.default_timer() - tenant, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline( - f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant - ) + tenant, _ = env.create_tenant() + env.create_timeline(f"test_bulk_tenant_create_{tenants_count}_{i}", tenant_id=tenant) # FIXME: We used to start new safekeepers here. Did that make sense? Should we do it now? 
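
The hunks above and below convert call sites from the old `env.neon_cli.*` helpers to the new `NeonEnv`-level `create_tenant` / `create_timeline` / `create_branch` wrappers. The sketch below is editorial and not part of this patch: it shows how a regression test written against the refactored fixtures might look, assuming only the `neon_simple_env` fixture and the helper signatures visible in this diff; the test name, tenant config and SQL are illustrative.

from fixtures.neon_fixtures import NeonEnv


def test_new_env_helpers_sketch(neon_simple_env: NeonEnv):
    env = neon_simple_env

    # Tenant-level tuning now goes through env.create_tenant(conf=...),
    # mirroring the call sites converted in this patch.
    tenant_id, _ = env.create_tenant(conf={"gc_period": "0s"})

    # A brand-new timeline (runs initdb) vs. a branch of the tenant's main timeline.
    env.create_timeline("standalone", tenant_id=tenant_id)
    env.create_branch("child", ancestor_branch_name="main", tenant_id=tenant_id)

    endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
    assert endpoint.safe_psql("SELECT 1")[0][0] == 1
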
# if use_safekeepers == 'with_sa': diff --git a/test_runner/performance/test_bulk_update.py b/test_runner/performance/test_bulk_update.py index 2ace31a2d7ea..13c48e11741f 100644 --- a/test_runner/performance/test_bulk_update.py +++ b/test_runner/performance/test_bulk_update.py @@ -16,7 +16,7 @@ def test_bulk_update(neon_env_builder: NeonEnvBuilder, zenbenchmark, fillfactor) env = neon_env_builder.init_start() n_records = 1000000 - timeline_id = env.neon_cli.create_branch("test_bulk_update") + timeline_id = env.create_branch("test_bulk_update") tenant_id = env.initial_tenant endpoint = env.endpoints.create_start("test_bulk_update") cur = endpoint.connect().cursor() diff --git a/test_runner/performance/test_compaction.py b/test_runner/performance/test_compaction.py index 3c6f0b013109..54b17ebf8a7a 100644 --- a/test_runner/performance/test_compaction.py +++ b/test_runner/performance/test_compaction.py @@ -17,7 +17,7 @@ def test_compaction(neon_compare: NeonCompare): env = neon_compare.env pageserver_http = env.pageserver.http_client() - tenant_id, timeline_id = env.neon_cli.create_tenant( + tenant_id, timeline_id = env.create_tenant( conf={ # Disable background GC and compaction, we'll run compaction manually. "gc_period": "0s", @@ -68,7 +68,7 @@ def test_compaction_l0_memory(neon_compare: NeonCompare): env = neon_compare.env pageserver_http = env.pageserver.http_client() - tenant_id, timeline_id = env.neon_cli.create_tenant( + tenant_id, timeline_id = env.create_tenant( conf={ # Initially disable compaction so that we will build up a stack of L0s "compaction_period": "0s", diff --git a/test_runner/performance/test_gc_feedback.py b/test_runner/performance/test_gc_feedback.py index 9861259c16f7..2ba1018b33d9 100644 --- a/test_runner/performance/test_gc_feedback.py +++ b/test_runner/performance/test_gc_feedback.py @@ -11,7 +11,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma env = neon_env_builder.init_start() client = env.pageserver.http_client() - tenant_id, _ = env.neon_cli.create_tenant( + tenant_id, _ = env.create_tenant( conf={ # disable default GC and compaction "gc_period": "1000 m", @@ -63,7 +63,7 @@ def gc_feedback_impl(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchma log.info(f"Physical storage size {physical_size}") if mode == "with_snapshots": if step == n_steps / 2: - env.neon_cli.create_branch("child") + env.create_branch("child") max_num_of_deltas_above_image = 0 max_total_num_of_deltas = 0 diff --git a/test_runner/performance/test_layer_map.py b/test_runner/performance/test_layer_map.py index bc6d9de346e4..fb2ac14a928b 100644 --- a/test_runner/performance/test_layer_map.py +++ b/test_runner/performance/test_layer_map.py @@ -15,7 +15,7 @@ def test_layer_map(neon_env_builder: NeonEnvBuilder, zenbenchmark): # We want to have a lot of lot of layer files to exercise the layer map. Disable # GC, and make checkpoint_distance very small, so that we get a lot of small layer # files. 
- tenant, timeline = env.neon_cli.create_tenant( + tenant, timeline = env.create_tenant( conf={ "gc_period": "0s", "checkpoint_distance": "16384", diff --git a/test_runner/performance/test_lazy_startup.py b/test_runner/performance/test_lazy_startup.py index e929bd4d053d..5af10bc4917d 100644 --- a/test_runner/performance/test_lazy_startup.py +++ b/test_runner/performance/test_lazy_startup.py @@ -33,7 +33,7 @@ def test_lazy_startup(slru: str, neon_env_builder: NeonEnvBuilder, zenbenchmark: env = neon_env_builder.init_start() lazy_slru_download = "true" if slru == "lazy" else "false" - tenant, _ = env.neon_cli.create_tenant( + tenant, _ = env.create_tenant( conf={ "lazy_slru_download": lazy_slru_download, } diff --git a/test_runner/performance/test_sharding_autosplit.py b/test_runner/performance/test_sharding_autosplit.py index 9cd83f09599a..35793e41d799 100644 --- a/test_runner/performance/test_sharding_autosplit.py +++ b/test_runner/performance/test_sharding_autosplit.py @@ -85,7 +85,7 @@ def __init__(self, timeline_id, endpoint): tenants = {} for tenant_id in set(TenantId.generate() for _i in range(0, tenant_count)): timeline_id = TimelineId.generate() - env.neon_cli.create_tenant(tenant_id, timeline_id, conf=tenant_conf) + env.create_tenant(tenant_id, timeline_id, conf=tenant_conf) endpoint = env.endpoints.create("main", tenant_id=tenant_id) tenants[tenant_id] = TenantState(timeline_id, endpoint) endpoint.start() diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 301078d984a8..514d8bae2a8d 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -27,7 +27,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_startup") + env.create_branch("test_startup") endpoint = None diff --git a/test_runner/regress/test_ancestor_branch.py b/test_runner/regress/test_ancestor_branch.py index f83b44a7adc4..67a38ab471f7 100644 --- a/test_runner/regress/test_ancestor_branch.py +++ b/test_runner/regress/test_ancestor_branch.py @@ -12,7 +12,7 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() # Override defaults: 4M checkpoint_distance, disable background compaction and gc. - tenant, _ = env.neon_cli.create_tenant( + tenant, _ = env.create_tenant( conf={ "checkpoint_distance": "4194304", "gc_period": "0s", @@ -45,7 +45,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 100k rows: {lsn_100}") # Create branch1. - env.neon_cli.create_branch("branch1", "main", tenant_id=tenant, ancestor_start_lsn=lsn_100) + env.create_branch( + "branch1", ancestor_branch_name="main", ancestor_start_lsn=lsn_100, tenant_id=tenant + ) endpoint_branch1 = env.endpoints.create_start("branch1", tenant_id=tenant) branch1_cur = endpoint_branch1.connect().cursor() @@ -67,7 +69,9 @@ def test_ancestor_branch(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 200k rows: {lsn_200}") # Create branch2. 
- env.neon_cli.create_branch("branch2", "branch1", tenant_id=tenant, ancestor_start_lsn=lsn_200) + env.create_branch( + "branch2", ancestor_branch_name="branch1", ancestor_start_lsn=lsn_200, tenant_id=tenant + ) endpoint_branch2 = env.endpoints.create_start("branch2", tenant_id=tenant) branch2_cur = endpoint_branch2.connect().cursor() diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index bb337d9cc11c..a4e557a863d1 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -41,7 +41,7 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N assert isinstance(env.pageserver_remote_storage, LocalFsStorage) ps_http = env.pageserver.http_client() - (tenant_id, _) = env.neon_cli.create_tenant() + (tenant_id, _) = env.create_tenant() assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {} config_pre_detach = ps_http.tenant_config(tenant_id) assert tenant_id in [TenantId(t["id"]) for t in ps_http.tenant_list()] @@ -109,7 +109,7 @@ def test_empty_config(positive_env: NeonEnv, content_type: Optional[str]): """ env = positive_env ps_http = env.pageserver.http_client() - (tenant_id, _) = env.neon_cli.create_tenant() + (tenant_id, _) = env.create_tenant() assert ps_http.tenant_config(tenant_id).tenant_specific_overrides == {} config_pre_detach = ps_http.tenant_config(tenant_id) assert tenant_id in [TenantId(t["id"]) for t in ps_http.tenant_list()] @@ -182,7 +182,7 @@ def test_fully_custom_config(positive_env: NeonEnv): fully_custom_config.keys() ), "ensure we cover all config options" - (tenant_id, _) = env.neon_cli.create_tenant() + (tenant_id, _) = env.create_tenant() ps_http.set_tenant_config(tenant_id, fully_custom_config) our_tenant_config = ps_http.tenant_config(tenant_id) assert our_tenant_config.tenant_specific_overrides == fully_custom_config diff --git a/test_runner/regress/test_auth.py b/test_runner/regress/test_auth.py index 780c0e1602ac..6b0609218301 100644 --- a/test_runner/regress/test_auth.py +++ b/test_runner/regress/test_auth.py @@ -76,7 +76,7 @@ def test_compute_auth_to_pageserver(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() branch = "test_compute_auth_to_pageserver" - env.neon_cli.create_branch(branch) + env.create_branch(branch) endpoint = env.endpoints.create_start(branch) with closing(endpoint.connect()) as conn: @@ -186,7 +186,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env = neon_env_builder.init_start() branch = f"test_auth_failures_auth_enabled_{auth_enabled}" - timeline_id = env.neon_cli.create_branch(branch) + timeline_id = env.create_branch(branch) env.endpoints.create_start(branch) tenant_token = env.auth_keys.generate_tenant_token(env.initial_tenant) diff --git a/test_runner/regress/test_backpressure.py b/test_runner/regress/test_backpressure.py index 819912dd0517..3d7a52ca77a5 100644 --- a/test_runner/regress/test_backpressure.py +++ b/test_runner/regress/test_backpressure.py @@ -98,7 +98,7 @@ def check_backpressure(endpoint: Endpoint, stop_event: threading.Event, polling_ def test_backpressure_received_lsn_lag(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() # Create a branch for us - env.neon_cli.create_branch("test_backpressure") + env.create_branch("test_backpressure") endpoint = env.endpoints.create( "test_backpressure", config_lines=["max_replication_write_lag=30MB"] diff --git 
a/test_runner/regress/test_bad_connection.py b/test_runner/regress/test_bad_connection.py index 392b73c1f7c1..98842e64f43b 100644 --- a/test_runner/regress/test_bad_connection.py +++ b/test_runner/regress/test_bad_connection.py @@ -22,7 +22,7 @@ def test_compute_pageserver_connection_stress(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() pageserver_http.configure_failpoints(("simulated-bad-compute-connection", "50%return(15)")) - env.neon_cli.create_branch("test_compute_pageserver_connection_stress") + env.create_branch("test_compute_pageserver_connection_stress") endpoint = env.endpoints.create_start("test_compute_pageserver_connection_stress") pg_conn = endpoint.connect() diff --git a/test_runner/regress/test_branch_and_gc.py b/test_runner/regress/test_branch_and_gc.py index 43140c05ffd2..afeea55fc2df 100644 --- a/test_runner/regress/test_branch_and_gc.py +++ b/test_runner/regress/test_branch_and_gc.py @@ -53,7 +53,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): env = neon_simple_env pageserver_http_client = env.pageserver.http_client() - tenant, timeline_main = env.neon_cli.create_tenant( + tenant, timeline_main = env.create_tenant( conf={ # disable background GC "gc_period": "0s", @@ -90,7 +90,7 @@ def test_branch_and_gc(neon_simple_env: NeonEnv, build_type: str): pageserver_http_client.timeline_checkpoint(tenant, timeline_main) pageserver_http_client.timeline_gc(tenant, timeline_main, lsn2 - lsn1 + 1024) - env.neon_cli.create_branch( + env.create_branch( "test_branch", ancestor_branch_name="main", ancestor_start_lsn=lsn1, tenant_id=tenant ) endpoint_branch = env.endpoints.create_start("test_branch", tenant_id=tenant) @@ -127,7 +127,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): env.storage_controller.allowed_errors.extend(error_regexes) # Disable background GC but set the `pitr_interval` to be small, so GC can delete something - tenant, _ = env.neon_cli.create_tenant( + tenant, _ = env.create_tenant( conf={ # disable background GC "gc_period": "0s", @@ -145,7 +145,7 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv): } ) - b0 = env.neon_cli.create_branch("b0", tenant_id=tenant) + b0 = env.create_branch("b0", tenant_id=tenant) endpoint0 = env.endpoints.create_start("b0", tenant_id=tenant) res = endpoint0.safe_psql_many( queries=[ @@ -176,7 +176,7 @@ def do_gc(): # The starting LSN is invalid as the corresponding record is scheduled to be removed by in-queue GC. 
with pytest.raises(Exception, match="invalid branch start lsn: .*"): - env.neon_cli.create_branch("b1", "b0", tenant_id=tenant, ancestor_start_lsn=lsn) + env.create_branch("b1", ancestor_branch_name="b0", ancestor_start_lsn=lsn, tenant_id=tenant) # retry the same with the HTTP API, so that we can inspect the status code with pytest.raises(TimelineCreate406): new_timeline_id = TimelineId.generate() diff --git a/test_runner/regress/test_branch_behind.py b/test_runner/regress/test_branch_behind.py index 2bf7041cf14b..cceb7b3d606d 100644 --- a/test_runner/regress/test_branch_behind.py +++ b/test_runner/regress/test_branch_behind.py @@ -23,7 +23,7 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): env.storage_controller.allowed_errors.extend(error_regexes) # Branch at the point where only 100 rows were inserted - branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind") + branch_behind_timeline_id = env.create_branch("test_branch_behind") endpoint_main = env.endpoints.create_start("test_branch_behind") main_cur = endpoint_main.connect().cursor() @@ -58,8 +58,10 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 200100 rows: {lsn_b}") # Branch at the point where only 100 rows were inserted - env.neon_cli.create_branch( - "test_branch_behind_hundred", "test_branch_behind", ancestor_start_lsn=lsn_a + env.create_branch( + "test_branch_behind_hundred", + ancestor_branch_name="test_branch_behind", + ancestor_start_lsn=lsn_a, ) # Insert many more rows. This generates enough WAL to fill a few segments. @@ -75,8 +77,10 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): log.info(f"LSN after 400100 rows: {lsn_c}") # Branch at the point where only 200100 rows were inserted - env.neon_cli.create_branch( - "test_branch_behind_more", "test_branch_behind", ancestor_start_lsn=lsn_b + env.create_branch( + "test_branch_behind_more", + ancestor_branch_name="test_branch_behind", + ancestor_start_lsn=lsn_b, ) endpoint_hundred = env.endpoints.create_start("test_branch_behind_hundred") @@ -97,15 +101,17 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() # branch at segment boundary - env.neon_cli.create_branch( - "test_branch_segment_boundary", "test_branch_behind", ancestor_start_lsn=Lsn("0/3000000") + env.create_branch( + "test_branch_segment_boundary", + ancestor_branch_name="test_branch_behind", + ancestor_start_lsn=Lsn("0/3000000"), ) endpoint = env.endpoints.create_start("test_branch_segment_boundary") assert endpoint.safe_psql("SELECT 1")[0][0] == 1 # branch at pre-initdb lsn (from main branch) with pytest.raises(Exception, match="invalid branch start lsn: .*"): - env.neon_cli.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) + env.create_branch("test_branch_preinitdb", ancestor_start_lsn=Lsn("0/42")) # retry the same with the HTTP API, so that we can inspect the status code with pytest.raises(TimelineCreate406): new_timeline_id = TimelineId.generate() @@ -116,8 +122,10 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): # branch at pre-ancestor lsn with pytest.raises(Exception, match="less than timeline ancestor lsn"): - env.neon_cli.create_branch( - "test_branch_preinitdb", "test_branch_behind", ancestor_start_lsn=Lsn("0/42") + env.create_branch( + "test_branch_preinitdb", + ancestor_branch_name="test_branch_behind", + ancestor_start_lsn=Lsn("0/42"), ) # retry the same with the HTTP API, so that we can inspect the status code with 
pytest.raises(TimelineCreate406): @@ -139,8 +147,10 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder): print_gc_result(gc_result) with pytest.raises(Exception, match="invalid branch start lsn: .*"): # this gced_lsn is pretty random, so if gc is disabled this woudln't fail - env.neon_cli.create_branch( - "test_branch_create_fail", "test_branch_behind", ancestor_start_lsn=gced_lsn + env.create_branch( + "test_branch_create_fail", + ancestor_branch_name="test_branch_behind", + ancestor_start_lsn=gced_lsn, ) # retry the same with the HTTP API, so that we can inspect the status code with pytest.raises(TimelineCreate406): diff --git a/test_runner/regress/test_branching.py b/test_runner/regress/test_branching.py index 3d5c34a5958b..8d07dfd511ce 100644 --- a/test_runner/regress/test_branching.py +++ b/test_runner/regress/test_branching.py @@ -38,7 +38,7 @@ def test_branching_with_pgbench( env = neon_simple_env # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test - tenant, _ = env.neon_cli.create_tenant( + tenant, _ = env.create_tenant( conf={ "gc_period": "5 s", "gc_horizon": f"{1024 ** 2}", @@ -55,7 +55,7 @@ def run_pgbench(connstr: str): pg_bin.run_capture(["pgbench", "-i", "-I", "dtGvp", f"-s{scale}", connstr]) pg_bin.run_capture(["pgbench", "-T15", connstr]) - env.neon_cli.create_branch("b0", tenant_id=tenant) + env.create_branch("b0", tenant_id=tenant) endpoints: List[Endpoint] = [] endpoints.append(env.endpoints.create_start("b0", tenant_id=tenant)) @@ -84,9 +84,9 @@ def run_pgbench(connstr: str): threads = [] if ty == "cascade": - env.neon_cli.create_branch(f"b{i + 1}", f"b{i}", tenant_id=tenant) + env.create_branch(f"b{i + 1}", ancestor_branch_name=f"b{i}", tenant_id=tenant) else: - env.neon_cli.create_branch(f"b{i + 1}", "b0", tenant_id=tenant) + env.create_branch(f"b{i + 1}", ancestor_branch_name="b0", tenant_id=tenant) endpoints.append(env.endpoints.create_start(f"b{i + 1}", tenant_id=tenant)) @@ -120,7 +120,7 @@ def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBi env = neon_simple_env - env.neon_cli.create_branch("b0") + env.create_branch("b0") endpoint0 = env.endpoints.create_start("b0") pg_bin.run_capture(["pgbench", "-i", endpoint0.connstr()]) @@ -133,7 +133,7 @@ def test_branching_unnormalized_start_lsn(neon_simple_env: NeonEnv, pg_bin: PgBi start_lsn = Lsn((int(curr_lsn) - XLOG_BLCKSZ) // XLOG_BLCKSZ * XLOG_BLCKSZ) log.info(f"Branching b1 from b0 starting at lsn {start_lsn}...") - env.neon_cli.create_branch("b1", "b0", ancestor_start_lsn=start_lsn) + env.create_branch("b1", ancestor_branch_name="b0", ancestor_start_lsn=start_lsn) endpoint1 = env.endpoints.create_start("b1") pg_bin.run_capture(["pgbench", "-i", endpoint1.connstr()]) @@ -173,7 +173,7 @@ def start_creating_timeline(): wait_until_paused(env, "before-upload-index-pausable") - env.neon_cli.map_branch(initial_branch, env.initial_tenant, env.initial_timeline) + env.neon_cli.mappings_map_branch(initial_branch, env.initial_tenant, env.initial_timeline) with pytest.raises(RuntimeError, match="ERROR: Not found: Timeline"): env.endpoints.create_start( @@ -432,9 +432,7 @@ def test_branching_while_stuck_find_gc_cutoffs(neon_env_builder: NeonEnvBuilder) wait_until_paused(env, failpoint) - env.neon_cli.create_branch( - tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" - ) + env.create_branch("branch", ancestor_branch_name="main") client.configure_failpoints((failpoint, "off")) diff --git 
a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py index 5ec9a22ba147..6b6af481aa22 100644 --- a/test_runner/regress/test_broken_timeline.py +++ b/test_runner/regress/test_broken_timeline.py @@ -34,7 +34,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): tenant_timelines: List[Tuple[TenantId, TimelineId, Endpoint]] = [] for _ in range(3): - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id, timeline_id = env.create_tenant() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) with endpoint.cursor() as cur: @@ -84,13 +84,11 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder): def test_create_multiple_timelines_parallel(neon_simple_env: NeonEnv): env = neon_simple_env - tenant_id, _ = env.neon_cli.create_tenant() + tenant_id, _ = env.create_tenant() with concurrent.futures.ThreadPoolExecutor(max_workers=4) as executor: futures = [ - executor.submit( - env.neon_cli.create_timeline, f"test-create-multiple-timelines-{i}", tenant_id - ) + executor.submit(env.create_timeline, f"test-create-multiple-timelines-{i}", tenant_id) for i in range(4) ] for future in futures: @@ -111,7 +109,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) tenant_id = env.initial_tenant timelines_dir = env.pageserver.timeline_dir(tenant_id) - old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + old_tenant_timelines = env.neon_cli.timeline_list(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed. @@ -123,7 +121,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder) env.pageserver.restart(immediate=True) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. - new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + new_tenant_timelines = env.neon_cli.timeline_list(tenant_id) assert ( new_tenant_timelines == old_tenant_timelines ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" @@ -151,11 +149,11 @@ def test_timeline_init_break_before_checkpoint_recreate( ] ) - env.neon_cli.create_tenant(env.initial_tenant) + env.create_tenant(env.initial_tenant) tenant_id = env.initial_tenant timelines_dir = env.pageserver.timeline_dir(tenant_id) - old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + old_tenant_timelines = env.neon_cli.timeline_list(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] # Some fixed timeline ID (like control plane does) @@ -176,7 +174,7 @@ def test_timeline_init_break_before_checkpoint_recreate( env.pageserver.restart(immediate=True) # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. 
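
The hunks above rename the typed wrapper from `list_timelines` to `timeline_list`. As an editorial sketch (not part of the diff), and assuming the renamed helper keeps the `(branch_name, timeline_id)` tuple shape the removed `list_timelines` docstring described, a caller might consume it like this; the helper name below is illustrative.

from fixtures.neon_fixtures import NeonEnv


def assert_initial_timeline_listed(env: NeonEnv) -> None:
    timelines = env.neon_cli.timeline_list(env.initial_tenant)
    # Each entry pairs a branch name with its timeline id.
    listed_ids = [timeline_id for _branch_name, timeline_id in timelines]
    assert env.initial_timeline in listed_ids
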
- new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + new_tenant_timelines = env.neon_cli.timeline_list(tenant_id) assert ( new_tenant_timelines == old_tenant_timelines ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" @@ -201,7 +199,7 @@ def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuild tenant_id = env.initial_tenant timelines_dir = env.pageserver.timeline_dir(tenant_id) - old_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + old_tenant_timelines = env.neon_cli.timeline_list(tenant_id) initial_timeline_dirs = [d for d in timelines_dir.iterdir()] # Introduce failpoint when creating a new timeline, right after creating its directory @@ -211,7 +209,7 @@ def test_timeline_create_break_after_dir_creation(neon_env_builder: NeonEnvBuild # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally. # "New" timeline is not present in the list, allowing pageserver to retry the same request - new_tenant_timelines = env.neon_cli.list_timelines(tenant_id) + new_tenant_timelines = env.neon_cli.timeline_list(tenant_id) assert ( new_tenant_timelines == old_tenant_timelines ), f"Pageserver after restart should ignore non-initialized timelines for tenant {tenant_id}" diff --git a/test_runner/regress/test_change_pageserver.py b/test_runner/regress/test_change_pageserver.py index 34791e59883c..d3aa49f3746a 100644 --- a/test_runner/regress/test_change_pageserver.py +++ b/test_runner/regress/test_change_pageserver.py @@ -34,7 +34,7 @@ def ignore_notify(request: Request): ignore_notify ) - env.neon_cli.create_branch("test_change_pageserver") + env.create_branch("test_change_pageserver") endpoint = env.endpoints.create_start("test_change_pageserver") # Put this tenant into a dual-attached state diff --git a/test_runner/regress/test_clog_truncate.py b/test_runner/regress/test_clog_truncate.py index 6e4880841a00..bfce795d14a7 100644 --- a/test_runner/regress/test_clog_truncate.py +++ b/test_runner/regress/test_clog_truncate.py @@ -56,8 +56,10 @@ def test_clog_truncate(neon_simple_env: NeonEnv): # create new branch after clog truncation and start a compute node on it log.info(f"create branch at lsn_after_truncation {lsn_after_truncation}") - env.neon_cli.create_branch( - "test_clog_truncate_new", "main", ancestor_start_lsn=lsn_after_truncation + env.create_branch( + "test_clog_truncate_new", + ancestor_branch_name="main", + ancestor_start_lsn=lsn_after_truncation, ) endpoint2 = env.endpoints.create_start("test_clog_truncate_new") diff --git a/test_runner/regress/test_close_fds.py b/test_runner/regress/test_close_fds.py index ce9ecb3dc4dd..3957d0b3b091 100644 --- a/test_runner/regress/test_close_fds.py +++ b/test_runner/regress/test_close_fds.py @@ -23,7 +23,7 @@ def test_lsof_pageserver_pid(neon_simple_env: NeonEnv): env = neon_simple_env def start_workload(): - env.neon_cli.create_branch("test_lsof_pageserver_pid") + env.create_branch("test_lsof_pageserver_pid") endpoint = env.endpoints.create_start("test_lsof_pageserver_pid") with closing(endpoint.connect()) as conn: with conn.cursor() as cur: diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index cb34551b53fc..98bd3a6a5fba 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -63,7 +63,10 @@ def test_pageserver_compaction_smoke(neon_env_builder: NeonEnvBuilder): log.info(f"Running churn round {i}/{churn_rounds} ...") 
workload.churn_rows(row_count, env.pageserver.id) - ps_http.timeline_compact(tenant_id, timeline_id) + # Force L0 compaction to ensure the number of layers is within bounds; we don't want to count L0 layers + # in this benchmark. In other words, this smoke test ensures the number of L1 layers is bounded. + ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True) + assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1 log.info("Validating at workload end ...") workload.validate(env.pageserver.id) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 066910562505..1f960b6b75b7 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -517,7 +517,7 @@ def test_historic_storage_formats( assert metadata_summary["tenant_count"] >= 1 assert metadata_summary["timeline_count"] >= 1 - env.neon_cli.import_tenant(dataset.tenant_id) + env.neon_cli.tenant_import(dataset.tenant_id) # Discover timelines timelines = env.pageserver.http_client().timeline_list(dataset.tenant_id) diff --git a/test_runner/regress/test_config.py b/test_runner/regress/test_config.py index d8ef0b8dbda9..5aba1f265fa2 100644 --- a/test_runner/regress/test_config.py +++ b/test_runner/regress/test_config.py @@ -38,7 +38,7 @@ def test_safekeepers_reconfigure_reorder( ): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_safekeepers_reconfigure_reorder") + env.create_branch("test_safekeepers_reconfigure_reorder") endpoint = env.endpoints.create_start("test_safekeepers_reconfigure_reorder") diff --git a/test_runner/regress/test_crafted_wal_end.py b/test_runner/regress/test_crafted_wal_end.py index 30f8d8189055..71369ab13145 100644 --- a/test_runner/regress/test_crafted_wal_end.py +++ b/test_runner/regress/test_crafted_wal_end.py @@ -1,6 +1,7 @@ import pytest from fixtures.log_helper import log -from fixtures.neon_fixtures import NeonEnvBuilder, WalCraft +from fixtures.neon_cli import WalCraft +from fixtures.neon_fixtures import NeonEnvBuilder # Restart nodes with WAL end having specially crafted shape, like last record # crossing segment boundary, to test decoding issues.
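Aside, not part of the patch: the recurring change across these test hunks is that branch/tenant/timeline creation moves from env.neon_cli onto NeonEnv itself, with the ancestor passed as a keyword-only argument, while the remaining env.neon_cli methods are renamed after their CLI subcommands (timeline_list, tenant_list, tenant_import, timeline_import). A minimal usage sketch in the new style; the fixture name (neon_simple_env) and helper calls are taken from the surrounding hunks, everything else is illustrative:

def test_branching_example(neon_simple_env):
    # Hypothetical test, for illustration only; not part of this patch.
    env = neon_simple_env
    # Previously: env.neon_cli.create_branch("child", "main")
    env.create_branch("child", ancestor_branch_name="main")
    endpoint = env.endpoints.create_start("child")
    endpoint.safe_psql("SELECT 1")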
@@ -18,7 +19,7 @@ ) def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_crafted_wal_end") + env.create_branch("test_crafted_wal_end") env.pageserver.allowed_errors.extend( [ # seems like pageserver stop triggers these @@ -27,7 +28,7 @@ def test_crafted_wal_end(neon_env_builder: NeonEnvBuilder, wal_type: str): ) endpoint = env.endpoints.create("test_crafted_wal_end") - wal_craft = WalCraft(env) + wal_craft = WalCraft(extra_env=None, binpath=env.neon_binpath) endpoint.config(wal_craft.postgres_config()) endpoint.start() res = endpoint.safe_psql_many( diff --git a/test_runner/regress/test_createdropdb.py b/test_runner/regress/test_createdropdb.py index af643f45d7ae..cdf048ac2680 100644 --- a/test_runner/regress/test_createdropdb.py +++ b/test_runner/regress/test_createdropdb.py @@ -31,7 +31,7 @@ def test_createdb(neon_simple_env: NeonEnv, strategy: str): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createdb2", "main", ancestor_start_lsn=lsn) + env.create_branch("test_createdb2", ancestor_branch_name="main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createdb2") # Test that you can connect to the new database on both branches @@ -77,10 +77,14 @@ def test_dropdb(neon_simple_env: NeonEnv, test_output_dir): lsn_after_drop = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create two branches before and after database drop. - env.neon_cli.create_branch("test_before_dropdb", "main", ancestor_start_lsn=lsn_before_drop) + env.create_branch( + "test_before_dropdb", ancestor_branch_name="main", ancestor_start_lsn=lsn_before_drop + ) endpoint_before = env.endpoints.create_start("test_before_dropdb") - env.neon_cli.create_branch("test_after_dropdb", "main", ancestor_start_lsn=lsn_after_drop) + env.create_branch( + "test_after_dropdb", ancestor_branch_name="main", ancestor_start_lsn=lsn_after_drop + ) endpoint_after = env.endpoints.create_start("test_after_dropdb") # Test that database exists on the branch before drop diff --git a/test_runner/regress/test_createuser.py b/test_runner/regress/test_createuser.py index d6f138e1266c..96b38f8fb046 100644 --- a/test_runner/regress/test_createuser.py +++ b/test_runner/regress/test_createuser.py @@ -18,7 +18,7 @@ def test_createuser(neon_simple_env: NeonEnv): lsn = query_scalar(cur, "SELECT pg_current_wal_insert_lsn()") # Create a branch - env.neon_cli.create_branch("test_createuser2", "main", ancestor_start_lsn=lsn) + env.create_branch("test_createuser2", ancestor_branch_name="main", ancestor_start_lsn=lsn) endpoint2 = env.endpoints.create_start("test_createuser2") # Test that you can connect to new branch as a new user diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 1fec8b3f18b2..4fcdef0ca3da 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -59,11 +59,11 @@ def set_min_resident_size(config): env.pageserver.stop() env.pageserver.start() - tenant_id, _ = env.neon_cli.create_tenant() + tenant_id, _ = env.create_tenant() assert_overrides(tenant_id, config_level_override) # Also ensure that specifying the paramter to create_tenant works, in addition to http-level recconfig. 
- tenant_id, _ = env.neon_cli.create_tenant(conf={"min_resident_size_override": "100"}) + tenant_id, _ = env.create_tenant(conf={"min_resident_size_override": "100"}) assert_config(tenant_id, 100, 100) ps_http.set_tenant_config(tenant_id, {}) assert_config(tenant_id, None, config_level_override) @@ -280,7 +280,7 @@ def _eviction_env( def pgbench_init_tenant( layer_size: int, scale: int, env: NeonEnv, pg_bin: PgBin ) -> Tuple[TenantId, TimelineId]: - tenant_id, timeline_id = env.neon_cli.create_tenant( + tenant_id, timeline_id = env.create_tenant( conf={ "gc_period": "0s", "compaction_period": "0s", diff --git a/test_runner/regress/test_download_extensions.py b/test_runner/regress/test_download_extensions.py index 7370eb145608..c89a82965ef7 100644 --- a/test_runner/regress/test_download_extensions.py +++ b/test_runner/regress/test_download_extensions.py @@ -81,7 +81,7 @@ def endpoint_handler_build_tag(request: Request) -> Response: # Start a compute node with remote_extension spec # and check that it can download the extensions and use them to CREATE EXTENSION. env = neon_env_builder_local.init_start() - env.neon_cli.create_branch("test_remote_extensions") + env.create_branch("test_remote_extensions") endpoint = env.endpoints.create( "test_remote_extensions", config_lines=["log_min_messages=debug3"], diff --git a/test_runner/regress/test_endpoint_crash.py b/test_runner/regress/test_endpoint_crash.py index ae3dded437a0..e34dfab6c493 100644 --- a/test_runner/regress/test_endpoint_crash.py +++ b/test_runner/regress/test_endpoint_crash.py @@ -15,7 +15,7 @@ def test_endpoint_crash(neon_env_builder: NeonEnvBuilder, sql_func: str): Test that triggering crash from neon_test_utils crashes the endpoint """ env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_endpoint_crash") + env.create_branch("test_endpoint_crash") endpoint = env.endpoints.create_start("test_endpoint_crash") endpoint.safe_psql("CREATE EXTENSION neon_test_utils;") diff --git a/test_runner/regress/test_fsm_truncate.py b/test_runner/regress/test_fsm_truncate.py index 80e4da8380e1..691f96ab0ab7 100644 --- a/test_runner/regress/test_fsm_truncate.py +++ b/test_runner/regress/test_fsm_truncate.py @@ -3,7 +3,7 @@ def test_fsm_truncate(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_fsm_truncate") + env.create_branch("test_fsm_truncate") endpoint = env.endpoints.create_start("test_fsm_truncate") endpoint.safe_psql( "CREATE TABLE t1(key int); CREATE TABLE t2(key int); TRUNCATE TABLE t1; TRUNCATE TABLE t2;" diff --git a/test_runner/regress/test_gc_aggressive.py b/test_runner/regress/test_gc_aggressive.py index 44133f2350e8..3d472f9720a2 100644 --- a/test_runner/regress/test_gc_aggressive.py +++ b/test_runner/regress/test_gc_aggressive.py @@ -68,7 +68,7 @@ async def update_table(endpoint: Endpoint): def test_gc_aggressive(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) - timeline = env.neon_cli.create_branch("test_gc_aggressive", "main") + timeline = env.create_branch("test_gc_aggressive", ancestor_branch_name="main") endpoint = env.endpoints.create_start("test_gc_aggressive") with endpoint.cursor() as cur: @@ -99,7 +99,7 @@ def test_gc_index_upload(neon_env_builder: NeonEnvBuilder): # Disable time-based pitr, we will use LSN-based thresholds in the manual GC calls env = 
neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_gc_index_upload", "main") + timeline_id = env.create_branch("test_gc_index_upload", ancestor_branch_name="main") endpoint = env.endpoints.create_start("test_gc_index_upload") pageserver_http = env.pageserver.http_client() diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py index 4385cfca7631..87b44e4e3eb8 100644 --- a/test_runner/regress/test_import.py +++ b/test_runner/regress/test_import.py @@ -98,27 +98,15 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build ) def import_tar(base, wal): - env.neon_cli.raw_cli( - [ - "timeline", - "import", - "--tenant-id", - str(tenant), - "--timeline-id", - str(timeline), - "--branch-name", - branch_name, - "--base-lsn", - start_lsn, - "--base-tarfile", - base, - "--end-lsn", - end_lsn, - "--wal-tarfile", - wal, - "--pg-version", - env.pg_version, - ] + env.neon_cli.timeline_import( + tenant_id=tenant, + timeline_id=timeline, + new_branch_name=branch_name, + base_tarfile=base, + base_lsn=start_lsn, + wal_tarfile=wal, + end_lsn=end_lsn, + pg_version=env.pg_version, ) # Importing empty file fails @@ -158,7 +146,7 @@ def test_import_from_pageserver_small( neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - timeline = env.neon_cli.create_branch("test_import_from_pageserver_small") + timeline = env.create_branch("test_import_from_pageserver_small") endpoint = env.endpoints.create_start("test_import_from_pageserver_small") num_rows = 3000 @@ -177,7 +165,7 @@ def test_import_from_pageserver_multisegment( neon_env_builder.enable_pageserver_remote_storage(RemoteStorageKind.LOCAL_FS) env = neon_env_builder.init_start() - timeline = env.neon_cli.create_branch("test_import_from_pageserver_multisegment") + timeline = env.create_branch("test_import_from_pageserver_multisegment") endpoint = env.endpoints.create_start("test_import_from_pageserver_multisegment") # For `test_import_from_pageserver_multisegment`, we want to make sure that the data @@ -268,23 +256,13 @@ def _import( branch_name = "import_from_pageserver" client = env.pageserver.http_client() env.pageserver.tenant_create(tenant) - env.neon_cli.raw_cli( - [ - "timeline", - "import", - "--tenant-id", - str(tenant), - "--timeline-id", - str(timeline), - "--branch-name", - branch_name, - "--base-lsn", - str(lsn), - "--base-tarfile", - str(tar_output_file), - "--pg-version", - env.pg_version, - ] + env.neon_cli.timeline_import( + tenant_id=tenant, + timeline_id=timeline, + new_branch_name=branch_name, + base_lsn=lsn, + base_tarfile=tar_output_file, + pg_version=env.pg_version, ) # Wait for data to land in s3 diff --git a/test_runner/regress/test_layer_eviction.py b/test_runner/regress/test_layer_eviction.py index 97093ea535c2..82cfe08bc092 100644 --- a/test_runner/regress/test_layer_eviction.py +++ b/test_runner/regress/test_layer_eviction.py @@ -178,9 +178,9 @@ def test_gc_of_remote_layers(neon_env_builder: NeonEnvBuilder): def tenant_update_config(changes): tenant_config.update(changes) - env.neon_cli.config_tenant(tenant_id, tenant_config) + env.config_tenant(tenant_id, tenant_config) - tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config) + tenant_id, timeline_id = env.create_tenant(conf=tenant_config) log.info("tenant id is %s", tenant_id) env.initial_tenant = tenant_id # update_and_gc relies on this ps_http = 
env.pageserver.http_client() diff --git a/test_runner/regress/test_layer_writers_fail.py b/test_runner/regress/test_layer_writers_fail.py index 7298635abe91..1711cc14142f 100644 --- a/test_runner/regress/test_layer_writers_fail.py +++ b/test_runner/regress/test_layer_writers_fail.py @@ -8,7 +8,7 @@ def test_image_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http = env.pageserver.http_client() - tenant_id, timeline_id = env.neon_cli.create_tenant( + tenant_id, timeline_id = env.create_tenant( conf={ # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1024 ** 2}", @@ -52,7 +52,7 @@ def test_delta_layer_writer_fail_before_finish(neon_simple_env: NeonEnv): env = neon_simple_env pageserver_http = env.pageserver.http_client() - tenant_id, timeline_id = env.neon_cli.create_tenant( + tenant_id, timeline_id = env.create_tenant( conf={ # small checkpoint distance to create more delta layer files "checkpoint_distance": f"{1024 ** 2}", diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 3b2218dd9b09..2857df8ef7bb 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -56,7 +56,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder): "compaction_target_size": f"{128 * (1024**3)}", # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers } - tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_config) + tenant_id, timeline_id = env.create_tenant(conf=tenant_config) endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index 15a3719e0b82..1aa1bdf36670 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -219,7 +219,7 @@ def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonE neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch("init") + env.create_branch("init") endpoint = env.endpoints.create_start("init") with endpoint.connect().cursor() as cur: @@ -270,7 +270,7 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch("init") + env.create_branch("init") endpoint = env.endpoints.create_start("init") with endpoint.connect().cursor() as cur: @@ -352,7 +352,7 @@ def test_lr_with_slow_safekeeper(neon_env_builder: NeonEnvBuilder, vanilla_pg): def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("init") + env.create_branch("init") endpoint = env.endpoints.create_start("init") tenant_id = endpoint.safe_psql("show neon.tenant_id")[0][0] timeline_id = endpoint.safe_psql("show neon.timeline_id")[0][0] @@ -397,7 +397,7 @@ def test_restart_endpoint(neon_simple_env: NeonEnv, vanilla_pg): def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): env = neon_simple_env - env.neon_cli.create_branch("init") + env.create_branch("init") endpoint = env.endpoints.create_start("init") cur = endpoint.connect().cursor() @@ -445,7 +445,7 @@ def test_large_records(neon_simple_env: NeonEnv, vanilla_pg): def test_slots_and_branching(neon_simple_env: NeonEnv): env = neon_simple_env - tenant, timeline = 
env.neon_cli.create_tenant() + tenant, timeline = env.create_tenant() env.pageserver.http_client() main_branch = env.endpoints.create_start("main", tenant_id=tenant) @@ -457,7 +457,7 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): wait_for_last_flush_lsn(env, main_branch, tenant, timeline) # Create branch ws. - env.neon_cli.create_branch("ws", "main", tenant_id=tenant) + env.create_branch("ws", ancestor_branch_name="main", tenant_id=tenant) ws_branch = env.endpoints.create_start("ws", tenant_id=tenant) # Check that we can create slot with the same name @@ -469,10 +469,10 @@ def test_slots_and_branching(neon_simple_env: NeonEnv): def test_replication_shutdown(neon_simple_env: NeonEnv): # Ensure Postgres can exit without stuck when a replication job is active + neon extension installed env = neon_simple_env - env.neon_cli.create_branch("test_replication_shutdown_publisher", "main") + env.create_branch("test_replication_shutdown_publisher", ancestor_branch_name="main") pub = env.endpoints.create("test_replication_shutdown_publisher") - env.neon_cli.create_branch("test_replication_shutdown_subscriber") + env.create_branch("test_replication_shutdown_subscriber") sub = env.endpoints.create("test_replication_shutdown_subscriber") pub.respec(skip_pg_catalog_updates=False) @@ -575,7 +575,7 @@ def test_subscriber_synchronous_commit(neon_simple_env: NeonEnv, vanilla_pg): vanilla_pg.start() vanilla_pg.safe_psql("create extension neon;") - env.neon_cli.create_branch("subscriber") + env.create_branch("subscriber") sub = env.endpoints.create("subscriber") sub.start() diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 67e82f8d309f..ab43e3214628 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -32,7 +32,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool): """ env = neon_env_builder.init_start() - tenant_id, _ = env.neon_cli.create_tenant( + tenant_id, _ = env.create_tenant( conf={ # disable default GC and compaction "gc_period": "1000 m", @@ -43,7 +43,7 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool): } ) - timeline_id = env.neon_cli.create_branch("test_lsn_mapping", tenant_id=tenant_id) + timeline_id = env.create_branch("test_lsn_mapping", tenant_id=tenant_id) endpoint_main = env.endpoints.create_start("test_lsn_mapping", tenant_id=tenant_id) timeline_id = endpoint_main.safe_psql("show neon.timeline_id")[0][0] @@ -123,8 +123,8 @@ def test_lsn_mapping(neon_env_builder: NeonEnvBuilder, with_lease: bool): endpoint_here.stop_and_destroy() # Do the "past" check again at a new branch to ensure that we don't return something before the branch cutoff - timeline_id_child = env.neon_cli.create_branch( - "test_lsn_mapping_child", tenant_id=tenant_id, ancestor_branch_name="test_lsn_mapping" + timeline_id_child = env.create_branch( + "test_lsn_mapping_child", ancestor_branch_name="test_lsn_mapping", tenant_id=tenant_id ) # Timestamp is in the unreachable past @@ -190,7 +190,7 @@ def test_ts_of_lsn_api(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch("test_ts_of_lsn_api") + new_timeline_id = env.create_branch("test_ts_of_lsn_api") endpoint_main = env.endpoints.create_start("test_ts_of_lsn_api") cur = endpoint_main.connect().cursor() diff --git a/test_runner/regress/test_multixact.py b/test_runner/regress/test_multixact.py index 8a00f8835fe4..742d03e46470 100644 --- 
a/test_runner/regress/test_multixact.py +++ b/test_runner/regress/test_multixact.py @@ -72,9 +72,7 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir): assert int(next_multixact_id) > int(next_multixact_id_old) # Branch at this point - env.neon_cli.create_branch( - "test_multixact_new", ancestor_branch_name="main", ancestor_start_lsn=lsn - ) + env.create_branch("test_multixact_new", ancestor_branch_name="main", ancestor_start_lsn=lsn) endpoint_new = env.endpoints.create_start("test_multixact_new") next_multixact_id_new = endpoint_new.safe_psql( diff --git a/test_runner/regress/test_neon_cli.py b/test_runner/regress/test_neon_cli.py index 96543f1ef59e..04780ebcf155 100644 --- a/test_runner/regress/test_neon_cli.py +++ b/test_runner/regress/test_neon_cli.py @@ -31,7 +31,7 @@ def helper_compare_timeline_list( ) ) - timelines_cli = env.neon_cli.list_timelines(initial_tenant) + timelines_cli = env.neon_cli.timeline_list(initial_tenant) cli_timeline_ids = sorted([timeline_id for (_, timeline_id) in timelines_cli]) assert timelines_api == cli_timeline_ids @@ -44,17 +44,19 @@ def test_cli_timeline_list(neon_simple_env: NeonEnv): helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a branch for us - main_timeline_id = env.neon_cli.create_branch("test_cli_branch_list_main") + main_timeline_id = env.create_branch("test_cli_branch_list_main") helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Create a nested branch - nested_timeline_id = env.neon_cli.create_branch( - "test_cli_branch_list_nested", "test_cli_branch_list_main" + nested_timeline_id = env.create_branch( + "test_cli_branch_list_nested", ancestor_branch_name="test_cli_branch_list_main" ) helper_compare_timeline_list(pageserver_http_client, env, env.initial_tenant) # Check that all new branches are visible via CLI - timelines_cli = [timeline_id for (_, timeline_id) in env.neon_cli.list_timelines()] + timelines_cli = [ + timeline_id for (_, timeline_id) in env.neon_cli.timeline_list(env.initial_tenant) + ] assert main_timeline_id in timelines_cli assert nested_timeline_id in timelines_cli @@ -64,7 +66,7 @@ def helper_compare_tenant_list(pageserver_http_client: PageserverHttpClient, env tenants = pageserver_http_client.tenant_list() tenants_api = sorted(map(lambda t: cast(str, t["id"]), tenants)) - res = env.neon_cli.list_tenants() + res = env.neon_cli.tenant_list() tenants_cli = sorted(map(lambda t: t.split()[0], res.stdout.splitlines())) assert tenants_api == tenants_cli @@ -77,18 +79,18 @@ def test_cli_tenant_list(neon_simple_env: NeonEnv): helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant1, _ = env.neon_cli.create_tenant() + tenant1, _ = env.create_tenant() # check tenant1 appeared helper_compare_tenant_list(pageserver_http_client, env) # Create new tenant - tenant2, _ = env.neon_cli.create_tenant() + tenant2, _ = env.create_tenant() # check tenant2 appeared helper_compare_tenant_list(pageserver_http_client, env) - res = env.neon_cli.list_tenants() + res = env.neon_cli.tenant_list() tenants = sorted(map(lambda t: TenantId(t.split()[0]), res.stdout.splitlines())) assert env.initial_tenant in tenants @@ -98,8 +100,8 @@ def test_cli_tenant_list(neon_simple_env: NeonEnv): def test_cli_tenant_create(neon_simple_env: NeonEnv): env = neon_simple_env - tenant_id, _ = env.neon_cli.create_tenant() - timelines = env.neon_cli.list_timelines(tenant_id) + tenant_id, _ = env.create_tenant() + timelines = env.neon_cli.timeline_list(tenant_id) # 
an initial timeline should be created upon tenant creation assert len(timelines) == 1 @@ -132,7 +134,7 @@ def test_cli_start_stop(neon_env_builder: NeonEnvBuilder): env.neon_cli.pageserver_stop(env.pageserver.id) env.neon_cli.safekeeper_stop() env.neon_cli.storage_controller_stop(False) - env.neon_cli.broker_stop() + env.neon_cli.storage_broker_stop() # Keep NeonEnv state up to date, it usually owns starting/stopping services env.pageserver.running = False @@ -175,7 +177,7 @@ def test_cli_start_stop_multi(neon_env_builder: NeonEnvBuilder): # Stop this to get out of the way of the following `start` env.neon_cli.storage_controller_stop(False) - env.neon_cli.broker_stop() + env.neon_cli.storage_broker_stop() # Default start res = env.neon_cli.raw_cli(["start"]) diff --git a/test_runner/regress/test_neon_extension.py b/test_runner/regress/test_neon_extension.py index 619fd83c9bd8..a99e9e15af9d 100644 --- a/test_runner/regress/test_neon_extension.py +++ b/test_runner/regress/test_neon_extension.py @@ -8,7 +8,7 @@ # Verify that the neon extension is installed and has the correct version. def test_neon_extension(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_create_extension_neon") + env.create_branch("test_create_extension_neon") endpoint_main = env.endpoints.create("test_create_extension_neon") # don't skip pg_catalog updates - it runs CREATE EXTENSION neon @@ -35,7 +35,7 @@ def test_neon_extension(neon_env_builder: NeonEnvBuilder): # Verify that the neon extension can be upgraded/downgraded. def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_neon_extension_compatibility") + env.create_branch("test_neon_extension_compatibility") endpoint_main = env.endpoints.create("test_neon_extension_compatibility") # don't skip pg_catalog updates - it runs CREATE EXTENSION neon @@ -72,7 +72,7 @@ def test_neon_extension_compatibility(neon_env_builder: NeonEnvBuilder): # Verify that the neon extension can be auto-upgraded to the latest version. 
def test_neon_extension_auto_upgrade(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_neon_extension_auto_upgrade") + env.create_branch("test_neon_extension_auto_upgrade") endpoint_main = env.endpoints.create("test_neon_extension_auto_upgrade") # don't skip pg_catalog updates - it runs CREATE EXTENSION neon diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 8edba49b8a6f..0fdc5960e394 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -1,4 +1,5 @@ import pytest +from fixtures.common_types import TimelineId from fixtures.neon_fixtures import NeonEnvBuilder from fixtures.port_distributor import PortDistributor @@ -10,22 +11,36 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por # Skipping the init step that creates a local tenant in Pytest tests try: env.neon_cli.start() - env.neon_cli.create_tenant(tenant_id=env.initial_tenant, set_default=True) + env.create_tenant(tenant_id=env.initial_tenant, set_default=True) main_branch_name = "main" pg_port = port_distributor.get_port() http_port = port_distributor.get_port() env.neon_cli.endpoint_create( - main_branch_name, pg_port, http_port, endpoint_id="ep-basic-main" + main_branch_name, + pg_port, + http_port, + endpoint_id="ep-basic-main", + tenant_id=env.initial_tenant, + pg_version=env.pg_version, ) env.neon_cli.endpoint_start("ep-basic-main") branch_name = "migration-check" - env.neon_cli.create_branch(branch_name) + env.neon_cli.timeline_branch( + tenant_id=env.initial_tenant, + timeline_id=TimelineId.generate(), + new_branch_name=branch_name, + ) pg_port = port_distributor.get_port() http_port = port_distributor.get_port() env.neon_cli.endpoint_create( - branch_name, pg_port, http_port, endpoint_id=f"ep-{branch_name}" + branch_name, + pg_port, + http_port, + endpoint_id=f"ep-{branch_name}", + tenant_id=env.initial_tenant, + pg_version=env.pg_version, ) env.neon_cli.endpoint_start(f"ep-{branch_name}") finally: @@ -43,12 +58,26 @@ def test_neon_two_primary_endpoints_fail( pg_port = port_distributor.get_port() http_port = port_distributor.get_port() - env.neon_cli.endpoint_create(branch_name, pg_port, http_port, "ep1") + env.neon_cli.endpoint_create( + branch_name, + pg_port, + http_port, + endpoint_id="ep1", + tenant_id=env.initial_tenant, + pg_version=env.pg_version, + ) pg_port = port_distributor.get_port() http_port = port_distributor.get_port() # ep1 is not running so create will succeed - env.neon_cli.endpoint_create(branch_name, pg_port, http_port, "ep2") + env.neon_cli.endpoint_create( + branch_name, + pg_port, + http_port, + endpoint_id="ep2", + tenant_id=env.initial_tenant, + pg_version=env.pg_version, + ) env.neon_cli.endpoint_start("ep1") diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 7825ec772c9f..dc1c9d3fd9a4 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -6,10 +6,10 @@ def test_neon_superuser(neon_simple_env: NeonEnv, pg_version: PgVersion): env = neon_simple_env - env.neon_cli.create_branch("test_neon_superuser_publisher", "main") + env.create_branch("test_neon_superuser_publisher", ancestor_branch_name="main") pub = env.endpoints.create("test_neon_superuser_publisher") - env.neon_cli.create_branch("test_neon_superuser_subscriber") + env.create_branch("test_neon_superuser_subscriber") sub = 
env.endpoints.create("test_neon_superuser_subscriber") pub.respec(skip_pg_catalog_updates=False) diff --git a/test_runner/regress/test_normal_work.py b/test_runner/regress/test_normal_work.py index 50de99adb54a..54433769fd3c 100644 --- a/test_runner/regress/test_normal_work.py +++ b/test_runner/regress/test_normal_work.py @@ -5,7 +5,7 @@ def check_tenant(env: NeonEnv, pageserver_http: PageserverHttpClient): - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id, timeline_id = env.create_tenant() endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) # we rely upon autocommit after each statement res_1 = endpoint.safe_psql_many( diff --git a/test_runner/regress/test_old_request_lsn.py b/test_runner/regress/test_old_request_lsn.py index f1dd3fb67d37..dfd0271c1077 100644 --- a/test_runner/regress/test_old_request_lsn.py +++ b/test_runner/regress/test_old_request_lsn.py @@ -17,7 +17,7 @@ def test_old_request_lsn(neon_env_builder: NeonEnvBuilder): # Disable pitr, because here we want to test branch creation after GC env = neon_env_builder.init_start(initial_tenant_conf={"pitr_interval": "0 sec"}) - env.neon_cli.create_branch("test_old_request_lsn", "main") + env.create_branch("test_old_request_lsn", ancestor_branch_name="main") endpoint = env.endpoints.create_start("test_old_request_lsn") pg_conn = endpoint.connect() diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index c8249bb2cec6..0d712d06f1dd 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -545,7 +545,7 @@ def downloaded_bytes_and_count(pageserver_http: PageserverHttpClient) -> Tuple[i layer_sizes += layer.layer_file_size pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name) - env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"}) + env.config_tenant(tenant_id, {"compaction_threshold": "3"}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) @@ -647,7 +647,7 @@ def test_compaction_downloads_on_demand_with_image_creation(neon_env_builder: Ne # layers -- threshold of 2 would sound more reasonable, but keeping it as 1 # to be less flaky conf["image_creation_threshold"] = "1" - env.neon_cli.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()}) + env.config_tenant(tenant_id, {k: str(v) for k, v in conf.items()}) pageserver_http.timeline_compact(tenant_id, timeline_id) layers = pageserver_http.layer_map_info(tenant_id, timeline_id) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 28dbf40bed85..a19bc785f84b 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -59,7 +59,7 @@ def check_client(env: NeonEnv, client: PageserverHttpClient): def test_pageserver_http_get_wal_receiver_not_found(neon_simple_env: NeonEnv): env = neon_simple_env with env.pageserver.http_client() as client: - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id, timeline_id = env.create_tenant() timeline_details = client.timeline_detail( tenant_id=tenant_id, timeline_id=timeline_id, include_non_incremental_logical_size=True @@ -108,7 +108,7 @@ def expect_updated_msg_lsn( def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): env = neon_simple_env with env.pageserver.http_client() as client: - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id, timeline_id = 
env.create_tenant() endpoint = env.endpoints.create_start(DEFAULT_BRANCH_NAME, tenant_id=tenant_id) # insert something to force sk -> ps message diff --git a/test_runner/regress/test_pageserver_catchup.py b/test_runner/regress/test_pageserver_catchup.py index c16cbcb4bac0..d02010443152 100644 --- a/test_runner/regress/test_pageserver_catchup.py +++ b/test_runner/regress/test_pageserver_catchup.py @@ -9,7 +9,7 @@ def test_pageserver_catchup_while_compute_down(neon_env_builder: NeonEnvBuilder) neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_pageserver_catchup_while_compute_down") + env.create_branch("test_pageserver_catchup_while_compute_down") # Make shared_buffers large to ensure we won't query pageserver while it is down. endpoint = env.endpoints.create_start( "test_pageserver_catchup_while_compute_down", config_lines=["shared_buffers=512MB"] diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 96521b5684ff..a135b3da1ab5 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -150,7 +150,7 @@ def remove_control_plane_api_field(config): env.pageserver.start() env.storage_controller.node_configure(env.pageserver.id, {"availability": "Active"}) - env.neon_cli.create_tenant( + env.create_tenant( tenant_id=env.initial_tenant, conf=TENANT_CONF, timeline_id=env.initial_timeline ) @@ -643,9 +643,7 @@ def test_upgrade_generationless_local_file_paths( tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.neon_cli.create_tenant( - tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' - ) + env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}') workload = Workload(env, tenant_id, timeline_id) workload.init() diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index f6404d68ac1a..8c6e563357c6 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -42,7 +42,7 @@ async def run_worker_for_tenant( async def run_worker(env: NeonEnv, tenant_conf, entries: int) -> Tuple[TenantId, TimelineId, Lsn]: - tenant, timeline = env.neon_cli.create_tenant(conf=tenant_conf) + tenant, timeline = env.create_tenant(conf=tenant_conf) last_flush_lsn = await run_worker_for_tenant(env, entries, tenant) return tenant, timeline, last_flush_lsn diff --git a/test_runner/regress/test_pageserver_reconnect.py b/test_runner/regress/test_pageserver_reconnect.py index ada6da98ffd2..7f10c36db8cb 100644 --- a/test_runner/regress/test_pageserver_reconnect.py +++ b/test_runner/regress/test_pageserver_reconnect.py @@ -14,7 +14,7 @@ # least the code gets exercised. 
def test_pageserver_reconnect(neon_simple_env: NeonEnv, pg_bin: PgBin): env = neon_simple_env - env.neon_cli.create_branch("test_pageserver_restarts") + env.create_branch("test_pageserver_restarts") endpoint = env.endpoints.create_start("test_pageserver_restarts") n_reconnects = 1000 timeout = 0.01 @@ -46,7 +46,7 @@ def run_pgbench(connstr: str): # Test handling errors during page server reconnect def test_pageserver_reconnect_failure(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("test_pageserver_reconnect") + env.create_branch("test_pageserver_reconnect") endpoint = env.endpoints.create_start("test_pageserver_reconnect") con = endpoint.connect() diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index bd47a3042892..86313ca91eaf 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -169,7 +169,7 @@ def test_pageserver_chaos( # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. - tenant, _ = env.neon_cli.create_tenant( + tenant, _ = env.create_tenant( conf={ "checkpoint_distance": "5000000", } diff --git a/test_runner/regress/test_pageserver_restarts_under_workload.py b/test_runner/regress/test_pageserver_restarts_under_workload.py index 9bb9b373ad46..637e1a87d3da 100644 --- a/test_runner/regress/test_pageserver_restarts_under_workload.py +++ b/test_runner/regress/test_pageserver_restarts_under_workload.py @@ -12,7 +12,7 @@ # running. def test_pageserver_restarts_under_worload(neon_simple_env: NeonEnv, pg_bin: PgBin): env = neon_simple_env - env.neon_cli.create_branch("test_pageserver_restarts") + env.create_branch("test_pageserver_restarts") endpoint = env.endpoints.create_start("test_pageserver_restarts") n_restarts = 10 scale = 10 diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index 8746b88a7586..cd772beace08 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -650,7 +650,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() timeline_a = TimelineId.generate() timeline_b = TimelineId.generate() - env.neon_cli.create_tenant( + env.create_tenant( tenant_id, timeline_a, placement_policy='{"Attached":1}', @@ -658,7 +658,7 @@ def test_secondary_background_downloads(neon_env_builder: NeonEnvBuilder): # to trigger the upload promptly. 
conf={"heatmap_period": f"{upload_period_secs}s"}, ) - env.neon_cli.create_timeline("main2", tenant_id, timeline_b) + env.create_timeline("main2", tenant_id, timeline_b) tenant_timelines[tenant_id] = [timeline_a, timeline_b] @@ -778,9 +778,7 @@ def test_slow_secondary_downloads(neon_env_builder: NeonEnvBuilder, via_controll tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.neon_cli.create_tenant( - tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}' - ) + env.create_tenant(tenant_id, timeline_id, conf=TENANT_CONF, placement_policy='{"Attached":1}') attached_to_id = env.storage_controller.locate(tenant_id)[0]["node_id"] ps_attached = env.get_pageserver(attached_to_id) diff --git a/test_runner/regress/test_pitr_gc.py b/test_runner/regress/test_pitr_gc.py index 7e676b55154f..871a31b9ba04 100644 --- a/test_runner/regress/test_pitr_gc.py +++ b/test_runner/regress/test_pitr_gc.py @@ -57,7 +57,7 @@ def test_pitr_gc(neon_env_builder: NeonEnvBuilder): # Branch at the point where only 100 rows were inserted # It must have been preserved by PITR setting - env.neon_cli.create_branch("test_pitr_gc_hundred", "main", ancestor_start_lsn=lsn_a) + env.create_branch("test_pitr_gc_hundred", ancestor_branch_name="main", ancestor_start_lsn=lsn_a) endpoint_hundred = env.endpoints.create_start("test_pitr_gc_hundred") diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index e21f9bb6f625..855610345843 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -25,7 +25,7 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): ) # Create a branch for us - env.neon_cli.create_branch("test_pageserver_recovery", "main") + env.create_branch("test_pageserver_recovery", ancestor_branch_name="main") endpoint = env.endpoints.create_start("test_pageserver_recovery") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 0a57fc960563..c955dce4dcb1 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -230,7 +230,7 @@ def test_remote_storage_upload_queue_retries( # create tenant with config that will determinstically allow # compaction and gc - tenant_id, timeline_id = env.neon_cli.create_tenant( + tenant_id, timeline_id = env.create_tenant( conf={ # small checkpointing and compaction targets to ensure we generate many upload operations "checkpoint_distance": f"{64 * 1024}", @@ -640,7 +640,9 @@ def test_empty_branch_remote_storage_upload(neon_env_builder: NeonEnvBuilder): client = env.pageserver.http_client() new_branch_name = "new_branch" - new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant) + new_branch_timeline_id = env.create_branch( + new_branch_name, ancestor_branch_name="main", tenant_id=env.initial_tenant + ) assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id) timelines_before_detach = set( diff --git a/test_runner/regress/test_s3_restore.py b/test_runner/regress/test_s3_restore.py index c1a80a54bce3..721c391544cf 100644 --- a/test_runner/regress/test_s3_restore.py +++ b/test_runner/regress/test_s3_restore.py @@ -60,9 +60,7 @@ def test_tenant_s3_restore( last_flush_lsns = [] for timeline in ["first", "second"]: - timeline_id = env.neon_cli.create_branch( - timeline, tenant_id=tenant_id, ancestor_branch_name=parent - ) + timeline_id = env.create_branch(timeline, ancestor_branch_name=parent, tenant_id=tenant_id) with 
env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: run_pg_bench_small(pg_bin, endpoint.connstr()) endpoint.safe_psql(f"CREATE TABLE created_{timeline}(id integer);") diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 1eb33b2d39ca..a3d4b5baca12 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -77,7 +77,7 @@ def get_sizes(): assert all(s < expect_initdb_size // 2 for s in sizes.values()) # Test that timeline creation works on a sharded tenant - timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + timeline_b = env.create_branch("branch_b", tenant_id=tenant_id) # Test that we can write data to a sharded tenant workload = Workload(env, tenant_id, timeline_b, branch_name="branch_b") @@ -378,7 +378,7 @@ def test_sharding_split_smoke( env.start() tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.neon_cli.create_tenant( + env.create_tenant( tenant_id, timeline_id, shard_count=shard_count, @@ -1127,7 +1127,7 @@ def test_sharding_split_failures( timeline_id = TimelineId.generate() # Create a tenant with secondary locations enabled - env.neon_cli.create_tenant( + env.create_tenant( tenant_id, timeline_id, shard_count=initial_shard_count, placement_policy='{"Attached":1}' ) @@ -1441,7 +1441,7 @@ def test_sharding_unlogged_relation(neon_env_builder: NeonEnvBuilder): tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=8) + env.create_tenant(tenant_id, timeline_id, shard_count=8) # We will create many tables to ensure it's overwhelmingly likely that at least one # of them doesn't land on shard 0 @@ -1483,7 +1483,7 @@ def test_top_tenants(neon_env_builder: NeonEnvBuilder): for i in range(0, n_tenants): tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.neon_cli.create_tenant(tenant_id, timeline_id) + env.create_tenant(tenant_id, timeline_id) # Write a different amount of data to each tenant w = Workload(env, tenant_id, timeline_id) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 789623cb272d..016d36301be5 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -96,7 +96,7 @@ def test_storage_controller_smoke( # Creating several tenants should spread out across the pageservers for tid in tenant_ids: - env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + env.create_tenant(tid, shard_count=shards_per_tenant) # Repeating a creation should be idempotent (we are just testing it doesn't return an error) env.storage_controller.tenant_create( @@ -172,7 +172,7 @@ def node_evacuated(node_id: int) -> None: # Create some fresh tenants tenant_ids = set(TenantId.generate() for i in range(0, tenant_count)) for tid in tenant_ids: - env.neon_cli.create_tenant(tid, shard_count=shards_per_tenant) + env.create_tenant(tid, shard_count=shards_per_tenant) counts = get_node_shard_counts(env, tenant_ids) # Nothing should have been scheduled on the node in Draining @@ -806,10 +806,7 @@ def test_storage_controller_s3_time_travel_recovery( env.storage_controller.consistency_check() branch_name = "main" - timeline_id = env.neon_cli.create_timeline( - branch_name, - tenant_id=tenant_id, - ) + timeline_id = env.create_timeline(branch_name, tenant_id=tenant_id) # Write some nontrivial amount of data into the endpoint and wait until it is uploaded 
with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: run_pg_bench_small(pg_bin, endpoint.connstr()) @@ -1009,9 +1006,7 @@ def test_storage_controller_tenant_deletion( tenant_id = TenantId.generate() timeline_id = TimelineId.generate() - env.neon_cli.create_tenant( - tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}' - ) + env.create_tenant(tenant_id, timeline_id, shard_count=2, placement_policy='{"Attached":1}') # Ensure all the locations are configured, including secondaries env.storage_controller.reconcile_until_idle() @@ -1217,10 +1212,7 @@ def create_tenant(tid: TenantId): env.storage_controller.tenant_create(tid) branch_name = "main" - env.neon_cli.create_timeline( - branch_name, - tenant_id=tid, - ) + env.create_timeline(branch_name, tenant_id=tid) with env.endpoints.create_start("main", tenant_id=tid) as endpoint: run_pg_bench_small(pg_bin, endpoint.connstr()) @@ -1322,9 +1314,9 @@ def test_storage_controller_re_attach(neon_env_builder: NeonEnvBuilder): # We'll have two tenants. tenant_a = TenantId.generate() - env.neon_cli.create_tenant(tenant_a, placement_policy='{"Attached":1}') + env.create_tenant(tenant_a, placement_policy='{"Attached":1}') tenant_b = TenantId.generate() - env.neon_cli.create_tenant(tenant_b, placement_policy='{"Attached":1}') + env.create_tenant(tenant_b, placement_policy='{"Attached":1}') # Each pageserver will have one attached and one secondary location env.storage_controller.tenant_shard_migrate( @@ -1647,7 +1639,7 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto # Create a second timeline to ensure that import finds both timeline_a = env.initial_timeline - timeline_b = env.neon_cli.create_branch("branch_b", tenant_id=tenant_id) + timeline_b = env.create_branch("branch_b", tenant_id=tenant_id) workload_a = Workload(env, tenant_id, timeline_a, branch_name="main") workload_a.init() @@ -1689,7 +1681,7 @@ def test_tenant_import(neon_env_builder: NeonEnvBuilder, shard_count, remote_sto ) # Now import it again - env.neon_cli.import_tenant(tenant_id) + env.neon_cli.tenant_import(tenant_id) # Check we found the shards describe = env.storage_controller.tenant_describe(tenant_id) @@ -1731,7 +1723,7 @@ def test_graceful_cluster_restart(neon_env_builder: NeonEnvBuilder): for _ in range(0, tenant_count): tid = TenantId.generate() tenant_ids.append(tid) - env.neon_cli.create_tenant( + env.create_tenant( tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant ) @@ -1818,7 +1810,7 @@ def test_skip_drain_on_secondary_lag(neon_env_builder: NeonEnvBuilder, pg_bin: P env = neon_env_builder.init_configs() env.start() - tid, timeline_id = env.neon_cli.create_tenant(placement_policy='{"Attached":1}') + tid, timeline_id = env.create_tenant(placement_policy='{"Attached":1}') # Give things a chance to settle. 
env.storage_controller.reconcile_until_idle(timeout_secs=30) @@ -1924,7 +1916,7 @@ def test_background_operation_cancellation(neon_env_builder: NeonEnvBuilder): for _ in range(0, tenant_count): tid = TenantId.generate() tenant_ids.append(tid) - env.neon_cli.create_tenant( + env.create_tenant( tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant ) @@ -1984,7 +1976,7 @@ def test_storage_controller_node_deletion( for _ in range(0, tenant_count): tid = TenantId.generate() tenant_ids.append(tid) - env.neon_cli.create_tenant( + env.create_tenant( tid, placement_policy='{"Attached":1}', shard_count=shard_count_per_tenant ) @@ -2109,7 +2101,7 @@ def update_and_query_metadata_health( ) # Mock tenant with unhealthy scrubber scan result - tenant_b, _ = env.neon_cli.create_tenant(shard_count=shard_count) + tenant_b, _ = env.create_tenant(shard_count=shard_count) tenant_b_shard_ids = ( env.storage_controller.tenant_shard_split(tenant_b, shard_count=shard_count) if shard_count is not None @@ -2117,7 +2109,7 @@ def update_and_query_metadata_health( ) # Mock tenant that never gets a health update from scrubber - tenant_c, _ = env.neon_cli.create_tenant(shard_count=shard_count) + tenant_c, _ = env.create_tenant(shard_count=shard_count) tenant_c_shard_ids = ( env.storage_controller.tenant_shard_split(tenant_c, shard_count=shard_count) @@ -2517,7 +2509,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB tenant_id = env.initial_tenant timeline_id = env.initial_timeline - env.neon_cli.create_tenant(tenant_id, timeline_id) + env.create_tenant(tenant_id, timeline_id) env.storage_controller.pageserver_api().set_tenant_config(tenant_id, TENANT_CONF) # Write enough data that a compaction would do some work (deleting some L0s) @@ -2613,6 +2605,9 @@ def has_hit_migration_failpoint(): class MigrationFailpoints(Enum): # While only the origin is attached PRE_GENERATION_INC = "reconciler-live-migrate-pre-generation-inc" + # While only the origin is attached and the db was updated to + # point to the new location + PRE_AWAIT_LSN = "reconciler-live-migrate-pre-await-lsn" # While both locations are attached POST_NOTIFY = "reconciler-live-migrate-post-notify" # While only the destination is attached @@ -2638,12 +2633,24 @@ def test_storage_controller_proxy_during_migration( """ neon_env_builder.num_pageservers = 2 neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + + neon_env_builder.storage_controller_config = { + # Publish long reconcile metric early + "long_reconcile_threshold": "5s", + } + env = neon_env_builder.init_configs() env.start() tenant_id = env.initial_tenant timeline_id = env.initial_timeline - env.neon_cli.create_tenant(tenant_id, timeline_id) + env.create_tenant(tenant_id, timeline_id) + + # The test stalls a reconcile on purpose to check if the long running + # reconcile alert fires. + env.storage_controller.allowed_errors.extend( + [".*Reconcile passed the long running threshold.*"] + ) # Activate a failpoint that will cause live migration to get stuck _after_ the generation has been issued # to the new pageserver: this should result in requests routed to the new pageserver. 
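Aside, not part of the patch: the two tests added in the hunks below (test_timeline_delete_mid_live_migration and test_multi_attached_timeline_creation) share one structure: pause a failpoint, start the shard migration in a worker thread, act while the reconciler is parked at the failpoint, then unpause and join. A condensed sketch of that pattern; wait_until's import path is an assumption, all other identifiers are taken from the hunks that follow:

import concurrent.futures

from fixtures.log_helper import log
from fixtures.utils import wait_until  # assumed import path for the polling helper


def migrate_while_paused(env, shard_zero, secondary_location, migration_failpoint, act):
    # Park the live migration at the chosen failpoint.
    env.storage_controller.configure_failpoints((migration_failpoint.value, "pause"))
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
            migrate_fut = executor.submit(
                env.storage_controller.tenant_shard_migrate,
                shard_zero,
                secondary_location,
            )

            def has_hit_migration_failpoint():
                expr = f"at failpoint {migration_failpoint.value}"
                log.info(expr)
                assert env.storage_controller.log_contains(expr)

            # Poll the storage controller log until the migration is parked.
            wait_until(10, 1, has_hit_migration_failpoint)

            # Exercise the API (timeline delete/create in the tests below)
            # while origin and destination are in a transitional state.
            act()

            # Let the migration finish.
            env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
            migrate_fut.result()
    except:
        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown.
        env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
        raise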
@@ -2652,6 +2659,24 @@ def test_storage_controller_proxy_during_migration( origin_pageserver = env.get_tenant_pageserver(tenant_id) dest_ps_id = [p.id for p in env.pageservers if p.id != origin_pageserver.id][0] + def long_migration_metric_published(): + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_long_running_total", + filter={"tenant_id": str(tenant_id), "shard_number": "0"}, + ) + == 1 + ) + + def assert_long_migration_metric_not_published(): + assert ( + env.storage_controller.get_metric_value( + "storage_controller_reconcile_long_running_total", + filter={"tenant_id": str(tenant_id), "shard_number": "0"}, + ) + is None + ) + try: with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: migrate_fut = executor.submit( @@ -2682,9 +2707,14 @@ def has_hit_migration_failpoint(): # We expect request to land on the origin assert tenant_info["generation"] == 1 + wait_until(10, 1, long_migration_metric_published) + # Eventually migration completes env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) migrate_fut.result() + + assert_long_migration_metric_not_published() + except: # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) @@ -2793,7 +2823,7 @@ def assign_az(ps_cfg): # Generate a layer to avoid shard split handling on ps from tripping # up on debug assert. timeline_id = TimelineId.generate() - env.neon_cli.create_timeline("bar", tids[0], timeline_id) + env.create_timeline("bar", tids[0], timeline_id) workload = Workload(env, tids[0], timeline_id, branch_name="bar") workload.init() @@ -2807,3 +2837,171 @@ def assign_az(ps_cfg): attached_to = shard["node_attached"] expected_az = env.get_pageserver(attached_to).az_id assert shard["preferred_az_id"] == expected_az + + +@run_only_on_default_postgres("Postgres version makes no difference here") +@pytest.mark.parametrize( + "migration_failpoint", + [ + MigrationFailpoints.PRE_GENERATION_INC, + MigrationFailpoints.PRE_AWAIT_LSN, + MigrationFailpoints.POST_NOTIFY, + MigrationFailpoints.POST_DETACH, + ], +) +def test_timeline_delete_mid_live_migration(neon_env_builder: NeonEnvBuilder, migration_failpoint): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + timeline_id = TimelineId.generate() + env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1}) + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + + shard_zero = TenantShardId(tenant_id, 0, 0) + locations = env.storage_controller.get_tenants_placement()[str(shard_zero)] + + assert locations["observed"] == locations["intent"] + assert locations["observed"]["attached"] is not None + assert len(locations["observed"]["secondary"]) > 0 + + attached_location = locations["observed"]["attached"] + secondary_location = locations["observed"]["secondary"][0] + + env.storage_controller.configure_failpoints((migration_failpoint.value, "pause")) + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + shard_zero, + secondary_location, + ) + + def has_hit_migration_failpoint(): + expr = f"at failpoint {migration_failpoint.value}" + log.info(expr) + assert env.storage_controller.log_contains(expr) + + wait_until(10, 1, 
has_hit_migration_failpoint) + + env.storage_controller.pageserver_api().timeline_delete( + tenant_id=tenant_id, timeline_id=timeline_id + ) + + # Eventually migration completes + env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) + migrate_fut.result() + + # Ensure that we detached from the old attached location + with pytest.raises(PageserverApiException) as exc: + env.get_pageserver(attached_location).http_client().timeline_list(tenant_id) + assert exc.value.status_code == 404 + + # Ensure the timeline is not present on the new attached location + client = env.get_pageserver(secondary_location).http_client() + assert timeline_id not in { + TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id) + }, f"deleted timeline found on {secondary_location}" + + except: + # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown + env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) + raise + + +@run_only_on_default_postgres("Postgres version makes no difference here") +@pytest.mark.parametrize( + "migration_failpoint", + [ + MigrationFailpoints.PRE_GENERATION_INC, + MigrationFailpoints.POST_NOTIFY, + MigrationFailpoints.POST_DETACH, + ], +) +def test_multi_attached_timeline_creation(neon_env_builder: NeonEnvBuilder, migration_failpoint): + neon_env_builder.num_pageservers = 2 + env = neon_env_builder.init_configs() + env.start() + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create(tenant_id, placement_policy={"Attached": 1}) + + shard_zero = TenantShardId(tenant_id, 0, 0) + locations = env.storage_controller.get_tenants_placement()[str(shard_zero)] + + assert locations["observed"] == locations["intent"] + assert locations["observed"]["attached"] is not None + assert len(locations["observed"]["secondary"]) > 0 + + attached_location = locations["observed"]["attached"] + secondary_location = locations["observed"]["secondary"][0] + + env.storage_controller.configure_failpoints((migration_failpoint.value, "pause")) + + try: + with concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor: + migrate_fut = executor.submit( + env.storage_controller.tenant_shard_migrate, + shard_zero, + secondary_location, + ) + + def has_hit_migration_failpoint(): + expr = f"at failpoint {migration_failpoint.value}" + log.info(expr) + assert env.storage_controller.log_contains(expr) + + wait_until(10, 1, has_hit_migration_failpoint) + + timeline_id = TimelineId.generate() + env.storage_controller.pageserver_api().timeline_create( + pg_version=PgVersion.NOT_SET, tenant_id=tenant_id, new_timeline_id=timeline_id + ) + + # Timeline creation only goes to the origin. 
+            if migration_failpoint == MigrationFailpoints.PRE_GENERATION_INC:
+                client = env.get_pageserver(attached_location).http_client()
+                assert timeline_id in {
+                    TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
+                }, f"new timeline not found on {attached_location}"
+
+                with pytest.raises(PageserverApiException) as exc:
+                    env.get_pageserver(secondary_location).http_client().timeline_list(tenant_id)
+                assert exc.value.status_code == 404
+
+            # Timeline creation goes to both attached locations
+            if migration_failpoint == MigrationFailpoints.POST_NOTIFY:
+                for node_id in [attached_location, secondary_location]:
+                    client = env.get_pageserver(node_id).http_client()
+                    assert timeline_id in {
+                        TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
+                    }, f"new timeline not found on {node_id}"
+
+            # Timeline creation goes to both locations, but storcon gets a 404 from the origin
+            # which it ignores.
+            if migration_failpoint == MigrationFailpoints.POST_DETACH:
+                client = env.get_pageserver(secondary_location).http_client()
+                assert timeline_id in {
+                    TimelineId(b["timeline_id"]) for b in client.timeline_list(tenant_id)
+                }, f"new timeline not found on {secondary_location}"
+
+                with pytest.raises(PageserverApiException) as exc:
+                    env.get_pageserver(attached_location).http_client().timeline_list(tenant_id)
+                assert exc.value.status_code == 404
+
+            # Eventually migration completes
+            env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
+            migrate_fut.result()
+
+            # Ensure that we detached from the old attached location
+            with pytest.raises(PageserverApiException) as exc:
+                env.get_pageserver(attached_location).http_client().timeline_list(tenant_id)
+            assert exc.value.status_code == 404
+    except:
+        # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown
+        env.storage_controller.configure_failpoints((migration_failpoint.value, "off"))
+        raise
diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py
index b6c19f03f6ab..7ecd0cf74858 100644
--- a/test_runner/regress/test_storage_scrubber.py
+++ b/test_runner/regress/test_storage_scrubber.py
@@ -135,7 +135,7 @@ def test_scrubber_physical_gc(neon_env_builder: NeonEnvBuilder, shard_count: Opt
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
-    env.neon_cli.create_tenant(tenant_id, timeline_id, shard_count=shard_count)
+    env.create_tenant(tenant_id, timeline_id, shard_count=shard_count)
     workload = Workload(env, tenant_id, timeline_id)
     workload.init()
@@ -185,7 +185,7 @@ def test_scrubber_physical_gc_ancestors(
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
-    env.neon_cli.create_tenant(
+    env.create_tenant(
         tenant_id,
         timeline_id,
         shard_count=shard_count,
@@ -303,7 +303,7 @@ def test_scrubber_physical_gc_timeline_deletion(neon_env_builder: NeonEnvBuilder
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
-    env.neon_cli.create_tenant(
+    env.create_tenant(
         tenant_id,
         timeline_id,
         shard_count=None,
@@ -385,7 +385,7 @@ def test_scrubber_physical_gc_ancestors_split(neon_env_builder: NeonEnvBuilder):
     tenant_id = TenantId.generate()
     timeline_id = TimelineId.generate()
     initial_shard_count = 2
-    env.neon_cli.create_tenant(
+    env.create_tenant(
         tenant_id,
         timeline_id,
         shard_count=initial_shard_count,
diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py
index 647a2e6b1456..e67001ef4156 100644
---
a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -9,11 +9,11 @@ # It requires tracking information about replication origins at page server side def test_subscriber_restart(neon_simple_env: NeonEnv): env = neon_simple_env - env.neon_cli.create_branch("publisher") + env.create_branch("publisher") pub = env.endpoints.create("publisher") pub.start() - sub_timeline_id = env.neon_cli.create_branch("subscriber") + sub_timeline_id = env.create_branch("subscriber") sub = env.endpoints.create("subscriber") sub.start() diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 9fb7324fa15c..d13cbe45e910 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -38,7 +38,7 @@ def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]): # Check that we raise on misspelled configs invalid_conf_key = "some_invalid_setting_name_blah_blah_123" try: - env.neon_cli.create_tenant( + env.create_tenant( conf={ invalid_conf_key: "20000", } @@ -54,9 +54,9 @@ def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]): "evictions_low_residence_duration_metric_threshold": "42s", "eviction_policy": json.dumps({"kind": "NoEviction"}), } - tenant, _ = env.neon_cli.create_tenant(conf=new_conf) + tenant, _ = env.create_tenant(conf=new_conf) - env.neon_cli.create_timeline("test_tenant_conf", tenant_id=tenant) + env.create_timeline("test_tenant_conf", tenant_id=tenant) env.endpoints.create_start("test_tenant_conf", "main", tenant) # check the configuration of the default tenant @@ -121,10 +121,7 @@ def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]): ), "max_lsn_wal_lag": "13000000", } - env.neon_cli.config_tenant( - tenant_id=tenant, - conf=conf_update, - ) + env.config_tenant(tenant_id=tenant, conf=conf_update) updated_tenant_config = http_client.tenant_config(tenant_id=tenant) updated_specific_config = updated_tenant_config.tenant_specific_overrides @@ -172,10 +169,8 @@ def set_some_nondefault_global_config(ps_cfg: Dict[str, Any]): final_conf = { "pitr_interval": "1 min", } - env.neon_cli.config_tenant( - tenant_id=tenant, - conf=final_conf, - ) + env.config_tenant(tenant_id=tenant, conf=final_conf) + final_tenant_config = http_client.tenant_config(tenant_id=tenant) final_specific_config = final_tenant_config.tenant_specific_overrides assert final_specific_config["pitr_interval"] == "1m" @@ -218,7 +213,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert isinstance(env.pageserver_remote_storage, LocalFsStorage) # tenant is created with defaults, as in without config file - (tenant_id, timeline_id) = env.neon_cli.create_tenant() + (tenant_id, timeline_id) = env.create_tenant() config_path = env.pageserver.tenant_dir(tenant_id) / "config-v1" http_client = env.pageserver.http_client() @@ -240,9 +235,9 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): func=lambda: assert_tenant_state(http_client, tenant_id, "Active"), ) - env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "1000000"}) + env.config_tenant(tenant_id, {"gc_horizon": "1000000"}) contents_first = config_path.read_text() - env.neon_cli.config_tenant(tenant_id, {"gc_horizon": "0"}) + env.config_tenant(tenant_id, {"gc_horizon": "0"}) contents_later = config_path.read_text() # dont test applying the setting here, we have that another test case to show it @@ -298,7 +293,7 @@ def get_metric(): metric = get_metric() assert int(metric.value) > 0, 
"metric is updated" - env.neon_cli.config_tenant( + env.config_tenant( tenant_id, {"evictions_low_residence_duration_metric_threshold": default_value} ) updated_metric = get_metric() @@ -306,9 +301,7 @@ def get_metric(): metric.value ), "metric is unchanged when setting same value" - env.neon_cli.config_tenant( - tenant_id, {"evictions_low_residence_duration_metric_threshold": "2day"} - ) + env.config_tenant(tenant_id, {"evictions_low_residence_duration_metric_threshold": "2day"}) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 assert int(metric.value) == 0 @@ -320,9 +313,7 @@ def get_metric(): assert int(metric.labels["low_threshold_secs"]) == 2 * 24 * 60 * 60 assert int(metric.value) > 0 - env.neon_cli.config_tenant( - tenant_id, {"evictions_low_residence_duration_metric_threshold": "2h"} - ) + env.config_tenant(tenant_id, {"evictions_low_residence_duration_metric_threshold": "2h"}) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 assert int(metric.value) == 0, "value resets if label changes" @@ -334,7 +325,7 @@ def get_metric(): assert int(metric.labels["low_threshold_secs"]) == 2 * 60 * 60 assert int(metric.value) > 0, "set a non-zero value for next step" - env.neon_cli.config_tenant(tenant_id, {}) + env.config_tenant(tenant_id, {}) metric = get_metric() assert int(metric.labels["low_threshold_secs"]) == 24 * 60 * 60, "label resets to default" assert int(metric.value) == 0, "value resets to default" diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 7ee949e8d3f4..eafd159ac038 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -78,7 +78,7 @@ def test_tenant_delete_smoke( # may need to retry on some remote storage errors injected by the test harness error_tolerant_delete(ps_http, tenant_id) - env.neon_cli.create_tenant( + env.create_tenant( tenant_id=tenant_id, conf=many_small_layers_tenant_config(), ) @@ -89,9 +89,7 @@ def test_tenant_delete_smoke( # create two timelines one being the parent of another parent = None for timeline in ["first", "second"]: - timeline_id = env.neon_cli.create_branch( - timeline, tenant_id=tenant_id, ancestor_branch_name=parent - ) + timeline_id = env.create_branch(timeline, ancestor_branch_name=parent, tenant_id=tenant_id) with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint: run_pg_bench_small(pg_bin, endpoint.connstr()) wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id) @@ -339,7 +337,7 @@ def test_tenant_delete_scrubber(pg_bin: PgBin, make_httpserver, neon_env_builder ps_http = env.pageserver.http_client() # create a tenant separate from the main tenant so that we have one remaining # after we deleted it, as the scrubber treats empty buckets as an error. 
- (tenant_id, timeline_id) = env.neon_cli.create_tenant() + (tenant_id, timeline_id) = env.create_tenant() with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: run_pg_bench_small(pg_bin, endpoint.connstr()) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index e7c6d5a4c382..6de22f262dfe 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -72,7 +72,7 @@ def test_tenant_reattach(neon_env_builder: NeonEnvBuilder, mode: str): pageserver_http = env.pageserver.http_client() # create new nenant - tenant_id, timeline_id = env.neon_cli.create_tenant() + tenant_id, timeline_id = env.create_tenant() env.pageserver.allowed_errors.extend(PERMIT_PAGE_SERVICE_ERRORS) @@ -241,7 +241,7 @@ async def reattach_while_busy( pageserver_http = env.pageserver.http_client() # create new nenant - tenant_id, timeline_id = env.neon_cli.create_tenant( + tenant_id, timeline_id = env.create_tenant( # Create layers aggressively conf={"checkpoint_distance": "100000"} ) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index 43e9a0d36e80..645e22af1f0a 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -219,7 +219,7 @@ def test_tenant_relocation( log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, env.initial_timeline) - env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) + env.create_branch("test_tenant_relocation_main", tenant_id=tenant_id) ep_main = env.endpoints.create_start( branch_name="test_tenant_relocation_main", tenant_id=tenant_id ) @@ -232,7 +232,7 @@ def test_tenant_relocation( expected_sum=500500, ) - env.neon_cli.create_branch( + env.create_branch( new_branch_name="test_tenant_relocation_second", ancestor_branch_name="test_tenant_relocation_main", ancestor_start_lsn=current_lsn_main, @@ -404,7 +404,7 @@ def test_emergency_relocate_with_branches_slow_replay( # - A logical replication message between the inserts, so that we can conveniently # pause the WAL ingestion between the two inserts. # - Child branch, created after the inserts - tenant_id, _ = env.neon_cli.create_tenant() + tenant_id, _ = env.create_tenant() main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) with main_endpoint.cursor() as cur: @@ -417,7 +417,7 @@ def test_emergency_relocate_with_branches_slow_replay( current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) main_endpoint.stop() - env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn) + env.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn) # Now kill the pageserver, remove the tenant directory, and restart. 
This simulates # the scenario that a pageserver dies unexpectedly and cannot be recovered, so we relocate @@ -548,7 +548,7 @@ def test_emergency_relocate_with_branches_createdb( pageserver_http = env.pageserver.http_client() # create new nenant - tenant_id, _ = env.neon_cli.create_tenant() + tenant_id, _ = env.create_tenant() main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) with main_endpoint.cursor() as cur: @@ -556,7 +556,7 @@ def test_emergency_relocate_with_branches_createdb( cur.execute("CREATE DATABASE neondb") current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()")) - env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn) + env.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn) with main_endpoint.cursor(dbname="neondb") as cur: cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,100)") diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py index 609987ab0cbb..867c0021cd88 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -27,7 +27,7 @@ def test_empty_tenant_size(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_configs() env.start() - (tenant_id, timeline_id) = env.neon_cli.create_tenant() + (tenant_id, timeline_id) = env.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) @@ -67,12 +67,12 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: gc_horizon """ env = neon_simple_env - (tenant_id, _) = env.neon_cli.create_tenant() + (tenant_id, _) = env.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) - first_branch_timeline_id = env.neon_cli.create_branch("first-branch", tenant_id=tenant_id) + first_branch_timeline_id = env.create_branch("first-branch", tenant_id=tenant_id) with env.endpoints.create_start("first-branch", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: @@ -104,13 +104,13 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou nth_n: 10------------I--------100 """ env = neon_simple_env - (tenant_id, _) = env.neon_cli.create_tenant() + (tenant_id, _) = env.create_tenant() http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) first_branch_name = "first" - env.neon_cli.create_branch(first_branch_name, tenant_id=tenant_id) + env.create_branch(first_branch_name, tenant_id=tenant_id) size_after_branching = http_client.tenant_size(tenant_id) @@ -123,7 +123,7 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_ou for i in range(0, 4): latest_branch_name = f"nth_{i}" - last_branch = env.neon_cli.create_branch( + last_branch = env.create_branch( latest_branch_name, ancestor_branch_name=last_branch_name, tenant_id=tenant_id ) last_branch_name = latest_branch_name @@ -159,7 +159,7 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: env = neon_simple_env gc_horizon = 20_000 - (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) + (tenant_id, main_id) = env.create_tenant(conf={"gc_horizon": str(gc_horizon)}) http_client = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: @@ -172,9 +172,7 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: assert flushed_lsn.lsn_int 
- gc_horizon > initdb_lsn.lsn_int - branch_id = env.neon_cli.create_branch( - "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn - ) + branch_id = env.create_branch("branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn) with env.endpoints.create_start("branch", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: @@ -201,7 +199,7 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): env = neon_simple_env gc_horizon = 5_000 - (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)}) + (tenant_id, main_id) = env.create_tenant(conf={"gc_horizon": str(gc_horizon)}) http_client = env.pageserver.http_client() with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint: @@ -220,9 +218,7 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path): assert flushed_lsn.lsn_int - gc_horizon > initdb_lsn.lsn_int - branch_id = env.neon_cli.create_branch( - "branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn - ) + branch_id = env.create_branch("branch", tenant_id=tenant_id, ancestor_start_lsn=flushed_lsn) with env.endpoints.create_start("branch", tenant_id=tenant_id) as endpoint: with endpoint.cursor() as cur: @@ -248,13 +244,13 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Pa """ env = neon_simple_env - (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": "1024"}) + (tenant_id, main_id) = env.create_tenant(conf={"gc_horizon": "1024"}) http_client = env.pageserver.http_client() initial_size = http_client.tenant_size(tenant_id) - first_id = env.neon_cli.create_branch("first", tenant_id=tenant_id) - second_id = env.neon_cli.create_branch("second", tenant_id=tenant_id) + first_id = env.create_branch("first", tenant_id=tenant_id) + second_id = env.create_branch("second", tenant_id=tenant_id) ids = {"main": main_id, "first": first_id, "second": second_id} @@ -530,8 +526,8 @@ def test_get_tenant_size_with_multiple_branches( size_at_branch = http_client.tenant_size(tenant_id) assert size_at_branch > 0 - first_branch_timeline_id = env.neon_cli.create_branch( - "first-branch", main_branch_name, tenant_id + first_branch_timeline_id = env.create_branch( + "first-branch", ancestor_branch_name=main_branch_name, tenant_id=tenant_id ) size_after_first_branch = http_client.tenant_size(tenant_id) @@ -557,8 +553,8 @@ def test_get_tenant_size_with_multiple_branches( size_after_continuing_on_main = http_client.tenant_size(tenant_id) assert size_after_continuing_on_main > size_after_growing_first_branch - second_branch_timeline_id = env.neon_cli.create_branch( - "second-branch", main_branch_name, tenant_id + second_branch_timeline_id = env.create_branch( + "second-branch", ancestor_branch_name=main_branch_name, tenant_id=tenant_id ) size_after_second_branch = http_client.tenant_size(tenant_id) assert_size_approx_equal(size_after_second_branch, size_after_continuing_on_main) @@ -633,8 +629,8 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): orig_size = client.tenant_size(env.initial_tenant) - branch_id = env.neon_cli.create_branch( - tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch" + branch_id = env.create_branch( + "branch", ancestor_branch_name="main", tenant_id=env.initial_tenant ) client.configure_failpoints((failpoint, "pause")) @@ -651,8 +647,8 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): assert_size_approx_equal(orig_size, size) - branch_id 
= env.neon_cli.create_branch( - tenant_id=env.initial_tenant, ancestor_branch_name="main", new_branch_name="branch2" + branch_id = env.create_branch( + "branch2", ancestor_branch_name="main", tenant_id=env.initial_tenant ) client.configure_failpoints((failpoint, "pause")) @@ -749,7 +745,7 @@ def assert_size_approx_equal_for_lease_test(size_lease, size_branch): env, env.initial_tenant, env.initial_timeline, test_output_dir, action="branch" ) - tenant, timeline = env.neon_cli.create_tenant(conf=conf) + tenant, timeline = env.create_tenant(conf=conf) lease_res = insert_with_action(env, tenant, timeline, test_output_dir, action="lease") assert_size_approx_equal_for_lease_test(lease_res, ro_branch_res) @@ -793,8 +789,8 @@ def insert_with_action( res = client.timeline_lsn_lease(tenant, timeline, last_flush_lsn) log.info(f"result from lsn_lease api: {res}") elif action == "branch": - ro_branch = env.neon_cli.create_branch( - "ro_branch", tenant_id=tenant, ancestor_start_lsn=last_flush_lsn + ro_branch = env.create_branch( + "ro_branch", ancestor_start_lsn=last_flush_lsn, tenant_id=tenant ) log.info(f"{ro_branch=} created") else: diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index d08ad3cd2e5d..2bf930d767be 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -31,8 +31,8 @@ def delete_all_timelines(tenant: TenantId): timeline_delete_wait_completed(client, tenant, t) # Create tenant, start compute - tenant, _ = env.neon_cli.create_tenant() - env.neon_cli.create_timeline(name, tenant_id=tenant) + tenant, _ = env.create_tenant() + env.create_timeline(name, tenant_id=tenant) endpoint = env.endpoints.create_start(name, tenant_id=tenant) assert_tenant_state( client, diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index b63ff7f6bddd..7b194d40dd13 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -32,7 +32,7 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): tenants_dir = neon_simple_env.pageserver.tenant_dir() initial_tenants = sorted( - map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + map(lambda t: t.split()[0], neon_simple_env.neon_cli.tenant_list().stdout.splitlines()) ) [d for d in tenants_dir.iterdir()] @@ -59,11 +59,11 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv): # an empty tenant dir with no config in it. 
neon_simple_env.pageserver.allowed_errors.append(".*Failed to load tenant config.*") new_tenants = sorted( - map(lambda t: t.split()[0], neon_simple_env.neon_cli.list_tenants().stdout.splitlines()) + map(lambda t: t.split()[0], neon_simple_env.neon_cli.tenant_list().stdout.splitlines()) ) assert initial_tenants == new_tenants, "should not create new tenants" - neon_simple_env.neon_cli.create_tenant() + neon_simple_env.create_tenant() def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): @@ -71,11 +71,11 @@ def test_tenants_normal_work(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() """Tests tenants with and without wal acceptors""" - tenant_1, _ = env.neon_cli.create_tenant() - tenant_2, _ = env.neon_cli.create_tenant() + tenant_1, _ = env.create_tenant() + tenant_2, _ = env.create_tenant() - env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_1) - env.neon_cli.create_timeline("test_tenants_normal_work", tenant_id=tenant_2) + env.create_timeline("test_tenants_normal_work", tenant_id=tenant_1) + env.create_timeline("test_tenants_normal_work", tenant_id=tenant_2) endpoint_tenant1 = env.endpoints.create_start( "test_tenants_normal_work", @@ -102,11 +102,11 @@ def test_metrics_normal_work(neon_env_builder: NeonEnvBuilder): neon_env_builder.pageserver_config_override = "availability_zone='test_ps_az'" env = neon_env_builder.init_start() - tenant_1, _ = env.neon_cli.create_tenant() - tenant_2, _ = env.neon_cli.create_tenant() + tenant_1, _ = env.create_tenant() + tenant_2, _ = env.create_tenant() - timeline_1 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_1) - timeline_2 = env.neon_cli.create_timeline("test_metrics_normal_work", tenant_id=tenant_2) + timeline_1 = env.create_timeline("test_metrics_normal_work", tenant_id=tenant_1) + timeline_2 = env.create_timeline("test_metrics_normal_work", tenant_id=tenant_2) endpoint_tenant1 = env.endpoints.create_start("test_metrics_normal_work", tenant_id=tenant_1) endpoint_tenant2 = env.endpoints.create_start("test_metrics_normal_work", tenant_id=tenant_2) @@ -250,11 +250,11 @@ def test_pageserver_metrics_removed_after_detach(neon_env_builder: NeonEnvBuilde neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - tenant_1, _ = env.neon_cli.create_tenant() - tenant_2, _ = env.neon_cli.create_tenant() + tenant_1, _ = env.create_tenant() + tenant_2, _ = env.create_tenant() - env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1) - env.neon_cli.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2) + env.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_1) + env.create_timeline("test_metrics_removed_after_detach", tenant_id=tenant_2) endpoint_tenant1 = env.endpoints.create_start( "test_metrics_removed_after_detach", tenant_id=tenant_1 diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 6ecc9031924c..9310786da73c 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -66,7 +66,7 @@ def test_tenants_many(neon_env_builder: NeonEnvBuilder): for _ in range(1, 5): # Use a tiny checkpoint distance, to create a lot of layers quickly - tenant, _ = env.neon_cli.create_tenant( + tenant, _ = env.create_tenant( conf={ "checkpoint_distance": "5000000", } diff --git a/test_runner/regress/test_timeline_archive.py 
b/test_runner/regress/test_timeline_archive.py index de43e51c9e58..16e05218901d 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -46,10 +46,11 @@ def test_timeline_archive(neon_env_builder: NeonEnvBuilder, shard_count: int): # construct a pair of branches to validate that pageserver prohibits # archival of ancestor timelines when they have non-archived child branches - parent_timeline_id = env.neon_cli.create_branch("test_ancestor_branch_archive_parent") + parent_timeline_id = env.create_branch("test_ancestor_branch_archive_parent") - leaf_timeline_id = env.neon_cli.create_branch( - "test_ancestor_branch_archive_branch1", "test_ancestor_branch_archive_parent" + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_archive_branch1", + ancestor_branch_name="test_ancestor_branch_archive_parent", ) with pytest.raises( diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index edb32cd2b4a0..7b6f6ac3c694 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -68,12 +68,12 @@ def test_timeline_delete(neon_simple_env: NeonEnv): # construct pair of branches to validate that pageserver prohibits # deletion of ancestor timelines when they have child branches - parent_timeline_id = env.neon_cli.create_branch( - new_branch_name="test_ancestor_branch_delete_parent", ancestor_branch_name="main" + parent_timeline_id = env.create_branch( + "test_ancestor_branch_delete_parent", ancestor_branch_name="main" ) - leaf_timeline_id = env.neon_cli.create_branch( - new_branch_name="test_ancestor_branch_delete_branch1", + leaf_timeline_id = env.create_branch( + "test_ancestor_branch_delete_branch1", ancestor_branch_name="test_ancestor_branch_delete_parent", ) @@ -184,7 +184,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http = env.pageserver.http_client() - timeline_id = env.neon_cli.create_timeline("delete") + timeline_id = env.create_timeline("delete") with env.endpoints.create_start("delete") as endpoint: # generate enough layers run_pg_bench_small(pg_bin, endpoint.connstr()) @@ -334,7 +334,7 @@ def test_timeline_resurrection_on_attach( wait_for_upload(ps_http, tenant_id, main_timeline_id, current_lsn) log.info("upload of checkpoint is done") - branch_timeline_id = env.neon_cli.create_branch("new", "main") + branch_timeline_id = env.create_branch("new", ancestor_branch_name="main") # Two variants of this test: # - In fill_branch=True, the deleted branch has layer files. 
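Most hunks in these test files are the same mechanical migration: call sites move from `env.neon_cli.create_branch(...)` / `create_tenant(...)` / `create_timeline(...)` to wrappers on the environment object, with the ancestor branch and start LSN passed as keyword arguments. Below is a minimal sketch of what such a facade could look like; `FakeNeonEnv`, `FakeNeonCli`, and the defaulting behavior are assumptions for illustration, not the repository's actual `NeonEnv` implementation.

```python
from typing import Optional


class FakeNeonCli:
    """Illustrative stand-in for the CLI wrapper the tests used to call directly."""

    def create_branch(self, new_branch_name, ancestor_branch_name, tenant_id, ancestor_start_lsn):
        print(
            f"create branch {new_branch_name!r} from {ancestor_branch_name!r} "
            f"(tenant={tenant_id}, start_lsn={ancestor_start_lsn})"
        )
        return "1111222233334444aaaabbbbccccdddd"  # fake timeline id


class FakeNeonEnv:
    """Sketch of the environment facade the call sites migrate to."""

    def __init__(self) -> None:
        self.neon_cli = FakeNeonCli()
        self.initial_tenant = "tenant-0"

    def create_branch(
        self,
        new_branch_name: str,
        ancestor_branch_name: Optional[str] = None,
        tenant_id: Optional[str] = None,
        ancestor_start_lsn: Optional[str] = None,
    ) -> str:
        # Default the tenant and ancestor so most call sites only name what differs
        # from the common case, and everything else is an explicit keyword.
        return self.neon_cli.create_branch(
            new_branch_name,
            ancestor_branch_name or "main",
            tenant_id or self.initial_tenant,
            ancestor_start_lsn,
        )


env = FakeNeonEnv()
# Positional branch name, everything else as keywords -- the convention the diff adopts.
env.create_branch("test_timeline_size", ancestor_branch_name="main")
env.create_branch("child", ancestor_start_lsn="0/169AD58")
```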
@@ -409,13 +409,11 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ps_http.configure_failpoints(("timeline-delete-before-rm", "return")) # construct pair of branches - intermediate_timeline_id = env.neon_cli.create_branch( - "test_timeline_delete_fail_before_local_delete" - ) + intermediate_timeline_id = env.create_branch("test_timeline_delete_fail_before_local_delete") - leaf_timeline_id = env.neon_cli.create_branch( + leaf_timeline_id = env.create_branch( "test_timeline_delete_fail_before_local_delete1", - "test_timeline_delete_fail_before_local_delete", + ancestor_branch_name="test_timeline_delete_fail_before_local_delete", ) leaf_timeline_path = env.pageserver.timeline_dir(env.initial_tenant, leaf_timeline_id) @@ -514,7 +512,7 @@ def test_concurrent_timeline_delete_stuck_on( env = neon_env_builder.init_start() - child_timeline_id = env.neon_cli.create_branch("child", "main") + child_timeline_id = env.create_branch("child", ancestor_branch_name="main") ps_http = env.pageserver.http_client() @@ -591,7 +589,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - child_timeline_id = env.neon_cli.create_branch("child", "main") + child_timeline_id = env.create_branch("child", ancestor_branch_name="main") ps_http = env.pageserver.http_client(retries=Retry(0, read=False)) @@ -656,7 +654,7 @@ def test_timeline_delete_works_for_remote_smoke( timeline_ids = [env.initial_timeline] for i in range(2): - branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main") + branch_timeline_id = env.create_branch(f"new{i}", ancestor_branch_name="main") with env.endpoints.create_start(f"new{i}") as pg, pg.cursor() as cur: cur.execute("CREATE TABLE f (i integer);") cur.execute("INSERT INTO f VALUES (generate_series(1,1000));") @@ -733,7 +731,7 @@ def test_delete_orphaned_objects( ps_http = env.pageserver.http_client() - timeline_id = env.neon_cli.create_timeline("delete") + timeline_id = env.create_timeline("delete") with env.endpoints.create_start("delete") as endpoint: # generate enough layers run_pg_bench_small(pg_bin, endpoint.connstr()) @@ -791,7 +789,7 @@ def test_timeline_delete_resumed_on_attach( ps_http = env.pageserver.http_client() - timeline_id = env.neon_cli.create_timeline("delete") + timeline_id = env.create_timeline("delete") with env.endpoints.create_start("delete") as endpoint: # generate enough layers run_pg_bench_small(pg_bin, endpoint.connstr()) diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index f98b53d966af..7f148a4b9be5 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -133,9 +133,7 @@ def test_ancestor_detach_branched_from( name = "new main" - timeline_id = env.neon_cli.create_branch( - name, "main", env.initial_tenant, ancestor_start_lsn=branch_at - ) + timeline_id = env.create_branch(name, ancestor_branch_name="main", ancestor_start_lsn=branch_at) recorded = Lsn(client.timeline_detail(env.initial_tenant, timeline_id)["ancestor_lsn"]) if branch_at is None: @@ -262,19 +260,19 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) # as this only gets reparented, we don't need to write to it like new main - reparented = env.neon_cli.create_branch( - "reparented", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_pipe + reparented 
= env.create_branch( + "reparented", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_pipe ) - same_branchpoint = env.neon_cli.create_branch( - "same_branchpoint", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + same_branchpoint = env.create_branch( + "same_branchpoint", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x ) - timeline_id = env.neon_cli.create_branch( - "new main", "main", env.initial_tenant, ancestor_start_lsn=branchpoint_x + timeline_id = env.create_branch( + "new main", ancestor_branch_name="main", ancestor_start_lsn=branchpoint_x ) - after = env.neon_cli.create_branch("after", "main", env.initial_tenant, ancestor_start_lsn=None) + after = env.create_branch("after", ancestor_branch_name="main", ancestor_start_lsn=None) all_reparented = client.detach_ancestor(env.initial_tenant, timeline_id) assert set(all_reparented) == {reparented, same_branchpoint} @@ -365,8 +363,8 @@ def insert_rows(n: int, ep) -> int: branchpoint = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) - timeline_id = env.neon_cli.create_branch( - "new main", "main", tenant_id=env.initial_tenant, ancestor_start_lsn=branchpoint + timeline_id = env.create_branch( + "new main", ancestor_branch_name="main", ancestor_start_lsn=branchpoint ) log.info("starting the new main endpoint") @@ -479,10 +477,9 @@ def delta_layers(timeline_id: TimelineId): for num in more_good_numbers: branch_name = f"br-{len(branches)}" - branch_timeline_id = env.neon_cli.create_branch( + branch_timeline_id = env.create_branch( branch_name, ancestor_branch_name=branches[-1][0], - tenant_id=env.initial_tenant, ancestor_start_lsn=branch_lsn, ) branches.append((branch_name, branch_timeline_id)) @@ -599,15 +596,15 @@ def test_timeline_ancestor_detach_idempotent_success( else: client = env.pageserver.http_client() - first_branch = env.neon_cli.create_branch("first_branch") + first_branch = env.create_branch("first_branch") - _ = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + _ = env.create_branch("second_branch", ancestor_branch_name="first_branch") # these two will be reparented, and they should be returned in stable order # from pageservers OR otherwise there will be an `error!` logging from # storage controller - reparented1 = env.neon_cli.create_branch("first_reparented", ancestor_branch_name="main") - reparented2 = env.neon_cli.create_branch("second_reparented", ancestor_branch_name="main") + reparented1 = env.create_branch("first_reparented", ancestor_branch_name="main") + reparented2 = env.create_branch("second_reparented", ancestor_branch_name="main") first_reparenting_response = client.detach_ancestor(env.initial_tenant, first_branch) assert set(first_reparenting_response) == {reparented1, reparented2} @@ -658,9 +655,9 @@ def test_timeline_ancestor_detach_errors(neon_env_builder: NeonEnvBuilder, shard client.detach_ancestor(env.initial_tenant, env.initial_timeline) assert info.value.status_code == 409 - _ = env.neon_cli.create_branch("first_branch") + _ = env.create_branch("first_branch") - second_branch = env.neon_cli.create_branch("second_branch", ancestor_branch_name="first_branch") + second_branch = env.create_branch("second_branch", ancestor_branch_name="first_branch") # funnily enough this does not have a prefix with pytest.raises(PageserverApiException, match="too many ancestors") as info: @@ -697,7 +694,7 @@ def test_sharded_timeline_detach_ancestor(neon_env_builder: NeonEnvBuilder): utilized_pageservers = {x["node_id"] for x in 
shards} assert len(utilized_pageservers) > 1, "all shards got placed on single pageserver?" - branch_timeline_id = env.neon_cli.create_branch(branch_name, tenant_id=env.initial_tenant) + branch_timeline_id = env.create_branch(branch_name) with env.endpoints.create_start(branch_name, tenant_id=env.initial_tenant) as ep: ep.safe_psql( @@ -849,7 +846,7 @@ def test_timeline_detach_ancestor_interrupted_by_deletion( pageservers = dict((int(p.id), p) for p in env.pageservers) - detached_timeline = env.neon_cli.create_branch("detached soon", "main") + detached_timeline = env.create_branch("detached soon", ancestor_branch_name="main") pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" @@ -993,7 +990,7 @@ def test_sharded_tad_interleaved_after_partial_success(neon_env_builder: NeonEnv ps.http_client().timeline_checkpoint(shard_id, env.initial_timeline) def create_reparentable_timeline() -> TimelineId: - return env.neon_cli.create_branch( + return env.create_branch( "first_branch", ancestor_branch_name="main", ancestor_start_lsn=first_branch_lsn ) @@ -1002,7 +999,7 @@ def create_reparentable_timeline() -> TimelineId: else: first_branch = None - detached_branch = env.neon_cli.create_branch( + detached_branch = env.create_branch( "detached_branch", ancestor_branch_name="main", ancestor_start_lsn=detached_branch_lsn ) @@ -1169,7 +1166,7 @@ def test_retryable_500_hit_through_storcon_during_timeline_detach_ancestor( shards = env.storage_controller.locate(env.initial_tenant) assert len(set(x["node_id"] for x in shards)) == shard_count - detached_branch = env.neon_cli.create_branch("detached_branch", ancestor_branch_name="main") + detached_branch = env.create_branch("detached_branch", ancestor_branch_name="main") pausepoint = "timeline-detach-ancestor::before_starting_after_locking-pausable" failpoint = "timeline-detach-ancestor::before_starting_after_locking" @@ -1294,8 +1291,8 @@ def reparenting_progress(timelines: List[TimelineId]) -> Tuple[int, Set[Timeline ) branch_lsn = wait_for_last_flush_lsn(env, ep, env.initial_tenant, env.initial_timeline) http.timeline_checkpoint(env.initial_tenant, env.initial_timeline) - branch = env.neon_cli.create_branch( - f"branch_{counter}", "main", ancestor_start_lsn=branch_lsn + branch = env.create_branch( + f"branch_{counter}", ancestor_branch_name="main", ancestor_start_lsn=branch_lsn ) timelines.append(branch) @@ -1432,7 +1429,7 @@ def test_timeline_is_deleted_before_timeline_detach_ancestor_completes( http = env.pageserver.http_client() - detached = env.neon_cli.create_branch("detached") + detached = env.create_branch("detached") failpoint = "timeline-detach-ancestor::after_activating_before_finding-pausable" diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index ddfe9b911fd8..1540cbbceea8 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -28,7 +28,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool pss = ManyPageservers(list(map(lambda ps: ScrollableLog(ps, None), env.pageservers))) - foo_branch = env.neon_cli.create_branch("foo", "main", env.initial_tenant) + foo_branch = env.create_branch("foo", ancestor_branch_name="main", tenant_id=env.initial_tenant) gc_active_line = ".* gc_loop.*: [12] timelines need GC" gc_skipped_line = ".* gc_loop.*: Skipping GC: .*" diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 
f2265dd3d937..aa7747409743 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -36,7 +36,7 @@ def test_timeline_size(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "main") + new_timeline_id = env.create_branch("test_timeline_size", ancestor_branch_name="main") client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) @@ -68,7 +68,9 @@ def test_timeline_size(neon_simple_env: NeonEnv): def test_timeline_size_createdropdb(neon_simple_env: NeonEnv): env = neon_simple_env - new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "main") + new_timeline_id = env.create_branch( + "test_timeline_size_createdropdb", ancestor_branch_name="main" + ) client = env.pageserver.http_client() client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) @@ -148,7 +150,7 @@ def wait_for_pageserver_catchup(endpoint_main: Endpoint, polling_interval=1, tim def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() client = env.pageserver.http_client() - new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup") + new_timeline_id = env.create_branch("test_timeline_size_quota_on_startup") client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) @@ -236,7 +238,7 @@ def write_rows(count): def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() client = env.pageserver.http_client() - new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota") + new_timeline_id = env.create_branch("test_timeline_size_quota") client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id) @@ -373,7 +375,7 @@ def test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_init") + new_timeline_id = env.create_branch("test_timeline_physical_size_init") endpoint = env.endpoints.create_start("test_timeline_physical_size_init") endpoint.safe_psql_many( @@ -410,7 +412,7 @@ def test_timeline_physical_size_post_checkpoint(neon_env_builder: NeonEnvBuilder env = neon_env_builder.init_start() pageserver_http = env.pageserver.http_client() - new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_checkpoint") + new_timeline_id = env.create_branch("test_timeline_physical_size_post_checkpoint") endpoint = env.endpoints.create_start("test_timeline_physical_size_post_checkpoint") endpoint.safe_psql_many( @@ -446,7 +448,7 @@ def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder ) pageserver_http = env.pageserver.http_client() - new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_compaction") + new_timeline_id = env.create_branch("test_timeline_physical_size_post_compaction") endpoint = env.endpoints.create_start("test_timeline_physical_size_post_compaction") # We don't want autovacuum to run on the table, while we are calculating the @@ -496,7 +498,7 @@ def test_timeline_physical_size_post_gc(neon_env_builder: NeonEnvBuilder): ) pageserver_http = env.pageserver.http_client() - new_timeline_id = env.neon_cli.create_branch("test_timeline_physical_size_post_gc") + new_timeline_id = env.create_branch("test_timeline_physical_size_post_gc") endpoint = 
env.endpoints.create_start("test_timeline_physical_size_post_gc") # Like in test_timeline_physical_size_post_compaction, disable autovacuum @@ -543,7 +545,7 @@ def test_timeline_size_metrics( env = neon_simple_env pageserver_http = env.pageserver.http_client() - new_timeline_id = env.neon_cli.create_branch("test_timeline_size_metrics") + new_timeline_id = env.create_branch("test_timeline_size_metrics") endpoint = env.endpoints.create_start("test_timeline_size_metrics") endpoint.safe_psql_many( @@ -620,7 +622,7 @@ def test_tenant_physical_size(neon_env_builder: NeonEnvBuilder): pageserver_http = env.pageserver.http_client() client = env.pageserver.http_client() - tenant, timeline = env.neon_cli.create_tenant() + tenant, timeline = env.create_tenant() def get_timeline_resident_physical_size(timeline: TimelineId): sizes = get_physical_size_values(env, tenant, timeline) @@ -631,7 +633,7 @@ def get_timeline_resident_physical_size(timeline: TimelineId): for i in range(10): n_rows = random.randint(100, 1000) - timeline = env.neon_cli.create_branch(f"test_tenant_physical_size_{i}", tenant_id=tenant) + timeline = env.create_branch(f"test_tenant_physical_size_{i}", tenant_id=tenant) endpoint = env.endpoints.create_start(f"test_tenant_physical_size_{i}", tenant_id=tenant) endpoint.safe_psql_many( @@ -743,7 +745,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): tenant_ids = {env.initial_tenant} for _i in range(0, n_tenants - 1): tenant_id = TenantId.generate() - env.neon_cli.create_tenant(tenant_id) + env.create_tenant(tenant_id) tenant_ids.add(tenant_id) # Restart pageserver with logical size calculations paused @@ -990,8 +992,8 @@ def test_eager_attach_does_not_queue_up(neon_env_builder: NeonEnvBuilder): # the supporting_second does nothing except queue behind env.initial_tenant # for purposes of showing that eager_tenant breezes past the queue - supporting_second, _ = env.neon_cli.create_tenant() - eager_tenant, _ = env.neon_cli.create_tenant() + supporting_second, _ = env.create_tenant() + eager_tenant, _ = env.create_tenant() client = env.pageserver.http_client() client.tenant_location_conf( @@ -1067,7 +1069,7 @@ def test_lazy_attach_activation(neon_env_builder: NeonEnvBuilder, activation_met env = neon_env_builder.init_start() # because this returns (also elsewhere in this file), we know that SpawnMode::Create skips the queue - lazy_tenant, _ = env.neon_cli.create_tenant() + lazy_tenant, _ = env.create_tenant() client = env.pageserver.http_client() client.tenant_location_conf( @@ -1131,7 +1133,7 @@ def lazy_tenant_is_active(): # starting up the endpoint should make it jump the queue wait_until(10, 1, lazy_tenant_is_active) elif activation_method == "branch": - env.neon_cli.create_timeline("second_branch", lazy_tenant) + env.create_timeline("second_branch", lazy_tenant) wait_until(10, 1, lazy_tenant_is_active) elif activation_method == "delete": delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) diff --git a/test_runner/regress/test_truncate.py b/test_runner/regress/test_truncate.py index bfa9ce5db7be..4fc0601a18bb 100644 --- a/test_runner/regress/test_truncate.py +++ b/test_runner/regress/test_truncate.py @@ -13,7 +13,7 @@ def test_truncate(neon_env_builder: NeonEnvBuilder, zenbenchmark): # Problems with FSM/VM forks truncation are most frequently detected during page reconstruction triggered # by image layer generation. So adjust default parameters to make it happen more frequently. 
- tenant, _ = env.neon_cli.create_tenant( + tenant, _ = env.create_tenant( conf={ # disable automatic GC "gc_period": "0s", diff --git a/test_runner/regress/test_twophase.py b/test_runner/regress/test_twophase.py index 75fab78d6e6c..1d9fe9d21db0 100644 --- a/test_runner/regress/test_twophase.py +++ b/test_runner/regress/test_twophase.py @@ -96,7 +96,7 @@ def test_twophase(neon_simple_env: NeonEnv): Test branching, when a transaction is in prepared state """ env = neon_simple_env - env.neon_cli.create_branch("test_twophase") + env.create_branch("test_twophase") twophase_test_on_timeline(env) @@ -147,7 +147,7 @@ def test_twophase_at_wal_segment_start(neon_simple_env: NeonEnv): very first page of a WAL segment and the server was started up at that first page. """ env = neon_simple_env - timeline_id = env.neon_cli.create_branch("test_twophase", "main") + timeline_id = env.create_branch("test_twophase", ancestor_branch_name="main") endpoint = env.endpoints.create_start( "test_twophase", config_lines=["max_prepared_transactions=5"] diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 25c66c3cae13..44ca9f90a4e8 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -146,7 +146,7 @@ def test_many_timelines(neon_env_builder: NeonEnvBuilder): # start postgres on each timeline endpoints = [] for branch_name in branch_names: - new_timeline_id = env.neon_cli.create_branch(branch_name) + new_timeline_id = env.create_branch(branch_name) endpoints.append(env.endpoints.create_start(branch_name)) branch_names_to_timeline_ids[branch_name] = new_timeline_id @@ -284,7 +284,7 @@ def test_restarts(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = n_acceptors env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_safekeepers_restarts") + env.create_branch("test_safekeepers_restarts") endpoint = env.endpoints.create_start("test_safekeepers_restarts") # we rely upon autocommit after each statement @@ -314,7 +314,7 @@ def test_broker(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_broker", "main") + timeline_id = env.create_branch("test_broker", ancestor_branch_name="main") endpoint = env.endpoints.create_start("test_broker") endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") @@ -374,7 +374,7 @@ def test_wal_removal(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_removal") + timeline_id = env.create_branch("test_safekeepers_wal_removal") endpoint = env.endpoints.create_start("test_safekeepers_wal_removal") # Note: it is important to insert at least two segments, as currently @@ -504,7 +504,7 @@ def test_wal_backup(neon_env_builder: NeonEnvBuilder): ) tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_safekeepers_wal_backup") + timeline_id = env.create_branch("test_safekeepers_wal_backup") endpoint = env.endpoints.create_start("test_safekeepers_wal_backup") pg_conn = endpoint.connect() @@ -561,7 +561,7 @@ def test_s3_wal_replay(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_s3_wal_replay") + timeline_id = env.create_branch("test_s3_wal_replay") endpoint = 
env.endpoints.create_start("test_s3_wal_replay") @@ -849,7 +849,7 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_timeline_status") + timeline_id = env.create_branch("test_timeline_status") endpoint = env.endpoints.create_start("test_timeline_status") wa = env.safekeepers[0] @@ -894,6 +894,13 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool): assert debug_dump_0["timelines"][0]["timeline_id"] == str(timeline_id) assert debug_dump_0["timelines"][0]["wal_last_modified"] != "" + # debug dump non existing tenant, should return no timelines. + debug_dump_non_existent = wa_http_cli_debug.debug_dump( + {"tenant_id": "deadbeefdeadbeefdeadbeefdeadbeef"} + ) + log.info(f"debug_dump_non_existend: {debug_dump_non_existent}") + assert len(debug_dump_non_existent["timelines"]) == 0 + endpoint.safe_psql("create table t(i int)") # ensure epoch goes up after reboot @@ -941,7 +948,7 @@ def test_start_replication_term(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_start_replication_term") + timeline_id = env.create_branch("test_start_replication_term") endpoint = env.endpoints.create_start("test_start_replication_term") endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)") @@ -973,7 +980,7 @@ def test_sk_auth(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_sk_auth") + timeline_id = env.create_branch("test_sk_auth") env.endpoints.create_start("test_sk_auth") sk = env.safekeepers[0] @@ -1034,7 +1041,7 @@ def test_restart_endpoint(neon_env_builder: NeonEnvBuilder): neon_env_builder.num_safekeepers = 3 env = neon_env_builder.init_start() - env.neon_cli.create_branch("test_sk_auth_restart_endpoint") + env.create_branch("test_sk_auth_restart_endpoint") endpoint = env.endpoints.create_start("test_sk_auth_restart_endpoint") with closing(endpoint.connect()) as conn: @@ -1118,7 +1125,7 @@ def test_late_init(neon_env_builder: NeonEnvBuilder): sk1.stop() tenant_id = env.initial_tenant - timeline_id = env.neon_cli.create_branch("test_late_init") + timeline_id = env.create_branch("test_late_init") endpoint = env.endpoints.create_start("test_late_init") # create and insert smth while safekeeper is down... endpoint.safe_psql("create table t(key int, value text)") @@ -1254,7 +1261,7 @@ def fill_segment(ep): # create and insert smth while safekeeper is down... 
     sk1.stop()
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_lagging_sk")
+    timeline_id = env.create_branch("test_lagging_sk")
     ep = env.endpoints.create_start("test_lagging_sk")
     ep.safe_psql("create table t(key int, value text)")
     # make small insert to be on the same segment
@@ -1341,7 +1348,7 @@ def test_peer_recovery(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_peer_recovery")
+    timeline_id = env.create_branch("test_peer_recovery")
     endpoint = env.endpoints.create_start("test_peer_recovery")
 
     endpoint.safe_psql("create table t(key int, value text)")
@@ -1405,7 +1412,7 @@ def test_wp_graceful_shutdown(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin):
     env = neon_env_builder.init_start()
 
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_wp_graceful_shutdown")
+    timeline_id = env.create_branch("test_wp_graceful_shutdown")
     ep = env.endpoints.create_start("test_wp_graceful_shutdown")
     ep.safe_psql("create table t(key int, value text)")
     ep.stop()
@@ -1598,7 +1605,7 @@ def show_statuses(safekeepers: List[Safekeeper], tenant_id: TenantId, timeline_i
     neon_env_builder.num_safekeepers = 4
     env = neon_env_builder.init_start()
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_replace_safekeeper")
+    timeline_id = env.create_branch("test_replace_safekeeper")
 
     log.info("Use only first 3 safekeepers")
     env.safekeepers[3].stop()
@@ -1665,12 +1672,12 @@ def test_delete_force(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
 
     # Create two tenants: one will be deleted, other should be preserved.
     tenant_id = env.initial_tenant
-    timeline_id_1 = env.neon_cli.create_branch("br1")  # Active, delete explicitly
-    timeline_id_2 = env.neon_cli.create_branch("br2")  # Inactive, delete explicitly
-    timeline_id_3 = env.neon_cli.create_branch("br3")  # Active, delete with the tenant
-    timeline_id_4 = env.neon_cli.create_branch("br4")  # Inactive, delete with the tenant
+    timeline_id_1 = env.create_branch("br1")  # Active, delete explicitly
+    timeline_id_2 = env.create_branch("br2")  # Inactive, delete explicitly
+    timeline_id_3 = env.create_branch("br3")  # Active, delete with the tenant
+    timeline_id_4 = env.create_branch("br4")  # Inactive, delete with the tenant
 
-    tenant_id_other, timeline_id_other = env.neon_cli.create_tenant()
+    tenant_id_other, timeline_id_other = env.create_tenant()
 
     # Populate branches
     endpoint_1 = env.endpoints.create_start("br1")
@@ -2002,7 +2009,7 @@ def test_idle_reconnections(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
 
     tenant_id = env.initial_tenant
-    timeline_id = env.neon_cli.create_branch("test_idle_reconnections")
+    timeline_id = env.create_branch("test_idle_reconnections")
 
     def collect_stats() -> Dict[str, float]:
         # we need to collect safekeeper_pg_queries_received_total metric from all safekeepers
@@ -2237,7 +2244,7 @@ def test_broker_discovery(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_broker_discovery")
+    env.create_branch("test_broker_discovery")
 
     endpoint = env.endpoints.create_start(
         "test_broker_discovery",
@@ -2318,7 +2325,7 @@ def test_s3_eviction(
 
     # start postgres on each timeline
     endpoints: list[Endpoint] = []
    for branch_name in branch_names:
-        timeline_id = env.neon_cli.create_branch(branch_name)
+        timeline_id = env.create_branch(branch_name)
         timelines.append(timeline_id)
         endpoints.append(env.endpoints.create_start(branch_name))
diff --git a/test_runner/regress/test_wal_acceptor_async.py b/test_runner/regress/test_wal_acceptor_async.py
index 3f0a4a2ff8a6..74d114e976c7 100644
--- a/test_runner/regress/test_wal_acceptor_async.py
+++ b/test_runner/regress/test_wal_acceptor_async.py
@@ -218,7 +218,7 @@ def test_restarts_under_load(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.enable_safekeeper_remote_storage(RemoteStorageKind.LOCAL_FS)
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_safekeepers_restarts_under_load")
+    env.create_branch("test_safekeepers_restarts_under_load")
     # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long
     endpoint = env.endpoints.create_start(
         "test_safekeepers_restarts_under_load", config_lines=["max_replication_write_lag=1MB"]
@@ -234,7 +234,7 @@ def test_restarts_frequent_checkpoints(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_restarts_frequent_checkpoints")
+    env.create_branch("test_restarts_frequent_checkpoints")
     # Enable backpressure with 1MB maximal lag, because we don't want to block on `wait_for_lsn()` for too long
     endpoint = env.endpoints.create_start(
         "test_restarts_frequent_checkpoints",
@@ -325,7 +325,7 @@ def test_compute_restarts(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_compute_restarts")
+    env.create_branch("test_compute_restarts")
 
     asyncio.run(run_compute_restarts(env))
 
@@ -435,7 +435,7 @@ def test_concurrent_computes(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_concurrent_computes")
+    env.create_branch("test_concurrent_computes")
 
     asyncio.run(run_concurrent_computes(env))
 
@@ -484,7 +484,7 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 2
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_safekeepers_unavailability")
+    env.create_branch("test_safekeepers_unavailability")
     endpoint = env.endpoints.create_start("test_safekeepers_unavailability")
 
     asyncio.run(run_unavailability(env, endpoint))
@@ -493,7 +493,7 @@ def test_unavailability(neon_env_builder: NeonEnvBuilder):
 async def run_recovery_uncommitted(env: NeonEnv):
     (sk1, sk2, _) = env.safekeepers
 
-    env.neon_cli.create_branch("test_recovery_uncommitted")
+    env.create_branch("test_recovery_uncommitted")
     ep = env.endpoints.create_start("test_recovery_uncommitted")
     ep.safe_psql("create table t(key int, value text)")
     ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
@@ -589,7 +589,7 @@ def test_wal_truncation(neon_env_builder: NeonEnvBuilder):
 
 
 async def run_segment_init_failure(env: NeonEnv):
-    env.neon_cli.create_branch("test_segment_init_failure")
+    env.create_branch("test_segment_init_failure")
     ep = env.endpoints.create_start("test_segment_init_failure")
     ep.safe_psql("create table t(key int, value text)")
     ep.safe_psql("insert into t select generate_series(1, 100), 'payload'")
@@ -684,7 +684,7 @@ def test_race_conditions(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_safekeepers_race_conditions")
+    env.create_branch("test_safekeepers_race_conditions")
     endpoint = env.endpoints.create_start("test_safekeepers_race_conditions")
 
     asyncio.run(run_race_conditions(env, endpoint))
@@ -761,7 +761,7 @@ def test_wal_lagging(neon_env_builder: NeonEnvBuilder, test_output_dir: Path, bu
     neon_env_builder.num_safekeepers = 3
     env = neon_env_builder.init_start()
 
-    env.neon_cli.create_branch("test_wal_lagging")
+    env.create_branch("test_wal_lagging")
     endpoint = env.endpoints.create_start("test_wal_lagging")
 
     asyncio.run(run_wal_lagging(env, endpoint, test_output_dir))
diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py
index 229d3efd8eef..3c73df68e0d0 100644
--- a/test_runner/regress/test_wal_receiver.py
+++ b/test_runner/regress/test_wal_receiver.py
@@ -14,7 +14,7 @@ def test_pageserver_lsn_wait_error_start(neon_env_builder: NeonEnvBuilder):
     env = neon_env_builder.init_start()
     env.pageserver.http_client()
 
-    tenant_id, timeline_id = env.neon_cli.create_tenant()
+    tenant_id, timeline_id = env.create_tenant()
 
     expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
     env.pageserver.allowed_errors.append(f".*{expected_timeout_error}.*")
@@ -57,7 +57,7 @@ def customize_pageserver_toml(ps_cfg: Dict[str, Any]):
     env = neon_env_builder.init_start()
     env.pageserver.http_client()
 
-    tenant_id, timeline_id = env.neon_cli.create_tenant()
+    tenant_id, timeline_id = env.create_tenant()
     elements_to_insert = 1_000_000
 
     expected_timeout_error = f"Timed out while waiting for WAL record at LSN {future_lsn} to arrive"
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index 01a1d5cf55b4..46366f0e2cee 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -38,7 +38,7 @@ def test_wal_restore(
     pg_distrib_dir: Path,
 ):
     env = neon_env_builder.init_start()
-    env.neon_cli.create_branch("test_wal_restore")
+    env.create_branch("test_wal_restore")
     endpoint = env.endpoints.create_start("test_wal_restore")
     endpoint.safe_psql("create table t as select generate_series(1,300000)")
     tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
diff --git a/test_runner/regress/test_walredo_not_left_behind_on_detach.py b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
index 375cfcb4feb0..ae8e276a1a41 100644
--- a/test_runner/regress/test_walredo_not_left_behind_on_detach.py
+++ b/test_runner/regress/test_walredo_not_left_behind_on_detach.py
@@ -40,7 +40,7 @@ def test_walredo_not_left_behind_on_detach(neon_env_builder: NeonEnvBuilder):
         pageserver_http.tenant_status(tenant_id)
 
     # create new nenant
-    tenant_id, _ = env.neon_cli.create_tenant()
+    tenant_id, _ = env.create_tenant()
 
     # assert tenant exists on disk
     assert (env.pageserver.tenant_dir(tenant_id)).exists()
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 16b0fe82aec3..ddfc14ce1b75 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -17,15 +17,9 @@ license.workspace = true
 [dependencies]
 ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
-aws-config = { version = "1", default-features = false, features = ["rustls", "sso"] }
-aws-runtime = { version = "1", default-features = false, features = ["event-stream", "http-02x", "sigv4a"] }
-aws-sigv4 = { version = "1", features = ["http0-compat", "sign-eventstream", "sigv4a"] }
-aws-smithy-async = { version = "1", default-features = false, features = ["rt-tokio"] }
-aws-smithy-http = { version = "0.60", default-features = false, features = ["event-stream"] }
-aws-smithy-types = { version = "1", default-features = false, features = ["byte-stream-poll-next", "http-body-0-4-x", "http-body-1-x", "rt-tokio", "test-util"] }
-axum = { version = "0.6", features = ["ws"] }
 base64 = { version = "0.21", features = ["alloc"] }
 base64ct = { version = "1", default-features = false, features = ["std"] }
+bitflags = { version = "2", default-features = false, features = ["std"] }
 bytes = { version = "1", features = ["serde"] }
 camino = { version = "1", default-features = false, features = ["serde1"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }
@@ -48,7 +42,9 @@ half = { version = "2", default-features = false, features = ["num-traits"] }
 hashbrown = { version = "0.14", features = ["raw"] }
 hex = { version = "0.4", features = ["serde"] }
 hmac = { version = "0.12", default-features = false, features = ["reset"] }
-hyper = { version = "0.14", features = ["full"] }
+hyper-582f2526e08bb6a0 = { package = "hyper", version = "0.14", features = ["full"] }
+hyper-dff4ba8e3ae991db = { package = "hyper", version = "1", features = ["http1", "http2", "server"] }
+hyper-util = { version = "0.1", features = ["http1", "http2", "server", "tokio"] }
 indexmap = { version = "1", default-features = false, features = ["std"] }
 itertools-5ef9efb8ec2df382 = { package = "itertools", version = "0.12" }
 itertools-93f6ce9d446188ac = { package = "itertools", version = "0.10" }
@@ -88,7 +84,6 @@ tower = { version = "0.4", default-features = false, features = ["balance", "buf
 tracing = { version = "0.1", features = ["log"] }
 tracing-core = { version = "0.1" }
 url = { version = "2", features = ["serde"] }
-uuid = { version = "1", features = ["serde", "v4", "v7"] }
 zeroize = { version = "1", features = ["derive", "serde"] }
 zstd = { version = "0.13" }
 zstd-safe = { version = "7", default-features = false, features = ["arrays", "legacy", "std", "zdict_builder"] }
@@ -97,6 +92,7 @@ zstd-sys = { version = "2", default-features = false, features = ["legacy", "std
 [build-dependencies]
 ahash = { version = "0.8" }
 anyhow = { version = "1", features = ["backtrace"] }
+bitflags = { version = "2", default-features = false, features = ["std"] }
 bytes = { version = "1", features = ["serde"] }
 cc = { version = "1", default-features = false, features = ["parallel"] }
 chrono = { version = "0.4", default-features = false, features = ["clock", "serde", "wasmbind"] }