Skip to content

Commit

Permalink
add replay verify scheduler and fix a minor bug in replay-on-archive
Browse files Browse the repository at this point in the history
  • Loading branch information
areshand committed Nov 12, 2024
1 parent 0aaa3c6 commit 8cefc2e
Show file tree
Hide file tree
Showing 10 changed files with 1,708 additions and 1 deletion.
6 changes: 6 additions & 0 deletions storage/backup/backup-cli/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,8 @@ pub struct RocksdbOpt {
index_db_max_total_wal_size: u64,
#[clap(long, hide(true), default_value_t = 16)]
max_background_jobs: i32,
#[clap(long, hide(true), default_value_t = 1024)]
block_cache_size: u64,
}

impl From<RocksdbOpt> for RocksdbConfigs {
Expand All @@ -96,25 +98,29 @@ impl From<RocksdbOpt> for RocksdbConfigs {
max_open_files: opt.ledger_db_max_open_files,
max_total_wal_size: opt.ledger_db_max_total_wal_size,
max_background_jobs: opt.max_background_jobs,
block_cache_size: opt.block_cache_size,
..Default::default()
},
state_merkle_db_config: RocksdbConfig {
max_open_files: opt.state_merkle_db_max_open_files,
max_total_wal_size: opt.state_merkle_db_max_total_wal_size,
max_background_jobs: opt.max_background_jobs,
block_cache_size: opt.block_cache_size,
..Default::default()
},
enable_storage_sharding: opt.enable_storage_sharding,
state_kv_db_config: RocksdbConfig {
max_open_files: opt.state_kv_db_max_open_files,
max_total_wal_size: opt.state_kv_db_max_total_wal_size,
max_background_jobs: opt.max_background_jobs,
block_cache_size: opt.block_cache_size,
..Default::default()
},
index_db_config: RocksdbConfig {
max_open_files: opt.index_db_max_open_files,
max_total_wal_size: opt.index_db_max_total_wal_size,
max_background_jobs: opt.max_background_jobs,
block_cache_size: opt.block_cache_size,
..Default::default()
},
}
Expand Down
23 changes: 22 additions & 1 deletion storage/db-tool/src/replay_on_archive.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
// Parts of the project are originally copyright © Meta Platforms, Inc.
// SPDX-License-Identifier: Apache-2.0

use anyhow::{bail, Error, Result};
use anyhow::{bail, Error, Ok, Result};
use aptos_backup_cli::utils::{ReplayConcurrencyLevelOpt, RocksdbOpt};
use aptos_config::config::{
StorageDirPaths, BUFFERED_STATE_TARGET_ITEMS, DEFAULT_MAX_NUM_NODES_PER_LRU_CACHE_SHARD,
Expand Down Expand Up @@ -62,6 +62,12 @@ pub struct Opt {

#[clap(long, default_value = "1", help = "The number of concurrent replays")]
pub concurrent_replay: usize,

#[clap(
long,
help = "The maximum time in seconds to wait for each transaction replay"
)]
pub timeout_secs: Option<u64>,
}

impl Opt {
Expand Down Expand Up @@ -102,6 +108,10 @@ impl ReplayTps {
cnt, elapsed, tps
);
}

/// Returns the number of seconds elapsed according to `self.timer`,
/// as reported by `elapsed().as_secs()`.
pub fn get_elapsed_secs(&self) -> u64 {
    let elapsed = self.timer.elapsed();
    elapsed.as_secs()
}
}

struct Verifier {
Expand All @@ -113,6 +123,7 @@ struct Verifier {
chunk_size: usize,
concurrent_replay: usize,
replay_stat: ReplayTps,
timeout_secs: Option<u64>,
}

impl Verifier {
Expand Down Expand Up @@ -148,6 +159,7 @@ impl Verifier {
chunk_size: config.chunk_size,
concurrent_replay: config.concurrent_replay,
replay_stat: ReplayTps::new(),
timeout_secs: config.timeout_secs,
})
}

Expand Down Expand Up @@ -211,6 +223,12 @@ impl Verifier {
self.replay_stat.update_cnt(cur_txns.len() as u64);
self.replay_stat.print_tps();

if let Some(duration) = self.timeout_secs {
if self.replay_stat.get_elapsed_secs() >= duration {
return Ok(total_failed_txns);
}
}

// empty for the new chunk
chunk_start_version = start + (idx as u64) + 1;
cur_txns.clear();
Expand Down Expand Up @@ -270,6 +288,9 @@ impl Verifier {
expected_epoch_events: &Vec<Vec<ContractEvent>>,
expected_epoch_writesets: &Vec<WriteSet>,
) -> Result<Vec<Error>> {
if cur_txns.is_empty() {
return Ok(Vec::new());
}
let executed_outputs = AptosVMBlockExecutor::new().execute_block_no_limit(
cur_txns
.iter()
Expand Down
12 changes: 12 additions & 0 deletions testsuite/replay-verify/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
This script orchestrates the replay and verification of blockchain data using Kubernetes pods. It defines a WorkerPod class to manage individual pods, handling their status, logs, and environment variables. The ReplayScheduler class schedules tasks for these pods, ensuring they run sequentially while managing retries, logging, and error handling. It supports scheduling from specific blockchain versions, skipping defined ranges, and collecting logs from failed or mismatched transactions. The script uses the Kubernetes API for pod management and includes configurable hyperparameters for sharding, retries, concurrency, and delays. The main function initializes the scheduler and starts the scheduling process from scratch.

## Prerequisites
Install minikube

## Local test
minikube start --mount --mount-string="/mnt/testnet_archive:/mnt/testnet_archive" --memory=81920 --cpus=17
minikube kubectl -- apply -f ./testnet-archive.yaml

poetry shell
poetry install # install kubernetes
poetry run
Loading

0 comments on commit 8cefc2e

Please sign in to comment.