Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(drive): uncommitted state if db transaction fails #2305

Merged
merged 16 commits into from
Nov 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 25 additions & 1 deletion packages/rs-drive-abci/src/abci/handler/finalize_block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use crate::execution::types::block_execution_context::v0::BlockExecutionContextV
use crate::platform_types::cleaned_abci_messages::finalized_block_cleaned_request::v0::FinalizeBlockCleanedRequest;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use std::sync::atomic::Ordering;
use tenderdash_abci::proto::abci as proto;

Expand Down Expand Up @@ -66,7 +67,30 @@ where
));
}

app.commit_transaction(platform_version)?;
let result = app.commit_transaction(platform_version);

// We had a sequence of errors on mainnet starting at block 32326.
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
// validators just proceeded to the next block, partially committing the state and updating the cache.
// Full nodes got stuck and proceeded only after a re-sync.
// For the mainnet chain, we enable these fixes starting from the block at which we consider the state consistent.
let config = &app.platform().config;

if app.platform().config.network == Network::Dash
&& config.abci.chain_id == "evo1"
&& block_height < 33000
{
// Old behavior on mainnet below block 33000
result?;
} else {
// If the transaction commit fails, we still have caches in memory that
// correspond to the data that we weren't able to commit.
// The simplest solution is to restart Drive, so that all caches
// are restored from disk, and then try to process this block again.
// TODO: We need better handling of the "transaction is busy" error, with retry logic.
result.expect("commit transaction");
}

app.platform()
.committed_block_height_guard
Expand Down
52 changes: 46 additions & 6 deletions packages/rs-drive-abci/src/abci/handler/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use crate::abci::AbciError;
use crate::error::Error;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use dpp::version::DESIRED_PLATFORM_VERSION;
use tenderdash_abci::proto::abci as proto;

Expand All @@ -21,28 +22,67 @@ where

let platform_state = app.platform().state.load();

let state_app_hash = platform_state
let last_block_height = platform_state.last_committed_block_height() as i64;

// Verify that Platform State corresponds to Drive committed state
let platform_state_app_hash = platform_state
.last_committed_block_app_hash()
.map(|app_hash| app_hash.to_vec())
.unwrap_or_default();

let grove_version = &platform_state
.current_platform_version()?
.drive
.grove_version;

let drive_storage_root_hash = app
.platform()
.drive
.grove
.root_hash(None, grove_version)
.unwrap()?;

// We had a sequence of errors on mainnet starting at block 32326.
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
// validators just proceeded to the next block, partially committing the state and updating the cache.
// Full nodes got stuck and proceeded only after a re-sync.
// For the mainnet chain, we enable these fixes starting from the block at which we consider the state consistent.
let config = &app.platform().config;

#[allow(clippy::collapsible_if)]
if !(config.network == Network::Dash
&& config.abci.chain_id == "evo1"
&& last_block_height < 33000)
{
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart the Drive, so we might self-heal the node by
// reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
}

let desired_protocol_version = DESIRED_PLATFORM_VERSION.protocol_version;

let response = proto::ResponseInfo {
data: "".to_string(),
app_version: desired_protocol_version as u64,
last_block_height: platform_state.last_committed_block_height() as i64,
last_block_height,
version: env!("CARGO_PKG_VERSION").to_string(),
last_block_app_hash: state_app_hash.clone(),
last_block_app_hash: platform_state_app_hash.to_vec(),
};

tracing::debug!(
desired_protocol_version,
software_version = env!("CARGO_PKG_VERSION"),
block_version = request.block_version,
p2p_version = request.p2p_version,
app_hash = hex::encode(state_app_hash),
height = platform_state.last_committed_block_height(),
app_hash = hex::encode(platform_state_app_hash),
last_block_height,
"Handshake with consensus engine",
);

Expand Down
43 changes: 43 additions & 0 deletions packages/rs-drive-abci/src/abci/handler/prepare_proposal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::hashes::Hash;
use dpp::dashcore::Network;
use dpp::version::TryIntoPlatformVersioned;
use drive::grovedb_storage::Error::RocksDBError;
use tenderdash_abci::proto::abci as proto;
Expand All @@ -35,6 +36,48 @@ where

let platform_state = app.platform().state.load();

// Verify that Platform State corresponds to Drive committed state
let platform_state_app_hash = platform_state
.last_committed_block_app_hash()
.unwrap_or_default();

let grove_version = &platform_state
.current_platform_version()?
.drive
.grove_version;

let drive_storage_root_hash = app
.platform()
.drive
.grove
.root_hash(None, grove_version)
.unwrap()?;

// We had a sequence of errors on mainnet starting at block 32326.
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
// validators just proceeded to the next block, partially committing the state and updating the cache.
// Full nodes got stuck and proceeded only after a re-sync.
// For the mainnet chain, we enable these fixes starting from the block at which we consider the state consistent.
let config = &app.platform().config;

#[allow(clippy::collapsible_if)]
if !(config.network == Network::Dash
&& config.abci.chain_id == "evo1"
&& request.height < 33000)
{
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart the Drive, so we might self-heal the node by
// reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
}

let last_committed_core_height = platform_state.last_committed_core_height();

let starting_platform_version = platform_state.current_platform_version()?;
Expand Down
43 changes: 43 additions & 0 deletions packages/rs-drive-abci/src/abci/handler/process_proposal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use crate::platform_types::block_execution_outcome;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use dpp::version::TryIntoPlatformVersioned;
use drive::grovedb_storage::Error::RocksDBError;
use tenderdash_abci::proto::abci as proto;
Expand Down Expand Up @@ -179,6 +180,48 @@ where

let platform_state = app.platform().state.load();

// Verify that Platform State corresponds to Drive committed state
let platform_state_app_hash = platform_state
.last_committed_block_app_hash()
.unwrap_or_default();

let grove_version = &platform_state
.current_platform_version()?
.drive
.grove_version;

let drive_storage_root_hash = app
.platform()
.drive
.grove
.root_hash(None, grove_version)
.unwrap()?;
shumkov marked this conversation as resolved.
Show resolved Hide resolved

// We had a sequence of errors on mainnet starting at block 32326.
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
// validators just proceeded to the next block, partially committing the state and updating the cache.
// Full nodes got stuck and proceeded only after a re-sync.
// For the mainnet chain, we enable these fixes starting from the block at which we consider the state consistent.
let config = &app.platform().config;

#[allow(clippy::collapsible_if)]
if !(app.platform().config.network == Network::Dash
&& config.abci.chain_id == "evo1"
&& request.height < 33000)
{
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart the Drive, so we might self-heal the node by
// reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
shumkov marked this conversation as resolved.
Show resolved Hide resolved
}

let starting_platform_version = platform_state.current_platform_version()?;

// Running the proposal executes all the state transitions for the block
Expand Down
Loading