Skip to content

Commit

Permalink
fix(drive): uncommitted state if db transaction fails (#2305)
Browse files Browse the repository at this point in the history
  • Loading branch information
shumkov authored Nov 4, 2024
1 parent feacde2 commit dacc6db
Show file tree
Hide file tree
Showing 4 changed files with 157 additions and 7 deletions.
26 changes: 25 additions & 1 deletion packages/rs-drive-abci/src/abci/handler/finalize_block.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ use crate::execution::types::block_execution_context::v0::BlockExecutionContextV
use crate::platform_types::cleaned_abci_messages::finalized_block_cleaned_request::v0::FinalizeBlockCleanedRequest;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use std::sync::atomic::Ordering;
use tenderdash_abci::proto::abci as proto;

Expand Down Expand Up @@ -66,7 +67,30 @@ where
));
}

app.commit_transaction(platform_version)?;
let result = app.commit_transaction(platform_version);

// We had a sequence of errors on mainnet starting at block 32326.
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
// validators just proceeded to the next block, partially committing the state and updating the cache.
// Full nodes got stuck and proceeded after a re-sync.
// For the mainnet chain, we enable these fixes from the block at which we consider the state consistent.
let config = &app.platform().config;

if app.platform().config.network == Network::Dash
&& config.abci.chain_id == "evo1"
&& block_height < 33000
{
// Old behavior on mainnet below block 33000
result?;
} else {
// If the transaction commit failed, we still have caches in memory that
// correspond to the data that we weren't able to commit.
// The simple solution is to restart Drive, so that all caches
// are restored from disk, and then try to process this block again.
// TODO: We need better handling of the "transaction is busy" error, with retry logic.
result.expect("commit transaction");
}

app.platform()
.committed_block_height_guard
Expand Down
52 changes: 46 additions & 6 deletions packages/rs-drive-abci/src/abci/handler/info.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ use crate::abci::AbciError;
use crate::error::Error;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use dpp::version::DESIRED_PLATFORM_VERSION;
use tenderdash_abci::proto::abci as proto;

Expand All @@ -21,28 +22,67 @@ where

let platform_state = app.platform().state.load();

let state_app_hash = platform_state
let last_block_height = platform_state.last_committed_block_height() as i64;

// Verify that Platform State corresponds to Drive committed state
let platform_state_app_hash = platform_state
.last_committed_block_app_hash()
.map(|app_hash| app_hash.to_vec())
.unwrap_or_default();

let grove_version = &platform_state
.current_platform_version()?
.drive
.grove_version;

let drive_storage_root_hash = app
.platform()
.drive
.grove
.root_hash(None, grove_version)
.unwrap()?;

// We had a sequence of errors on mainnet starting at block 32326.
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
// validators just proceeded to the next block, partially committing the state and updating the cache.
// Full nodes got stuck and proceeded after a re-sync.
// For the mainnet chain, we enable these fixes from the block at which we consider the state consistent.
let config = &app.platform().config;

#[allow(clippy::collapsible_if)]
if !(config.network == Network::Dash
&& config.abci.chain_id == "evo1"
&& last_block_height < 33000)
{
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart Drive, so the node might self-heal
// by reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
}

let desired_protocol_version = DESIRED_PLATFORM_VERSION.protocol_version;

let response = proto::ResponseInfo {
data: "".to_string(),
app_version: desired_protocol_version as u64,
last_block_height: platform_state.last_committed_block_height() as i64,
last_block_height,
version: env!("CARGO_PKG_VERSION").to_string(),
last_block_app_hash: state_app_hash.clone(),
last_block_app_hash: platform_state_app_hash.to_vec(),
};

tracing::debug!(
desired_protocol_version,
software_version = env!("CARGO_PKG_VERSION"),
block_version = request.block_version,
p2p_version = request.p2p_version,
app_hash = hex::encode(state_app_hash),
height = platform_state.last_committed_block_height(),
app_hash = hex::encode(platform_state_app_hash),
last_block_height,
"Handshake with consensus engine",
);

Expand Down
43 changes: 43 additions & 0 deletions packages/rs-drive-abci/src/abci/handler/prepare_proposal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::hashes::Hash;
use dpp::dashcore::Network;
use dpp::version::TryIntoPlatformVersioned;
use drive::grovedb_storage::Error::RocksDBError;
use tenderdash_abci::proto::abci as proto;
Expand All @@ -35,6 +36,48 @@ where

let platform_state = app.platform().state.load();

// Verify that Platform State corresponds to Drive committed state
let platform_state_app_hash = platform_state
.last_committed_block_app_hash()
.unwrap_or_default();

let grove_version = &platform_state
.current_platform_version()?
.drive
.grove_version;

let drive_storage_root_hash = app
.platform()
.drive
.grove
.root_hash(None, grove_version)
.unwrap()?;

// We had a sequence of errors on mainnet starting at block 32326.
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
// validators just proceeded to the next block, partially committing the state and updating the cache.
// Full nodes got stuck and proceeded after a re-sync.
// For the mainnet chain, we enable these fixes from the block at which we consider the state consistent.
let config = &app.platform().config;

#[allow(clippy::collapsible_if)]
if !(config.network == Network::Dash
&& config.abci.chain_id == "evo1"
&& request.height < 33000)
{
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart Drive, so the node might self-heal
// by reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
}

let last_committed_core_height = platform_state.last_committed_core_height();

let starting_platform_version = platform_state.current_platform_version()?;
Expand Down
43 changes: 43 additions & 0 deletions packages/rs-drive-abci/src/abci/handler/process_proposal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ use crate::platform_types::block_execution_outcome;
use crate::platform_types::platform_state::v0::PlatformStateV0Methods;
use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult;
use crate::rpc::core::CoreRPCLike;
use dpp::dashcore::Network;
use dpp::version::TryIntoPlatformVersioned;
use drive::grovedb_storage::Error::RocksDBError;
use tenderdash_abci::proto::abci as proto;
Expand Down Expand Up @@ -179,6 +180,48 @@ where

let platform_state = app.platform().state.load();

// Verify that Platform State corresponds to Drive committed state
let platform_state_app_hash = platform_state
.last_committed_block_app_hash()
.unwrap_or_default();

let grove_version = &platform_state
.current_platform_version()?
.drive
.grove_version;

let drive_storage_root_hash = app
.platform()
.drive
.grove
.root_hash(None, grove_version)
.unwrap()?;

// We had a sequence of errors on mainnet starting at block 32326.
// We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309).
// Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966),
// validators just proceeded to the next block, partially committing the state and updating the cache.
// Full nodes got stuck and proceeded after a re-sync.
// For the mainnet chain, we enable these fixes from the block at which we consider the state consistent.
let config = &app.platform().config;

#[allow(clippy::collapsible_if)]
if !(app.platform().config.network == Network::Dash
&& config.abci.chain_id == "evo1"
&& request.height < 33000)
{
// App hash in memory must be equal to app hash on disk
if drive_storage_root_hash != platform_state_app_hash {
// We panic because we can't recover from this situation.
// Better to restart Drive, so the node might self-heal
// by reloading state from the disk
panic!(
"drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}",
drive_storage_root_hash, platform_state_app_hash
);
}
}

let starting_platform_version = platform_state.current_platform_version()?;

// Running the proposal executes all the state transitions for the block
Expand Down

0 comments on commit dacc6db

Please sign in to comment.