diff --git a/packages/rs-drive-abci/src/abci/handler/finalize_block.rs b/packages/rs-drive-abci/src/abci/handler/finalize_block.rs index 9653391c7d..852f85cc6b 100644 --- a/packages/rs-drive-abci/src/abci/handler/finalize_block.rs +++ b/packages/rs-drive-abci/src/abci/handler/finalize_block.rs @@ -5,6 +5,7 @@ use crate::execution::types::block_execution_context::v0::BlockExecutionContextV use crate::platform_types::cleaned_abci_messages::finalized_block_cleaned_request::v0::FinalizeBlockCleanedRequest; use crate::platform_types::platform_state::v0::PlatformStateV0Methods; use crate::rpc::core::CoreRPCLike; +use dpp::dashcore::Network; use std::sync::atomic::Ordering; use tenderdash_abci::proto::abci as proto; @@ -66,7 +67,30 @@ where )); } - app.commit_transaction(platform_version)?; + let result = app.commit_transaction(platform_version); + + // We had a sequence of errors on the mainnet started since block 32326. + // We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309). + // Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966), + // validators just proceeded to the next block partially committing the state and updating the cache. + // Full nodes are stuck and proceeded after re-sync. + // For the mainnet chain, we enable these fixes at the block when we consider the state is consistent. + let config = &app.platform().config; + + if app.platform().config.network == Network::Dash + && config.abci.chain_id == "evo1" + && block_height < 33000 + { + // Old behavior on mainnet below block 33000 + result?; + } else { + // In case if transaction commit failed we still have caches in memory that + // corresponds to the data that we weren't able to commit. + // The simplified solution is to restart the Drive, so all caches + // will be restored from the disk and try to process this block again. + // TODO: We need a better handling of the transaction is busy error with retry logic. + result.expect("commit transaction"); + } app.platform() .committed_block_height_guard diff --git a/packages/rs-drive-abci/src/abci/handler/info.rs b/packages/rs-drive-abci/src/abci/handler/info.rs index dbb8501891..9ac9d31626 100644 --- a/packages/rs-drive-abci/src/abci/handler/info.rs +++ b/packages/rs-drive-abci/src/abci/handler/info.rs @@ -3,6 +3,7 @@ use crate::abci::AbciError; use crate::error::Error; use crate::platform_types::platform_state::v0::PlatformStateV0Methods; use crate::rpc::core::CoreRPCLike; +use dpp::dashcore::Network; use dpp::version::DESIRED_PLATFORM_VERSION; use tenderdash_abci::proto::abci as proto; @@ -21,19 +22,58 @@ where let platform_state = app.platform().state.load(); - let state_app_hash = platform_state + let last_block_height = platform_state.last_committed_block_height() as i64; + + // Verify that Platform State corresponds to Drive commited state + let platform_state_app_hash = platform_state .last_committed_block_app_hash() - .map(|app_hash| app_hash.to_vec()) .unwrap_or_default(); + let grove_version = &platform_state + .current_platform_version()? + .drive + .grove_version; + + let drive_storage_root_hash = app + .platform() + .drive + .grove + .root_hash(None, grove_version) + .unwrap()?; + + // We had a sequence of errors on the mainnet started since block 32326. + // We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309). + // Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966), + // validators just proceeded to the next block partially committing the state and updating the cache. + // Full nodes are stuck and proceeded after re-sync. + // For the mainnet chain, we enable these fixes at the block when we consider the state is consistent. + let config = &app.platform().config; + + #[allow(clippy::collapsible_if)] + if !(config.network == Network::Dash + && config.abci.chain_id == "evo1" + && last_block_height < 33000) + { + // App hash in memory must be equal to app hash on disk + if drive_storage_root_hash != platform_state_app_hash { + // We panic because we can't recover from this situation. + // Better to restart the Drive, so we might self-heal the node + // reloading state form the disk + panic!( + "drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}", + drive_storage_root_hash, platform_state_app_hash + ); + } + } + let desired_protocol_version = DESIRED_PLATFORM_VERSION.protocol_version; let response = proto::ResponseInfo { data: "".to_string(), app_version: desired_protocol_version as u64, - last_block_height: platform_state.last_committed_block_height() as i64, + last_block_height, version: env!("CARGO_PKG_VERSION").to_string(), - last_block_app_hash: state_app_hash.clone(), + last_block_app_hash: platform_state_app_hash.to_vec(), }; tracing::debug!( @@ -41,8 +81,8 @@ where software_version = env!("CARGO_PKG_VERSION"), block_version = request.block_version, p2p_version = request.p2p_version, - app_hash = hex::encode(state_app_hash), - height = platform_state.last_committed_block_height(), + app_hash = hex::encode(platform_state_app_hash), + last_block_height, "Handshake with consensus engine", ); diff --git a/packages/rs-drive-abci/src/abci/handler/prepare_proposal.rs b/packages/rs-drive-abci/src/abci/handler/prepare_proposal.rs index 18252d0d45..61f58a0196 100644 --- a/packages/rs-drive-abci/src/abci/handler/prepare_proposal.rs +++ b/packages/rs-drive-abci/src/abci/handler/prepare_proposal.rs @@ -11,6 +11,7 @@ use crate::platform_types::platform_state::v0::PlatformStateV0Methods; use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult; use crate::rpc::core::CoreRPCLike; use dpp::dashcore::hashes::Hash; +use dpp::dashcore::Network; use dpp::version::TryIntoPlatformVersioned; use drive::grovedb_storage::Error::RocksDBError; use tenderdash_abci::proto::abci as proto; @@ -35,6 +36,48 @@ where let platform_state = app.platform().state.load(); + // Verify that Platform State corresponds to Drive commited state + let platform_state_app_hash = platform_state + .last_committed_block_app_hash() + .unwrap_or_default(); + + let grove_version = &platform_state + .current_platform_version()? + .drive + .grove_version; + + let drive_storage_root_hash = app + .platform() + .drive + .grove + .root_hash(None, grove_version) + .unwrap()?; + + // We had a sequence of errors on the mainnet started since block 32326. + // We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309). + // Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966), + // validators just proceeded to the next block partially committing the state and updating the cache. + // Full nodes are stuck and proceeded after re-sync. + // For the mainnet chain, we enable these fixes at the block when we consider the state is consistent. + let config = &app.platform().config; + + #[allow(clippy::collapsible_if)] + if !(config.network == Network::Dash + && config.abci.chain_id == "evo1" + && request.height < 33000) + { + // App hash in memory must be equal to app hash on disk + if drive_storage_root_hash != platform_state_app_hash { + // We panic because we can't recover from this situation. + // Better to restart the Drive, so we might self-heal the node + // reloading state form the disk + panic!( + "drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}", + drive_storage_root_hash, platform_state_app_hash + ); + } + } + let last_committed_core_height = platform_state.last_committed_core_height(); let starting_platform_version = platform_state.current_platform_version()?; diff --git a/packages/rs-drive-abci/src/abci/handler/process_proposal.rs b/packages/rs-drive-abci/src/abci/handler/process_proposal.rs index 5bf547e14a..d40567d3db 100644 --- a/packages/rs-drive-abci/src/abci/handler/process_proposal.rs +++ b/packages/rs-drive-abci/src/abci/handler/process_proposal.rs @@ -12,6 +12,7 @@ use crate::platform_types::block_execution_outcome; use crate::platform_types::platform_state::v0::PlatformStateV0Methods; use crate::platform_types::state_transitions_processing_result::StateTransitionExecutionResult; use crate::rpc::core::CoreRPCLike; +use dpp::dashcore::Network; use dpp::version::TryIntoPlatformVersioned; use drive::grovedb_storage::Error::RocksDBError; use tenderdash_abci::proto::abci as proto; @@ -179,6 +180,48 @@ where let platform_state = app.platform().state.load(); + // Verify that Platform State corresponds to Drive commited state + let platform_state_app_hash = platform_state + .last_committed_block_app_hash() + .unwrap_or_default(); + + let grove_version = &platform_state + .current_platform_version()? + .drive + .grove_version; + + let drive_storage_root_hash = app + .platform() + .drive + .grove + .root_hash(None, grove_version) + .unwrap()?; + + // We had a sequence of errors on the mainnet started since block 32326. + // We got RocksDB's "transaction is busy" error because of a bug (https://github.com/dashpay/platform/pull/2309). + // Due to another bug in Tenderdash (https://github.com/dashpay/tenderdash/pull/966), + // validators just proceeded to the next block partially committing the state and updating the cache. + // Full nodes are stuck and proceeded after re-sync. + // For the mainnet chain, we enable these fixes at the block when we consider the state is consistent. + let config = &app.platform().config; + + #[allow(clippy::collapsible_if)] + if !(app.platform().config.network == Network::Dash + && config.abci.chain_id == "evo1" + && request.height < 33000) + { + // App hash in memory must be equal to app hash on disk + if drive_storage_root_hash != platform_state_app_hash { + // We panic because we can't recover from this situation. + // Better to restart the Drive, so we might self-heal the node + // reloading state form the disk + panic!( + "drive and platform state app hash mismatch: drive_storage_root_hash: {:?}, platform_state_app_hash: {:?}", + drive_storage_root_hash, platform_state_app_hash + ); + } + } + let starting_platform_version = platform_state.current_platform_version()?; // Running the proposal executes all the state transitions for the block