Skip to content

Commit

Permalink
blockstore: atomize slot clearing, relax parent slot meta check
Browse files Browse the repository at this point in the history
clear_unconfirmed_slot can leave blockstore in an irrecoverable state
if it panics in the middle. write batch this function, so that any
errors can be recovered after restart.

additionally relax the constraint that the parent slot meta must exist,
as it could have been cleaned up if outdated.
  • Loading branch information
AshwinSekar committed Feb 7, 2024
1 parent bc735fa commit ba189b1
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 25 deletions.
29 changes: 4 additions & 25 deletions ledger/src/blockstore.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1169,35 +1169,14 @@ impl Blockstore {
/// family.
pub fn clear_unconfirmed_slot(&self, slot: Slot) {
let _lock = self.insert_shreds_lock.lock().unwrap();
if let Some(mut slot_meta) = self
if let Some(slot_meta) = self
.meta(slot)
.expect("Couldn't fetch from SlotMeta column family")
{
// Clear all slot related information
self.run_purge(slot, slot, PurgeType::Exact)
// Clear all slot related information, and cleanup slot meta by removing
// `slot` from parents `next_slots`, but retaining `slot`'s `next_slots`.
self.run_purge_and_cleanup_slot_meta(slot, slot_meta)
.expect("Purge database operations failed");

// Clear this slot as a next slot from parent
if let Some(parent_slot) = slot_meta.parent_slot {
let mut parent_slot_meta = self
.meta(parent_slot)
.expect("Couldn't fetch from SlotMeta column family")
.expect("Unconfirmed slot should have had parent slot set");
// .retain() is a linear scan; however, next_slots should
// only contain several elements so this isn't so bad
parent_slot_meta
.next_slots
.retain(|&next_slot| next_slot != slot);
self.meta_cf
.put(parent_slot, &parent_slot_meta)
.expect("Couldn't insert into SlotMeta column family");
}
// Reinsert parts of `slot_meta` that are important to retain, like the `next_slots`
// field.
slot_meta.clear_unconfirmed_slot();
self.meta_cf
.put(slot, &slot_meta)
.expect("Couldn't insert into SlotMeta column family");
} else {
error!(
"clear_unconfirmed_slot() called on slot {} with no SlotMeta",
Expand Down
72 changes: 72 additions & 0 deletions ledger/src/blockstore/blockstore_purge.rs
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,7 @@ impl Blockstore {
}
}

#[cfg(test)]
pub(crate) fn run_purge(
&self,
from_slot: Slot,
Expand All @@ -155,6 +156,48 @@ impl Blockstore {
to_slot: Slot,
purge_type: PurgeType,
purge_stats: &mut PurgeStats,
) -> Result<bool> {
self.run_purge_with_stats_and_maybe_cleanup_slot_meta(
from_slot,
to_slot,
purge_type,
purge_stats,
None,
)
}

pub(crate) fn run_purge_and_cleanup_slot_meta(
&self,
slot: Slot,
slot_meta: SlotMeta,
) -> Result<bool> {
self.run_purge_with_stats_and_maybe_cleanup_slot_meta(
slot,
slot,
PurgeType::Exact,
&mut PurgeStats::default(),
Some(slot_meta),
)
}

/// A helper function to `purge_slots` that executes the ledger clean up.
/// The cleanup applies to \[`from_slot`, `to_slot`\].
///
/// When `from_slot` is 0, any sst-file with a key-range completely older
/// than `to_slot` will also be deleted.
///
/// If `slot_meta` is specified for some `child_slot`, we require that
/// `child_slot == from_slot == to_slot` - we are purging only one slot.
/// In this case along with the purge we remove `child_slot` from its
/// `parent_slot_meta.next_slots` as well as reinsert an orphaned `slot_meta`
/// for `child_slot` that only retains the `next_slots` value.
pub(crate) fn run_purge_with_stats_and_maybe_cleanup_slot_meta(
&self,
from_slot: Slot,
to_slot: Slot,
purge_type: PurgeType,
purge_stats: &mut PurgeStats,
slot_meta: Option<SlotMeta>,
) -> Result<bool> {
let mut write_batch = self
.db
Expand Down Expand Up @@ -239,6 +282,35 @@ impl Blockstore {
}
delete_range_timer.stop();

if let Some(mut slot_meta) = slot_meta {
let child_slot = slot_meta.slot;
if child_slot != from_slot || child_slot != to_slot {
error!("Slot meta parent cleanup was requested for {}, but a range was specified {} {}", child_slot, from_slot, to_slot);
return Err(BlockstoreError::InvalidRangeForSlotMetaCleanup);
}

if let Some(parent_slot) = slot_meta.parent_slot {
let parent_slot_meta = self.meta(parent_slot)?;
if let Some(mut parent_slot_meta) = parent_slot_meta {
// .retain() is a linear scan; however, next_slots should
// only contain several elements so this isn't so bad
parent_slot_meta
.next_slots
.retain(|&next_slot| next_slot != child_slot);
write_batch.put::<cf::SlotMeta>(parent_slot, &parent_slot_meta)?;
} else {
error!("Parent slot meta {} for child {} is missing. In the absence of a duplicate block this
likely means a cluster restart was performed and your node contains invalid shreds generated
with the wrong shred version, whose ancestors have been cleaned up.
Falling back to duplicate block handling to remedy the situation", parent_slot, child_slot);
}
}

// Reinsert parts of `slot_meta` that are important to retain, like the `next_slots` field.
slot_meta.clear_unconfirmed_slot();
write_batch.put::<cf::SlotMeta>(child_slot, &slot_meta)?;
}

let mut write_timer = Measure::start("write_batch");
if let Err(e) = self.db.write(write_batch) {
error!(
Expand Down
2 changes: 2 additions & 0 deletions ledger/src/blockstore_db.rs
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ pub enum BlockstoreError {
MissingTransactionMetadata,
#[error("transaction-index overflow")]
TransactionIndexOverflow,
#[error("invalid purge range for slot meta cleanup")]
InvalidRangeForSlotMetaCleanup,
}
pub type Result<T> = std::result::Result<T, BlockstoreError>;

Expand Down

0 comments on commit ba189b1

Please sign in to comment.