compact: add shared compaction pool for multiple stores
This change adds a new compaction pool which enforces a global max
compaction concurrency in a multi-store configuration. Each Pebble store
(i.e. an instance of *DB) still maintains its own per-store compaction
concurrency which is controlled by `opts.MaxConcurrentCompactions`.
However, in a multi-store configuration, disk I/O is a per-store resource
while CPU is shared across stores. A significant portion of compaction
is CPU-intensive, and so this ensures that excessive compactions don't
interrupt foreground CPU tasks even if the disks are capable of handling
the additional throughput from those compactions.

The shared compaction concurrency applies only to automatic compactions:
delete-only compactions are excluded because they are expected to be
cheap, as are flushes, which should never be blocked.
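
As a rough mental model — not the actual pool added in compaction.go, whose diff isn't rendered below — a global cap shared by several stores can be sketched as a semaphore in Go; the names `sharedPool`, `Acquire`, and `Release` are invented for illustration:

```go
package main

import (
	"fmt"
	"sync"
)

// sharedPool caps how many compactions run at once across all stores.
type sharedPool struct {
	slots chan struct{} // one token per allowed concurrent compaction
}

func newSharedPool(maxConcurrency int) *sharedPool {
	return &sharedPool{slots: make(chan struct{}, maxConcurrency)}
}

// Acquire blocks until a global compaction slot is free.
func (p *sharedPool) Acquire() { p.slots <- struct{}{} }

// Release returns the slot so another store's compaction can run.
func (p *sharedPool) Release() { <-p.slots }

func main() {
	pool := newSharedPool(4) // global cap shared by every store
	var wg sync.WaitGroup
	for store := 0; store < 8; store++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()
			pool.Acquire() // waits if 4 compactions are already running
			defer pool.Release()
			fmt.Printf("store %d: compaction running\n", id)
		}(store)
	}
	wg.Wait()
}
```

Each store still applies its own per-store limit (`opts.MaxConcurrentCompactions`); the shared cap is an additional, global constraint layered on top.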

Fixes: #3813
Informs: cockroachdb/cockroach#74697
anish-shanbhag committed Aug 28, 2024
1 parent 94561af commit 705e364
Showing 8 changed files with 443 additions and 252 deletions.
425 changes: 308 additions & 117 deletions compaction.go

Large diffs are not rendered by default.

213 changes: 95 additions & 118 deletions compaction_picker.go
@@ -13,7 +13,6 @@ import (

"github.com/cockroachdb/errors"
"github.com/cockroachdb/pebble/internal/base"
"github.com/cockroachdb/pebble/internal/humanize"
"github.com/cockroachdb/pebble/internal/invariants"
"github.com/cockroachdb/pebble/internal/manifest"
)
@@ -192,6 +191,8 @@ type pickedCompaction struct {
score float64
// kind indicates the kind of compaction.
kind compactionKind
// isManual indicates whether this compaction was manually triggered.
isManual bool
// startLevel is the level that is being compacted. Inputs from startLevel
// and outputLevel will be merged to produce a set of outputLevel files.
startLevel *compactionLevel
@@ -1156,16 +1157,13 @@ func responsibleForGarbageBytes(virtualBackings *manifest.VirtualBackings, m *fi
return uint64(totalGarbage) / uint64(useCount)
}

// pickAuto picks the best compaction, if any.
//
// On each call, pickAuto computes per-level size adjustments based on
// in-progress compactions, and computes a per-level score. The levels are
// iterated over in decreasing score order trying to find a valid compaction
// anchored at that level.
//
// If a score-based compaction cannot be found, pickAuto falls back to looking
// for an elision-only compaction to remove obsolete keys.
// pickAuto picks the best compaction, if any. It first tries to find a
// score-based compaction; if one cannot be found, pickAuto falls back to
// various other compaction picking methods.
func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompaction) {
// We first check if this DB has reached its current max compaction
// concurrency.
//
// Compaction concurrency is controlled by L0 read-amp. We allow one
// additional compaction per L0CompactionConcurrency sublevels, as well as
// one additional compaction per CompactionDebtConcurrency bytes of
@@ -1184,107 +1182,9 @@
}
}

scores := p.calculateLevelScores(env.inProgressCompactions)

// TODO(bananabrick): Either remove, or change this into an event sent to the
// EventListener.
logCompaction := func(pc *pickedCompaction) {
var buf bytes.Buffer
for i := 0; i < numLevels; i++ {
if i != 0 && i < p.baseLevel {
continue
}

var info *candidateLevelInfo
for j := range scores {
if scores[j].level == i {
info = &scores[j]
break
}
}

marker := " "
if pc.startLevel.level == info.level {
marker = "*"
}
fmt.Fprintf(&buf, " %sL%d: %5.1f %5.1f %5.1f %5.1f %8s %8s",
marker, info.level, info.compensatedScoreRatio, info.compensatedScore,
info.uncompensatedScoreRatio, info.uncompensatedScore,
humanize.Bytes.Int64(int64(totalCompensatedSize(
p.vers.Levels[info.level].Iter(),
))),
humanize.Bytes.Int64(p.levelMaxBytes[info.level]),
)

count := 0
for i := range env.inProgressCompactions {
c := &env.inProgressCompactions[i]
if c.inputs[0].level != info.level {
continue
}
count++
if count == 1 {
fmt.Fprintf(&buf, " [")
} else {
fmt.Fprintf(&buf, " ")
}
fmt.Fprintf(&buf, "L%d->L%d", c.inputs[0].level, c.outputLevel)
}
if count > 0 {
fmt.Fprintf(&buf, "]")
}
fmt.Fprintf(&buf, "\n")
}
p.opts.Logger.Infof("pickAuto: L%d->L%d\n%s",
pc.startLevel.level, pc.outputLevel.level, buf.String())
}

// Check for a score-based compaction. candidateLevelInfos are first sorted
// by whether they should be compacted, so if we find a level which shouldn't
// be compacted, we can break early.
for i := range scores {
info := &scores[i]
if !info.shouldCompact() {
break
}
if info.level == numLevels-1 {
continue
}

if info.level == 0 {
pc = pickL0(env, p.opts, p.vers, p.baseLevel)
// Fail-safe to protect against compacting the same sstable
// concurrently.
if pc != nil && !inputRangeAlreadyCompacting(env, pc) {
p.addScoresToPickedCompactionMetrics(pc, scores)
pc.score = info.compensatedScoreRatio
// TODO(bananabrick): Create an EventListener for logCompaction.
if false {
logCompaction(pc)
}
return pc
}
continue
}

// info.level > 0
var ok bool
info.file, ok = pickCompactionSeedFile(p.vers, p.virtualBackings, p.opts, info.level, info.outputLevel, env.earliestSnapshotSeqNum)
if !ok {
continue
}

pc := pickAutoLPositive(env, p.opts, p.vers, *info, p.baseLevel)
// Fail-safe to protect against compacting the same sstable concurrently.
if pc != nil && !inputRangeAlreadyCompacting(env, pc) {
p.addScoresToPickedCompactionMetrics(pc, scores)
pc.score = info.compensatedScoreRatio
// TODO(bananabrick): Create an EventListener for logCompaction.
if false {
logCompaction(pc)
}
return pc
}
// Score-based compactions have the highest priority.
if pc := p.pickScoreBasedCompaction(env); pc != nil {
return pc
}

// Check for files which contain excessive point tombstones that could slow
@@ -1337,8 +1237,61 @@ func (p *compactionPickerByScore) pickAuto(env compactionEnv) (pc *pickedCompact
// MarkedForCompaction field is persisted in the manifest. That's okay. We
// previously would've ignored the designation, whereas now we'll re-compact
// the file in place.
if p.vers.Stats.MarkedForCompaction > 0 {
if pc := p.pickRewriteCompaction(env); pc != nil {
if pc := p.pickRewriteCompaction(env); pc != nil {
return pc
}

return nil
}

// pickScoreBasedCompaction computes per-level size adjustments based on
// in-progress compactions, and computes a per-level score. The levels are
// iterated over in decreasing score order trying to find a valid compaction
// anchored at that level.
func (p *compactionPickerByScore) pickScoreBasedCompaction(
env compactionEnv,
) (pc *pickedCompaction) {
scores := p.calculateLevelScores(env.inProgressCompactions)

// Check for a score-based compaction. candidateLevelInfos are first sorted
// by whether they should be compacted, so if we find a level which shouldn't
// be compacted, we can break early.
for i := range scores {
info := &scores[i]
if !info.shouldCompact() {
break
}
if info.level == numLevels-1 {
continue
}

withScore := func(pc *pickedCompaction) *pickedCompaction {
// Fail-safe to protect against compacting the same sstable
// concurrently.
if pc == nil || inputRangeAlreadyCompacting(env, pc) {
return nil
}
p.addScoresToPickedCompactionMetrics(pc, scores)
pc.score = info.compensatedScoreRatio
pc.kind = compactionKindDefault
return pc
}

if info.level == 0 {
if pc = withScore(pickL0(env, p.opts, p.vers, p.baseLevel)); pc != nil {
return pc
}
continue
}

// info.level > 0
var ok bool
info.file, ok = pickCompactionSeedFile(p.vers, p.virtualBackings, p.opts, info.level, info.outputLevel, env.earliestSnapshotSeqNum)
if !ok {
continue
}

if pc = withScore(pickAutoLPositive(env, p.opts, p.vers, *info, p.baseLevel)); pc != nil {
return pc
}
}
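
The refactor gives pickAuto a simple fall-through shape: try each picker in priority order and return the first non-nil result. A self-contained sketch of that shape — `pickedCompaction` here stands in for Pebble's unexported type, the pickers are stubs, and the exact set and order of fallbacks follows the surrounding diff, which is partially elided:

```go
package main

import "fmt"

type pickedCompaction struct{ kind string }

// pickFirst returns the result of the first picker that finds a
// compaction, mirroring pickAuto's priority ordering.
func pickFirst(pickers ...func() *pickedCompaction) *pickedCompaction {
	for _, pick := range pickers {
		if pc := pick(); pc != nil {
			return pc
		}
	}
	return nil
}

func main() {
	scoreBased := func() *pickedCompaction { return nil } // no level qualifies
	tombstoneDensity := func() *pickedCompaction { return &pickedCompaction{"tombstone-density"} }
	elisionOnly := func() *pickedCompaction { return &pickedCompaction{"elision-only"} }
	rewrite := func() *pickedCompaction { return &pickedCompaction{"rewrite"} }

	pc := pickFirst(scoreBased, tombstoneDensity, elisionOnly, rewrite)
	fmt.Println("picked:", pc.kind) // picked: tombstone-density
}
```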
@@ -1426,7 +1379,12 @@ var markedForCompactionAnnotator = &manifest.Annotator[fileMetadata]{
// with various checks to ensure that the file still exists in the expected level
// and isn't already being compacted.
func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(
candidate *fileMetadata, env compactionEnv, startLevel int, outputLevel int, kind compactionKind,
candidate *fileMetadata,
env compactionEnv,
startLevel int,
outputLevel int,
kind compactionKind,
score float64,
) *pickedCompaction {
if candidate == nil || candidate.IsCompacting() {
return nil
@@ -1453,6 +1411,7 @@ func (p *compactionPickerByScore) pickedCompactionFromCandidateFile(

pc := newPickedCompaction(p.opts, p.vers, startLevel, outputLevel, p.baseLevel)
pc.kind = kind
pc.score = score
pc.startLevel.files = inputs
pc.smallest, pc.largest = manifest.KeyRange(pc.cmp, pc.startLevel.files.Iter())

@@ -1483,7 +1442,8 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction(
if candidate.LargestSeqNum >= env.earliestSnapshotSeqNum {
return nil
}
return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly)
score := float64(max(candidate.Stats.RangeDeletionsBytesEstimate/max(candidate.Size, 1), candidate.Stats.NumDeletions/max(candidate.Stats.NumEntries, 1)))
return p.pickedCompactionFromCandidateFile(candidate, env, numLevels-1, numLevels-1, compactionKindElisionOnly, score)
}
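
Note that the elision-only score above computes both ratios with unsigned integer division before the `float64` conversion, so each ratio truncates toward zero. A float-first version of the same heuristic looks like this (a sketch: the stat names mirror the diff, the numbers are made up, and `max` is Go 1.21's builtin, as in the diff itself):

```go
package main

import "fmt"

// elisionScore is the heuristic from pickElisionOnlyCompaction, but with
// the divisions performed in float64 so fractional ratios survive.
func elisionScore(rangeDelBytes, size, numDeletions, numEntries uint64) float64 {
	rangeDelRatio := float64(rangeDelBytes) / float64(max(size, 1))
	pointDelRatio := float64(numDeletions) / float64(max(numEntries, 1))
	return max(rangeDelRatio, pointDelRatio)
}

func main() {
	// 40% of the file is range-deletion data; 10% of entries are tombstones.
	fmt.Println(elisionScore(40<<20, 100<<20, 1_000, 10_000)) // 0.4
}
```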

// pickRewriteCompaction attempts to construct a compaction that
@@ -1492,13 +1452,17 @@ func (p *compactionPickerByScore) pickElisionOnlyCompaction(
// necessary. A rewrite compaction outputs files to the same level as
// the input level.
func (p *compactionPickerByScore) pickRewriteCompaction(env compactionEnv) (pc *pickedCompaction) {
if p.vers.Stats.MarkedForCompaction == 0 {
return nil
}

for l := numLevels - 1; l >= 0; l-- {
candidate := markedForCompactionAnnotator.LevelAnnotation(p.vers.Levels[l])
if candidate == nil {
// Try the next level.
continue
}
pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite)
pc := p.pickedCompactionFromCandidateFile(candidate, env, l, l, compactionKindRewrite, 0)
if pc != nil {
return pc
}
@@ -1551,7 +1515,11 @@ func (p *compactionPickerByScore) pickTombstoneDensityCompaction(
}
}

return p.pickedCompactionFromCandidateFile(candidate, env, level, defaultOutputLevel(level, p.baseLevel), compactionKindTombstoneDensity)
if candidate == nil {
return nil
}

return p.pickedCompactionFromCandidateFile(candidate, env, level, defaultOutputLevel(level, p.baseLevel), compactionKindTombstoneDensity, candidate.Stats.TombstoneDenseBlocksRatio)
}

// pickAutoLPositive picks an automatic compaction for the candidate
@@ -1755,7 +1723,7 @@ func pickL0(env compactionEnv, opts *Options, vers *version, baseLevel int) (pc
return pc
}

func pickManualCompaction(
func newPickedManualCompaction(
vers *version, opts *Options, env compactionEnv, baseLevel int, manual *manualCompaction,
) (pc *pickedCompaction, retryLater bool) {
outputLevel := manual.level + 1
@@ -1779,6 +1747,11 @@
return nil, true
}
pc = newPickedCompaction(opts, vers, manual.level, defaultOutputLevel(manual.level, baseLevel), baseLevel)
pc.kind = compactionKindDefault
pc.isManual = true
// NB: we set the score to math.MaxFloat64 so that manual compactions are
// always prioritized above automatic compactions.
pc.score = math.MaxFloat64
manual.outputLevel = pc.outputLevel.level
pc.startLevel.files = vers.Overlaps(manual.level, base.UserKeyBoundsInclusive(manual.start, manual.end))
if pc.startLevel.files.Empty() {
@@ -1909,6 +1882,10 @@ func pickReadTriggeredCompactionHelper(
return nil
}

// Prioritize read compactions with a smaller initial file size, since
// they will be cheaper to perform.
pc.score = -float64(outputOverlaps.SizeSum() + overlapSlice.SizeSum())

return pc
}

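Taken together, the scores introduced in this file suggest a single ordering across compaction kinds: manual compactions (`math.MaxFloat64`) first, score-based compactions by decreasing score, and read-triggered compactions (negative total overlap size, so smaller overlaps sort first among them) last. A self-contained sketch of that ordering — the pool's actual selection logic lives in compaction.go, which isn't rendered here, so this is an inference from the comments above:

```go
package main

import (
	"fmt"
	"math"
	"sort"
)

// waiting is a stand-in for a picked-but-not-yet-running compaction.
type waiting struct {
	desc  string
	score float64
}

func main() {
	q := []waiting{
		{"read-triggered (64MB overlap)", -float64(64 << 20)},
		{"manual", math.MaxFloat64},
		{"L0 score-based", 3.2},
		{"elision-only", 0.4},
	}
	// Higher score runs first.
	sort.Slice(q, func(i, j int) bool { return q[i].score > q[j].score })
	for i, w := range q {
		fmt.Printf("%d. %s\n", i+1, w.desc)
	}
}
```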
18 changes: 8 additions & 10 deletions compaction_picker_test.go
@@ -325,7 +325,7 @@ func TestCompactionPickerTargetLevel(t *testing.T) {
end: iEnd.UserKey,
}

pc, retryLater := pickManualCompaction(
pc, retryLater := newPickedManualCompaction(
pickerByScore.vers,
opts,
compactionEnv{
@@ -1413,17 +1413,15 @@ func TestCompactionPickerPickFile(t *testing.T) {
d.mu.Lock()
defer d.mu.Unlock()

// Use maybeScheduleCompactionPicker to take care of all of the
// initialization of the compaction-picking environment, but never
// pick a compaction; just call pickFile using the user-provided
// level.
var lf manifest.LevelFile
var ok bool
d.maybeScheduleCompactionPicker(func(untypedPicker compactionPicker, env compactionEnv) *pickedCompaction {
p := untypedPicker.(*compactionPickerByScore)
lf, ok = pickCompactionSeedFile(p.vers, p.virtualBackings, opts, level, level+1, env.earliestSnapshotSeqNum)
return nil
})
env := d.makeCompactionEnv()
if env == nil {
return "unable to lock the DB for compaction picking"
}
p := d.mu.versions.picker.(*compactionPickerByScore)
lf, ok = pickCompactionSeedFile(p.vers, p.virtualBackings, opts, level, level+1, env.earliestSnapshotSeqNum)
d.mu.versions.logUnlock()
if !ok {
return "(none)"
}
14 changes: 14 additions & 0 deletions db.go
@@ -323,6 +323,20 @@ type DB struct {
// compactionSchedulers.Wait() should not be called while the DB.mu is held.
compactionSchedulers sync.WaitGroup

// compactionPool is responsible for scheduling both automatic and manual
// compactions.
//
// The compactionPool may enforce a global max compaction concurrency in a
// multi-store configuration. Each Pebble store (i.e. an instance of *DB)
// has its own per-store compaction concurrency which is controlled by
// opts.MaxConcurrentCompactions. However, in a multi-store configuration,
// disk I/O is a per-store resource while CPU is shared across stores.
// A significant portion of compaction is CPU-intensive, and so
// CompactionPool may be used to ensure that excessive compactions don't
// interrupt foreground CPU tasks even if the disks are capable of handling
// the additional throughput from those compactions.
compactionPool CompactionPool

// The main mutex protecting internal DB state. This mutex encompasses many
// fields because those fields need to be accessed and updated atomically. In
// particular, the current version, log.*, mem.*, and snapshot list need to
12 changes: 6 additions & 6 deletions format_major_version.go
@@ -399,13 +399,13 @@ func (d *DB) writeFormatVersionMarker(formatVers FormatMajorVersion) error {
// waiting for compactions to complete (or for slots to free up).
func (d *DB) compactMarkedFilesLocked() error {
curr := d.mu.versions.currentVersion()
if curr.Stats.MarkedForCompaction == 0 {
return nil
}
// Attempt to schedule a compaction to rewrite a file marked for
// compaction.
d.maybeScheduleCompaction()
for curr.Stats.MarkedForCompaction > 0 {
// Attempt to schedule a compaction to rewrite a file marked for
// compaction.
d.maybeScheduleCompactionPicker(func(picker compactionPicker, env compactionEnv) *pickedCompaction {
return picker.pickRewriteCompaction(env)
})

// The above attempt might succeed and schedule a rewrite compaction. Or
// there might not be available compaction concurrency to schedule the
// compaction. Or compaction of the file might have already been in
2 changes: 2 additions & 0 deletions open.go
@@ -530,6 +530,8 @@ func Open(dirname string, opts *Options) (db *DB, err error) {
}
d.updateReadStateLocked(d.opts.DebugCheck)

d.compactionPool = opts.CompactionPool

if !d.opts.ReadOnly {
// If the Options specify a format major version higher than the
// loaded database's, upgrade it. If this is a new database, this
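As wired above, each store receives the pool from its Options. A hypothetical two-store setup might look like the following — `pebble.Open` and the `Options.CompactionPool` field are from the diff, while the paths and the way the shared pool value is obtained are illustrative, since the pool's constructor isn't shown in the rendered files:

```go
package main

import (
	"log"

	"github.com/cockroachdb/pebble"
)

func main() {
	// One pool shared by both stores; construct it per the package docs
	// (its implementation lives in compaction.go, not rendered above).
	var shared pebble.CompactionPool

	for _, dir := range []string{"/mnt/disk1/store", "/mnt/disk2/store"} {
		db, err := pebble.Open(dir, &pebble.Options{CompactionPool: shared})
		if err != nil {
			log.Fatalf("open %s: %v", dir, err)
		}
		defer db.Close()
	}
}
```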