From 47c88d4d4e242ae32750ef4561e10439efe87c28 Mon Sep 17 00:00:00 2001 From: Keith Turner Date: Tue, 22 Oct 2024 22:23:14 +0000 Subject: [PATCH] corrects open file metric The metrics for computing the current number of open files on a tablet server was incorrect over time. Used the semaphor limiting open files to compute the number of open files. The removes the need to correctly increment and decrement files opened which had a bug. --- .../apache/accumulo/server/fs/FileManager.java | 4 ++++ .../apache/accumulo/tserver/ScanServer.java | 2 +- .../apache/accumulo/tserver/TabletServer.java | 2 +- .../tserver/TabletServerResourceManager.java | 4 ++++ .../metrics/TabletServerScanMetrics.java | 18 +++++++----------- .../tserver/tablet/ScanDataSource.java | 3 --- 6 files changed, 17 insertions(+), 16 deletions(-) diff --git a/server/base/src/main/java/org/apache/accumulo/server/fs/FileManager.java b/server/base/src/main/java/org/apache/accumulo/server/fs/FileManager.java index 277f0c62400..adceeeb1c0e 100644 --- a/server/base/src/main/java/org/apache/accumulo/server/fs/FileManager.java +++ b/server/base/src/main/java/org/apache/accumulo/server/fs/FileManager.java @@ -593,4 +593,8 @@ public synchronized int getNumOpenFiles() { public ScanFileManager newScanFileManager(KeyExtent tablet, CacheProvider cacheProvider) { return new ScanFileManager(tablet, cacheProvider); } + + public int getOpenFiles() { + return maxOpen - filePermits.availablePermits(); + } } diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java index 688a08b1712..84cc046fbe2 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/ScanServer.java @@ -407,7 +407,7 @@ public void run() { metricsInfo.addServiceTags(getApplicationName(), clientAddress); metricsInfo.addCommonTags(List.of(Tag.of("resource.group", groupName))); - scanMetrics = new TabletServerScanMetrics(); + scanMetrics = new TabletServerScanMetrics(resourceManager::getOpenFiles); sessionManager.setZombieCountConsumer(scanMetrics::setZombieScanThreads); scanServerMetrics = new ScanServerMetrics(tabletMetadataCache); blockCacheMetrics = new BlockCacheMetrics(resourceManager.getIndexCache(), diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java index 29d392f1ccb..24573dfd1f2 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServer.java @@ -765,7 +765,7 @@ public void run() { metrics = new TabletServerMetrics(this); updateMetrics = new TabletServerUpdateMetrics(); - scanMetrics = new TabletServerScanMetrics(); + scanMetrics = new TabletServerScanMetrics(this.resourceManager::getOpenFiles); sessionManager.setZombieCountConsumer(scanMetrics::setZombieScanThreads); mincMetrics = new TabletServerMinCMetrics(); ceMetrics = new CompactionExecutorsMetrics(); diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServerResourceManager.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServerResourceManager.java index cdcebd2576b..a171d7d33c2 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServerResourceManager.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/TabletServerResourceManager.java @@ -400,6 +400,10 @@ public TabletServerResourceManager(ServerContext context, TabletHostingServer ts new AssignmentWatcher(acuConf, context, activeAssignments), 5000, TimeUnit.MILLISECONDS)); } + public int getOpenFiles() { + return fileManager.getOpenFiles(); + } + /** * Accepts some map which is tracking active assignment task(s) (running) and monitors them to * ensure that the time the assignment(s) have been running don't exceed a threshold. If the time diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java index a46a8bdeaba..36fbd42de11 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/metrics/TabletServerScanMetrics.java @@ -19,9 +19,9 @@ package org.apache.accumulo.tserver.metrics; import java.time.Duration; -import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.atomic.LongAdder; +import java.util.function.IntSupplier; import org.apache.accumulo.core.metrics.MetricsProducer; import org.apache.accumulo.server.metrics.NoopMetrics; @@ -34,7 +34,7 @@ public class TabletServerScanMetrics implements MetricsProducer { - private final AtomicInteger openFiles = new AtomicInteger(0); + private final IntSupplier openFiles; private Timer scans = NoopMetrics.useNoopTimer(); private DistributionSummary resultsPerScan = NoopMetrics.useNoopDistributionSummary(); private DistributionSummary yields = NoopMetrics.useNoopDistributionSummary(); @@ -96,14 +96,6 @@ public void addYield(long value) { yields.record(value); } - public void incrementOpenFiles(int delta) { - openFiles.addAndGet(Math.max(0, delta)); - } - - public void decrementOpenFiles(int delta) { - openFiles.addAndGet(delta < 0 ? delta : delta * -1); - } - public void incrementStartScan(double value) { startScanCalls.increment(value); } @@ -128,9 +120,13 @@ public long getZombieThreadsCount() { return zombieScanThreads.get(); } + public TabletServerScanMetrics(IntSupplier openFileSupplier) { + openFiles = openFileSupplier; + } + @Override public void registerMetrics(MeterRegistry registry) { - Gauge.builder(METRICS_SCAN_OPEN_FILES, openFiles::get) + Gauge.builder(METRICS_SCAN_OPEN_FILES, openFiles::getAsInt) .description("Number of files open for scans").register(registry); scans = Timer.builder(METRICS_SCAN_TIMES).description("Scans").register(registry); resultsPerScan = DistributionSummary.builder(METRICS_SCAN_RESULTS) diff --git a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/ScanDataSource.java b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/ScanDataSource.java index 8b7aca78f78..77b0de4b080 100644 --- a/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/ScanDataSource.java +++ b/server/tserver/src/main/java/org/apache/accumulo/tserver/tablet/ScanDataSource.java @@ -100,7 +100,6 @@ public DataSource getNewDataSource() { } finally { try { if (fileManager != null) { - tablet.getScanMetrics().decrementOpenFiles(fileManager.getNumOpenFiles()); fileManager.releaseOpenFiles(false); } } catch (Exception e) { @@ -154,7 +153,6 @@ private SortedKeyValueIterator createIterator() throws IOException { // only acquire the file manager when we know the tablet is open if (fileManager == null) { fileManager = tablet.getTabletResources().newScanFileManager(scanParams.getScanDispatch()); - tablet.getScanMetrics().incrementOpenFiles(fileManager.getNumOpenFiles()); log.trace("Adding active scan for {}, scanId:{}", tablet.getExtent(), scanDataSourceId); tablet.addActiveScans(this); } @@ -274,7 +272,6 @@ public void close(boolean sawErrors) { } try { if (fileManager != null) { - tablet.getScanMetrics().decrementOpenFiles(fileManager.getNumOpenFiles()); fileManager.releaseOpenFiles(sawErrors); } } finally {