forked from linkedin/ambry
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[vcr-2.0] Measure Azure Storage Container lag in bytes (linkedin#2884)
This patch measures the drift of an Azure Storage container from an Ambry partition in bytes. It introduces a class responsible for aggregating container metrics. We deliberately avoid emitting per-container metrics to prevent an overwhelming increase in the number of metrics, which would strain the telemetry system. This approach has been tried previously and was unsuccessful. A daemon will run periodically, emitting aggregate metrics in a controlled and predictable manner. The drift of the Azure container from the Ambry partition is set using a compare-and-set mechanism to prevent accidental multithreading errors. However, it's unlikely that multiple threads will handle the same partition in the VCR, as a single thread manages all replicas of a partition using the ROUND_ROBIN policy. We use the min() function because bootstrapping replicas can skew the data, falsely indicating a large drift when the partition is fully backed up. If the lag or drift is -1, we round it up to 0 and proceed.
- Loading branch information
Showing
7 changed files
with
209 additions
and
5 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
56 changes: 56 additions & 0 deletions
56
ambry-cloud/src/main/java/com/github/ambry/cloud/azure/AzureStorageContainerMetrics.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
/** | ||
* Copyright 2024 LinkedIn Corp. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
*/ | ||
package com.github.ambry.cloud.azure; | ||
|
||
import java.util.concurrent.ConcurrentHashMap; | ||
import java.util.concurrent.atomic.AtomicLong; | ||
|
||
|
||
/**
 * Holds all metrics pertaining to one Azure Storage Container.
 *
 * The only metric tracked is a per-replica lag in bytes, keyed by the hostname of the replica. A replica that has
 * been registered but has not reported a lag yet holds the sentinel {@link Long#MAX_VALUE}.
 */
public class AzureStorageContainerMetrics {
  /**
   * id is the unique identifier of the azure-container or ambry-partition.
   */
  Long id;
  /**
   * Maps the hostname of each registered replica to the lag, in bytes, last recorded for it through
   * {@link #setPartitionReplicaLag(String, long)}. Entries start at {@link Long#MAX_VALUE}, meaning "not reported".
   */
  ConcurrentHashMap<String, AtomicLong> replicaLag;

  /**
   * Creates an empty metrics holder with no registered replicas.
   * @param id identifier of the azure-container or ambry-partition
   */
  public AzureStorageContainerMetrics(Long id) {
    this.id = id;
    replicaLag = new ConcurrentHashMap<>();
  }

  /**
   * Registers a replica with an initial lag of {@link Long#MAX_VALUE}. Does nothing if the replica is already
   * registered, so a lag that was recorded earlier is never reset.
   * @param hostname hostname of the replica
   */
  public void addPartitionReplica(String hostname) {
    // computeIfAbsent only allocates the AtomicLong when the key is really missing.
    replicaLag.computeIfAbsent(hostname, host -> new AtomicLong(Long.MAX_VALUE));
  }

  /**
   * Stops tracking a replica. Does nothing if the replica is not registered.
   * @param hostname hostname of the replica
   */
  public void removePartitionReplica(String hostname) {
    replicaLag.remove(hostname);
  }

  /**
   * Returns the smallest lag among all registered replicas.
   * @return the minimum recorded lag in bytes, or {@link Long#MAX_VALUE} when no replica is registered or none has
   *         reported a lag yet
   */
  public Long getPartitionLag() {
    return replicaLag.values().stream().mapToLong(AtomicLong::get).min().orElse(Long.MAX_VALUE);
  }

  /**
   * Records the lag of one replica. The update is ignored if the replica is not registered, for example because it
   * was removed concurrently; it is not re-registered implicitly.
   * @param hostname hostname of the replica
   * @param update lag in bytes
   */
  public void setPartitionReplicaLag(String hostname, long update) {
    // Look the entry up once: two separate lookups can observe different states of the map.
    AtomicLong lag = replicaLag.get(hostname);
    if (lag == null) {
      return;
    }
    // Compare-and-set against the value just read; if another writer got in between, its value is kept.
    lag.compareAndSet(lag.get(), update);
  }
}
115 changes: 115 additions & 0 deletions
115
...oud/src/main/java/com/github/ambry/cloud/azure/AzureStorageContainerMetricsCollector.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,115 @@ | ||
/** | ||
* Copyright 2024 LinkedIn Corp. All rights reserved. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
*/ | ||
package com.github.ambry.cloud.azure; | ||
|
||
import com.codahale.metrics.MetricRegistry; | ||
import com.github.ambry.config.VerifiableProperties; | ||
import com.github.ambry.replication.RemoteReplicaInfo; | ||
import com.github.ambry.utils.Utils; | ||
import java.util.List; | ||
import java.util.concurrent.ConcurrentHashMap; | ||
import java.util.concurrent.ScheduledExecutorService; | ||
import java.util.concurrent.TimeUnit; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
|
||
/** | ||
* A class that aggregates container metrics. We do _NOT_ want to emit per-container metrics because then the number | ||
* of metrics increases proportionally overwhelming telemetry. This was attempted and failed. | ||
* | ||
* A daemon will run at regular intervals emitting aggregate metrics in a controlled and predictable manner. | ||
* This is a singleton class to avoid multiple collector threads. | ||
*/ | ||
public class AzureStorageContainerMetricsCollector { | ||
private final AzureMetrics metrics; | ||
private final ConcurrentHashMap<Long, AzureStorageContainerMetrics> metricMap; | ||
private final ScheduledExecutorService executor; | ||
private static AzureStorageContainerMetricsCollector instance; | ||
private final VerifiableProperties properties; | ||
|
||
public static final Logger logger = LoggerFactory.getLogger(AzureStorageContainerMetricsCollector.class); | ||
|
||
private AzureStorageContainerMetricsCollector(MetricRegistry metrics, VerifiableProperties properties) { | ||
metricMap = new ConcurrentHashMap<>(); | ||
this.metrics = new AzureMetrics(metrics); | ||
this.properties = properties; | ||
executor = Utils.newScheduler(1, "azure_storage_container_metrics_collector_", true); | ||
executor.scheduleWithFixedDelay(getCollector(), 0, 2, TimeUnit.MINUTES); | ||
logger.info("Started AzureStorageContainerMetricsCollector"); | ||
} | ||
|
||
private Runnable getCollector() { | ||
return () -> { | ||
Long totalLag = metricMap.values().stream() | ||
.map(container -> container.getPartitionLag()) | ||
.reduce(0L, Long::sum); | ||
this.metrics.azureContainerLagBytesCount.inc(totalLag); | ||
}; | ||
} | ||
|
||
/** | ||
* Thread-safe singleton initializer | ||
* @param metrics | ||
* @return collector instance | ||
*/ | ||
public static synchronized AzureStorageContainerMetricsCollector getInstance(MetricRegistry metrics, | ||
VerifiableProperties properties) { | ||
if (instance == null) { | ||
instance = new AzureStorageContainerMetricsCollector(metrics, properties); | ||
} | ||
return instance; | ||
} | ||
|
||
public void addPartitionReplicas(List<RemoteReplicaInfo> remoteReplicaInfos) { | ||
for (RemoteReplicaInfo rinfo : remoteReplicaInfos) { | ||
// Don't store any references to PartitionId or RemoteReplicaInfo. | ||
// With improper clean up, these references linger around and cause memory leaks. | ||
long pid = rinfo.getReplicaId().getPartitionId().getId(); | ||
String rid = rinfo.getReplicaId().getDataNodeId().getHostname(); | ||
metricMap.putIfAbsent(pid, new AzureStorageContainerMetrics(pid)); | ||
metricMap.get(pid).addPartitionReplica(rid); | ||
} | ||
} | ||
|
||
public void removePartitionReplicas(List<RemoteReplicaInfo> remoteReplicaInfos) { | ||
for (RemoteReplicaInfo rinfo : remoteReplicaInfos) { | ||
long pid = rinfo.getReplicaId().getPartitionId().getId(); | ||
String rid = rinfo.getReplicaId().getDataNodeId().getHostname(); | ||
if (metricMap.containsKey(pid)) { | ||
metricMap.get(pid).removePartitionReplica(rid); | ||
} | ||
} | ||
} | ||
|
||
public void removePartition(Long id) { | ||
metricMap.remove(id); | ||
} | ||
|
||
/** | ||
* Sets the lag of azure-container from ambry-partition. | ||
* We use a compare-set to guard against accidental multithreaded errors, although two threads will most likely | ||
* not be responsible for a single partition in VCR. A single thread handles all replicas of a partition. | ||
* However, we want to avoid any races between reader and writers. | ||
* Use min() as bootstrapping replicas can give a wrong picture and indicate a large lag even though the partition | ||
* is fully backed up in Azure. | ||
* @param rinfo RemoteReplicaInfo | ||
* @param lag Lag in bytes | ||
*/ | ||
public synchronized void setPartitionReplicaLag(RemoteReplicaInfo rinfo, long lag) { | ||
long pid = rinfo.getReplicaId().getPartitionId().getId(); | ||
String rid = rinfo.getReplicaId().getDataNodeId().getHostname(); | ||
metricMap.get(pid).setPartitionReplicaLag(rid, lag); | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters