reduce cost of resilience4j metrics

This commit is contained in:
Jonathan Klabunde Tomer
2025-08-27 18:02:48 -07:00
committed by GitHub
parent 8fe87b77e4
commit 8e429e267f
2 changed files with 35 additions and 8 deletions

View File

@@ -22,6 +22,7 @@ import io.micrometer.core.instrument.distribution.DistributionStatisticConfig;
import io.micrometer.registry.otlp.OtlpMeterRegistry;
import io.micrometer.statsd.StatsdMeterRegistry;
import java.time.Duration;
import java.util.Set;
import org.whispersystems.textsecuregcm.WhisperServerConfiguration;
import org.whispersystems.textsecuregcm.WhisperServerVersion;
import org.whispersystems.textsecuregcm.configuration.dynamic.DynamicConfiguration;
@@ -32,6 +33,9 @@ public class MetricsUtil {
public static final String PREFIX = "chat";
// Names of the resilience4j circuit breaker meters that configureCircuitBreakerMeterFilters lets through; any other
// meter whose name starts with "resilience4j.circuitbreaker" is denied. Declared final so the allow-list cannot be
// reassigned after class initialization.
private static final Set<String> ALLOWED_R4J_METRICS = Set.of(
    "resilience4j.circuitbreaker.calls",
    "resilience4j.circuitbreaker.not.permitted.calls",
    "resilience4j.circuitbreaker.state");
// Values of the "state" tag for which the "resilience4j.circuitbreaker.state" gauge is kept; gauges carrying any
// other state value are denied by configureCircuitBreakerMeterFilters.
private static final Set<String> ALLOWED_R4J_STATE_GAUGES = Set.of("open", "closed", "half_open");
private static volatile boolean registeredMetrics = false;
/**
@@ -84,6 +88,7 @@ public class MetricsUtil {
config.getOpenTelemetryConfiguration(), io.micrometer.core.instrument.Clock.SYSTEM);
configureMeterFilters(otlpMeterRegistry.config(), dynamicConfigurationManager);
configureCircuitBreakerMeterFilters(otlpMeterRegistry.config());
Metrics.addRegistry(otlpMeterRegistry);
if (config.getOpenTelemetryConfiguration().shutdownWaitDuration().compareTo(shutdownWaitDuration) > 0) {
@@ -136,6 +141,36 @@ public class MetricsUtil {
&& id.getName().startsWith(awsSdkMetricNamePrefix)));
}
// Kept apart from configureMeterFilters solely so that these filters can be installed on the OTLP registry without
// also being applied to statsd
static void configureCircuitBreakerMeterFilters(MeterRegistry.Config config) {
  config
      // Drop state gauges for circuit breaker states outside the allow-list
      .meterFilter(MeterFilter.deny(id -> isDisallowedCircuitBreakerStateGauge(id)))
      // Drop the "ignored" flavor of the calls meter
      .meterFilter(MeterFilter.deny(id -> isIgnoredCircuitBreakerCallsMeter(id)))
      // Drop every other circuit breaker meter that is not explicitly allowed
      .meterFilter(MeterFilter.deny(id -> isUnlistedCircuitBreakerMeter(id)))
      // Replace the distribution statistics configuration of the calls meter with NONE
      .meterFilter(new MeterFilter() {
        @Override
        public DistributionStatisticConfig configure(final Meter.Id meterId,
            final DistributionStatisticConfig currentConfig) {
          return meterId.getName().equals("resilience4j.circuitbreaker.calls")
              ? DistributionStatisticConfig.NONE
              : currentConfig;
        }
      });
}

// True for a "resilience4j.circuitbreaker.state" meter carrying a "state" tag whose value is not allow-listed
private static boolean isDisallowedCircuitBreakerStateGauge(final Meter.Id meterId) {
  if (!meterId.getName().equals("resilience4j.circuitbreaker.state")) {
    return false;
  }
  return meterId.getTags().stream()
      .filter(tag -> tag.getKey().equals("state"))
      .anyMatch(tag -> !ALLOWED_R4J_STATE_GAUGES.contains(tag.getValue()));
}

// True for a "resilience4j.circuitbreaker.calls" meter tagged with kind=ignored
private static boolean isIgnoredCircuitBreakerCallsMeter(final Meter.Id meterId) {
  if (!meterId.getName().equals("resilience4j.circuitbreaker.calls")) {
    return false;
  }
  return meterId.getTags().stream()
      .filter(tag -> tag.getKey().equals("kind"))
      .anyMatch(tag -> tag.getValue().equals("ignored"));
}

// True for any meter under the "resilience4j.circuitbreaker" name prefix that is absent from ALLOWED_R4J_METRICS
private static boolean isUnlistedCircuitBreakerMeter(final Meter.Id meterId) {
  final String meterName = meterId.getName();
  return meterName.startsWith("resilience4j.circuitbreaker") && !ALLOWED_R4J_METRICS.contains(meterName);
}
public static void registerSystemResourceMetrics(final Environment environment) {
new ProcessorMetrics().bindTo(Metrics.globalRegistry);
new FileDescriptorMetrics().bindTo(Metrics.globalRegistry);

View File

@@ -32,14 +32,6 @@ import org.whispersystems.textsecuregcm.util.ResilienceUtil;
/**
* Adds a circuit breaker to every Netty {@link Channel} that gets created, so that a single unhealthy shard does not
* impact all cluster operations.
* <p>
* For metrics to be registered, users <em>must</em> create a synthetic {@link ClusterTopologyChangedEvent} after the
* initial connection. For example:
* <pre>
* clusterClient.connect();
* clusterClient.getResources().eventBus().publish(
* new ClusterTopologyChangedEvent(Collections.emptyList(), clusterClient.getPartitions().getPartitions()));
* </pre>
*/
public class LettuceShardCircuitBreaker implements NettyCustomizer {