mirror of
https://github.com/signalapp/Signal-Server
synced 2026-04-21 06:38:04 +01:00
Add per-shard Redis circuit breakers
This commit is contained in:
@@ -176,8 +176,10 @@ import org.whispersystems.textsecuregcm.push.ProvisioningManager;
|
||||
import org.whispersystems.textsecuregcm.push.PushLatencyManager;
|
||||
import org.whispersystems.textsecuregcm.push.PushNotificationManager;
|
||||
import org.whispersystems.textsecuregcm.push.ReceiptSender;
|
||||
import org.whispersystems.textsecuregcm.redis.ClusterFaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.redis.ConnectionEventLogger;
|
||||
import org.whispersystems.textsecuregcm.redis.FaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.redis.ShardFaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.registration.RegistrationServiceClient;
|
||||
import org.whispersystems.textsecuregcm.s3.PolicySigner;
|
||||
import org.whispersystems.textsecuregcm.s3.PostPolicyGenerator;
|
||||
@@ -412,18 +414,25 @@ public class WhisperServerService extends Application<WhisperServerConfiguration
|
||||
final VerificationSessions verificationSessions = new VerificationSessions(dynamoDbAsyncClient,
|
||||
config.getDynamoDbTables().getVerificationSessions().getTableName(), clock);
|
||||
|
||||
final ClientResources redisClientResources = ClientResources.builder()
|
||||
.commandLatencyRecorder(new MicrometerCommandLatencyRecorder(Metrics.globalRegistry, MicrometerOptions.builder().build()))
|
||||
.build();
|
||||
final ClientResources.Builder redisClientResourcesBuilder = ClientResources.builder()
|
||||
.commandLatencyRecorder(
|
||||
new MicrometerCommandLatencyRecorder(Metrics.globalRegistry, MicrometerOptions.builder().build()));
|
||||
final ClientResources redisClientResources = redisClientResourcesBuilder.build();
|
||||
|
||||
ConnectionEventLogger.logConnectionEvents(redisClientResources);
|
||||
|
||||
FaultTolerantRedisCluster cacheCluster = new FaultTolerantRedisCluster("main_cache_cluster", config.getCacheClusterConfiguration(), redisClientResources);
|
||||
FaultTolerantRedisCluster messagesCluster = new FaultTolerantRedisCluster("messages_cluster", config.getMessageCacheConfiguration().getRedisClusterConfiguration(), redisClientResources);
|
||||
FaultTolerantRedisCluster clientPresenceCluster = new FaultTolerantRedisCluster("client_presence_cluster", config.getClientPresenceClusterConfiguration(), redisClientResources);
|
||||
FaultTolerantRedisCluster metricsCluster = new FaultTolerantRedisCluster("metrics_cluster", config.getMetricsClusterConfiguration(), redisClientResources);
|
||||
FaultTolerantRedisCluster pushSchedulerCluster = new FaultTolerantRedisCluster("push_scheduler", config.getPushSchedulerCluster(), redisClientResources);
|
||||
FaultTolerantRedisCluster rateLimitersCluster = new FaultTolerantRedisCluster("rate_limiters", config.getRateLimitersCluster(), redisClientResources);
|
||||
FaultTolerantRedisCluster cacheCluster = new ClusterFaultTolerantRedisCluster("main_cache_cluster",
|
||||
config.getCacheClusterConfiguration(), redisClientResources);
|
||||
FaultTolerantRedisCluster messagesCluster = new ClusterFaultTolerantRedisCluster("messages_cluster",
|
||||
config.getMessageCacheConfiguration().getRedisClusterConfiguration(), redisClientResources);
|
||||
FaultTolerantRedisCluster clientPresenceCluster = new ClusterFaultTolerantRedisCluster("client_presence_cluster",
|
||||
config.getClientPresenceClusterConfiguration(), redisClientResources);
|
||||
FaultTolerantRedisCluster metricsCluster = new ShardFaultTolerantRedisCluster("metrics",
|
||||
config.getMetricsClusterConfiguration(), redisClientResourcesBuilder);
|
||||
FaultTolerantRedisCluster pushSchedulerCluster = new ClusterFaultTolerantRedisCluster("push_scheduler",
|
||||
config.getPushSchedulerCluster(), redisClientResources);
|
||||
FaultTolerantRedisCluster rateLimitersCluster = new ClusterFaultTolerantRedisCluster("rate_limiters",
|
||||
config.getRateLimitersCluster(), redisClientResources);
|
||||
|
||||
final BlockingQueue<Runnable> keyspaceNotificationDispatchQueue = new ArrayBlockingQueue<>(100_000);
|
||||
Metrics.gaugeCollectionSize(name(getClass(), "keyspaceNotificationDispatchQueueSize"), Collections.emptyList(),
|
||||
|
||||
@@ -41,8 +41,7 @@ public class CircuitBreakerConfiguration {
|
||||
|
||||
@JsonProperty
|
||||
@NotNull
|
||||
@Min(1)
|
||||
private long waitDurationInOpenStateInSeconds = 10;
|
||||
private Duration waitDurationInOpenState = Duration.ofSeconds(10);
|
||||
|
||||
@JsonProperty
|
||||
private List<String> ignoredExceptions = Collections.emptyList();
|
||||
@@ -64,8 +63,8 @@ public class CircuitBreakerConfiguration {
|
||||
return slidingWindowMinimumNumberOfCalls;
|
||||
}
|
||||
|
||||
public long getWaitDurationInOpenStateInSeconds() {
|
||||
return waitDurationInOpenStateInSeconds;
|
||||
public Duration getWaitDurationInOpenState() {
|
||||
return waitDurationInOpenState;
|
||||
}
|
||||
|
||||
public List<Class<?>> getIgnoredExceptions() {
|
||||
@@ -101,8 +100,8 @@ public class CircuitBreakerConfiguration {
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
public void setWaitDurationInOpenStateInSeconds(int seconds) {
|
||||
this.waitDurationInOpenStateInSeconds = seconds;
|
||||
public void setWaitDurationInOpenState(Duration duration) {
|
||||
this.waitDurationInOpenState = duration;
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
@@ -115,7 +114,7 @@ public class CircuitBreakerConfiguration {
|
||||
.failureRateThreshold(getFailureRateThreshold())
|
||||
.ignoreExceptions(getIgnoredExceptions().toArray(new Class[0]))
|
||||
.permittedNumberOfCallsInHalfOpenState(getPermittedNumberOfCallsInHalfOpenState())
|
||||
.waitDurationInOpenState(Duration.ofSeconds(getWaitDurationInOpenStateInSeconds()))
|
||||
.waitDurationInOpenState(getWaitDurationInOpenState())
|
||||
.slidingWindow(getSlidingWindowSize(), getSlidingWindowMinimumNumberOfCalls(),
|
||||
CircuitBreakerConfig.SlidingWindowType.COUNT_BASED)
|
||||
.build();
|
||||
|
||||
@@ -24,6 +24,7 @@ import java.util.concurrent.ThreadLocalRandom;
|
||||
import java.util.function.Predicate;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.IntStream;
|
||||
import io.micrometer.core.instrument.Tags;
|
||||
import org.glassfish.jersey.SslConfigurator;
|
||||
import org.whispersystems.textsecuregcm.configuration.CircuitBreakerConfiguration;
|
||||
import org.whispersystems.textsecuregcm.configuration.RetryConfiguration;
|
||||
@@ -56,7 +57,7 @@ public class FaultTolerantHttpClient {
|
||||
this.defaultRequestTimeout = defaultRequestTimeout;
|
||||
this.breaker = CircuitBreaker.of(name + "-breaker", circuitBreakerConfiguration.toCircuitBreakerConfig());
|
||||
|
||||
CircuitBreakerUtil.registerMetrics(breaker, FaultTolerantHttpClient.class);
|
||||
CircuitBreakerUtil.registerMetrics(breaker, FaultTolerantHttpClient.class, Tags.empty());
|
||||
|
||||
if (retryConfiguration != null) {
|
||||
if (this.retryExecutor == null) {
|
||||
|
||||
@@ -78,7 +78,7 @@ public class ProvisioningManager extends RedisPubSubAdapter<byte[], byte[]> impl
|
||||
|
||||
this.circuitBreaker = CircuitBreaker.of("pubsub-breaker", circuitBreakerConfiguration.toCircuitBreakerConfig());
|
||||
|
||||
CircuitBreakerUtil.registerMetrics(circuitBreaker, ProvisioningManager.class);
|
||||
CircuitBreakerUtil.registerMetrics(circuitBreaker, ProvisioningManager.class, Tags.empty());
|
||||
|
||||
Metrics.gaugeMapSize(ACTIVE_LISTENERS_GAUGE_NAME, Tags.empty(), listenersByProvisioningAddress);
|
||||
}
|
||||
|
||||
@@ -0,0 +1,106 @@
|
||||
/*
|
||||
* Copyright 2013-2020 Signal Messenger, LLC
|
||||
* SPDX-License-Identifier: AGPL-3.0-only
|
||||
*/
|
||||
|
||||
package org.whispersystems.textsecuregcm.redis;
|
||||
|
||||
import static org.whispersystems.textsecuregcm.metrics.MetricsUtil.name;
|
||||
|
||||
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
|
||||
import io.github.resilience4j.retry.Retry;
|
||||
import io.lettuce.core.RedisException;
|
||||
import io.lettuce.core.cluster.event.ClusterTopologyChangedEvent;
|
||||
import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection;
|
||||
import io.micrometer.core.instrument.Metrics;
|
||||
import io.micrometer.core.instrument.Tags;
|
||||
import io.micrometer.core.instrument.Timer;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil;
|
||||
import reactor.core.scheduler.Scheduler;
|
||||
|
||||
public class ClusterFaultTolerantPubSubConnection<K, V> implements FaultTolerantPubSubConnection<K, V> {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ClusterFaultTolerantPubSubConnection.class);
|
||||
|
||||
|
||||
private final String name;
|
||||
private final StatefulRedisClusterPubSubConnection<K, V> pubSubConnection;
|
||||
|
||||
private final CircuitBreaker circuitBreaker;
|
||||
private final Retry retry;
|
||||
private final Retry resubscribeRetry;
|
||||
private final Scheduler topologyChangedEventScheduler;
|
||||
|
||||
private final Timer executeTimer;
|
||||
|
||||
public ClusterFaultTolerantPubSubConnection(final String name,
|
||||
final StatefulRedisClusterPubSubConnection<K, V> pubSubConnection, final CircuitBreaker circuitBreaker,
|
||||
final Retry retry, final Retry resubscribeRetry, final Scheduler topologyChangedEventScheduler) {
|
||||
this.name = name;
|
||||
this.pubSubConnection = pubSubConnection;
|
||||
this.circuitBreaker = circuitBreaker;
|
||||
this.retry = retry;
|
||||
this.resubscribeRetry = resubscribeRetry;
|
||||
this.topologyChangedEventScheduler = topologyChangedEventScheduler;
|
||||
|
||||
this.pubSubConnection.setNodeMessagePropagation(true);
|
||||
|
||||
this.executeTimer = Metrics.timer(name(getClass(), "execute"), "clusterName", name + "-pubsub");
|
||||
|
||||
CircuitBreakerUtil.registerMetrics(circuitBreaker, ClusterFaultTolerantPubSubConnection.class, Tags.empty());
|
||||
}
|
||||
|
||||
@Override
|
||||
public void usePubSubConnection(final Consumer<StatefulRedisClusterPubSubConnection<K, V>> consumer) {
|
||||
try {
|
||||
circuitBreaker.executeCheckedRunnable(
|
||||
() -> retry.executeRunnable(() -> executeTimer.record(() -> consumer.accept(pubSubConnection))));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T withPubSubConnection(final Function<StatefulRedisClusterPubSubConnection<K, V>, T> function) {
|
||||
try {
|
||||
return circuitBreaker.executeCheckedSupplier(
|
||||
() -> retry.executeCallable(() -> executeTimer.record(() -> function.apply(pubSubConnection))));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void subscribeToClusterTopologyChangedEvents(final Runnable eventHandler) {
|
||||
|
||||
usePubSubConnection(connection -> connection.getResources().eventBus().get()
|
||||
.filter(event -> event instanceof ClusterTopologyChangedEvent)
|
||||
.subscribeOn(topologyChangedEventScheduler)
|
||||
.subscribe(event -> {
|
||||
logger.info("Got topology change event for {}, resubscribing all keyspace notifications", name);
|
||||
|
||||
resubscribeRetry.executeRunnable(() -> {
|
||||
try {
|
||||
eventHandler.run();
|
||||
} catch (final RuntimeException e) {
|
||||
logger.warn("Resubscribe for {} failed", name, e);
|
||||
throw e;
|
||||
}
|
||||
});
|
||||
}));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,196 @@
|
||||
/*
|
||||
* Copyright 2013-2020 Signal Messenger, LLC
|
||||
* SPDX-License-Identifier: AGPL-3.0-only
|
||||
*/
|
||||
|
||||
package org.whispersystems.textsecuregcm.redis;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
|
||||
import io.github.resilience4j.core.IntervalFunction;
|
||||
import io.github.resilience4j.reactor.circuitbreaker.operator.CircuitBreakerOperator;
|
||||
import io.github.resilience4j.reactor.retry.RetryOperator;
|
||||
import io.github.resilience4j.retry.Retry;
|
||||
import io.github.resilience4j.retry.RetryConfig;
|
||||
import io.lettuce.core.ClientOptions.DisconnectedBehavior;
|
||||
import io.lettuce.core.RedisCommandTimeoutException;
|
||||
import io.lettuce.core.RedisException;
|
||||
import io.lettuce.core.TimeoutOptions;
|
||||
import io.lettuce.core.cluster.ClusterClientOptions;
|
||||
import io.lettuce.core.cluster.ClusterTopologyRefreshOptions;
|
||||
import io.lettuce.core.cluster.RedisClusterClient;
|
||||
import io.lettuce.core.cluster.api.StatefulRedisClusterConnection;
|
||||
import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection;
|
||||
import io.lettuce.core.codec.ByteArrayCodec;
|
||||
import io.lettuce.core.resource.ClientResources;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import io.micrometer.core.instrument.Tags;
|
||||
import org.reactivestreams.Publisher;
|
||||
import org.whispersystems.textsecuregcm.configuration.CircuitBreakerConfiguration;
|
||||
import org.whispersystems.textsecuregcm.configuration.RedisClusterConfiguration;
|
||||
import org.whispersystems.textsecuregcm.configuration.RetryConfiguration;
|
||||
import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil;
|
||||
import reactor.core.publisher.Flux;
|
||||
import reactor.core.scheduler.Schedulers;
|
||||
|
||||
/**
|
||||
* A fault-tolerant access manager for a Redis cluster. A single circuit breaker protects all cluster
|
||||
* calls.
|
||||
*/
|
||||
public class ClusterFaultTolerantRedisCluster implements FaultTolerantRedisCluster {
|
||||
|
||||
private final String name;
|
||||
|
||||
private final RedisClusterClient clusterClient;
|
||||
|
||||
private final StatefulRedisClusterConnection<String, String> stringConnection;
|
||||
private final StatefulRedisClusterConnection<byte[], byte[]> binaryConnection;
|
||||
|
||||
private final List<StatefulRedisClusterPubSubConnection<?, ?>> pubSubConnections = new ArrayList<>();
|
||||
|
||||
private final CircuitBreaker circuitBreaker;
|
||||
private final Retry retry;
|
||||
private final Retry topologyChangedEventRetry;
|
||||
|
||||
public ClusterFaultTolerantRedisCluster(final String name, final RedisClusterConfiguration clusterConfiguration,
|
||||
final ClientResources clientResources) {
|
||||
this(name,
|
||||
RedisClusterClient.create(clientResources,
|
||||
RedisUriUtil.createRedisUriWithTimeout(clusterConfiguration.getConfigurationUri(),
|
||||
clusterConfiguration.getTimeout())),
|
||||
clusterConfiguration.getTimeout(),
|
||||
clusterConfiguration.getCircuitBreakerConfiguration(),
|
||||
clusterConfiguration.getRetryConfiguration());
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
ClusterFaultTolerantRedisCluster(final String name, final RedisClusterClient clusterClient,
|
||||
final Duration commandTimeout,
|
||||
final CircuitBreakerConfiguration circuitBreakerConfiguration, final RetryConfiguration retryConfiguration) {
|
||||
this.name = name;
|
||||
|
||||
this.clusterClient = clusterClient;
|
||||
this.clusterClient.setOptions(ClusterClientOptions.builder()
|
||||
.disconnectedBehavior(DisconnectedBehavior.REJECT_COMMANDS)
|
||||
.validateClusterNodeMembership(false)
|
||||
.topologyRefreshOptions(ClusterTopologyRefreshOptions.builder()
|
||||
.enableAllAdaptiveRefreshTriggers()
|
||||
.build())
|
||||
// for asynchronous commands
|
||||
.timeoutOptions(TimeoutOptions.builder()
|
||||
.fixedTimeout(commandTimeout)
|
||||
.build())
|
||||
.publishOnScheduler(true)
|
||||
.build());
|
||||
|
||||
this.stringConnection = clusterClient.connect();
|
||||
this.binaryConnection = clusterClient.connect(ByteArrayCodec.INSTANCE);
|
||||
|
||||
this.circuitBreaker = CircuitBreaker.of(name + "-breaker", circuitBreakerConfiguration.toCircuitBreakerConfig());
|
||||
this.retry = Retry.of(name + "-retry", retryConfiguration.toRetryConfigBuilder()
|
||||
.retryOnException(exception -> exception instanceof RedisCommandTimeoutException).build());
|
||||
final RetryConfig topologyChangedEventRetryConfig = RetryConfig.custom()
|
||||
.maxAttempts(Integer.MAX_VALUE)
|
||||
.intervalFunction(
|
||||
IntervalFunction.ofExponentialRandomBackoff(Duration.ofSeconds(1), 1.5, Duration.ofSeconds(30)))
|
||||
.build();
|
||||
|
||||
this.topologyChangedEventRetry = Retry.of(name + "-topologyChangedRetry", topologyChangedEventRetryConfig);
|
||||
|
||||
CircuitBreakerUtil.registerMetrics(circuitBreaker, FaultTolerantRedisCluster.class, Tags.empty());
|
||||
CircuitBreakerUtil.registerMetrics(retry, FaultTolerantRedisCluster.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
stringConnection.close();
|
||||
binaryConnection.close();
|
||||
|
||||
for (final StatefulRedisClusterPubSubConnection<?, ?> pubSubConnection : pubSubConnections) {
|
||||
pubSubConnection.close();
|
||||
}
|
||||
|
||||
clusterClient.shutdown();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void useCluster(final Consumer<StatefulRedisClusterConnection<String, String>> consumer) {
|
||||
useConnection(stringConnection, consumer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T withCluster(final Function<StatefulRedisClusterConnection<String, String>, T> function) {
|
||||
return withConnection(stringConnection, function);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void useBinaryCluster(final Consumer<StatefulRedisClusterConnection<byte[], byte[]>> consumer) {
|
||||
useConnection(binaryConnection, consumer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T withBinaryCluster(final Function<StatefulRedisClusterConnection<byte[], byte[]>, T> function) {
|
||||
return withConnection(binaryConnection, function);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> Publisher<T> withBinaryClusterReactive(
|
||||
final Function<StatefulRedisClusterConnection<byte[], byte[]>, Publisher<T>> function) {
|
||||
return withConnectionReactive(binaryConnection, function);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <K, V> void useConnection(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Consumer<StatefulRedisClusterConnection<K, V>> consumer) {
|
||||
try {
|
||||
circuitBreaker.executeCheckedRunnable(() -> retry.executeRunnable(() -> consumer.accept(connection)));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T, K, V> T withConnection(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Function<StatefulRedisClusterConnection<K, V>, T> function) {
|
||||
try {
|
||||
return circuitBreaker.executeCheckedSupplier(() -> retry.executeCallable(() -> function.apply(connection)));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T, K, V> Publisher<T> withConnectionReactive(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Function<StatefulRedisClusterConnection<K, V>, Publisher<T>> function) {
|
||||
|
||||
return Flux.from(function.apply(connection))
|
||||
.transformDeferred(RetryOperator.of(retry))
|
||||
.transformDeferred(CircuitBreakerOperator.of(circuitBreaker));
|
||||
}
|
||||
|
||||
public FaultTolerantPubSubConnection<String, String> createPubSubConnection() {
|
||||
final StatefulRedisClusterPubSubConnection<String, String> pubSubConnection = clusterClient.connectPubSub();
|
||||
pubSubConnections.add(pubSubConnection);
|
||||
|
||||
return new ClusterFaultTolerantPubSubConnection<>(name, pubSubConnection, circuitBreaker, retry,
|
||||
topologyChangedEventRetry,
|
||||
Schedulers.newSingle(name + "-redisPubSubEvents", true));
|
||||
}
|
||||
}
|
||||
@@ -1,102 +1,19 @@
|
||||
/*
|
||||
* Copyright 2013-2020 Signal Messenger, LLC
|
||||
* Copyright 2024 Signal Messenger, LLC
|
||||
* SPDX-License-Identifier: AGPL-3.0-only
|
||||
*/
|
||||
|
||||
package org.whispersystems.textsecuregcm.redis;
|
||||
|
||||
import static org.whispersystems.textsecuregcm.metrics.MetricsUtil.name;
|
||||
|
||||
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
|
||||
import io.github.resilience4j.retry.Retry;
|
||||
import io.lettuce.core.RedisException;
|
||||
import io.lettuce.core.cluster.event.ClusterTopologyChangedEvent;
|
||||
import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection;
|
||||
import io.micrometer.core.instrument.Metrics;
|
||||
import io.micrometer.core.instrument.Timer;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil;
|
||||
import reactor.core.scheduler.Scheduler;
|
||||
|
||||
public class FaultTolerantPubSubConnection<K, V> {
|
||||
public interface FaultTolerantPubSubConnection<K, V> {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(FaultTolerantPubSubConnection.class);
|
||||
void usePubSubConnection(Consumer<StatefulRedisClusterPubSubConnection<K, V>> consumer);
|
||||
|
||||
<T> T withPubSubConnection(Function<StatefulRedisClusterPubSubConnection<K, V>, T> function);
|
||||
|
||||
private final String name;
|
||||
private final StatefulRedisClusterPubSubConnection<K, V> pubSubConnection;
|
||||
|
||||
private final CircuitBreaker circuitBreaker;
|
||||
private final Retry retry;
|
||||
private final Retry resubscribeRetry;
|
||||
private final Scheduler topologyChangedEventScheduler;
|
||||
|
||||
private final Timer executeTimer;
|
||||
|
||||
public FaultTolerantPubSubConnection(final String name,
|
||||
final StatefulRedisClusterPubSubConnection<K, V> pubSubConnection, final CircuitBreaker circuitBreaker,
|
||||
final Retry retry, final Retry resubscribeRetry, final Scheduler topologyChangedEventScheduler) {
|
||||
this.name = name;
|
||||
this.pubSubConnection = pubSubConnection;
|
||||
this.circuitBreaker = circuitBreaker;
|
||||
this.retry = retry;
|
||||
this.resubscribeRetry = resubscribeRetry;
|
||||
this.topologyChangedEventScheduler = topologyChangedEventScheduler;
|
||||
|
||||
this.pubSubConnection.setNodeMessagePropagation(true);
|
||||
|
||||
this.executeTimer = Metrics.timer(name(getClass(), "execute"), "clusterName", name + "-pubsub");
|
||||
|
||||
CircuitBreakerUtil.registerMetrics(circuitBreaker, FaultTolerantPubSubConnection.class);
|
||||
}
|
||||
|
||||
public void usePubSubConnection(final Consumer<StatefulRedisClusterPubSubConnection<K, V>> consumer) {
|
||||
try {
|
||||
circuitBreaker.executeCheckedRunnable(
|
||||
() -> retry.executeRunnable(() -> executeTimer.record(() -> consumer.accept(pubSubConnection))));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public <T> T withPubSubConnection(final Function<StatefulRedisClusterPubSubConnection<K, V>, T> function) {
|
||||
try {
|
||||
return circuitBreaker.executeCheckedSupplier(
|
||||
() -> retry.executeCallable(() -> executeTimer.record(() -> function.apply(pubSubConnection))));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
public void subscribeToClusterTopologyChangedEvents(final Runnable eventHandler) {
|
||||
|
||||
usePubSubConnection(connection -> connection.getResources().eventBus().get()
|
||||
.filter(event -> event instanceof ClusterTopologyChangedEvent)
|
||||
.subscribeOn(topologyChangedEventScheduler)
|
||||
.subscribe(event -> {
|
||||
logger.info("Got topology change event for {}, resubscribing all keyspace notifications", name);
|
||||
|
||||
resubscribeRetry.executeRunnable(() -> {
|
||||
try {
|
||||
eventHandler.run();
|
||||
} catch (final RuntimeException e) {
|
||||
logger.warn("Resubscribe for {} failed", name, e);
|
||||
throw e;
|
||||
}
|
||||
});
|
||||
}));
|
||||
}
|
||||
|
||||
void subscribeToClusterTopologyChangedEvents(Runnable eventHandler);
|
||||
}
|
||||
|
||||
@@ -1,183 +1,40 @@
|
||||
/*
|
||||
* Copyright 2013-2020 Signal Messenger, LLC
|
||||
* Copyright 2024 Signal Messenger, LLC
|
||||
* SPDX-License-Identifier: AGPL-3.0-only
|
||||
*/
|
||||
|
||||
package org.whispersystems.textsecuregcm.redis;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
|
||||
import io.github.resilience4j.core.IntervalFunction;
|
||||
import io.github.resilience4j.reactor.circuitbreaker.operator.CircuitBreakerOperator;
|
||||
import io.github.resilience4j.reactor.retry.RetryOperator;
|
||||
import io.github.resilience4j.retry.Retry;
|
||||
import io.github.resilience4j.retry.RetryConfig;
|
||||
import io.lettuce.core.ClientOptions.DisconnectedBehavior;
|
||||
import io.lettuce.core.RedisCommandTimeoutException;
|
||||
import io.lettuce.core.RedisException;
|
||||
import io.lettuce.core.TimeoutOptions;
|
||||
import io.lettuce.core.cluster.ClusterClientOptions;
|
||||
import io.lettuce.core.cluster.ClusterTopologyRefreshOptions;
|
||||
import io.lettuce.core.cluster.RedisClusterClient;
|
||||
import io.lettuce.core.cluster.api.StatefulRedisClusterConnection;
|
||||
import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection;
|
||||
import io.lettuce.core.codec.ByteArrayCodec;
|
||||
import io.lettuce.core.resource.ClientResources;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import org.reactivestreams.Publisher;
|
||||
import org.whispersystems.textsecuregcm.configuration.CircuitBreakerConfiguration;
|
||||
import org.whispersystems.textsecuregcm.configuration.RedisClusterConfiguration;
|
||||
import org.whispersystems.textsecuregcm.configuration.RetryConfiguration;
|
||||
import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil;
|
||||
import reactor.core.publisher.Flux;
|
||||
import reactor.core.scheduler.Schedulers;
|
||||
|
||||
/**
|
||||
* A fault-tolerant access manager for a Redis cluster. A fault-tolerant Redis cluster provides managed,
|
||||
* circuit-breaker-protected access to connections.
|
||||
*/
|
||||
public class FaultTolerantRedisCluster {
|
||||
public interface FaultTolerantRedisCluster {
|
||||
|
||||
private final String name;
|
||||
void shutdown();
|
||||
|
||||
private final RedisClusterClient clusterClient;
|
||||
String getName();
|
||||
|
||||
private final StatefulRedisClusterConnection<String, String> stringConnection;
|
||||
private final StatefulRedisClusterConnection<byte[], byte[]> binaryConnection;
|
||||
void useCluster(Consumer<StatefulRedisClusterConnection<String, String>> consumer);
|
||||
|
||||
private final List<StatefulRedisClusterPubSubConnection<?, ?>> pubSubConnections = new ArrayList<>();
|
||||
<T> T withCluster(Function<StatefulRedisClusterConnection<String, String>, T> function);
|
||||
|
||||
private final CircuitBreaker circuitBreaker;
|
||||
private final Retry retry;
|
||||
private final Retry topologyChangedEventRetry;
|
||||
void useBinaryCluster(Consumer<StatefulRedisClusterConnection<byte[], byte[]>> consumer);
|
||||
|
||||
public FaultTolerantRedisCluster(final String name, final RedisClusterConfiguration clusterConfiguration,
|
||||
final ClientResources clientResources) {
|
||||
this(name,
|
||||
RedisClusterClient.create(clientResources,
|
||||
RedisUriUtil.createRedisUriWithTimeout(clusterConfiguration.getConfigurationUri(),
|
||||
clusterConfiguration.getTimeout())),
|
||||
clusterConfiguration.getTimeout(),
|
||||
clusterConfiguration.getCircuitBreakerConfiguration(),
|
||||
clusterConfiguration.getRetryConfiguration());
|
||||
}
|
||||
<T> T withBinaryCluster(Function<StatefulRedisClusterConnection<byte[], byte[]>, T> function);
|
||||
|
||||
@VisibleForTesting
|
||||
FaultTolerantRedisCluster(final String name, final RedisClusterClient clusterClient, final Duration commandTimeout,
|
||||
final CircuitBreakerConfiguration circuitBreakerConfiguration, final RetryConfiguration retryConfiguration) {
|
||||
this.name = name;
|
||||
<T> Publisher<T> withBinaryClusterReactive(
|
||||
Function<StatefulRedisClusterConnection<byte[], byte[]>, Publisher<T>> function);
|
||||
|
||||
this.clusterClient = clusterClient;
|
||||
this.clusterClient.setOptions(ClusterClientOptions.builder()
|
||||
.disconnectedBehavior(DisconnectedBehavior.REJECT_COMMANDS)
|
||||
.validateClusterNodeMembership(false)
|
||||
.topologyRefreshOptions(ClusterTopologyRefreshOptions.builder()
|
||||
.enableAllAdaptiveRefreshTriggers()
|
||||
.build())
|
||||
// for asynchronous commands
|
||||
.timeoutOptions(TimeoutOptions.builder()
|
||||
.fixedTimeout(commandTimeout)
|
||||
.build())
|
||||
.publishOnScheduler(true)
|
||||
.build());
|
||||
<K, V> void useConnection(StatefulRedisClusterConnection<K, V> connection,
|
||||
Consumer<StatefulRedisClusterConnection<K, V>> consumer);
|
||||
|
||||
this.stringConnection = clusterClient.connect();
|
||||
this.binaryConnection = clusterClient.connect(ByteArrayCodec.INSTANCE);
|
||||
<T, K, V> T withConnection(StatefulRedisClusterConnection<K, V> connection,
|
||||
Function<StatefulRedisClusterConnection<K, V>, T> function);
|
||||
|
||||
this.circuitBreaker = CircuitBreaker.of(name + "-breaker", circuitBreakerConfiguration.toCircuitBreakerConfig());
|
||||
this.retry = Retry.of(name + "-retry", retryConfiguration.toRetryConfigBuilder()
|
||||
.retryOnException(exception -> exception instanceof RedisCommandTimeoutException).build());
|
||||
final RetryConfig topologyChangedEventRetryConfig = RetryConfig.custom()
|
||||
.maxAttempts(Integer.MAX_VALUE)
|
||||
.intervalFunction(
|
||||
IntervalFunction.ofExponentialRandomBackoff(Duration.ofSeconds(1), 1.5, Duration.ofSeconds(30)))
|
||||
.build();
|
||||
<T, K, V> Publisher<T> withConnectionReactive(StatefulRedisClusterConnection<K, V> connection,
|
||||
Function<StatefulRedisClusterConnection<K, V>, Publisher<T>> function);
|
||||
|
||||
this.topologyChangedEventRetry = Retry.of(name + "-topologyChangedRetry", topologyChangedEventRetryConfig);
|
||||
|
||||
CircuitBreakerUtil.registerMetrics(circuitBreaker, FaultTolerantRedisCluster.class);
|
||||
CircuitBreakerUtil.registerMetrics(retry, FaultTolerantRedisCluster.class);
|
||||
}
|
||||
|
||||
void shutdown() {
|
||||
stringConnection.close();
|
||||
binaryConnection.close();
|
||||
|
||||
for (final StatefulRedisClusterPubSubConnection<?, ?> pubSubConnection : pubSubConnections) {
|
||||
pubSubConnection.close();
|
||||
}
|
||||
|
||||
clusterClient.shutdown();
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
public void useCluster(final Consumer<StatefulRedisClusterConnection<String, String>> consumer) {
|
||||
useConnection(stringConnection, consumer);
|
||||
}
|
||||
|
||||
public <T> T withCluster(final Function<StatefulRedisClusterConnection<String, String>, T> function) {
|
||||
return withConnection(stringConnection, function);
|
||||
}
|
||||
|
||||
public void useBinaryCluster(final Consumer<StatefulRedisClusterConnection<byte[], byte[]>> consumer) {
|
||||
useConnection(binaryConnection, consumer);
|
||||
}
|
||||
|
||||
public <T> T withBinaryCluster(final Function<StatefulRedisClusterConnection<byte[], byte[]>, T> function) {
|
||||
return withConnection(binaryConnection, function);
|
||||
}
|
||||
|
||||
public <T> Publisher<T> withBinaryClusterReactive(
|
||||
final Function<StatefulRedisClusterConnection<byte[], byte[]>, Publisher<T>> function) {
|
||||
return withConnectionReactive(binaryConnection, function);
|
||||
}
|
||||
|
||||
private <K, V> void useConnection(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Consumer<StatefulRedisClusterConnection<K, V>> consumer) {
|
||||
try {
|
||||
circuitBreaker.executeCheckedRunnable(() -> retry.executeRunnable(() -> consumer.accept(connection)));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private <T, K, V> T withConnection(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Function<StatefulRedisClusterConnection<K, V>, T> function) {
|
||||
try {
|
||||
return circuitBreaker.executeCheckedSupplier(() -> retry.executeCallable(() -> function.apply(connection)));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private <T, K, V> Publisher<T> withConnectionReactive(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Function<StatefulRedisClusterConnection<K, V>, Publisher<T>> function) {
|
||||
|
||||
return Flux.from(function.apply(connection))
|
||||
.transformDeferred(RetryOperator.of(retry))
|
||||
.transformDeferred(CircuitBreakerOperator.of(circuitBreaker));
|
||||
}
|
||||
|
||||
public FaultTolerantPubSubConnection<String, String> createPubSubConnection() {
|
||||
final StatefulRedisClusterPubSubConnection<String, String> pubSubConnection = clusterClient.connectPubSub();
|
||||
pubSubConnections.add(pubSubConnection);
|
||||
|
||||
return new FaultTolerantPubSubConnection<>(name, pubSubConnection, circuitBreaker, retry, topologyChangedEventRetry,
|
||||
Schedulers.newSingle(name + "-redisPubSubEvents", true));
|
||||
}
|
||||
FaultTolerantPubSubConnection<String, String> createPubSubConnection();
|
||||
}
|
||||
|
||||
@@ -0,0 +1,136 @@
|
||||
/*
|
||||
* Copyright 2024 Signal Messenger, LLC
|
||||
* SPDX-License-Identifier: AGPL-3.0-only
|
||||
*/
|
||||
|
||||
package org.whispersystems.textsecuregcm.redis;
|
||||
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import io.github.resilience4j.circuitbreaker.CallNotPermittedException;
|
||||
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
|
||||
import io.github.resilience4j.circuitbreaker.CircuitBreakerConfig;
|
||||
import io.lettuce.core.protocol.CommandHandler;
|
||||
import io.lettuce.core.protocol.CompleteableCommand;
|
||||
import io.lettuce.core.protocol.RedisCommand;
|
||||
import io.lettuce.core.resource.NettyCustomizer;
|
||||
import io.micrometer.core.instrument.Tags;
|
||||
import io.netty.channel.Channel;
|
||||
import io.netty.channel.ChannelDuplexHandler;
|
||||
import io.netty.channel.ChannelHandlerContext;
|
||||
import io.netty.channel.ChannelPromise;
|
||||
import java.net.SocketAddress;
|
||||
import java.util.Map;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.stream.StreamSupport;
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil;
|
||||
|
||||
public class LettuceShardCircuitBreaker implements NettyCustomizer {
|
||||
|
||||
private final String clusterName;
|
||||
private final CircuitBreakerConfig circuitBreakerConfig;
|
||||
|
||||
public LettuceShardCircuitBreaker(final String clusterName, final CircuitBreakerConfig circuitBreakerConfig) {
|
||||
this.clusterName = clusterName;
|
||||
this.circuitBreakerConfig = circuitBreakerConfig;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void afterChannelInitialized(final Channel channel) {
|
||||
|
||||
final ChannelCircuitBreakerHandler channelCircuitBreakerHandler = new ChannelCircuitBreakerHandler(clusterName,
|
||||
circuitBreakerConfig);
|
||||
|
||||
final String commandHandlerName = StreamSupport.stream(channel.pipeline().spliterator(), false)
|
||||
.filter(entry -> entry.getValue() instanceof CommandHandler)
|
||||
.map(Map.Entry::getKey)
|
||||
.findFirst()
|
||||
.orElseThrow();
|
||||
channel.pipeline().addBefore(commandHandlerName, null, channelCircuitBreakerHandler);
|
||||
}
|
||||
|
||||
static final class ChannelCircuitBreakerHandler extends ChannelDuplexHandler {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ChannelCircuitBreakerHandler.class);
|
||||
|
||||
private static final String SHARD_TAG_NAME = "shard";
|
||||
private static final String CLUSTER_TAG_NAME = "cluster";
|
||||
|
||||
private final String clusterName;
|
||||
private final CircuitBreakerConfig circuitBreakerConfig;
|
||||
|
||||
@VisibleForTesting
|
||||
CircuitBreaker breaker;
|
||||
|
||||
public ChannelCircuitBreakerHandler(final String name, CircuitBreakerConfig circuitBreakerConfig) {
|
||||
this.clusterName = name;
|
||||
this.circuitBreakerConfig = circuitBreakerConfig;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void connect(final ChannelHandlerContext ctx, final SocketAddress remoteAddress,
|
||||
final SocketAddress localAddress, final ChannelPromise promise) throws Exception {
|
||||
super.connect(ctx, remoteAddress, localAddress, promise);
|
||||
// Unfortunately, the Channel's remote address is null until connect() is called, so we have to wait to initialize
|
||||
// the breaker with the remote’s name.
|
||||
// There is a Channel attribute, io.lettuce.core.ConnectionBuilder.REDIS_URI, but this does not always
|
||||
// match remote address, as it is inherited from the Bootstrap attributes and not updated for the Channel connection
|
||||
|
||||
// In some cases, like the default connection, the remote address includes the DNS hostname, which we want to exclude.
|
||||
final String shardAddress = StringUtils.substringAfter(remoteAddress.toString(), "/");
|
||||
breaker = CircuitBreaker.of("%s/%s-breaker".formatted(clusterName, shardAddress), circuitBreakerConfig);
|
||||
CircuitBreakerUtil.registerMetrics(breaker, getClass(),
|
||||
Tags.of(CLUSTER_TAG_NAME, clusterName, SHARD_TAG_NAME, shardAddress));
|
||||
}
|
||||
|
||||
@Override
|
||||
public void write(final ChannelHandlerContext ctx, final Object msg, final ChannelPromise promise)
|
||||
throws Exception {
|
||||
|
||||
logger.trace("Breaker state is {}", breaker.getState());
|
||||
|
||||
// Note: io.lettuce.core.protocol.CommandHandler also supports batches (List/Collection<RedisCommand>),
|
||||
// but we do not use that feature, so we can just check for single commands
|
||||
//
|
||||
// There are two types of RedisCommands that are not CompleteableCommand:
|
||||
// - io.lettuce.core.protocol.Command
|
||||
// - io.lettuce.core.protocol.PristineFallbackCommand
|
||||
//
|
||||
// The former always get wrapped by one of the other command types, and the latter is only used in an edge case
|
||||
// to consume responses.
|
||||
if (msg instanceof RedisCommand<?, ?, ?> rc && rc instanceof CompleteableCommand<?> command) {
|
||||
try {
|
||||
breaker.acquirePermission();
|
||||
|
||||
// state can change in acquirePermission()
|
||||
logger.trace("Breaker is permitted: {}", breaker.getState());
|
||||
|
||||
final long startNanos = System.nanoTime();
|
||||
|
||||
command.onComplete((ignored, throwable) -> {
|
||||
final long durationNanos = System.nanoTime() - startNanos;
|
||||
|
||||
if (throwable != null) {
|
||||
breaker.onError(durationNanos, TimeUnit.NANOSECONDS, throwable);
|
||||
logger.debug("Command completed with error", throwable);
|
||||
} else {
|
||||
breaker.onSuccess(durationNanos, TimeUnit.NANOSECONDS);
|
||||
}
|
||||
});
|
||||
|
||||
} catch (final CallNotPermittedException e) {
|
||||
rc.completeExceptionally(e);
|
||||
promise.tryFailure(e);
|
||||
return;
|
||||
}
|
||||
|
||||
} else {
|
||||
logger.warn("Unexpected msg type: {}", msg.getClass());
|
||||
}
|
||||
|
||||
super.write(ctx, msg, promise);
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,97 @@
|
||||
/*
|
||||
* Copyright 2013-2020 Signal Messenger, LLC
|
||||
* SPDX-License-Identifier: AGPL-3.0-only
|
||||
*/
|
||||
|
||||
package org.whispersystems.textsecuregcm.redis;
|
||||
|
||||
import static org.whispersystems.textsecuregcm.metrics.MetricsUtil.name;
|
||||
|
||||
import io.github.resilience4j.retry.Retry;
|
||||
import io.lettuce.core.RedisException;
|
||||
import io.lettuce.core.cluster.event.ClusterTopologyChangedEvent;
|
||||
import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection;
|
||||
import io.micrometer.core.instrument.Metrics;
|
||||
import io.micrometer.core.instrument.Timer;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import reactor.core.scheduler.Scheduler;
|
||||
|
||||
public class ShardFaultTolerantPubSubConnection<K, V> implements FaultTolerantPubSubConnection<K, V> {
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(ShardFaultTolerantPubSubConnection.class);
|
||||
|
||||
|
||||
private final String name;
|
||||
private final StatefulRedisClusterPubSubConnection<K, V> pubSubConnection;
|
||||
|
||||
private final Retry retry;
|
||||
private final Retry resubscribeRetry;
|
||||
private final Scheduler topologyChangedEventScheduler;
|
||||
|
||||
private final Timer executeTimer;
|
||||
|
||||
public ShardFaultTolerantPubSubConnection(final String name,
|
||||
final StatefulRedisClusterPubSubConnection<K, V> pubSubConnection,
|
||||
final Retry retry, final Retry resubscribeRetry, final Scheduler topologyChangedEventScheduler) {
|
||||
this.name = name;
|
||||
this.pubSubConnection = pubSubConnection;
|
||||
this.retry = retry;
|
||||
this.resubscribeRetry = resubscribeRetry;
|
||||
this.topologyChangedEventScheduler = topologyChangedEventScheduler;
|
||||
|
||||
this.pubSubConnection.setNodeMessagePropagation(true);
|
||||
|
||||
this.executeTimer = Metrics.timer(name(getClass(), "execute"), "clusterName", name + "-pubsub");
|
||||
}
|
||||
|
||||
@Override
|
||||
public void usePubSubConnection(final Consumer<StatefulRedisClusterPubSubConnection<K, V>> consumer) {
|
||||
try {
|
||||
retry.executeRunnable(() -> executeTimer.record(() -> consumer.accept(pubSubConnection)));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T withPubSubConnection(final Function<StatefulRedisClusterPubSubConnection<K, V>, T> function) {
|
||||
try {
|
||||
return retry.executeCallable(() -> executeTimer.record(() -> function.apply(pubSubConnection)));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Override
|
||||
public void subscribeToClusterTopologyChangedEvents(final Runnable eventHandler) {
|
||||
|
||||
usePubSubConnection(connection -> connection.getResources().eventBus().get()
|
||||
.filter(event -> event instanceof ClusterTopologyChangedEvent)
|
||||
.subscribeOn(topologyChangedEventScheduler)
|
||||
.subscribe(event -> {
|
||||
logger.info("Got topology change event for {}, resubscribing all keyspace notifications", name);
|
||||
|
||||
resubscribeRetry.executeRunnable(() -> {
|
||||
try {
|
||||
eventHandler.run();
|
||||
} catch (final RuntimeException e) {
|
||||
logger.warn("Resubscribe for {} failed", name, e);
|
||||
throw e;
|
||||
}
|
||||
});
|
||||
}));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -0,0 +1,197 @@
|
||||
/*
|
||||
* Copyright 2024 Signal Messenger, LLC
|
||||
* SPDX-License-Identifier: AGPL-3.0-only
|
||||
*/
|
||||
|
||||
package org.whispersystems.textsecuregcm.redis;
|
||||
|
||||
import io.github.resilience4j.core.IntervalFunction;
|
||||
import io.github.resilience4j.reactor.retry.RetryOperator;
|
||||
import io.github.resilience4j.retry.Retry;
|
||||
import io.github.resilience4j.retry.RetryConfig;
|
||||
import io.lettuce.core.ClientOptions;
|
||||
import io.lettuce.core.RedisCommandTimeoutException;
|
||||
import io.lettuce.core.RedisException;
|
||||
import io.lettuce.core.RedisURI;
|
||||
import io.lettuce.core.TimeoutOptions;
|
||||
import io.lettuce.core.cluster.ClusterClientOptions;
|
||||
import io.lettuce.core.cluster.ClusterTopologyRefreshOptions;
|
||||
import io.lettuce.core.cluster.RedisClusterClient;
|
||||
import io.lettuce.core.cluster.api.StatefulRedisClusterConnection;
|
||||
import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection;
|
||||
import io.lettuce.core.codec.ByteArrayCodec;
|
||||
import io.lettuce.core.resource.ClientResources;
|
||||
import java.time.Duration;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Collections;
|
||||
import java.util.List;
|
||||
import java.util.function.Consumer;
|
||||
import java.util.function.Function;
|
||||
import org.reactivestreams.Publisher;
|
||||
import org.whispersystems.textsecuregcm.configuration.CircuitBreakerConfiguration;
|
||||
import org.whispersystems.textsecuregcm.configuration.RedisClusterConfiguration;
|
||||
import org.whispersystems.textsecuregcm.configuration.RetryConfiguration;
|
||||
import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil;
|
||||
import reactor.core.publisher.Flux;
|
||||
import reactor.core.scheduler.Schedulers;
|
||||
|
||||
/**
|
||||
* A fault-tolerant access manager for a Redis cluster. Each shard in the cluster has a dedicated circuit breaker.
|
||||
*
|
||||
* @see LettuceShardCircuitBreaker
|
||||
*/
|
||||
public class ShardFaultTolerantRedisCluster implements FaultTolerantRedisCluster {
|
||||
|
||||
private final String name;
|
||||
|
||||
private final RedisClusterClient clusterClient;
|
||||
|
||||
private final StatefulRedisClusterConnection<String, String> stringConnection;
|
||||
private final StatefulRedisClusterConnection<byte[], byte[]> binaryConnection;
|
||||
|
||||
private final List<StatefulRedisClusterPubSubConnection<?, ?>> pubSubConnections = new ArrayList<>();
|
||||
|
||||
private final Retry retry;
|
||||
private final Retry topologyChangedEventRetry;
|
||||
|
||||
|
||||
public ShardFaultTolerantRedisCluster(final String name, final RedisClusterConfiguration clusterConfiguration,
|
||||
final ClientResources.Builder clientResourcesBuilder) {
|
||||
|
||||
this(name, clientResourcesBuilder,
|
||||
Collections.singleton(RedisUriUtil.createRedisUriWithTimeout(clusterConfiguration.getConfigurationUri(),
|
||||
clusterConfiguration.getTimeout())),
|
||||
clusterConfiguration.getTimeout(),
|
||||
clusterConfiguration.getCircuitBreakerConfiguration(),
|
||||
clusterConfiguration.getRetryConfiguration());
|
||||
|
||||
}
|
||||
|
||||
ShardFaultTolerantRedisCluster(String name, final ClientResources.Builder clientResourcesBuilder,
|
||||
Iterable<RedisURI> redisUris, Duration commandTimeout, CircuitBreakerConfiguration circuitBreakerConfig,
|
||||
RetryConfiguration retryConfiguration) {
|
||||
|
||||
this.name = name;
|
||||
|
||||
this.clusterClient = RedisClusterClient.create(
|
||||
clientResourcesBuilder.nettyCustomizer(
|
||||
new LettuceShardCircuitBreaker(name, circuitBreakerConfig.toCircuitBreakerConfig())).
|
||||
build(),
|
||||
redisUris);
|
||||
this.clusterClient.setOptions(ClusterClientOptions.builder()
|
||||
.disconnectedBehavior(ClientOptions.DisconnectedBehavior.REJECT_COMMANDS)
|
||||
.validateClusterNodeMembership(false)
|
||||
.topologyRefreshOptions(ClusterTopologyRefreshOptions.builder()
|
||||
.enableAllAdaptiveRefreshTriggers()
|
||||
.build())
|
||||
// for asynchronous commands
|
||||
.timeoutOptions(TimeoutOptions.builder()
|
||||
.fixedTimeout(commandTimeout)
|
||||
.build())
|
||||
.publishOnScheduler(true)
|
||||
.build());
|
||||
|
||||
this.stringConnection = clusterClient.connect();
|
||||
this.binaryConnection = clusterClient.connect(ByteArrayCodec.INSTANCE);
|
||||
|
||||
this.retry = Retry.of(name + "-retry", retryConfiguration.toRetryConfigBuilder()
|
||||
.retryOnException(exception -> exception instanceof RedisCommandTimeoutException).build());
|
||||
final RetryConfig topologyChangedEventRetryConfig = RetryConfig.custom()
|
||||
.maxAttempts(Integer.MAX_VALUE)
|
||||
.intervalFunction(
|
||||
IntervalFunction.ofExponentialRandomBackoff(Duration.ofSeconds(1), 1.5, Duration.ofSeconds(30)))
|
||||
.build();
|
||||
|
||||
this.topologyChangedEventRetry = Retry.of(name + "-topologyChangedRetry", topologyChangedEventRetryConfig);
|
||||
|
||||
CircuitBreakerUtil.registerMetrics(retry, ShardFaultTolerantRedisCluster.class);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void shutdown() {
|
||||
stringConnection.close();
|
||||
binaryConnection.close();
|
||||
|
||||
for (final StatefulRedisClusterPubSubConnection<?, ?> pubSubConnection : pubSubConnections) {
|
||||
pubSubConnection.close();
|
||||
}
|
||||
|
||||
clusterClient.shutdown();
|
||||
}
|
||||
|
||||
public String getName() {
|
||||
return name;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void useCluster(final Consumer<StatefulRedisClusterConnection<String, String>> consumer) {
|
||||
useConnection(stringConnection, consumer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T withCluster(final Function<StatefulRedisClusterConnection<String, String>, T> function) {
|
||||
return withConnection(stringConnection, function);
|
||||
}
|
||||
|
||||
@Override
|
||||
public void useBinaryCluster(final Consumer<StatefulRedisClusterConnection<byte[], byte[]>> consumer) {
|
||||
useConnection(binaryConnection, consumer);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> T withBinaryCluster(final Function<StatefulRedisClusterConnection<byte[], byte[]>, T> function) {
|
||||
return withConnection(binaryConnection, function);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T> Publisher<T> withBinaryClusterReactive(
|
||||
final Function<StatefulRedisClusterConnection<byte[], byte[]>, Publisher<T>> function) {
|
||||
return withConnectionReactive(binaryConnection, function);
|
||||
}
|
||||
|
||||
@Override
|
||||
public <K, V> void useConnection(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Consumer<StatefulRedisClusterConnection<K, V>> consumer) {
|
||||
try {
|
||||
retry.executeRunnable(() -> consumer.accept(connection));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T, K, V> T withConnection(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Function<StatefulRedisClusterConnection<K, V>, T> function) {
|
||||
try {
|
||||
return retry.executeCallable(() -> function.apply(connection));
|
||||
} catch (final Throwable t) {
|
||||
if (t instanceof RedisException) {
|
||||
throw (RedisException) t;
|
||||
} else {
|
||||
throw new RedisException(t);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public <T, K, V> Publisher<T> withConnectionReactive(final StatefulRedisClusterConnection<K, V> connection,
|
||||
final Function<StatefulRedisClusterConnection<K, V>, Publisher<T>> function) {
|
||||
|
||||
return Flux.from(function.apply(connection))
|
||||
.transformDeferred(RetryOperator.of(retry));
|
||||
}
|
||||
|
||||
@Override
|
||||
public FaultTolerantPubSubConnection<String, String> createPubSubConnection() {
|
||||
final StatefulRedisClusterPubSubConnection<String, String> pubSubConnection = clusterClient.connectPubSub();
|
||||
pubSubConnections.add(pubSubConnection);
|
||||
|
||||
return new ShardFaultTolerantPubSubConnection<>(name, pubSubConnection, retry, topologyChangedEventRetry,
|
||||
Schedulers.newSingle(name + "-redisPubSubEvents", true));
|
||||
}
|
||||
|
||||
}
|
||||
@@ -23,20 +23,22 @@ public class CircuitBreakerUtil {
|
||||
private static final String BREAKER_NAME_TAG_NAME = "breakerName";
|
||||
private static final String OUTCOME_TAG_NAME = "outcome";
|
||||
|
||||
public static void registerMetrics(CircuitBreaker circuitBreaker, Class<?> clazz) {
|
||||
public static void registerMetrics(CircuitBreaker circuitBreaker, Class<?> clazz, Tags additionalTags) {
|
||||
final String breakerName = clazz.getSimpleName() + "/" + circuitBreaker.getName();
|
||||
|
||||
final Counter successCounter = Metrics.counter(CIRCUIT_BREAKER_CALL_COUNTER_NAME,
|
||||
BREAKER_NAME_TAG_NAME, breakerName,
|
||||
OUTCOME_TAG_NAME, "success");
|
||||
additionalTags.and(
|
||||
BREAKER_NAME_TAG_NAME, breakerName,
|
||||
OUTCOME_TAG_NAME, "success"));
|
||||
|
||||
final Counter failureCounter = Metrics.counter(CIRCUIT_BREAKER_CALL_COUNTER_NAME,
|
||||
BREAKER_NAME_TAG_NAME, breakerName,
|
||||
OUTCOME_TAG_NAME, "failure");
|
||||
additionalTags.and(
|
||||
BREAKER_NAME_TAG_NAME, breakerName,
|
||||
OUTCOME_TAG_NAME, "failure"));
|
||||
|
||||
final Counter unpermittedCounter = Metrics.counter(CIRCUIT_BREAKER_CALL_COUNTER_NAME,
|
||||
BREAKER_NAME_TAG_NAME, breakerName,
|
||||
OUTCOME_TAG_NAME, "unpermitted");
|
||||
additionalTags.and(BREAKER_NAME_TAG_NAME, breakerName,
|
||||
OUTCOME_TAG_NAME, "unpermitted"));
|
||||
|
||||
circuitBreaker.getEventPublisher().onSuccess(event -> {
|
||||
successCounter.increment();
|
||||
|
||||
@@ -29,6 +29,7 @@ import org.whispersystems.textsecuregcm.configuration.dynamic.DynamicConfigurati
|
||||
import org.whispersystems.textsecuregcm.controllers.SecureStorageController;
|
||||
import org.whispersystems.textsecuregcm.controllers.SecureValueRecovery2Controller;
|
||||
import org.whispersystems.textsecuregcm.push.ClientPresenceManager;
|
||||
import org.whispersystems.textsecuregcm.redis.ClusterFaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.redis.FaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.securestorage.SecureStorageClient;
|
||||
import org.whispersystems.textsecuregcm.securevaluerecovery.SecureValueRecovery2Client;
|
||||
@@ -107,7 +108,7 @@ public class AssignUsernameCommand extends EnvironmentCommand<WhisperServerConfi
|
||||
|
||||
ClientResources redisClusterClientResources = ClientResources.builder().build();
|
||||
|
||||
FaultTolerantRedisCluster cacheCluster = new FaultTolerantRedisCluster("main_cache_cluster",
|
||||
FaultTolerantRedisCluster cacheCluster = new ClusterFaultTolerantRedisCluster("main_cache_cluster",
|
||||
configuration.getCacheClusterConfiguration(), redisClusterClientResources);
|
||||
|
||||
Scheduler messageDeliveryScheduler = Schedulers.fromExecutorService(
|
||||
@@ -173,13 +174,14 @@ public class AssignUsernameCommand extends EnvironmentCommand<WhisperServerConfi
|
||||
configuration.getDynamoDbTables().getMessages().getTableName(),
|
||||
configuration.getDynamoDbTables().getMessages().getExpiration(),
|
||||
messageDeletionExecutor);
|
||||
FaultTolerantRedisCluster messageInsertCacheCluster = new FaultTolerantRedisCluster("message_insert_cluster",
|
||||
FaultTolerantRedisCluster messageInsertCacheCluster = new ClusterFaultTolerantRedisCluster("message_insert_cluster",
|
||||
configuration.getMessageCacheConfiguration().getRedisClusterConfiguration(), redisClusterClientResources);
|
||||
FaultTolerantRedisCluster messageReadDeleteCluster = new FaultTolerantRedisCluster("message_read_delete_cluster",
|
||||
FaultTolerantRedisCluster messageReadDeleteCluster = new ClusterFaultTolerantRedisCluster(
|
||||
"message_read_delete_cluster",
|
||||
configuration.getMessageCacheConfiguration().getRedisClusterConfiguration(), redisClusterClientResources);
|
||||
FaultTolerantRedisCluster clientPresenceCluster = new FaultTolerantRedisCluster("client_presence_cluster",
|
||||
FaultTolerantRedisCluster clientPresenceCluster = new ClusterFaultTolerantRedisCluster("client_presence_cluster",
|
||||
configuration.getClientPresenceClusterConfiguration(), redisClusterClientResources);
|
||||
FaultTolerantRedisCluster rateLimitersCluster = new FaultTolerantRedisCluster("rate_limiters",
|
||||
FaultTolerantRedisCluster rateLimitersCluster = new ClusterFaultTolerantRedisCluster("rate_limiters",
|
||||
configuration.getRateLimitersCluster(), redisClusterClientResources);
|
||||
SecureValueRecovery2Client secureValueRecovery2Client = new SecureValueRecovery2Client(
|
||||
secureValueRecoveryCredentialsGenerator, secureValueRecoveryExecutor, secureValueRecoveryServiceRetryExecutor,
|
||||
|
||||
@@ -29,6 +29,7 @@ import org.whispersystems.textsecuregcm.controllers.SecureStorageController;
|
||||
import org.whispersystems.textsecuregcm.controllers.SecureValueRecovery2Controller;
|
||||
import org.whispersystems.textsecuregcm.metrics.MetricsUtil;
|
||||
import org.whispersystems.textsecuregcm.push.ClientPresenceManager;
|
||||
import org.whispersystems.textsecuregcm.redis.ClusterFaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.redis.FaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.securestorage.SecureStorageClient;
|
||||
import org.whispersystems.textsecuregcm.securevaluerecovery.SecureValueRecovery2Client;
|
||||
@@ -90,7 +91,7 @@ record CommandDependencies(
|
||||
|
||||
ClientResources redisClusterClientResources = ClientResources.builder().build();
|
||||
|
||||
FaultTolerantRedisCluster cacheCluster = new FaultTolerantRedisCluster("main_cache_cluster",
|
||||
FaultTolerantRedisCluster cacheCluster = new ClusterFaultTolerantRedisCluster("main_cache_cluster",
|
||||
configuration.getCacheClusterConfiguration(), redisClusterClientResources);
|
||||
|
||||
ScheduledExecutorService recurringJobExecutor = environment.lifecycle()
|
||||
@@ -161,13 +162,14 @@ record CommandDependencies(
|
||||
configuration.getDynamoDbTables().getMessages().getTableName(),
|
||||
configuration.getDynamoDbTables().getMessages().getExpiration(),
|
||||
messageDeletionExecutor);
|
||||
FaultTolerantRedisCluster messageInsertCacheCluster = new FaultTolerantRedisCluster("message_insert_cluster",
|
||||
FaultTolerantRedisCluster messageInsertCacheCluster = new ClusterFaultTolerantRedisCluster("message_insert_cluster",
|
||||
configuration.getMessageCacheConfiguration().getRedisClusterConfiguration(), redisClusterClientResources);
|
||||
FaultTolerantRedisCluster messageReadDeleteCluster = new FaultTolerantRedisCluster("message_read_delete_cluster",
|
||||
FaultTolerantRedisCluster messageReadDeleteCluster = new ClusterFaultTolerantRedisCluster(
|
||||
"message_read_delete_cluster",
|
||||
configuration.getMessageCacheConfiguration().getRedisClusterConfiguration(), redisClusterClientResources);
|
||||
FaultTolerantRedisCluster clientPresenceCluster = new FaultTolerantRedisCluster("client_presence_cluster",
|
||||
FaultTolerantRedisCluster clientPresenceCluster = new ClusterFaultTolerantRedisCluster("client_presence_cluster",
|
||||
configuration.getClientPresenceClusterConfiguration(), redisClusterClientResources);
|
||||
FaultTolerantRedisCluster rateLimitersCluster = new FaultTolerantRedisCluster("rate_limiters",
|
||||
FaultTolerantRedisCluster rateLimitersCluster = new ClusterFaultTolerantRedisCluster("rate_limiters",
|
||||
configuration.getRateLimitersCluster(), redisClusterClientResources);
|
||||
SecureValueRecovery2Client secureValueRecovery2Client = new SecureValueRecovery2Client(
|
||||
secureValueRecoveryCredentialsGenerator, secureValueRecoveryServiceExecutor,
|
||||
|
||||
@@ -19,6 +19,7 @@ import org.whispersystems.textsecuregcm.WhisperServerConfiguration;
|
||||
import org.whispersystems.textsecuregcm.metrics.MetricsUtil;
|
||||
import org.whispersystems.textsecuregcm.push.APNSender;
|
||||
import org.whispersystems.textsecuregcm.push.ApnPushNotificationScheduler;
|
||||
import org.whispersystems.textsecuregcm.redis.ClusterFaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.redis.FaultTolerantRedisCluster;
|
||||
import org.whispersystems.textsecuregcm.util.logging.UncaughtExceptionHandler;
|
||||
|
||||
@@ -63,7 +64,7 @@ public class ScheduledApnPushNotificationSenderServiceCommand extends ServerComm
|
||||
});
|
||||
}
|
||||
|
||||
final FaultTolerantRedisCluster pushSchedulerCluster = new FaultTolerantRedisCluster("push_scheduler",
|
||||
final FaultTolerantRedisCluster pushSchedulerCluster = new ClusterFaultTolerantRedisCluster("push_scheduler",
|
||||
configuration.getPushSchedulerCluster(), deps.redisClusterClientResources());
|
||||
|
||||
final ExecutorService apnSenderExecutor = environment.lifecycle().executorService(name(getClass(), "apnSender-%d"))
|
||||
|
||||
Reference in New Issue
Block a user