Add retry after exceptions during a cluster topology change event callback

This commit is contained in:
Chris Eager
2023-03-23 09:50:15 -05:00
committed by Chris Eager
parent 0cc84131de
commit 3ccfeb490b
13 changed files with 323 additions and 132 deletions

View File

@@ -10,28 +10,41 @@ import static org.whispersystems.textsecuregcm.metrics.MetricsUtil.name;
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
import io.github.resilience4j.retry.Retry;
import io.lettuce.core.RedisException;
import io.lettuce.core.cluster.event.ClusterTopologyChangedEvent;
import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection;
import io.micrometer.core.instrument.Metrics;
import io.micrometer.core.instrument.Timer;
import java.util.function.Consumer;
import java.util.function.Function;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil;
import reactor.core.scheduler.Scheduler;
public class FaultTolerantPubSubConnection<K, V> {
private static final Logger logger = LoggerFactory.getLogger(FaultTolerantPubSubConnection.class);
private final String name;
private final StatefulRedisClusterPubSubConnection<K, V> pubSubConnection;
private final CircuitBreaker circuitBreaker;
private final Retry retry;
private final Retry resubscribeRetry;
private final Scheduler topologyChangedEventScheduler;
private final Timer executeTimer;
public FaultTolerantPubSubConnection(final String name,
final StatefulRedisClusterPubSubConnection<K, V> pubSubConnection, final CircuitBreaker circuitBreaker,
final Retry retry) {
final Retry retry, final Retry resubscribeRetry, final Scheduler topologyChangedEventScheduler) {
this.name = name;
this.pubSubConnection = pubSubConnection;
this.circuitBreaker = circuitBreaker;
this.retry = retry;
this.resubscribeRetry = resubscribeRetry;
this.topologyChangedEventScheduler = topologyChangedEventScheduler;
this.pubSubConnection.setNodeMessagePropagation(true);
@@ -58,11 +71,32 @@ public class FaultTolerantPubSubConnection<K, V> {
return circuitBreaker.executeCheckedSupplier(
() -> retry.executeCallable(() -> executeTimer.record(() -> function.apply(pubSubConnection))));
} catch (final Throwable t) {
if (t instanceof RedisException) {
throw (RedisException) t;
} else {
throw new RedisException(t);
}
if (t instanceof RedisException) {
throw (RedisException) t;
} else {
throw new RedisException(t);
}
}
}
public void subscribeToClusterTopologyChangedEvents(final Runnable eventHandler) {
usePubSubConnection(connection -> connection.getResources().eventBus().get()
.filter(event -> event instanceof ClusterTopologyChangedEvent)
.subscribeOn(topologyChangedEventScheduler)
.subscribe(event -> {
logger.info("Got topology change event for {}, resubscribing all keyspace notifications", name);
resubscribeRetry.executeRunnable(() -> {
try {
eventHandler.run();
} catch (final RuntimeException e) {
logger.warn("Resubscribe for {} failed", name, e);
throw e;
}
});
}));
}
}

View File

@@ -7,9 +7,11 @@ package org.whispersystems.textsecuregcm.redis;
import com.google.common.annotations.VisibleForTesting;
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
import io.github.resilience4j.core.IntervalFunction;
import io.github.resilience4j.reactor.circuitbreaker.operator.CircuitBreakerOperator;
import io.github.resilience4j.reactor.retry.RetryOperator;
import io.github.resilience4j.retry.Retry;
import io.github.resilience4j.retry.RetryConfig;
import io.lettuce.core.ClientOptions.DisconnectedBehavior;
import io.lettuce.core.RedisCommandTimeoutException;
import io.lettuce.core.RedisException;
@@ -31,6 +33,7 @@ import org.whispersystems.textsecuregcm.configuration.RedisClusterConfiguration;
import org.whispersystems.textsecuregcm.configuration.RetryConfiguration;
import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil;
import reactor.core.publisher.Flux;
import reactor.core.scheduler.Schedulers;
/**
* A fault-tolerant access manager for a Redis cluster. A fault-tolerant Redis cluster provides managed,
@@ -38,51 +41,61 @@ import reactor.core.publisher.Flux;
*/
public class FaultTolerantRedisCluster {
private final String name;
private final String name;
private final RedisClusterClient clusterClient;
private final RedisClusterClient clusterClient;
private final StatefulRedisClusterConnection<String, String> stringConnection;
private final StatefulRedisClusterConnection<byte[], byte[]> binaryConnection;
private final StatefulRedisClusterConnection<String, String> stringConnection;
private final StatefulRedisClusterConnection<byte[], byte[]> binaryConnection;
private final List<StatefulRedisClusterPubSubConnection<?, ?>> pubSubConnections = new ArrayList<>();
private final List<StatefulRedisClusterPubSubConnection<?, ?>> pubSubConnections = new ArrayList<>();
private final CircuitBreaker circuitBreaker;
private final Retry retry;
private final CircuitBreaker circuitBreaker;
private final Retry retry;
private final Retry topologyChangedEventRetry;
public FaultTolerantRedisCluster(final String name, final RedisClusterConfiguration clusterConfiguration, final ClientResources clientResources) {
this(name,
RedisClusterClient.create(clientResources, clusterConfiguration.getConfigurationUri()),
clusterConfiguration.getTimeout(),
clusterConfiguration.getCircuitBreakerConfiguration(),
clusterConfiguration.getRetryConfiguration());
}
public FaultTolerantRedisCluster(final String name, final RedisClusterConfiguration clusterConfiguration,
final ClientResources clientResources) {
this(name,
RedisClusterClient.create(clientResources, clusterConfiguration.getConfigurationUri()),
clusterConfiguration.getTimeout(),
clusterConfiguration.getCircuitBreakerConfiguration(),
clusterConfiguration.getRetryConfiguration());
}
@VisibleForTesting
FaultTolerantRedisCluster(final String name, final RedisClusterClient clusterClient, final Duration commandTimeout, final CircuitBreakerConfiguration circuitBreakerConfiguration, final RetryConfiguration retryConfiguration) {
this.name = name;
@VisibleForTesting
FaultTolerantRedisCluster(final String name, final RedisClusterClient clusterClient, final Duration commandTimeout,
final CircuitBreakerConfiguration circuitBreakerConfiguration, final RetryConfiguration retryConfiguration) {
this.name = name;
this.clusterClient = clusterClient;
this.clusterClient.setDefaultTimeout(commandTimeout);
this.clusterClient.setOptions(ClusterClientOptions.builder()
.disconnectedBehavior(DisconnectedBehavior.REJECT_COMMANDS)
.validateClusterNodeMembership(false)
.topologyRefreshOptions(ClusterTopologyRefreshOptions.builder()
.enableAllAdaptiveRefreshTriggers()
.build())
.publishOnScheduler(true)
.build());
this.clusterClient = clusterClient;
this.clusterClient.setDefaultTimeout(commandTimeout);
this.clusterClient.setOptions(ClusterClientOptions.builder()
.disconnectedBehavior(DisconnectedBehavior.REJECT_COMMANDS)
.validateClusterNodeMembership(false)
.topologyRefreshOptions(ClusterTopologyRefreshOptions.builder()
.enableAllAdaptiveRefreshTriggers()
.build())
.publishOnScheduler(true)
.build());
this.stringConnection = clusterClient.connect();
this.binaryConnection = clusterClient.connect(ByteArrayCodec.INSTANCE);
this.stringConnection = clusterClient.connect();
this.binaryConnection = clusterClient.connect(ByteArrayCodec.INSTANCE);
this.circuitBreaker = CircuitBreaker.of(name + "-breaker", circuitBreakerConfiguration.toCircuitBreakerConfig());
this.retry = Retry.of(name + "-retry", retryConfiguration.toRetryConfigBuilder()
.retryOnException(exception -> exception instanceof RedisCommandTimeoutException).build());
this.circuitBreaker = CircuitBreaker.of(name + "-breaker", circuitBreakerConfiguration.toCircuitBreakerConfig());
this.retry = Retry.of(name + "-retry", retryConfiguration.toRetryConfigBuilder()
.retryOnException(exception -> exception instanceof RedisCommandTimeoutException).build());
final RetryConfig topologyChangedEventRetryConfig = RetryConfig.custom()
.maxAttempts(Integer.MAX_VALUE)
.intervalFunction(
IntervalFunction.ofExponentialRandomBackoff(Duration.ofSeconds(1), 1.5, Duration.ofSeconds(30)))
.build();
this.topologyChangedEventRetry = Retry.of(name + "-topologyChangedRetry", topologyChangedEventRetryConfig);
CircuitBreakerUtil.registerMetrics(circuitBreaker, FaultTolerantRedisCluster.class);
CircuitBreakerUtil.registerMetrics(retry, FaultTolerantRedisCluster.class);
}
}
void shutdown() {
stringConnection.close();
@@ -158,6 +171,7 @@ public class FaultTolerantRedisCluster {
final StatefulRedisClusterPubSubConnection<String, String> pubSubConnection = clusterClient.connectPubSub();
pubSubConnections.add(pubSubConnection);
return new FaultTolerantPubSubConnection<>(name, pubSubConnection, circuitBreaker, retry);
return new FaultTolerantPubSubConnection<>(name, pubSubConnection, circuitBreaker, retry, topologyChangedEventRetry,
Schedulers.newSingle(name + "-redisPubSubEvents"));
}
}