Add per-shard Redis circuit breakers

This commit is contained in:
Chris Eager
2024-04-12 11:22:41 -05:00
committed by GitHub
parent 05a92494bb
commit 2dc707d86e
22 changed files with 1677 additions and 313 deletions

View File

@@ -30,6 +30,7 @@ import io.lettuce.core.cluster.pubsub.api.sync.RedisClusterPubSubCommands;
import io.lettuce.core.event.Event;
import io.lettuce.core.event.EventBus;
import io.lettuce.core.resource.ClientResources;
import java.time.Duration;
import java.util.Collections;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
@@ -63,7 +64,7 @@ class FaultTolerantPubSubConnectionTest {
breakerConfiguration.setFailureRateThreshold(100);
breakerConfiguration.setSlidingWindowSize(1);
breakerConfiguration.setSlidingWindowMinimumNumberOfCalls(1);
breakerConfiguration.setWaitDurationInOpenStateInSeconds(Integer.MAX_VALUE);
breakerConfiguration.setWaitDurationInOpenState(Duration.ofSeconds(Integer.MAX_VALUE));
final RetryConfiguration retryConfiguration = new RetryConfiguration();
retryConfiguration.setMaxAttempts(3);
@@ -78,7 +79,7 @@ class FaultTolerantPubSubConnectionTest {
.build();
final Retry resubscribeRetry = Retry.of("test-resubscribe", resubscribeRetryConfiguration);
faultTolerantPubSubConnection = new FaultTolerantPubSubConnection<>("test", pubSubConnection, circuitBreaker,
faultTolerantPubSubConnection = new ClusterFaultTolerantPubSubConnection<>("test", pubSubConnection, circuitBreaker,
retry, resubscribeRetry, Schedulers.newSingle("test"));
}

View File

@@ -6,9 +6,9 @@
package org.whispersystems.textsecuregcm.redis;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTimeoutPreemptively;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.when;
@@ -60,13 +60,13 @@ class FaultTolerantRedisClusterTest {
breakerConfiguration.setFailureRateThreshold(100);
breakerConfiguration.setSlidingWindowSize(1);
breakerConfiguration.setSlidingWindowMinimumNumberOfCalls(1);
breakerConfiguration.setWaitDurationInOpenStateInSeconds(Integer.MAX_VALUE);
breakerConfiguration.setWaitDurationInOpenState(Duration.ofSeconds(Integer.MAX_VALUE));
final RetryConfiguration retryConfiguration = new RetryConfiguration();
retryConfiguration.setMaxAttempts(3);
retryConfiguration.setWaitDuration(0);
faultTolerantCluster = new FaultTolerantRedisCluster("test", clusterClient, Duration.ofSeconds(2),
faultTolerantCluster = new ClusterFaultTolerantRedisCluster("test", clusterClient, Duration.ofSeconds(2),
breakerConfiguration, retryConfiguration);
}
@@ -84,7 +84,7 @@ class FaultTolerantRedisClusterTest {
final RedisException redisException = assertThrows(RedisException.class,
() -> faultTolerantCluster.withCluster(connection -> connection.sync().get("OH NO")));
assertTrue(redisException.getCause() instanceof CallNotPermittedException);
assertInstanceOf(CallNotPermittedException.class, redisException.getCause());
}
@Test
@@ -132,7 +132,7 @@ class FaultTolerantRedisClusterTest {
assertTimeoutPreemptively(Duration.ofSeconds(1), () -> {
final ExecutionException asyncException = assertThrows(ExecutionException.class,
() -> cluster.withCluster(connection -> connection.async().blpop(TIMEOUT.toMillis() * 2, "key")).get());
assertTrue(asyncException.getCause() instanceof RedisCommandTimeoutException);
assertInstanceOf(RedisCommandTimeoutException.class, asyncException.getCause());
assertThrows(RedisCommandTimeoutException.class,
() -> cluster.withCluster(connection -> connection.sync().blpop(TIMEOUT.toMillis() * 2, "key")));

View File

@@ -0,0 +1,149 @@
/*
* Copyright 2024 Signal Messenger, LLC
* SPDX-License-Identifier: AGPL-3.0-only
*/
package org.whispersystems.textsecuregcm.redis;
import static org.junit.jupiter.api.Assertions.assertNotNull;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.anyLong;
import static org.mockito.Mockito.doThrow;
import static org.mockito.Mockito.eq;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.verifyNoInteractions;
import io.github.resilience4j.circuitbreaker.CallNotPermittedException;
import io.github.resilience4j.circuitbreaker.CircuitBreaker;
import io.lettuce.core.ClientOptions;
import io.lettuce.core.codec.StringCodec;
import io.lettuce.core.output.StatusOutput;
import io.lettuce.core.protocol.AsyncCommand;
import io.lettuce.core.protocol.Command;
import io.lettuce.core.protocol.CommandHandler;
import io.lettuce.core.protocol.CommandType;
import io.lettuce.core.protocol.Endpoint;
import io.lettuce.core.resource.ClientResources;
import io.netty.channel.Channel;
import io.netty.channel.ChannelHandlerContext;
import io.netty.channel.ChannelPromise;
import io.netty.channel.embedded.EmbeddedChannel;
import java.io.IOException;
import java.net.SocketAddress;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.stream.StreamSupport;
import javax.annotation.Nullable;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;
import org.whispersystems.textsecuregcm.configuration.CircuitBreakerConfiguration;
/**
 * Unit tests for {@link LettuceShardCircuitBreaker} and the
 * {@link LettuceShardCircuitBreaker.ChannelCircuitBreakerHandler} it installs on channels.
 */
class LettuceShardCircuitBreakerTest {

  private LettuceShardCircuitBreaker.ChannelCircuitBreakerHandler channelCircuitBreakerHandler;

  @BeforeEach
  void setUp() {
    channelCircuitBreakerHandler = new LettuceShardCircuitBreaker.ChannelCircuitBreakerHandler(
        "test",
        new CircuitBreakerConfiguration().toCircuitBreakerConfig());
  }

  /**
   * Checks that {@code afterChannelInitialized} adds a circuit breaker handler to the pipeline, and that it is
   * positioned ahead of the Lettuce {@link CommandHandler} in pipeline iteration order.
   */
  @Test
  void testAfterChannelInitialized() {
    final LettuceShardCircuitBreaker lettuceShardCircuitBreaker = new LettuceShardCircuitBreaker("test",
        new CircuitBreakerConfiguration().toCircuitBreakerConfig());

    // Lettuce client resources must be shut down once no longer needed, so keep a handle on the instance
    // instead of creating it inline and abandoning it.
    final ClientResources clientResources = ClientResources.create();

    try {
      final Channel channel = new EmbeddedChannel(
          new CommandHandler(ClientOptions.create(), clientResources, mock(Endpoint.class)));

      lettuceShardCircuitBreaker.afterChannelInitialized(channel);

      final AtomicBoolean foundCommandHandler = new AtomicBoolean(false);
      final AtomicBoolean foundChannelCircuitBreakerHandler = new AtomicBoolean(false);

      StreamSupport.stream(channel.pipeline().spliterator(), false)
          .forEach(nameAndHandler -> {
            if (nameAndHandler.getValue() instanceof CommandHandler) {
              foundCommandHandler.set(true);
            }
            if (nameAndHandler.getValue() instanceof LettuceShardCircuitBreaker.ChannelCircuitBreakerHandler) {
              foundChannelCircuitBreakerHandler.set(true);
            }

            // once the command handler has been seen, the breaker handler must already have been seen
            if (foundCommandHandler.get()) {
              assertTrue(foundChannelCircuitBreakerHandler.get(),
                  "circuit breaker handler should be before the command handler");
            }
          });

      assertTrue(foundChannelCircuitBreakerHandler.get());
      assertTrue(foundCommandHandler.get());
    } finally {
      clientResources.shutdown();
    }
  }

  /**
   * Checks that the handler's {@code breaker} field is populated after {@code connect} has been invoked.
   */
  @Test
  void testHandlerConnect() throws Exception {
    channelCircuitBreakerHandler.connect(mock(ChannelHandlerContext.class), mock(SocketAddress.class),
        mock(SocketAddress.class), mock(ChannelPromise.class));

    assertNotNull(channelCircuitBreakerHandler.breaker);
  }

  /**
   * With a breaker that grants permission, the write must be forwarded and the command's eventual outcome
   * (success, or failure with {@code t}) must be reported to the breaker.
   *
   * @param t the error to complete the command with, or {@code null} to complete it successfully
   */
  @ParameterizedTest
  @MethodSource
  void testHandlerWriteBreakerClosed(@Nullable final Throwable t) throws Exception {
    final CircuitBreaker breaker = mock(CircuitBreaker.class);
    channelCircuitBreakerHandler.breaker = breaker;

    final AsyncCommand<String, String, String> command = new AsyncCommand<>(
        new Command<>(CommandType.PING, new StatusOutput<>(StringCodec.ASCII)));
    final ChannelHandlerContext channelHandlerContext = mock(ChannelHandlerContext.class);
    final ChannelPromise channelPromise = mock(ChannelPromise.class);

    channelCircuitBreakerHandler.write(channelHandlerContext, command, channelPromise);

    verify(breaker).acquirePermission();

    if (t != null) {
      command.completeExceptionally(t);
      verify(breaker).onError(anyLong(), eq(TimeUnit.NANOSECONDS), eq(t));
    } else {
      command.complete("PONG");
      verify(breaker).onSuccess(anyLong(), eq(TimeUnit.NANOSECONDS));
    }

    // write should always be forwarded when the breaker is closed
    verify(channelHandlerContext).write(command, channelPromise);
  }

  /**
   * Argument source for {@link #testHandlerWriteBreakerClosed(Throwable)}: {@code null} (successful completion)
   * and an {@link IOException}. A mutable list is used because one of the elements is {@code null}.
   */
  static List<Throwable> testHandlerWriteBreakerClosed() {
    final List<Throwable> errors = new ArrayList<>();
    errors.add(null);
    errors.add(new IOException("timeout"));

    return errors;
  }

  /**
   * When the breaker refuses permission, the command and the channel promise must both be failed with the
   * breaker's exception, and nothing may be passed on to the channel handler context.
   */
  @Test
  void testHandlerWriteBreakerOpen() throws Exception {
    final CircuitBreaker breaker = mock(CircuitBreaker.class);
    channelCircuitBreakerHandler.breaker = breaker;

    final CallNotPermittedException callNotPermittedException = mock(CallNotPermittedException.class);
    doThrow(callNotPermittedException).when(breaker).acquirePermission();

    @SuppressWarnings("unchecked") final AsyncCommand<String, String, String> command = mock(AsyncCommand.class);
    final ChannelHandlerContext channelHandlerContext = mock(ChannelHandlerContext.class);
    final ChannelPromise channelPromise = mock(ChannelPromise.class);

    channelCircuitBreakerHandler.write(channelHandlerContext, command, channelPromise);

    verify(command).completeExceptionally(callNotPermittedException);
    verify(channelPromise).tryFailure(callNotPermittedException);

    verifyNoInteractions(channelHandlerContext);
  }
}

View File

@@ -20,7 +20,6 @@ import java.net.ServerSocket;
import java.time.Duration;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;
import org.junit.jupiter.api.extension.AfterAllCallback;
import org.junit.jupiter.api.extension.AfterEachCallback;
import org.junit.jupiter.api.extension.BeforeAllCallback;
@@ -50,8 +49,8 @@ public class RedisClusterExtension implements BeforeAllCallback, BeforeEachCallb
}
public static RedisClusterExtensionBuilder builder() {
return new RedisClusterExtensionBuilder();
public static Builder builder() {
return new Builder();
}
@Override
@@ -81,12 +80,9 @@ public class RedisClusterExtension implements BeforeAllCallback, BeforeEachCallb
@Override
public void beforeEach(final ExtensionContext context) throws Exception {
final List<String> urls = Arrays.stream(CLUSTER_NODES)
.map(node -> String.format("redis://127.0.0.1:%d", node.ports().get(0)))
.toList();
redisCluster = new FaultTolerantRedisCluster("test-cluster",
RedisClusterClient.create(urls.stream().map(RedisURI::create).collect(Collectors.toList())),
redisCluster = new ClusterFaultTolerantRedisCluster("test-cluster",
RedisClusterClient.create(getRedisURIs()),
timeout,
new CircuitBreakerConfiguration(),
retryConfiguration);
@@ -120,6 +116,13 @@ public class RedisClusterExtension implements BeforeAllCallback, BeforeEachCallb
redisCluster.useCluster(connection -> connection.sync().flushall());
}
public static List<RedisURI> getRedisURIs() {
return Arrays.stream(CLUSTER_NODES)
.map(node -> "redis://127.0.0.1:%d".formatted(node.ports().getFirst()))
.map(RedisURI::create)
.toList();
}
public FaultTolerantRedisCluster getRedisCluster() {
return redisCluster;
}
@@ -140,12 +143,12 @@ public class RedisClusterExtension implements BeforeAllCallback, BeforeEachCallb
}
private static void assembleCluster(final RedisServer... nodes) throws InterruptedException {
try (final RedisClient meetClient = RedisClient.create(RedisURI.create("127.0.0.1", nodes[0].ports().get(0)))) {
try (final RedisClient meetClient = RedisClient.create(RedisURI.create("127.0.0.1", nodes[0].ports().getFirst()))) {
final StatefulRedisConnection<String, String> connection = meetClient.connect();
final RedisCommands<String, String> commands = connection.sync();
for (int i = 1; i < nodes.length; i++) {
commands.clusterMeet("127.0.0.1", nodes[i].ports().get(0));
commands.clusterMeet("127.0.0.1", nodes[i].ports().getFirst());
}
}
@@ -155,7 +158,8 @@ public class RedisClusterExtension implements BeforeAllCallback, BeforeEachCallb
final int startInclusive = i * slotsPerNode;
final int endExclusive = i == nodes.length - 1 ? SlotHash.SLOT_COUNT : (i + 1) * slotsPerNode;
try (final RedisClient assignSlotClient = RedisClient.create(RedisURI.create("127.0.0.1", nodes[i].ports().get(0)));
try (final RedisClient assignSlotClient = RedisClient.create(
RedisURI.create("127.0.0.1", nodes[i].ports().getFirst()));
final StatefulRedisConnection<String, String> assignSlotConnection = assignSlotClient.connect()) {
final int[] slots = new int[endExclusive - startInclusive];
@@ -167,7 +171,7 @@ public class RedisClusterExtension implements BeforeAllCallback, BeforeEachCallb
}
}
try (final RedisClient waitClient = RedisClient.create(RedisURI.create("127.0.0.1", nodes[0].ports().get(0)));
try (final RedisClient waitClient = RedisClient.create(RedisURI.create("127.0.0.1", nodes[0].ports().getFirst()));
final StatefulRedisConnection<String, String> connection = waitClient.connect()) {
// CLUSTER INFO gives us a big blob of key-value pairs, but the one we're interested in is `cluster_state`.
// According to https://redis.io/commands/cluster-info, `cluster_state:ok` means that the node is ready to
@@ -181,7 +185,7 @@ public class RedisClusterExtension implements BeforeAllCallback, BeforeEachCallb
if (tries == 20) {
throw new RuntimeException(
String.format("Timeout: Redis not ready after waiting %d milliseconds", tries * sleepMillis));
"Timeout: Redis not ready after waiting %d milliseconds".formatted(tries * sleepMillis));
}
}
}
@@ -215,20 +219,20 @@ public class RedisClusterExtension implements BeforeAllCallback, BeforeEachCallb
}
}
public static class RedisClusterExtensionBuilder {
public static class Builder {
private Duration timeout = DEFAULT_TIMEOUT;
private RetryConfiguration retryConfiguration = new RetryConfiguration();
private RedisClusterExtensionBuilder() {
private Builder() {
}
RedisClusterExtensionBuilder timeout(Duration timeout) {
Builder timeout(Duration timeout) {
this.timeout = timeout;
return this;
}
RedisClusterExtensionBuilder retryConfiguration(RetryConfiguration retryConfiguration) {
Builder retryConfiguration(RetryConfiguration retryConfiguration) {
this.retryConfiguration = retryConfiguration;
return this;
}

View File

@@ -0,0 +1,197 @@
/*
* Copyright 2013-2020 Signal Messenger, LLC
* SPDX-License-Identifier: AGPL-3.0-only
*/
package org.whispersystems.textsecuregcm.redis;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import static org.mockito.ArgumentMatchers.any;
import static org.mockito.ArgumentMatchers.anyString;
import static org.mockito.Mockito.atLeastOnce;
import static org.mockito.Mockito.clearInvocations;
import static org.mockito.Mockito.mock;
import static org.mockito.Mockito.reset;
import static org.mockito.Mockito.verify;
import static org.mockito.Mockito.when;

import io.github.resilience4j.core.IntervalFunction;
import io.github.resilience4j.retry.Retry;
import io.github.resilience4j.retry.RetryConfig;
import io.lettuce.core.RedisCommandTimeoutException;
import io.lettuce.core.RedisException;
import io.lettuce.core.cluster.event.ClusterTopologyChangedEvent;
import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection;
import io.lettuce.core.cluster.pubsub.api.sync.RedisClusterPubSubCommands;
import io.lettuce.core.event.Event;
import io.lettuce.core.event.EventBus;
import io.lettuce.core.resource.ClientResources;
import java.util.Collections;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Nested;
import org.junit.jupiter.api.Test;
import org.whispersystems.textsecuregcm.configuration.RetryConfiguration;
import reactor.core.publisher.Flux;
import reactor.core.scheduler.Scheduler;
import reactor.core.scheduler.Schedulers;
import reactor.test.publisher.TestPublisher;
class ShardFaultTolerantPubSubConnectionTest {
private StatefulRedisClusterPubSubConnection<String, String> pubSubConnection;
private RedisClusterPubSubCommands<String, String> pubSubCommands;
private ShardFaultTolerantPubSubConnection<String, String> faultTolerantPubSubConnection;
@SuppressWarnings("unchecked")
@BeforeEach
public void setUp() {
pubSubConnection = mock(StatefulRedisClusterPubSubConnection.class);
pubSubCommands = mock(RedisClusterPubSubCommands.class);
when(pubSubConnection.sync()).thenReturn(pubSubCommands);
final RetryConfiguration retryConfiguration = new RetryConfiguration();
retryConfiguration.setMaxAttempts(3);
retryConfiguration.setWaitDuration(10);
final Retry retry = Retry.of("test", retryConfiguration.toRetryConfig());
final RetryConfig resubscribeRetryConfiguration = RetryConfig.custom()
.maxAttempts(Integer.MAX_VALUE)
.intervalFunction(IntervalFunction.ofExponentialBackoff(5))
.build();
final Retry resubscribeRetry = Retry.of("test-resubscribe", resubscribeRetryConfiguration);
faultTolerantPubSubConnection = new ShardFaultTolerantPubSubConnection<>("test", pubSubConnection,
retry, resubscribeRetry, Schedulers.newSingle("test"));
}
@Test
void testRetry() {
when(pubSubCommands.get(anyString()))
.thenThrow(new RedisCommandTimeoutException())
.thenThrow(new RedisCommandTimeoutException())
.thenReturn("value");
assertEquals("value",
faultTolerantPubSubConnection.withPubSubConnection(connection -> connection.sync().get("key")));
when(pubSubCommands.get(anyString()))
.thenThrow(new RedisCommandTimeoutException())
.thenThrow(new RedisCommandTimeoutException())
.thenThrow(new RedisCommandTimeoutException())
.thenReturn("value");
assertThrows(RedisCommandTimeoutException.class,
() -> faultTolerantPubSubConnection.withPubSubConnection(connection -> connection.sync().get("key")));
}
@Nested
class ClusterTopologyChangedEventTest {
private TestPublisher<Event> eventPublisher;
private Runnable resubscribe;
private AtomicInteger resubscribeCounter;
private CountDownLatch resubscribeFailure;
private CountDownLatch resubscribeSuccess;
@BeforeEach
@SuppressWarnings("unchecked")
void setup() {
// ignore inherited stubbing
reset(pubSubConnection);
eventPublisher = TestPublisher.createCold();
final ClientResources clientResources = mock(ClientResources.class);
when(pubSubConnection.getResources())
.thenReturn(clientResources);
final EventBus eventBus = mock(EventBus.class);
when(clientResources.eventBus())
.thenReturn(eventBus);
final Flux<Event> eventFlux = Flux.from(eventPublisher);
when(eventBus.get()).thenReturn(eventFlux);
resubscribeCounter = new AtomicInteger();
resubscribe = () -> {
try {
resubscribeCounter.incrementAndGet();
pubSubConnection.sync().nodes((ignored) -> true);
resubscribeSuccess.countDown();
} catch (final RuntimeException e) {
resubscribeFailure.countDown();
throw e;
}
};
resubscribeSuccess = new CountDownLatch(1);
resubscribeFailure = new CountDownLatch(1);
}
@SuppressWarnings("unchecked")
@Test
void testSubscribeToClusterTopologyChangedEvents() throws Exception {
when(pubSubConnection.sync())
.thenThrow(new RedisException("Cluster unavailable"));
eventPublisher.next(new ClusterTopologyChangedEvent(Collections.emptyList(), Collections.emptyList()));
faultTolerantPubSubConnection.subscribeToClusterTopologyChangedEvents(resubscribe);
assertTrue(resubscribeFailure.await(1, TimeUnit.SECONDS));
// simulate cluster recovery - no more exceptions, run the retry
reset(pubSubConnection);
clearInvocations(pubSubCommands);
when(pubSubConnection.sync())
.thenReturn(pubSubCommands);
assertTrue(resubscribeSuccess.await(1, TimeUnit.SECONDS));
assertTrue(resubscribeCounter.get() >= 2, String.format("resubscribe called %d times", resubscribeCounter.get()));
verify(pubSubCommands).nodes(any());
}
@Test
@SuppressWarnings("unchecked")
void testMultipleEventsWithPendingRetries() throws Exception {
// more complicated scenario: multiple events while retries are pending
// cluster is down
when(pubSubConnection.sync())
.thenThrow(new RedisException("Cluster unavailable"));
// publish multiple topology changed events
eventPublisher.next(new ClusterTopologyChangedEvent(Collections.emptyList(), Collections.emptyList()));
eventPublisher.next(new ClusterTopologyChangedEvent(Collections.emptyList(), Collections.emptyList()));
eventPublisher.next(new ClusterTopologyChangedEvent(Collections.emptyList(), Collections.emptyList()));
eventPublisher.next(new ClusterTopologyChangedEvent(Collections.emptyList(), Collections.emptyList()));
faultTolerantPubSubConnection.subscribeToClusterTopologyChangedEvents(resubscribe);
assertTrue(resubscribeFailure.await(1, TimeUnit.SECONDS));
// simulate cluster recovery - no more exceptions, run the retry
reset(pubSubConnection);
clearInvocations(pubSubCommands);
when(pubSubConnection.sync())
.thenReturn(pubSubCommands);
assertTrue(resubscribeSuccess.await(1, TimeUnit.SECONDS));
verify(pubSubCommands, atLeastOnce()).nodes(any());
}
}
}

View File

@@ -0,0 +1,495 @@
/*
* Copyright 2024 Signal Messenger, LLC
* SPDX-License-Identifier: AGPL-3.0-only
*/
package org.whispersystems.textsecuregcm.redis;
import static org.junit.jupiter.api.Assertions.assertEquals;
import static org.junit.jupiter.api.Assertions.assertInstanceOf;
import static org.junit.jupiter.api.Assertions.assertThrows;
import static org.junit.jupiter.api.Assertions.assertTrue;
import io.github.resilience4j.circuitbreaker.CallNotPermittedException;
import io.lettuce.core.RedisCommandTimeoutException;
import io.lettuce.core.RedisException;
import io.lettuce.core.RedisURI;
import io.lettuce.core.cluster.models.partitions.ClusterPartitionParser;
import io.lettuce.core.cluster.models.partitions.Partitions;
import io.lettuce.core.cluster.models.partitions.RedisClusterNode;
import io.lettuce.core.cluster.pubsub.RedisClusterPubSubAdapter;
import io.lettuce.core.event.EventBus;
import io.lettuce.core.event.EventPublisherOptions;
import io.lettuce.core.metrics.CommandLatencyCollectorOptions;
import io.lettuce.core.metrics.CommandLatencyRecorder;
import io.lettuce.core.resource.ClientResources;
import io.lettuce.core.resource.Delay;
import io.lettuce.core.resource.DnsResolver;
import io.lettuce.core.resource.EventLoopGroupProvider;
import io.lettuce.core.resource.NettyCustomizer;
import io.lettuce.core.resource.SocketAddressResolver;
import io.lettuce.core.resource.ThreadFactoryProvider;
import io.lettuce.core.tracing.Tracing;
import io.netty.bootstrap.Bootstrap;
import io.netty.channel.Channel;
import io.netty.channel.ChannelDuplexHandler;
import io.netty.channel.ChannelHandler;
import io.netty.channel.ChannelHandlerContext;
import io.netty.channel.ChannelPromise;
import io.netty.resolver.AddressResolverGroup;
import io.netty.util.Timer;
import io.netty.util.concurrent.EventExecutorGroup;
import java.net.InetSocketAddress;
import java.net.SocketAddress;
import java.time.Duration;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutionException;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicReference;
import java.util.function.Supplier;
import org.apache.commons.lang3.StringUtils;
import org.jetbrains.annotations.Nullable;
import org.junit.jupiter.api.AfterEach;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.api.Timeout;
import org.junit.jupiter.api.extension.RegisterExtension;
import org.whispersystems.textsecuregcm.configuration.CircuitBreakerConfiguration;
import org.whispersystems.textsecuregcm.configuration.RetryConfiguration;
import org.whispersystems.textsecuregcm.util.Pair;
import org.whispersystems.textsecuregcm.util.RedisClusterUtil;
// ThreadMode.SEPARATE_THREAD protects against hangs in the remote Redis calls, as this mode allows the test code to be
// preempted by the timeout check
@Timeout(value = 5, threadMode = Timeout.ThreadMode.SEPARATE_THREAD)
class ShardFaultTolerantRedisClusterTest {
private static final Duration TIMEOUT = Duration.ofMillis(50);
private static final RetryConfiguration RETRY_CONFIGURATION = new RetryConfiguration();
static {
RETRY_CONFIGURATION.setMaxAttempts(1);
RETRY_CONFIGURATION.setWaitDuration(50);
}
@RegisterExtension
static final RedisClusterExtension REDIS_CLUSTER_EXTENSION = RedisClusterExtension.builder()
.retryConfiguration(RETRY_CONFIGURATION)
.timeout(TIMEOUT)
.build();
private ShardFaultTolerantRedisCluster cluster;
/**
 * Builds a {@link ShardFaultTolerantRedisCluster} named "test" against the URIs exposed by
 * {@link RedisClusterExtension}, using the class-level {@code TIMEOUT} and {@code RETRY_CONFIGURATION}.
 *
 * @param circuitBreakerConfiguration breaker settings to use; when {@code null}, a default-constructed
 *                                    {@link CircuitBreakerConfiguration} is substituted
 * @param clientResourcesBuilder      builder for the client resources handed to the cluster
 * @return the newly constructed cluster
 */
private static ShardFaultTolerantRedisCluster buildCluster(
    @Nullable final CircuitBreakerConfiguration circuitBreakerConfiguration,
    final ClientResources.Builder clientResourcesBuilder) {

  final CircuitBreakerConfiguration effectiveBreakerConfiguration = circuitBreakerConfiguration != null
      ? circuitBreakerConfiguration
      : new CircuitBreakerConfiguration();

  return new ShardFaultTolerantRedisCluster(
      "test",
      clientResourcesBuilder,
      RedisClusterExtension.getRedisURIs(),
      TIMEOUT,
      effectiveBreakerConfiguration,
      RETRY_CONFIGURATION);
}
/**
 * Shuts down the cluster built by the test that just ran.
 */
@AfterEach
void tearDown() {
  // A test can fail before it assigns `cluster`; skip the shutdown in that case so the original
  // failure is not accompanied by an unrelated NullPointerException.
  if (cluster != null) {
    cluster.shutdown();
  }
}
/**
 * A BLPOP that blocks for twice the client timeout must surface a {@link RedisCommandTimeoutException}:
 * wrapped in an {@link ExecutionException} on the async API, and thrown directly on the sync API.
 */
@Test
void testTimeout() {
  cluster = buildCluster(null, ClientResources.builder());

  // BLPOP takes its timeout in (fractional) seconds
  final double blockingSeconds = 2 * TIMEOUT.toMillis() / 1000d;

  final ExecutionException asyncFailure = assertThrows(ExecutionException.class,
      () -> cluster.withCluster(connection -> connection.async().blpop(blockingSeconds, "key")).get());

  assertInstanceOf(RedisCommandTimeoutException.class, asyncFailure.getCause());

  assertThrows(RedisCommandTimeoutException.class,
      () -> cluster.withCluster(connection -> connection.sync().blpop(blockingSeconds, "key")));
}
/**
 * A single command timeout opens the breaker, the next call is rejected by it, and calls are permitted again
 * after the open-state wait duration has elapsed.
 */
@Test
void testTimeoutCircuitBreaker() throws Exception {
  // because we're using a single key, and blpop involves *Redis* also blocking, the breaker wait duration must
  // be longer than the sum of the remote timeouts
  final Duration breakerWaitDuration = TIMEOUT.multipliedBy(5);

  final CircuitBreakerConfiguration breakerConfiguration = new CircuitBreakerConfiguration();
  breakerConfiguration.setFailureRateThreshold(1);
  breakerConfiguration.setSlidingWindowMinimumNumberOfCalls(1);
  breakerConfiguration.setSlidingWindowSize(1);
  breakerConfiguration.setWaitDurationInOpenState(breakerWaitDuration);

  cluster = buildCluster(breakerConfiguration, ClientResources.builder());

  final String key = "key";
  // BLPOP takes its timeout in (fractional) seconds
  final double blockingSeconds = 2 * TIMEOUT.toMillis() / 1000d;

  // the first call should time out and open the breaker
  assertThrows(RedisCommandTimeoutException.class,
      () -> cluster.withCluster(connection -> connection.sync().blpop(blockingSeconds, key)));

  // the second call gets blocked by the breaker
  final RedisException rejected = assertThrows(RedisException.class,
      () -> cluster.withCluster(connection -> connection.sync().blpop(blockingSeconds, key)));
  assertInstanceOf(CallNotPermittedException.class, rejected.getCause());

  // wait for breaker to be half-open
  Thread.sleep(breakerWaitDuration.toMillis() * 2);

  final Long listLength = cluster.withCluster(connection -> connection.sync().llen(key));
  assertEquals(0, listLength);
}
/**
 * Forces the breaker for one shard open and checks that only keys on that shard are rejected, that a key on
 * another shard remains readable, and that the first shard is readable again once its breaker is closed.
 */
@Test
void testShardUnavailable() {
  final TestBreakerManager breakerManager = new TestBreakerManager();

  final CircuitBreakerConfiguration breakerConfiguration = new CircuitBreakerConfiguration();
  breakerConfiguration.setFailureRateThreshold(1);
  breakerConfiguration.setSlidingWindowMinimumNumberOfCalls(2);
  breakerConfiguration.setSlidingWindowSize(5);

  final ClientResources.Builder resourcesBuilder = CompositeNettyCustomizerClientResourcesBuilder.builder()
      .nettyCustomizer(breakerManager);

  cluster = buildCluster(breakerConfiguration, resourcesBuilder);

  // this test will open the breaker on one shard and check that other shards are still available,
  // so we get two nodes and a slot+key on each to test
  final Pair<RedisClusterNode, RedisClusterNode> shards = cluster.withCluster(connection -> {
    final Partitions partitions = ClusterPartitionParser.parse(connection.sync().clusterNodes());

    assertTrue(partitions.size() >= 2);

    return new Pair<>(partitions.getPartition(0), partitions.getPartition(1));
  });

  final RedisClusterNode brokenShard = shards.first();
  final RedisClusterNode healthyShard = shards.second();

  // hash tags pin each key to a slot served by the chosen shard
  final int brokenSlot = brokenShard.getSlots().getFirst();
  final String brokenKey = "key::{%s}".formatted(RedisClusterUtil.getMinimalHashTag(brokenSlot));

  final int healthySlot = healthyShard.getSlots().getFirst();
  final String healthyKey = "key::{%s}".formatted(RedisClusterUtil.getMinimalHashTag(healthySlot));

  // baseline: both shards accept writes and reads
  cluster.useCluster(connection -> {
    connection.sync().set(brokenKey, "unavailable");
    connection.sync().set(healthyKey, "available");

    assertEquals("unavailable", connection.sync().get(brokenKey));
    assertEquals("available", connection.sync().get(healthyKey));
  });

  // shard is now unavailable
  breakerManager.openBreaker(brokenShard.getUri());

  final RedisException rejected = assertThrows(RedisException.class,
      () -> cluster.useCluster(connection -> connection.sync().get(brokenKey)));
  assertInstanceOf(CallNotPermittedException.class, rejected.getCause());

  // other shard is still available
  assertEquals("available", cluster.withCluster(connection -> connection.sync().get(healthyKey)));

  // shard is available again
  breakerManager.closeBreaker(brokenShard.getUri());

  assertEquals("unavailable", cluster.withCluster(connection -> connection.sync().get(brokenKey)));
}
// Pub/sub counterpart of the shard-unavailable scenario: subscribes to keyspace notifications for one key on
// each of two shards, forces the breaker for one shard open, and checks via per-key message counts that
// notifications keep arriving for the other shard and resume for both once the breaker is closed again.
@Test
void testShardUnavailablePubSub() throws Exception {
final TestBreakerManager testBreakerManager = new TestBreakerManager();
final CircuitBreakerConfiguration circuitBreakerConfig = new CircuitBreakerConfiguration();
circuitBreakerConfig.setFailureRateThreshold(1);
circuitBreakerConfig.setSlidingWindowMinimumNumberOfCalls(2);
circuitBreakerConfig.setSlidingWindowSize(5);
final ClientResources.Builder builder = CompositeNettyCustomizerClientResourcesBuilder.builder()
.nettyCustomizer(testBreakerManager);
cluster = buildCluster(circuitBreakerConfig, builder);
// turn on keyspace notifications on the upstream nodes; see the Redis keyspace-notification docs for the
// meaning of the individual flag characters
cluster.useCluster(
connection -> connection.sync().upstream().commands().configSet("notify-keyspace-events", "K$glz"));
// this test will open the breaker on one shard and check that other shards are still available,
// so we get two nodes and a slot+key on each to test
final Pair<RedisClusterNode, RedisClusterNode> nodePair =
cluster.withCluster(connection -> {
Partitions partitions = ClusterPartitionParser.parse(connection.sync().clusterNodes());
assertTrue(partitions.size() >= 2);
return new Pair<>(partitions.getPartition(0), partitions.getPartition(1));
});
final RedisClusterNode unavailableNode = nodePair.first();
final int unavailableSlot = unavailableNode.getSlots().getFirst();
final String unavailableKey = "key::{%s}".formatted(RedisClusterUtil.getMinimalHashTag(unavailableSlot));
final RedisClusterNode availableNode = nodePair.second();
final int availableSlot = availableNode.getSlots().getFirst();
final String availableKey = "key::{%s}".formatted(RedisClusterUtil.getMinimalHashTag(availableSlot));
final FaultTolerantPubSubConnection<String, String> pubSubConnection = cluster.createPubSubConnection();
// Keyspace notifications are delivered on a different thread, so we use a CountDownLatch to wait for the
// expected number of notifications to arrive
// The latch lives in an AtomicReference because it is replaced before each phase of the test.
final AtomicReference<CountDownLatch> countDownLatchRef = new AtomicReference<>();
// notification counts, keyed by the channel name with the keyspace prefix removed (i.e. the Redis key)
final Map<String, AtomicInteger> channelMessageCounts = new ConcurrentHashMap<>();
final String keyspacePrefix = "__keyspace@0__:";
final RedisClusterPubSubAdapter<String, String> listener = new RedisClusterPubSubAdapter<>() {
@Override
public void message(final RedisClusterNode node, final String channel, final String message) {
// record the notification first, then release whoever is waiting on the current latch
channelMessageCounts.computeIfAbsent(StringUtils.substringAfter(channel, keyspacePrefix),
k -> new AtomicInteger(0))
.incrementAndGet();
countDownLatchRef.get().countDown();
}
};
// phase 1: both shards healthy - expect one notification per key
countDownLatchRef.set(new CountDownLatch(2));
pubSubConnection.usePubSubConnection(c -> {
c.addListener(listener);
// subscribe on the upstream node that owns each slot
c.sync().nodes(node -> node.is(RedisClusterNode.NodeFlag.UPSTREAM) && node.hasSlot(availableSlot))
.commands()
.subscribe(keyspacePrefix + availableKey);
c.sync().nodes(node -> node.is(RedisClusterNode.NodeFlag.UPSTREAM) && node.hasSlot(unavailableSlot))
.commands()
.subscribe(keyspacePrefix + unavailableKey);
});
cluster.useCluster(connection -> {
connection.sync().set(availableKey, "ping1");
connection.sync().set(unavailableKey, "ping1");
});
// no explicit timeout here; a missing notification would block until the test-level timeout, if any, fires
countDownLatchRef.get().await();
assertEquals(1, channelMessageCounts.get(availableKey).get());
assertEquals(1, channelMessageCounts.get(unavailableKey).get());
// shard is now unavailable
testBreakerManager.openBreaker(unavailableNode.getUri());
final RedisException e = assertThrows(RedisException.class, () ->
cluster.useCluster(connection -> connection.sync().set(unavailableKey, "ping2")));
assertInstanceOf(CallNotPermittedException.class, e.getCause());
// the rejected write must not have produced a notification
assertEquals(1, channelMessageCounts.get(unavailableKey).get());
assertEquals(1, channelMessageCounts.get(availableKey).get());
// phase 2: only the healthy shard is written to - expect exactly one more notification
countDownLatchRef.set(new CountDownLatch(1));
// NOTE(review): this SET goes through the pub/sub connection, whereas every other write in this test uses
// cluster.useCluster - confirm this is intentional
pubSubConnection.usePubSubConnection(connection -> connection.sync().set(availableKey, "ping2"));
countDownLatchRef.get().await();
assertEquals(1, channelMessageCounts.get(unavailableKey).get());
assertEquals(2, channelMessageCounts.get(availableKey).get());
// shard is available again
testBreakerManager.closeBreaker(unavailableNode.getUri());
// phase 3: both shards healthy again - expect one more notification per key
countDownLatchRef.set(new CountDownLatch(2));
cluster.useCluster(connection -> {
connection.sync().set(availableKey, "ping3");
connection.sync().set(unavailableKey, "ping3");
});
countDownLatchRef.get().await();
assertEquals(2, channelMessageCounts.get(unavailableKey).get());
assertEquals(3, channelMessageCounts.get(availableKey).get());
}
/**
 * Test-only Netty hook that remembers the {@link LettuceShardCircuitBreaker.ChannelCircuitBreakerHandler} installed in
 * each channel's pipeline, grouped by the channel's remote address, so a test can force every breaker for a given
 * address open or closed.
 * <p>
 * One instance is added to every channel pipeline (hence {@code @Sharable}). Handlers are registered from
 * {@link #connect} while {@link #openBreaker} and {@link #closeBreaker} are called by the test itself, so both the map
 * and the per-address sets are concurrent collections.
 */
@ChannelHandler.Sharable
private static class TestBreakerManager extends ChannelDuplexHandler implements NettyCustomizer {

  private final Map<RedisURI, Set<LettuceShardCircuitBreaker.ChannelCircuitBreakerHandler>> urisToChannelBreakers = new ConcurrentHashMap<>();
  // Only used to give each pipeline entry a unique handler name
  private final AtomicInteger counter = new AtomicInteger();

  /**
   * Installs this manager at the head of the channel's pipeline under a unique name.
   *
   * @param channel the freshly initialized channel
   */
  @Override
  public void afterChannelInitialized(Channel channel) {
    channel.pipeline().addFirst("TestBreakerManager#" + counter.getAndIncrement(), this);
  }

  /**
   * Forwards the connect request, then records the channel's circuit breaker handler (if the pipeline has one) under
   * the channel's remote address.
   */
  @Override
  public void connect(final ChannelHandlerContext ctx, final SocketAddress remoteAddress,
      final SocketAddress localAddress, final ChannelPromise promise) throws Exception {
    super.connect(ctx, remoteAddress, localAddress, promise);

    final LettuceShardCircuitBreaker.ChannelCircuitBreakerHandler channelCircuitBreakerHandler =
        ctx.channel().pipeline().get(LettuceShardCircuitBreaker.ChannelCircuitBreakerHandler.class);

    if (channelCircuitBreakerHandler == null) {
      // pipeline().get(...) returns null when no such handler is installed; the concurrent set below rejects null
      // elements, and there is nothing to open or close for this channel anyway
      return;
    }

    // A concurrent set, because additions here are not synchronized with the iteration in openBreaker/closeBreaker
    urisToChannelBreakers.computeIfAbsent(getRedisURI(ctx.channel()), ignored -> ConcurrentHashMap.newKeySet())
        .add(channelCircuitBreakerHandler);
  }

  /**
   * Builds a {@link RedisURI} from the host string and port of the channel's remote address.
   */
  private static RedisURI getRedisURI(Channel channel) {
    final InetSocketAddress inetAddress = (InetSocketAddress) channel.remoteAddress();
    return RedisURI.create(inetAddress.getHostString(), inetAddress.getPort());
  }

  /**
   * Transitions the breaker of every handler recorded for {@code redisURI} to the open state.
   *
   * @throws NullPointerException if no handler has been recorded for {@code redisURI}
   */
  void openBreaker(final RedisURI redisURI) {
    urisToChannelBreakers.get(redisURI).forEach(handler -> handler.breaker.transitionToOpenState());
  }

  /**
   * Transitions the breaker of every handler recorded for {@code redisURI} to the closed state.
   *
   * @throws NullPointerException if no handler has been recorded for {@code redisURI}
   */
  void closeBreaker(final RedisURI redisURI) {
    urisToChannelBreakers.get(redisURI).forEach(handler -> handler.breaker.transitionToClosedState());
  }
}
/**
 * A {@link NettyCustomizer} that forwards each callback to every customizer registered through
 * {@link #add(NettyCustomizer)}, in registration order.
 */
static class CompositeNettyCustomizer implements NettyCustomizer {

  private final List<NettyCustomizer> members = new ArrayList<>();

  /**
   * Passes the bootstrap to each registered customizer in turn.
   */
  @Override
  public void afterBootstrapInitialized(final Bootstrap bootstrap) {
    for (final NettyCustomizer member : members) {
      member.afterBootstrapInitialized(bootstrap);
    }
  }

  /**
   * Passes the channel to each registered customizer in turn.
   */
  @Override
  public void afterChannelInitialized(final Channel channel) {
    for (final NettyCustomizer member : members) {
      member.afterChannelInitialized(channel);
    }
  }

  /**
   * Appends a customizer to the end of the list of customizers that receive callbacks.
   */
  void add(NettyCustomizer customizer) {
    members.add(customizer);
  }
}
/**
 * A {@link ClientResources.Builder} that wraps the builder returned by {@link ClientResources#builder()} and lets
 * callers register several {@link NettyCustomizer}s instead of one.
 * <p>
 * Every setter except {@link #nettyCustomizer(NettyCustomizer)} is handed straight to the wrapped builder.
 * {@code nettyCustomizer} adds its argument to a {@link CompositeNettyCustomizer}, which is the customizer installed
 * on the wrapped builder at construction time. All setters return this wrapper, so chained calls stay on the wrapper.
 */
static class CompositeNettyCustomizerClientResourcesBuilder implements ClientResources.Builder {

  private final CompositeNettyCustomizer compositeNettyCustomizer;
  private final ClientResources.Builder delegate;

  /**
   * Creates a new wrapper around a fresh {@link ClientResources#builder()}.
   */
  static CompositeNettyCustomizerClientResourcesBuilder builder() {
    return new CompositeNettyCustomizerClientResourcesBuilder();
  }

  private CompositeNettyCustomizerClientResourcesBuilder() {
    final CompositeNettyCustomizer composite = new CompositeNettyCustomizer();
    final ClientResources.Builder base = ClientResources.builder();

    this.compositeNettyCustomizer = composite;
    this.delegate = base.nettyCustomizer(composite);
  }

  /**
   * Runs one configuration step against the wrapped builder and returns this wrapper for chaining.
   */
  private ClientResources.Builder applyToDelegate(final java.util.function.Consumer<ClientResources.Builder> step) {
    step.accept(delegate);
    return this;
  }

  @Override
  public ClientResources.Builder addressResolverGroup(final AddressResolverGroup<?> addressResolverGroup) {
    return applyToDelegate(b -> b.addressResolverGroup(addressResolverGroup));
  }

  @Override
  public ClientResources.Builder commandLatencyRecorder(final CommandLatencyRecorder latencyRecorder) {
    return applyToDelegate(b -> b.commandLatencyRecorder(latencyRecorder));
  }

  @Override
  @Deprecated
  public ClientResources.Builder commandLatencyCollectorOptions(
      final CommandLatencyCollectorOptions commandLatencyCollectorOptions) {
    return applyToDelegate(b -> b.commandLatencyCollectorOptions(commandLatencyCollectorOptions));
  }

  @Override
  public ClientResources.Builder commandLatencyPublisherOptions(
      final EventPublisherOptions commandLatencyPublisherOptions) {
    return applyToDelegate(b -> b.commandLatencyPublisherOptions(commandLatencyPublisherOptions));
  }

  @Override
  public ClientResources.Builder computationThreadPoolSize(final int computationThreadPoolSize) {
    return applyToDelegate(b -> b.computationThreadPoolSize(computationThreadPoolSize));
  }

  @Override
  @Deprecated
  public ClientResources.Builder dnsResolver(final DnsResolver dnsResolver) {
    return applyToDelegate(b -> b.dnsResolver(dnsResolver));
  }

  @Override
  public ClientResources.Builder eventBus(final EventBus eventBus) {
    return applyToDelegate(b -> b.eventBus(eventBus));
  }

  @Override
  public ClientResources.Builder eventExecutorGroup(final EventExecutorGroup eventExecutorGroup) {
    return applyToDelegate(b -> b.eventExecutorGroup(eventExecutorGroup));
  }

  @Override
  public ClientResources.Builder eventLoopGroupProvider(final EventLoopGroupProvider eventLoopGroupProvider) {
    return applyToDelegate(b -> b.eventLoopGroupProvider(eventLoopGroupProvider));
  }

  @Override
  public ClientResources.Builder ioThreadPoolSize(final int ioThreadPoolSize) {
    return applyToDelegate(b -> b.ioThreadPoolSize(ioThreadPoolSize));
  }

  /**
   * Registers an additional customizer with the composite instead of replacing the one on the wrapped builder.
   */
  @Override
  public ClientResources.Builder nettyCustomizer(final NettyCustomizer nettyCustomizer) {
    compositeNettyCustomizer.add(nettyCustomizer);
    return this;
  }

  @Override
  public ClientResources.Builder reconnectDelay(final Delay reconnectDelay) {
    return applyToDelegate(b -> b.reconnectDelay(reconnectDelay));
  }

  @Override
  public ClientResources.Builder reconnectDelay(final Supplier<Delay> reconnectDelay) {
    return applyToDelegate(b -> b.reconnectDelay(reconnectDelay));
  }

  @Override
  public ClientResources.Builder socketAddressResolver(final SocketAddressResolver socketAddressResolver) {
    return applyToDelegate(b -> b.socketAddressResolver(socketAddressResolver));
  }

  @Override
  public ClientResources.Builder threadFactoryProvider(final ThreadFactoryProvider threadFactoryProvider) {
    return applyToDelegate(b -> b.threadFactoryProvider(threadFactoryProvider));
  }

  @Override
  public ClientResources.Builder timer(final Timer timer) {
    return applyToDelegate(b -> b.timer(timer));
  }

  @Override
  public ClientResources.Builder tracing(final Tracing tracing) {
    return applyToDelegate(b -> b.tracing(tracing));
  }

  /**
   * Builds the {@link ClientResources} from the wrapped builder.
   */
  @Override
  public ClientResources build() {
    return delegate.build();
  }
}
}