mirror of
https://github.com/signalapp/Signal-Server
synced 2026-04-21 23:28:04 +01:00
When persisting messages fails due to a full queue in DynamoDB, automatically unlink one device to free up room.
Co-authored-by: Chris Eager <79161849+eager-signal@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
ce60f13320
commit
8f7bae54fe
@@ -16,17 +16,30 @@ import com.codahale.metrics.Timer;
|
||||
import com.google.common.annotations.VisibleForTesting;
|
||||
import io.dropwizard.lifecycle.Managed;
|
||||
import io.micrometer.core.instrument.Counter;
|
||||
import reactor.core.publisher.Flux;
|
||||
import reactor.util.function.Tuple2;
|
||||
import reactor.util.function.Tuples;
|
||||
import software.amazon.awssdk.services.dynamodb.model.ItemCollectionSizeLimitExceededException;
|
||||
|
||||
import java.time.Duration;
|
||||
import java.time.Instant;
|
||||
import java.util.Comparator;
|
||||
import java.util.List;
|
||||
import java.util.Optional;
|
||||
import java.util.UUID;
|
||||
import java.util.concurrent.CompletableFuture;
|
||||
import java.util.concurrent.ExecutorService;
|
||||
import java.util.concurrent.Executors;
|
||||
import java.util.concurrent.TimeUnit;
|
||||
import java.util.function.Supplier;
|
||||
import java.util.stream.Collectors;
|
||||
|
||||
import org.apache.commons.lang3.StringUtils;
|
||||
import org.slf4j.Logger;
|
||||
import org.slf4j.LoggerFactory;
|
||||
import org.whispersystems.textsecuregcm.configuration.dynamic.DynamicConfiguration;
|
||||
import org.whispersystems.textsecuregcm.entities.MessageProtos;
|
||||
import org.whispersystems.textsecuregcm.push.ClientPresenceManager;
|
||||
import org.whispersystems.textsecuregcm.util.Constants;
|
||||
import org.whispersystems.textsecuregcm.util.Util;
|
||||
|
||||
@@ -35,6 +48,8 @@ public class MessagePersister implements Managed {
|
||||
private final MessagesCache messagesCache;
|
||||
private final MessagesManager messagesManager;
|
||||
private final AccountsManager accountsManager;
|
||||
private final ClientPresenceManager clientPresenceManager;
|
||||
private final KeysManager keysManager;
|
||||
|
||||
private final Duration persistDelay;
|
||||
|
||||
@@ -50,27 +65,35 @@ public class MessagePersister implements Managed {
|
||||
private final Counter oversizedQueueCounter = counter(name(MessagePersister.class, "persistQueueOversized"));
|
||||
private final Histogram queueCountHistogram = metricRegistry.histogram(name(MessagePersister.class, "queueCount"));
|
||||
private final Histogram queueSizeHistogram = metricRegistry.histogram(name(MessagePersister.class, "queueSize"));
|
||||
private final ExecutorService executor;
|
||||
|
||||
static final int QUEUE_BATCH_LIMIT = 100;
|
||||
static final int MESSAGE_BATCH_LIMIT = 100;
|
||||
|
||||
private static final long EXCEPTION_PAUSE_MILLIS = Duration.ofSeconds(3).toMillis();
|
||||
public static final Duration UNLINK_TIMEOUT = Duration.ofHours(1);
|
||||
|
||||
private static final int CONSECUTIVE_EMPTY_CACHE_REMOVAL_LIMIT = 3;
|
||||
|
||||
private static final Logger logger = LoggerFactory.getLogger(MessagePersister.class);
|
||||
|
||||
public MessagePersister(final MessagesCache messagesCache, final MessagesManager messagesManager,
|
||||
final AccountsManager accountsManager,
|
||||
final AccountsManager accountsManager, final ClientPresenceManager clientPresenceManager,
|
||||
final KeysManager keysManager,
|
||||
final DynamicConfigurationManager<DynamicConfiguration> dynamicConfigurationManager,
|
||||
final Duration persistDelay,
|
||||
final int dedicatedProcessWorkerThreadCount) {
|
||||
final int dedicatedProcessWorkerThreadCount,
|
||||
final ExecutorService executor
|
||||
) {
|
||||
this.messagesCache = messagesCache;
|
||||
this.messagesManager = messagesManager;
|
||||
this.accountsManager = accountsManager;
|
||||
this.clientPresenceManager = clientPresenceManager;
|
||||
this.keysManager = keysManager;
|
||||
this.persistDelay = persistDelay;
|
||||
this.workerThreads = new Thread[dedicatedProcessWorkerThreadCount];
|
||||
this.dedicatedProcess = true;
|
||||
this.executor = executor;
|
||||
|
||||
for (int i = 0; i < workerThreads.length; i++) {
|
||||
workerThreads[i] = new Thread(() -> {
|
||||
@@ -139,12 +162,14 @@ public class MessagePersister implements Managed {
|
||||
final UUID accountUuid = MessagesCache.getAccountUuidFromQueueName(queue);
|
||||
final byte deviceId = MessagesCache.getDeviceIdFromQueueName(queue);
|
||||
|
||||
final Optional<Account> maybeAccount = accountsManager.getByAccountIdentifier(accountUuid);
|
||||
if (maybeAccount.isEmpty()) {
|
||||
logger.error("No account record found for account {}", accountUuid);
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
persistQueue(accountUuid, deviceId);
|
||||
persistQueue(maybeAccount.get(), deviceId);
|
||||
} catch (final Exception e) {
|
||||
if (e instanceof ItemCollectionSizeLimitExceededException) {
|
||||
oversizedQueueCounter.increment();
|
||||
}
|
||||
persistQueueExceptionMeter.mark();
|
||||
logger.warn("Failed to persist queue {}::{}; will schedule for retry", accountUuid, deviceId, e);
|
||||
|
||||
@@ -161,14 +186,8 @@ public class MessagePersister implements Managed {
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
void persistQueue(final UUID accountUuid, final byte deviceId) throws MessagePersistenceException {
|
||||
final Optional<Account> maybeAccount = accountsManager.getByAccountIdentifier(accountUuid);
|
||||
|
||||
if (maybeAccount.isEmpty()) {
|
||||
logger.error("No account record found for account {}", accountUuid);
|
||||
return;
|
||||
}
|
||||
|
||||
void persistQueue(final Account account, final byte deviceId) throws MessagePersistenceException {
|
||||
final UUID accountUuid = account.getUuid();
|
||||
try (final Timer.Context ignored = persistQueueTimer.time()) {
|
||||
messagesCache.lockQueueForPersistence(accountUuid, deviceId);
|
||||
|
||||
@@ -197,9 +216,73 @@ public class MessagePersister implements Managed {
|
||||
} while (!messages.isEmpty());
|
||||
|
||||
queueSizeHistogram.update(messageCount);
|
||||
} catch (ItemCollectionSizeLimitExceededException e) {
|
||||
oversizedQueueCounter.increment();
|
||||
unlinkLeastActiveDevice(account, deviceId); // this will either do a deferred reschedule for retry or throw
|
||||
} finally {
|
||||
messagesCache.unlockQueueForPersistence(accountUuid, deviceId);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@VisibleForTesting
|
||||
void unlinkLeastActiveDevice(final Account account, byte destinationDeviceId) throws MessagePersistenceException {
|
||||
if (!messagesCache.lockAccountForMessagePersisterCleanup(account.getUuid())) {
|
||||
// don't try to unlink an account multiple times in parallel; just fail this persist attempt
|
||||
// and we'll try again, hopefully deletions in progress will have made room
|
||||
throw new MessagePersistenceException("account has a full queue and another device-unlinking attempt is in progress");
|
||||
}
|
||||
|
||||
// Selection logic:
|
||||
|
||||
// The primary device is never unlinked
|
||||
// The least-recently-seen inactive device gets unlinked
|
||||
// If there are none, the device with the oldest queued message (not necessarily the
|
||||
// least-recently-seen; a device could be connecting frequently but have some problem fetching
|
||||
// its messages) is unlinked
|
||||
final Device deviceToDelete = account.getDevices()
|
||||
.stream()
|
||||
.filter(d -> !d.isPrimary() && !d.isEnabled())
|
||||
.min(Comparator.comparing(Device::getLastSeen))
|
||||
.or(() ->
|
||||
Flux.fromIterable(account.getDevices())
|
||||
.filter(d -> !d.isPrimary())
|
||||
.flatMap(d ->
|
||||
messagesManager
|
||||
.getEarliestUndeliveredTimestampForDevice(account.getUuid(), d.getId())
|
||||
.map(t -> Tuples.of(d, t)))
|
||||
.sort(Comparator.comparing(Tuple2::getT2))
|
||||
.map(Tuple2::getT1)
|
||||
.next()
|
||||
.blockOptional())
|
||||
.orElseThrow(() -> new MessagePersistenceException("account has a full queue and no unlinkable devices"));
|
||||
|
||||
logger.warn("Failed to persist queue {}::{} due to overfull queue; will unlink device {}{}",
|
||||
account.getUuid(), destinationDeviceId, deviceToDelete.getId(), deviceToDelete.getId() == destinationDeviceId ? "" : " and schedule for retry");
|
||||
executor.execute(
|
||||
() -> {
|
||||
try {
|
||||
// Lock the device's auth token first to prevent it from connecting while we're
|
||||
// destroying its queue, but we don't want to completely remove the device from the
|
||||
// account until we're sure the messages have been cleared because otherwise we won't
|
||||
// be able to find it later to try again, in the event of a failure or timeout
|
||||
final Account updatedAccount = accountsManager.update(
|
||||
account,
|
||||
a -> a.getDevice(deviceToDelete.getId()).ifPresent(Device::lockAuthTokenHash));
|
||||
clientPresenceManager.disconnectPresence(account.getUuid(), deviceToDelete.getId());
|
||||
CompletableFuture
|
||||
.allOf(
|
||||
keysManager.delete(account.getUuid(), deviceToDelete.getId()),
|
||||
messagesManager.clear(account.getUuid(), deviceToDelete.getId()))
|
||||
.orTimeout((UNLINK_TIMEOUT.toSeconds() * 3) / 4, TimeUnit.SECONDS)
|
||||
.join();
|
||||
accountsManager.update(updatedAccount, a -> a.removeDevice(deviceToDelete.getId()));
|
||||
} finally {
|
||||
messagesCache.unlockAccountForMessagePersisterCleanup(account.getUuid());
|
||||
if (deviceToDelete.getId() != destinationDeviceId) { // no point in persisting a device we just purged
|
||||
messagesCache.addQueueToPersist(account.getUuid(), destinationDeviceId);
|
||||
}
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
@@ -12,6 +12,7 @@ import com.google.protobuf.InvalidProtocolBufferException;
|
||||
import io.dropwizard.lifecycle.Managed;
|
||||
import io.lettuce.core.ScoredValue;
|
||||
import io.lettuce.core.ScriptOutputType;
|
||||
import io.lettuce.core.SetArgs;
|
||||
import io.lettuce.core.ZAddArgs;
|
||||
import io.lettuce.core.cluster.SlotHash;
|
||||
import io.lettuce.core.cluster.models.partitions.RedisClusterNode;
|
||||
@@ -382,6 +383,20 @@ public class MessagesCache extends RedisClusterPubSubAdapter<String, String> imp
|
||||
connection -> connection.sync().del(getPersistInProgressKey(accountUuid, deviceId)));
|
||||
}
|
||||
|
||||
boolean lockAccountForMessagePersisterCleanup(final UUID accountUuid) {
|
||||
return readDeleteCluster.withBinaryCluster(
|
||||
connection -> "OK".equals(
|
||||
connection.sync().set(
|
||||
getUnlinkInProgressKey(accountUuid),
|
||||
LOCK_VALUE,
|
||||
new SetArgs().ex(MessagePersister.UNLINK_TIMEOUT.toSeconds()).nx())));
|
||||
}
|
||||
|
||||
void unlockAccountForMessagePersisterCleanup(final UUID accountUuid) {
|
||||
readDeleteCluster.useBinaryCluster(
|
||||
connection -> connection.sync().del(getUnlinkInProgressKey(accountUuid)));
|
||||
}
|
||||
|
||||
public void addMessageAvailabilityListener(final UUID destinationUuid, final byte deviceId,
|
||||
final MessageAvailabilityListener listener) {
|
||||
final String queueName = getQueueName(destinationUuid, deviceId);
|
||||
@@ -531,6 +546,10 @@ public class MessagesCache extends RedisClusterPubSubAdapter<String, String> imp
|
||||
return ("user_queue_persisting::{" + accountUuid + "::" + deviceId + "}").getBytes(StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
private static byte[] getUnlinkInProgressKey(final UUID accountUuid) {
|
||||
return ("user_account_unlinking::{" + accountUuid + "}").getBytes(StandardCharsets.UTF_8);
|
||||
}
|
||||
|
||||
static UUID getAccountUuidFromQueueName(final String queueName) {
|
||||
final int startOfHashTag = queueName.indexOf('{');
|
||||
|
||||
|
||||
@@ -102,6 +102,10 @@ public class MessagesManager {
|
||||
.tap(Micrometer.metrics(Metrics.globalRegistry));
|
||||
}
|
||||
|
||||
public Mono<Long> getEarliestUndeliveredTimestampForDevice(UUID destinationUuid, byte destinationDevice) {
|
||||
return Mono.from(messagesDynamoDb.load(destinationUuid, destinationDevice, 1)).map(Envelope::getServerTimestamp);
|
||||
}
|
||||
|
||||
public CompletableFuture<Void> clear(UUID destinationUuid) {
|
||||
return CompletableFuture.allOf(
|
||||
messagesCache.clear(destinationUuid),
|
||||
|
||||
@@ -62,9 +62,14 @@ public class MessagePersisterServiceCommand extends ServerCommand<WhisperServerC
|
||||
|
||||
final MessagePersister messagePersister = new MessagePersister(deps.messagesCache(), deps.messagesManager(),
|
||||
deps.accountsManager(),
|
||||
deps.clientPresenceManager(),
|
||||
deps.keysManager(),
|
||||
dynamicConfigurationManager,
|
||||
Duration.ofMinutes(configuration.getMessageCacheConfiguration().getPersistDelayMinutes()),
|
||||
namespace.getInt(WORKER_COUNT));
|
||||
namespace.getInt(WORKER_COUNT),
|
||||
environment.lifecycle().executorService("messagePersisterUnlinkDeviceExecutor-%d")
|
||||
.maxThreads(2)
|
||||
.build());
|
||||
|
||||
environment.lifecycle().manage(deps.messagesCache());
|
||||
environment.lifecycle().manage(messagePersister);
|
||||
|
||||
Reference in New Issue
Block a user