From 9180917b7c33e8b96fd45424514b4b72f267d84a Mon Sep 17 00:00:00 2001 From: Cody Henthorne Date: Tue, 15 Jul 2025 12:14:07 -0400 Subject: [PATCH] Adjust domain scrubbing. --- .../org/signal/core/util/logging/Scrubber.kt | 30 ++++++++++++++----- .../signal/core/util/logging/ScrubberTest.kt | 12 ++++++++ 2 files changed, 34 insertions(+), 8 deletions(-) diff --git a/core-util-jvm/src/main/java/org/signal/core/util/logging/Scrubber.kt b/core-util-jvm/src/main/java/org/signal/core/util/logging/Scrubber.kt index 0cb348c8b6..294e325781 100644 --- a/core-util-jvm/src/main/java/org/signal/core/util/logging/Scrubber.kt +++ b/core-util-jvm/src/main/java/org/signal/core/util/logging/Scrubber.kt @@ -7,6 +7,7 @@ package org.signal.core.util.logging import org.signal.core.util.CryptoUtil import org.signal.core.util.Hex +import org.signal.core.util.isNotNullOrBlank import java.util.regex.Matcher import java.util.regex.Pattern @@ -59,9 +60,9 @@ object Scrubber { private val IPV6_PATTERN = Pattern.compile("([0-9a-fA-F]{0,4}:){3,7}([0-9a-fA-F]){0,4}") private const val IPV6_CENSOR = "...ipv6..." - /** The domain name except for TLD will be censored. */ - private val DOMAIN_PATTERN = Pattern.compile("([a-z0-9]+\\.)+([a-z0-9\\-]*[a-z\\-][a-z0-9\\-]*)", Pattern.CASE_INSENSITIVE) - private const val DOMAIN_CENSOR = "***." + /** The domain name and path except for TLD will be censored. */ + private val URL_PATTERN = Pattern.compile("([a-z0-9]+\\.)+([a-z0-9\\-]*[a-z\\-][a-z0-9\\-]*)(/[/a-z0-9\\-_.~:@?&=#%+\\[\\]!$()*,;]*)?", Pattern.CASE_INSENSITIVE) + private const val URL_CENSOR = "***" private val TOP_100_TLDS: Set = setOf( "com", "net", "org", "jp", "de", "uk", "fr", "br", "it", "ru", "es", "me", "gov", "pl", "ca", "au", "cn", "co", "in", "nl", "edu", "info", "eu", "ch", "id", "at", "kr", "cz", "mx", "be", "tv", "se", "tr", "tw", "al", "ua", "ir", "vn", @@ -95,7 +96,7 @@ object Scrubber { .scrubGroupsV2() .scrubPnis() .scrubUuids() - .scrubDomains() + .scrubUrls() .scrubIpv4() .scrubIpv6() .scrubCallLinkKeys() @@ -177,13 +178,26 @@ object Scrubber { } } - private fun CharSequence.scrubDomains(): CharSequence { - return scrub(this, DOMAIN_PATTERN) { matcher, output -> + private fun CharSequence.scrubUrls(): CharSequence { + return scrub(this, URL_PATTERN) { matcher, output -> val match: String = matcher.group(0)!! - if (matcher.groupCount() == 2 && TOP_100_TLDS.contains(matcher.group(2)!!.lowercase()) && !match.endsWith("signal.org") && !match.endsWith("debuglogs.org")) { + + if ( + (matcher.groupCount() == 2 || matcher.groupCount() == 3) && + TOP_100_TLDS.contains(matcher.group(2)!!.lowercase()) && + !(matcher.group(1).endsWith("signal.") && matcher.group(2) == "org" && !match.contains("cdn")) && + !(matcher.group(1).endsWith("debuglogs.") && matcher.group(2) == "org") + ) { output - .append(DOMAIN_CENSOR) + .append(URL_CENSOR) + .append(".") .append(matcher.group(2)) + .run { + if (matcher.groupCount() == 3 && matcher.group(3).isNotNullOrBlank()) { + append("/") + append(URL_CENSOR) + } + } } else { output.append(match) } diff --git a/core-util-jvm/src/test/java/org/signal/core/util/logging/ScrubberTest.kt b/core-util-jvm/src/test/java/org/signal/core/util/logging/ScrubberTest.kt index 71aeffcedd..50c94bdc03 100644 --- a/core-util-jvm/src/test/java/org/signal/core/util/logging/ScrubberTest.kt +++ b/core-util-jvm/src/test/java/org/signal/core/util/logging/ScrubberTest.kt @@ -250,6 +250,18 @@ class ScrubberTest(private val input: String, private val expected: String) { "Recipient::123", "Recipient::123" ), + arrayOf( + "url with text before https://example.com/v1/endpoint;asdf123%20$[]?asdf&asdf#asdf and stuff afterwards", + "url with text before https://***.com/*** and stuff afterwards" + ), + arrayOf( + "https://signal.org/v1/endpoint", + "https://signal.org/v1/endpoint" + ), + arrayOf( + "https://cdn3.signal.org/v1/endpoint", + "https://***.org/***" + ), arrayOf( "https://debuglogs.org/android/7.47.2/2b5ccf4e3e58e44f12b3c92cfd5b526a2432f1dd0f81c8f89dededb176f1122d", "https://debuglogs.org/android/7.47.2/2b5ccf4e3e58e44f12b3c92cfd5b526a2432f1dd0f81c8f89dededb176f1122d"