From 791e95c64506264160a3f8a91204be69c91c9201 Mon Sep 17 00:00:00 2001 From: Greyson Parrelli Date: Thu, 27 Feb 2025 12:17:24 -0500 Subject: [PATCH] Convert StringUtil to kotlin. --- .../java/org/signal/core/util/StringUtil.java | 378 ----------------- .../java/org/signal/core/util/StringUtil.kt | 398 ++++++++++++++++++ 2 files changed, 398 insertions(+), 378 deletions(-) delete mode 100644 core-util/src/main/java/org/signal/core/util/StringUtil.java create mode 100644 core-util/src/main/java/org/signal/core/util/StringUtil.kt diff --git a/core-util/src/main/java/org/signal/core/util/StringUtil.java b/core-util/src/main/java/org/signal/core/util/StringUtil.java deleted file mode 100644 index 4afca60c0a..0000000000 --- a/core-util/src/main/java/org/signal/core/util/StringUtil.java +++ /dev/null @@ -1,378 +0,0 @@ -package org.signal.core.util; - -import android.text.SpannableStringBuilder; -import android.text.TextUtils; - -import androidx.annotation.NonNull; -import androidx.annotation.Nullable; -import androidx.core.text.BidiFormatter; - -import java.io.ByteArrayOutputStream; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.util.List; -import java.util.Set; -import java.util.regex.Pattern; - -public final class StringUtil { - - private static final Set WHITESPACE = SetUtil.newHashSet('\u200E', // left-to-right mark - '\u200F', // right-to-left mark - '\u2007', // figure space - '\u200B', // zero-width space - '\u2800'); // braille blank - - private static final Pattern ALL_ASCII_PATTERN = Pattern.compile("^[\\x00-\\x7F]*$"); - - private static final class Bidi { - /** Override text direction */ - private static final Set OVERRIDES = SetUtil.newHashSet("\u202a".codePointAt(0), /* LRE */ - "\u202b".codePointAt(0), /* RLE */ - "\u202d".codePointAt(0), /* LRO */ - "\u202e".codePointAt(0) /* RLO */); - - /** Set direction and isolate surrounding text */ - private static final Set ISOLATES = SetUtil.newHashSet("\u2066".codePointAt(0), /* LRI */ - "\u2067".codePointAt(0), /* RLI */ - "\u2068".codePointAt(0) /* FSI */); - /** Closes things in {@link #OVERRIDES} */ - private static final int PDF = "\u202c".codePointAt(0); - - /** Closes things in {@link #ISOLATES} */ - private static final int PDI = "\u2069".codePointAt(0); - - /** Auto-detecting isolate */ - private static final int FSI = "\u2068".codePointAt(0); - } - - private StringUtil() { - } - - /** - * Trims a name string to fit into the byte length requirement. - *

- * This method treats a surrogate pair and a grapheme cluster a single character - * See examples in tests defined in StringUtilText_trimToFit. - */ - public static @NonNull String trimToFit(@Nullable String name, int maxByteLength) { - if (TextUtils.isEmpty(name)) { - return ""; - } - - if (name.getBytes(StandardCharsets.UTF_8).length <= maxByteLength) { - return name; - } - - try (ByteArrayOutputStream stream = new ByteArrayOutputStream()) { - for (String graphemeCharacter : new CharacterIterable(name)) { - byte[] bytes = graphemeCharacter.getBytes(StandardCharsets.UTF_8); - - if (stream.size() + bytes.length <= maxByteLength) { - stream.write(bytes); - } else { - break; - } - } - return stream.toString(); - } catch (IOException e) { - throw new AssertionError(e); - } - } - - /** - * @return A charsequence with no leading or trailing whitespace. Only creates a new charsequence - * if it has to. - */ - public static @NonNull CharSequence trim(@NonNull CharSequence charSequence) { - if (charSequence.length() == 0) { - return charSequence; - } - - int start = 0; - int end = charSequence.length() - 1; - - while (start < charSequence.length() && Character.isWhitespace(charSequence.charAt(start))) { - start++; - } - - while (end >= 0 && end > start && Character.isWhitespace(charSequence.charAt(end))) { - end--; - } - - if (start > 0 || end < charSequence.length() - 1) { - return charSequence.subSequence(start, end + 1); - } else { - return charSequence; - } - } - - /** - * @return True if the string is empty, or if it contains nothing but whitespace characters. - * Accounts for various unicode whitespace characters. - */ - public static boolean isVisuallyEmpty(@Nullable String value) { - if (value == null || value.length() == 0) { - return true; - } - - return indexOfFirstNonEmptyChar(value) == -1; - } - - /** - * @return String without any leading or trailing whitespace. - * Accounts for various unicode whitespace characters. - */ - public static String trimToVisualBounds(@NonNull String value) { - int start = indexOfFirstNonEmptyChar(value); - - if (start == -1) { - return ""; - } - - int end = indexOfLastNonEmptyChar(value); - - return value.substring(start, end + 1); - } - - private static int indexOfFirstNonEmptyChar(@NonNull String value) { - int length = value.length(); - - for (int i = 0; i < length; i++) { - if (!isVisuallyEmpty(value.charAt(i))) { - return i; - } - } - - return -1; - } - - private static int indexOfLastNonEmptyChar(@NonNull String value) { - for (int i = value.length() - 1; i >= 0; i--) { - if (!isVisuallyEmpty(value.charAt(i))) { - return i; - } - } - return -1; - } - - /** - * @return True if the character is invisible or whitespace. Accounts for various unicode - * whitespace characters. - */ - public static boolean isVisuallyEmpty(char c) { - return Character.isWhitespace(c) || WHITESPACE.contains(c); - } - - /** - * @return A string representation of the provided unicode code point. - */ - public static @NonNull String codePointToString(int codePoint) { - return new String(Character.toChars(codePoint)); - } - - /** - * @return True if the provided text contains a mix of LTR and RTL characters, otherwise false. - */ - public static boolean hasMixedTextDirection(@Nullable CharSequence text) { - if (text == null) { - return false; - } - - Boolean isLtr = null; - - for (int i = 0, len = Character.codePointCount(text, 0, text.length()); i < len; i++) { - int codePoint = Character.codePointAt(text, i); - byte direction = Character.getDirectionality(codePoint); - boolean isLetter = Character.isLetter(codePoint); - - if (isLtr != null && isLtr && direction != Character.DIRECTIONALITY_LEFT_TO_RIGHT && isLetter) { - return true; - } else if (isLtr != null && !isLtr && direction != Character.DIRECTIONALITY_RIGHT_TO_LEFT && isLetter) { - return true; - } else if (isLetter) { - isLtr = direction == Character.DIRECTIONALITY_LEFT_TO_RIGHT; - } - } - - return false; - } - - /** - * @return True if the text is null or has a length of 0, otherwise false. - */ - public static boolean isEmpty(@Nullable String text) { - return text == null || text.length() == 0; - } - - /** - * Isolates bi-directional text from influencing surrounding text. You should use this whenever - * you're injecting user-generated text into a larger string. - * - * You'd think we'd be able to trust {@link BidiFormatter}, but unfortunately it just misses some - * corner cases, so here we are. - * - * The general idea is just to balance out the opening and closing codepoints, and then wrap the - * whole thing in FSI/PDI to isolate it. - * - * For more details, see: - * https://www.w3.org/International/questions/qa-bidi-unicode-controls - */ - public static @NonNull String isolateBidi(@Nullable String text) { - if (text == null) { - return ""; - } - - if (isEmpty(text)) { - return text; - } - - if (ALL_ASCII_PATTERN.matcher(text).matches()) { - return text; - } - - int overrideCount = 0; - int overrideCloseCount = 0; - int isolateCount = 0; - int isolateCloseCount = 0; - - for (int i = 0, len = text.codePointCount(0, text.length()); i < len; i++) { - int codePoint = text.codePointAt(i); - - if (Bidi.OVERRIDES.contains(codePoint)) { - overrideCount++; - } else if (codePoint == Bidi.PDF) { - overrideCloseCount++; - } else if (Bidi.ISOLATES.contains(codePoint)) { - isolateCount++; - } else if (codePoint == Bidi.PDI) { - isolateCloseCount++; - } - } - - StringBuilder suffix = new StringBuilder(); - - while (overrideCount > overrideCloseCount) { - suffix.appendCodePoint(Bidi.PDF); - overrideCloseCount++; - } - - while (isolateCount > isolateCloseCount) { - suffix.appendCodePoint(Bidi.FSI); - isolateCloseCount++; - } - - StringBuilder out = new StringBuilder(); - - return out.appendCodePoint(Bidi.FSI) - .append(text) - .append(suffix) - .appendCodePoint(Bidi.PDI) - .toString(); - } - - public static @Nullable String stripBidiProtection(@Nullable String text) { - if (text == null) return null; - - return text.replaceAll("[\\u2068\\u2069\\u202c]", ""); - } - - public static @NonNull String stripBidiIndicator(@NonNull String text) { - return text.replace("\u200F", ""); - } - - /** - * Trims a {@link CharSequence} of starting and trailing whitespace. Behavior matches - * {@link String#trim()} to preserve expectations around results. - */ - public static CharSequence trimSequence(CharSequence text) { - int length = text.length(); - int startIndex = 0; - - while ((startIndex < length) && (text.charAt(startIndex) <= ' ')) { - startIndex++; - } - while ((startIndex < length) && (text.charAt(length - 1) <= ' ')) { - length--; - } - return (startIndex > 0 || length < text.length()) ? text.subSequence(startIndex, length) : text; - } - - /** - * If the {@param text} exceeds the {@param maxChars} it is trimmed in the middle so that the result is exactly {@param maxChars} long including an added - * ellipsis character. - *

- * Otherwise the string is returned untouched. - *

- * When {@param maxChars} is even, one more character is kept from the end of the string than the start. - */ - public static @Nullable CharSequence abbreviateInMiddle(@Nullable CharSequence text, int maxChars) { - if (text == null || text.length() <= maxChars) { - return text; - } - - int start = (maxChars - 1) / 2; - int end = (maxChars - 1) - start; - return text.subSequence(0, start) + "…" + text.subSequence(text.length() - end, text.length()); - } - - /** - * @return The number of graphemes in the provided string. - */ - public static int getGraphemeCount(@NonNull CharSequence text) { - BreakIteratorCompat iterator = BreakIteratorCompat.getInstance(); - iterator.setText(text); - return iterator.countBreaks(); - } - - public static String forceLtr(@NonNull CharSequence text) { - return "\u202a" + text + "\u202c"; - } - - public static @NonNull CharSequence replace(@NonNull CharSequence text, char toReplace, String replacement) { - SpannableStringBuilder updatedText = null; - - for (int i = text.length() - 1; i >= 0; i--) { - if (text.charAt(i) == toReplace) { - if (updatedText == null) { - updatedText = SpannableStringBuilder.valueOf(text); - } - updatedText.replace(i, i + 1, replacement); - } - } - - if (updatedText != null) { - return updatedText; - } else { - return text; - } - } - - public static boolean startsWith(@NonNull CharSequence text, @NonNull CharSequence substring) { - if (substring.length() > text.length()) { - return false; - } - - for (int i = 0; i < substring.length(); i++) { - if (text.charAt(i) != substring.charAt(i)) { - return false; - } - } - - return true; - } - - public static boolean endsWith(@NonNull CharSequence text, @NonNull CharSequence substring) { - if (substring.length() > text.length()) { - return false; - } - - int textIndex = text.length() - 1; - for (int substringIndex = substring.length() - 1; substringIndex >= 0; substringIndex--, textIndex--) { - if (text.charAt(textIndex) != substring.charAt(substringIndex)) { - return false; - } - } - - return true; - } -} diff --git a/core-util/src/main/java/org/signal/core/util/StringUtil.kt b/core-util/src/main/java/org/signal/core/util/StringUtil.kt new file mode 100644 index 0000000000..143a9a77f2 --- /dev/null +++ b/core-util/src/main/java/org/signal/core/util/StringUtil.kt @@ -0,0 +1,398 @@ +package org.signal.core.util + +import android.text.SpannableStringBuilder +import java.io.ByteArrayOutputStream +import java.io.IOException +import java.nio.charset.StandardCharsets +import java.util.regex.Pattern + +object StringUtil { + private val WHITESPACE: Set = setOf( + '\u200E', // left-to-right mark + '\u200F', // right-to-left mark + '\u2007', // figure space + '\u200B', // zero-width space + '\u2800' // braille blank + ) + + private val ALL_ASCII_PATTERN: Pattern = Pattern.compile("^[\\x00-\\x7F]*$") + + private object Bidi { + /** Override text direction */ + val OVERRIDES: Set = SetUtil.newHashSet( + "\u202a".codePointAt(0), /* LRE */ + "\u202b".codePointAt(0), /* RLE */ + "\u202d".codePointAt(0), /* LRO */ + "\u202e".codePointAt(0) /* RLO */ + ) + + /** Set direction and isolate surrounding text */ + val ISOLATES: Set = SetUtil.newHashSet( + "\u2066".codePointAt(0), /* LRI */ + "\u2067".codePointAt(0), /* RLI */ + "\u2068".codePointAt(0) /* FSI */ + ) + + /** Closes things in [.OVERRIDES] */ + val PDF: Int = "\u202c".codePointAt(0) + + /** Closes things in [.ISOLATES] */ + val PDI: Int = "\u2069".codePointAt(0) + + /** Auto-detecting isolate */ + val FSI: Int = "\u2068".codePointAt(0) + } + + /** + * Trims a name string to fit into the byte length requirement. + * + * + * This method treats a surrogate pair and a grapheme cluster a single character + * See examples in tests defined in StringUtilText_trimToFit. + */ + @JvmStatic + fun trimToFit(name: String?, maxByteLength: Int): String { + if (name.isNullOrEmpty()) { + return "" + } + + if (name.toByteArray(StandardCharsets.UTF_8).size <= maxByteLength) { + return name + } + + try { + ByteArrayOutputStream().use { stream -> + for (graphemeCharacter in CharacterIterable(name)) { + val bytes = graphemeCharacter.toByteArray(StandardCharsets.UTF_8) + + if (stream.size() + bytes.size <= maxByteLength) { + stream.write(bytes) + } else { + break + } + } + return stream.toString() + } + } catch (e: IOException) { + throw AssertionError(e) + } + } + + /** + * @return A charsequence with no leading or trailing whitespace. Only creates a new charsequence + * if it has to. + */ + @JvmStatic + fun trim(charSequence: CharSequence): CharSequence { + if (charSequence.isEmpty()) { + return charSequence + } + + var start = 0 + var end = charSequence.length - 1 + + while (start < charSequence.length && Character.isWhitespace(charSequence[start])) { + start++ + } + + while (end >= 0 && end > start && Character.isWhitespace(charSequence[end])) { + end-- + } + + return if (start > 0 || end < charSequence.length - 1) { + charSequence.subSequence(start, end + 1) + } else { + charSequence + } + } + + /** + * @return True if the string is empty, or if it contains nothing but whitespace characters. + * Accounts for various unicode whitespace characters. + */ + @JvmStatic + fun isVisuallyEmpty(value: String?): Boolean { + if (value.isNullOrEmpty()) { + return true + } + + return indexOfFirstNonEmptyChar(value) == -1 + } + + /** + * @return String without any leading or trailing whitespace. + * Accounts for various unicode whitespace characters. + */ + @JvmStatic + fun trimToVisualBounds(value: String): String { + val start = indexOfFirstNonEmptyChar(value) + + if (start == -1) { + return "" + } + + val end = indexOfLastNonEmptyChar(value) + + return value.substring(start, end + 1) + } + + private fun indexOfFirstNonEmptyChar(value: String): Int { + val length = value.length + + for (i in 0 until length) { + if (!isVisuallyEmpty(value[i])) { + return i + } + } + + return -1 + } + + private fun indexOfLastNonEmptyChar(value: String): Int { + for (i in value.length - 1 downTo 0) { + if (!isVisuallyEmpty(value[i])) { + return i + } + } + return -1 + } + + /** + * @return True if the character is invisible or whitespace. Accounts for various unicode + * whitespace characters. + */ + fun isVisuallyEmpty(c: Char): Boolean { + return Character.isWhitespace(c) || WHITESPACE.contains(c) + } + + /** + * @return A string representation of the provided unicode code point. + */ + fun codePointToString(codePoint: Int): String { + return String(Character.toChars(codePoint)) + } + + /** + * @return True if the provided text contains a mix of LTR and RTL characters, otherwise false. + */ + @JvmStatic + fun hasMixedTextDirection(text: CharSequence?): Boolean { + if (text == null) { + return false + } + + var isLtr: Boolean? = null + + var i = 0 + val len = Character.codePointCount(text, 0, text.length) + while (i < len) { + val codePoint = Character.codePointAt(text, i) + val direction = Character.getDirectionality(codePoint) + val isLetter = Character.isLetter(codePoint) + + if (isLtr != null && isLtr && direction != Character.DIRECTIONALITY_LEFT_TO_RIGHT && isLetter) { + return true + } else if (isLtr != null && !isLtr && direction != Character.DIRECTIONALITY_RIGHT_TO_LEFT && isLetter) { + return true + } else if (isLetter) { + isLtr = direction == Character.DIRECTIONALITY_LEFT_TO_RIGHT + } + i++ + } + + return false + } + + /** + * @return True if the text is null or has a length of 0, otherwise false. + */ + @JvmStatic + fun isEmpty(text: String?): Boolean { + return text.isNullOrEmpty() + } + + /** + * Isolates bi-directional text from influencing surrounding text. You should use this whenever + * you're injecting user-generated text into a larger string. + * + * You'd think we'd be able to trust BidiFormatter, but unfortunately it just misses some + * corner cases, so here we are. + * + * The general idea is just to balance out the opening and closing codepoints, and then wrap the + * whole thing in FSI/PDI to isolate it. + * + * For more details, see: + * https://www.w3.org/International/questions/qa-bidi-unicode-controls + */ + @JvmStatic + fun isolateBidi(text: String?): String { + if (text == null) { + return "" + } + + if (isEmpty(text)) { + return text + } + + if (ALL_ASCII_PATTERN.matcher(text).matches()) { + return text + } + + var overrideCount = 0 + var overrideCloseCount = 0 + var isolateCount = 0 + var isolateCloseCount = 0 + + var i = 0 + val len = text.codePointCount(0, text.length) + while (i < len) { + val codePoint = text.codePointAt(i) + + if (Bidi.OVERRIDES.contains(codePoint)) { + overrideCount++ + } else if (codePoint == Bidi.PDF) { + overrideCloseCount++ + } else if (Bidi.ISOLATES.contains(codePoint)) { + isolateCount++ + } else if (codePoint == Bidi.PDI) { + isolateCloseCount++ + } + i++ + } + + val suffix = StringBuilder() + + while (overrideCount > overrideCloseCount) { + suffix.appendCodePoint(Bidi.PDF) + overrideCloseCount++ + } + + while (isolateCount > isolateCloseCount) { + suffix.appendCodePoint(Bidi.FSI) + isolateCloseCount++ + } + + val out = StringBuilder() + + return out.appendCodePoint(Bidi.FSI) + .append(text) + .append(suffix) + .appendCodePoint(Bidi.PDI) + .toString() + } + + @JvmStatic + fun stripBidiProtection(text: String?): String? { + if (text == null) return null + + return text.replace("[\\u2068\\u2069\\u202c]".toRegex(), "") + } + + fun stripBidiIndicator(text: String): String { + return text.replace("\u200F", "") + } + + /** + * Trims a [CharSequence] of starting and trailing whitespace. Behavior matches + * [String.trim] to preserve expectations around results. + */ + @JvmStatic + fun trimSequence(text: CharSequence): CharSequence { + var length = text.length + var startIndex = 0 + + while ((startIndex < length) && (text[startIndex] <= ' ')) { + startIndex++ + } + while ((startIndex < length) && (text[length - 1] <= ' ')) { + length-- + } + return if ((startIndex > 0 || length < text.length)) text.subSequence(startIndex, length) else text + } + + /** + * If the {@param text} exceeds the {@param maxChars} it is trimmed in the middle so that the result is exactly {@param maxChars} long including an added + * ellipsis character. + * + * + * Otherwise the string is returned untouched. + * + * + * When {@param maxChars} is even, one more character is kept from the end of the string than the start. + */ + @JvmStatic + fun abbreviateInMiddle(text: CharSequence?, maxChars: Int): CharSequence? { + if (text == null || text.length <= maxChars) { + return text + } + + val start = (maxChars - 1) / 2 + val end = (maxChars - 1) - start + return text.subSequence(0, start).toString() + "…" + text.subSequence(text.length - end, text.length) + } + + /** + * @return The number of graphemes in the provided string. + */ + @JvmStatic + fun getGraphemeCount(text: CharSequence): Int { + val iterator = BreakIteratorCompat.getInstance() + iterator.setText(text) + return iterator.countBreaks() + } + + @JvmStatic + fun forceLtr(text: CharSequence): String { + return "\u202a" + text + "\u202c" + } + + @JvmStatic + fun replace(text: CharSequence, toReplace: Char, replacement: String?): CharSequence { + var updatedText: SpannableStringBuilder? = null + + for (i in text.length - 1 downTo 0) { + if (text[i] == toReplace) { + if (updatedText == null) { + updatedText = SpannableStringBuilder.valueOf(text) + } + updatedText!!.replace(i, i + 1, replacement) + } + } + + return updatedText ?: text + } + + @JvmStatic + fun startsWith(text: CharSequence, substring: CharSequence): Boolean { + if (substring.length > text.length) { + return false + } + + for (i in substring.indices) { + if (text[i] != substring[i]) { + return false + } + } + + return true + } + + @JvmStatic + fun endsWith(text: CharSequence, substring: CharSequence): Boolean { + if (substring.length > text.length) { + return false + } + + var textIndex = text.length - 1 + var substringIndex = substring.length - 1 + while (substringIndex >= 0) { + if (text[textIndex] != substring[substringIndex]) { + return false + } + substringIndex-- + textIndex-- + } + + return true + } +}