Move more util classes to core-util.

2026-07-01 19:46:08 +01:00 · 2022-03-29 16:48:39 -04:00
parent 390b7ff834
commit 77ea2deada
55 changed files with 147 additions and 141 deletions
@@ -0,0 +1,124 @@
+package org.signal.core.util;
+
+import android.os.Build;
+
+import androidx.annotation.NonNull;
+import androidx.annotation.RequiresApi;
+
+import java.util.Iterator;
+
+/**
+ * Iterates over a string, treating a surrogate pair or a grapheme cluster as a single character.
+ * <p>
+ * Every element produced by the iterator is the substring between two consecutive character
+ * boundaries reported by the underlying break iterator.
+ */
+public final class CharacterIterable implements Iterable<String> {
+
+  private final String string;
+
+  public CharacterIterable(@NonNull String string) {
+    this.string = string;
+  }
+
+  @Override
+  public @NonNull Iterator<String> iterator() {
+    return new CharacterIterator();
+  }
+
+  private class CharacterIterator implements Iterator<String> {
+    /** Sentinel meaning {@code first()} has not been called on the break iterator yet. */
+    private static final int UNINITIALIZED = -2;
+
+    private final BreakIteratorCompat breakIterator;
+
+    /** Start index of the next element, or {@link #UNINITIALIZED} before the first query. */
+    private int lastIndex = UNINITIALIZED;
+
+    CharacterIterator() {
+      this.breakIterator = Build.VERSION.SDK_INT >= 24 ? new AndroidIcuBreakIterator(string)
+                                                       : new FallbackBreakIterator(string);
+    }
+
+    @Override
+    public boolean hasNext() {
+      if (lastIndex == UNINITIALIZED) {
+        lastIndex = breakIterator.first();
+      }
+      // The last boundary is the end of the string, not DONE. Checking only for DONE would report
+      // one element too many and make next() fail on substring(length, DONE).
+      return !breakIterator.isDone(lastIndex) && lastIndex < string.length();
+    }
+
+    /**
+     * @return The next character (surrogate pair / grapheme cluster) of the string.
+     * @throws java.util.NoSuchElementException if the string has been fully consumed.
+     */
+    @Override
+    public String next() {
+      // Also initializes lastIndex when the caller did not call hasNext() first.
+      if (!hasNext()) {
+        throw new java.util.NoSuchElementException();
+      }
+
+      int firstIndex = lastIndex;
+      lastIndex = breakIterator.next();
+      return string.substring(firstIndex, lastIndex);
+    }
+  }
+
+  /** Minimal common surface of the two platform break iterator implementations. */
+  private interface BreakIteratorCompat {
+    /** @return The first boundary of the text. */
+    int first();
+
+    /** @return The boundary following the current one, or the implementation's DONE value. */
+    int next();
+
+    /** @return True if {@code index} is the implementation's DONE value. */
+    boolean isDone(int index);
+  }
+
+  /**
+   * A BreakIteratorCompat implementation that delegates calls to `android.icu.text.BreakIterator`.
+   * This class handles grapheme clusters fine but requires Android API >= 24.
+   */
+  @RequiresApi(24)
+  private static class AndroidIcuBreakIterator implements BreakIteratorCompat {
+    private final android.icu.text.BreakIterator breakIterator = android.icu.text.BreakIterator.getCharacterInstance();
+
+    public AndroidIcuBreakIterator(@NonNull String string) {
+      breakIterator.setText(string);
+    }
+
+    @Override
+    public int first() {
+      return breakIterator.first();
+    }
+
+    @Override
+    public int next() {
+      return breakIterator.next();
+    }
+
+    @Override
+    public boolean isDone(int index) {
+      return index == android.icu.text.BreakIterator.DONE;
+    }
+  }
+
+  /**
+   * A BreakIteratorCompat implementation that delegates calls to `java.text.BreakIterator`.
+   * This class may or may not handle grapheme clusters well depending on the underlying implementation.
+   * In the emulator, API 23 implements ICU version of the BreakIterator so that it handles grapheme
+   * clusters fine. But API 21 implements RuleBasedIterator which does not handle grapheme clusters.
+   * <p>
+   * If it doesn't handle grapheme clusters correctly, in most cases the combined characters are
+   * broken up into pieces when the code tries to trim a string. For example, for an emoji that is
+   * a combination of a person, gender and skin tone, trimming the character using this class may result
+   * in trimming the parts of the character, e.g. a dark skin frowning woman emoji may result in
+   * a neutral skin frowning woman emoji.
+   */
+  private static class FallbackBreakIterator implements BreakIteratorCompat {
+    private final java.text.BreakIterator breakIterator = java.text.BreakIterator.getCharacterInstance();
+
+    public FallbackBreakIterator(@NonNull String string) {
+      breakIterator.setText(string);
+    }
+
+    @Override
+    public int first() {
+      return breakIterator.first();
+    }
+
+    @Override
+    public int next() {
+      return breakIterator.next();
+    }
+
+    @Override
+    public boolean isDone(int index) {
+      return index == java.text.BreakIterator.DONE;
+    }
+  }
+}
@@ -0,0 +1,34 @@
+package org.signal.core.util;
+
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.LinkedHashSet;
+import java.util.Set;
+
+/**
+ * Static helpers for common set operations. Every method returns a new set and leaves its
+ * arguments unmodified.
+ */
+public final class SetUtil {
+  private SetUtil() {}
+
+  /**
+   * @return A new set holding the elements of {@code a} that are also contained in {@code b},
+   *         in the iteration order of {@code a}.
+   */
+  public static <E> Set<E> intersection(Collection<E> a, Collection<E> b) {
+    Set<E> shared = new LinkedHashSet<>(a);
+
+    shared.retainAll(b);
+
+    return shared;
+  }
+
+  /**
+   * @return A new set holding the elements of {@code a} that are not contained in {@code b},
+   *         in the iteration order of {@code a}.
+   */
+  public static <E> Set<E> difference(Collection<E> a, Collection<E> b) {
+    Set<E> remaining = new LinkedHashSet<>(a);
+
+    remaining.removeAll(b);
+
+    return remaining;
+  }
+
+  /**
+   * @return A new set holding every element of {@code a} followed by the elements of {@code b}
+   *         that were not already present.
+   */
+  public static <E> Set<E> union(Set<E> a, Set<E> b) {
+    Set<E> combined = new LinkedHashSet<>(a);
+
+    combined.addAll(b);
+
+    return combined;
+  }
+
+  /**
+   * @return A new mutable {@link HashSet} containing the provided elements (duplicates collapse).
+   */
+  @SafeVarargs
+  public static <E> HashSet<E> newHashSet(E... elements) {
+    Collection<E> items = Arrays.asList(elements);
+
+    return new HashSet<>(items);
+  }
+}
@@ -2,6 +2,7 @@ package org.signal.core.util

 import androidx.sqlite.db.SupportSQLiteDatabase
 import android.content.ContentValues
+import android.text.TextUtils
 import androidx.annotation.VisibleForTesting
 import java.lang.NullPointerException
 import java.lang.StringBuilder
@@ -73,6 +74,76 @@ object SqlUtil {
    return arrayOf(argument.toString())
  }

+  /**
+   * Builds a case-insensitive GLOB pattern for fuzzy text queries. Works with all unicode
+   * characters, including those outside the basic multilingual plane.
+   *
+   * Each code point of the query becomes one bracket expression holding its lower-case form, its
+   * upper-case form and any accented variants known to [getAccentuatedCharRegex]. The result is
+   * wrapped in `*` so it matches anywhere in the text. An empty query yields `*`.
+   *
+   * Ex (accented variants omitted for brevity):
+   * cat -> *[cC][aA][tT]*
+   */
+  @JvmStatic
+  fun buildCaseInsensitiveGlobPattern(query: String): String {
+    if (TextUtils.isEmpty(query)) {
+      return "*"
+    }
+
+    val pattern = StringBuilder()
+    var i = 0
+    // `i` is a UTF-16 index, so it has to advance by the width of each code point. Counting code
+    // points instead would read low surrogates as characters and drop the end of the query.
+    while (i < query.length) {
+      val codePoint = query.codePointAt(i)
+      val point = StringUtil.codePointToString(codePoint)
+      val lower = point.toLowerCase(Locale.getDefault())
+      pattern.append("[")
+      pattern.append(lower)
+      pattern.append(point.toUpperCase(Locale.getDefault()))
+      pattern.append(getAccentuatedCharRegex(lower))
+      pattern.append("]")
+      i += Character.charCount(codePoint)
+    }
+
+    return "*$pattern*"
+  }
+
+  /**
+   * Returns the accented and otherwise related variants of a single lower-case base letter, written
+   * as characters and `x-y` ranges.
+   *
+   * The only caller in view, [buildCaseInsensitiveGlobPattern], appends the result inside a
+   * `[...]` bracket expression, so the value is a bracket-expression fragment, not a full pattern.
+   *
+   * @param query A single lower-cased character. Latin a-z and a handful of Greek letters are
+   * recognised.
+   * @return The variants for [query], or an empty string when none are defined for it.
+   */
+  private fun getAccentuatedCharRegex(query: String): String {
+    return when (query) {
+      "a" -> "À-Åà-åĀ-ąǍǎǞ-ǡǺ-ǻȀ-ȃȦȧȺɐ-ɒḀḁẚẠ-ặ"
+      "b" -> "ßƀ-ƅɃɓḂ-ḇ"
+      "c" -> "çÇĆ-čƆ-ƈȻȼɔḈḉ"
+      "d" -> "ÐðĎ-đƉ-ƍȡɖɗḊ-ḓ"
+      "e" -> "È-Ëè-ëĒ-ěƎ-ƐǝȄ-ȇȨȩɆɇɘ-ɞḔ-ḝẸ-ệ"
+      "f" -> "ƑƒḞḟ"
+      "g" -> "Ĝ-ģƓǤ-ǧǴǵḠḡ"
+      "h" -> "Ĥ-ħƕǶȞȟḢ-ḫẖ"
+      "i" -> "Ì-Ïì-ïĨ-ıƖƗǏǐȈ-ȋɨɪḬ-ḯỈ-ị"
+      "j" -> "ĴĵǰȷɈɉɟ"
+      "k" -> "Ķ-ĸƘƙǨǩḰ-ḵ"
+      "l" -> "Ĺ-łƚȴȽɫ-ɭḶ-ḽ"
+      "m" -> "Ɯɯ-ɱḾ-ṃ"
+      "n" -> "ÑñŃ-ŋƝƞǸǹȠȵɲ-ɴṄ-ṋ"
+      "o" -> "Ò-ÖØò-öøŌ-őƟ-ơǑǒǪ-ǭǾǿȌ-ȏȪ-ȱṌ-ṓỌ-ợ"
+      "p" -> "ƤƥṔ-ṗ"
+      "q" -> ""
+      "r" -> "Ŕ-řƦȐ-ȓɌɍṘ-ṟ"
+      "s" -> "Ś-šƧƨȘșȿṠ-ṩ"
+      "t" -> "Ţ-ŧƫ-ƮȚțȾṪ-ṱẗ"
+      "u" -> "Ù-Üù-üŨ-ųƯ-ƱǓ-ǜȔ-ȗɄṲ-ṻỤ-ự"
+      "v" -> "ƲɅṼ-ṿ"
+      "w" -> "ŴŵẀ-ẉẘ"
+      "x" -> "Ẋ-ẍ"
+      "y" -> "ÝýÿŶ-ŸƔƳƴȲȳɎɏẎẏỲ-ỹỾỿẙ"
+      "z" -> "Ź-žƵƶɀẐ-ẕ"
+      // Greek vowels, including their polytonic (U+1Fxx) forms.
+      "α" -> "\u0386\u0391\u03AC\u03B1\u1F00-\u1F0F\u1F70\u1F71\u1F80-\u1F8F\u1FB0-\u1FB4\u1FB6-\u1FBC"
+      "ε" -> "\u0388\u0395\u03AD\u03B5\u1F10-\u1F15\u1F18-\u1F1D\u1F72\u1F73\u1FC8\u1FC9"
+      // NOTE(review): the 1F20-1F2F, 1F74, 1F75 and 1F90-1F9F entries appear twice below; the
+      // repetition looks redundant inside a bracket expression.
+      "η" -> "\u0389\u0397\u03AE\u03B7\u1F20-\u1F2F\u1F74\u1F75\u1F90-\u1F9F\u1F20-\u1F2F\u1F74\u1F75\u1F90-\u1F9F\u1fc2\u1fc3\u1fc4\u1fc6\u1FC7\u1FCA\u1FCB\u1FCC"
+      "ι" -> "\u038A\u0390\u0399\u03AA\u03AF\u03B9\u03CA\u1F30-\u1F3F\u1F76\u1F77\u1FD0-\u1FD3\u1FD6-\u1FDB"
+      "ο" -> "\u038C\u039F\u03BF\u03CC\u1F40-\u1F45\u1F48-\u1F4D\u1F78\u1F79\u1FF8\u1FF9"
+      // Both lower-case sigma forms map to the same set: capital sigma, final sigma and sigma.
+      "σ" -> "\u03A3\u03C2\u03C3"
+      "ς" -> "\u03A3\u03C2\u03C3"
+      "υ" -> "\u038E\u03A5\u03AB\u03C5\u03CB\u03CD\u1F50-\u1F57\u1F59\u1F5B\u1F5D\u1F5F\u1F7A\u1F7B\u1FE0-\u1FE3\u1FE6-\u1FEB"
+      "ω" -> "\u038F\u03A9\u03C9\u03CE\u1F60-\u1F6F\u1F7C\u1F7D\u1FA0-\u1FAF\u1FF2-\u1FF4\u1FF6\u1FF7\u1FFA-\u1FFC"
+      else -> ""
+    }
+  }
+
  /**
   * Returns an updated query and args pairing that will only update rows that would *actually*
   * change. In other words, if [SupportSQLiteDatabase.update]
@@ -0,0 +1,325 @@
+package org.signal.core.util;
+
+import android.text.TextUtils;
+
+import androidx.annotation.NonNull;
+import androidx.annotation.Nullable;
+import androidx.core.text.BidiFormatter;
+
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Set;
+import java.util.regex.Pattern;
+
+/**
+ * Static helpers for trimming, measuring and bidi-isolating strings.
+ */
+public final class StringUtil {
+
+  /** Invisible characters that {@link Character#isWhitespace(char)} does not report as whitespace. */
+  private static final Set<Character> WHITESPACE = SetUtil.newHashSet('\u200E',  // left-to-right mark
+                                                                      '\u200F',  // right-to-left mark
+                                                                      '\u2007',  // figure space
+                                                                      '\u200B',  // zero-width space
+                                                                      '\u2800'); // braille blank
+
+
+  private static final Pattern ALL_ASCII_PATTERN = Pattern.compile("^[\\x00-\\x7F]*$");
+
+  /** Matches the code points removed by {@link #stripBidiProtection(String)}: FSI, PDI and PDF. */
+  private static final Pattern BIDI_PROTECTION_PATTERN = Pattern.compile("[\\u2068\\u2069\\u202c]");
+
+  private static final class Bidi {
+    /** Override text direction  */
+    private static final Set<Integer> OVERRIDES = SetUtil.newHashSet("\u202a".codePointAt(0), /* LRE */
+                                                                     "\u202b".codePointAt(0), /* RLE */
+                                                                     "\u202d".codePointAt(0), /* LRO */
+                                                                     "\u202e".codePointAt(0)  /* RLO */);
+
+    /** Set direction and isolate surrounding text */
+    private static final Set<Integer> ISOLATES = SetUtil.newHashSet("\u2066".codePointAt(0), /* LRI */
+                                                                    "\u2067".codePointAt(0), /* RLI */
+                                                                    "\u2068".codePointAt(0)  /* FSI */);
+    /** Closes things in {@link #OVERRIDES} */
+    private static final int PDF = "\u202c".codePointAt(0);
+
+    /** Closes things in {@link #ISOLATES} */
+    private static final int PDI = "\u2069".codePointAt(0);
+
+    /** Auto-detecting isolate */
+    private static final int FSI = "\u2068".codePointAt(0);
+  }
+
+  private StringUtil() {
+  }
+
+  /**
+   * Trims a name string to fit into the byte length requirement.
+   * <p>
+   * This method treats a surrogate pair and a grapheme cluster as a single character, so the
+   * result is always cut on a character boundary.
+   * See examples in tests defined in StringUtilText_trimToFit.
+   *
+   * @param name          The string to trim. Null or empty yields an empty string.
+   * @param maxByteLength The maximum length of the result when encoded as UTF-8.
+   * @return {@code name} itself if it already fits, otherwise the longest prefix of whole
+   *         characters whose UTF-8 encoding is at most {@code maxByteLength} bytes.
+   */
+  public static @NonNull String trimToFit(@Nullable String name, int maxByteLength) {
+    if (TextUtils.isEmpty(name)) {
+      return "";
+    }
+
+    if (name.getBytes(StandardCharsets.UTF_8).length <= maxByteLength) {
+      return name;
+    }
+
+    // Build the result from the characters themselves rather than decoding collected bytes, so
+    // the outcome never depends on the platform default charset.
+    StringBuilder trimmed   = new StringBuilder();
+    int           byteCount = 0;
+
+    for (String graphemeCharacter : new CharacterIterable(name)) {
+      int characterBytes = graphemeCharacter.getBytes(StandardCharsets.UTF_8).length;
+
+      if (byteCount + characterBytes > maxByteLength) {
+        break;
+      }
+
+      trimmed.append(graphemeCharacter);
+      byteCount += characterBytes;
+    }
+
+    return trimmed.toString();
+  }
+
+  /**
+   * @return A charsequence with no leading or trailing whitespace. Only creates a new charsequence
+   *         if it has to.
+   */
+  public static @NonNull CharSequence trim(@NonNull CharSequence charSequence) {
+    if (charSequence.length() == 0) {
+      return charSequence;
+    }
+
+    int start = 0;
+    int end   = charSequence.length() - 1;
+
+    while (start < charSequence.length() && Character.isWhitespace(charSequence.charAt(start))) {
+      start++;
+    }
+
+    while (end >= 0 && end > start && Character.isWhitespace(charSequence.charAt(end))) {
+      end--;
+    }
+
+    if (start > 0 || end < charSequence.length() - 1) {
+      return charSequence.subSequence(start, end + 1);
+    } else {
+      return charSequence;
+    }
+  }
+
+  /**
+   * @return True if the string is empty, or if it contains nothing but whitespace characters.
+   *         Accounts for various unicode whitespace characters.
+   */
+  public static boolean isVisuallyEmpty(@Nullable String value) {
+    if (value == null || value.length() == 0) {
+      return true;
+    }
+
+    return indexOfFirstNonEmptyChar(value) == -1;
+  }
+
+  /**
+   * @return String without any leading or trailing whitespace.
+   *         Accounts for various unicode whitespace characters.
+   */
+  public static String trimToVisualBounds(@NonNull String value) {
+    int start = indexOfFirstNonEmptyChar(value);
+
+    if (start == -1) {
+      return "";
+    }
+
+    int end = indexOfLastNonEmptyChar(value);
+
+    return value.substring(start, end + 1);
+  }
+
+  /** @return Index of the first char that is not visually empty, or -1 if there is none. */
+  private static int indexOfFirstNonEmptyChar(@NonNull String value) {
+    int length = value.length();
+
+    for (int i = 0; i < length; i++) {
+      if (!isVisuallyEmpty(value.charAt(i))) {
+        return i;
+      }
+    }
+
+    return -1;
+  }
+
+  /** @return Index of the last char that is not visually empty, or -1 if there is none. */
+  private static int indexOfLastNonEmptyChar(@NonNull String value) {
+    for (int i = value.length() - 1; i >= 0; i--) {
+      if (!isVisuallyEmpty(value.charAt(i))) {
+        return i;
+      }
+    }
+    return -1;
+  }
+
+  /**
+   * @return True if the character is invisible or whitespace. Accounts for various unicode
+   *         whitespace characters.
+   */
+  public static boolean isVisuallyEmpty(char c) {
+    return Character.isWhitespace(c) || WHITESPACE.contains(c);
+  }
+
+  /**
+   * @return A string representation of the provided unicode code point.
+   */
+  public static @NonNull String codePointToString(int codePoint) {
+    return new String(Character.toChars(codePoint));
+  }
+
+  /**
+   * Only letters are considered; digits, punctuation and other non-letters are ignored.
+   *
+   * @return True if the provided text contains a mix of LTR and RTL characters, otherwise false.
+   */
+  public static boolean hasMixedTextDirection(@Nullable CharSequence text) {
+    if (text == null) {
+      return false;
+    }
+
+    Boolean isLtr = null;
+
+    // `i` is a UTF-16 index and must advance by the width of each code point; stepping by one up
+    // to the code point count would misread surrogate pairs and skip the end of the text.
+    for (int i = 0; i < text.length(); ) {
+      int codePoint = Character.codePointAt(text, i);
+      i += Character.charCount(codePoint);
+
+      if (!Character.isLetter(codePoint)) {
+        continue;
+      }
+
+      byte    direction = Character.getDirectionality(codePoint);
+      boolean ltr       = direction == Character.DIRECTIONALITY_LEFT_TO_RIGHT;
+      // Arabic-script letters report their own directionality constant but are just as RTL as
+      // Hebrew ones, so a purely Arabic text must not count as mixed.
+      boolean rtl       = direction == Character.DIRECTIONALITY_RIGHT_TO_LEFT ||
+                          direction == Character.DIRECTIONALITY_RIGHT_TO_LEFT_ARABIC;
+
+      if (isLtr != null && (isLtr ? !ltr : !rtl)) {
+        return true;
+      }
+
+      isLtr = ltr;
+    }
+
+    return false;
+  }
+
+  /**
+   * @return True if the text is null or has a length of 0, otherwise false.
+   */
+  public static boolean isEmpty(@Nullable String text) {
+    return text == null || text.length() == 0;
+  }
+
+  /**
+   * Isolates bi-directional text from influencing surrounding text. You should use this whenever
+   * you're injecting user-generated text into a larger string.
+   *
+   * You'd think we'd be able to trust {@link BidiFormatter}, but unfortunately it just misses some
+   * corner cases, so here we are.
+   *
+   * The general idea is just to balance out the opening and closing codepoints, and then wrap the
+   * whole thing in FSI/PDI to isolate it.
+   *
+   * For more details, see:
+   * https://www.w3.org/International/questions/qa-bidi-unicode-controls
+   *
+   * @return An empty string for null input, the input itself when it is empty or pure ASCII, and
+   *         otherwise the balanced text wrapped in FSI/PDI.
+   */
+  public static @NonNull String isolateBidi(@Nullable String text) {
+    if (text == null) {
+      return "";
+    }
+
+    if (isEmpty(text)) {
+      return text;
+    }
+
+    if (ALL_ASCII_PATTERN.matcher(text).matches()) {
+      return text;
+    }
+
+    int overrideCount      = 0;
+    int overrideCloseCount = 0;
+    int isolateCount       = 0;
+    int isolateCloseCount  = 0;
+
+    // Walk by UTF-16 index, advancing by each code point's width, so that text containing
+    // surrogate pairs (e.g. emoji) is still scanned all the way to the end.
+    for (int i = 0; i < text.length(); ) {
+      int codePoint = text.codePointAt(i);
+      i += Character.charCount(codePoint);
+
+      if (Bidi.OVERRIDES.contains(codePoint)) {
+        overrideCount++;
+      } else if (codePoint == Bidi.PDF) {
+        overrideCloseCount++;
+      } else if (Bidi.ISOLATES.contains(codePoint)) {
+        isolateCount++;
+      } else if (codePoint == Bidi.PDI) {
+        isolateCloseCount++;
+      }
+    }
+
+    StringBuilder suffix = new StringBuilder();
+
+    while (overrideCount > overrideCloseCount) {
+      suffix.appendCodePoint(Bidi.PDF);
+      overrideCloseCount++;
+    }
+
+    // Unclosed isolates are closed with PDI; appending FSI here would open yet another one.
+    while (isolateCount > isolateCloseCount) {
+      suffix.appendCodePoint(Bidi.PDI);
+      isolateCloseCount++;
+    }
+
+    StringBuilder out = new StringBuilder();
+
+    return out.appendCodePoint(Bidi.FSI)
+              .append(text)
+              .append(suffix)
+              .appendCodePoint(Bidi.PDI)
+              .toString();
+  }
+
+  /**
+   * @return The text with every FSI, PDI and PDF code point removed, or null if the text is null.
+   */
+  public static @Nullable String stripBidiProtection(@Nullable String text) {
+    if (text == null) return null;
+
+    return BIDI_PROTECTION_PATTERN.matcher(text).replaceAll("");
+  }
+
+  /**
+   * @return The text with every right-to-left mark (U+200F) removed.
+   */
+  public static @NonNull String stripBidiIndicator(@NonNull String text) {
+    return text.replace("\u200F", "");
+  }
+
+  /**
+   * Trims a {@link CharSequence} of starting and trailing whitespace. Behavior matches
+   * {@link String#trim()} to preserve expectations around results.
+   */
+  public static CharSequence trimSequence(CharSequence text) {
+    int length     = text.length();
+    int startIndex = 0;
+
+    while ((startIndex < length) && (text.charAt(startIndex) <= ' ')) {
+      startIndex++;
+    }
+    while ((startIndex < length) && (text.charAt(length - 1) <= ' ')) {
+      length--;
+    }
+    return (startIndex > 0 || length < text.length()) ? text.subSequence(startIndex, length) : text;
+  }
+
+  /**
+   * If the {@code text} exceeds the {@code maxChars} it is trimmed in the middle so that the result is exactly {@code maxChars} long including an added
+   * ellipsis character.
+   * <p>
+   * Otherwise the string is returned untouched.
+   * <p>
+   * When {@code maxChars} is even, one more character is kept from the end of the string than the start.
+   */
+  public static @Nullable CharSequence abbreviateInMiddle(@Nullable CharSequence text, int maxChars) {
+    if (text == null || text.length() <= maxChars) {
+      return text;
+    }
+
+    int start = (maxChars - 1) / 2;
+    int end   = (maxChars - 1) - start;
+    return text.subSequence(0, start) + "…" + text.subSequence(text.length() - end, text.length());
+  }
+
+  /**
+   * @return The number of graphemes in the provided string.
+   */
+  public static int getGraphemeCount(@NonNull CharSequence text) {
+    BreakIteratorCompat iterator = BreakIteratorCompat.getInstance();
+    iterator.setText(text);
+    return iterator.countBreaks();
+  }
+}