Fix incorrect span indices for normalised search text.

This commit is contained in:
Sagar
2025-04-23 23:03:09 +05:30
committed by Cody Henthorne
parent 64239962fc
commit 7b3897cac6
2 changed files with 101 additions and 24 deletions

View File

@@ -85,17 +85,15 @@ public class SearchUtil {
static List<Pair<Integer, Integer>> getStrictHighlightRanges(@NonNull Locale locale,
@NonNull String text,
@NonNull String highlight)
{
if (text.length() == 0) {
return Collections.emptyList();
}
@NonNull String highlight) {
String normalizedText = text.toLowerCase(locale);
String normalizedHighlight = highlight.toLowerCase(locale);
List<String> highlightTokens = Stream.of(normalizedHighlight.split("\\s")).filter(s -> s.trim().length() > 0).toList();
List<String> highlightTokens = Stream.of(normalizedHighlight.split("\\s"))
.filter(s -> !s.trim().isEmpty())
.toList();
List<Pair<Integer, Integer>> ranges = new LinkedList<>();
int[] indexMap = buildIndexMap(text, normalizedText, locale);
List<Pair<Integer, Integer>> ranges = new LinkedList<>();
int lastHighlightEndIndex = 0;
@@ -103,12 +101,15 @@ public class SearchUtil {
int index;
do {
index = normalizedText.indexOf(highlightToken, lastHighlightEndIndex);
index = normalizedText.indexOf(highlightToken, lastHighlightEndIndex);
lastHighlightEndIndex = index + highlightToken.length();
} while (index > 0 && !Character.isWhitespace(normalizedText.charAt(index - 1)));
if (index >= 0) {
ranges.add(new Pair<>(index, lastHighlightEndIndex));
// Map normalized range back to original text indices
int start = indexMap[index];
int end = indexMap[Math.min(index + highlightToken.length() - 1, indexMap.length - 1)] + 1;
ranges.add(new Pair<>(start, end));
}
if (index < 0 || lastHighlightEndIndex >= normalizedText.length()) {
@@ -123,32 +124,53 @@ public class SearchUtil {
return ranges;
}
static List<Pair<Integer, Integer>> getHighlightRanges(@NonNull Locale locale,
@NonNull String text,
@NonNull String highlight)
{
if (text.length() == 0) {
return Collections.emptyList();
/**
 * Builds a lookup table that translates a char position in {@code normalized} back to the
 * char position in {@code original} that produced it.
 *
 * Lower-casing is not always a 1:1 char mapping (for example a single original char can
 * lower-case to several chars), so match offsets found in the normalized text cannot be used
 * directly as offsets into the original text.
 *
 * @param original   the text as supplied by the caller
 * @param normalized the lower-cased form of {@code original}; the visible callers pass
 *                   {@code original.toLowerCase(locale)}
 * @param locale     the locale used to lower-case each original char
 * @return an array of length {@code normalized.length()} whose entry {@code i} is the index in
 *         {@code original} attributed to normalized char {@code i}
 */
private static int[] buildIndexMap(@NonNull String original, @NonNull String normalized, @NonNull Locale locale) {
  int[] indexMap            = new int[normalized.length()];
  int   originalCharIndex   = 0;
  int   normalizedCharIndex = 0;

  while (originalCharIndex < original.length() && normalizedCharIndex < normalized.length()) {
    // Lower-case one original char at a time so we know how many normalized chars it accounts for.
    String originalCharacter   = String.valueOf(original.charAt(originalCharIndex));
    String normalizedCharacter = originalCharacter.toLowerCase(locale);

    for (int i = 0; i < normalizedCharacter.length() && normalizedCharIndex < indexMap.length; i++, normalizedCharIndex++) {
      indexMap[normalizedCharIndex] = originalCharIndex;
    }

    originalCharIndex++;
  }

  // Whole-string lower-casing can be context-sensitive and yield more chars than the per-char
  // pass above accounted for. Point any leftover trailing entries at the last original char
  // instead of leaving them at the array default of 0, which would map matches near the end of
  // the text back to the very start.
  int lastOriginalIndex = Math.max(original.length() - 1, 0);
  for (; normalizedCharIndex < indexMap.length; normalizedCharIndex++) {
    indexMap[normalizedCharIndex] = lastOriginalIndex;
  }

  return indexMap;
}
static List<Pair<Integer, Integer>> getHighlightRanges(@NonNull Locale locale,
@NonNull String text,
@NonNull String highlight) {
String normalizedText = text.toLowerCase(locale);
String normalizedHighlight = highlight.toLowerCase(locale);
List<String> highlightTokens = Stream.of(normalizedHighlight.split("\\s")).filter(s -> s.trim().length() > 0).toList();
List<String> highlightTokens = Stream.of(normalizedHighlight.split("\\s"))
.filter(s -> !s.trim().isEmpty())
.toList();
List<Pair<Integer, Integer>> ranges = new LinkedList<>();
int[] indexMap = buildIndexMap(text, normalizedText, locale);
int lastHighlightEndIndex = 0;
List<Pair<Integer, Integer>> ranges = new LinkedList<>();
int lastIndex = 0;
for (String highlightToken : highlightTokens) {
int index = 0;
lastHighlightEndIndex = 0;
lastIndex = 0;
while (index != -1) {
index = normalizedText.indexOf(highlightToken, lastHighlightEndIndex);
index = normalizedText.indexOf(highlightToken, lastIndex);
if (index != -1) {
lastHighlightEndIndex = index + highlightToken.length();
ranges.add(new Pair<>(index, lastHighlightEndIndex));
index = lastHighlightEndIndex;
// Map normalized range back to original text indices
int start = indexMap[index];
int end = indexMap[Math.min(index + highlightToken.length() - 1, indexMap.length - 1)] + 1;
ranges.add(new Pair<>(start, end));
lastIndex = index + highlightToken.length();
}
}
}

View File

@@ -4,6 +4,7 @@ import org.junit.Test;
import org.signal.libsignal.protocol.util.Pair;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Locale;
@@ -81,6 +82,60 @@ public class SearchUtilTest {
assertEquals(Arrays.asList(new Pair<>(0, 1)), result);
}
/** A plain "i" query against "İaİ" must report both dotted capitals, using original-text indices. */
@Test
public void getHighlightRanges_singleHighlightTokenMultipleMatches_turkish_text() {
  final List<Pair<Integer, Integer>> expected = List.of(new Pair<>(0, 1), new Pair<>(2, 3));
  final List<Pair<Integer, Integer>> actual   = SearchUtil.getHighlightRanges(LOCALE, "İaİ", "i");

  assertEquals(expected, actual);
}
/** When the query equals the text "İaİ", the single expected range spans the whole original text. */
@Test
public void getHighlightRanges_singleHighlightTokenMultipleMatches_turkish_both() {
  final String                       dotted   = "İaİ";
  final List<Pair<Integer, Integer>> expected = List.of(new Pair<>(0, 3));
  final List<Pair<Integer, Integer>> actual   = SearchUtil.getHighlightRanges(LOCALE, dotted, dotted);

  assertEquals(expected, actual);
}
/** A dotted-capital query "İaİ" is expected to find nothing in the plain text "iai". */
@Test
public void getHighlightRanges_singleHighlightTokenMultipleMatches_turkish_highlight() {
  final List<Pair<Integer, Integer>> actual = SearchUtil.getHighlightRanges(LOCALE, "iai", "İaİ");

  assertEquals(Collections.emptyList(), actual);
}
/** Strict matching of "i" against "İaİ" is expected to report only the first char of the text. */
@Test
public void getStrictHighlightRanges_singleHighlightToken_turkish_text() {
  final List<Pair<Integer, Integer>> expected = List.of(new Pair<>(0, 1));
  final List<Pair<Integer, Integer>> actual   = SearchUtil.getStrictHighlightRanges(LOCALE, "İaİ", "i");

  assertEquals(expected, actual);
}
/** Strict matching of the dotted-capital query "İaİ" is expected to find nothing in "iai". */
@Test
public void getStrictHighlightRanges_singleHighlightToken_turkish_highlight() {
  final List<Pair<Integer, Integer>> actual = SearchUtil.getStrictHighlightRanges(LOCALE, "iai", "İaİ");

  assertEquals(Collections.emptyList(), actual);
}
/** Strict matching of "İaİ" against itself is expected to yield one range covering the whole original text. */
@Test
public void getStrictHighlightRanges_singleHighlightToken_turkish_both() {
  final String                       dotted   = "İaİ";
  final List<Pair<Integer, Integer>> expected = List.of(new Pair<>(0, 3));
  final List<Pair<Integer, Integer>> actual   = SearchUtil.getStrictHighlightRanges(LOCALE, dotted, dotted);

  assertEquals(expected, actual);
}
@Test
public void getHighlightRanges_singleHighlightTokenMultipleMatches() {
String text = "blabla";