Merge pull request #193549 from microsoft/tyler/mixed-tyrannosaurus

Revert "Add stemming support to TF-IDF implementation (#193531)"
Benjamin Simmonds authored on 2023-09-20 09:27:46 +02:00 (committed by GitHub)
2 changed files with 17 additions and 260 deletions


@@ -6,19 +6,22 @@
 import { CancellationToken } from 'vs/base/common/cancellation';
 
 type SparseEmbedding = Record</* word */ string, /* weight */number>;
-type TermFrequencies = Map</* word */ string, { occurrences: number; weight: number }>;
+type TermFrequencies = Map</* word */ string, /*occurrences*/ number>;
 type DocumentOccurrences = Map</* word */ string, /*documentOccurrences*/ number>;
 
+function countMapFrom<K>(values: Iterable<K>): Map<K, number> {
+	const map = new Map<K, number>();
+	for (const value of values) {
+		map.set(value, (map.get(value) ?? 0) + 1);
+	}
+	return map;
+}
+
 interface DocumentChunkEntry {
 	readonly text: string;
 	readonly tf: TermFrequencies;
 }
 
-interface Term {
-	readonly term: string;
-	readonly weight: number;
-}
-
 export interface TfIdfDocument {
 	readonly key: string;
 	readonly textChunks: readonly string[];
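
The restored countMapFrom is a plain generic frequency counter, which is all term counting needs once per-term weights are gone. A minimal usage sketch (illustrative, not part of the diff):

// Assuming countMapFrom as restored above:
const tf = countMapFrom(['cat', 'dog', 'cat']);
console.log(tf.get('cat')); // 2
console.log(tf.get('dog')); // 1
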
@@ -72,53 +75,26 @@ export class TfIdfCalculator {
 	 * Count how many times each term (word) appears in a string.
 	 */
 	private static termFrequencies(input: string): TermFrequencies {
-		const map = new Map<string, { weight: number; occurrences: number }>();
-		for (const value of TfIdfCalculator.splitTerms(input)) {
-			const existing = map.get(value.term);
-			if (existing) {
-				existing.occurrences++;
-			} else {
-				map.set(value.term, { weight: value.weight, occurrences: 1 });
-			}
-		}
-		return map;
+		return countMapFrom(TfIdfCalculator.splitTerms(input));
 	}
 
 	/**
 	 * Break a string into terms (words).
-	 *
-	 * TODO: confirm that when we break up a word or generate stems, we likely accidentally over-weight its terms.
-	 * For instance, if the document is: `cats wear hats` and the user searches `cats wear`, we could end up giving too
-	 * much weight to `cats` since the document would be broken into: `[cats, cat, wear, hats, hat]` while the query
-	 * would be broken into `[cats, cat, wear]`. This means that terms derived from `cats` end up being matched on multiple
-	 * times, which isn't really right.
-	 *
-	 * Maybe we need to generate a tree of terms for the document where we stop searching once a match has been found:
 	 */
-	private static *splitTerms(input: string): Iterable<Term> {
+	private static *splitTerms(input: string): Iterable<string> {
 		const normalize = (word: string) => word.toLowerCase();
 
 		// Only match on words that are at least 3 characters long and start with a letter
 		for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
-			yield { term: normalize(word), weight: 1 };
-
-			// Include both the original term and the stemmed version
-			const stemmedTerm = stem(word);
-			if (stemmedTerm !== word) {
-				yield { term: normalize(stemmedTerm), weight: 0.75 };
-			}
-
-			const camelParts = word.split(/(?=[A-Z])/g);
+			yield normalize(word);
+
+			// eslint-disable-next-line local/code-no-look-behind-regex
+			const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
 			if (camelParts.length > 1) {
 				for (const part of camelParts) {
 					// Require at least 3 letters in the parts of a camel case word
 					if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
-						yield { term: normalize(part), weight: 0.75 };
-
-						const stemmedPart = stem(part);
-						if (stemmedPart !== part && stemmedPart.length > 2) {
-							yield { term: normalize(stemmedPart), weight: 0.5 };
-						}
+						yield normalize(part);
 					}
 				}
 			}
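
With the revert, a surface form no longer also yields its stem: 'cats' now produces only 'cats', while camelCase splitting is kept, so 'catDog' still yields 'catdog', 'cat', and 'dog'. A self-contained copy of the restored splitTerms to illustrate its output (standalone sketch, outside the class):

function* splitTerms(input: string): Iterable<string> {
	const normalize = (word: string) => word.toLowerCase();
	// Only match on words that are at least 3 characters long and start with a letter
	for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
		yield normalize(word);
		const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
		if (camelParts.length > 1) {
			for (const part of camelParts) {
				// Require at least 3 letters in the parts of a camel case word
				if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
					yield normalize(part);
				}
			}
		}
	}
}

console.log([...splitTerms('catDog cats')]); // ['catdog', 'cat', 'dog', 'cats']
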
@@ -210,7 +186,7 @@ export class TfIdfCalculator {
 			idfCache.set(term, chunkIdf);
 		}
 
-		const chunkTfidf = chunkTf.weight * chunkTf.occurrences * chunkIdf;
+		const chunkTfidf = chunkTf * chunkIdf;
 		sum += chunkTfidf * termTfidf;
 	}
 	return sum;
@@ -233,7 +209,7 @@ export class TfIdfCalculator {
 		for (const [word, occurrences] of termFrequencies) {
 			const idf = this.computeIdf(word);
 			if (idf > 0) {
-				embedding[word] = occurrences.weight * occurrences.occurrences * idf;
+				embedding[word] = occurrences * idf;
 			}
 		}
 		return embedding;
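
This hunk and the previous one make the same simplification: with weights gone, a term's TF-IDF contribution is once again plain occurrences multiplied by inverse document frequency. A minimal sketch of that computation (the names and the exact IDF formula are illustrative, not the class's private API):

// tf: occurrences of the term in a chunk; idf computed from chunk counts.
function tfIdfWeight(occurrences: number, totalChunks: number, chunksWithTerm: number): number {
	const idf = chunksWithTerm > 0 ? Math.log(totalChunks / chunksWithTerm) : 0;
	return occurrences * idf;
}

console.log(tfIdfWeight(3, 10, 2)); // 3 * ln(5) ≈ 4.83
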
@@ -263,197 +239,3 @@ export function normalizeTfIdfScores(scores: TfIdfScore[]): NormalizedTfIdfScore
 	return result as TfIdfScore[];
 }
-
-// https://github.com/maxxxxxdlp/porter-stemming
-
-/**
- * TypeScript implementation of the Porter-Stemmer algorithm
- */
-export function stem(raw: string): string {
-	if (raw.length < minLength) { return raw; }
-
-	let word = raw;
-	const firstCharacter = word[0];
-	if (firstCharacter === 'y') { word = firstCharacter.toUpperCase() + word.slice(1); }
-
-	word = steps.reduce((word, step) => step(word), word);
-
-	// Turn initial Y back to y
-	if (firstCharacter === 'y') { word = firstCharacter.toLowerCase() + word.slice(1); }
-	return word;
-}
-
-const minLength = 3;
-
-const vowel = '[aeiouy]';
-const consonant = '[^aeiou]';
-const consonantSequence = `${consonant}[^aeiouy]*`;
-const o = new RegExp(`^${consonantSequence}${vowel}[^aeiouwxy]$`, 'u');
-
-/**
- * Try to match a word against a rule
- */
-const replace =
-	(
-		replacements: Readonly<
-			Record<
-				string,
-				| string
-				| readonly [condition: (word: string) => boolean, replacement: string]
-			>
-		>
-	) =>
-	(word: string): string => {
-		const entries = Object.entries(replacements).sort(
-			([left], [right]) => right.length - left.length
-		);
-		for (const [suffix, replacement] of entries) {
-			if (!word.endsWith(suffix)) { continue; }
-			if (
-				Array.isArray(replacement) &&
-				!replacement[0](word.slice(0, -suffix.length))
-			) { break; }
-			return `${word.slice(0, -suffix.length)}${Array.isArray(replacement) ? replacement[1] : replacement}`;
-		}
-		return word;
-	};
-
-const calculateMeasure = (word: string): number =>
-	sum(
-		Array.from(word.split(''), (_, index) =>
-			!isConsonant(word, index) &&
-			index + 1 < word.length &&
-			isConsonant(word, index + 1)
-				? 1
-				: 0
-		)
-	);
-
-const sum = (array: readonly number[]): number =>
-	array.reduce((sum, value) => sum + value, 0);
-
-const measure =
-	(min: number) =>
-	(word: string): boolean =>
-		calculateMeasure(word) > min;
-
-function isConsonant(word: string, index: number): boolean {
-	const vowels = 'aeiou';
-	if (vowels.includes(word[index])) { return false; }
-	if (word[index] === 'y') { return index === 0 ? true : !isConsonant(word, index - 1); }
-	else { return true; }
-}
-
-const hasVowel = (word: string): boolean =>
-	Array.from(word.split('')).some((_, index) => !isConsonant(word, index));
-
-const steps: readonly ((word: string) => string)[] = [
-	// Step 1a
-	replace({
-		sses: 'ss',
-		ies: 'i',
-		ss: 'ss',
-		s: '',
-	}),
-	// Step 1b
-	(word): string => {
-		if (word.endsWith('eed')) { return replace({ eed: [measure(0), 'ee'] })(word); }
-		const updated = replace({ ed: [hasVowel, ''], ing: [hasVowel, ''] })(word);
-		if (updated === word) { return word; }
-		const replaced = replace({
-			at: 'ate',
-			bl: 'ble',
-			iz: 'ize',
-		})(updated);
-		if (replaced !== updated) { return replaced; }
-		if (
-			replaced.at(-1) === replaced.at(-'dd'.length) &&
-			isConsonant(replaced, replaced.length - 1) &&
-			!['l', 's', 'z'].some((letter) => replaced.endsWith(letter))
-		) { return replaced.slice(0, -1); }
-		if (calculateMeasure(replaced) === 1 && o.test(replaced)) { return `${replaced}e`; }
-		return replaced;
-	},
-	// Step 1c
-	replace({
-		y: [hasVowel, 'i'],
-	}),
-	// Step 2
-	replace({
-		ational: [measure(0), 'ate'],
-		tional: [measure(0), 'tion'],
-		enci: [measure(0), 'ence'],
-		anci: [measure(0), 'ance'],
-		izer: [measure(0), 'ize'],
-		abli: [measure(0), 'able'],
-		alli: [measure(0), 'al'],
-		entli: [measure(0), 'ent'],
-		eli: [measure(0), 'e'],
-		ousli: [measure(0), 'ous'],
-		ization: [measure(0), 'ize'],
-		ation: [measure(0), 'ate'],
-		ator: [measure(0), 'ate'],
-		alism: [measure(0), 'al'],
-		iveness: [measure(0), 'ive'],
-		fulness: [measure(0), 'ful'],
-		ousness: [measure(0), 'ous'],
-		aliti: [measure(0), 'al'],
-		iviti: [measure(0), 'ive'],
-		biliti: [measure(0), 'ble'],
-		logi: [measure(0), 'log'],
-		bli: [measure(0), 'ble'],
-	}),
-	// Step 3
-	replace({
-		icate: [measure(0), 'ic'],
-		ative: [measure(0), ''],
-		alize: [measure(0), 'al'],
-		iciti: [measure(0), 'ic'],
-		ical: [measure(0), 'ic'],
-		ful: [measure(0), ''],
-		ness: [measure(0), ''],
-	}),
-	// Step 4
-	(word): string => {
-		const newWord = replace({
-			al: [measure(1), ''],
-			ance: [measure(1), ''],
-			ence: [measure(1), ''],
-			er: [measure(1), ''],
-			ic: [measure(1), ''],
-			able: [measure(1), ''],
-			ible: [measure(1), ''],
-			ant: [measure(1), ''],
-			ement: [measure(1), ''],
-			ment: [measure(1), ''],
-			ent: [measure(1), ''],
-			ou: [measure(1), ''],
-			ism: [measure(1), ''],
-			ate: [measure(1), ''],
-			iti: [measure(1), ''],
-			ous: [measure(1), ''],
-			ive: [measure(1), ''],
-			ize: [measure(1), ''],
-		})(word);
-		if (newWord !== word) { return newWord; }
-		return (word.endsWith('tion') || word.endsWith('sion')) &&
-			measure(1)(word.slice(0, -'ion'.length))
-			? word.slice(0, -'ion'.length)
-			: word;
-	},
-	// Step 5a
-	(word): string => {
-		if (!word.endsWith('e')) { return word; }
-		const stem = word.slice(0, -1);
-		const measure = calculateMeasure(stem);
-		return measure > 1 || (measure === 1 && !o.test(stem)) ? stem : word;
-	},
-	// Step 5b
-	(word): string =>
-		word.endsWith('ll') && measure(1)(word.slice(0, -1))
-			? word.slice(0, -1)
-			: word,
-];
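
The deleted stem function is a five-step Porter stemmer: each step strips or rewrites a suffix, usually guarded by the stem's "measure" (its count of vowel-consonant sequences). A few canonical Porter examples, assuming the deleted stem above:

console.log(stem('caresses')); // 'caress' (step 1a: sses -> ss)
console.log(stem('ponies'));   // 'poni'   (step 1a: ies -> i)
console.log(stem('running'));  // 'run'    (step 1b: drop -ing, trim double consonant)
console.log(stem('cats'));     // 'cat'    (step 1a: s -> '')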


@@ -163,19 +163,6 @@ suite('TF-IDF Calculator', function () {
 		}
 	});
 
-	test('Should weigh exact match higher than camelCase match', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'catDog'),
-			makeDocument('/B', 'cat cat cat fish'),
-			makeDocument('/C', 'dog dog cat rat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('catDog', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/C', '/B']);
-		}
-	});
-
 	test('Should not match document after delete', () => {
 		const docA = makeDocument('/A', 'cat dog cat');
 		const docB = makeDocument('/B', 'cat fish');
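
permutate itself is not shown in this diff; the tests use it to assert that scoring is independent of document insertion order. A hypothetical helper with that shape (an assumption, not the repository's actual implementation):

// Hypothetical: returns every ordering of the input array.
function permutate<T>(items: readonly T[]): T[][] {
	if (items.length <= 1) {
		return [[...items]];
	}
	const result: T[][] = [];
	for (let i = 0; i < items.length; i++) {
		const rest = [...items.slice(0, i), ...items.slice(i + 1)];
		for (const perm of permutate(rest)) {
			result.push([items[i], ...perm]);
		}
	}
	return result;
}
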
@@ -197,18 +184,6 @@ suite('TF-IDF Calculator', function () {
 		scores = tfidf.calculateScores('cat', CancellationToken.None);
 		assertScoreOrdersEqual(scores, []);
 	});
 
-	test('Should find stemmed words', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'cats'),
-			makeDocument('/B', 'dogs cat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('cats', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/B']);
-		}
-	});
 });
 
 function makeDocument(key: string, content: string | string[]): TfIdfDocument {
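
The body of makeDocument is cut off by the diff; given the TfIdfDocument shape from the first file, a plausible sketch (an assumption, not the file's actual code):

function makeDocument(key: string, content: string | string[]): TfIdfDocument {
	return {
		key,
		// Hypothetical: wrap a single string into a one-chunk document
		textChunks: Array.isArray(content) ? content : [content],
	};
}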