From 4ea2c19b2afbddd59fde6b4b5957c46802bbc005 Mon Sep 17 00:00:00 2001
From: Tyler Leonhardt
Date: Tue, 19 Sep 2023 22:58:14 -0700
Subject: [PATCH] Revert "Add stemming support to TF-IDF implementation (#193531)"

This reverts commit 2ad860b576023e50c27b1118c4f037a409a14d79.
---
 src/vs/base/common/tfIdf.ts           | 252 ++------------------------
 src/vs/base/test/common/tfIdf.test.ts |  25 ---
 2 files changed, 17 insertions(+), 260 deletions(-)

diff --git a/src/vs/base/common/tfIdf.ts b/src/vs/base/common/tfIdf.ts
index 38cfb1980d7..993f41b5a78 100644
--- a/src/vs/base/common/tfIdf.ts
+++ b/src/vs/base/common/tfIdf.ts
@@ -6,19 +6,22 @@
 import { CancellationToken } from 'vs/base/common/cancellation';
 
 type SparseEmbedding = Record<string, number>;
-type TermFrequencies = Map<string, { occurrences: number; weight: number }>;
+type TermFrequencies = Map<string, number>;
 type DocumentOccurrences = Map<string, number>;
 
+function countMapFrom<K>(values: Iterable<K>): Map<K, number> {
+	const map = new Map<K, number>();
+	for (const value of values) {
+		map.set(value, (map.get(value) ?? 0) + 1);
+	}
+	return map;
+}
+
 interface DocumentChunkEntry {
 	readonly text: string;
 	readonly tf: TermFrequencies;
 }
 
-interface Term {
-	readonly term: string;
-	readonly weight: number;
-}
-
 export interface TfIdfDocument {
 	readonly key: string;
 	readonly textChunks: readonly string[];
@@ -72,53 +75,26 @@ export class TfIdfCalculator {
 	 * Count how many times each term (word) appears in a string.
 	 */
 	private static termFrequencies(input: string): TermFrequencies {
-		const map = new Map();
-		for (const value of TfIdfCalculator.splitTerms(input)) {
-			const existing = map.get(value.term);
-			if (existing) {
-				existing.occurrences++;
-			} else {
-				map.set(value.term, { weight: value.weight, occurrences: 1 });
-			}
-		}
-		return map;
+		return countMapFrom(TfIdfCalculator.splitTerms(input));
 	}
 
 	/**
 	 * Break a string into terms (words).
-	 *
-	 * TODO: confirm that when we break up a word or generate stems, we likely accidentally over-weight its terms.
-	 * For instance, if the document is: `cats wear hats` and the user searches `cats wear`, we could end up giving too
-	 * much weight to `cats` since the document would be broken into: `[cats, cat, wear, hats, hat]` while the query
-	 * would be broken into `[cats, cat, wear]`. This means that terms derived from `cats` end up being matched on multiple
-	 * times, which isn't really right.
-	 *
-	 * Maybe we need to generate a tree of terms for the document where we stop searching once a match has been found:
 	 */
-	private static *splitTerms(input: string): Iterable<Term> {
+	private static *splitTerms(input: string): Iterable<string> {
 		const normalize = (word: string) => word.toLowerCase();
 		// Only match on words that are at least 3 characters long and start with a letter
 		for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
-			yield { term: normalize(word), weight: 1 };
+			yield normalize(word);
 
-			// Include both the original term and the stemmed version
-			const stemmedTerm = stem(word);
-			if (stemmedTerm !== word) {
-				yield { term: normalize(stemmedTerm), weight: 0.75 };
-			}
-
-			const camelParts = word.split(/(?=[A-Z])/g);
+			// eslint-disable-next-line local/code-no-look-behind-regex
+			const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
 			if (camelParts.length > 1) {
 				for (const part of camelParts) {
 					// Require at least 3 letters in the parts of a camel case word
 					if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
-						yield { term: normalize(part), weight: 0.75 };
-
-						const stemmedPart = stem(part);
-						if (stemmedPart !== part && stemmedPart.length > 2) {
-							yield { term: normalize(stemmedPart), weight: 0.5 };
-						}
+						yield normalize(part);
 					}
 				}
 			}
@@ -210,7 +186,7 @@ export class TfIdfCalculator {
 				idfCache.set(term, chunkIdf);
 			}
 
-			const chunkTfidf = chunkTf.weight * chunkTf.occurrences * chunkIdf;
+			const chunkTfidf = chunkTf * chunkIdf;
 			sum += chunkTfidf * termTfidf;
 		}
 		return sum;
@@ -233,7 +209,7 @@ export class TfIdfCalculator {
 		for (const [word, occurrences] of termFrequencies) {
 			const idf = this.computeIdf(word);
 			if (idf > 0) {
-				embedding[word] = occurrences.weight * occurrences.occurrences * idf;
+				embedding[word] = occurrences * idf;
 			}
 		}
 		return embedding;
@@ -263,197 +239,3 @@ export function normalizeTfIdfScores(scores: TfIdfScore[]): NormalizedTfIdfScore
 
 	return result as TfIdfScore[];
 }
-
-// https://github.com/maxxxxxdlp/porter-stemming
-
-/**
- * TypeScript implementation of the Porter-Stemmer algorithm
- */
-export function stem(raw: string): string {
-	if (raw.length < minLength) { return raw; }
-
-	let word = raw;
-	const firstCharacter = word[0];
-	if (firstCharacter === 'y') { word = firstCharacter.toUpperCase() + word.slice(1); }
-
-	word = steps.reduce((word, step) => step(word), word);
-
-	// Turn initial Y back to y
-	if (firstCharacter === 'y') { word = firstCharacter.toLowerCase() + word.slice(1); }
-
-	return word;
-}
-
-const minLength = 3;
-const vowel = '[aeiouy]';
-const consonant = '[^aeiou]';
-const consonantSequence = `${consonant}[^aeiouy]*`;
-const o = new RegExp(`^${consonantSequence}${vowel}[^aeiouwxy]$`, 'u');
-
-/**
- * Try to match a word against a rule
- */
-const replace =
-	(
-		replacements: Readonly<
-			Record<
-				string,
-				| string
-				| readonly [condition: (word: string) => boolean, replacement: string]
-			>
-		>
-	) =>
-	(word: string): string => {
-		const entries = Object.entries(replacements).sort(
-			([left], [right]) => right.length - left.length
-		);
-		for (const [suffix, replacement] of entries) {
-			if (!word.endsWith(suffix)) { continue; }
-			if (
-				Array.isArray(replacement) &&
-				!replacement[0](word.slice(0, -suffix.length))
-			) { break; }
-			return `${word.slice(0, -suffix.length)}${Array.isArray(replacement) ? replacement[1] : replacement
-				}`;
-		}
-		return word;
-	};
-
-const calculateMeasure = (word: string): number =>
-	sum(
-		Array.from(word.split(''), (_, index) =>
-			!isConsonant(word, index) &&
-			index + 1 < word.length &&
-			isConsonant(word, index + 1)
-				? 1
-				: 0
-		)
-	);
-
-const sum = (array: readonly number[]): number =>
-	array.reduce((sum, value) => sum + value, 0);
-
-const measure =
-	(min: number) =>
-	(word: string): boolean =>
-		calculateMeasure(word) > min;
-
-function isConsonant(word: string, index: number): boolean {
-	const vowels = 'aeiou';
-	if (vowels.includes(word[index])) { return false; }
-	if (word[index] === 'y') { return index === 0 ? true : !isConsonant(word, index - 1); }
-	else { return true; }
-}
-
-const hasVowel = (word: string): boolean =>
-	Array.from(word.split('')).some((_, index) => !isConsonant(word, index));
-
-const steps: readonly ((word: string) => string)[] = [
-	// Step 1a
-	replace({
-		sses: 'ss',
-		ies: 'i',
-		ss: 'ss',
-		s: '',
-	}),
-	// Step 1b
-	(word): string => {
-		if (word.endsWith('eed')) { return replace({ eed: [measure(0), 'ee'] })(word); }
-		const updated = replace({ ed: [hasVowel, ''], ing: [hasVowel, ''] })(word);
-		if (updated === word) { return word; }
-		const replaced = replace({
-			at: 'ate',
-			bl: 'ble',
-			iz: 'ize',
-		})(updated);
-		if (replaced !== updated) { return replaced; }
-
-		if (
-			replaced.at(-1) === replaced.at(-'dd'.length) &&
-			isConsonant(replaced, replaced.length - 1) &&
-			!['l', 's', 'z'].some((letter) => replaced.endsWith(letter))
-		) { return replaced.slice(0, -1); }
-
-		if (calculateMeasure(replaced) === 1 && o.test(replaced)) { return `${replaced}e`; }
-		return replaced;
-	},
-	// Step 1c
-	replace({
-		y: [hasVowel, 'i'],
-	}),
-	// Step 2
-	replace({
-		ational: [measure(0), 'ate'],
-		tional: [measure(0), 'tion'],
-		enci: [measure(0), 'ence'],
-		anci: [measure(0), 'ance'],
-		izer: [measure(0), 'ize'],
-		abli: [measure(0), 'able'],
-		alli: [measure(0), 'al'],
-		entli: [measure(0), 'ent'],
-		eli: [measure(0), 'e'],
-		ousli: [measure(0), 'ous'],
-		ization: [measure(0), 'ize'],
-		ation: [measure(0), 'ate'],
-		ator: [measure(0), 'ate'],
-		alism: [measure(0), 'al'],
-		iveness: [measure(0), 'ive'],
-		fulness: [measure(0), 'ful'],
-		ousness: [measure(0), 'ous'],
-		aliti: [measure(0), 'al'],
-		iviti: [measure(0), 'ive'],
-		biliti: [measure(0), 'ble'],
-		logi: [measure(0), 'log'],
-		bli: [measure(0), 'ble'],
-	}),
-	// Step 3
-	replace({
-		icate: [measure(0), 'ic'],
-		ative: [measure(0), ''],
-		alize: [measure(0), 'al'],
-		iciti: [measure(0), 'ic'],
-		ical: [measure(0), 'ic'],
-		ful: [measure(0), ''],
-		ness: [measure(0), ''],
-	}),
-	// Step 4
-	(word): string => {
-		const newWord = replace({
-			al: [measure(1), ''],
-			ance: [measure(1), ''],
-			ence: [measure(1), ''],
-			er: [measure(1), ''],
-			ic: [measure(1), ''],
-			able: [measure(1), ''],
-			ible: [measure(1), ''],
-			ant: [measure(1), ''],
-			ement: [measure(1), ''],
-			ment: [measure(1), ''],
-			ent: [measure(1), ''],
-			ou: [measure(1), ''],
-			ism: [measure(1), ''],
-			ate: [measure(1), ''],
-			iti: [measure(1), ''],
-			ous: [measure(1), ''],
-			ive: [measure(1), ''],
-			ize: [measure(1), ''],
-		})(word);
-		if (newWord !== word) { return newWord; }
-		return (word.endsWith('tion') || word.endsWith('sion')) &&
-			measure(1)(word.slice(0, -'ion'.length))
-			? word.slice(0, -'ion'.length)
-			: word;
-	},
-	// Step 5a
-	(word): string => {
-		if (!word.endsWith('e')) { return word; }
-		const stem = word.slice(0, -1);
-		const measure = calculateMeasure(stem);
-		return measure > 1 || (measure === 1 && !o.test(stem)) ? stem : word;
-	},
-	// Step 5b
-	(word): string =>
-		word.endsWith('ll') && measure(1)(word.slice(0, -1))
-			? word.slice(0, -1)
-			: word,
-];
diff --git a/src/vs/base/test/common/tfIdf.test.ts b/src/vs/base/test/common/tfIdf.test.ts
index 38ffa884ba3..69c094759eb 100644
--- a/src/vs/base/test/common/tfIdf.test.ts
+++ b/src/vs/base/test/common/tfIdf.test.ts
@@ -163,19 +163,6 @@ suite('TF-IDF Calculator', function () {
 		}
 	});
 
-	test('Should weigh exact match higher than camelCase match', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'catDog'),
-			makeDocument('/B', 'cat cat cat fish'),
-			makeDocument('/C', 'dog dog cat rat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('catDog', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/C', '/B']);
-		}
-	});
-
 	test('Should not match document after delete', () => {
 		const docA = makeDocument('/A', 'cat dog cat');
 		const docB = makeDocument('/B', 'cat fish');
@@ -197,18 +184,6 @@ suite('TF-IDF Calculator', function () {
 		scores = tfidf.calculateScores('cat', CancellationToken.None);
 		assertScoreOrdersEqual(scores, []);
 	});
-
-	test('Should find stemmed words', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'cats'),
-			makeDocument('/B', 'dogs cat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('cats', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/B']);
-		}
-	});
 });
 
 function makeDocument(key: string, content: string | string[]): TfIdfDocument {
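
For reference, a minimal, self-contained sketch of the term handling this revert restores. It mirrors the `+` lines above; the helper names, the generic type parameters, and the sample output are illustrative reconstructions rather than part of the patch.

// Terms become plain lowercased strings counted directly, with no stemming
// and no per-term weights.
function* splitTermsSketch(input: string): Iterable<string> {
	const normalize = (word: string) => word.toLowerCase();
	// Only match words that are at least 3 characters long and start with a letter.
	for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
		yield normalize(word);

		// Also yield camelCase parts with at least 3 letters, e.g. "catDog" -> "cat", "dog".
		const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
		if (camelParts.length > 1) {
			for (const part of camelParts) {
				if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
					yield normalize(part);
				}
			}
		}
	}
}

// Each occurrence of a term simply increments a count.
function countMapFromSketch<K>(values: Iterable<K>): Map<K, number> {
	const map = new Map<K, number>();
	for (const value of values) {
		map.set(value, (map.get(value) ?? 0) + 1);
	}
	return map;
}

// countMapFromSketch(splitTermsSketch('catDog cat'))
// -> Map { 'catdog' => 1, 'cat' => 2, 'dog' => 1 }

Compared with the stemmed implementation being removed, a term no longer carries a weight: the TF-IDF products in the hunks above use the raw occurrence count (`chunkTf * chunkIdf`) instead of `chunkTf.weight * chunkTf.occurrences * chunkIdf`, and no stemmed variants of a word are emitted.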