Revert "Add stemming support to TF-IDF implementation (#193531)"
This reverts commit 2ad860b576.
@@ -6,19 +6,22 @@
 import { CancellationToken } from 'vs/base/common/cancellation';
 
 type SparseEmbedding = Record</* word */ string, /* weight */ number>;
-type TermFrequencies = Map</* word */ string, { occurrences: number; weight: number }>;
+type TermFrequencies = Map</* word */ string, /*occurrences*/ number>;
 type DocumentOccurrences = Map</* word */ string, /*documentOccurrences*/ number>;
 
+function countMapFrom<K>(values: Iterable<K>): Map<K, number> {
+	const map = new Map<K, number>();
+	for (const value of values) {
+		map.set(value, (map.get(value) ?? 0) + 1);
+	}
+	return map;
+}
+
 interface DocumentChunkEntry {
 	readonly text: string;
 	readonly tf: TermFrequencies;
 }
 
-interface Term {
-	readonly term: string;
-	readonly weight: number;
-}
-
 export interface TfIdfDocument {
 	readonly key: string;
 	readonly textChunks: readonly string[];
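For reference, the restored countMapFrom helper is a plain occurrence counter, and the reverted TermFrequencies maps each word straight to its count rather than to a { occurrences, weight } pair. A runnable sketch of that data flow (the sample strings are illustrative, not from the commit):

```ts
// Copied from the restored helper above: counts how often each value occurs.
function countMapFrom<K>(values: Iterable<K>): Map<K, number> {
	const map = new Map<K, number>();
	for (const value of values) {
		map.set(value, (map.get(value) ?? 0) + 1);
	}
	return map;
}

// With the revert, term frequencies are plain counts again.
const tf: Map<string, number> = countMapFrom('cat dog cat'.split(' '));
console.log(tf.get('cat')); // 2
console.log(tf.get('dog')); // 1
```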
@@ -72,53 +75,26 @@ export class TfIdfCalculator {
 	 * Count how many times each term (word) appears in a string.
 	 */
 	private static termFrequencies(input: string): TermFrequencies {
-		const map = new Map<string, { weight: number; occurrences: number }>();
-		for (const value of TfIdfCalculator.splitTerms(input)) {
-			const existing = map.get(value.term);
-			if (existing) {
-				existing.occurrences++;
-			} else {
-				map.set(value.term, { weight: value.weight, occurrences: 1 });
-			}
-		}
-		return map;
+		return countMapFrom(TfIdfCalculator.splitTerms(input));
 	}
 
 	/**
 	 * Break a string into terms (words).
-	 *
-	 * TODO: when we break up a word or generate stems, we likely accidentally over-weight its terms.
-	 * For instance, if the document is `cats wear hats` and the user searches `cats wear`, we could end up giving too
-	 * much weight to `cats`, since the document would be broken into `[cats, cat, wear, hats, hat]` while the query
-	 * would be broken into `[cats, cat, wear]`. This means that terms derived from `cats` end up being matched
-	 * multiple times, which isn't really right.
-	 *
-	 * Maybe we need to generate a tree of terms for the document where we stop searching once a match has been found.
 	 */
-	private static *splitTerms(input: string): Iterable<Term> {
+	private static *splitTerms(input: string): Iterable<string> {
 		const normalize = (word: string) => word.toLowerCase();
 
 		// Only match on words that are at least 3 characters long and start with a letter
 		for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
-			yield { term: normalize(word), weight: 1 };
-
-			// Include both the original term and the stemmed version
-			const stemmedTerm = stem(word);
-			if (stemmedTerm !== word) {
-				yield { term: normalize(stemmedTerm), weight: 0.75 };
-			}
+			yield normalize(word);
 
-			const camelParts = word.split(/(?=[A-Z])/g);
+			// eslint-disable-next-line local/code-no-look-behind-regex
+			const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
 			if (camelParts.length > 1) {
 				for (const part of camelParts) {
 					// Require at least 3 letters in the parts of a camel case word
 					if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
-						yield { term: normalize(part), weight: 0.75 };
-
-						const stemmedPart = stem(part);
-						if (stemmedPart !== part && stemmedPart.length > 2) {
-							yield { term: normalize(stemmedPart), weight: 0.5 };
-						}
+						yield normalize(part);
 					}
 				}
 			}
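The restored splitTerms yields plain lowercased strings: the word itself plus any camelCase parts of three or more letters, with no stemmed variants and no per-term weights. A standalone sketch of that behavior (splitTermsSketch is an illustrative name, not in the source):

```ts
// Simplified restatement of the restored splitTerms for illustration.
function* splitTermsSketch(input: string): Iterable<string> {
	const normalize = (word: string) => word.toLowerCase();
	// Words must be at least 3 characters long and start with a letter.
	for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
		yield normalize(word);
		// Split camelCase words on lower-to-upper boundaries.
		const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
		if (camelParts.length > 1) {
			for (const part of camelParts) {
				if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
					yield normalize(part);
				}
			}
		}
	}
}

console.log([...splitTermsSketch('catDog wears hats')]);
// -> ['catdog', 'cat', 'dog', 'wears', 'hats']
```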
@@ -210,7 +186,7 @@ export class TfIdfCalculator {
 				idfCache.set(term, chunkIdf);
 			}
 
-			const chunkTfidf = chunkTf.weight * chunkTf.occurrences * chunkIdf;
+			const chunkTfidf = chunkTf * chunkIdf;
 			sum += chunkTfidf * termTfidf;
 		}
 		return sum;
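With the weight field gone, a chunk's contribution is simply occurrences times idf. A worked example using the common formulation idf = ln(N / df); VS Code's exact idf variant may differ, this only illustrates the tf * idf product restored by the revert:

```ts
// Worked tf-idf example (standard formulation, illustrative numbers).
const totalChunks = 4;          // N: chunks in the corpus
const chunksContainingTerm = 2; // df: chunks containing 'cat'
const occurrencesInChunk = 3;   // tf: raw count, no per-term weight

const idf = Math.log(totalChunks / chunksContainingTerm); // ln(2) ~ 0.693
const tfidf = occurrencesInChunk * idf;
console.log(tfidf.toFixed(3)); // "2.079"
```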
@@ -233,7 +209,7 @@ export class TfIdfCalculator {
 		for (const [word, occurrences] of termFrequencies) {
 			const idf = this.computeIdf(word);
 			if (idf > 0) {
-				embedding[word] = occurrences.weight * occurrences.occurrences * idf;
+				embedding[word] = occurrences * idf;
 			}
 		}
 		return embedding;
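The same simplification applies when building the sparse embedding. A minimal sketch of the restored shape, where idfOf stands in for the class's computeIdf (the function name and usage values are illustrative):

```ts
// A SparseEmbedding maps each word to occurrences * idf, skipping idf <= 0.
type SparseEmbedding = Record<string, number>;

function computeEmbeddingSketch(
	termFrequencies: Map<string, number>,
	idfOf: (word: string) => number // stand-in for TfIdfCalculator.computeIdf
): SparseEmbedding {
	const embedding: SparseEmbedding = Object.create(null);
	for (const [word, occurrences] of termFrequencies) {
		const idf = idfOf(word);
		if (idf > 0) {
			embedding[word] = occurrences * idf;
		}
	}
	return embedding;
}

const embedding = computeEmbeddingSketch(new Map([['cat', 2]]), () => 1.5);
console.log(embedding.cat); // 3
```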
@@ -263,197 +239,3 @@ export function normalizeTfIdfScores(scores: TfIdfScore[]): NormalizedTfIdfScore
 
 	return result as TfIdfScore[];
 }
-
-// https://github.com/maxxxxxdlp/porter-stemming
-
-/**
- * TypeScript implementation of the Porter-Stemmer algorithm
- */
-export function stem(raw: string): string {
-	if (raw.length < minLength) { return raw; }
-
-	let word = raw;
-	const firstCharacter = word[0];
-	if (firstCharacter === 'y') { word = firstCharacter.toUpperCase() + word.slice(1); }
-
-	word = steps.reduce((word, step) => step(word), word);
-
-	// Turn initial Y back to y
-	if (firstCharacter === 'y') { word = firstCharacter.toLowerCase() + word.slice(1); }
-
-	return word;
-}
-
-const minLength = 3;
-const vowel = '[aeiouy]';
-const consonant = '[^aeiou]';
-const consonantSequence = `${consonant}[^aeiouy]*`;
-const o = new RegExp(`^${consonantSequence}${vowel}[^aeiouwxy]$`, 'u');
-
-/**
- * Try to match a word against a rule
- */
-const replace =
-	(
-		replacements: Readonly<
-			Record<
-				string,
-				| string
-				| readonly [condition: (word: string) => boolean, replacement: string]
-			>
-		>
-	) =>
-	(word: string): string => {
-		const entries = Object.entries(replacements).sort(
-			([left], [right]) => right.length - left.length
-		);
-		for (const [suffix, replacement] of entries) {
-			if (!word.endsWith(suffix)) { continue; }
-			if (
-				Array.isArray(replacement) &&
-				!replacement[0](word.slice(0, -suffix.length))
-			) { break; }
-			return `${word.slice(0, -suffix.length)}${Array.isArray(replacement) ? replacement[1] : replacement}`;
-		}
-		return word;
-	};
-
-const calculateMeasure = (word: string): number =>
-	sum(
-		Array.from(word.split(''), (_, index) =>
-			!isConsonant(word, index) &&
-			index + 1 < word.length &&
-			isConsonant(word, index + 1)
-				? 1
-				: 0
-		)
-	);
-
-const sum = (array: readonly number[]): number =>
-	array.reduce((sum, value) => sum + value, 0);
-
-const measure =
-	(min: number) =>
-	(word: string): boolean =>
-		calculateMeasure(word) > min;
-
-function isConsonant(word: string, index: number): boolean {
-	const vowels = 'aeiou';
-	if (vowels.includes(word[index])) { return false; }
-	if (word[index] === 'y') { return index === 0 ? true : !isConsonant(word, index - 1); }
-	else { return true; }
-}
-
-const hasVowel = (word: string): boolean =>
-	Array.from(word.split('')).some((_, index) => !isConsonant(word, index));
-
-const steps: readonly ((word: string) => string)[] = [
-	// Step 1a
-	replace({
-		sses: 'ss',
-		ies: 'i',
-		ss: 'ss',
-		s: '',
-	}),
-	// Step 1b
-	(word): string => {
-		if (word.endsWith('eed')) { return replace({ eed: [measure(0), 'ee'] })(word); }
-		const updated = replace({ ed: [hasVowel, ''], ing: [hasVowel, ''] })(word);
-		if (updated === word) { return word; }
-		const replaced = replace({
-			at: 'ate',
-			bl: 'ble',
-			iz: 'ize',
-		})(updated);
-		if (replaced !== updated) { return replaced; }
-
-		if (
-			replaced.at(-1) === replaced.at(-'dd'.length) &&
-			isConsonant(replaced, replaced.length - 1) &&
-			!['l', 's', 'z'].some((letter) => replaced.endsWith(letter))
-		) { return replaced.slice(0, -1); }
-
-		if (calculateMeasure(replaced) === 1 && o.test(replaced)) { return `${replaced}e`; }
-		return replaced;
-	},
-	// Step 1c
-	replace({
-		y: [hasVowel, 'i'],
-	}),
-	// Step 2
-	replace({
-		ational: [measure(0), 'ate'],
-		tional: [measure(0), 'tion'],
-		enci: [measure(0), 'ence'],
-		anci: [measure(0), 'ance'],
-		izer: [measure(0), 'ize'],
-		abli: [measure(0), 'able'],
-		alli: [measure(0), 'al'],
-		entli: [measure(0), 'ent'],
-		eli: [measure(0), 'e'],
-		ousli: [measure(0), 'ous'],
-		ization: [measure(0), 'ize'],
-		ation: [measure(0), 'ate'],
-		ator: [measure(0), 'ate'],
-		alism: [measure(0), 'al'],
-		iveness: [measure(0), 'ive'],
-		fulness: [measure(0), 'ful'],
-		ousness: [measure(0), 'ous'],
-		aliti: [measure(0), 'al'],
-		iviti: [measure(0), 'ive'],
-		biliti: [measure(0), 'ble'],
-		logi: [measure(0), 'log'],
-		bli: [measure(0), 'ble'],
-	}),
-	// Step 3
-	replace({
-		icate: [measure(0), 'ic'],
-		ative: [measure(0), ''],
-		alize: [measure(0), 'al'],
-		iciti: [measure(0), 'ic'],
-		ical: [measure(0), 'ic'],
-		ful: [measure(0), ''],
-		ness: [measure(0), ''],
-	}),
-	// Step 4
-	(word): string => {
-		const newWord = replace({
-			al: [measure(1), ''],
-			ance: [measure(1), ''],
-			ence: [measure(1), ''],
-			er: [measure(1), ''],
-			ic: [measure(1), ''],
-			able: [measure(1), ''],
-			ible: [measure(1), ''],
-			ant: [measure(1), ''],
-			ement: [measure(1), ''],
-			ment: [measure(1), ''],
-			ent: [measure(1), ''],
-			ou: [measure(1), ''],
-			ism: [measure(1), ''],
-			ate: [measure(1), ''],
-			iti: [measure(1), ''],
-			ous: [measure(1), ''],
-			ive: [measure(1), ''],
-			ize: [measure(1), ''],
-		})(word);
-		if (newWord !== word) { return newWord; }
-		return (word.endsWith('tion') || word.endsWith('sion')) &&
-			measure(1)(word.slice(0, -'ion'.length))
-			? word.slice(0, -'ion'.length)
-			: word;
-	},
-	// Step 5a
-	(word): string => {
-		if (!word.endsWith('e')) { return word; }
-		const stem = word.slice(0, -1);
-		const measure = calculateMeasure(stem);
-		return measure > 1 || (measure === 1 && !o.test(stem)) ? stem : word;
-	},
-	// Step 5b
-	(word): string =>
-		word.endsWith('ll') && measure(1)(word.slice(0, -1))
-			? word.slice(0, -1)
-			: word,
-];
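For context on what the deletion removes: the Porter algorithm conflates inflected forms onto a common stem. A few classic inputs and the outputs the published algorithm prescribes (these are expectations for the removed stem function above, not re-verified against this particular implementation):

```ts
console.log(stem('caresses'));   // 'caress' (step 1a: sses -> ss)
console.log(stem('ponies'));     // 'poni'   (step 1a: ies -> i)
console.log(stem('hopping'));    // 'hop'    (step 1b: -ing dropped, double consonant trimmed)
console.log(stem('relational')); // 'relate' (step 2: ational -> ate)
```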
@@ -163,19 +163,6 @@ suite('TF-IDF Calculator', function () {
 		}
 	});
 
-	test('Should weigh exact match higher than camelCase match', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'catDog'),
-			makeDocument('/B', 'cat cat cat fish'),
-			makeDocument('/C', 'dog dog cat rat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('catDog', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/C', '/B']);
-		}
-	});
-
 	test('Should not match document after delete', () => {
 		const docA = makeDocument('/A', 'cat dog cat');
 		const docB = makeDocument('/B', 'cat fish');
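These tests run against every ordering of the corpus via a permutate helper, so rankings cannot depend on insertion order. The helper's implementation is not shown in this diff; a sketch of its assumed shape:

```ts
// Assumed shape of the test utilities' permutate: yields every ordering
// of the input array (n! permutations), so ranking assertions are
// exercised independently of document insertion order.
function* permutate<T>(items: readonly T[]): Iterable<T[]> {
	if (items.length <= 1) {
		yield [...items];
		return;
	}
	for (let i = 0; i < items.length; i++) {
		const rest = [...items.slice(0, i), ...items.slice(i + 1)];
		for (const perm of permutate(rest)) {
			yield [items[i], ...perm];
		}
	}
}
```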
@@ -197,18 +184,6 @@ suite('TF-IDF Calculator', function () {
 		scores = tfidf.calculateScores('cat', CancellationToken.None);
 		assertScoreOrdersEqual(scores, []);
 	});
-
-	test('Should find stemmed words', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'cats'),
-			makeDocument('/B', 'dogs cat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('cats', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/B']);
-		}
-	});
 });
 
 function makeDocument(key: string, content: string | string[]): TfIdfDocument {
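The public API the tests exercise is unchanged by the revert. A usage sketch matching the test code; the tfIdf module path is assumed from the import style above, since the diff does not name the file:

```ts
import { CancellationToken } from 'vs/base/common/cancellation';
// Module path assumed for illustration; not shown in this diff.
import { TfIdfCalculator, TfIdfDocument } from 'vs/base/common/tfIdf';

const docs: TfIdfDocument[] = [
	{ key: '/A', textChunks: ['cat dog cat'] },
	{ key: '/B', textChunks: ['cat fish'] },
];

// updateDocuments returns the calculator, as the tests' chaining shows.
const tfidf = new TfIdfCalculator().updateDocuments(docs);
const scores = tfidf.calculateScores('cat', CancellationToken.None);
// Expect /A to score higher than /B: 'cat' occurs twice in /A's chunk.
```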