Merge pull request #193549 from microsoft/tyler/mixed-tyrannosaurus

Revert "Add stemming support to TF-IDF implementation (#193531)"
Benjamin Simmonds authored on 2023-09-20 09:27:46 +02:00 (committed by GitHub)
2 changed files with 17 additions and 260 deletions


@@ -6,19 +6,22 @@
 import { CancellationToken } from 'vs/base/common/cancellation';
 
 type SparseEmbedding = Record</* word */ string, /* weight */number>;
-type TermFrequencies = Map</* word */ string, { occurrences: number; weight: number }>;
+type TermFrequencies = Map</* word */ string, /*occurrences*/ number>;
 type DocumentOccurrences = Map</* word */ string, /*documentOccurrences*/ number>;
 
+function countMapFrom<K>(values: Iterable<K>): Map<K, number> {
+	const map = new Map<K, number>();
+	for (const value of values) {
+		map.set(value, (map.get(value) ?? 0) + 1);
+	}
+	return map;
+}
+
 interface DocumentChunkEntry {
 	readonly text: string;
 	readonly tf: TermFrequencies;
 }
 
-interface Term {
-	readonly term: string;
-	readonly weight: number;
-}
-
 export interface TfIdfDocument {
 	readonly key: string;
 	readonly textChunks: readonly string[];
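
The restored countMapFrom is a plain generic frequency counter, which is all term counting needs once per-term weights are gone. A minimal usage sketch (illustrative, not part of the diff):

// Assuming countMapFrom as restored above:
const tf = countMapFrom(['cat', 'dog', 'cat']);
console.log(tf.get('cat')); // 2
console.log(tf.get('dog')); // 1
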
@@ -72,53 +75,26 @@ export class TfIdfCalculator {
 	 * Count how many times each term (word) appears in a string.
 	 */
 	private static termFrequencies(input: string): TermFrequencies {
-		const map = new Map<string, { weight: number; occurrences: number }>();
-		for (const value of TfIdfCalculator.splitTerms(input)) {
-			const existing = map.get(value.term);
-			if (existing) {
-				existing.occurrences++;
-			} else {
-				map.set(value.term, { weight: value.weight, occurrences: 1 });
-			}
-		}
-		return map;
+		return countMapFrom(TfIdfCalculator.splitTerms(input));
 	}
 
 	/**
 	 * Break a string into terms (words).
-	 *
-	 * TODO: confirm that when we break up a word or generate stems, we likely accidentally over-weight its terms.
-	 * For instance, if the document is: `cats wear hats` and the user searches `cats wear`, we could end up giving too
-	 * much weight to `cats` since the document would be broken into: `[cats, cat, wear, hats, hat]` while the query
-	 * would be broken into `[cats, cat, wear]`. This means that terms derived from `cats` end up being matched on multiple
-	 * times, which isn't really right.
-	 *
-	 * Maybe we need to generate a tree of terms for the document where we stop searching once a match has been found:
 	 */
-	private static *splitTerms(input: string): Iterable<Term> {
+	private static *splitTerms(input: string): Iterable<string> {
 		const normalize = (word: string) => word.toLowerCase();
 
 		// Only match on words that are at least 3 characters long and start with a letter
 		for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
-			yield { term: normalize(word), weight: 1 };
-
-			// Include both the original term and the stemmed version
-			const stemmedTerm = stem(word);
-			if (stemmedTerm !== word) {
-				yield { term: normalize(stemmedTerm), weight: 0.75 };
-			}
-
-			const camelParts = word.split(/(?=[A-Z])/g);
+			yield normalize(word);
+
+			// eslint-disable-next-line local/code-no-look-behind-regex
+			const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
 			if (camelParts.length > 1) {
 				for (const part of camelParts) {
 					// Require at least 3 letters in the parts of a camel case word
 					if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
-						yield { term: normalize(part), weight: 0.75 };
-
-						const stemmedPart = stem(part);
-						if (stemmedPart !== part && stemmedPart.length > 2) {
-							yield { term: normalize(stemmedPart), weight: 0.5 };
-						}
+						yield normalize(part);
 					}
 				}
 			}
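
With the revert, a surface form no longer also yields its stem: 'cats' now produces only 'cats', while camelCase splitting is kept, so 'catDog' still yields 'catdog', 'cat', and 'dog'. A self-contained copy of the restored splitTerms to illustrate its output (standalone sketch, outside the class):

function* splitTerms(input: string): Iterable<string> {
	const normalize = (word: string) => word.toLowerCase();
	// Only match on words that are at least 3 characters long and start with a letter
	for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
		yield normalize(word);
		const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
		if (camelParts.length > 1) {
			for (const part of camelParts) {
				// Require at least 3 letters in the parts of a camel case word
				if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
					yield normalize(part);
				}
			}
		}
	}
}

console.log([...splitTerms('catDog cats')]); // ['catdog', 'cat', 'dog', 'cats']
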
@@ -210,7 +186,7 @@ export class TfIdfCalculator {
 			idfCache.set(term, chunkIdf);
 		}
 
-		const chunkTfidf = chunkTf.weight * chunkTf.occurrences * chunkIdf;
+		const chunkTfidf = chunkTf * chunkIdf;
 		sum += chunkTfidf * termTfidf;
 	}
 	return sum;
@@ -233,7 +209,7 @@ export class TfIdfCalculator {
 		for (const [word, occurrences] of termFrequencies) {
 			const idf = this.computeIdf(word);
 			if (idf > 0) {
-				embedding[word] = occurrences.weight * occurrences.occurrences * idf;
+				embedding[word] = occurrences * idf;
 			}
 		}
 		return embedding;
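
This hunk and the previous one make the same simplification: with weights gone, a term's TF-IDF contribution is once again plain occurrences multiplied by inverse document frequency. A minimal sketch of that computation (the names and the exact IDF formula are illustrative, not the class's private API):

// tf: occurrences of the term in a chunk; idf computed from chunk counts.
function tfIdfWeight(occurrences: number, totalChunks: number, chunksWithTerm: number): number {
	const idf = chunksWithTerm > 0 ? Math.log(totalChunks / chunksWithTerm) : 0;
	return occurrences * idf;
}

console.log(tfIdfWeight(3, 10, 2)); // 3 * ln(5) ≈ 4.83
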
@@ -263,197 +239,3 @@ export function normalizeTfIdfScores(scores: TfIdfScore[]): NormalizedTfIdfScore
 	return result as TfIdfScore[];
 }
-
-// https://github.com/maxxxxxdlp/porter-stemming
-
-/**
- * TypeScript implementation of the Porter-Stemmer algorithm
- */
-export function stem(raw: string): string {
-	if (raw.length < minLength) { return raw; }
-
-	let word = raw;
-	const firstCharacter = word[0];
-	if (firstCharacter === 'y') { word = firstCharacter.toUpperCase() + word.slice(1); }
-
-	word = steps.reduce((word, step) => step(word), word);
-
-	// Turn initial Y back to y
-	if (firstCharacter === 'y') { word = firstCharacter.toLowerCase() + word.slice(1); }
-	return word;
-}
-
-const minLength = 3;
-
-const vowel = '[aeiouy]';
-const consonant = '[^aeiou]';
-const consonantSequence = `${consonant}[^aeiouy]*`;
-const o = new RegExp(`^${consonantSequence}${vowel}[^aeiouwxy]$`, 'u');
-
-/**
- * Try to match a word against a rule
- */
-const replace =
-	(
-		replacements: Readonly<
-			Record<
-				string,
-				| string
-				| readonly [condition: (word: string) => boolean, replacement: string]
-			>
-		>
-	) =>
-	(word: string): string => {
-		const entries = Object.entries(replacements).sort(
-			([left], [right]) => right.length - left.length
-		);
-		for (const [suffix, replacement] of entries) {
-			if (!word.endsWith(suffix)) { continue; }
-			if (
-				Array.isArray(replacement) &&
-				!replacement[0](word.slice(0, -suffix.length))
-			) { break; }
-			return `${word.slice(0, -suffix.length)}${Array.isArray(replacement) ? replacement[1] : replacement}`;
-		}
-		return word;
-	};
-
-const calculateMeasure = (word: string): number =>
-	sum(
-		Array.from(word.split(''), (_, index) =>
-			!isConsonant(word, index) &&
-			index + 1 < word.length &&
-			isConsonant(word, index + 1)
-				? 1
-				: 0
-		)
-	);
-
-const sum = (array: readonly number[]): number =>
-	array.reduce((sum, value) => sum + value, 0);
-
-const measure =
-	(min: number) =>
-	(word: string): boolean =>
-		calculateMeasure(word) > min;
-
-function isConsonant(word: string, index: number): boolean {
-	const vowels = 'aeiou';
-	if (vowels.includes(word[index])) { return false; }
-	if (word[index] === 'y') { return index === 0 ? true : !isConsonant(word, index - 1); }
-	else { return true; }
-}
-
-const hasVowel = (word: string): boolean =>
-	Array.from(word.split('')).some((_, index) => !isConsonant(word, index));
-
-const steps: readonly ((word: string) => string)[] = [
-	// Step 1a
-	replace({
-		sses: 'ss',
-		ies: 'i',
-		ss: 'ss',
-		s: '',
-	}),
-	// Step 1b
-	(word): string => {
-		if (word.endsWith('eed')) { return replace({ eed: [measure(0), 'ee'] })(word); }
-		const updated = replace({ ed: [hasVowel, ''], ing: [hasVowel, ''] })(word);
-		if (updated === word) { return word; }
-		const replaced = replace({
-			at: 'ate',
-			bl: 'ble',
-			iz: 'ize',
-		})(updated);
-		if (replaced !== updated) { return replaced; }
-		if (
-			replaced.at(-1) === replaced.at(-'dd'.length) &&
-			isConsonant(replaced, replaced.length - 1) &&
-			!['l', 's', 'z'].some((letter) => replaced.endsWith(letter))
-		) { return replaced.slice(0, -1); }
-		if (calculateMeasure(replaced) === 1 && o.test(replaced)) { return `${replaced}e`; }
-		return replaced;
-	},
-	// Step 1c
-	replace({
-		y: [hasVowel, 'i'],
-	}),
-	// Step 2
-	replace({
-		ational: [measure(0), 'ate'],
-		tional: [measure(0), 'tion'],
-		enci: [measure(0), 'ence'],
-		anci: [measure(0), 'ance'],
-		izer: [measure(0), 'ize'],
-		abli: [measure(0), 'able'],
-		alli: [measure(0), 'al'],
-		entli: [measure(0), 'ent'],
-		eli: [measure(0), 'e'],
-		ousli: [measure(0), 'ous'],
-		ization: [measure(0), 'ize'],
-		ation: [measure(0), 'ate'],
-		ator: [measure(0), 'ate'],
-		alism: [measure(0), 'al'],
-		iveness: [measure(0), 'ive'],
-		fulness: [measure(0), 'ful'],
-		ousness: [measure(0), 'ous'],
-		aliti: [measure(0), 'al'],
-		iviti: [measure(0), 'ive'],
-		biliti: [measure(0), 'ble'],
-		logi: [measure(0), 'log'],
-		bli: [measure(0), 'ble'],
-	}),
-	// Step 3
-	replace({
-		icate: [measure(0), 'ic'],
-		ative: [measure(0), ''],
-		alize: [measure(0), 'al'],
-		iciti: [measure(0), 'ic'],
-		ical: [measure(0), 'ic'],
-		ful: [measure(0), ''],
-		ness: [measure(0), ''],
-	}),
-	// Step 4
-	(word): string => {
-		const newWord = replace({
-			al: [measure(1), ''],
-			ance: [measure(1), ''],
-			ence: [measure(1), ''],
-			er: [measure(1), ''],
-			ic: [measure(1), ''],
-			able: [measure(1), ''],
-			ible: [measure(1), ''],
-			ant: [measure(1), ''],
-			ement: [measure(1), ''],
-			ment: [measure(1), ''],
-			ent: [measure(1), ''],
-			ou: [measure(1), ''],
-			ism: [measure(1), ''],
-			ate: [measure(1), ''],
-			iti: [measure(1), ''],
-			ous: [measure(1), ''],
-			ive: [measure(1), ''],
-			ize: [measure(1), ''],
-		})(word);
-		if (newWord !== word) { return newWord; }
-		return (word.endsWith('tion') || word.endsWith('sion')) &&
-			measure(1)(word.slice(0, -'ion'.length))
-			? word.slice(0, -'ion'.length)
-			: word;
-	},
-	// Step 5a
-	(word): string => {
-		if (!word.endsWith('e')) { return word; }
-		const stem = word.slice(0, -1);
-		const measure = calculateMeasure(stem);
-		return measure > 1 || (measure === 1 && !o.test(stem)) ? stem : word;
-	},
-	// Step 5b
-	(word): string =>
-		word.endsWith('ll') && measure(1)(word.slice(0, -1))
-			? word.slice(0, -1)
-			: word,
-];
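
The deleted stem function is a five-step Porter stemmer: each step strips or rewrites a suffix, usually guarded by the stem's "measure" (its count of vowel-consonant sequences). A few canonical Porter examples, assuming the deleted stem above:

console.log(stem('caresses')); // 'caress' (step 1a: sses -> ss)
console.log(stem('ponies'));   // 'poni'   (step 1a: ies -> i)
console.log(stem('running'));  // 'run'    (step 1b: drop -ing, trim double consonant)
console.log(stem('cats'));     // 'cat'    (step 1a: s -> '')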


@@ -163,19 +163,6 @@ suite('TF-IDF Calculator', function () {
 		}
 	});
 
-	test('Should weigh exact match higher than camelCase match', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'catDog'),
-			makeDocument('/B', 'cat cat cat fish'),
-			makeDocument('/C', 'dog dog cat rat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('catDog', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/C', '/B']);
-		}
-	});
-
 	test('Should not match document after delete', () => {
 		const docA = makeDocument('/A', 'cat dog cat');
 		const docB = makeDocument('/B', 'cat fish');
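
permutate itself is not shown in this diff; the tests use it to assert that scoring is independent of document insertion order. A hypothetical helper with that shape (an assumption, not the repository's actual implementation):

// Hypothetical: returns every ordering of the input array.
function permutate<T>(items: readonly T[]): T[][] {
	if (items.length <= 1) {
		return [[...items]];
	}
	const result: T[][] = [];
	for (let i = 0; i < items.length; i++) {
		const rest = [...items.slice(0, i), ...items.slice(i + 1)];
		for (const perm of permutate(rest)) {
			result.push([items[i], ...perm]);
		}
	}
	return result;
}
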
@@ -197,18 +184,6 @@ suite('TF-IDF Calculator', function () {
 		scores = tfidf.calculateScores('cat', CancellationToken.None);
 		assertScoreOrdersEqual(scores, []);
 	});
 
-	test('Should find stemmed words', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'cats'),
-			makeDocument('/B', 'dogs cat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('cats', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/B']);
-		}
-	});
 });
 
 function makeDocument(key: string, content: string | string[]): TfIdfDocument {
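
The body of makeDocument is cut off by the diff; given the TfIdfDocument shape from the first file, a plausible sketch (an assumption, not the file's actual code):

function makeDocument(key: string, content: string | string[]): TfIdfDocument {
	return {
		key,
		// Hypothetical: wrap a single string into a one-chunk document
		textChunks: Array.isArray(content) ? content : [content],
	};
}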