Mirror of https://github.com/microsoft/vscode.git
Revert "Add stemming support to TF-IDF implementation (#193531)"
This reverts commit 2ad860b576.
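The revert returns `TermFrequencies` to plain occurrence counts, makes `splitTerms` yield raw strings instead of weighted `Term` objects, and deletes the bundled Porter stemmer together with its tests. For context, TF-IDF scores a term in a document chunk as its term frequency (tf) times its inverse document frequency (idf). A minimal standalone sketch of that idea; the function name and the smoothed idf formula are illustrative assumptions, not the actual TfIdfCalculator API:

// Sketch only: tf-idf = (occurrences in this chunk) * (rarity across all chunks).
function tfIdf(occurrences: number, totalChunks: number, chunksContainingTerm: number): number {
	// Assumed smoothing; the real computeIdf may differ in detail.
	const idf = chunksContainingTerm > 0
		? Math.log((totalChunks + 1) / chunksContainingTerm)
		: 0;
	return occurrences * idf;
}

// A term appearing 3 times in one of 10 chunks, present in 2 chunks overall:
tfIdf(3, 10, 2); // 3 * ln(11 / 2), roughly 5.11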
@@ -6,19 +6,22 @@
 import { CancellationToken } from 'vs/base/common/cancellation';
 
 type SparseEmbedding = Record</* word */ string, /* weight */number>;
-type TermFrequencies = Map</* word */ string, { occurrences: number; weight: number }>;
+type TermFrequencies = Map</* word */ string, /*occurrences*/ number>;
 type DocumentOccurrences = Map</* word */ string, /*documentOccurrences*/ number>;
 
+function countMapFrom<K>(values: Iterable<K>): Map<K, number> {
+	const map = new Map<K, number>();
+	for (const value of values) {
+		map.set(value, (map.get(value) ?? 0) + 1);
+	}
+	return map;
+}
+
 interface DocumentChunkEntry {
 	readonly text: string;
 	readonly tf: TermFrequencies;
 }
 
-interface Term {
-	readonly term: string;
-	readonly weight: number;
-}
-
 export interface TfIdfDocument {
 	readonly key: string;
 	readonly textChunks: readonly string[];
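The restored `countMapFrom` helper is a generic occurrence counter, which is all `termFrequencies` needs once per-term weights are gone. A quick illustration of its contract:

// Counting duplicate values with the restored helper:
const counts = countMapFrom(['cat', 'dog', 'cat']);
counts.get('cat'); // 2
counts.get('dog'); // 1
counts.get('pig'); // undefined: absent keys are simply never set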
@@ -72,53 +75,26 @@ export class TfIdfCalculator
 	 * Count how many times each term (word) appears in a string.
 	 */
 	private static termFrequencies(input: string): TermFrequencies {
-		const map = new Map<string, { weight: number; occurrences: number }>();
-		for (const value of TfIdfCalculator.splitTerms(input)) {
-			const existing = map.get(value.term);
-			if (existing) {
-				existing.occurrences++;
-			} else {
-				map.set(value.term, { weight: value.weight, occurrences: 1 });
-			}
-		}
-		return map;
+		return countMapFrom(TfIdfCalculator.splitTerms(input));
 	}
 
 	/**
 	 * Break a string into terms (words).
-	 *
-	 * TODO: when we break up a word or generate stems, we likely over-weight its terms.
-	 * For instance, if the document is `cats wear hats` and the user searches `cats wear`, we could end up giving too
-	 * much weight to `cats`, since the document would be broken into `[cats, cat, wear, hats, hat]` while the query
-	 * would be broken into `[cats, cat, wear]`. This means that terms derived from `cats` end up being matched
-	 * multiple times, which isn't really right.
-	 *
-	 * Maybe we need to generate a tree of terms for the document where we stop searching once a match has been found.
 	 */
-	private static *splitTerms(input: string): Iterable<Term> {
+	private static *splitTerms(input: string): Iterable<string> {
 		const normalize = (word: string) => word.toLowerCase();
 
 		// Only match on words that are at least 3 characters long and start with a letter
 		for (const [word] of input.matchAll(/\b\p{Letter}[\p{Letter}\d]{2,}\b/gu)) {
-			yield { term: normalize(word), weight: 1 };
+			yield normalize(word);
 
-			// Include both the original term and the stemmed version
-			const stemmedTerm = stem(word);
-			if (stemmedTerm !== word) {
-				yield { term: normalize(stemmedTerm), weight: 0.75 };
-			}
-
-			const camelParts = word.split(/(?=[A-Z])/g);
+			// eslint-disable-next-line local/code-no-look-behind-regex
+			const camelParts = word.split(/(?<=[a-z])(?=[A-Z])/g);
 			if (camelParts.length > 1) {
 				for (const part of camelParts) {
 					// Require at least 3 letters in the parts of a camel case word
 					if (part.length > 2 && /\p{Letter}{3,}/gu.test(part)) {
-						yield { term: normalize(part), weight: 0.75 };
-
-						const stemmedPart = stem(part);
-						if (stemmedPart !== part && stemmedPart.length > 2) {
-							yield { term: normalize(stemmedPart), weight: 0.5 };
-						}
+						yield normalize(part);
 					}
 				}
 			}
 		}
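Besides dropping the stem yields, the revert swaps the camelCase splitter back to the look-behind form (hence the restored eslint-disable line). The difference shows up with plain `String.prototype.split`; the outputs below are hand-verified:

// Restored splitter: break only at a lowercase-to-uppercase boundary.
'catDog'.split(/(?<=[a-z])(?=[A-Z])/g);      // ['cat', 'Dog']
'HTMLElement'.split(/(?<=[a-z])(?=[A-Z])/g); // ['HTMLElement'] (no boundary matches)

// Removed splitter: break before every capital letter.
'catDog'.split(/(?=[A-Z])/g);                // ['cat', 'Dog']
'HTMLElement'.split(/(?=[A-Z])/g);           // ['H', 'T', 'M', 'L', 'Element']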
@@ -210,7 +186,7 @@ export class TfIdfCalculator
 				idfCache.set(term, chunkIdf);
 			}
 
-			const chunkTfidf = chunkTf.weight * chunkTf.occurrences * chunkIdf;
+			const chunkTfidf = chunkTf * chunkIdf;
 			sum += chunkTfidf * termTfidf;
 		}
 		return sum;
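This is the scoring-side effect of the type change in the first hunk: before the revert, each stored entry carried a count and a weight (1 for exact words, 0.75 for stems and camelCase parts, 0.5 for stemmed camelCase parts), and all three factors were multiplied. A hand-worked comparison with assumed values:

// Assumed values for illustration only.
const chunkIdf = 1.7;       // idf of 'cat' in some corpus
const termTfidf = 0.9;      // query-side tf-idf for 'cat'

// Stemming version: 'cat' stored as { occurrences: 2, weight: 0.75 }
// (it only appeared as a stem of 'cats'):
const weighted = 0.75 * 2 * chunkIdf * termTfidf;   // roughly 2.30

// Reverted version: 'cat' stored as a plain count of 2:
const unweighted = 2 * chunkIdf * termTfidf;        // roughly 3.06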
@@ -233,7 +209,7 @@ export class TfIdfCalculator
 		for (const [word, occurrences] of termFrequencies) {
 			const idf = this.computeIdf(word);
 			if (idf > 0) {
-				embedding[word] = occurrences.weight * occurrences.occurrences * idf;
+				embedding[word] = occurrences * idf;
 			}
 		}
 		return embedding;
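The same simplification lands in the embedding builder. The enclosing method's name sits outside the hunk, so `computeEmbedding` below is a stand-in; the loop's shape follows the restored lines:

// Stand-in sketch of the restored embedding construction. computeIdf is the
// calculator's own idf lookup, stubbed here; Embedding mirrors SparseEmbedding above.
type Embedding = Record</* word */ string, /* weight */ number>;
declare function computeIdf(word: string): number;

function computeEmbedding(termFrequencies: Map<string, number>): Embedding {
	const embedding: Embedding = Object.create(null);
	for (const [word, occurrences] of termFrequencies) {
		const idf = computeIdf(word);
		// Terms with no idf signal are skipped, keeping the embedding sparse.
		if (idf > 0) {
			embedding[word] = occurrences * idf;
		}
	}
	return embedding;
}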
@@ -263,197 +239,3 @@ export function normalizeTfIdfScores(scores: TfIdfScore[]): NormalizedTfIdfScore
 
 	return result as TfIdfScore[];
 }
-
-// https://github.com/maxxxxxdlp/porter-stemming
-
-/**
- * TypeScript implementation of the Porter-Stemmer algorithm
- */
-export function stem(raw: string): string {
-	if (raw.length < minLength) { return raw; }
-
-	let word = raw;
-	const firstCharacter = word[0];
-	if (firstCharacter === 'y') { word = firstCharacter.toUpperCase() + word.slice(1); }
-
-	word = steps.reduce((word, step) => step(word), word);
-
-	// Turn initial Y back to y
-	if (firstCharacter === 'y') { word = firstCharacter.toLowerCase() + word.slice(1); }
-
-	return word;
-}
-
-const minLength = 3;
-const vowel = '[aeiouy]';
-const consonant = '[^aeiou]';
-const consonantSequence = `${consonant}[^aeiouy]*`;
-const o = new RegExp(`^${consonantSequence}${vowel}[^aeiouwxy]$`, 'u');
-
-/**
- * Try to match a word against a rule
- */
-const replace =
-	(
-		replacements: Readonly<
-			Record<
-				string,
-				| string
-				| readonly [condition: (word: string) => boolean, replacement: string]
-			>
-		>
-	) =>
-	(word: string): string => {
-		const entries = Object.entries(replacements).sort(
-			([left], [right]) => right.length - left.length
-		);
-		for (const [suffix, replacement] of entries) {
-			if (!word.endsWith(suffix)) { continue; }
-			if (
-				Array.isArray(replacement) &&
-				!replacement[0](word.slice(0, -suffix.length))
-			) { break; }
-			return `${word.slice(0, -suffix.length)}${Array.isArray(replacement) ? replacement[1] : replacement}`;
-		}
-		return word;
-	};
-
-const calculateMeasure = (word: string): number =>
-	sum(
-		Array.from(word.split(''), (_, index) =>
-			!isConsonant(word, index) &&
-			index + 1 < word.length &&
-			isConsonant(word, index + 1)
-				? 1
-				: 0
-		)
-	);
-
-const sum = (array: readonly number[]): number =>
-	array.reduce((sum, value) => sum + value, 0);
-
-const measure =
-	(min: number) =>
-	(word: string): boolean =>
-		calculateMeasure(word) > min;
-
-function isConsonant(word: string, index: number): boolean {
-	const vowels = 'aeiou';
-	if (vowels.includes(word[index])) { return false; }
-	if (word[index] === 'y') { return index === 0 ? true : !isConsonant(word, index - 1); }
-	else { return true; }
-}
-
-const hasVowel = (word: string): boolean =>
-	Array.from(word.split('')).some((_, index) => !isConsonant(word, index));
-
-const steps: readonly ((word: string) => string)[] = [
-	// Step 1a
-	replace({
-		sses: 'ss',
-		ies: 'i',
-		ss: 'ss',
-		s: '',
-	}),
-	// Step 1b
-	(word): string => {
-		if (word.endsWith('eed')) { return replace({ eed: [measure(0), 'ee'] })(word); }
-		const updated = replace({ ed: [hasVowel, ''], ing: [hasVowel, ''] })(word);
-		if (updated === word) { return word; }
-		const replaced = replace({
-			at: 'ate',
-			bl: 'ble',
-			iz: 'ize',
-		})(updated);
-		if (replaced !== updated) { return replaced; }
-
-		if (
-			replaced.at(-1) === replaced.at(-'dd'.length) &&
-			isConsonant(replaced, replaced.length - 1) &&
-			!['l', 's', 'z'].some((letter) => replaced.endsWith(letter))
-		) { return replaced.slice(0, -1); }
-
-		if (calculateMeasure(replaced) === 1 && o.test(replaced)) { return `${replaced}e`; }
-		return replaced;
-	},
-	// Step 1c
-	replace({
-		y: [hasVowel, 'i'],
-	}),
-	// Step 2
-	replace({
-		ational: [measure(0), 'ate'],
-		tional: [measure(0), 'tion'],
-		enci: [measure(0), 'ence'],
-		anci: [measure(0), 'ance'],
-		izer: [measure(0), 'ize'],
-		abli: [measure(0), 'able'],
-		alli: [measure(0), 'al'],
-		entli: [measure(0), 'ent'],
-		eli: [measure(0), 'e'],
-		ousli: [measure(0), 'ous'],
-		ization: [measure(0), 'ize'],
-		ation: [measure(0), 'ate'],
-		ator: [measure(0), 'ate'],
-		alism: [measure(0), 'al'],
-		iveness: [measure(0), 'ive'],
-		fulness: [measure(0), 'ful'],
-		ousness: [measure(0), 'ous'],
-		aliti: [measure(0), 'al'],
-		iviti: [measure(0), 'ive'],
-		biliti: [measure(0), 'ble'],
-		logi: [measure(0), 'log'],
-		bli: [measure(0), 'ble'],
-	}),
-	// Step 3
-	replace({
-		icate: [measure(0), 'ic'],
-		ative: [measure(0), ''],
-		alize: [measure(0), 'al'],
-		iciti: [measure(0), 'ic'],
-		ical: [measure(0), 'ic'],
-		ful: [measure(0), ''],
-		ness: [measure(0), ''],
-	}),
-	// Step 4
-	(word): string => {
-		const newWord = replace({
-			al: [measure(1), ''],
-			ance: [measure(1), ''],
-			ence: [measure(1), ''],
-			er: [measure(1), ''],
-			ic: [measure(1), ''],
-			able: [measure(1), ''],
-			ible: [measure(1), ''],
-			ant: [measure(1), ''],
-			ement: [measure(1), ''],
-			ment: [measure(1), ''],
-			ent: [measure(1), ''],
-			ou: [measure(1), ''],
-			ism: [measure(1), ''],
-			ate: [measure(1), ''],
-			iti: [measure(1), ''],
-			ous: [measure(1), ''],
-			ive: [measure(1), ''],
-			ize: [measure(1), ''],
-		})(word);
-		if (newWord !== word) { return newWord; }
-		return (word.endsWith('tion') || word.endsWith('sion')) &&
-			measure(1)(word.slice(0, -'ion'.length))
-			? word.slice(0, -'ion'.length)
-			: word;
-	},
-	// Step 5a
-	(word): string => {
-		if (!word.endsWith('e')) { return word; }
-		const stem = word.slice(0, -1);
-		const measure = calculateMeasure(stem);
-		return measure > 1 || (measure === 1 && !o.test(stem)) ? stem : word;
-	},
-	// Step 5b
-	(word): string =>
-		word.endsWith('ll') && measure(1)(word.slice(0, -1))
-			? word.slice(0, -1)
-			: word,
-];
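For reference, the deleted `stem` is a straight Porter implementation, and its step 1a alone explains the plural matching that the removed 'Should find stemmed words' test below relied on. Expected outputs, traced by hand from the deleted code rather than run:

// Traced through the deleted steps (step 1a handles the plurals):
stem('cats');     // 'cat'    ('s' -> '')
stem('dogs');     // 'dog'    ('s' -> '')
stem('caresses'); // 'caress' ('sses' -> 'ss')
stem('ponies');   // 'poni'   ('ies' -> 'i')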
@@ -163,19 +163,6 @@ suite('TF-IDF Calculator', function () {
 		}
 	});
 
-	test('Should weigh exact match higher than camelCase match', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'catDog'),
-			makeDocument('/B', 'cat cat cat fish'),
-			makeDocument('/C', 'dog dog cat rat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('catDog', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/C', '/B']);
-		}
-	});
-
 	test('Should not match document after delete', () => {
 		const docA = makeDocument('/A', 'cat dog cat');
 		const docB = makeDocument('/B', 'cat fish');
@@ -197,18 +184,6 @@ suite('TF-IDF Calculator', function () {
 		scores = tfidf.calculateScores('cat', CancellationToken.None);
 		assertScoreOrdersEqual(scores, []);
 	});
-
-	test('Should find stemmed words', () => {
-		for (const docs of permutate([
-			makeDocument('/A', 'cats'),
-			makeDocument('/B', 'dogs cat'),
-			makeDocument('/D', 'pig'),
-		])) {
-			const tfidf = new TfIdfCalculator().updateDocuments(docs);
-			const scores = tfidf.calculateScores('cats', CancellationToken.None);
-			assertScoreOrdersEqual(scores, ['/A', '/B']);
-		}
-	});
 });
 
 function makeDocument(key: string, content: string | string[]): TfIdfDocument {
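Both deleted tests feed their documents through a `permutate` helper that is not shown in this diff; from its use, it presumably yields every ordering of the input so the score assertions cannot depend on insertion order. A hypothetical implementation with that contract:

// Hypothetical stand-in for the test suite's permutate helper:
// recursively yields every permutation of the input array.
function* permutate<T>(items: readonly T[]): Generator<T[]> {
	if (items.length <= 1) {
		yield [...items];
		return;
	}
	for (let i = 0; i < items.length; i++) {
		const rest = [...items.slice(0, i), ...items.slice(i + 1)];
		for (const tail of permutate(rest)) {
			yield [items[i], ...tail];
		}
	}
}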