Add the ability to specify a list of candidate encodings when guessing encoding (#36951) (#208550)

* Allow configuring a list of candidate encodings to use when guessing the encoding #36951

* Bump the jschardet version to 3.1.2 #36951

* missing merge

* some polish

* renames

* some polish

* some polish

* cleanup

---------

Co-authored-by: Benjamin Pasero <benjamin.pasero@microsoft.com>
This commit is contained in:
yutotnh
2024-06-05 19:23:20 +09:00
committed by GitHub
parent eb7359d330
commit 3ab7948e14
13 changed files with 178 additions and 49 deletions

View File

@@ -49,15 +49,38 @@ const JSCHARDET_TO_ICONV_ENCODINGS: { [name: string]: string } = {
'big5': 'cp950'
};
export function detectEncoding(buffer: Buffer): string | null {
/**
 * Maps candidate guess encoding names (keys) to the encoding names that are
 * handed to jschardet through its `detectEncodings` option (values).
 *
 * `detectEncoding` looks up every entry of `candidateGuessEncodings` in this
 * table and drops the ones that have no entry, so only the encodings listed
 * here can be used to restrict the guess.
 */
const MAP_CANDIDATE_GUESS_ENCODING_TO_JSCHARDET: { [key: string]: string } = {
utf8: 'UTF-8',
utf16le: 'UTF-16LE',
utf16be: 'UTF-16BE',
windows1252: 'windows-1252',
windows1250: 'windows-1250',
iso88592: 'ISO-8859-2',
windows1251: 'windows-1251',
cp866: 'IBM866',
iso88595: 'ISO-8859-5',
koi8r: 'KOI8-R',
windows1253: 'windows-1253',
iso88597: 'ISO-8859-7',
windows1255: 'windows-1255',
iso88598: 'ISO-8859-8',
cp950: 'Big5',
shiftjis: 'SHIFT_JIS',
eucjp: 'EUC-JP',
euckr: 'EUC-KR',
gb2312: 'GB2312'
};
export function detectEncoding(buffer: Buffer, candidateGuessEncodings: string[]): string | null {
const result = detectEncodingByBOM(buffer);
if (result) {
return result;
}
const detected = jschardet.detect(buffer);
candidateGuessEncodings = candidateGuessEncodings.map(e => MAP_CANDIDATE_GUESS_ENCODING_TO_JSCHARDET[e]).filter(e => !!e);
const detected = jschardet.detect(buffer, candidateGuessEncodings.length > 0 ? { detectEncodings: candidateGuessEncodings } : undefined);
if (!detected || !detected.encoding) {
return null;
}

View File

@@ -1233,11 +1233,11 @@ export class Repository {
.filter(entry => !!entry);
}
async bufferString(object: string, encoding: string = 'utf8', autoGuessEncoding = false): Promise<string> {
async bufferString(object: string, encoding: string = 'utf8', autoGuessEncoding = false, candidateGuessEncodings: string[] = []): Promise<string> {
const stdout = await this.buffer(object);
if (autoGuessEncoding) {
encoding = detectEncoding(stdout) || encoding;
encoding = detectEncoding(stdout, candidateGuessEncodings) || encoding;
}
encoding = iconv.encodingExists(encoding) ? encoding : 'utf8';

View File

@@ -1865,13 +1865,14 @@ export class Repository implements Disposable {
const configFiles = workspace.getConfiguration('files', Uri.file(filePath));
const defaultEncoding = configFiles.get<string>('encoding');
const autoGuessEncoding = configFiles.get<boolean>('autoGuessEncoding');
const candidateGuessEncodings = configFiles.get<string[]>('candidateGuessEncodings');
try {
return await this.repository.bufferString(`${ref}:${path}`, defaultEncoding, autoGuessEncoding);
return await this.repository.bufferString(`${ref}:${path}`, defaultEncoding, autoGuessEncoding, candidateGuessEncodings);
} catch (err) {
if (err.gitErrorCode === GitErrorCodes.WrongCase) {
const gitRelativePath = await this.repository.getGitRelativePath(ref, path);
return await this.repository.bufferString(`${ref}:${gitRelativePath}`, defaultEncoding, autoGuessEncoding);
return await this.repository.bufferString(`${ref}:${gitRelativePath}`, defaultEncoding, autoGuessEncoding, candidateGuessEncodings);
}
throw err;