Merge pull request #17454 from Microsoft/ben/17408

Search does not work in UTF-16 LE encoded files (fixes #17408)
This commit is contained in:
Benjamin Pasero
2016-12-19 09:36:39 +01:00
committed by GitHub
3 changed files with 43 additions and 17 deletions

View File

@@ -13,6 +13,18 @@ export const UTF8_with_bom = 'utf8bom';
export const UTF16be = 'utf16be';
export const UTF16le = 'utf16le';
/**
 * Returns the number of leading bytes to treat as a byte order mark (BOM)
 * for the given encoding identifier.
 *
 * @param encoding one of this module's encoding constants.
 * @returns 3 for `UTF8`, 2 for `UTF16be` and `UTF16le`, and 0 for any
 * other value.
 */
export function bomLength(encoding: string): number {
	if (encoding === UTF8) {
		return 3;
	}

	if (encoding === UTF16be || encoding === UTF16le) {
		return 2;
	}

	// Any other encoding is reported as having no BOM bytes to skip.
	return 0;
}
/**
 * Decodes the bytes of `buffer` into a string.
 *
 * The encoding identifier is first mapped through `toNodeEncoding` and the
 * actual decoding is delegated to `iconv.decode`.
 *
 * @param buffer the raw bytes to decode.
 * @param encoding the encoding identifier to decode with.
 * @param options optional settings passed through to `iconv.decode` unchanged.
 * @returns the decoded text.
 */
export function decode(buffer: NodeBuffer, encoding: string, options?: any): string {
	const nodeEncoding = toNodeEncoding(encoding);

	return iconv.decode(buffer, nodeEncoding, options);
}

View File

@@ -15,7 +15,7 @@ import { TPromise } from 'vs/base/common/winjs.base';
import { ISerializedFileMatch } from '../search';
import * as baseMime from 'vs/base/common/mime';
import { ILineMatch } from 'vs/platform/search/common/search';
import { UTF16le, UTF16be, UTF8, UTF8_with_bom, encodingExists, decode } from 'vs/base/node/encoding';
import { UTF16le, UTF16be, UTF8, UTF8_with_bom, encodingExists, decode, bomLength } from 'vs/base/node/encoding';
import { detectMimeAndEncodingFromBuffer } from 'vs/base/node/mime';
import { ISearchWorker, ISearchWorkerSearchArgs, ISearchWorkerSearchResult } from './searchWorkerIpc';
@@ -66,6 +66,9 @@ interface IFileSearchResult {
limitReached?: boolean;
}
// Byte value of the line feed character ('\n').
const LF = 0x0a;
// Byte value of the carriage return character ('\r').
const CR = 0x0d;
export class SearchWorkerEngine {
private nextSearch = TPromise.wrap(null);
private isCanceled = false;
@@ -205,7 +208,7 @@ export class SearchWorkerEngine {
// Detect encoding and mime when this is the beginning of the file
if (isFirstRead) {
let mimeAndEncoding = detectMimeAndEncodingFromBuffer(buffer, bytesRead);
const mimeAndEncoding = detectMimeAndEncodingFromBuffer(buffer, bytesRead);
if (mimeAndEncoding.mimes[mimeAndEncoding.mimes.length - 1] !== baseMime.MIME_TEXT) {
return clb(null); // skip files that seem binary
}
@@ -213,23 +216,34 @@ export class SearchWorkerEngine {
// Check for BOM offset
switch (mimeAndEncoding.encoding) {
case UTF8:
pos = i = 3;
pos = i = bomLength(UTF8);
options.encoding = UTF8;
break;
case UTF16be:
pos = i = 2;
pos = i = bomLength(UTF16be);
options.encoding = UTF16be;
break;
case UTF16le:
pos = i = 2;
pos = i = bomLength(UTF16le);
options.encoding = UTF16le;
break;
}
}
// When we are running with UTF16le, LF and CR are encoded as
// 0A 00 (LF) and 0D 00 (CR). The zero bytes come last due to
// little-endianness. Since we want to split our buffer into
// lines, we need to skip over the 00 byte after each LF and CR,
// so UTF16le gets a multiplier of 2; otherwise we would include
// stray 00 bytes in our resulting buffer.
let byteOffsetMultiplier = 1;
if (options.encoding === UTF16le) {
byteOffsetMultiplier = 2;
}
if (lastBufferHadTraillingCR) {
if (buffer[i] === 0x0a) { // LF (Line Feed)
lineFinished(1);
if (buffer[i] === LF) {
lineFinished(1 * byteOffsetMultiplier);
i++;
} else {
lineFinished(0);
@@ -239,16 +253,16 @@ export class SearchWorkerEngine {
}
for (; i < bytesRead; ++i) {
if (buffer[i] === 0x0a) { // LF (Line Feed)
lineFinished(1);
} else if (buffer[i] === 0x0d) { // CR (Carriage Return)
if (buffer[i] === LF) {
lineFinished(1 * byteOffsetMultiplier);
} else if (buffer[i] === CR) { // CR (Carriage Return)
if (i + 1 === bytesRead) {
lastBufferHadTraillingCR = true;
} else if (buffer[i + 1] === 0x0a) { // LF (Line Feed)
lineFinished(2);
} else if (buffer[i + 1] === LF) {
lineFinished(2 * byteOffsetMultiplier);
i++;
} else {
lineFinished(1);
lineFinished(1 * byteOffsetMultiplier);
}
}
}
@@ -339,7 +353,7 @@ export class LineMatch implements ILineMatch {
}
serialize(): ILineMatch {
let result = {
const result = {
preview: this.preview,
lineNumber: this.lineNumber,
offsetAndLengths: this.offsetAndLengths

View File

@@ -696,7 +696,7 @@ suite('Search', () => {
}
}, () => { }, (error) => {
assert.ok(!error);
assert.equal(c, 2);
assert.equal(c, 3);
done();
});
});
@@ -717,7 +717,7 @@ suite('Search', () => {
}
}, (result) => { }, (error) => {
assert.ok(!error);
assert.equal(c, 748);
assert.equal(c, 776);
done();
});
});
@@ -739,7 +739,7 @@ suite('Search', () => {
}
}, (result) => { }, (error) => {
assert.ok(!error);
assert.equal(c, 366);
assert.equal(c, 394);
done();
});
});