Files
vscode/src/vs/platform/webContentExtractor/electron-main/webPageLoader.ts
T
2025-12-04 11:21:56 -08:00

324 lines
10 KiB
TypeScript

/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import type { BrowserWindow, BrowserWindowConstructorOptions, Event } from 'electron';
import { Queue, raceTimeout, TimeoutTimer } from '../../../base/common/async.js';
import { createSingleCallFunction } from '../../../base/common/functional.js';
import { Disposable, toDisposable } from '../../../base/common/lifecycle.js';
import { equalsIgnoreCase } from '../../../base/common/strings.js';
import { URI } from '../../../base/common/uri.js';
import { generateUuid } from '../../../base/common/uuid.js';
import { ILogService } from '../../log/common/log.js';
import { IWebContentExtractorOptions, WebContentExtractResult } from '../common/webContentExtractor.js';
import { AXNode, convertAXTreeToMarkdown } from './cdpAccessibilityDomain.js';
/**
 * The subset of a debugger `Network.*` message payload that the loader reads.
 * Every field is optional and read-only, so consumers must handle missing values.
 */
type NetworkRequestEventParams = {
    /** Identifier used to correlate the start and the end of a single request. */
    readonly requestId?: string;
    /** The request description; only the URL is modelled here. */
    readonly request?: { url?: string };
    /** The response description; only the status code and status text are modelled here. */
    readonly response?: { status?: number; statusText?: string };
    /** The resource type reported with the message (for example `Document`). */
    readonly type?: string;
};
/**
 * A web page loader that uses Electron to load web pages and extract their content.
 *
 * The page is loaded in a hidden, offscreen window that uses its own session partition.
 * Network activity is observed through the attached debugger, and once the page is
 * considered idle (or a timeout elapses) the accessibility tree is converted to markdown.
 * The loader is single-use: it disposes itself as soon as a result has been reported.
 */
export class WebPageLoader extends Disposable {
    private static readonly TIMEOUT = 30000; // 30 seconds
    private static readonly POST_LOAD_TIMEOUT = 5000; // 5 seconds - increased for dynamic content
    private static readonly FRAME_TIMEOUT = 500; // 0.5 seconds
    private static readonly IDLE_DEBOUNCE_TIME = 500; // 0.5 seconds - wait after last network request

    private readonly _window: BrowserWindow;
    private readonly _debugger: Electron.Debugger;
    /** Identifiers of requests that were sent but have not finished or failed yet. */
    private readonly _requests = new Set<string>();
    /** Serializes content extraction attempts. */
    private readonly _queue = this._register(new Queue());
    /** Timer that forces content extraction when it fires. */
    private readonly _timeout = this._register(new TimeoutTimer());
    /** Debounce timer used to detect that network activity has settled. */
    private readonly _idleDebounceTimer = this._register(new TimeoutTimer());
    /** Result callback; replaced by `load()` with a single-call function. */
    private _onResult = (_result: WebContentExtractResult) => { };
    /** Set once 'did-finish-load' has been received. */
    private _didFinishLoad = false;

    /**
     * @param browserWindowFactory Creates the window that is used to load the page.
     * @param _logger Receives trace output.
     * @param _uri The page to load.
     * @param _options Optional extraction options; `followRedirects` disables the redirect guard.
     */
    constructor(
        browserWindowFactory: (options: BrowserWindowConstructorOptions) => BrowserWindow,
        private readonly _logger: ILogService,
        private readonly _uri: URI,
        private readonly _options?: IWebContentExtractorOptions,
    ) {
        super();

        this._window = browserWindowFactory({
            width: 800,
            height: 600,
            show: false,
            webPreferences: {
                partition: generateUuid(), // do not share any state with the default renderer session
                javascript: true,
                offscreen: true,
                sandbox: true,
                webgl: false,
            }
        });
        this._register(toDisposable(() => this._window.destroy()));

        this._debugger = this._window.webContents.debugger;
        this._debugger.attach('1.1');
        this._debugger.on('message', this.onDebugMessage.bind(this));

        // NOTE(review): per the Electron docs 'did-fail-load' carries an `isMainFrame` argument and
        // may therefore also be emitted for sub-frames; confirm whether those should be ignored.
        this._window.webContents
            .once('did-start-loading', this.onStartLoading.bind(this))
            .once('did-finish-load', this.onFinishLoad.bind(this))
            .once('did-fail-load', this.onFailLoad.bind(this))
            // Every navigation must be inspected, not only the first one: otherwise a chain such as
            // same-authority redirect -> cross-authority redirect would slip past the redirect guard.
            // The handler ignores events once the loader is disposed.
            .on('will-navigate', this.onRedirect.bind(this))
            .on('will-redirect', this.onRedirect.bind(this));

        // Disable any UI interactions that could interfere with content loading.
        this._window.webContents
            .on('login', (event) => event.preventDefault())
            .on('select-client-certificate', (event) => event.preventDefault())
            .on('certificate-error', (event) => event.preventDefault());
    }

    /**
     * Writes a trace message prefixed with the loader name and the target URI.
     */
    private trace(message: string) {
        this._logger.trace(`[WebPageLoader] [${this._uri}] ${message}`);
    }

    /**
     * Loads the web page and extracts its content.
     *
     * @returns A promise that resolves with the first reported result. It never rejects;
     * failures are reported as results with status `error`. The loader disposes itself
     * right after resolving.
     */
    public async load() {
        return await new Promise<WebContentExtractResult>((resolve) => {
            // Only the first reported result wins; later calls are ignored.
            this._onResult = createSingleCallFunction((result) => {
                switch (result.status) {
                    case 'ok':
                        this.trace(`Loaded web page content, status: ${result.status}, title: '${result.title}', length: ${result.result.length}`);
                        break;
                    case 'redirect':
                        this.trace(`Loaded web page content, status: ${result.status}, toURI: ${result.toURI}`);
                        break;
                    case 'error':
                        this.trace(`Loaded web page content, status: ${result.status}, code: ${result.statusCode}, error: '${result.error}', title: '${result.title}', length: ${result.result?.length ?? 0}`);
                        break;
                }

                const content = result.status !== 'redirect' ? result.result : undefined;
                if (content !== undefined) {
                    this.trace(content.length < 200 ? `Extracted content: '${content}'` : `Extracted content preview: '${content.substring(0, 200)}...'`);
                }

                resolve(result);
                this.dispose();
            });

            this.trace(`Loading web page content`);
            void this._window.loadURL(this._uri.toString(true)).catch(() => {
                // The returned promise rejects when the load fails or is aborted (for example when
                // a redirect is prevented or the window is destroyed). Those outcomes are handled
                // by the event handlers, so the rejection is swallowed here to avoid an unhandled
                // promise rejection.
            });
            this.setTimeout(WebPageLoader.TIMEOUT);
        });
    }

    /**
     * Sets a timeout to trigger content extraction regardless of current loading state.
     * Any previously scheduled timeout is replaced.
     *
     * @param time Delay in milliseconds.
     */
    private setTimeout(time: number) {
        if (this._store.isDisposed) {
            return;
        }

        this.trace(`Setting page load timeout to ${time} ms`);
        this._timeout.cancelAndSet(() => {
            this.trace(`Page load timeout reached`);
            void this._queue.queue(() => this.extractContent());
        }, time);
    }

    /**
     * Handles the 'did-start-loading' event, enabling network tracking.
     */
    private onStartLoading() {
        if (this._store.isDisposed) {
            return;
        }

        this.trace(`Received 'did-start-loading' event`);
        void this._debugger.sendCommand('Network.enable').catch(() => {
            // This throws when we destroy the window on redirect.
        });
    }

    /**
     * Handles the 'did-finish-load' event, checking for idle state
     * and updating timeout to allow for post-load activities.
     */
    private onFinishLoad() {
        if (this._store.isDisposed) {
            return;
        }

        this.trace(`Received 'did-finish-load' event`);
        this._didFinishLoad = true;
        this.scheduleIdleCheck();
        // Replace the initial load timeout with the shorter post-load timeout.
        this.setTimeout(WebPageLoader.POST_LOAD_TIMEOUT);
    }

    /**
     * Handles the 'did-fail-load' event, reporting load failures.
     *
     * @param _event The Electron event (unused).
     * @param statusCode The error code reported with the event.
     * @param error The error description reported with the event.
     */
    private onFailLoad(_event: Event, statusCode: number, error: string) {
        if (this._store.isDisposed) {
            return;
        }

        this.trace(`Received 'did-fail-load' event, code: ${statusCode}, error: '${error}'`);
        if (statusCode === -3) {
            // An aborted load is not reported as an error; whatever was loaded is extracted instead.
            this.trace(`Ignoring ERR_ABORTED (-3) as it may be caused by CSP or other measures`);
            void this._queue.queue(() => this.extractContent());
        } else {
            void this._queue.queue(() => this.extractContent({ status: 'error', statusCode, error }));
        }
    }

    /**
     * Handles the 'will-navigate' and 'will-redirect' events, managing redirects.
     *
     * Unless `followRedirects` is set, a navigation whose authority differs from the
     * requested URI (compared case-insensitively) is cancelled and reported as a
     * `redirect` result.
     *
     * @param event The Electron event; cancelled when the redirect is not followed.
     * @param url The navigation target.
     */
    private onRedirect(event: Event, url: string) {
        if (this._store.isDisposed) {
            return;
        }

        this.trace(`Received 'will-navigate' or 'will-redirect' event, url: ${url}`);
        if (!this._options?.followRedirects) {
            const toURI = URI.parse(url);
            if (!equalsIgnoreCase(toURI.authority, this._uri.authority)) {
                event.preventDefault();
                this._onResult({ status: 'redirect', toURI });
            }
        }
    }

    /**
     * Handles debugger messages related to network requests, tracking their lifecycle.
     * @note DO NOT add logging to this function, microsoft.com will freeze when too many logs are generated
     *
     * @param _event The Electron event (unused).
     * @param method The name of the debugger message.
     * @param params The message payload.
     */
    private onDebugMessage(_event: Event, method: string, params: NetworkRequestEventParams) {
        if (this._store.isDisposed) {
            return;
        }

        const { requestId, type, response } = params;
        switch (method) {
            case 'Network.requestWillBeSent':
                if (requestId !== undefined) {
                    this._requests.add(requestId);
                    // New activity: a pending idle check is no longer valid.
                    this._idleDebounceTimer.cancel();
                }
                break;
            case 'Network.loadingFinished':
            case 'Network.loadingFailed':
                if (requestId !== undefined) {
                    this._requests.delete(requestId);
                    // Idle detection only starts after the page reported that it finished loading.
                    if (this._requests.size === 0 && this._didFinishLoad) {
                        this.scheduleIdleCheck();
                    }
                }
                break;
            case 'Network.responseReceived':
                // NOTE(review): the type 'Document' may also match documents loaded in sub-frames;
                // confirm whether an error response of an embedded frame should fail the whole page.
                if (type === 'Document') {
                    const statusCode = response?.status ?? 0;
                    if (statusCode >= 400) {
                        // `||` is intentional: an empty status text falls back to the generic message.
                        const error = response?.statusText || `HTTP error ${statusCode}`;
                        void this._queue.queue(() => this.extractContent({ status: 'error', statusCode, error }));
                    }
                }
                break;
        }
    }

    /**
     * Schedules an idle check after a debounce period to allow for bursts of network activity.
     * If idle is detected, proceeds to extract content.
     */
    private scheduleIdleCheck() {
        if (this._store.isDisposed) {
            return;
        }

        this._idleDebounceTimer.cancelAndSet(async () => {
            if (this._store.isDisposed) {
                return;
            }

            await this.nextFrame();

            // A result may have been reported (and the loader disposed) while waiting for
            // the frame; do not touch the disposed queue in that case.
            if (this._store.isDisposed) {
                return;
            }

            if (this._requests.size === 0) {
                void this._queue.queue(() => this.extractContent());
            } else {
                this.trace(`New network requests detected, deferring content extraction`);
            }
        }, WebPageLoader.IDLE_DEBOUNCE_TIME);
    }

    /**
     * Waits for a rendering frame to ensure the page had a chance to update.
     * Gives up waiting after `FRAME_TIMEOUT` milliseconds.
     */
    private async nextFrame() {
        if (this._store.isDisposed) {
            return;
        }

        // Wait for a rendering frame to ensure the page had a chance to update.
        await raceTimeout(
            new Promise<void>((resolve) => {
                try {
                    this.trace(`Waiting for a frame to be rendered`);
                    this._window.webContents.beginFrameSubscription(false, () => {
                        try {
                            this.trace(`A frame has been rendered`);
                            // A single frame is enough; stop the subscription again.
                            this._window.webContents.endFrameSubscription();
                        } catch {
                            // ignore errors
                        }
                        resolve();
                    });
                } catch {
                    // ignore errors
                    resolve();
                }
            }),
            WebPageLoader.FRAME_TIMEOUT
        );
    }

    /**
     * Extracts the content of the loaded web page using the Accessibility domain and reports the result.
     *
     * @param errorResult When provided, the extracted content and title are attached to this
     * error result instead of reporting an `ok` result. If extraction itself fails, the error
     * result is reported unchanged.
     */
    private async extractContent(errorResult?: WebContentExtractResult & { status: 'error' }) {
        if (this._store.isDisposed) {
            return;
        }

        try {
            this.trace(`Extracting content using Accessibility domain`);
            const title = this._window.webContents.getTitle();
            const { nodes } = await this._debugger.sendCommand('Accessibility.getFullAXTree') as { nodes: AXNode[] };
            const result = convertAXTreeToMarkdown(this._uri, nodes);

            if (errorResult !== undefined) {
                this._onResult({ ...errorResult, result, title });
            } else {
                this._onResult({ status: 'ok', result, title });
            }
        } catch (e) {
            if (errorResult !== undefined) {
                this._onResult(errorResult);
            } else {
                this._onResult({
                    status: 'error',
                    error: e instanceof Error ? e.message : String(e)
                });
            }
        }
    }
}