// Files
// vscode/scripts/chat-simulation/test-chat-perf-regression.js
// 2026-04-21 19:11:30 -07:00
// 1936 lines, 84 KiB, JavaScript
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
// @ts-check
/**
* Chat performance benchmark.
*
* Uses the real copilot extension with IS_SCENARIO_AUTOMATION=1 and a local
* mock LLM server. Measures the full stack: prompt building, context
* gathering, tool resolution, rendering, GC, and layout overhead.
*
* Usage:
* npm run perf:chat # all scenarios vs 1.115.0
* npm run perf:chat -- --runs 10 # 10 runs per scenario
* npm run perf:chat -- --scenario text-only # single scenario
* npm run perf:chat -- --no-baseline # skip baseline comparison
* npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0
* npm run perf:chat -- --resume .chat-simulation-data/2026-04-14/results.json --runs 3
*/
const path = require('path');
const fs = require('fs');
const {
ROOT, DATA_DIR, METRIC_DEFS, loadConfig,
resolveBuild, isVersionString, buildEnv, buildArgs, prepareRunDir,
robustStats, welchTTest, summarize, markDuration, launchVSCode,
getNextExtHostInspectPort, connectToExtHostInspector, getRepoRoot,
} = require('./common/utils');
const { getUserTurns, getScenarioIds } = require('./common/mock-llm-server');
const { registerPerfScenarios, getScenarioDescription } = require('./common/perf-scenarios');
// -- Config (edit config.jsonc to change defaults) ---------------------------
// Loads the `perfRegression` section of config.jsonc. Individual values can
// still be overridden per invocation via the CLI flags parsed below.
const CONFIG = loadConfig('perfRegression');
// -- CLI args ----------------------------------------------------------------
/**
 * Parse CLI flags into the benchmark options object.
 *
 * Defaults come from config.jsonc (the `perfRegression` section, via CONFIG)
 * and may be overridden per invocation. Prints usage and exits with code 0 on
 * --help; exits with code 1 on invalid input: non-numeric --runs/--threshold,
 * unknown scenario ids, malformed key=value settings, or --save-baseline
 * without --baseline.
 */
function parseArgs() {
	const args = process.argv.slice(2);
	const opts = {
		runs: CONFIG.runsPerScenario ?? 5,
		verbose: false,
		ci: false,
		noCache: false,
		force: false,
		heapSnapshots: false,
		/** @type {string[]} */
		scenarios: [],
		/** @type {string | undefined} */
		build: undefined,
		/** @type {string | undefined} */
		baseline: undefined,
		/** @type {string | undefined} */
		baselineBuild: CONFIG.baselineBuild ?? '1.115.0',
		saveBaseline: false,
		threshold: CONFIG.regressionThreshold ?? 0.2,
		/** @type {Record<string, number | string>} */
		metricThresholds: CONFIG.metricThresholds ?? {},
		/** @type {string | undefined} */
		resume: undefined,
		productionBuild: false,
		/** @type {Record<string, any>} */
		settingsOverrides: {},
		/** @type {Record<string, any>} */
		testSettingsOverrides: {},
		/** @type {Record<string, any>} */
		baselineSettingsOverrides: {},
		cleanupDiagnostics: false,
	};
	for (let i = 0; i < args.length; i++) {
		switch (args[i]) {
			case '--runs':
				opts.runs = parseInt(args[++i], 10);
				// Fail fast on `--runs abc` or a missing value — a NaN (or 0)
				// run count would silently skip every benchmark iteration.
				if (!Number.isInteger(opts.runs) || opts.runs <= 0) {
					console.error(`--runs requires a positive integer, got: ${args[i]}`);
					process.exit(1);
				}
				break;
			case '--verbose': opts.verbose = true; break;
			case '--scenario': case '-s': opts.scenarios.push(args[++i]); break;
			case '--build': case '-b': opts.build = args[++i]; break;
			case '--baseline': opts.baseline = args[++i]; break;
			case '--baseline-build': opts.baselineBuild = args[++i]; break;
			case '--no-baseline': opts.baselineBuild = undefined; break;
			case '--save-baseline': opts.saveBaseline = true; break;
			case '--threshold':
				opts.threshold = parseFloat(args[++i]);
				// Fail fast on a non-numeric threshold — NaN comparisons would
				// make every regression check silently pass.
				if (!Number.isFinite(opts.threshold)) {
					console.error(`--threshold requires a number, got: ${args[i]}`);
					process.exit(1);
				}
				break;
			case '--resume': opts.resume = args[++i]; break;
			case '--production-build': opts.productionBuild = true; break;
			case '--setting': case '--test-setting': case '--baseline-setting': {
				const kv = args[++i];
				const eq = kv.indexOf('=');
				if (eq === -1) { console.error(`${args[i - 1]} requires key=value, got: ${kv}`); process.exit(1); }
				const key = kv.slice(0, eq);
				const raw = kv.slice(eq + 1);
				// Parse booleans and numbers, keep rest as strings
				const val = raw === 'true' ? true : raw === 'false' ? false : /^-?\d+(\.\d+)?$/.test(raw) ? Number(raw) : raw;
				const flag = args[i - 1];
				if (flag === '--test-setting') { opts.testSettingsOverrides[key] = val; }
				else if (flag === '--baseline-setting') { opts.baselineSettingsOverrides[key] = val; }
				else { opts.settingsOverrides[key] = val; }
				break;
			}
			case '--no-cache': opts.noCache = true; break;
			case '--force': opts.force = true; break;
			case '--heap-snapshots': opts.heapSnapshots = true; break;
			case '--ci': opts.ci = true; opts.noCache = true; opts.heapSnapshots = true; opts.cleanupDiagnostics = true; break;
			case '--cleanup-diagnostics': opts.cleanupDiagnostics = true; break;
			case '--help': case '-h':
				console.log([
					'Chat performance benchmark',
					'',
					'Options:',
					' --runs <n> Number of runs per scenario (default: 5)',
					' --scenario <id> Scenario to run (repeatable; default: all)',
					' --build <path|ver> Path to VS Code build, or a version to download',
					' (e.g. "1.110.0", "insiders", commit hash, or local path)',
					' --baseline <path> Compare against a baseline JSON file',
					' --baseline-build <v> Version or path to benchmark as baseline',
					' (e.g. "1.115.0", "insiders", commit hash, or local path)',
					' --no-baseline Skip baseline comparison entirely',
					' --save-baseline Save results as the new baseline (requires --baseline <path>)',
					' --resume <path> Resume a previous run, adding more iterations to increase',
					' confidence. Merges new runs with existing rawRuns data',
					' --threshold <frac> Regression threshold fraction (default: 0.2 = 20%)',
					' --production-build Build a local bundled package (via gulp vscode) for',
					' apples-to-apples comparison against a release baseline',
					' --setting <k=v> Set a VS Code setting override for all builds (repeatable)',
					' --test-setting <k=v> Set a VS Code setting override for test build only',
					' --baseline-setting <k=v> Set a VS Code setting override for baseline build only',
					' e.g. --setting chat.experimental.incrementalRendering.enabled=true',
					' --no-cache Ignore cached baseline data, always run fresh',
					' --force Skip build mode mismatch confirmation',
					' --heap-snapshots Take heap snapshots (slow; auto-enabled in --ci mode)',
					' --ci CI mode: write Markdown summary to ci-summary.md (implies --no-cache, --heap-snapshots, --cleanup-diagnostics)',
					' --cleanup-diagnostics Remove heap snapshots, CPU profiles, and traces after each run to save disk space',
					' --verbose Print per-run details',
					'',
					'Scenarios: ' + getScenarioIds().join(', '),
				].join('\n'));
				process.exit(0);
		}
	}
	// --save-baseline writes results back to the --baseline path, so it is
	// meaningless without one (this enforces what the --help text promises).
	if (opts.saveBaseline && !opts.baseline) {
		console.error('--save-baseline requires --baseline <path>');
		process.exit(1);
	}
	// Default to all scenarios; otherwise reject ids the mock server doesn't know.
	if (opts.scenarios.length === 0) {
		opts.scenarios = getScenarioIds();
	} else {
		const knownIds = new Set(getScenarioIds());
		const unknown = opts.scenarios.filter(s => !knownIds.has(s));
		if (unknown.length > 0) {
			console.error(`Unknown scenario(s): ${unknown.join(', ')}\nAvailable: ${[...knownIds].join(', ')}`);
			process.exit(1);
		}
	}
	return opts;
}
// -- Build mode detection ----------------------------------------------------
/**
 * Classify an Electron executable path into a build mode based on where the
 * build lives on disk. Paths under `.vscode-test` are downloaded release
 * builds; paths containing `VSCode-` are locally packaged production builds;
 * anything else is treated as a development (unbundled) build.
 * @param {string} electronPath
 * @returns {'dev' | 'production' | 'release'}
 */
function detectBuildMode(electronPath) {
	// Check order matters: a downloaded release wins over the VSCode- pattern.
	if (electronPath.includes('.vscode-test')) {
		return 'release';
	}
	return electronPath.includes('VSCode-') ? 'production' : 'dev';
}
/**
 * Return a human-readable label for a build mode.
 * @param {'dev' | 'production' | 'release'} mode
 * @returns {string}
 */
function buildModeLabel(mode) {
	// Simple lookup table; unknown modes fall through to undefined, matching
	// a switch with no default branch.
	const labels = {
		dev: 'development (unbundled)',
		production: 'production (bundled, local)',
		release: 'release (bundled, downloaded)',
	};
	return labels[mode];
}
// -- Production build --------------------------------------------------------
/**
 * Build a local production (bundled) VS Code package using `gulp vscode` and
 * return the path to the Electron executable inside the packaged output.
 *
 * The gulp task compiles TypeScript, bundles JS, and packages with Electron
 * into `../VSCode-<platform>-<arch>/`. This is the same process used for
 * release builds, minus minification and mangling.
 *
 * Exits the process with code 1 when no executable is produced.
 * @returns {string} absolute path to the packaged Electron executable
 */
function buildProductionBuild() {
	const { execSync } = require('child_process');
	const product = require(path.join(ROOT, 'product.json'));
	const { platform, arch } = process;
	const destDir = path.join(ROOT, '..', `VSCode-${platform}-${arch}`);
	console.log('[chat-simulation] Building local production package (gulp vscode)...');
	console.log('[chat-simulation] This may take a few minutes on the first run.');
	try {
		execSync('npm run gulp -- vscode', {
			cwd: ROOT,
			stdio: 'inherit',
			timeout: 10 * 60 * 1000, // 10 minute timeout
		});
	} catch {
		// The copilot shim step may fail locally when the copilot SDK is not
		// fully packaged (it is normally supplied via CI). As long as the
		// Electron executable was produced we can still benchmark.
		console.warn('[chat-simulation] gulp vscode exited with errors (see above). Checking if executable was still produced...');
	}
	// Per-platform location of the Electron executable inside the package.
	const electronPath = platform === 'darwin'
		? path.join(destDir, `${product.nameLong}.app`, 'Contents', 'MacOS', product.nameShort)
		: platform === 'linux'
			? path.join(destDir, product.applicationName)
			: path.join(destDir, `${product.nameShort}.exe`);
	if (!fs.existsSync(electronPath)) {
		console.error(`[chat-simulation] Production build failed — executable not found at: ${electronPath}`);
		process.exit(1);
	}
	// Merge product.overrides.json into the packaged product.json.
	// The overrides file contains extensionsGallery and other config that
	// the OSS product.json lacks. In dev builds these are loaded at
	// runtime when VSCODE_DEV is set, but the production build doesn't
	// set that flag so we bake them in.
	const overridesPath = path.join(ROOT, 'product.overrides.json');
	if (fs.existsSync(overridesPath)) {
		const appDir = platform === 'darwin'
			? path.join(destDir, `${product.nameLong}.app`, 'Contents', 'Resources', 'app')
			: path.join(destDir, 'resources', 'app');
		const packagedProductPath = path.join(appDir, 'product.json');
		if (fs.existsSync(packagedProductPath)) {
			// Shallow merge: override keys win over the packaged values.
			const merged = Object.assign(
				JSON.parse(fs.readFileSync(packagedProductPath, 'utf-8')),
				JSON.parse(fs.readFileSync(overridesPath, 'utf-8')),
			);
			fs.writeFileSync(packagedProductPath, JSON.stringify(merged, null, '\t'));
			console.log('[chat-simulation] Merged product.overrides.json into packaged product.json');
		}
	}
	console.log(`[chat-simulation] Production build ready: ${electronPath}`);
	return electronPath;
}
/**
 * @typedef {{ type: 'fraction', value: number } | { type: 'absolute', value: number }} MetricThreshold
 */
/**
 * Parse a metric threshold value from config.
 * - A number is treated as a fraction (e.g. 0.2 = 20%).
 * - A string like "100ms" or "5" is treated as an absolute delta.
 * @param {number | string} raw
 * @returns {MetricThreshold}
 * @throws {Error} when a string value has no leading numeric part
 */
function parseMetricThreshold(raw) {
	if (typeof raw === 'number') {
		return { type: 'fraction', value: raw };
	}
	// parseFloat reads the leading number and ignores any unit suffix (ms, MB, …)
	const value = parseFloat(raw);
	if (Number.isNaN(value)) {
		throw new Error(`Invalid metric threshold: ${raw}`);
	}
	return { type: 'absolute', value };
}
/**
 * Get the regression threshold for a specific metric.
 * Uses the per-metric override from config if available, otherwise falls back
 * to the global fractional threshold.
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string> }} opts
 * @param {string} metric
 * @returns {MetricThreshold}
 */
function getMetricThreshold(opts, metric) {
	const override = opts.metricThresholds?.[metric];
	return override === undefined
		? { type: 'fraction', value: opts.threshold }
		: parseMetricThreshold(override);
}
/**
 * Check whether a change exceeds the threshold.
 * Absolute thresholds compare the raw delta; fractional thresholds compare
 * the relative change.
 * @param {MetricThreshold} threshold
 * @param {number} change - fractional change (e.g. 0.5 = 50% increase)
 * @param {number} absoluteDelta - absolute difference (cur.median - bas.median)
 * @returns {boolean}
 */
function exceedsThreshold(threshold, change, absoluteDelta) {
	const measured = threshold.type === 'absolute' ? absoluteDelta : change;
	return measured > threshold.value;
}
// -- Metrics -----------------------------------------------------------------
/**
* @typedef {{
* timeToUIUpdated: number,
* timeToFirstToken: number,
* timeToComplete: number,
* timeToRenderComplete: number,
* instructionCollectionTime: number,
* agentInvokeTime: number,
* heapUsedBefore: number,
* heapUsedAfter: number,
* heapDelta: number,
* heapDeltaPostGC: number,
* majorGCs: number,
* minorGCs: number,
* gcDurationMs: number,
* layoutCount: number,
* layoutDurationMs: number,
* recalcStyleCount: number,
* forcedReflowCount: number,
* longTaskCount: number,
* longAnimationFrameCount: number,
* longAnimationFrameTotalMs: number,
* frameCount: number,
* compositeLayers: number,
* paintCount: number,
* hasInternalMarks: boolean,
* responseHasContent: boolean,
* internalFirstToken: number,
* profilePath: string,
* tracePath: string,
* snapshotPath: string,
* extHostHeapUsedBefore: number,
* extHostHeapUsedAfter: number,
* extHostHeapDelta: number,
* extHostHeapDeltaPostGC: number,
* extHostProfilePath: string,
* extHostSnapshotPath: string,
* }} RunMetrics
*/
// -- Single run --------------------------------------------------------------
/**
* @param {string} electronPath
* @param {string} scenario
* @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise<void>, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise<void> }} mockServer
* @param {boolean} verbose
* @param {string} runIndex
* @param {string} runDir - timestamped run directory for diagnostics
* @param {'baseline' | 'test'} role - whether this is a baseline or test run
* @param {Record<string, any>} [settingsOverrides] - custom VS Code settings
* @param {{ heapSnapshots?: boolean }} [runOpts] - additional run options
* @returns {Promise<RunMetrics>}
*/
async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, runDir, role, settingsOverrides, runOpts) {
const takeHeapSnapshots = runOpts?.heapSnapshots ?? false;
const { userDataDir, extDir, logsDir } = prepareRunDir(runIndex, mockServer, settingsOverrides);
const isDevBuild = !electronPath.includes('.vscode-test') && !electronPath.includes('VSCode-');
// Extract a clean build label from the path.
// Dev: .build/electron/Code - OSS.app/.../Code - OSS → "dev"
// Stable: .vscode-test/vscode-darwin-arm64-1.115.0/Visual Studio Code.app/.../Electron → "1.115.0"
// Production: ../VSCode-darwin-arm64/Code - OSS.app/.../Code - OSS → "production"
let buildLabel = 'dev';
if (!isDevBuild) {
const vscodeTestMatch = electronPath.match(/vscode-test\/vscode-[^/]*?-(\d+\.\d+\.\d+)/);
if (vscodeTestMatch) {
buildLabel = vscodeTestMatch[1];
} else if (electronPath.includes('VSCode-')) {
buildLabel = 'production';
} else {
buildLabel = path.basename(electronPath);
}
}
// For dev builds from a different repo, derive the repo root from the
// electron path so that the build loads its own out/ source code.
const appRoot = isDevBuild ? (getRepoRoot(electronPath) || ROOT) : ROOT;
if (isDevBuild && appRoot !== ROOT) {
if (verbose) {
console.log(` [debug] Using appRoot from electron path: ${appRoot}`);
}
}
// Create a per-run diagnostics directory: <runDir>/<role>-<build>/<scenario>-<i>/
const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, ''));
fs.mkdirSync(runDiagDir, { recursive: true });
const tracePath = path.join(runDiagDir, 'trace.json');
const extHostInspectPort = getNextExtHostInspectPort();
const vscode = await launchVSCode(
electronPath,
buildArgs(userDataDir, extDir, logsDir, { isDevBuild, extHostInspectPort, traceFile: tracePath, appRoot }),
buildEnv(mockServer, { isDevBuild }),
{ verbose },
);
activeVSCode = vscode;
const window = vscode.page;
// Declared outside try so the finally block can clean up
/** @type {{ send: (method: string, params?: any) => Promise<any>, on: (event: string, listener: (params: any) => void) => void, close: () => void } | null} */
let extHostInspector = null;
/** @type {{ usedSize: number, totalSize: number } | null} */
let extHostHeapBefore = null;
/** @type {Omit<RunMetrics, 'majorGCs' | 'minorGCs' | 'gcDurationMs' | 'longTaskCount' | 'longAnimationFrameCount' | 'longAnimationFrameTotalMs' | 'timeToUIUpdated' | 'timeToFirstToken' | 'timeToComplete' | 'timeToRenderComplete' | 'layoutDurationMs' | 'instructionCollectionTime' | 'agentInvokeTime' | 'hasInternalMarks' | 'internalFirstToken'> | null} */
let partialMetrics = null;
// Timing vars hoisted for access in post-close trace parsing
let submitTime = 0;
let firstResponseTime = 0;
let responseCompleteTime = 0;
let renderCompleteTime = 0;
try {
await window.waitForSelector('.monaco-workbench', { timeout: 60_000 });
const cdp = await window.context().newCDPSession(window);
await cdp.send('Performance.enable');
const heapBefore = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
const metricsBefore = await cdp.send('Performance.getMetrics');
// Open chat
const chatShortcut = process.platform === 'darwin' ? 'Control+Meta+KeyI' : 'Control+Alt+KeyI';
await window.keyboard.press(chatShortcut);
const CHAT_VIEW = 'div[id="workbench.panel.chat"]';
const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`;
await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 });
await window.waitForFunction(
(selector) => Array.from(document.querySelectorAll(selector)).some(el => {
const rect = el.getBoundingClientRect();
return rect.width > 0 && rect.height > 0;
}),
chatEditorSel, { timeout: 15_000 },
);
// Dismiss dialogs
const dismissDialog = async () => {
for (const sel of ['.chat-setup-dialog', '.dialog-shadow', '.monaco-dialog-box']) {
const el = await window.$(sel);
if (el) { await window.keyboard.press('Escape'); await new Promise(r => setTimeout(r, 500)); break; }
}
};
await dismissDialog();
// Wait for extension activation
const reqsBefore = mockServer.requestCount();
try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { }
if (verbose) {
console.log(` [debug] Extension active (${mockServer.requestCount() - reqsBefore} new requests)`);
}
// Connect to extension host inspector for profiling/heap data
try {
extHostInspector = await connectToExtHostInspector(extHostInspectPort, { verbose, timeoutMs: 15_000 });
await extHostInspector.send('HeapProfiler.enable');
await extHostInspector.send('Profiler.enable');
await extHostInspector.send('Profiler.start');
extHostHeapBefore = await extHostInspector.send('Runtime.getHeapUsage');
if (verbose && extHostHeapBefore) {
console.log(` [ext-host] Heap before: ${Math.round(extHostHeapBefore.usedSize / 1024 / 1024)}MB`);
}
} catch (err) {
if (verbose) {
console.log(` [ext-host] Could not connect to inspector: ${err}`);
}
}
// Wait for model resolution
await new Promise(r => setTimeout(r, 3000));
await dismissDialog();
// Focus input
await window.click(chatEditorSel);
const focusStart = Date.now();
while (Date.now() - focusStart < 5_000) {
const focused = await window.evaluate((sel) => {
const el = document.querySelector(sel);
return el && (el.classList.contains('focused') || el.contains(document.activeElement));
}, chatEditorSel).catch(() => false);
if (focused) { break; }
await new Promise(r => setTimeout(r, 50));
}
// Type message — use the smoke-test driver's typeInEditor when available
// (dev builds), fall back to pressSequentially for stable/insiders builds.
const chatMessage = `[scenario:${scenario}] Explain how this code works`;
const actualInputSelector = await window.evaluate((editorSel) => {
const editor = document.querySelector(editorSel);
if (!editor) { throw new Error('Chat editor not found'); }
return editor.querySelector('.native-edit-context') ? editorSel + ' .native-edit-context' : editorSel + ' textarea';
}, chatEditorSel);
const hasDriver = await window.evaluate(() =>
// @ts-ignore
!!globalThis.driver?.typeInEditor
).catch(() => false);
if (hasDriver) {
await window.evaluate(({ selector, text }) => {
// @ts-ignore
return globalThis.driver.typeInEditor(selector, text);
}, { selector: actualInputSelector, text: chatMessage });
} else {
// Fallback: click the input element and use pressSequentially
await window.click(actualInputSelector);
await new Promise(r => setTimeout(r, 200));
await window.locator(actualInputSelector).pressSequentially(chatMessage, { delay: 0 });
}
// Start CPU profiler to capture call stacks during the interaction
await cdp.send('Profiler.enable');
await cdp.send('Profiler.start');
// Submit
const completionsBefore = mockServer.completionCount();
submitTime = Date.now();
await window.keyboard.press('Enter');
// Wait for mock server to serve the response
try { await mockServer.waitForCompletion(completionsBefore + 1, 60_000); } catch { }
firstResponseTime = Date.now();
// Wait for DOM response to settle
await dismissDialog();
const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`;
await window.waitForFunction(
(sel) => {
const responses = document.querySelectorAll(sel);
if (responses.length === 0) { return false; }
return !responses[responses.length - 1].classList.contains('chat-response-loading');
},
responseSelector, { timeout: 30_000 },
);
responseCompleteTime = Date.now();
// -- User turn injection loop -----------------------------------------
// For multi-turn scenarios with user follow-ups, type each follow-up
// message and wait for the model's response to settle.
const userTurns = getUserTurns(scenario);
for (let ut = 0; ut < userTurns.length; ut++) {
const userTurn = userTurns[ut];
if (verbose) {
console.log(` [debug] User follow-up ${ut + 1}/${userTurns.length}: "${userTurn.message}"`);
}
// Brief pause to let the UI settle between turns
await new Promise(r => setTimeout(r, 500));
// Focus the chat input
await window.click(chatEditorSel);
const utFocusStart = Date.now();
while (Date.now() - utFocusStart < 3_000) {
const focused = await window.evaluate((sel) => {
const el = document.querySelector(sel);
return el && (el.classList.contains('focused') || el.contains(document.activeElement));
}, chatEditorSel).catch(() => false);
if (focused) { break; }
await new Promise(r => setTimeout(r, 50));
}
// Type the follow-up message
if (hasDriver) {
await window.evaluate(({ selector, text }) => {
// @ts-ignore
return globalThis.driver.typeInEditor(selector, text);
}, { selector: actualInputSelector, text: userTurn.message });
} else {
await window.click(actualInputSelector);
await new Promise(r => setTimeout(r, 200));
await window.locator(actualInputSelector).pressSequentially(userTurn.message, { delay: 0 });
}
// Submit follow-up
const utCompBefore = mockServer.completionCount();
await window.keyboard.press('Enter');
// Wait for mock server to serve the response for this turn
try { await mockServer.waitForCompletion(utCompBefore + 1, 60_000); } catch { }
// Wait for the new response to finish rendering.
// The chat list is virtualized — old response elements are
// recycled out of the DOM as new ones appear, so we cannot
// rely on counting DOM elements. Instead, scroll to the
// bottom and wait for no response to be in loading state.
await dismissDialog();
await window.evaluate((chatViewSel) => {
const input = document.querySelector(chatViewSel + ' .interactive-input-part');
if (input) { input.scrollIntoView({ block: 'end' }); }
}, CHAT_VIEW);
await new Promise(r => setTimeout(r, 200));
await window.waitForFunction(
(sel) => {
const responses = document.querySelectorAll(sel);
if (responses.length === 0) { return false; }
return !responses[responses.length - 1].classList.contains('chat-response-loading');
},
responseSelector,
{ timeout: 30_000 },
);
responseCompleteTime = Date.now();
if (verbose) {
const utResponseInfo = await window.evaluate((sel) => {
const responses = document.querySelectorAll(sel);
const last = responses[responses.length - 1];
return last ? (last.textContent || '').substring(0, 150) : '(empty)';
}, responseSelector);
console.log(` [debug] Follow-up response (first 150 chars): ${utResponseInfo}`);
}
}
// Stop CPU profiler and save the profile
const { profile } = /** @type {any} */ (await cdp.send('Profiler.stop'));
const profilePath = path.join(runDiagDir, 'profile.cpuprofile');
fs.writeFileSync(profilePath, JSON.stringify(profile));
if (verbose) {
console.log(` [debug] CPU profile saved to ${profilePath}`);
}
const responseInfo = await window.evaluate((sel) => {
const responses = document.querySelectorAll(sel);
const last = responses[responses.length - 1];
if (!last) { return { hasContent: false, text: '' }; }
const text = last.textContent || '';
return { hasContent: text.trim().length > 0, text: text.substring(0, 200) };
}, responseSelector);
if (verbose) {
console.log(` [debug] Response content (first 200 chars): ${responseInfo.text}`);
console.log(` [debug] Client-side timing: firstResponse=${firstResponseTime - submitTime}ms, complete=${responseCompleteTime - submitTime}ms`);
}
// Wait for the typewriter animation to finish rendering.
// The chat UI animates streamed content word-by-word after the
// response stream completes. We need to wait until all content
// is rendered before capturing layout/style metrics, otherwise
// we miss the rendering phase where batching optimizations matter.
await window.waitForFunction(
(sel) => {
const responses = document.querySelectorAll(sel);
const last = responses[responses.length - 1];
if (!last) { return true; }
// The typewriter animation is done when there are no
// elements with the 'typewriter' or 'animating' class,
// and no pending cursor animations.
const hasAnimating = last.querySelector('.chat-animated-word, .chat-typewriter-cursor');
return !hasAnimating;
},
responseSelector,
{ timeout: 30_000 },
).catch(() => {
// Fallback: if the selector-based check doesn't work (e.g.
// the CSS classes differ across versions), wait for content
// to stabilize by polling textContent.
});
// Additional stabilization: poll until textContent stops changing.
// This catches any remaining animation regardless of CSS class names.
{
let prev = '';
let stableCount = 0;
const stabilizeStart = Date.now();
while (stableCount < 3 && Date.now() - stabilizeStart < 10_000) {
const current = await window.evaluate((sel) => {
const responses = document.querySelectorAll(sel);
const last = responses[responses.length - 1];
return last ? (last.textContent || '') : '';
}, responseSelector).catch(() => '');
if (current === prev) {
stableCount++;
} else {
stableCount = 0;
prev = current;
}
await new Promise(r => setTimeout(r, 100));
}
}
renderCompleteTime = Date.now();
if (verbose) {
console.log(` [debug] Render stabilized: ${renderCompleteTime - responseCompleteTime}ms after stream complete`);
}
const heapAfter = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
const metricsAfter = await cdp.send('Performance.getMetrics');
// -- Extension host metrics (non-snapshot) ---------------------------
let extHostHeapUsedBefore = -1;
let extHostHeapUsedAfter = -1;
let extHostHeapDelta = -1;
let extHostHeapDeltaPostGC = -1;
let extHostProfilePath = '';
let extHostSnapshotPath = '';
if (extHostInspector && extHostHeapBefore) {
try {
extHostHeapUsedBefore = Math.round(extHostHeapBefore.usedSize / 1024 / 1024);
// Stop CPU profiler and save
const extProfile = await extHostInspector.send('Profiler.stop');
extHostProfilePath = path.join(runDiagDir, 'exthost-profile.cpuprofile');
fs.writeFileSync(extHostProfilePath, JSON.stringify(extProfile.profile));
if (verbose) {
console.log(` [ext-host] CPU profile saved to ${extHostProfilePath}`);
}
// Heap usage after interaction
const extHostHeapAfter = await extHostInspector.send('Runtime.getHeapUsage');
extHostHeapUsedAfter = Math.round(extHostHeapAfter.usedSize / 1024 / 1024);
extHostHeapDelta = extHostHeapUsedAfter - extHostHeapUsedBefore;
// Force GC and measure retained heap
try {
await extHostInspector.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });
await new Promise(r => setTimeout(r, 200));
const extHostHeapPostGC = await extHostInspector.send('Runtime.getHeapUsage');
extHostHeapDeltaPostGC = Math.round(extHostHeapPostGC.usedSize / 1024 / 1024) - extHostHeapUsedBefore;
} catch {
extHostHeapDeltaPostGC = -1;
}
if (verbose) {
console.log(` [ext-host] Heap: before=${extHostHeapUsedBefore}MB, after=${extHostHeapUsedAfter}MB, delta=${extHostHeapDelta}MB, deltaPostGC=${extHostHeapDeltaPostGC}MB`);
}
} catch (err) {
if (verbose) {
console.log(` [ext-host] Error collecting metrics: ${err}`);
}
}
}
// -- Heap snapshots (opt-in, parallelized) ---------------------------
let snapshotPath = '';
if (takeHeapSnapshots) {
const snapshotPromises = [];
// Renderer snapshot
snapshotPromises.push((async () => {
const p = path.join(runDiagDir, 'heap.heapsnapshot');
await cdp.send('HeapProfiler.enable');
const chunks = /** @type {string[]} */ ([]);
cdp.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {
chunks.push(params.chunk);
});
await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });
fs.writeFileSync(p, chunks.join(''));
return p;
})());
// Extension host snapshot (parallel with renderer)
if (extHostInspector && extHostHeapBefore) {
snapshotPromises.push((async () => {
const p = path.join(runDiagDir, 'exthost-heap.heapsnapshot');
const chunks = /** @type {string[]} */ ([]);
extHostInspector.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {
chunks.push(params.chunk);
});
await extHostInspector.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });
fs.writeFileSync(p, chunks.join(''));
return p;
})());
}
const snapshotResults = await Promise.all(snapshotPromises);
snapshotPath = snapshotResults[0];
if (snapshotResults.length > 1) {
extHostSnapshotPath = snapshotResults[1];
}
if (verbose) {
console.log(` [debug] Renderer snapshot saved to ${snapshotPath}`);
if (extHostSnapshotPath) {
console.log(` [ext-host] Snapshot saved to ${extHostSnapshotPath}`);
}
}
}
// Close ext host inspector now that snapshots (if any) are done
if (extHostInspector) {
extHostInspector.close();
}
// Store partial metrics here so we can combine with trace data after close.
/** @param {any} r @param {string} name */
function getMetric(r, name) {
const e = r.metrics?.find((/** @type {any} */ m) => m.name === name);
return e ? e.value : 0;
}
partialMetrics = {
heapUsedBefore: Math.round(heapBefore.usedSize / 1024 / 1024),
heapUsedAfter: Math.round(heapAfter.usedSize / 1024 / 1024),
heapDelta: Math.round((heapAfter.usedSize - heapBefore.usedSize) / 1024 / 1024),
heapDeltaPostGC: await (async () => {
// Force a full GC then measure heap to get deterministic retained-memory delta.
// --js-flags=--expose-gc is not required: CDP's Runtime.evaluate can call gc()
// when includeCommandLineAPI is true.
try {
await cdp.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });
await new Promise(r => setTimeout(r, 200));
const heapPostGC = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
return Math.round((heapPostGC.usedSize - heapBefore.usedSize) / 1024 / 1024);
} catch {
return -1; // gc() not available in this build
}
})(),
layoutCount: getMetric(metricsAfter, 'LayoutCount') - getMetric(metricsBefore, 'LayoutCount'),
recalcStyleCount: getMetric(metricsAfter, 'RecalcStyleCount') - getMetric(metricsBefore, 'RecalcStyleCount'),
forcedReflowCount: getMetric(metricsAfter, 'ForcedStyleRecalcs') - getMetric(metricsBefore, 'ForcedStyleRecalcs'),
frameCount: getMetric(metricsAfter, 'FrameCount') - getMetric(metricsBefore, 'FrameCount'),
compositeLayers: getMetric(metricsAfter, 'CompositeLayers') - getMetric(metricsBefore, 'CompositeLayers'),
paintCount: getMetric(metricsAfter, 'PaintCount') - getMetric(metricsBefore, 'PaintCount'),
responseHasContent: responseInfo.hasContent,
profilePath,
tracePath,
snapshotPath,
extHostHeapUsedBefore,
extHostHeapUsedAfter,
extHostHeapDelta,
extHostHeapDeltaPostGC,
extHostProfilePath,
extHostSnapshotPath,
};
} finally {
if (extHostInspector) {
try { extHostInspector.close(); } catch { }
}
activeVSCode = null;
await vscode.close();
}
// Read the trace file written by VS Code on exit via --trace-startup-file
/** @type {Array<any>} */
let traceEvents = [];
try {
const traceData = JSON.parse(fs.readFileSync(tracePath, 'utf-8'));
traceEvents = traceData.traceEvents || [];
} catch {
// Trace file may not exist if VS Code crashed before shutdown
}
// Extract code/chat/* perf marks from blink.user_timing trace events.
// These appear as instant ('R' or 'I') events with timestamps in microseconds.
const chatMarks = traceEvents
.filter(e => e.cat === 'blink.user_timing' && e.name && e.name.startsWith('code/chat/'))
.map(e => ({ name: e.name, startTime: e.ts / 1000 }));
if (verbose && chatMarks.length > 0) {
console.log(` [trace] chatMarks (${chatMarks.length}): ${chatMarks.map((/** @type {any} */ m) => m.name.split('/').slice(-1)[0]).join(', ')}`);
}
// Parse timing — prefer internal code/chat/* marks (precise, in-process)
// with client-side Date.now() as fallback for older builds without marks.
const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated');
const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken');
const timeToFirstToken = internalFirstToken >= 0 ? internalFirstToken : (firstResponseTime - submitTime);
const timeToComplete = responseCompleteTime - submitTime;
const timeToRenderComplete = renderCompleteTime - submitTime;
const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions');
const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke');
// Parse GC events from trace.
// Use the trace-event category and phase fields which are stable
// across V8 versions, rather than matching event name substrings.
let majorGCs = 0, minorGCs = 0, gcDurationMs = 0;
for (const event of traceEvents) {
const isGC = event.cat === 'v8.gc'
|| event.cat === 'devtools.timeline,v8'
|| (typeof event.cat === 'string' && event.cat.split(',').some((/** @type {string} */ c) => {
const t = c.trim();
return t === 'v8.gc' || t === 'disabled-by-default-v8.gc' || t === 'disabled-by-default-v8.gc_stats';
}));
if (!isGC) { continue; }
// Only count complete ('X') or duration-begin ('B') events to
// avoid double-counting begin/end pairs.
if (event.ph && event.ph !== 'X' && event.ph !== 'B') { continue; }
const name = event.name || '';
if (/Major|MarkCompact|MSC|MC|IncrementalMarking|FinalizeMC/i.test(name)) { majorGCs++; }
else if (/Minor|Scaveng/i.test(name)) { minorGCs++; }
else { minorGCs++; } // default unknown GC events to minor
if (event.dur) { gcDurationMs += event.dur / 1000; }
}
// Parse Layout duration from devtools.timeline trace events.
let layoutDurationMs = 0;
for (const event of traceEvents) {
if (event.name === 'Layout' && event.ph === 'X' && event.dur) {
layoutDurationMs += event.dur / 1000;
}
}
let longTaskCount = 0;
for (const event of traceEvents) {
if (event.name === 'RunTask' && event.dur && event.dur > 50_000) { longTaskCount++; }
}
// Parse Long Animation Frame (LoAF) events from devtools.timeline trace.
// AnimationFrame events use async flow pairs (ph:'s' start, ph:'f' finish)
// with matching ids. Compute duration from each s→f pair.
let longAnimationFrameCount = 0;
let longAnimationFrameTotalMs = 0;
{
/** @type {Map<number, number>} */
const frameStarts = new Map();
for (const event of traceEvents) {
if (event.cat === 'devtools.timeline' && event.name === 'AnimationFrame') {
if (event.ph === 's') {
frameStarts.set(event.id, event.ts);
} else if (event.ph === 'f' && frameStarts.has(event.id)) {
const durationMs = (event.ts - /** @type {number} */(frameStarts.get(event.id))) / 1000;
frameStarts.delete(event.id);
if (durationMs > 50) {
longAnimationFrameCount++;
longAnimationFrameTotalMs += durationMs;
}
}
}
}
}
return {
...partialMetrics,
timeToUIUpdated, timeToFirstToken, timeToComplete, timeToRenderComplete, instructionCollectionTime, agentInvokeTime,
hasInternalMarks: chatMarks.length > 0,
internalFirstToken,
majorGCs, minorGCs,
gcDurationMs: Math.round(gcDurationMs * 100) / 100,
layoutDurationMs: Math.round(layoutDurationMs * 100) / 100,
longTaskCount,
longAnimationFrameCount,
longAnimationFrameTotalMs: Math.round(longAnimationFrameTotalMs * 100) / 100,
};
}
// -- CI summary generation ---------------------------------------------------
// Base repository URL used to render commit/tag/compare links in the CI summary.
const GITHUB_REPO = 'https://github.com/microsoft/vscode';
/**
 * Format a build identifier as a Markdown link when possible.
 * - Commit SHAs (7-40 lowercase hex chars) link to the commit page.
 * - Semver-like versions link to the release tag page.
 * - Anything else (e.g. "baseline", "dev (local)") is returned as inline code.
 * @param {string} label
 * @returns {string}
 */
function formatBuildLink(label) {
	const looksLikeSha = /^[0-9a-f]{7,40}$/.test(label);
	if (looksLikeSha) {
		// Display the abbreviated SHA but link to the full commit.
		return `[\`${label.substring(0, 7)}\`](${GITHUB_REPO}/commit/${label})`;
	}
	const looksLikeVersion = /^\d+\.\d+\.\d+/.test(label);
	return looksLikeVersion
		? `[\`${label}\`](${GITHUB_REPO}/releases/tag/${label})`
		: `\`${label}\``;
}
/**
 * Build a GitHub compare link between two build identifiers, when both are
 * commit-like (hex SHA) or version-like references. Returns the empty string
 * for anything else (e.g. local paths or "dev (local)" labels).
 * @param {string} base
 * @param {string} test
 * @returns {string}
 */
function formatCompareLink(base, test) {
	const isRef = (/** @type {string} */ v) => /^[0-9a-f]{7,40}$/.test(v) || /^\d+\.\d+\.\d+/.test(v);
	if (isRef(base) && isRef(test)) {
		return `[compare](${GITHUB_REPO}/compare/${base}...${test})`;
	}
	return '';
}
/**
 * Generate a detailed Markdown summary table for CI.
 * Printed to stdout and written to ci-summary.md.
 *
 * @param {Record<string, any>} jsonReport - per-scenario stats for the test build (includes rawRuns)
 * @param {Record<string, any> | null} baseline - matching stats for the baseline build, if any
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, runs: number, baselineBuild?: string, build?: string }} opts
 * @returns {string} the complete Markdown document
 */
function generateCISummary(jsonReport, baseline, opts) {
	const baseLabel = opts.baselineBuild || 'baseline';
	const testBuildMode = jsonReport.buildMode || 'dev';
	// Local dev/production builds have no meaningful version string, so label
	// them by build mode; otherwise fall back to the explicit --build value.
	const testLabel = testBuildMode === 'dev' ? 'dev (local)'
		: testBuildMode === 'production' ? 'production (local)'
			: opts.build || testBuildMode;
	const baseLink = formatBuildLink(baseLabel);
	const testLink = formatBuildLink(testLabel);
	const compareLink = formatCompareLink(baseLabel, testLabel);
	// [metric name, stats group in the report, display unit]; order here
	// defines the row order of every per-scenario table below.
	const allMetrics = [
		['timeToFirstToken', 'timing', 'ms'],
		['timeToComplete', 'timing', 'ms'],
		['layoutCount', 'rendering', ''],
		['recalcStyleCount', 'rendering', ''],
		['forcedReflowCount', 'rendering', ''],
		['longTaskCount', 'rendering', ''],
		['longAnimationFrameCount', 'rendering', ''],
		['longAnimationFrameTotalMs', 'rendering', 'ms'],
		['frameCount', 'rendering', ''],
		['compositeLayers', 'rendering', ''],
		['paintCount', 'rendering', ''],
		['heapDelta', 'memory', 'MB'],
		['heapDeltaPostGC', 'memory', 'MB'],
		['gcDurationMs', 'memory', 'ms'],
		['extHostHeapDelta', 'extHost', 'MB'],
		['extHostHeapDeltaPostGC', 'extHost', 'MB'],
	];
	// Only these metrics can produce a REGRESSION/improved verdict; all other
	// metrics are reported as informational ('info') rows.
	const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount']);
	const lines = [];
	const scenarios = Object.keys(jsonReport.scenarios);
	// -- Collect verdicts per scenario/metric --------------------------------
	/** @type {Map<string, { metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]>} */
	const scenarioVerdicts = new Map();
	let totalRegressions = 0;
	let totalImprovements = 0;
	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		const base = baseline?.scenarios?.[scenario];
		/** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]} */
		const verdicts = [];
		if (base) {
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }
				// Relative change vs baseline median; 0 when the baseline median
				// is 0 to avoid a division by zero.
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const isRegressionMetric = regressionMetricNames.has(metric);
				// Negative raw values denote "unavailable in this run" (see
				// e.g. heapDeltaPostGC = -1) — exclude them from the t-test.
				const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const ttest = welchTTest(basRaw, curRaw);
				const pStr = ttest ? `${ttest.pValue}` : 'n/a';
				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let verdict = '';
				if (isRegressionMetric) {
					// REGRESSION requires both exceeding the threshold AND
					// statistical significance (when a t-test was possible);
					// threshold-only hits are downgraded to 'noise'.
					if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
						if (!ttest || ttest.significant) {
							verdict = 'REGRESSION';
							totalRegressions++;
						} else {
							verdict = 'noise';
						}
					} else if (exceedsThreshold(metricThreshold, -change, -absoluteDelta) && ttest?.significant) {
						verdict = 'improved';
						totalImprovements++;
					} else {
						verdict = 'ok';
					}
				} else {
					verdict = 'info';
				}
				// \xb1 is the plus-minus sign.
				const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`;
				const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`;
				verdicts.push({ metric, verdict, change, pValue: pStr, basStr, curStr });
			}
		}
		scenarioVerdicts.set(scenario, verdicts);
	}
	// -- Header with verdict up front ----------------------------------------
	const hasRegressions = totalRegressions > 0;
	const verdictIcon = hasRegressions ? '\u274C' : '\u2705';
	const verdictText = hasRegressions
		? `${totalRegressions} regression(s) detected`
		: totalImprovements > 0
			? `No regressions \u2014 ${totalImprovements} improvement(s)`
			: 'No significant changes';
	lines.push(`# ${verdictIcon} Chat Performance: ${verdictText}`);
	lines.push('');
	lines.push(`| | |`);
	lines.push(`|---|---|`);
	lines.push(`| **Baseline** | ${baseLink} |`);
	lines.push(`| **Test** | ${testLink} |`);
	if (compareLink) {
		lines.push(`| **Diff** | ${compareLink} |`);
	}
	lines.push(`| **Runs per scenario** | ${opts.runs} |`);
	// Per-metric threshold overrides that differ from the global threshold;
	// shown next to the global value when present.
	const overrides = Object.entries(opts.metricThresholds || {}).filter(([, v]) => {
		const parsed = parseMetricThreshold(v);
		return parsed.type !== 'fraction' || parsed.value !== opts.threshold;
	});
	if (overrides.length > 0) {
		const overrideStr = overrides.map(([k, v]) => {
			const parsed = parseMetricThreshold(v);
			return `${k}: ${parsed.type === 'absolute' ? `${parsed.value}${k.includes('Ms') || k.includes('Time') || k.includes('time') ? 'ms' : ''}` : `${(parsed.value * 100).toFixed(0)}%`}`;
		}).join(', ');
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% (${overrideStr}) |`);
	} else {
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`);
	}
	lines.push(`| **Scenarios** | ${scenarios.length} |`);
	lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`);
	if (jsonReport.buildMode) {
		lines.push(`| **Build mode** | ${jsonReport.buildMode} |`);
	}
	lines.push('');
	if (jsonReport.mismatchedBuildMode) {
		lines.push('> **⚠ Build mode mismatch:** The test and baseline builds use different build modes.');
		lines.push('> Results may not be directly comparable. For apples-to-apples comparisons,');
		lines.push('> use the same build type for both (e.g. `--production-build` with a local');
		lines.push('> baseline path, or two version strings).');
		lines.push('');
	}
	// -- At-a-glance overview table: one row per scenario --------------------
	lines.push(`## Overview`);
	lines.push('');
	lines.push('| Scenario | Description | TTFT | Complete | Layouts | Styles | LoAF | Verdict |');
	lines.push('|----------|-------------|-----:|---------:|--------:|-------:|-----:|:-------:|');
	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const get = (/** @type {string} */ m) => verdicts.find(v => v.metric === m);
		const ttft = get('timeToFirstToken');
		const complete = get('timeToComplete');
		const layouts = get('layoutCount');
		const styles = get('recalcStyleCount');
		const loaf = get('longAnimationFrameCount');
		// \u2014 (em dash) marks "no data" cells.
		const fmtCell = (/** @type {{ change: number, verdict: string } | undefined} */ v) => {
			if (!v) { return '\u2014'; }
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(0)}%`;
			return pct;
		};
		// A single REGRESSION dominates the row verdict; otherwise any
		// improvement wins; otherwise the row is OK.
		const fmtVerdict = (/** @type {{ verdict: string, change: number }[]} */ vs) => {
			const hasRegression = vs.some(v => v.verdict === 'REGRESSION');
			const hasImproved = vs.some(v => v.verdict === 'improved');
			if (hasRegression) { return '\u274C Regressed'; }
			if (hasImproved) { return '\u2B06\uFE0F Improved'; }
			return '\u2705 OK';
		};
		const keyVerdicts = [ttft, complete, layouts, styles, loaf].filter(Boolean);
		const rowVerdict = fmtVerdict(/** @type {any[]} */(keyVerdicts));
		lines.push(`| ${scenario} | ${getScenarioDescription(scenario)} | ${fmtCell(ttft)} | ${fmtCell(complete)} | ${fmtCell(layouts)} | ${fmtCell(styles)} | ${fmtCell(loaf)} | ${rowVerdict} |`);
	}
	lines.push('');
	// -- Regressions & improvements detail section ---------------------------
	const hasNotable = [...scenarioVerdicts.values()].some(vs => vs.some(v => v.verdict === 'REGRESSION' || v.verdict === 'improved'));
	if (hasNotable) {
		lines.push('## Regressions & Improvements');
		lines.push('');
		lines.push('Only metrics that regressed or improved significantly are shown below.');
		lines.push('');
		for (const scenario of scenarios) {
			const verdicts = scenarioVerdicts.get(scenario) || [];
			const notable = verdicts.filter(v => v.verdict === 'REGRESSION' || v.verdict === 'improved');
			if (notable.length === 0) { continue; }
			const icon = notable.some(v => v.verdict === 'REGRESSION') ? '\u274C' : '\u2B06\uFE0F';
			lines.push(`### ${icon} ${scenario}`);
			lines.push('');
			lines.push('| Metric | Baseline | Test | Change | p-value | Verdict |');
			lines.push('|--------|----------|------|--------|---------|---------|');
			for (const v of notable) {
				const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
				const verdictIcon = v.verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F';
				lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictIcon} ${v.verdict} |`);
			}
			lines.push('');
		}
	}
	// -- Full metric tables in collapsible section ---------------------------
	lines.push('<details><summary>Full metric details per scenario</summary>');
	lines.push('');
	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const base = baseline?.scenarios?.[scenario];
		lines.push(`### ${scenario}`);
		lines.push('');
		if (!base) {
			// No baseline: show absolute stats only (no change/p-value columns).
			const current = jsonReport.scenarios[scenario];
			lines.push('> No baseline data for this scenario.');
			lines.push('');
			lines.push('| Metric | Value | StdDev | CV | n |');
			lines.push('|--------|------:|-------:|---:|--:|');
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				if (!cur) { continue; }
				lines.push(`| ${metric} | ${cur.median}${unit} | \xb1${cur.stddev}${unit} | ${(cur.cv * 100).toFixed(0)}% | ${cur.n} |`);
			}
			lines.push('');
			continue;
		}
		lines.push(`| Metric | Baseline | Test | Change | p-value | Verdict |`);
		lines.push(`|--------|----------|------|--------|---------|---------|`);
		for (const v of verdicts) {
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
			let verdictDisplay = v.verdict;
			if (v.verdict === 'REGRESSION') { verdictDisplay = '\u274C REGRESSION'; }
			else if (v.verdict === 'improved') { verdictDisplay = '\u2B06\uFE0F improved'; }
			else if (v.verdict === 'ok') { verdictDisplay = '\u2705 ok'; }
			else if (v.verdict === 'noise') { verdictDisplay = '\uD83C\uDF2B\uFE0F noise'; }
			else if (v.verdict === 'info') { verdictDisplay = '\u2139\uFE0F'; }
			lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictDisplay} |`);
		}
		lines.push('');
	}
	lines.push('</details>');
	lines.push('');
	// -- Raw run data in collapsible section ---------------------------------
	lines.push('<details><summary>Raw run data</summary>');
	lines.push('');
	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		lines.push(`### ${scenario}`);
		lines.push('');
		lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
		lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
		const runs = current.rawRuns || [];
		for (let i = 0; i < runs.length; i++) {
			const r = runs[i];
			const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;
			lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
		}
		lines.push('');
	}
	if (baseline) {
		for (const scenario of scenarios) {
			const base = baseline.scenarios?.[scenario];
			if (!base) { continue; }
			lines.push(`### ${scenario} (baseline)`);
			lines.push('');
			lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
			lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
			const runs = base.rawRuns || [];
			for (let i = 0; i < runs.length; i++) {
				const r = runs[i];
				const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;
				lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
			}
			lines.push('');
		}
	}
	lines.push('</details>');
	lines.push('');
	return lines.join('\n');
}
// -- Cleanup on SIGINT/SIGTERM -----------------------------------------------
// Currently-running VS Code instance, if any — closed by the signal handler.
/** @type {{ close: () => Promise<void> } | null} */
let activeVSCode = null;
// Currently-running mock LLM server, if any — closed by the signal handler.
/** @type {{ close: () => Promise<void> } | null} */
let activeMockServer = null;
/**
 * Register SIGINT/SIGTERM handlers that close any active VS Code instance
 * and the mock server before exiting with 130 (conventional interrupt code).
 */
function installSignalHandlers() {
	const shutdown = async () => {
		console.log('\n[chat-simulation] Caught interrupt, cleaning up...');
		// Close VS Code first, then the mock server; ignore close failures.
		for (const closable of [activeVSCode, activeMockServer]) {
			try { await closable?.close(); } catch { }
		}
		process.exit(130);
	};
	for (const signal of ['SIGINT', 'SIGTERM']) {
		process.on(signal, shutdown);
	}
}
// -- Diagnostic cleanup ------------------------------------------------------
/**
 * Remove large diagnostic files (heap snapshots, CPU profiles, traces) from
 * a run's metrics to free disk space. Keeps the JSON results data intact.
 *
 * Missing or already-deleted files are silently skipped: `fs.rmSync` with
 * `force: true` ignores nonexistent paths, so no `existsSync` pre-check is
 * needed (the previous check-then-delete was also a TOCTOU race).
 * @param {RunMetrics} metrics
 */
function cleanupRunDiagnostics(metrics) {
	const filesToDelete = [
		metrics.profilePath,
		metrics.tracePath,
		metrics.snapshotPath,
		metrics.extHostProfilePath,
		metrics.extHostSnapshotPath,
	];
	for (const filePath of filesToDelete) {
		if (!filePath) { continue; } // path not recorded for this run
		try {
			fs.rmSync(filePath, { force: true });
		} catch {
			// Ignore cleanup errors (e.g. permissions) — deletion is best-effort
		}
	}
}
/**
 * Clean up diagnostics for all scenarios that did NOT regress.
 * Keeps diagnostics for regressed scenarios so they can be investigated.
 * @param {Record<string, RunMetrics[]>} allResults - test results by scenario
 * @param {Set<string>} regressedScenarios - scenarios that regressed
 */
function cleanupNonRegressedDiagnostics(allResults, regressedScenarios) {
	for (const scenario of Object.keys(allResults)) {
		// Regressed scenarios keep their diagnostics for later investigation.
		if (regressedScenarios.has(scenario)) {
			continue;
		}
		allResults[scenario].forEach((metrics) => cleanupRunDiagnostics(metrics));
	}
}
// -- Main --------------------------------------------------------------------
async function main() {
registerPerfScenarios();
const opts = parseArgs();
installSignalHandlers();
const { startServer } = require('./common/mock-llm-server');
const mockServer = await startServer(0);
activeMockServer = mockServer;
console.log(`[chat-simulation] Mock LLM server: ${mockServer.url}`);
// -- Resume mode --------------------------------------------------------
if (opts.resume) {
if (!fs.existsSync(opts.resume)) {
console.error(`[chat-simulation] Resume file not found: ${opts.resume}`);
process.exit(1);
}
const prevResults = JSON.parse(fs.readFileSync(opts.resume, 'utf-8'));
const prevDir = path.dirname(opts.resume);
// Find the associated baseline JSON in the same directory
const baselineFiles = fs.readdirSync(prevDir).filter((/** @type {string} */ f) => f.startsWith('baseline-') && f.endsWith('.json'));
const baselineFile = baselineFiles.length > 0 ? path.join(prevDir, baselineFiles[0]) : null;
const prevBaseline = baselineFile ? JSON.parse(fs.readFileSync(baselineFile, 'utf-8')) : null;
// Determine which scenarios to resume (default: all from previous run)
const resumeScenarios = opts.scenarios.length > 0
? opts.scenarios.filter(s => prevResults.scenarios?.[s])
: Object.keys(prevResults.scenarios || {});
if (resumeScenarios.length === 0) {
console.error('[chat-simulation] No matching scenarios found in previous results');
process.exit(1);
}
const testElectron = await resolveBuild(opts.build);
const baselineVersion = prevBaseline?.baselineBuildVersion;
const baselineElectron = baselineVersion ? await resolveBuild(baselineVersion) : null;
const runsToAdd = opts.runs;
console.log(`[chat-simulation] Resuming from: ${opts.resume}`);
console.log(`[chat-simulation] Adding ${runsToAdd} runs per scenario`);
console.log(`[chat-simulation] Scenarios: ${resumeScenarios.join(', ')}`);
if (prevBaseline) {
console.log(`[chat-simulation] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`);
}
console.log('');
for (const scenario of resumeScenarios) {
console.log(`[chat-simulation] === Resuming: ${scenario} ===`);
const prevTestRuns = prevResults.scenarios[scenario]?.rawRuns || [];
const prevBaseRuns = prevBaseline?.scenarios?.[scenario]?.rawRuns || [];
// Run additional test iterations
console.log(`[chat-simulation] Test build (${prevTestRuns.length} existing + ${runsToAdd} new)`);
for (let i = 0; i < runsToAdd; i++) {
const runIdx = `${scenario}-resume-${prevTestRuns.length + i}`;
console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);
try {
const m = await runOnce(testElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'test', { ...opts.settingsOverrides, ...opts.testSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });
// Clean up previous run's diagnostics to bound disk usage; keep the latest
if (opts.cleanupDiagnostics && prevTestRuns.length > 0) { cleanupRunDiagnostics(prevTestRuns[prevTestRuns.length - 1]); }
prevTestRuns.push(m);
if (opts.verbose) {
const src = m.hasInternalMarks ? 'internal' : 'client-side';
console.log(` [${src}] firstToken=${m.timeToFirstToken}ms, complete=${m.timeToComplete}ms`);
}
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
}
// Run additional baseline iterations
if (baselineElectron && prevBaseline?.scenarios?.[scenario]) {
console.log(`[chat-simulation] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`);
for (let i = 0; i < runsToAdd; i++) {
const runIdx = `baseline-${scenario}-resume-${prevBaseRuns.length + i}`;
console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);
try {
const m = await runOnce(baselineElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'baseline', { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });
// Clean up previous run's diagnostics to bound disk usage; keep the latest
if (opts.cleanupDiagnostics && prevBaseRuns.length > 0) { cleanupRunDiagnostics(prevBaseRuns[prevBaseRuns.length - 1]); }
prevBaseRuns.push(m);
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
}
}
// Recompute stats with merged data
const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevTestRuns });
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(prevTestRuns.map((/** @type {any} */ r) => r[metric])); }
prevResults.scenarios[scenario] = sd;
if (prevBaseline?.scenarios?.[scenario]) {
const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevBaseRuns });
for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); }
prevBaseline.scenarios[scenario] = bsd;
}
console.log(`[chat-simulation] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? `, baseline n=${prevBaseRuns.length}` : ''}`);
console.log('');
}
// Write updated files back
prevResults.runsPerScenario = Math.max(prevResults.runsPerScenario || 0, ...Object.values(prevResults.scenarios).map((/** @type {any} */ s) => s.runs));
prevResults.lastResumed = new Date().toISOString();
fs.writeFileSync(opts.resume, JSON.stringify(prevResults, null, 2));
console.log(`[chat-simulation] Updated results: ${opts.resume}`);
if (prevBaseline && baselineFile) {
prevBaseline.lastResumed = new Date().toISOString();
fs.writeFileSync(baselineFile, JSON.stringify(prevBaseline, null, 2));
// Also update cached baseline
const cachedPath = path.join(DATA_DIR, path.basename(baselineFile));
fs.writeFileSync(cachedPath, JSON.stringify(prevBaseline, null, 2));
console.log(`[chat-simulation] Updated baseline: ${baselineFile}`);
}
// -- Re-run comparison with merged data --------------------------------
opts.baseline = baselineFile || undefined;
const jsonReport = prevResults;
jsonReport._resultsPath = opts.resume;
// Fall through to comparison logic below
await printComparison(jsonReport, opts);
await mockServer.close();
return;
}
// -- Normal (non-resume) flow -------------------------------------------
// --production-build: build a local bundled (non-dev) package from the
// current source tree using `gulp vscode`. This produces the same
// packaging as a release build (bundled JS, no VSCODE_DEV) while still
// testing your local changes.
if (opts.productionBuild && !opts.build) {
const prodBuildPath = buildProductionBuild();
opts.build = prodBuildPath;
console.log(`[chat-simulation] --production-build: using local production build at ${prodBuildPath}`);
}
const electronPath = await resolveBuild(opts.build);
if (!fs.existsSync(electronPath)) {
console.error(`Electron not found at: ${electronPath}`);
console.error('Run "node build/lib/preLaunch.ts" first, or pass --build <path>');
process.exit(1);
}
// Detect build modes for both test and baseline builds
const testBuildMode = detectBuildMode(electronPath);
// Resolve the baseline build path early so we can detect its mode.
// For version strings this downloads; for local paths it resolves directly.
const isBaselineVersionString = opts.baselineBuild && isVersionString(opts.baselineBuild);
const isBaselineLocalPath = opts.baselineBuild && !isBaselineVersionString;
/** @type {string | undefined} */
let baselineElectronPath;
if (isBaselineLocalPath) {
baselineElectronPath = await resolveBuild(opts.baselineBuild);
if (!fs.existsSync(baselineElectronPath)) {
console.error(`Baseline build not found at: ${baselineElectronPath}`);
process.exit(1);
}
}
const baselineBuildMode = opts.baselineBuild
? (isBaselineVersionString ? 'release' : detectBuildMode(baselineElectronPath || ''))
: undefined;
const isMismatchedBuildMode = baselineBuildMode !== undefined && testBuildMode !== baselineBuildMode;
// Create a timestamped run directory for all output
const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
const runDir = path.join(DATA_DIR, runTimestamp);
fs.mkdirSync(runDir, { recursive: true });
console.log(`[chat-simulation] Output: ${runDir}`);
// Compute effective settings per role
const testSettings = { ...opts.settingsOverrides, ...opts.testSettingsOverrides };
const baselineSettings = { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides };
// -- Baseline build --------------------------------------------------
if (opts.baselineBuild) {
// Use a sanitized label for file names — replace path separators for local paths
const baselineLabel = isBaselineLocalPath
? path.basename(path.resolve(opts.baselineBuild))
: opts.baselineBuild;
const baselineJsonPath = path.join(runDir, `baseline-${baselineLabel}.json`);
// Local paths: always run fresh (no caching — the build may have changed)
// Version strings: use caching as before
const cachedPath = isBaselineLocalPath ? null : path.join(DATA_DIR, `baseline-${baselineLabel}.json`);
const cachedBaseline = cachedPath && !opts.noCache && fs.existsSync(cachedPath)
? JSON.parse(fs.readFileSync(cachedPath, 'utf-8'))
: null;
if (cachedBaseline?.baselineBuildVersion === opts.baselineBuild) {
// Check if the cache covers all requested scenarios
const cachedScenarios = new Set(Object.keys(cachedBaseline.scenarios || {}));
const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s));
// Also check if cached scenarios have fewer runs than requested
const shortScenarios = opts.scenarios.filter((/** @type {string} */ s) => {
const cached = cachedBaseline.scenarios?.[s];
return cached && (cached.rawRuns?.length || 0) < opts.runs;
});
if (missingScenarios.length === 0 && shortScenarios.length === 0) {
console.log(`[chat-simulation] Using cached baseline for ${opts.baselineBuild}`);
fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));
opts.baseline = baselineJsonPath;
} else {
const scenariosToRun = [...new Set([...missingScenarios, ...shortScenarios])];
if (missingScenarios.length > 0) {
console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`);
}
if (shortScenarios.length > 0) {
console.log(`[chat-simulation] Cached baseline needs more runs for: ${shortScenarios.map((/** @type {string} */ s) => `${s} (${cachedBaseline.scenarios[s].rawRuns?.length || 0}/${opts.runs})`).join(', ')}`);
}
console.log(`[chat-simulation] Running baseline for ${scenariosToRun.length} scenario(s)...`);
const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);
for (const scenario of scenariosToRun) {
const existingRuns = cachedBaseline.scenarios?.[scenario]?.rawRuns || [];
const runsNeeded = opts.runs - existingRuns.length;
/** @type {RunMetrics[]} */
const newResults = [];
for (let i = 0; i < runsNeeded; i++) {
try {
const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${existingRuns.length + i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });
// Clean up previous run's diagnostics to bound disk usage; keep the latest
if (opts.cleanupDiagnostics && newResults.length > 0) { cleanupRunDiagnostics(newResults[newResults.length - 1]); }
newResults.push(m);
}
catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }
}
const allRuns = [...existingRuns, ...newResults];
if (allRuns.length > 0) {
const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: allRuns });
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(allRuns.map((/** @type {any} */ r) => r[metric])); }
cachedBaseline.scenarios[scenario] = sd;
}
}
cachedBaseline.runsPerScenario = opts.runs;
fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));
if (cachedPath) {
fs.writeFileSync(cachedPath, JSON.stringify(cachedBaseline, null, 2));
}
opts.baseline = baselineJsonPath;
}
} else {
const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);
console.log(`[chat-simulation] Benchmarking baseline build (${baselineLabel})...`);
/** @type {Record<string, RunMetrics[]>} */
const baselineResults = {};
for (const scenario of opts.scenarios) {
/** @type {RunMetrics[]} */
const results = [];
for (let i = 0; i < opts.runs; i++) {
try {
const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });
// Clean up previous run's diagnostics to bound disk usage; keep the latest
if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }
results.push(m);
}
catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }
}
if (results.length > 0) { baselineResults[scenario] = results; }
}
const baselineReport = {
timestamp: new Date().toISOString(),
baselineBuildVersion: opts.baselineBuild,
platform: process.platform,
runsPerScenario: opts.runs,
scenarios: /** @type {Record<string, any>} */ ({}),
};
for (const [scenario, results] of Object.entries(baselineResults)) {
const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
baselineReport.scenarios[scenario] = sd;
}
fs.writeFileSync(baselineJsonPath, JSON.stringify(baselineReport, null, 2));
// Cache at the top level for reuse across runs (version strings only)
if (cachedPath) {
fs.writeFileSync(cachedPath, JSON.stringify(baselineReport, null, 2));
}
opts.baseline = baselineJsonPath;
}
console.log('');
}
// -- Run benchmarks --------------------------------------------------
console.log(`[chat-simulation] Electron: ${electronPath}`);
console.log(`[chat-simulation] Build mode: ${buildModeLabel(testBuildMode)}`);
if (baselineBuildMode) {
console.log(`[chat-simulation] Baseline mode: ${buildModeLabel(baselineBuildMode)}`);
}
console.log(`[chat-simulation] Runs per scenario: ${opts.runs}`);
console.log(`[chat-simulation] Scenarios: ${opts.scenarios.join(', ')}`);
if (Object.keys(opts.settingsOverrides).length > 0) {
console.log(`[chat-simulation] Settings overrides (all): ${JSON.stringify(opts.settingsOverrides)}`);
}
if (Object.keys(opts.testSettingsOverrides).length > 0) {
console.log(`[chat-simulation] Settings overrides (test): ${JSON.stringify(opts.testSettingsOverrides)}`);
}
if (Object.keys(opts.baselineSettingsOverrides).length > 0) {
console.log(`[chat-simulation] Settings overrides (baseline): ${JSON.stringify(opts.baselineSettingsOverrides)}`);
}
if (isMismatchedBuildMode) {
console.log('');
console.log(`[chat-simulation] ⚠ WARNING: Build mode mismatch — test is ${testBuildMode}, baseline is ${baselineBuildMode}.`);
console.log('[chat-simulation] Results may not be directly comparable. For apples-to-apples');
console.log('[chat-simulation] comparisons, use the same build type for both.');
if (testBuildMode === 'dev') {
console.log('[chat-simulation] To use a local production build instead:');
console.log('[chat-simulation] npm run perf:chat -- --production-build');
}
if (!opts.ci && !opts.force) {
const readline = require('readline');
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
const answer = await new Promise(resolve => rl.question('[chat-simulation] Continue anyway? [y/N] ', resolve));
rl.close();
if (String(answer).toLowerCase() !== 'y') {
console.log('[chat-simulation] Aborted.');
await mockServer.close();
process.exit(0);
}
}
}
console.log('');
/** @type {Record<string, RunMetrics[]>} */
const allResults = {};
let anyFailed = false;
for (const scenario of opts.scenarios) {
console.log(`[chat-simulation] === Scenario: ${scenario} ===`);
/** @type {RunMetrics[]} */
const results = [];
for (let i = 0; i < opts.runs; i++) {
console.log(`[chat-simulation] Run ${i + 1}/${opts.runs}...`);
try {
const metrics = await runOnce(electronPath, scenario, mockServer, opts.verbose, `${scenario}-${i}`, runDir, 'test', testSettings, { heapSnapshots: opts.heapSnapshots });
// Clean up previous run's diagnostics to bound disk usage; keep the latest
if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }
results.push(metrics);
if (opts.verbose) {
const src = metrics.hasInternalMarks ? 'internal' : 'client-side';
console.log(` [${src}] firstToken=${metrics.timeToFirstToken}ms, complete=${metrics.timeToComplete}ms, heap=delta${metrics.heapDelta}MB, longTasks=${metrics.longTaskCount}${metrics.hasInternalMarks ? `, internalTTFT=${metrics.internalFirstToken}ms` : ''}`);
}
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
}
if (results.length === 0) { console.error(`[chat-simulation] All runs failed for scenario: ${scenario}`); anyFailed = true; }
else { allResults[scenario] = results; }
console.log('');
}
// -- Summary ---------------------------------------------------------
console.log('[chat-simulation] ======================= Summary =======================');
for (const [scenario, results] of Object.entries(allResults)) {
console.log('');
console.log(` -- ${scenario} (${results.length} runs) --`);
console.log('');
console.log(' Timing:');
console.log(summarize(results.map(r => r.timeToFirstToken), ' Request → First token ', 'ms'));
console.log(summarize(results.map(r => r.timeToComplete), ' Request → Complete ', 'ms'));
console.log(summarize(results.map(r => r.timeToRenderComplete), ' Request → Rendered ', 'ms'));
console.log('');
console.log(' Rendering:');
console.log(summarize(results.map(r => r.layoutCount), ' Layouts ', ''));
console.log(summarize(results.map(r => r.layoutDurationMs), ' Layout duration ', 'ms'));
console.log(summarize(results.map(r => r.recalcStyleCount), ' Style recalcs ', ''));
console.log(summarize(results.map(r => r.forcedReflowCount), ' Forced reflows ', ''));
console.log(summarize(results.map(r => r.longTaskCount), ' Long tasks (>50ms) ', ''));
console.log(summarize(results.map(r => r.longAnimationFrameCount), ' Long anim. frames ', ''));
console.log(summarize(results.map(r => r.longAnimationFrameTotalMs), ' LoAF total duration ', 'ms'));
console.log(summarize(results.map(r => r.frameCount), ' Frames ', ''));
console.log(summarize(results.map(r => r.compositeLayers), ' Composite layers ', ''));
console.log(summarize(results.map(r => r.paintCount), ' Paints ', ''));
console.log('');
console.log(' Memory:');
console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB'));
console.log(summarize(results.map(r => r.heapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));
console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms'));
if (results.some(r => r.extHostHeapDelta >= 0)) {
console.log('');
console.log(' Extension Host:');
console.log(summarize(results.map(r => r.extHostHeapUsedBefore), ' Heap before ', 'MB'));
console.log(summarize(results.map(r => r.extHostHeapUsedAfter), ' Heap after ', 'MB'));
console.log(summarize(results.map(r => r.extHostHeapDelta), ' Heap delta ', 'MB'));
console.log(summarize(results.map(r => r.extHostHeapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));
}
}
// -- JSON output -----------------------------------------------------
const jsonPath = path.join(runDir, 'results.json');
const jsonReport = /** @type {{ timestamp: string, platform: NodeJS.Platform, runsPerScenario: number, buildMode: string, mismatchedBuildMode: boolean, scenarios: Record<string, any>, _resultsPath?: string }} */ ({
timestamp: new Date().toISOString(),
platform: process.platform,
runsPerScenario: opts.runs,
buildMode: testBuildMode,
mismatchedBuildMode: !!isMismatchedBuildMode,
scenarios: /** @type {Record<string, any>} */ ({}),
});
for (const [scenario, results] of Object.entries(allResults)) {
const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
jsonReport.scenarios[scenario] = sd;
}
fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2));
jsonReport._resultsPath = jsonPath;
console.log('');
console.log(`[chat-simulation] Results written to ${jsonPath}`);
// -- Save baseline ---------------------------------------------------
if (opts.saveBaseline) {
if (!opts.baseline) { console.error('[chat-simulation] --save-baseline requires --baseline <path>'); process.exit(1); }
fs.writeFileSync(opts.baseline, JSON.stringify(jsonReport, null, 2));
console.log(`[chat-simulation] Baseline saved to ${opts.baseline}`);
}
// -- Baseline comparison ---------------------------------------------
const regressedScenarios = await printComparison(jsonReport, opts);
// Clean up diagnostics for scenarios that did not regress
if (opts.cleanupDiagnostics) {
cleanupNonRegressedDiagnostics(allResults, regressedScenarios);
}
if (anyFailed) { process.exit(1); }
await mockServer.close();
}
/**
 * Print baseline comparison and the optional CI summary, then exit with
 * code 1 if a regression was found.
 *
 * A metric counts as a regression only when BOTH hold:
 *   1. its median delta exceeds the configured threshold (global
 *      `opts.threshold` or a per-metric override via `getMetricThreshold`), and
 *   2. Welch's t-test on the raw run values reports the difference as
 *      statistically significant.
 * Threshold-exceeding-but-not-significant deltas are reported as
 * inconclusive, with a power-analysis-based `--resume` hint suggesting how
 * many extra runs would likely settle the question.
 *
 * Metrics in `regressionMetrics` can fail the run; metrics in `infoMetrics`
 * (memory / ext-host heap) are displayed for context only.
 *
 * NOTE(review): when any regression is found, `process.exit(1)` fires before
 * the return statement, so callers only ever observe an EMPTY set on a normal
 * return — confirm whether post-regression diagnostics cleanup in the caller
 * is intentionally skipped.
 *
 * @param {Record<string, any>} jsonReport report produced by the benchmark run
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, baseline?: string, ci?: boolean, resume?: string, build?: string, baselineBuild?: string, runs: number, cleanupDiagnostics?: boolean }} opts
 * @returns {Promise<Set<string>>} scenario IDs that regressed (see NOTE above)
 */
async function printComparison(jsonReport, opts) {
	let regressionFound = false;
	let inconclusiveFound = false;
	/** @type {Set<string>} */
	const regressedScenarios = new Set();
	if (opts.baseline && fs.existsSync(opts.baseline)) {
		const baseline = JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'));
		console.log('');
		console.log(`[chat-simulation] =========== Baseline Comparison (threshold: ${(opts.threshold * 100).toFixed(0)}%) ===========`);
		console.log(`[chat-simulation] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`);
		if (jsonReport.mismatchedBuildMode) {
			console.log(`[chat-simulation] ⚠ Note: build mode mismatch — test is ${jsonReport.buildMode}, baseline differs.`);
			console.log('[chat-simulation] Results may not be directly comparable.');
		}
		console.log('');
		// Metrics that trigger regression failure when they exceed the threshold
		const regressionMetrics = [
			// [metric, group, unit]
			['timeToFirstToken', 'timing', 'ms'],
			['timeToComplete', 'timing', 'ms'],
			['layoutCount', 'rendering', ''],
			['recalcStyleCount', 'rendering', ''],
			['forcedReflowCount', 'rendering', ''],
			['longTaskCount', 'rendering', ''],
		];
		// Informational metrics — shown in comparison but don't trigger failure
		const infoMetrics = [
			['heapDelta', 'memory', 'MB'],
			['gcDurationMs', 'memory', 'ms'],
			['extHostHeapDelta', 'extHost', 'MB'],
			['extHostHeapDeltaPostGC', 'extHost', 'MB'],
		];
		for (const scenario of Object.keys(jsonReport.scenarios)) {
			const current = jsonReport.scenarios[scenario];
			const base = baseline.scenarios?.[scenario];
			if (!base) { console.log(`  ${scenario}: (no baseline)`); continue; }
			/** @type {string[]} */
			const diffs = [];
			let scenarioRegression = false;
			for (const [metric, group, unit] of regressionMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				// `!bas.median` also skips a zero baseline median, which would
				// otherwise divide by zero below.
				if (!cur || !bas || !bas.median) { continue; }
				const change = (cur.median - bas.median) / bas.median;
				const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;
				// Statistical significance via Welch's t-test on raw run values.
				// Negative values mean "metric unavailable for that run" and are dropped.
				const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const ttest = welchTTest(basRaw, curRaw);
				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let flag = '';
				if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
					if (!ttest) {
						// Too few samples to run the t-test at all — flag but don't fail.
						flag = ' ← possible regression (n too small for significance test)';
						inconclusiveFound = true;
					} else if (ttest.significant) {
						flag = ` ← REGRESSION (p=${ttest.pValue}, ${ttest.confidence} confidence)`;
						scenarioRegression = true;
						regressionFound = true;
					} else {
						flag = ` (likely noise — p=${ttest.pValue}, not significant)`;
						inconclusiveFound = true;
					}
				} else if (ttest && change > 0 && ttest.significant && ttest.confidence === 'high') {
					// Below threshold but still a real (significant) increase — surface it.
					flag = ` (significant increase, p=${ttest.pValue})`;
				}
				// Fix: separate baseline and current medians with an arrow —
				// previously they were concatenated (e.g. "120ms135ms").
				diffs.push(`    ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct})${flag}`);
			}
			for (const [metric, group, unit] of infoMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }
				// Guard against a zero baseline median (e.g. no heap growth).
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;
				// Fix: arrow separator between baseline and current values (see above).
				diffs.push(`    ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct}) [info]`);
			}
			console.log(`  ${scenario}: ${scenarioRegression ? 'FAIL' : 'OK'}`);
			if (scenarioRegression) { regressedScenarios.add(scenario); }
			diffs.forEach(d => console.log(d));
		}
		console.log('');
		console.log(regressionFound
			? `[chat-simulation] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold with statistical significance`
			: `[chat-simulation] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline (or not statistically significant)`);
		if (inconclusiveFound && !regressionFound) {
			// Find the results.json path to suggest in the hint
			const resultsPath = Object.keys(jsonReport.scenarios).length > 0
				? (jsonReport._resultsPath || opts.resume || 'path/to/results.json')
				: 'path/to/results.json';
			// Estimate required runs from the observed effect size and variance
			// using power analysis for Welch's t-test (alpha=0.05, 80% power).
			// n_per_group = 2 * ((z_alpha/2 + z_beta) / d)^2 where d = Cohen's d
			let maxNeeded = 0;
			for (const scenario of Object.keys(jsonReport.scenarios)) {
				const current = jsonReport.scenarios[scenario];
				const base = baseline.scenarios?.[scenario];
				if (!base) { continue; }
				for (const [metric, group] of [['timeToFirstToken', 'timing'], ['timeToComplete', 'timing'], ['layoutCount', 'rendering'], ['recalcStyleCount', 'rendering']]) {
					const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
					const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
					// Need at least 2 samples per side for a sample variance.
					if (curRaw.length < 2 || basRaw.length < 2) { continue; }
					const meanA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / basRaw.length;
					const meanB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / curRaw.length;
					const varA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanA) ** 2, 0) / (basRaw.length - 1);
					const varB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanB) ** 2, 0) / (curRaw.length - 1);
					const pooledSD = Math.sqrt((varA + varB) / 2);
					if (pooledSD === 0) { continue; }
					// Cohen's d — standardized effect size.
					const d = Math.abs(meanB - meanA) / pooledSD;
					if (d === 0) { continue; }
					// z_0.025 = 1.96, z_0.2 = 0.842
					const nPerGroup = Math.ceil(2 * ((1.96 + 0.842) / d) ** 2);
					const currentN = Math.min(curRaw.length, basRaw.length);
					maxNeeded = Math.max(maxNeeded, nPerGroup - currentN);
				}
			}
			// Clamp the suggestion to a sane 1..20 range.
			const suggestedRuns = Math.max(1, Math.min(maxNeeded, 20));
			console.log('');
			console.log('[chat-simulation] Some metrics exceeded the threshold but were not statistically significant.');
			console.log('[chat-simulation] To increase confidence, add more runs with --resume:');
			console.log(`[chat-simulation]   npm run perf:chat -- --resume ${resultsPath} --runs ${suggestedRuns}`);
		}
	}
	// -- CI summary ------------------------------------------------------
	if (opts.ci) {
		const ciBaseline = opts.baseline && fs.existsSync(opts.baseline)
			? JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'))
			: null;
		const summary = generateCISummary(jsonReport, ciBaseline, {
			threshold: opts.threshold,
			metricThresholds: opts.metricThresholds,
			runs: jsonReport.runsPerScenario || opts.runs,
			baselineBuild: ciBaseline?.baselineBuildVersion || opts.baselineBuild,
			build: opts.build,
		});
		// Write to file for GitHub Actions $GITHUB_STEP_SUMMARY
		const summaryPath = path.join(DATA_DIR, 'ci-summary.md');
		fs.writeFileSync(summaryPath, summary);
		console.log(`[chat-simulation] CI summary written to ${summaryPath}`);
		// Also print the full summary table to stdout
		console.log('');
		console.log('==================================================================');
		console.log('                CHAT PERF COMPARISON RESULTS                      ');
		console.log('==================================================================');
		console.log('');
		console.log(summary);
	}
	if (regressionFound) { process.exit(1); }
	return regressedScenarios;
}
main().catch(err => { console.error(err); process.exit(1); });