// Mirror of https://github.com/microsoft/vscode.git (synced 2026-05-15 12:51:00 +01:00; 1936 lines, 84 KiB, JavaScript).
/*---------------------------------------------------------------------------------------------
|
|
* Copyright (c) Microsoft Corporation. All rights reserved.
|
|
* Licensed under the MIT License. See License.txt in the project root for license information.
|
|
*--------------------------------------------------------------------------------------------*/
|
|
|
|
// @ts-check
|
|
|
|
/**
|
|
* Chat performance benchmark.
|
|
*
|
|
* Uses the real copilot extension with IS_SCENARIO_AUTOMATION=1 and a local
|
|
* mock LLM server. Measures the full stack: prompt building, context
|
|
* gathering, tool resolution, rendering, GC, and layout overhead.
|
|
*
|
|
* Usage:
|
|
* npm run perf:chat # all scenarios vs 1.115.0
|
|
* npm run perf:chat -- --runs 10 # 10 runs per scenario
|
|
* npm run perf:chat -- --scenario text-only # single scenario
|
|
* npm run perf:chat -- --no-baseline # skip baseline comparison
|
|
* npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0
|
|
* npm run perf:chat -- --resume .chat-simulation-data/2026-04-14/results.json --runs 3
|
|
*/
|
|
|
|
const path = require('path');
|
|
const fs = require('fs');
|
|
const {
|
|
ROOT, DATA_DIR, METRIC_DEFS, loadConfig,
|
|
resolveBuild, isVersionString, buildEnv, buildArgs, prepareRunDir,
|
|
robustStats, welchTTest, summarize, markDuration, launchVSCode,
|
|
getNextExtHostInspectPort, connectToExtHostInspector, getRepoRoot,
|
|
} = require('./common/utils');
|
|
const { getUserTurns, getScenarioIds } = require('./common/mock-llm-server');
|
|
const { registerPerfScenarios, getScenarioDescription } = require('./common/perf-scenarios');
|
|
|
|
// -- Config (edit config.jsonc to change defaults) ---------------------------
|
|
|
|
// Defaults obtained from loadConfig('perfRegression'). parseArgs() seeds its
// option defaults from the runsPerScenario, baselineBuild, regressionThreshold
// and metricThresholds fields of this object.
const CONFIG = loadConfig('perfRegression');
|
|
|
|
// -- CLI args ----------------------------------------------------------------
|
|
|
|
/**
 * Parse `process.argv` into the benchmark options object.
 *
 * Defaults come from `CONFIG` where a matching field exists. Flags that take a
 * value report an error and exit with code 1 when the value is missing or
 * (for `--runs` / `--threshold`) is not a usable number. `--help` prints the
 * usage text and exits with code 0. Unrecognised arguments are ignored.
 *
 * When no `--scenario` is given, every id from `getScenarioIds()` is selected;
 * otherwise each requested id must be one of those ids.
 *
 * @returns the populated options object
 */
function parseArgs() {
	const args = process.argv.slice(2);
	const opts = {
		runs: CONFIG.runsPerScenario ?? 5,
		verbose: false,
		ci: false,
		noCache: false,
		force: false,
		heapSnapshots: false,
		/** @type {string[]} */
		scenarios: [],
		/** @type {string | undefined} */
		build: undefined,
		/** @type {string | undefined} */
		baseline: undefined,
		/** @type {string | undefined} */
		baselineBuild: CONFIG.baselineBuild ?? '1.115.0',
		saveBaseline: false,
		threshold: CONFIG.regressionThreshold ?? 0.2,
		/** @type {Record<string, number | string>} */
		metricThresholds: CONFIG.metricThresholds ?? {},
		/** @type {string | undefined} */
		resume: undefined,
		productionBuild: false,
		/** @type {Record<string, any>} */
		settingsOverrides: {},
		/** @type {Record<string, any>} */
		testSettingsOverrides: {},
		/** @type {Record<string, any>} */
		baselineSettingsOverrides: {},
		cleanupDiagnostics: false,
	};

	/**
	 * Report a usage error and terminate.
	 * @param {string} message
	 * @returns {never}
	 */
	const fail = (message) => {
		console.error(message);
		process.exit(1);
	};

	let i = 0;

	/**
	 * Consume and return the argument following the current flag.
	 * Fails instead of handing `undefined` to the caller when the flag is the
	 * last argument on the command line.
	 * @param {string} flag
	 * @returns {string}
	 */
	const nextValue = (flag) => {
		const value = args[++i];
		if (value === undefined) {
			return fail(`${flag} requires a value`);
		}
		return value;
	};

	for (; i < args.length; i++) {
		const flag = args[i];
		switch (flag) {
			case '--runs': {
				const rawRuns = nextValue(flag);
				const runs = Number.parseInt(rawRuns, 10);
				// NaN or a negative count would make the run loops meaningless.
				if (Number.isNaN(runs) || runs < 0) {
					fail(`--runs requires a non-negative integer, got: ${rawRuns}`);
				}
				opts.runs = runs;
				break;
			}
			case '--verbose': opts.verbose = true; break;
			case '--scenario': case '-s': opts.scenarios.push(nextValue(flag)); break;
			case '--build': case '-b': opts.build = nextValue(flag); break;
			case '--baseline': opts.baseline = nextValue(flag); break;
			case '--baseline-build': opts.baselineBuild = nextValue(flag); break;
			case '--no-baseline': opts.baselineBuild = undefined; break;
			case '--save-baseline': opts.saveBaseline = true; break;
			case '--threshold': {
				const rawThreshold = nextValue(flag);
				const threshold = Number.parseFloat(rawThreshold);
				// A NaN threshold makes every comparison false, hiding regressions.
				if (Number.isNaN(threshold)) {
					fail(`--threshold requires a number, got: ${rawThreshold}`);
				}
				opts.threshold = threshold;
				break;
			}
			case '--resume': opts.resume = nextValue(flag); break;
			case '--production-build': opts.productionBuild = true; break;
			case '--setting': case '--test-setting': case '--baseline-setting': {
				const kv = nextValue(flag);
				const eq = kv.indexOf('=');
				// Reject both a missing '=' and an empty key ("=value").
				if (eq <= 0) {
					fail(`${flag} requires key=value, got: ${kv}`);
				}
				const key = kv.slice(0, eq);
				const raw = kv.slice(eq + 1);
				// Parse booleans and numbers, keep rest as strings
				const val = raw === 'true' ? true : raw === 'false' ? false : /^-?\d+(\.\d+)?$/.test(raw) ? Number(raw) : raw;
				if (flag === '--test-setting') {
					opts.testSettingsOverrides[key] = val;
				} else if (flag === '--baseline-setting') {
					opts.baselineSettingsOverrides[key] = val;
				} else {
					opts.settingsOverrides[key] = val;
				}
				break;
			}
			case '--no-cache': opts.noCache = true; break;
			case '--force': opts.force = true; break;
			case '--heap-snapshots': opts.heapSnapshots = true; break;
			case '--ci': opts.ci = true; opts.noCache = true; opts.heapSnapshots = true; opts.cleanupDiagnostics = true; break;
			case '--cleanup-diagnostics': opts.cleanupDiagnostics = true; break;
			case '--help': case '-h':
				console.log([
					'Chat performance benchmark',
					'',
					'Options:',
					' --runs <n> Number of runs per scenario (default: 5)',
					' --scenario <id> Scenario to run (repeatable; default: all)',
					' --build <path|ver> Path to VS Code build, or a version to download',
					' (e.g. "1.110.0", "insiders", commit hash, or local path)',
					' --baseline <path> Compare against a baseline JSON file',
					' --baseline-build <v> Version or path to benchmark as baseline',
					' (e.g. "1.115.0", "insiders", commit hash, or local path)',
					' --no-baseline Skip baseline comparison entirely',
					' --save-baseline Save results as the new baseline (requires --baseline <path>)',
					' --resume <path> Resume a previous run, adding more iterations to increase',
					' confidence. Merges new runs with existing rawRuns data',
					' --threshold <frac> Regression threshold fraction (default: 0.2 = 20%)',
					' --production-build Build a local bundled package (via gulp vscode) for',
					' apples-to-apples comparison against a release baseline',
					' --setting <k=v> Set a VS Code setting override for all builds (repeatable)',
					' --test-setting <k=v> Set a VS Code setting override for test build only',
					' --baseline-setting <k=v> Set a VS Code setting override for baseline build only',
					' e.g. --setting chat.experimental.incrementalRendering.enabled=true',
					' --no-cache Ignore cached baseline data, always run fresh',
					' --force Skip build mode mismatch confirmation',
					' --heap-snapshots Take heap snapshots (slow; auto-enabled in --ci mode)',
					' --ci CI mode: write Markdown summary to ci-summary.md (implies --no-cache, --heap-snapshots, --cleanup-diagnostics)',
					' --cleanup-diagnostics Remove heap snapshots, CPU profiles, and traces after each run to save disk space',
					' --verbose Print per-run details',
					'',
					'Scenarios: ' + getScenarioIds().join(', '),
				].join('\n'));
				process.exit(0);
		}
	}

	if (opts.scenarios.length === 0) {
		opts.scenarios = getScenarioIds();
	} else {
		const knownIds = new Set(getScenarioIds());
		const unknown = opts.scenarios.filter(s => !knownIds.has(s));
		if (unknown.length > 0) {
			fail(`Unknown scenario(s): ${unknown.join(', ')}\nAvailable: ${[...knownIds].join(', ')}`);
		}
	}
	return opts;
}
|
|
|
|
// -- Build mode detection ----------------------------------------------------
|
|
|
|
/**
 * Classify an electron path into a build mode using substring checks.
 *
 * A path containing `.vscode-test` is a `'release'` build, a path containing
 * `VSCode-` is a `'production'` build, and anything else is `'dev'`. The
 * `.vscode-test` check runs first, so it wins when both markers are present.
 *
 * @param {string} electronPath
 * @returns {'dev' | 'production' | 'release'}
 */
function detectBuildMode(electronPath) {
	if (electronPath.includes('.vscode-test')) {
		return 'release';
	}
	if (electronPath.includes('VSCode-')) {
		return 'production';
	}
	return 'dev';
}
|
|
|
|
/**
 * Return a human-readable label for a build mode.
 *
 * An unrecognised mode is returned as its own string form, so the result is
 * always a string and can be interpolated into messages safely.
 *
 * @param {'dev' | 'production' | 'release'} mode
 * @returns {string}
 */
function buildModeLabel(mode) {
	switch (mode) {
		case 'dev': return 'development (unbundled)';
		case 'production': return 'production (bundled, local)';
		case 'release': return 'release (bundled, downloaded)';
		default: return String(mode);
	}
}
|
|
|
|
// -- Production build --------------------------------------------------------
|
|
|
|
/**
 * Build a local production (bundled) VS Code package using `gulp vscode`.
 * Returns the path to the Electron executable in the packaged output.
 *
 * The gulp task compiles TypeScript, bundles JS, and packages with Electron
 * into `../VSCode-<platform>-<arch>/`. This is the same process used for
 * release builds, minus minification and mangling.
 *
 * A failing gulp run is tolerated as long as the executable exists afterwards;
 * if it does not, the process exits with code 1. When `product.overrides.json`
 * exists in the repo root, its top-level keys are merged over the packaged
 * `product.json`.
 *
 * @returns {string} absolute path of the packaged Electron executable
 */
function buildProductionBuild() {
	const product = require(path.join(ROOT, 'product.json'));
	const platform = process.platform;
	const arch = process.arch;
	const destDir = path.join(ROOT, '..', `VSCode-${platform}-${arch}`);
	// Only meaningful on macOS, where everything lives inside the .app bundle.
	const macAppBundle = path.join(destDir, `${product.nameLong}.app`);

	console.log('[chat-simulation] Building local production package (gulp vscode)...');
	console.log('[chat-simulation] This may take a few minutes on the first run.');

	const { execSync } = require('child_process');
	try {
		execSync('npm run gulp -- vscode', {
			cwd: ROOT,
			stdio: 'inherit',
			timeout: 10 * 60 * 1000, // 10 minute timeout
		});
	} catch (e) {
		// The copilot shim step may fail locally when the copilot SDK is not
		// fully packaged (it is normally supplied via CI). As long as the
		// Electron executable was produced we can still benchmark.
		console.warn('[chat-simulation] gulp vscode exited with errors (see above). Checking if executable was still produced...');
		// Surface the reason too: a timeout or spawn failure prints nothing "above".
		console.warn(`[chat-simulation] Build error: ${e instanceof Error ? e.message : String(e)}`);
	}

	/** @type {string} */
	let electronPath;
	if (platform === 'darwin') {
		electronPath = path.join(macAppBundle, 'Contents', 'MacOS', product.nameShort);
	} else if (platform === 'linux') {
		electronPath = path.join(destDir, product.applicationName);
	} else {
		electronPath = path.join(destDir, `${product.nameShort}.exe`);
	}

	if (!fs.existsSync(electronPath)) {
		console.error(`[chat-simulation] Production build failed — executable not found at: ${electronPath}`);
		process.exit(1);
	}

	// Merge product.overrides.json into the packaged product.json.
	// The overrides file contains extensionsGallery and other config that
	// the OSS product.json lacks. In dev builds these are loaded at
	// runtime when VSCODE_DEV is set, but the production build doesn't
	// set that flag so we bake them in.
	const overridesPath = path.join(ROOT, 'product.overrides.json');
	if (fs.existsSync(overridesPath)) {
		const appDir = platform === 'darwin'
			? path.join(macAppBundle, 'Contents', 'Resources', 'app')
			: path.join(destDir, 'resources', 'app');
		const packagedProductPath = path.join(appDir, 'product.json');
		if (fs.existsSync(packagedProductPath)) {
			const packagedProduct = JSON.parse(fs.readFileSync(packagedProductPath, 'utf-8'));
			const overrides = JSON.parse(fs.readFileSync(overridesPath, 'utf-8'));
			// Shallow merge: an override replaces the whole top-level key.
			const merged = Object.assign(packagedProduct, overrides);
			fs.writeFileSync(packagedProductPath, JSON.stringify(merged, null, '\t'));
			console.log('[chat-simulation] Merged product.overrides.json into packaged product.json');
		}
	}

	console.log(`[chat-simulation] Production build ready: ${electronPath}`);
	return electronPath;
}
|
|
|
|
/**
|
|
* @typedef {{ type: 'fraction', value: number } | { type: 'absolute', value: number }} MetricThreshold
|
|
*/
|
|
|
|
/**
 * Parse a metric threshold value from config.
 * - A number is treated as a fraction (e.g. 0.2 = 20%).
 * - A string like "100ms" or "5" is treated as an absolute delta; the leading
 *   numeric part is used and any unit suffix is ignored.
 *
 * @param {number | string} raw
 * @returns {MetricThreshold}
 * @throws {Error} when `raw` is NaN or a string with no leading number
 */
function parseMetricThreshold(raw) {
	if (typeof raw === 'number') {
		// A NaN threshold makes every comparison false, silently disabling the check.
		if (Number.isNaN(raw)) {
			throw new Error(`Invalid metric threshold: ${raw}`);
		}
		return { type: 'fraction', value: raw };
	}
	// Strip unit suffix (ms, MB, etc.) and parse the number
	const num = Number.parseFloat(raw);
	if (Number.isNaN(num)) {
		throw new Error(`Invalid metric threshold: ${raw}`);
	}
	return { type: 'absolute', value: num };
}
|
|
|
|
/**
 * Get the regression threshold for a specific metric.
 * Uses the per-metric override from `opts.metricThresholds` if one is defined,
 * otherwise the global `opts.threshold` as a fraction.
 *
 * Only the overrides object's own keys are consulted, so a metric name that
 * happens to match an inherited property (e.g. `toString`) falls back to the
 * global threshold instead of being parsed as an override.
 *
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string> }} opts
 * @param {string} metric
 * @returns {MetricThreshold}
 */
function getMetricThreshold(opts, metric) {
	const overrides = opts.metricThresholds;
	if (overrides && Object.prototype.hasOwnProperty.call(overrides, metric)) {
		const raw = overrides[metric];
		if (raw !== undefined) {
			return parseMetricThreshold(raw);
		}
	}
	return { type: 'fraction', value: opts.threshold };
}
|
|
|
|
/**
 * Check whether a change exceeds the threshold.
 *
 * An `'absolute'` threshold is compared against `absoluteDelta`; any other
 * threshold type is compared against the fractional `change`. Both comparisons
 * are strict, so a value exactly equal to the threshold does not exceed it.
 *
 * @param {MetricThreshold} threshold
 * @param {number} change - fractional change (e.g. 0.5 = 50% increase)
 * @param {number} absoluteDelta - absolute difference (cur.median - bas.median)
 * @returns {boolean}
 */
function exceedsThreshold(threshold, change, absoluteDelta) {
	if (threshold.type === 'absolute') {
		return absoluteDelta > threshold.value;
	}
	return change > threshold.value;
}
|
|
|
|
// -- Metrics -----------------------------------------------------------------
|
|
|
|
/**
|
|
* @typedef {{
|
|
* timeToUIUpdated: number,
|
|
* timeToFirstToken: number,
|
|
* timeToComplete: number,
|
|
* timeToRenderComplete: number,
|
|
* instructionCollectionTime: number,
|
|
* agentInvokeTime: number,
|
|
* heapUsedBefore: number,
|
|
* heapUsedAfter: number,
|
|
* heapDelta: number,
|
|
* heapDeltaPostGC: number,
|
|
* majorGCs: number,
|
|
* minorGCs: number,
|
|
* gcDurationMs: number,
|
|
* layoutCount: number,
|
|
* layoutDurationMs: number,
|
|
* recalcStyleCount: number,
|
|
* forcedReflowCount: number,
|
|
* longTaskCount: number,
|
|
* longAnimationFrameCount: number,
|
|
* longAnimationFrameTotalMs: number,
|
|
* frameCount: number,
|
|
* compositeLayers: number,
|
|
* paintCount: number,
|
|
* hasInternalMarks: boolean,
|
|
* responseHasContent: boolean,
|
|
* internalFirstToken: number,
|
|
* profilePath: string,
|
|
* tracePath: string,
|
|
* snapshotPath: string,
|
|
* extHostHeapUsedBefore: number,
|
|
* extHostHeapUsedAfter: number,
|
|
* extHostHeapDelta: number,
|
|
* extHostHeapDeltaPostGC: number,
|
|
* extHostProfilePath: string,
|
|
* extHostSnapshotPath: string,
|
|
* }} RunMetrics
|
|
*/
|
|
|
|
// -- Single run --------------------------------------------------------------
|
|
|
|
/**
|
|
* @param {string} electronPath
|
|
* @param {string} scenario
|
|
* @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise<void>, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise<void> }} mockServer
|
|
* @param {boolean} verbose
|
|
* @param {string} runIndex
|
|
* @param {string} runDir - timestamped run directory for diagnostics
|
|
* @param {'baseline' | 'test'} role - whether this is a baseline or test run
|
|
* @param {Record<string, any>} [settingsOverrides] - custom VS Code settings
|
|
* @param {{ heapSnapshots?: boolean }} [runOpts] - additional run options
|
|
* @returns {Promise<RunMetrics>}
|
|
*/
|
|
async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, runDir, role, settingsOverrides, runOpts) {
|
|
const takeHeapSnapshots = runOpts?.heapSnapshots ?? false;
|
|
const { userDataDir, extDir, logsDir } = prepareRunDir(runIndex, mockServer, settingsOverrides);
|
|
const isDevBuild = !electronPath.includes('.vscode-test') && !electronPath.includes('VSCode-');
|
|
// Extract a clean build label from the path.
|
|
// Dev: .build/electron/Code - OSS.app/.../Code - OSS → "dev"
|
|
// Stable: .vscode-test/vscode-darwin-arm64-1.115.0/Visual Studio Code.app/.../Electron → "1.115.0"
|
|
// Production: ../VSCode-darwin-arm64/Code - OSS.app/.../Code - OSS → "production"
|
|
let buildLabel = 'dev';
|
|
if (!isDevBuild) {
|
|
const vscodeTestMatch = electronPath.match(/vscode-test\/vscode-[^/]*?-(\d+\.\d+\.\d+)/);
|
|
if (vscodeTestMatch) {
|
|
buildLabel = vscodeTestMatch[1];
|
|
} else if (electronPath.includes('VSCode-')) {
|
|
buildLabel = 'production';
|
|
} else {
|
|
buildLabel = path.basename(electronPath);
|
|
}
|
|
}
|
|
|
|
// For dev builds from a different repo, derive the repo root from the
|
|
// electron path so that the build loads its own out/ source code.
|
|
const appRoot = isDevBuild ? (getRepoRoot(electronPath) || ROOT) : ROOT;
|
|
if (isDevBuild && appRoot !== ROOT) {
|
|
if (verbose) {
|
|
console.log(` [debug] Using appRoot from electron path: ${appRoot}`);
|
|
}
|
|
}
|
|
|
|
// Create a per-run diagnostics directory: <runDir>/<role>-<build>/<scenario>-<i>/
|
|
const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, ''));
|
|
fs.mkdirSync(runDiagDir, { recursive: true });
|
|
|
|
const tracePath = path.join(runDiagDir, 'trace.json');
|
|
const extHostInspectPort = getNextExtHostInspectPort();
|
|
const vscode = await launchVSCode(
|
|
electronPath,
|
|
buildArgs(userDataDir, extDir, logsDir, { isDevBuild, extHostInspectPort, traceFile: tracePath, appRoot }),
|
|
buildEnv(mockServer, { isDevBuild }),
|
|
{ verbose },
|
|
);
|
|
activeVSCode = vscode;
|
|
const window = vscode.page;
|
|
|
|
// Declared outside try so the finally block can clean up
|
|
/** @type {{ send: (method: string, params?: any) => Promise<any>, on: (event: string, listener: (params: any) => void) => void, close: () => void } | null} */
|
|
let extHostInspector = null;
|
|
/** @type {{ usedSize: number, totalSize: number } | null} */
|
|
let extHostHeapBefore = null;
|
|
/** @type {Omit<RunMetrics, 'majorGCs' | 'minorGCs' | 'gcDurationMs' | 'longTaskCount' | 'longAnimationFrameCount' | 'longAnimationFrameTotalMs' | 'timeToUIUpdated' | 'timeToFirstToken' | 'timeToComplete' | 'timeToRenderComplete' | 'layoutDurationMs' | 'instructionCollectionTime' | 'agentInvokeTime' | 'hasInternalMarks' | 'internalFirstToken'> | null} */
|
|
let partialMetrics = null;
|
|
// Timing vars hoisted for access in post-close trace parsing
|
|
let submitTime = 0;
|
|
let firstResponseTime = 0;
|
|
let responseCompleteTime = 0;
|
|
let renderCompleteTime = 0;
|
|
|
|
try {
|
|
await window.waitForSelector('.monaco-workbench', { timeout: 60_000 });
|
|
|
|
const cdp = await window.context().newCDPSession(window);
|
|
await cdp.send('Performance.enable');
|
|
const heapBefore = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
|
|
|
|
const metricsBefore = await cdp.send('Performance.getMetrics');
|
|
|
|
// Open chat
|
|
const chatShortcut = process.platform === 'darwin' ? 'Control+Meta+KeyI' : 'Control+Alt+KeyI';
|
|
await window.keyboard.press(chatShortcut);
|
|
|
|
const CHAT_VIEW = 'div[id="workbench.panel.chat"]';
|
|
const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`;
|
|
|
|
await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 });
|
|
await window.waitForFunction(
|
|
(selector) => Array.from(document.querySelectorAll(selector)).some(el => {
|
|
const rect = el.getBoundingClientRect();
|
|
return rect.width > 0 && rect.height > 0;
|
|
}),
|
|
chatEditorSel, { timeout: 15_000 },
|
|
);
|
|
|
|
// Dismiss dialogs
|
|
const dismissDialog = async () => {
|
|
for (const sel of ['.chat-setup-dialog', '.dialog-shadow', '.monaco-dialog-box']) {
|
|
const el = await window.$(sel);
|
|
if (el) { await window.keyboard.press('Escape'); await new Promise(r => setTimeout(r, 500)); break; }
|
|
}
|
|
};
|
|
await dismissDialog();
|
|
|
|
// Wait for extension activation
|
|
const reqsBefore = mockServer.requestCount();
|
|
try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { }
|
|
if (verbose) {
|
|
console.log(` [debug] Extension active (${mockServer.requestCount() - reqsBefore} new requests)`);
|
|
}
|
|
|
|
// Connect to extension host inspector for profiling/heap data
|
|
try {
|
|
extHostInspector = await connectToExtHostInspector(extHostInspectPort, { verbose, timeoutMs: 15_000 });
|
|
await extHostInspector.send('HeapProfiler.enable');
|
|
await extHostInspector.send('Profiler.enable');
|
|
await extHostInspector.send('Profiler.start');
|
|
extHostHeapBefore = await extHostInspector.send('Runtime.getHeapUsage');
|
|
if (verbose && extHostHeapBefore) {
|
|
console.log(` [ext-host] Heap before: ${Math.round(extHostHeapBefore.usedSize / 1024 / 1024)}MB`);
|
|
}
|
|
} catch (err) {
|
|
if (verbose) {
|
|
console.log(` [ext-host] Could not connect to inspector: ${err}`);
|
|
}
|
|
}
|
|
|
|
// Wait for model resolution
|
|
await new Promise(r => setTimeout(r, 3000));
|
|
await dismissDialog();
|
|
|
|
// Focus input
|
|
await window.click(chatEditorSel);
|
|
const focusStart = Date.now();
|
|
while (Date.now() - focusStart < 5_000) {
|
|
const focused = await window.evaluate((sel) => {
|
|
const el = document.querySelector(sel);
|
|
return el && (el.classList.contains('focused') || el.contains(document.activeElement));
|
|
}, chatEditorSel).catch(() => false);
|
|
if (focused) { break; }
|
|
await new Promise(r => setTimeout(r, 50));
|
|
}
|
|
|
|
// Type message — use the smoke-test driver's typeInEditor when available
|
|
// (dev builds), fall back to pressSequentially for stable/insiders builds.
|
|
const chatMessage = `[scenario:${scenario}] Explain how this code works`;
|
|
const actualInputSelector = await window.evaluate((editorSel) => {
|
|
const editor = document.querySelector(editorSel);
|
|
if (!editor) { throw new Error('Chat editor not found'); }
|
|
return editor.querySelector('.native-edit-context') ? editorSel + ' .native-edit-context' : editorSel + ' textarea';
|
|
}, chatEditorSel);
|
|
|
|
const hasDriver = await window.evaluate(() =>
|
|
// @ts-ignore
|
|
!!globalThis.driver?.typeInEditor
|
|
).catch(() => false);
|
|
|
|
if (hasDriver) {
|
|
await window.evaluate(({ selector, text }) => {
|
|
// @ts-ignore
|
|
return globalThis.driver.typeInEditor(selector, text);
|
|
}, { selector: actualInputSelector, text: chatMessage });
|
|
} else {
|
|
// Fallback: click the input element and use pressSequentially
|
|
await window.click(actualInputSelector);
|
|
await new Promise(r => setTimeout(r, 200));
|
|
await window.locator(actualInputSelector).pressSequentially(chatMessage, { delay: 0 });
|
|
}
|
|
|
|
// Start CPU profiler to capture call stacks during the interaction
|
|
await cdp.send('Profiler.enable');
|
|
await cdp.send('Profiler.start');
|
|
|
|
// Submit
|
|
const completionsBefore = mockServer.completionCount();
|
|
submitTime = Date.now();
|
|
await window.keyboard.press('Enter');
|
|
|
|
// Wait for mock server to serve the response
|
|
try { await mockServer.waitForCompletion(completionsBefore + 1, 60_000); } catch { }
|
|
firstResponseTime = Date.now();
|
|
|
|
// Wait for DOM response to settle
|
|
await dismissDialog();
|
|
const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`;
|
|
await window.waitForFunction(
|
|
(sel) => {
|
|
const responses = document.querySelectorAll(sel);
|
|
if (responses.length === 0) { return false; }
|
|
return !responses[responses.length - 1].classList.contains('chat-response-loading');
|
|
},
|
|
responseSelector, { timeout: 30_000 },
|
|
);
|
|
responseCompleteTime = Date.now();
|
|
|
|
// -- User turn injection loop -----------------------------------------
|
|
// For multi-turn scenarios with user follow-ups, type each follow-up
|
|
// message and wait for the model's response to settle.
|
|
const userTurns = getUserTurns(scenario);
|
|
for (let ut = 0; ut < userTurns.length; ut++) {
|
|
const userTurn = userTurns[ut];
|
|
if (verbose) {
|
|
console.log(` [debug] User follow-up ${ut + 1}/${userTurns.length}: "${userTurn.message}"`);
|
|
}
|
|
|
|
// Brief pause to let the UI settle between turns
|
|
await new Promise(r => setTimeout(r, 500));
|
|
|
|
// Focus the chat input
|
|
await window.click(chatEditorSel);
|
|
const utFocusStart = Date.now();
|
|
while (Date.now() - utFocusStart < 3_000) {
|
|
const focused = await window.evaluate((sel) => {
|
|
const el = document.querySelector(sel);
|
|
return el && (el.classList.contains('focused') || el.contains(document.activeElement));
|
|
}, chatEditorSel).catch(() => false);
|
|
if (focused) { break; }
|
|
await new Promise(r => setTimeout(r, 50));
|
|
}
|
|
|
|
// Type the follow-up message
|
|
if (hasDriver) {
|
|
await window.evaluate(({ selector, text }) => {
|
|
// @ts-ignore
|
|
return globalThis.driver.typeInEditor(selector, text);
|
|
}, { selector: actualInputSelector, text: userTurn.message });
|
|
} else {
|
|
await window.click(actualInputSelector);
|
|
await new Promise(r => setTimeout(r, 200));
|
|
await window.locator(actualInputSelector).pressSequentially(userTurn.message, { delay: 0 });
|
|
}
|
|
|
|
// Submit follow-up
|
|
const utCompBefore = mockServer.completionCount();
|
|
await window.keyboard.press('Enter');
|
|
|
|
// Wait for mock server to serve the response for this turn
|
|
try { await mockServer.waitForCompletion(utCompBefore + 1, 60_000); } catch { }
|
|
|
|
// Wait for the new response to finish rendering.
|
|
// The chat list is virtualized — old response elements are
|
|
// recycled out of the DOM as new ones appear, so we cannot
|
|
// rely on counting DOM elements. Instead, scroll to the
|
|
// bottom and wait for no response to be in loading state.
|
|
await dismissDialog();
|
|
await window.evaluate((chatViewSel) => {
|
|
const input = document.querySelector(chatViewSel + ' .interactive-input-part');
|
|
if (input) { input.scrollIntoView({ block: 'end' }); }
|
|
}, CHAT_VIEW);
|
|
await new Promise(r => setTimeout(r, 200));
|
|
|
|
await window.waitForFunction(
|
|
(sel) => {
|
|
const responses = document.querySelectorAll(sel);
|
|
if (responses.length === 0) { return false; }
|
|
return !responses[responses.length - 1].classList.contains('chat-response-loading');
|
|
},
|
|
responseSelector,
|
|
{ timeout: 30_000 },
|
|
);
|
|
responseCompleteTime = Date.now();
|
|
|
|
if (verbose) {
|
|
const utResponseInfo = await window.evaluate((sel) => {
|
|
const responses = document.querySelectorAll(sel);
|
|
const last = responses[responses.length - 1];
|
|
return last ? (last.textContent || '').substring(0, 150) : '(empty)';
|
|
}, responseSelector);
|
|
console.log(` [debug] Follow-up response (first 150 chars): ${utResponseInfo}`);
|
|
}
|
|
}
|
|
|
|
// Stop CPU profiler and save the profile
|
|
const { profile } = /** @type {any} */ (await cdp.send('Profiler.stop'));
|
|
const profilePath = path.join(runDiagDir, 'profile.cpuprofile');
|
|
fs.writeFileSync(profilePath, JSON.stringify(profile));
|
|
if (verbose) {
|
|
console.log(` [debug] CPU profile saved to ${profilePath}`);
|
|
}
|
|
|
|
const responseInfo = await window.evaluate((sel) => {
|
|
const responses = document.querySelectorAll(sel);
|
|
const last = responses[responses.length - 1];
|
|
if (!last) { return { hasContent: false, text: '' }; }
|
|
const text = last.textContent || '';
|
|
return { hasContent: text.trim().length > 0, text: text.substring(0, 200) };
|
|
}, responseSelector);
|
|
|
|
if (verbose) {
|
|
console.log(` [debug] Response content (first 200 chars): ${responseInfo.text}`);
|
|
console.log(` [debug] Client-side timing: firstResponse=${firstResponseTime - submitTime}ms, complete=${responseCompleteTime - submitTime}ms`);
|
|
}
|
|
|
|
// Wait for the typewriter animation to finish rendering.
|
|
// The chat UI animates streamed content word-by-word after the
|
|
// response stream completes. We need to wait until all content
|
|
// is rendered before capturing layout/style metrics, otherwise
|
|
// we miss the rendering phase where batching optimizations matter.
|
|
await window.waitForFunction(
|
|
(sel) => {
|
|
const responses = document.querySelectorAll(sel);
|
|
const last = responses[responses.length - 1];
|
|
if (!last) { return true; }
|
|
// The typewriter animation is done when there are no
|
|
// elements with the 'typewriter' or 'animating' class,
|
|
// and no pending cursor animations.
|
|
const hasAnimating = last.querySelector('.chat-animated-word, .chat-typewriter-cursor');
|
|
return !hasAnimating;
|
|
},
|
|
responseSelector,
|
|
{ timeout: 30_000 },
|
|
).catch(() => {
|
|
// Fallback: if the selector-based check doesn't work (e.g.
|
|
// the CSS classes differ across versions), wait for content
|
|
// to stabilize by polling textContent.
|
|
});
|
|
|
|
// Additional stabilization: poll until textContent stops changing.
|
|
// This catches any remaining animation regardless of CSS class names.
|
|
{
|
|
let prev = '';
|
|
let stableCount = 0;
|
|
const stabilizeStart = Date.now();
|
|
while (stableCount < 3 && Date.now() - stabilizeStart < 10_000) {
|
|
const current = await window.evaluate((sel) => {
|
|
const responses = document.querySelectorAll(sel);
|
|
const last = responses[responses.length - 1];
|
|
return last ? (last.textContent || '') : '';
|
|
}, responseSelector).catch(() => '');
|
|
if (current === prev) {
|
|
stableCount++;
|
|
} else {
|
|
stableCount = 0;
|
|
prev = current;
|
|
}
|
|
await new Promise(r => setTimeout(r, 100));
|
|
}
|
|
}
|
|
renderCompleteTime = Date.now();
|
|
if (verbose) {
|
|
console.log(` [debug] Render stabilized: ${renderCompleteTime - responseCompleteTime}ms after stream complete`);
|
|
}
|
|
|
|
const heapAfter = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
|
|
const metricsAfter = await cdp.send('Performance.getMetrics');
|
|
|
|
// -- Extension host metrics (non-snapshot) ---------------------------
|
|
let extHostHeapUsedBefore = -1;
|
|
let extHostHeapUsedAfter = -1;
|
|
let extHostHeapDelta = -1;
|
|
let extHostHeapDeltaPostGC = -1;
|
|
let extHostProfilePath = '';
|
|
let extHostSnapshotPath = '';
|
|
if (extHostInspector && extHostHeapBefore) {
|
|
try {
|
|
extHostHeapUsedBefore = Math.round(extHostHeapBefore.usedSize / 1024 / 1024);
|
|
|
|
// Stop CPU profiler and save
|
|
const extProfile = await extHostInspector.send('Profiler.stop');
|
|
extHostProfilePath = path.join(runDiagDir, 'exthost-profile.cpuprofile');
|
|
fs.writeFileSync(extHostProfilePath, JSON.stringify(extProfile.profile));
|
|
if (verbose) {
|
|
console.log(` [ext-host] CPU profile saved to ${extHostProfilePath}`);
|
|
}
|
|
|
|
// Heap usage after interaction
|
|
const extHostHeapAfter = await extHostInspector.send('Runtime.getHeapUsage');
|
|
extHostHeapUsedAfter = Math.round(extHostHeapAfter.usedSize / 1024 / 1024);
|
|
extHostHeapDelta = extHostHeapUsedAfter - extHostHeapUsedBefore;
|
|
|
|
// Force GC and measure retained heap
|
|
try {
|
|
await extHostInspector.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });
|
|
await new Promise(r => setTimeout(r, 200));
|
|
const extHostHeapPostGC = await extHostInspector.send('Runtime.getHeapUsage');
|
|
extHostHeapDeltaPostGC = Math.round(extHostHeapPostGC.usedSize / 1024 / 1024) - extHostHeapUsedBefore;
|
|
} catch {
|
|
extHostHeapDeltaPostGC = -1;
|
|
}
|
|
|
|
if (verbose) {
|
|
console.log(` [ext-host] Heap: before=${extHostHeapUsedBefore}MB, after=${extHostHeapUsedAfter}MB, delta=${extHostHeapDelta}MB, deltaPostGC=${extHostHeapDeltaPostGC}MB`);
|
|
}
|
|
} catch (err) {
|
|
if (verbose) {
|
|
console.log(` [ext-host] Error collecting metrics: ${err}`);
|
|
}
|
|
}
|
|
}
|
|
|
|
// -- Heap snapshots (opt-in, parallelized) ---------------------------
|
|
let snapshotPath = '';
|
|
if (takeHeapSnapshots) {
|
|
const snapshotPromises = [];
|
|
|
|
// Renderer snapshot
|
|
snapshotPromises.push((async () => {
|
|
const p = path.join(runDiagDir, 'heap.heapsnapshot');
|
|
await cdp.send('HeapProfiler.enable');
|
|
const chunks = /** @type {string[]} */ ([]);
|
|
cdp.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {
|
|
chunks.push(params.chunk);
|
|
});
|
|
await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });
|
|
fs.writeFileSync(p, chunks.join(''));
|
|
return p;
|
|
})());
|
|
|
|
// Extension host snapshot (parallel with renderer)
|
|
if (extHostInspector && extHostHeapBefore) {
|
|
snapshotPromises.push((async () => {
|
|
const p = path.join(runDiagDir, 'exthost-heap.heapsnapshot');
|
|
const chunks = /** @type {string[]} */ ([]);
|
|
extHostInspector.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => {
|
|
chunks.push(params.chunk);
|
|
});
|
|
await extHostInspector.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false });
|
|
fs.writeFileSync(p, chunks.join(''));
|
|
return p;
|
|
})());
|
|
}
|
|
|
|
const snapshotResults = await Promise.all(snapshotPromises);
|
|
snapshotPath = snapshotResults[0];
|
|
if (snapshotResults.length > 1) {
|
|
extHostSnapshotPath = snapshotResults[1];
|
|
}
|
|
|
|
if (verbose) {
|
|
console.log(` [debug] Renderer snapshot saved to ${snapshotPath}`);
|
|
if (extHostSnapshotPath) {
|
|
console.log(` [ext-host] Snapshot saved to ${extHostSnapshotPath}`);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Close ext host inspector now that snapshots (if any) are done
|
|
if (extHostInspector) {
|
|
extHostInspector.close();
|
|
}
|
|
|
|
// Store partial metrics here so we can combine with trace data after close.
|
|
|
|
/**
 * Read one named value out of a metrics payload.
 *
 * @param {any} r Object that may carry a `metrics` array of `{ name, value }` entries.
 * @param {string} name Name of the entry to look up.
 * @returns {any} The `value` of the first entry whose `name` matches, or `0`
 *   when `r.metrics` is absent or no entry matches.
 */
function getMetric(r, name) {
	const match = r.metrics?.find((/** @type {any} */ { name: entryName }) => entryName === name);
	if (!match) {
		return 0;
	}
	return match.value;
}
|
|
|
|
partialMetrics = {
|
|
heapUsedBefore: Math.round(heapBefore.usedSize / 1024 / 1024),
|
|
heapUsedAfter: Math.round(heapAfter.usedSize / 1024 / 1024),
|
|
heapDelta: Math.round((heapAfter.usedSize - heapBefore.usedSize) / 1024 / 1024),
|
|
heapDeltaPostGC: await (async () => {
|
|
// Force a full GC then measure heap to get deterministic retained-memory delta.
|
|
// --js-flags=--expose-gc is not required: CDP's Runtime.evaluate can call gc()
|
|
// when includeCommandLineAPI is true.
|
|
try {
|
|
await cdp.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true });
|
|
await new Promise(r => setTimeout(r, 200));
|
|
const heapPostGC = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage'));
|
|
return Math.round((heapPostGC.usedSize - heapBefore.usedSize) / 1024 / 1024);
|
|
} catch {
|
|
return -1; // gc() not available in this build
|
|
}
|
|
})(),
|
|
layoutCount: getMetric(metricsAfter, 'LayoutCount') - getMetric(metricsBefore, 'LayoutCount'),
|
|
recalcStyleCount: getMetric(metricsAfter, 'RecalcStyleCount') - getMetric(metricsBefore, 'RecalcStyleCount'),
|
|
forcedReflowCount: getMetric(metricsAfter, 'ForcedStyleRecalcs') - getMetric(metricsBefore, 'ForcedStyleRecalcs'),
|
|
frameCount: getMetric(metricsAfter, 'FrameCount') - getMetric(metricsBefore, 'FrameCount'),
|
|
compositeLayers: getMetric(metricsAfter, 'CompositeLayers') - getMetric(metricsBefore, 'CompositeLayers'),
|
|
paintCount: getMetric(metricsAfter, 'PaintCount') - getMetric(metricsBefore, 'PaintCount'),
|
|
responseHasContent: responseInfo.hasContent,
|
|
profilePath,
|
|
tracePath,
|
|
snapshotPath,
|
|
extHostHeapUsedBefore,
|
|
extHostHeapUsedAfter,
|
|
extHostHeapDelta,
|
|
extHostHeapDeltaPostGC,
|
|
extHostProfilePath,
|
|
extHostSnapshotPath,
|
|
};
|
|
} finally {
|
|
if (extHostInspector) {
|
|
try { extHostInspector.close(); } catch { }
|
|
}
|
|
activeVSCode = null;
|
|
await vscode.close();
|
|
}
|
|
|
|
// Read the trace file written by VS Code on exit via --trace-startup-file
|
|
/** @type {Array<any>} */
|
|
let traceEvents = [];
|
|
try {
|
|
const traceData = JSON.parse(fs.readFileSync(tracePath, 'utf-8'));
|
|
traceEvents = traceData.traceEvents || [];
|
|
} catch {
|
|
// Trace file may not exist if VS Code crashed before shutdown
|
|
}
|
|
|
|
// Extract code/chat/* perf marks from blink.user_timing trace events.
|
|
// These appear as instant ('R' or 'I') events with timestamps in microseconds.
|
|
const chatMarks = traceEvents
|
|
.filter(e => e.cat === 'blink.user_timing' && e.name && e.name.startsWith('code/chat/'))
|
|
.map(e => ({ name: e.name, startTime: e.ts / 1000 }));
|
|
|
|
if (verbose && chatMarks.length > 0) {
|
|
console.log(` [trace] chatMarks (${chatMarks.length}): ${chatMarks.map((/** @type {any} */ m) => m.name.split('/').slice(-1)[0]).join(', ')}`);
|
|
}
|
|
|
|
// Parse timing — prefer internal code/chat/* marks (precise, in-process)
|
|
// with client-side Date.now() as fallback for older builds without marks.
|
|
const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated');
|
|
const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken');
|
|
const timeToFirstToken = internalFirstToken >= 0 ? internalFirstToken : (firstResponseTime - submitTime);
|
|
const timeToComplete = responseCompleteTime - submitTime;
|
|
const timeToRenderComplete = renderCompleteTime - submitTime;
|
|
const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions');
|
|
const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke');
|
|
|
|
// Parse GC events from trace.
|
|
// Use the trace-event category and phase fields which are stable
|
|
// across V8 versions, rather than matching event name substrings.
|
|
let majorGCs = 0, minorGCs = 0, gcDurationMs = 0;
|
|
for (const event of traceEvents) {
|
|
const isGC = event.cat === 'v8.gc'
|
|
|| event.cat === 'devtools.timeline,v8'
|
|
|| (typeof event.cat === 'string' && event.cat.split(',').some((/** @type {string} */ c) => {
|
|
const t = c.trim();
|
|
return t === 'v8.gc' || t === 'disabled-by-default-v8.gc' || t === 'disabled-by-default-v8.gc_stats';
|
|
}));
|
|
if (!isGC) { continue; }
|
|
// Only count complete ('X') or duration-begin ('B') events to
|
|
// avoid double-counting begin/end pairs.
|
|
if (event.ph && event.ph !== 'X' && event.ph !== 'B') { continue; }
|
|
const name = event.name || '';
|
|
if (/Major|MarkCompact|MSC|MC|IncrementalMarking|FinalizeMC/i.test(name)) { majorGCs++; }
|
|
else if (/Minor|Scaveng/i.test(name)) { minorGCs++; }
|
|
else { minorGCs++; } // default unknown GC events to minor
|
|
if (event.dur) { gcDurationMs += event.dur / 1000; }
|
|
}
|
|
// Parse Layout duration from devtools.timeline trace events.
|
|
let layoutDurationMs = 0;
|
|
for (const event of traceEvents) {
|
|
if (event.name === 'Layout' && event.ph === 'X' && event.dur) {
|
|
layoutDurationMs += event.dur / 1000;
|
|
}
|
|
}
|
|
|
|
let longTaskCount = 0;
|
|
for (const event of traceEvents) {
|
|
if (event.name === 'RunTask' && event.dur && event.dur > 50_000) { longTaskCount++; }
|
|
}
|
|
|
|
// Parse Long Animation Frame (LoAF) events from devtools.timeline trace.
|
|
// AnimationFrame events use async flow pairs (ph:'s' start, ph:'f' finish)
|
|
// with matching ids. Compute duration from each s→f pair.
|
|
let longAnimationFrameCount = 0;
|
|
let longAnimationFrameTotalMs = 0;
|
|
{
|
|
/** @type {Map<number, number>} */
|
|
const frameStarts = new Map();
|
|
for (const event of traceEvents) {
|
|
if (event.cat === 'devtools.timeline' && event.name === 'AnimationFrame') {
|
|
if (event.ph === 's') {
|
|
frameStarts.set(event.id, event.ts);
|
|
} else if (event.ph === 'f' && frameStarts.has(event.id)) {
|
|
const durationMs = (event.ts - /** @type {number} */(frameStarts.get(event.id))) / 1000;
|
|
frameStarts.delete(event.id);
|
|
if (durationMs > 50) {
|
|
longAnimationFrameCount++;
|
|
longAnimationFrameTotalMs += durationMs;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return {
|
|
...partialMetrics,
|
|
timeToUIUpdated, timeToFirstToken, timeToComplete, timeToRenderComplete, instructionCollectionTime, agentInvokeTime,
|
|
hasInternalMarks: chatMarks.length > 0,
|
|
internalFirstToken,
|
|
majorGCs, minorGCs,
|
|
gcDurationMs: Math.round(gcDurationMs * 100) / 100,
|
|
layoutDurationMs: Math.round(layoutDurationMs * 100) / 100,
|
|
longTaskCount,
|
|
longAnimationFrameCount,
|
|
longAnimationFrameTotalMs: Math.round(longAnimationFrameTotalMs * 100) / 100,
|
|
};
|
|
}
|
|
|
|
// -- CI summary generation ---------------------------------------------------
|
|
|
|
// Base URL used for every commit, release-tag and compare link in the CI summary.
const GITHUB_REPO = 'https://github.com/microsoft/vscode';

/**
 * Render a build identifier for the Markdown summary, linking it to GitHub
 * when its shape allows:
 * - 7 to 40 lowercase hex characters: treated as a commit SHA. The link text
 *   is the first 7 characters, the target is the commit page for the full value.
 * - Anything starting with `<digits>.<digits>.<digits>`: treated as a version
 *   and linked to the release tag page of the same name.
 * - Everything else (e.g. "baseline", "dev (local)"): returned as inline code.
 *
 * @param {string} label
 * @returns {string}
 */
function formatBuildLink(label) {
	const looksLikeSha = /^[0-9a-f]{7,40}$/.test(label);
	if (looksLikeSha) {
		return `[\`${label.slice(0, 7)}\`](${GITHUB_REPO}/commit/${label})`;
	}

	const looksLikeVersion = /^\d+\.\d+\.\d+/.test(label);
	return looksLikeVersion
		? `[\`${label}\`](${GITHUB_REPO}/releases/tag/${label})`
		: `\`${label}\``;
}
|
|
|
|
/**
 * Produce a Markdown "compare" link to GitHub for two build identifiers.
 *
 * Both identifiers must look like a commit SHA (7 to 40 lowercase hex
 * characters) or start with a `<digits>.<digits>.<digits>` version.
 * If either one does not, an empty string is returned.
 *
 * @param {string} base
 * @param {string} test
 * @returns {string}
 */
function formatCompareLink(base, test) {
	const refPatterns = [/^[0-9a-f]{7,40}$/, /^\d+\.\d+\.\d+/];
	const isRef = (/** @type {string} */ candidate) => refPatterns.some(pattern => pattern.test(candidate));

	return isRef(base) && isRef(test)
		? `[compare](${GITHUB_REPO}/compare/${base}...${test})`
		: '';
}
|
|
|
|
/**
 * Build the detailed Markdown summary for CI and return it as one string.
 * This function only assembles the text; it does no printing or file I/O.
 *
 * Sections, in order: a headline with the overall verdict, a run-info table,
 * an optional build-mode-mismatch warning, a per-scenario overview table,
 * a "Regressions & Improvements" section (only when there is something to
 * show), and two collapsible sections with full metric tables and raw runs.
 *
 * Verdicts per scenario/metric (only computed when the scenario has baseline data):
 * - `REGRESSION`: a regression metric exceeds its threshold and the t-test is
 *   either unavailable or significant.
 * - `noise`: exceeds the threshold but the t-test is not significant.
 * - `improved`: exceeds the threshold in the opposite direction and the
 *   t-test is significant.
 * - `ok`: any other outcome for a regression metric.
 * - `info`: the metric is not in the regression set and is reported only.
 *
 * @param {Record<string, any>} jsonReport Test results. `scenarios` is read per scenario
 *   (stat groups plus `rawRuns`), along with `buildMode` and `mismatchedBuildMode`.
 * @param {Record<string, any> | null} baseline Baseline results with the same per-scenario shape, or null.
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, runs: number, baselineBuild?: string, build?: string }} opts
 * @returns {string} Markdown text, lines joined with '\n'.
 */
function generateCISummary(jsonReport, baseline, opts) {
	const baseLabel = opts.baselineBuild || 'baseline';
	const testBuildMode = jsonReport.buildMode || 'dev';
	const testLabel = testBuildMode === 'dev' ? 'dev (local)'
		: testBuildMode === 'production' ? 'production (local)'
			: opts.build || testBuildMode;
	const baseLink = formatBuildLink(baseLabel);
	const testLink = formatBuildLink(testLabel);
	const compareLink = formatCompareLink(baseLabel, testLabel);
	// Each entry: [metric name, report group holding its stats, display unit].
	const allMetrics = [
		['timeToFirstToken', 'timing', 'ms'],
		['timeToComplete', 'timing', 'ms'],
		['layoutCount', 'rendering', ''],
		['recalcStyleCount', 'rendering', ''],
		['forcedReflowCount', 'rendering', ''],
		['longTaskCount', 'rendering', ''],
		['longAnimationFrameCount', 'rendering', ''],
		['longAnimationFrameTotalMs', 'rendering', 'ms'],
		['frameCount', 'rendering', ''],
		['compositeLayers', 'rendering', ''],
		['paintCount', 'rendering', ''],
		['heapDelta', 'memory', 'MB'],
		['heapDeltaPostGC', 'memory', 'MB'],
		['gcDurationMs', 'memory', 'ms'],
		['extHostHeapDelta', 'extHost', 'MB'],
		['extHostHeapDeltaPostGC', 'extHost', 'MB'],
	];
	// Only these metrics can yield REGRESSION / improved; every other metric is 'info'.
	const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount']);

	/** @type {string[]} */
	const lines = [];
	const scenarios = Object.keys(jsonReport.scenarios);

	/** @param {unknown} v */
	const isMissing = v => v === null || v === undefined;

	// -- Collect verdicts per scenario/metric --------------------------------
	/** @type {Map<string, { metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]>} */
	const scenarioVerdicts = new Map();
	let totalRegressions = 0;
	let totalImprovements = 0;

	for (const scenario of scenarios) {
		const current = jsonReport.scenarios[scenario];
		const base = baseline?.scenarios?.[scenario];
		/** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]} */
		const verdicts = [];

		if (base) {
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				// Skip metrics with no usable median on either side. A missing current
				// median would otherwise be coerced to 0 and show up as a bogus -100%.
				if (!cur || !bas || isMissing(bas.median) || isMissing(cur.median)) { continue; }

				// Relative change; defined as 0 when the baseline median is 0 to avoid dividing by zero.
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				const isRegressionMetric = regressionMetricNames.has(metric);

				// Negative raw values are excluded before the t-test.
				const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0);
				const ttest = welchTTest(basRaw, curRaw);
				const pStr = ttest ? `${ttest.pValue}` : 'n/a';

				const metricThreshold = getMetricThreshold(opts, metric);
				const absoluteDelta = cur.median - bas.median;
				let verdict = '';
				if (isRegressionMetric) {
					if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
						// Without a t-test result we cannot call it noise, so it counts as a regression.
						if (!ttest || ttest.significant) {
							verdict = 'REGRESSION';
							totalRegressions++;
						} else {
							verdict = 'noise';
						}
					} else if (exceedsThreshold(metricThreshold, -change, -absoluteDelta) && ttest?.significant) {
						verdict = 'improved';
						totalImprovements++;
					} else {
						verdict = 'ok';
					}
				} else {
					verdict = 'info';
				}

				const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`;
				const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`;
				verdicts.push({ metric, verdict, change, pValue: pStr, basStr, curStr });
			}
		}
		scenarioVerdicts.set(scenario, verdicts);
	}

	// -- Header with verdict up front ----------------------------------------
	const hasRegressions = totalRegressions > 0;
	const verdictIcon = hasRegressions ? '\u274C' : '\u2705';
	const verdictText = hasRegressions
		? `${totalRegressions} regression(s) detected`
		: totalImprovements > 0
			? `No regressions \u2014 ${totalImprovements} improvement(s)`
			: 'No significant changes';

	lines.push(`# ${verdictIcon} Chat Performance: ${verdictText}`);
	lines.push('');
	lines.push(`| | |`);
	lines.push(`|---|---|`);
	lines.push(`| **Baseline** | ${baseLink} |`);
	lines.push(`| **Test** | ${testLink} |`);
	if (compareLink) {
		lines.push(`| **Diff** | ${compareLink} |`);
	}
	lines.push(`| **Runs per scenario** | ${opts.runs} |`);
	// Only list per-metric thresholds that differ from the global fractional threshold.
	const overrides = Object.entries(opts.metricThresholds || {}).filter(([, v]) => {
		const parsed = parseMetricThreshold(v);
		return parsed.type !== 'fraction' || parsed.value !== opts.threshold;
	});
	if (overrides.length > 0) {
		const overrideStr = overrides.map(([k, v]) => {
			const parsed = parseMetricThreshold(v);
			return `${k}: ${parsed.type === 'absolute' ? `${parsed.value}${k.includes('Ms') || k.includes('Time') || k.includes('time') ? 'ms' : ''}` : `${(parsed.value * 100).toFixed(0)}%`}`;
		}).join(', ');
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% (${overrideStr}) |`);
	} else {
		lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`);
	}
	lines.push(`| **Scenarios** | ${scenarios.length} |`);
	lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`);
	if (jsonReport.buildMode) {
		lines.push(`| **Build mode** | ${jsonReport.buildMode} |`);
	}
	lines.push('');
	if (jsonReport.mismatchedBuildMode) {
		lines.push('> **⚠ Build mode mismatch:** The test and baseline builds use different build modes.');
		lines.push('> Results may not be directly comparable. For apples-to-apples comparisons,');
		lines.push('> use the same build type for both (e.g. `--production-build` with a local');
		lines.push('> baseline path, or two version strings).');
		lines.push('');
	}

	// -- At-a-glance overview table: one row per scenario --------------------
	lines.push(`## Overview`);
	lines.push('');
	lines.push('| Scenario | Description | TTFT | Complete | Layouts | Styles | LoAF | Verdict |');
	lines.push('|----------|-------------|-----:|---------:|--------:|-------:|-----:|:-------:|');

	/** @param {{ change: number, verdict: string } | undefined} v */
	const fmtCell = v => {
		if (!v) { return '\u2014'; }
		return `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(0)}%`;
	};

	/** @param {{ verdict: string }[]} vs */
	const fmtVerdict = vs => {
		if (vs.some(v => v.verdict === 'REGRESSION')) { return '\u274C Regressed'; }
		if (vs.some(v => v.verdict === 'improved')) { return '\u2B06\uFE0F Improved'; }
		return '\u2705 OK';
	};

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const get = (/** @type {string} */ m) => verdicts.find(v => v.metric === m);

		const ttft = get('timeToFirstToken');
		const complete = get('timeToComplete');
		const layouts = get('layoutCount');
		const styles = get('recalcStyleCount');
		const loaf = get('longAnimationFrameCount');

		// The row verdict covers every metric of the scenario, not just the columns
		// shown here. Otherwise a forcedReflowCount / longTaskCount regression
		// would read "OK" while the headline reports a regression.
		const rowVerdict = fmtVerdict(verdicts);

		lines.push(`| ${scenario} | ${getScenarioDescription(scenario)} | ${fmtCell(ttft)} | ${fmtCell(complete)} | ${fmtCell(layouts)} | ${fmtCell(styles)} | ${fmtCell(loaf)} | ${rowVerdict} |`);
	}
	lines.push('');

	// -- Regressions & improvements detail section ---------------------------
	const hasNotable = [...scenarioVerdicts.values()].some(vs => vs.some(v => v.verdict === 'REGRESSION' || v.verdict === 'improved'));
	if (hasNotable) {
		lines.push('## Regressions & Improvements');
		lines.push('');
		lines.push('Only metrics that regressed or improved significantly are shown below.');
		lines.push('');

		for (const scenario of scenarios) {
			const verdicts = scenarioVerdicts.get(scenario) || [];
			const notable = verdicts.filter(v => v.verdict === 'REGRESSION' || v.verdict === 'improved');
			if (notable.length === 0) { continue; }

			const icon = notable.some(v => v.verdict === 'REGRESSION') ? '\u274C' : '\u2B06\uFE0F';
			lines.push(`### ${icon} ${scenario}`);
			lines.push('');
			lines.push('| Metric | Baseline | Test | Change | p-value | Verdict |');
			lines.push('|--------|----------|------|--------|---------|---------|');
			for (const v of notable) {
				const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
				const rowIcon = v.verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F';
				lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${rowIcon} ${v.verdict} |`);
			}
			lines.push('');
		}
	}

	// -- Full metric tables in collapsible section ---------------------------
	lines.push('<details><summary>Full metric details per scenario</summary>');
	lines.push('');

	for (const scenario of scenarios) {
		const verdicts = scenarioVerdicts.get(scenario) || [];
		const base = baseline?.scenarios?.[scenario];

		lines.push(`### ${scenario}`);
		lines.push('');

		if (!base) {
			// No baseline: list the test build's own stats without any comparison columns.
			const current = jsonReport.scenarios[scenario];
			lines.push('> No baseline data for this scenario.');
			lines.push('');
			lines.push('| Metric | Value | StdDev | CV | n |');
			lines.push('|--------|------:|-------:|---:|--:|');
			for (const [metric, group, unit] of allMetrics) {
				const cur = current[group]?.[metric];
				if (!cur) { continue; }
				lines.push(`| ${metric} | ${cur.median}${unit} | \xb1${cur.stddev}${unit} | ${(cur.cv * 100).toFixed(0)}% | ${cur.n} |`);
			}
			lines.push('');
			continue;
		}

		lines.push(`| Metric | Baseline | Test | Change | p-value | Verdict |`);
		lines.push(`|--------|----------|------|--------|---------|---------|`);

		for (const v of verdicts) {
			const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`;
			let verdictDisplay = v.verdict;
			if (v.verdict === 'REGRESSION') { verdictDisplay = '\u274C REGRESSION'; }
			else if (v.verdict === 'improved') { verdictDisplay = '\u2B06\uFE0F improved'; }
			else if (v.verdict === 'ok') { verdictDisplay = '\u2705 ok'; }
			else if (v.verdict === 'noise') { verdictDisplay = '\uD83C\uDF2B\uFE0F noise'; }
			else if (v.verdict === 'info') { verdictDisplay = '\u2139\uFE0F'; }
			lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictDisplay} |`);
		}
		lines.push('');
	}
	lines.push('</details>');
	lines.push('');

	// -- Raw run data in collapsible section ---------------------------------
	const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100;

	/**
	 * Append one raw-run table (heading, column header, one row per run).
	 * @param {string} heading
	 * @param {any[]} runs
	 */
	const pushRawRunsTable = (heading, runs) => {
		lines.push(`### ${heading}`);
		lines.push('');
		lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |');
		lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|');
		for (let i = 0; i < runs.length; i++) {
			const r = runs[i];
			lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`);
		}
		lines.push('');
	};

	lines.push('<details><summary>Raw run data</summary>');
	lines.push('');
	for (const scenario of scenarios) {
		pushRawRunsTable(scenario, jsonReport.scenarios[scenario].rawRuns || []);
	}
	if (baseline) {
		for (const scenario of scenarios) {
			const base = baseline.scenarios?.[scenario];
			if (!base) { continue; }
			pushRawRunsTable(`${scenario} (baseline)`, base.rawRuns || []);
		}
	}
	lines.push('</details>');
	lines.push('');

	return lines.join('\n');
}
|
|
|
|
// -- Cleanup on SIGINT/SIGTERM -----------------------------------------------
|
|
|
|
/**
 * VS Code instance to close if the process is interrupted, or null when there
 * is none. Read by the signal handler installed below.
 * @type {{ close: () => Promise<void> } | null}
 */
let activeVSCode = null;
/**
 * Mock LLM server to close if the process is interrupted, or null when there
 * is none. Read by the signal handler installed below.
 * @type {{ close: () => Promise<void> } | null}
 */
let activeMockServer = null;

/**
 * Register SIGINT and SIGTERM handlers that close the active VS Code instance
 * and mock server (best effort, errors ignored) and then exit the process.
 *
 * The exit status follows the 128 + signal-number convention: 130 for SIGINT
 * and 143 for SIGTERM, so a caller can tell which signal ended the run.
 */
function installSignalHandlers() {
	/** @param {string} signal Signal name; Node passes it as the first listener argument. */
	const cleanup = async (signal) => {
		const exitCode = signal === 'SIGTERM' ? 143 : 130;
		console.log('\n[chat-simulation] Caught interrupt, cleaning up...');
		// Closing is best effort: a failure here must not prevent the process from exiting.
		try { await activeVSCode?.close(); } catch { }
		try { await activeMockServer?.close(); } catch { }
		process.exit(exitCode);
	};
	process.on('SIGINT', cleanup);
	process.on('SIGTERM', cleanup);
}
|
|
|
|
// -- Diagnostic cleanup ------------------------------------------------------
|
|
|
|
/**
 * Delete the diagnostic artifacts referenced by one run's metrics: the CPU
 * profile, trace and heap snapshot paths for the renderer and the extension
 * host. Empty or missing paths and files that no longer exist are skipped.
 * Deletion errors are ignored. The metrics object itself is not modified.
 * @param {RunMetrics} metrics
 */
function cleanupRunDiagnostics(metrics) {
	const artifactPaths = [
		metrics.profilePath,
		metrics.tracePath,
		metrics.snapshotPath,
		metrics.extHostProfilePath,
		metrics.extHostSnapshotPath,
	];
	artifactPaths.forEach(artifact => {
		if (!artifact || !fs.existsSync(artifact)) {
			return;
		}
		try {
			fs.rmSync(artifact, { force: true });
		} catch {
			// Best effort: a file that cannot be removed is left in place.
		}
	});
}
|
|
|
|
/**
 * Run diagnostic cleanup for every run of every scenario that is not in the
 * regressed set. Runs of regressed scenarios are left untouched, so their
 * diagnostic files stay available for investigation.
 * @param {Record<string, RunMetrics[]>} allResults - test results by scenario
 * @param {Set<string>} regressedScenarios - scenarios that regressed
 */
function cleanupNonRegressedDiagnostics(allResults, regressedScenarios) {
	const healthyEntries = Object.entries(allResults)
		.filter(([scenarioName]) => !regressedScenarios.has(scenarioName));

	healthyEntries.forEach(([, scenarioRuns]) => {
		for (const runMetrics of scenarioRuns) {
			cleanupRunDiagnostics(runMetrics);
		}
	});
}
|
|
|
|
// -- Main --------------------------------------------------------------------
|
|
|
|
async function main() {
|
|
registerPerfScenarios();
|
|
const opts = parseArgs();
|
|
|
|
installSignalHandlers();
|
|
|
|
const { startServer } = require('./common/mock-llm-server');
|
|
const mockServer = await startServer(0);
|
|
activeMockServer = mockServer;
|
|
console.log(`[chat-simulation] Mock LLM server: ${mockServer.url}`);
|
|
|
|
// -- Resume mode --------------------------------------------------------
|
|
if (opts.resume) {
|
|
if (!fs.existsSync(opts.resume)) {
|
|
console.error(`[chat-simulation] Resume file not found: ${opts.resume}`);
|
|
process.exit(1);
|
|
}
|
|
const prevResults = JSON.parse(fs.readFileSync(opts.resume, 'utf-8'));
|
|
const prevDir = path.dirname(opts.resume);
|
|
|
|
// Find the associated baseline JSON in the same directory
|
|
const baselineFiles = fs.readdirSync(prevDir).filter((/** @type {string} */ f) => f.startsWith('baseline-') && f.endsWith('.json'));
|
|
const baselineFile = baselineFiles.length > 0 ? path.join(prevDir, baselineFiles[0]) : null;
|
|
const prevBaseline = baselineFile ? JSON.parse(fs.readFileSync(baselineFile, 'utf-8')) : null;
|
|
|
|
// Determine which scenarios to resume (default: all from previous run)
|
|
const resumeScenarios = opts.scenarios.length > 0
|
|
? opts.scenarios.filter(s => prevResults.scenarios?.[s])
|
|
: Object.keys(prevResults.scenarios || {});
|
|
|
|
if (resumeScenarios.length === 0) {
|
|
console.error('[chat-simulation] No matching scenarios found in previous results');
|
|
process.exit(1);
|
|
}
|
|
|
|
const testElectron = await resolveBuild(opts.build);
|
|
const baselineVersion = prevBaseline?.baselineBuildVersion;
|
|
const baselineElectron = baselineVersion ? await resolveBuild(baselineVersion) : null;
|
|
|
|
const runsToAdd = opts.runs;
|
|
console.log(`[chat-simulation] Resuming from: ${opts.resume}`);
|
|
console.log(`[chat-simulation] Adding ${runsToAdd} runs per scenario`);
|
|
console.log(`[chat-simulation] Scenarios: ${resumeScenarios.join(', ')}`);
|
|
if (prevBaseline) {
|
|
console.log(`[chat-simulation] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`);
|
|
}
|
|
console.log('');
|
|
|
|
for (const scenario of resumeScenarios) {
|
|
console.log(`[chat-simulation] === Resuming: ${scenario} ===`);
|
|
const prevTestRuns = prevResults.scenarios[scenario]?.rawRuns || [];
|
|
const prevBaseRuns = prevBaseline?.scenarios?.[scenario]?.rawRuns || [];
|
|
|
|
// Run additional test iterations
|
|
console.log(`[chat-simulation] Test build (${prevTestRuns.length} existing + ${runsToAdd} new)`);
|
|
for (let i = 0; i < runsToAdd; i++) {
|
|
const runIdx = `${scenario}-resume-${prevTestRuns.length + i}`;
|
|
console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);
|
|
try {
|
|
const m = await runOnce(testElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'test', { ...opts.settingsOverrides, ...opts.testSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });
|
|
// Clean up previous run's diagnostics to bound disk usage; keep the latest
|
|
if (opts.cleanupDiagnostics && prevTestRuns.length > 0) { cleanupRunDiagnostics(prevTestRuns[prevTestRuns.length - 1]); }
|
|
prevTestRuns.push(m);
|
|
if (opts.verbose) {
|
|
const src = m.hasInternalMarks ? 'internal' : 'client-side';
|
|
console.log(` [${src}] firstToken=${m.timeToFirstToken}ms, complete=${m.timeToComplete}ms`);
|
|
}
|
|
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
|
|
}
|
|
|
|
// Run additional baseline iterations
|
|
if (baselineElectron && prevBaseline?.scenarios?.[scenario]) {
|
|
console.log(`[chat-simulation] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`);
|
|
for (let i = 0; i < runsToAdd; i++) {
|
|
const runIdx = `baseline-${scenario}-resume-${prevBaseRuns.length + i}`;
|
|
console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`);
|
|
try {
|
|
const m = await runOnce(baselineElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'baseline', { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides }, { heapSnapshots: opts.heapSnapshots });
|
|
// Clean up previous run's diagnostics to bound disk usage; keep the latest
|
|
if (opts.cleanupDiagnostics && prevBaseRuns.length > 0) { cleanupRunDiagnostics(prevBaseRuns[prevBaseRuns.length - 1]); }
|
|
prevBaseRuns.push(m);
|
|
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
|
|
}
|
|
}
|
|
|
|
// Recompute stats with merged data
|
|
const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevTestRuns });
|
|
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(prevTestRuns.map((/** @type {any} */ r) => r[metric])); }
|
|
prevResults.scenarios[scenario] = sd;
|
|
|
|
if (prevBaseline?.scenarios?.[scenario]) {
|
|
const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevBaseRuns });
|
|
for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); }
|
|
prevBaseline.scenarios[scenario] = bsd;
|
|
}
|
|
console.log(`[chat-simulation] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? `, baseline n=${prevBaseRuns.length}` : ''}`);
|
|
console.log('');
|
|
}
|
|
|
|
// Write updated files back
|
|
prevResults.runsPerScenario = Math.max(prevResults.runsPerScenario || 0, ...Object.values(prevResults.scenarios).map((/** @type {any} */ s) => s.runs));
|
|
prevResults.lastResumed = new Date().toISOString();
|
|
fs.writeFileSync(opts.resume, JSON.stringify(prevResults, null, 2));
|
|
console.log(`[chat-simulation] Updated results: ${opts.resume}`);
|
|
|
|
if (prevBaseline && baselineFile) {
|
|
prevBaseline.lastResumed = new Date().toISOString();
|
|
fs.writeFileSync(baselineFile, JSON.stringify(prevBaseline, null, 2));
|
|
// Also update cached baseline
|
|
const cachedPath = path.join(DATA_DIR, path.basename(baselineFile));
|
|
fs.writeFileSync(cachedPath, JSON.stringify(prevBaseline, null, 2));
|
|
console.log(`[chat-simulation] Updated baseline: ${baselineFile}`);
|
|
}
|
|
|
|
// -- Re-run comparison with merged data --------------------------------
|
|
opts.baseline = baselineFile || undefined;
|
|
const jsonReport = prevResults;
|
|
jsonReport._resultsPath = opts.resume;
|
|
|
|
// Fall through to comparison logic below
|
|
await printComparison(jsonReport, opts);
|
|
await mockServer.close();
|
|
return;
|
|
}
|
|
|
|
// -- Normal (non-resume) flow -------------------------------------------
|
|
// --production-build: build a local bundled (non-dev) package from the
|
|
// current source tree using `gulp vscode`. This produces the same
|
|
// packaging as a release build (bundled JS, no VSCODE_DEV) while still
|
|
// testing your local changes.
|
|
if (opts.productionBuild && !opts.build) {
|
|
const prodBuildPath = buildProductionBuild();
|
|
opts.build = prodBuildPath;
|
|
console.log(`[chat-simulation] --production-build: using local production build at ${prodBuildPath}`);
|
|
}
|
|
|
|
const electronPath = await resolveBuild(opts.build);
|
|
|
|
if (!fs.existsSync(electronPath)) {
|
|
console.error(`Electron not found at: ${electronPath}`);
|
|
console.error('Run "node build/lib/preLaunch.ts" first, or pass --build <path>');
|
|
process.exit(1);
|
|
}
|
|
|
|
// Detect build modes for both test and baseline builds
|
|
const testBuildMode = detectBuildMode(electronPath);
|
|
|
|
// Resolve the baseline build path early so we can detect its mode.
|
|
// For version strings this downloads; for local paths it resolves directly.
|
|
const isBaselineVersionString = opts.baselineBuild && isVersionString(opts.baselineBuild);
|
|
const isBaselineLocalPath = opts.baselineBuild && !isBaselineVersionString;
|
|
/** @type {string | undefined} */
|
|
let baselineElectronPath;
|
|
if (isBaselineLocalPath) {
|
|
baselineElectronPath = await resolveBuild(opts.baselineBuild);
|
|
if (!fs.existsSync(baselineElectronPath)) {
|
|
console.error(`Baseline build not found at: ${baselineElectronPath}`);
|
|
process.exit(1);
|
|
}
|
|
}
|
|
const baselineBuildMode = opts.baselineBuild
|
|
? (isBaselineVersionString ? 'release' : detectBuildMode(baselineElectronPath || ''))
|
|
: undefined;
|
|
|
|
const isMismatchedBuildMode = baselineBuildMode !== undefined && testBuildMode !== baselineBuildMode;
|
|
|
|
// Create a timestamped run directory for all output
|
|
const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19);
|
|
const runDir = path.join(DATA_DIR, runTimestamp);
|
|
fs.mkdirSync(runDir, { recursive: true });
|
|
console.log(`[chat-simulation] Output: ${runDir}`);
|
|
|
|
// Compute effective settings per role
|
|
const testSettings = { ...opts.settingsOverrides, ...opts.testSettingsOverrides };
|
|
const baselineSettings = { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides };
|
|
|
|
// -- Baseline build --------------------------------------------------
|
|
if (opts.baselineBuild) {
|
|
// Use a sanitized label for file names — replace path separators for local paths
|
|
const baselineLabel = isBaselineLocalPath
|
|
? path.basename(path.resolve(opts.baselineBuild))
|
|
: opts.baselineBuild;
|
|
const baselineJsonPath = path.join(runDir, `baseline-${baselineLabel}.json`);
|
|
|
|
// Local paths: always run fresh (no caching — the build may have changed)
|
|
// Version strings: use caching as before
|
|
const cachedPath = isBaselineLocalPath ? null : path.join(DATA_DIR, `baseline-${baselineLabel}.json`);
|
|
const cachedBaseline = cachedPath && !opts.noCache && fs.existsSync(cachedPath)
|
|
? JSON.parse(fs.readFileSync(cachedPath, 'utf-8'))
|
|
: null;
|
|
|
|
if (cachedBaseline?.baselineBuildVersion === opts.baselineBuild) {
|
|
// Check if the cache covers all requested scenarios
|
|
const cachedScenarios = new Set(Object.keys(cachedBaseline.scenarios || {}));
|
|
const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s));
|
|
|
|
// Also check if cached scenarios have fewer runs than requested
|
|
const shortScenarios = opts.scenarios.filter((/** @type {string} */ s) => {
|
|
const cached = cachedBaseline.scenarios?.[s];
|
|
return cached && (cached.rawRuns?.length || 0) < opts.runs;
|
|
});
|
|
|
|
if (missingScenarios.length === 0 && shortScenarios.length === 0) {
|
|
console.log(`[chat-simulation] Using cached baseline for ${opts.baselineBuild}`);
|
|
fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));
|
|
opts.baseline = baselineJsonPath;
|
|
} else {
|
|
const scenariosToRun = [...new Set([...missingScenarios, ...shortScenarios])];
|
|
if (missingScenarios.length > 0) {
|
|
console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`);
|
|
}
|
|
if (shortScenarios.length > 0) {
|
|
console.log(`[chat-simulation] Cached baseline needs more runs for: ${shortScenarios.map((/** @type {string} */ s) => `${s} (${cachedBaseline.scenarios[s].rawRuns?.length || 0}/${opts.runs})`).join(', ')}`);
|
|
}
|
|
console.log(`[chat-simulation] Running baseline for ${scenariosToRun.length} scenario(s)...`);
|
|
const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);
|
|
for (const scenario of scenariosToRun) {
|
|
const existingRuns = cachedBaseline.scenarios?.[scenario]?.rawRuns || [];
|
|
const runsNeeded = opts.runs - existingRuns.length;
|
|
/** @type {RunMetrics[]} */
|
|
const newResults = [];
|
|
for (let i = 0; i < runsNeeded; i++) {
|
|
try {
|
|
const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${existingRuns.length + i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });
|
|
// Clean up previous run's diagnostics to bound disk usage; keep the latest
|
|
if (opts.cleanupDiagnostics && newResults.length > 0) { cleanupRunDiagnostics(newResults[newResults.length - 1]); }
|
|
newResults.push(m);
|
|
}
|
|
catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }
|
|
}
|
|
const allRuns = [...existingRuns, ...newResults];
|
|
if (allRuns.length > 0) {
|
|
const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: allRuns });
|
|
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(allRuns.map((/** @type {any} */ r) => r[metric])); }
|
|
cachedBaseline.scenarios[scenario] = sd;
|
|
}
|
|
}
|
|
cachedBaseline.runsPerScenario = opts.runs;
|
|
fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2));
|
|
if (cachedPath) {
|
|
fs.writeFileSync(cachedPath, JSON.stringify(cachedBaseline, null, 2));
|
|
}
|
|
opts.baseline = baselineJsonPath;
|
|
}
|
|
} else {
|
|
const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild);
|
|
console.log(`[chat-simulation] Benchmarking baseline build (${baselineLabel})...`);
|
|
/** @type {Record<string, RunMetrics[]>} */
|
|
const baselineResults = {};
|
|
for (const scenario of opts.scenarios) {
|
|
/** @type {RunMetrics[]} */
|
|
const results = [];
|
|
for (let i = 0; i < opts.runs; i++) {
|
|
try {
|
|
const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots });
|
|
// Clean up previous run's diagnostics to bound disk usage; keep the latest
|
|
if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }
|
|
results.push(m);
|
|
}
|
|
catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); }
|
|
}
|
|
if (results.length > 0) { baselineResults[scenario] = results; }
|
|
}
|
|
const baselineReport = {
|
|
timestamp: new Date().toISOString(),
|
|
baselineBuildVersion: opts.baselineBuild,
|
|
platform: process.platform,
|
|
runsPerScenario: opts.runs,
|
|
scenarios: /** @type {Record<string, any>} */ ({}),
|
|
};
|
|
for (const [scenario, results] of Object.entries(baselineResults)) {
|
|
const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });
|
|
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
|
|
baselineReport.scenarios[scenario] = sd;
|
|
}
|
|
fs.writeFileSync(baselineJsonPath, JSON.stringify(baselineReport, null, 2));
|
|
// Cache at the top level for reuse across runs (version strings only)
|
|
if (cachedPath) {
|
|
fs.writeFileSync(cachedPath, JSON.stringify(baselineReport, null, 2));
|
|
}
|
|
opts.baseline = baselineJsonPath;
|
|
}
|
|
console.log('');
|
|
}
|
|
|
|
// -- Run benchmarks --------------------------------------------------
|
|
console.log(`[chat-simulation] Electron: ${electronPath}`);
|
|
console.log(`[chat-simulation] Build mode: ${buildModeLabel(testBuildMode)}`);
|
|
if (baselineBuildMode) {
|
|
console.log(`[chat-simulation] Baseline mode: ${buildModeLabel(baselineBuildMode)}`);
|
|
}
|
|
console.log(`[chat-simulation] Runs per scenario: ${opts.runs}`);
|
|
console.log(`[chat-simulation] Scenarios: ${opts.scenarios.join(', ')}`);
|
|
if (Object.keys(opts.settingsOverrides).length > 0) {
|
|
console.log(`[chat-simulation] Settings overrides (all): ${JSON.stringify(opts.settingsOverrides)}`);
|
|
}
|
|
if (Object.keys(opts.testSettingsOverrides).length > 0) {
|
|
console.log(`[chat-simulation] Settings overrides (test): ${JSON.stringify(opts.testSettingsOverrides)}`);
|
|
}
|
|
if (Object.keys(opts.baselineSettingsOverrides).length > 0) {
|
|
console.log(`[chat-simulation] Settings overrides (baseline): ${JSON.stringify(opts.baselineSettingsOverrides)}`);
|
|
}
|
|
|
|
if (isMismatchedBuildMode) {
|
|
console.log('');
|
|
console.log(`[chat-simulation] ⚠ WARNING: Build mode mismatch — test is ${testBuildMode}, baseline is ${baselineBuildMode}.`);
|
|
console.log('[chat-simulation] Results may not be directly comparable. For apples-to-apples');
|
|
console.log('[chat-simulation] comparisons, use the same build type for both.');
|
|
if (testBuildMode === 'dev') {
|
|
console.log('[chat-simulation] To use a local production build instead:');
|
|
console.log('[chat-simulation] npm run perf:chat -- --production-build');
|
|
}
|
|
if (!opts.ci && !opts.force) {
|
|
const readline = require('readline');
|
|
const rl = readline.createInterface({ input: process.stdin, output: process.stdout });
|
|
const answer = await new Promise(resolve => rl.question('[chat-simulation] Continue anyway? [y/N] ', resolve));
|
|
rl.close();
|
|
if (String(answer).toLowerCase() !== 'y') {
|
|
console.log('[chat-simulation] Aborted.');
|
|
await mockServer.close();
|
|
process.exit(0);
|
|
}
|
|
}
|
|
}
|
|
console.log('');
|
|
|
|
/** @type {Record<string, RunMetrics[]>} */
|
|
const allResults = {};
|
|
let anyFailed = false;
|
|
|
|
for (const scenario of opts.scenarios) {
|
|
console.log(`[chat-simulation] === Scenario: ${scenario} ===`);
|
|
/** @type {RunMetrics[]} */
|
|
const results = [];
|
|
for (let i = 0; i < opts.runs; i++) {
|
|
console.log(`[chat-simulation] Run ${i + 1}/${opts.runs}...`);
|
|
try {
|
|
const metrics = await runOnce(electronPath, scenario, mockServer, opts.verbose, `${scenario}-${i}`, runDir, 'test', testSettings, { heapSnapshots: opts.heapSnapshots });
|
|
// Clean up previous run's diagnostics to bound disk usage; keep the latest
|
|
if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); }
|
|
results.push(metrics);
|
|
if (opts.verbose) {
|
|
const src = metrics.hasInternalMarks ? 'internal' : 'client-side';
|
|
console.log(` [${src}] firstToken=${metrics.timeToFirstToken}ms, complete=${metrics.timeToComplete}ms, heap=delta${metrics.heapDelta}MB, longTasks=${metrics.longTaskCount}${metrics.hasInternalMarks ? `, internalTTFT=${metrics.internalFirstToken}ms` : ''}`);
|
|
}
|
|
} catch (err) { console.error(` Run ${i + 1} failed: ${err}`); }
|
|
}
|
|
if (results.length === 0) { console.error(`[chat-simulation] All runs failed for scenario: ${scenario}`); anyFailed = true; }
|
|
else { allResults[scenario] = results; }
|
|
console.log('');
|
|
}
|
|
|
|
// -- Summary ---------------------------------------------------------
|
|
console.log('[chat-simulation] ======================= Summary =======================');
|
|
for (const [scenario, results] of Object.entries(allResults)) {
|
|
console.log('');
|
|
console.log(` -- ${scenario} (${results.length} runs) --`);
|
|
console.log('');
|
|
console.log(' Timing:');
|
|
console.log(summarize(results.map(r => r.timeToFirstToken), ' Request → First token ', 'ms'));
|
|
console.log(summarize(results.map(r => r.timeToComplete), ' Request → Complete ', 'ms'));
|
|
console.log(summarize(results.map(r => r.timeToRenderComplete), ' Request → Rendered ', 'ms'));
|
|
console.log('');
|
|
console.log(' Rendering:');
|
|
console.log(summarize(results.map(r => r.layoutCount), ' Layouts ', ''));
|
|
console.log(summarize(results.map(r => r.layoutDurationMs), ' Layout duration ', 'ms'));
|
|
console.log(summarize(results.map(r => r.recalcStyleCount), ' Style recalcs ', ''));
|
|
console.log(summarize(results.map(r => r.forcedReflowCount), ' Forced reflows ', ''));
|
|
console.log(summarize(results.map(r => r.longTaskCount), ' Long tasks (>50ms) ', ''));
|
|
console.log(summarize(results.map(r => r.longAnimationFrameCount), ' Long anim. frames ', ''));
|
|
console.log(summarize(results.map(r => r.longAnimationFrameTotalMs), ' LoAF total duration ', 'ms'));
|
|
console.log(summarize(results.map(r => r.frameCount), ' Frames ', ''));
|
|
console.log(summarize(results.map(r => r.compositeLayers), ' Composite layers ', ''));
|
|
console.log(summarize(results.map(r => r.paintCount), ' Paints ', ''));
|
|
console.log('');
|
|
console.log(' Memory:');
|
|
console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB'));
|
|
console.log(summarize(results.map(r => r.heapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));
|
|
console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms'));
|
|
if (results.some(r => r.extHostHeapDelta >= 0)) {
|
|
console.log('');
|
|
console.log(' Extension Host:');
|
|
console.log(summarize(results.map(r => r.extHostHeapUsedBefore), ' Heap before ', 'MB'));
|
|
console.log(summarize(results.map(r => r.extHostHeapUsedAfter), ' Heap after ', 'MB'));
|
|
console.log(summarize(results.map(r => r.extHostHeapDelta), ' Heap delta ', 'MB'));
|
|
console.log(summarize(results.map(r => r.extHostHeapDeltaPostGC), ' Heap delta (post-GC) ', 'MB'));
|
|
}
|
|
}
|
|
|
|
// -- JSON output -----------------------------------------------------
|
|
const jsonPath = path.join(runDir, 'results.json');
|
|
const jsonReport = /** @type {{ timestamp: string, platform: NodeJS.Platform, runsPerScenario: number, buildMode: string, mismatchedBuildMode: boolean, scenarios: Record<string, any>, _resultsPath?: string }} */ ({
|
|
timestamp: new Date().toISOString(),
|
|
platform: process.platform,
|
|
runsPerScenario: opts.runs,
|
|
buildMode: testBuildMode,
|
|
mismatchedBuildMode: !!isMismatchedBuildMode,
|
|
scenarios: /** @type {Record<string, any>} */ ({}),
|
|
});
|
|
for (const [scenario, results] of Object.entries(allResults)) {
|
|
const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results });
|
|
for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); }
|
|
jsonReport.scenarios[scenario] = sd;
|
|
}
|
|
fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2));
|
|
jsonReport._resultsPath = jsonPath;
|
|
console.log('');
|
|
console.log(`[chat-simulation] Results written to ${jsonPath}`);
|
|
|
|
// -- Save baseline ---------------------------------------------------
|
|
if (opts.saveBaseline) {
|
|
if (!opts.baseline) { console.error('[chat-simulation] --save-baseline requires --baseline <path>'); process.exit(1); }
|
|
fs.writeFileSync(opts.baseline, JSON.stringify(jsonReport, null, 2));
|
|
console.log(`[chat-simulation] Baseline saved to ${opts.baseline}`);
|
|
}
|
|
|
|
// -- Baseline comparison ---------------------------------------------
|
|
const regressedScenarios = await printComparison(jsonReport, opts);
|
|
|
|
// Clean up diagnostics for scenarios that did not regress
|
|
if (opts.cleanupDiagnostics) {
|
|
cleanupNonRegressedDiagnostics(allResults, regressedScenarios);
|
|
}
|
|
|
|
if (anyFailed) { process.exit(1); }
|
|
await mockServer.close();
|
|
}
|
|
|
|
/**
 * Print the baseline comparison for a results report and flag regressions.
 *
 * When `opts.baseline` points at an existing JSON file, every scenario in
 * `jsonReport` is compared against the same scenario in that file. A metric
 * counts as a regression only when it exceeds its threshold AND `welchTTest`
 * on the raw run values reports the difference as significant. When `opts.ci`
 * is set, a markdown summary is also written to `ci-summary.md` under
 * `DATA_DIR` and echoed to stdout.
 *
 * On regression this sets `process.exitCode = 1` rather than calling
 * `process.exit(1)`: the process still terminates with a failing status, but
 * the returned set actually reaches the caller, which can then finish its own
 * cleanup before the process exits.
 *
 * @param {Record<string, any>} jsonReport
 * @param {{ threshold: number, metricThresholds?: Record<string, number | string>, baseline?: string, ci?: boolean, resume?: string, build?: string, baselineBuild?: string, runs: number, cleanupDiagnostics?: boolean }} opts
 * @returns {Promise<Set<string>>} IDs of the scenarios that regressed (empty when there is no baseline to compare against).
 */
async function printComparison(jsonReport, opts) {
	let regressionFound = false;
	let inconclusiveFound = false;
	/** @type {Set<string>} */
	const regressedScenarios = new Set();

	/**
	 * Raw per-run samples of one metric; negative values are dropped.
	 * @param {any} scenarioData
	 * @param {string} metric
	 * @returns {number[]}
	 */
	const rawValues = (scenarioData, metric) => (scenarioData.rawRuns || [])
		.map((/** @type {any} */ run) => run[metric])
		.filter((/** @type {any} */ value) => value >= 0);

	/**
	 * Format a relative change as a signed percentage, e.g. `+12.3%`.
	 * @param {number} change
	 */
	const formatPct = change => `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`;

	/**
	 * Mean and unbiased (n - 1) sample variance.
	 * @param {number[]} values
	 * @returns {{ mean: number, variance: number }}
	 */
	const sampleStats = values => {
		const mean = values.reduce((sum, v) => sum + v, 0) / values.length;
		const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / (values.length - 1);
		return { mean, variance };
	};

	// Parse the baseline once; both the console comparison and the CI summary use it.
	/** @type {any} */
	const baseline = opts.baseline && fs.existsSync(opts.baseline)
		? JSON.parse(fs.readFileSync(opts.baseline, 'utf-8'))
		: null;

	if (baseline) {
		const thresholdPct = (opts.threshold * 100).toFixed(0);

		console.log('');
		console.log(`[chat-simulation] =========== Baseline Comparison (threshold: ${thresholdPct}%) ===========`);
		console.log(`[chat-simulation] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`);
		if (jsonReport.mismatchedBuildMode) {
			console.log(`[chat-simulation] ⚠ Note: build mode mismatch — test is ${jsonReport.buildMode}, baseline differs.`);
			console.log('[chat-simulation] Results may not be directly comparable.');
		}
		console.log('');

		// Metrics that trigger regression failure when they exceed the threshold
		const regressionMetrics = [
			// [metric, group, unit]
			['timeToFirstToken', 'timing', 'ms'],
			['timeToComplete', 'timing', 'ms'],
			['layoutCount', 'rendering', ''],
			['recalcStyleCount', 'rendering', ''],
			['forcedReflowCount', 'rendering', ''],
			['longTaskCount', 'rendering', ''],
		];
		// Informational metrics — shown in comparison but don't trigger failure
		const infoMetrics = [
			['heapDelta', 'memory', 'MB'],
			['gcDurationMs', 'memory', 'ms'],
			['extHostHeapDelta', 'extHost', 'MB'],
			['extHostHeapDeltaPostGC', 'extHost', 'MB'],
		];

		for (const scenario of Object.keys(jsonReport.scenarios)) {
			const current = jsonReport.scenarios[scenario];
			const base = baseline.scenarios?.[scenario];
			if (!base) {
				console.log(` ${scenario}: (no baseline)`);
				continue;
			}

			/** @type {string[]} */
			const diffs = [];
			let scenarioRegression = false;

			for (const [metric, group, unit] of regressionMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				// A zero (or missing) baseline median has no defined relative change, so the
				// metric is skipped.
				// NOTE(review): this also hides 0 → N increases of count metrics — confirm intended.
				if (!cur || !bas || !bas.median) { continue; }

				const absoluteDelta = cur.median - bas.median;
				const change = absoluteDelta / bas.median;

				// Statistical significance via Welch's t-test on raw run values
				const ttest = welchTTest(rawValues(base, metric), rawValues(current, metric));
				const metricThreshold = getMetricThreshold(opts, metric);

				let flag = '';
				if (exceedsThreshold(metricThreshold, change, absoluteDelta)) {
					if (!ttest) {
						// No test result: reported as inconclusive rather than as a failure.
						flag = ' ← possible regression (n too small for significance test)';
						inconclusiveFound = true;
					} else if (ttest.significant) {
						flag = ` ← REGRESSION (p=${ttest.pValue}, ${ttest.confidence} confidence)`;
						scenarioRegression = true;
						regressionFound = true;
					} else {
						flag = ` (likely noise — p=${ttest.pValue}, not significant)`;
						inconclusiveFound = true;
					}
				} else if (ttest && change > 0 && ttest.significant && ttest.confidence === 'high') {
					// Below the threshold but statistically real: annotate without failing.
					flag = ` (significant increase, p=${ttest.pValue})`;
				}
				diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${formatPct(change)})${flag}`);
			}

			for (const [metric, group, unit] of infoMetrics) {
				const cur = current[group]?.[metric];
				const bas = base[group]?.[metric];
				if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; }
				// A zero baseline is displayed as a 0% change instead of dividing by zero.
				const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0;
				diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${formatPct(change)}) [info]`);
			}

			console.log(` ${scenario}: ${scenarioRegression ? 'FAIL' : 'OK'}`);
			if (scenarioRegression) { regressedScenarios.add(scenario); }
			for (const line of diffs) {
				console.log(line);
			}
		}

		console.log('');
		console.log(regressionFound
			? `[chat-simulation] REGRESSION DETECTED — exceeded ${thresholdPct}% threshold with statistical significance`
			: `[chat-simulation] All metrics within ${thresholdPct}% of baseline (or not statistically significant)`);

		if (inconclusiveFound && !regressionFound) {
			// Find the results.json path to suggest in the hint
			const resultsPath = Object.keys(jsonReport.scenarios).length > 0
				? (jsonReport._resultsPath || opts.resume || 'path/to/results.json')
				: 'path/to/results.json';

			// Estimate required runs from the observed effect size and variance
			// using power analysis for Welch's t-test (alpha=0.05, 80% power).
			// n_per_group = 2 * ((z_alpha/2 + z_beta) / d)^2 where d = Cohen's d
			let maxNeeded = 0;
			for (const scenario of Object.keys(jsonReport.scenarios)) {
				const current = jsonReport.scenarios[scenario];
				const base = baseline.scenarios?.[scenario];
				if (!base) { continue; }

				for (const metric of ['timeToFirstToken', 'timeToComplete', 'layoutCount', 'recalcStyleCount']) {
					const curRaw = rawValues(current, metric);
					const basRaw = rawValues(base, metric);
					// The sample variance needs at least two values per group.
					if (curRaw.length < 2 || basRaw.length < 2) { continue; }

					const baseStats = sampleStats(basRaw);
					const currentStats = sampleStats(curRaw);
					const pooledSD = Math.sqrt((baseStats.variance + currentStats.variance) / 2);
					if (pooledSD === 0) { continue; }
					const d = Math.abs(currentStats.mean - baseStats.mean) / pooledSD;
					if (d === 0) { continue; }

					// z_0.025 = 1.96, z_0.2 = 0.842
					const nPerGroup = Math.ceil(2 * ((1.96 + 0.842) / d) ** 2);
					const currentN = Math.min(curRaw.length, basRaw.length);
					maxNeeded = Math.max(maxNeeded, nPerGroup - currentN);
				}
			}
			// Always suggest at least one extra run and never more than 20.
			const suggestedRuns = Math.max(1, Math.min(maxNeeded, 20));

			console.log('');
			console.log('[chat-simulation] Some metrics exceeded the threshold but were not statistically significant.');
			console.log('[chat-simulation] To increase confidence, add more runs with --resume:');
			console.log(`[chat-simulation] npm run perf:chat -- --resume ${resultsPath} --runs ${suggestedRuns}`);
		}
	}

	// -- CI summary ------------------------------------------------------
	if (opts.ci) {
		const summary = generateCISummary(jsonReport, baseline, {
			threshold: opts.threshold,
			metricThresholds: opts.metricThresholds,
			runs: jsonReport.runsPerScenario || opts.runs,
			baselineBuild: baseline?.baselineBuildVersion || opts.baselineBuild,
			build: opts.build,
		});

		// Write to file for GitHub Actions $GITHUB_STEP_SUMMARY
		const summaryPath = path.join(DATA_DIR, 'ci-summary.md');
		fs.writeFileSync(summaryPath, summary);
		console.log(`[chat-simulation] CI summary written to ${summaryPath}`);

		// Also print the full summary table to stdout
		console.log('');
		console.log('==================================================================');
		console.log(' CHAT PERF COMPARISON RESULTS ');
		console.log('==================================================================');
		console.log('');
		console.log(summary);
	}

	// Record the failure without terminating here, so the regressed set is returned
	// and the caller's remaining cleanup still runs.
	if (regressionFound) { process.exitCode = 1; }
	return regressedScenarios;
}
|
|
|
|
// Entry point: run the benchmark; report any unhandled failure and exit non-zero.
main().catch(error => {
	console.error(error);
	process.exit(1);
});
|