/*--------------------------------------------------------------------------------------------- * Copyright (c) Microsoft Corporation. All rights reserved. * Licensed under the MIT License. See License.txt in the project root for license information. *--------------------------------------------------------------------------------------------*/ // @ts-check /** * Chat performance benchmark. * * Uses the real copilot extension with IS_SCENARIO_AUTOMATION=1 and a local * mock LLM server. Measures the full stack: prompt building, context * gathering, tool resolution, rendering, GC, and layout overhead. * * Usage: * npm run perf:chat # all scenarios vs 1.115.0 * npm run perf:chat -- --runs 10 # 10 runs per scenario * npm run perf:chat -- --scenario text-only # single scenario * npm run perf:chat -- --no-baseline # skip baseline comparison * npm run perf:chat -- --build 1.110.0 --baseline-build 1.115.0 * npm run perf:chat -- --resume .chat-simulation-data/2026-04-14/results.json --runs 3 */ const path = require('path'); const fs = require('fs'); const { ROOT, DATA_DIR, METRIC_DEFS, loadConfig, resolveBuild, isVersionString, buildEnv, buildArgs, prepareRunDir, robustStats, welchTTest, summarize, markDuration, launchVSCode, getNextExtHostInspectPort, connectToExtHostInspector, getRepoRoot, } = require('./common/utils'); const { getUserTurns, getScenarioIds } = require('./common/mock-llm-server'); const { registerPerfScenarios, getScenarioDescription } = require('./common/perf-scenarios'); // -- Config (edit config.jsonc to change defaults) --------------------------- const CONFIG = loadConfig('perfRegression'); // -- CLI args ---------------------------------------------------------------- function parseArgs() { const args = process.argv.slice(2); const opts = { runs: CONFIG.runsPerScenario ?? 5, verbose: false, ci: false, noCache: false, force: false, heapSnapshots: false, /** @type {string[]} */ scenarios: [], /** @type {string | undefined} */ build: undefined, /** @type {string | undefined} */ baseline: undefined, /** @type {string | undefined} */ baselineBuild: CONFIG.baselineBuild ?? '1.115.0', saveBaseline: false, threshold: CONFIG.regressionThreshold ?? 0.2, /** @type {Record} */ metricThresholds: CONFIG.metricThresholds ?? {}, /** @type {string | undefined} */ resume: undefined, productionBuild: false, /** @type {Record} */ settingsOverrides: {}, /** @type {Record} */ testSettingsOverrides: {}, /** @type {Record} */ baselineSettingsOverrides: {}, cleanupDiagnostics: false, }; for (let i = 0; i < args.length; i++) { switch (args[i]) { case '--runs': opts.runs = parseInt(args[++i], 10); break; case '--verbose': opts.verbose = true; break; case '--scenario': case '-s': opts.scenarios.push(args[++i]); break; case '--build': case '-b': opts.build = args[++i]; break; case '--baseline': opts.baseline = args[++i]; break; case '--baseline-build': opts.baselineBuild = args[++i]; break; case '--no-baseline': opts.baselineBuild = undefined; break; case '--save-baseline': opts.saveBaseline = true; break; case '--threshold': opts.threshold = parseFloat(args[++i]); break; case '--resume': opts.resume = args[++i]; break; case '--production-build': opts.productionBuild = true; break; case '--setting': case '--test-setting': case '--baseline-setting': { const kv = args[++i]; const eq = kv.indexOf('='); if (eq === -1) { console.error(`${args[i - 1]} requires key=value, got: ${kv}`); process.exit(1); } const key = kv.slice(0, eq); const raw = kv.slice(eq + 1); // Parse booleans and numbers, keep rest as strings const val = raw === 'true' ? true : raw === 'false' ? false : /^-?\d+(\.\d+)?$/.test(raw) ? Number(raw) : raw; const flag = args[i - 1]; if (flag === '--test-setting') { opts.testSettingsOverrides[key] = val; } else if (flag === '--baseline-setting') { opts.baselineSettingsOverrides[key] = val; } else { opts.settingsOverrides[key] = val; } break; } case '--no-cache': opts.noCache = true; break; case '--force': opts.force = true; break; case '--heap-snapshots': opts.heapSnapshots = true; break; case '--ci': opts.ci = true; opts.noCache = true; opts.heapSnapshots = true; opts.cleanupDiagnostics = true; break; case '--cleanup-diagnostics': opts.cleanupDiagnostics = true; break; case '--help': case '-h': console.log([ 'Chat performance benchmark', '', 'Options:', ' --runs Number of runs per scenario (default: 5)', ' --scenario Scenario to run (repeatable; default: all)', ' --build Path to VS Code build, or a version to download', ' (e.g. "1.110.0", "insiders", commit hash, or local path)', ' --baseline Compare against a baseline JSON file', ' --baseline-build Version or path to benchmark as baseline', ' (e.g. "1.115.0", "insiders", commit hash, or local path)', ' --no-baseline Skip baseline comparison entirely', ' --save-baseline Save results as the new baseline (requires --baseline )', ' --resume Resume a previous run, adding more iterations to increase', ' confidence. Merges new runs with existing rawRuns data', ' --threshold Regression threshold fraction (default: 0.2 = 20%)', ' --production-build Build a local bundled package (via gulp vscode) for', ' apples-to-apples comparison against a release baseline', ' --setting Set a VS Code setting override for all builds (repeatable)', ' --test-setting Set a VS Code setting override for test build only', ' --baseline-setting Set a VS Code setting override for baseline build only', ' e.g. --setting chat.experimental.incrementalRendering.enabled=true', ' --no-cache Ignore cached baseline data, always run fresh', ' --force Skip build mode mismatch confirmation', ' --heap-snapshots Take heap snapshots (slow; auto-enabled in --ci mode)', ' --ci CI mode: write Markdown summary to ci-summary.md (implies --no-cache, --heap-snapshots, --cleanup-diagnostics)', ' --cleanup-diagnostics Remove heap snapshots, CPU profiles, and traces after each run to save disk space', ' --verbose Print per-run details', '', 'Scenarios: ' + getScenarioIds().join(', '), ].join('\n')); process.exit(0); } } if (opts.scenarios.length === 0) { opts.scenarios = getScenarioIds(); } else { const knownIds = new Set(getScenarioIds()); const unknown = opts.scenarios.filter(s => !knownIds.has(s)); if (unknown.length > 0) { console.error(`Unknown scenario(s): ${unknown.join(', ')}\nAvailable: ${[...knownIds].join(', ')}`); process.exit(1); } } return opts; } // -- Build mode detection ---------------------------------------------------- /** * Classify an electron path into a build mode. * @param {string} electronPath * @returns {'dev' | 'production' | 'release'} */ function detectBuildMode(electronPath) { if (electronPath.includes('.vscode-test')) { return 'release'; } if (electronPath.includes('VSCode-')) { return 'production'; } return 'dev'; } /** * Return a human-readable label for a build mode. * @param {'dev' | 'production' | 'release'} mode * @returns {string} */ function buildModeLabel(mode) { switch (mode) { case 'dev': return 'development (unbundled)'; case 'production': return 'production (bundled, local)'; case 'release': return 'release (bundled, downloaded)'; } } // -- Production build -------------------------------------------------------- /** * Build a local production (bundled) VS Code package using `gulp vscode`. * Returns the path to the Electron executable in the packaged output. * * The gulp task compiles TypeScript, bundles JS, and packages with Electron * into `../VSCode--/`. This is the same process used for * release builds, minus minification and mangling. */ function buildProductionBuild() { const product = require(path.join(ROOT, 'product.json')); const platform = process.platform; const arch = process.arch; const destDir = path.join(ROOT, '..', `VSCode-${platform}-${arch}`); console.log('[chat-simulation] Building local production package (gulp vscode)...'); console.log('[chat-simulation] This may take a few minutes on the first run.'); const { execSync } = require('child_process'); try { execSync('npm run gulp -- vscode', { cwd: ROOT, stdio: 'inherit', timeout: 10 * 60 * 1000, // 10 minute timeout }); } catch (e) { // The copilot shim step may fail locally when the copilot SDK is not // fully packaged (it is normally supplied via CI). As long as the // Electron executable was produced we can still benchmark. console.warn('[chat-simulation] gulp vscode exited with errors (see above). Checking if executable was still produced...'); } /** @type {string} */ let electronPath; if (platform === 'darwin') { electronPath = path.join(destDir, `${product.nameLong}.app`, 'Contents', 'MacOS', product.nameShort); } else if (platform === 'linux') { electronPath = path.join(destDir, product.applicationName); } else { electronPath = path.join(destDir, `${product.nameShort}.exe`); } if (!fs.existsSync(electronPath)) { console.error(`[chat-simulation] Production build failed — executable not found at: ${electronPath}`); process.exit(1); } // Merge product.overrides.json into the packaged product.json. // The overrides file contains extensionsGallery and other config that // the OSS product.json lacks. In dev builds these are loaded at // runtime when VSCODE_DEV is set, but the production build doesn't // set that flag so we bake them in. const overridesPath = path.join(ROOT, 'product.overrides.json'); if (fs.existsSync(overridesPath)) { /** @type {string} */ let appDir; if (platform === 'darwin') { appDir = path.join(destDir, `${product.nameLong}.app`, 'Contents', 'Resources', 'app'); } else { appDir = path.join(destDir, 'resources', 'app'); } const packagedProductPath = path.join(appDir, 'product.json'); if (fs.existsSync(packagedProductPath)) { const packagedProduct = JSON.parse(fs.readFileSync(packagedProductPath, 'utf-8')); const overrides = JSON.parse(fs.readFileSync(overridesPath, 'utf-8')); const merged = Object.assign(packagedProduct, overrides); fs.writeFileSync(packagedProductPath, JSON.stringify(merged, null, '\t')); console.log('[chat-simulation] Merged product.overrides.json into packaged product.json'); } } console.log(`[chat-simulation] Production build ready: ${electronPath}`); return electronPath; } /** * @typedef {{ type: 'fraction', value: number } | { type: 'absolute', value: number }} MetricThreshold */ /** * Parse a metric threshold value from config. * - A number is treated as a fraction (e.g. 0.2 = 20%). * - A string like "100ms" or "5" is treated as an absolute delta. * @param {number | string} raw * @returns {MetricThreshold} */ function parseMetricThreshold(raw) { if (typeof raw === 'number') { return { type: 'fraction', value: raw }; } // Strip unit suffix (ms, MB, etc.) and parse the number const num = parseFloat(raw); if (isNaN(num)) { throw new Error(`Invalid metric threshold: ${raw}`); } return { type: 'absolute', value: num }; } /** * Get the regression threshold for a specific metric. * Uses per-metric override from config if available, otherwise the global threshold. * @param {{ threshold: number, metricThresholds?: Record }} opts * @param {string} metric * @returns {MetricThreshold} */ function getMetricThreshold(opts, metric) { const raw = opts.metricThresholds?.[metric]; if (raw !== undefined) { return parseMetricThreshold(raw); } return { type: 'fraction', value: opts.threshold }; } /** * Check whether a change exceeds the threshold. * @param {MetricThreshold} threshold * @param {number} change - fractional change (e.g. 0.5 = 50% increase) * @param {number} absoluteDelta - absolute difference (cur.median - bas.median) * @returns {boolean} */ function exceedsThreshold(threshold, change, absoluteDelta) { if (threshold.type === 'absolute') { return absoluteDelta > threshold.value; } return change > threshold.value; } // -- Metrics ----------------------------------------------------------------- /** * @typedef {{ * timeToUIUpdated: number, * timeToFirstToken: number, * timeToComplete: number, * timeToRenderComplete: number, * instructionCollectionTime: number, * agentInvokeTime: number, * heapUsedBefore: number, * heapUsedAfter: number, * heapDelta: number, * heapDeltaPostGC: number, * majorGCs: number, * minorGCs: number, * gcDurationMs: number, * layoutCount: number, * layoutDurationMs: number, * recalcStyleCount: number, * forcedReflowCount: number, * longTaskCount: number, * longAnimationFrameCount: number, * longAnimationFrameTotalMs: number, * frameCount: number, * compositeLayers: number, * paintCount: number, * hasInternalMarks: boolean, * responseHasContent: boolean, * internalFirstToken: number, * profilePath: string, * tracePath: string, * snapshotPath: string, * extHostHeapUsedBefore: number, * extHostHeapUsedAfter: number, * extHostHeapDelta: number, * extHostHeapDeltaPostGC: number, * extHostProfilePath: string, * extHostSnapshotPath: string, * }} RunMetrics */ // -- Single run -------------------------------------------------------------- /** * @param {string} electronPath * @param {string} scenario * @param {{ url: string, requestCount: () => number, waitForRequests: (n: number, ms: number) => Promise, completionCount: () => number, waitForCompletion: (n: number, ms: number) => Promise }} mockServer * @param {boolean} verbose * @param {string} runIndex * @param {string} runDir - timestamped run directory for diagnostics * @param {'baseline' | 'test'} role - whether this is a baseline or test run * @param {Record} [settingsOverrides] - custom VS Code settings * @param {{ heapSnapshots?: boolean }} [runOpts] - additional run options * @returns {Promise} */ async function runOnce(electronPath, scenario, mockServer, verbose, runIndex, runDir, role, settingsOverrides, runOpts) { const takeHeapSnapshots = runOpts?.heapSnapshots ?? false; const { userDataDir, extDir, logsDir } = prepareRunDir(runIndex, mockServer, settingsOverrides); const isDevBuild = !electronPath.includes('.vscode-test') && !electronPath.includes('VSCode-'); // Extract a clean build label from the path. // Dev: .build/electron/Code - OSS.app/.../Code - OSS → "dev" // Stable: .vscode-test/vscode-darwin-arm64-1.115.0/Visual Studio Code.app/.../Electron → "1.115.0" // Production: ../VSCode-darwin-arm64/Code - OSS.app/.../Code - OSS → "production" let buildLabel = 'dev'; if (!isDevBuild) { const vscodeTestMatch = electronPath.match(/vscode-test\/vscode-[^/]*?-(\d+\.\d+\.\d+)/); if (vscodeTestMatch) { buildLabel = vscodeTestMatch[1]; } else if (electronPath.includes('VSCode-')) { buildLabel = 'production'; } else { buildLabel = path.basename(electronPath); } } // For dev builds from a different repo, derive the repo root from the // electron path so that the build loads its own out/ source code. const appRoot = isDevBuild ? (getRepoRoot(electronPath) || ROOT) : ROOT; if (isDevBuild && appRoot !== ROOT) { if (verbose) { console.log(` [debug] Using appRoot from electron path: ${appRoot}`); } } // Create a per-run diagnostics directory: /-/-/ const runDiagDir = path.join(runDir, `${role}-${buildLabel}`, runIndex.replace(/^baseline-/, '')); fs.mkdirSync(runDiagDir, { recursive: true }); const tracePath = path.join(runDiagDir, 'trace.json'); const extHostInspectPort = getNextExtHostInspectPort(); const vscode = await launchVSCode( electronPath, buildArgs(userDataDir, extDir, logsDir, { isDevBuild, extHostInspectPort, traceFile: tracePath, appRoot }), buildEnv(mockServer, { isDevBuild }), { verbose }, ); activeVSCode = vscode; const window = vscode.page; // Declared outside try so the finally block can clean up /** @type {{ send: (method: string, params?: any) => Promise, on: (event: string, listener: (params: any) => void) => void, close: () => void } | null} */ let extHostInspector = null; /** @type {{ usedSize: number, totalSize: number } | null} */ let extHostHeapBefore = null; /** @type {Omit | null} */ let partialMetrics = null; // Timing vars hoisted for access in post-close trace parsing let submitTime = 0; let firstResponseTime = 0; let responseCompleteTime = 0; let renderCompleteTime = 0; try { await window.waitForSelector('.monaco-workbench', { timeout: 60_000 }); const cdp = await window.context().newCDPSession(window); await cdp.send('Performance.enable'); const heapBefore = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); const metricsBefore = await cdp.send('Performance.getMetrics'); // Open chat const chatShortcut = process.platform === 'darwin' ? 'Control+Meta+KeyI' : 'Control+Alt+KeyI'; await window.keyboard.press(chatShortcut); const CHAT_VIEW = 'div[id="workbench.panel.chat"]'; const chatEditorSel = `${CHAT_VIEW} .interactive-input-part .monaco-editor[role="code"]`; await window.waitForSelector(CHAT_VIEW, { timeout: 15_000 }); await window.waitForFunction( (selector) => Array.from(document.querySelectorAll(selector)).some(el => { const rect = el.getBoundingClientRect(); return rect.width > 0 && rect.height > 0; }), chatEditorSel, { timeout: 15_000 }, ); // Dismiss dialogs const dismissDialog = async () => { for (const sel of ['.chat-setup-dialog', '.dialog-shadow', '.monaco-dialog-box']) { const el = await window.$(sel); if (el) { await window.keyboard.press('Escape'); await new Promise(r => setTimeout(r, 500)); break; } } }; await dismissDialog(); // Wait for extension activation const reqsBefore = mockServer.requestCount(); try { await mockServer.waitForRequests(reqsBefore + 4, 30_000); } catch { } if (verbose) { console.log(` [debug] Extension active (${mockServer.requestCount() - reqsBefore} new requests)`); } // Connect to extension host inspector for profiling/heap data try { extHostInspector = await connectToExtHostInspector(extHostInspectPort, { verbose, timeoutMs: 15_000 }); await extHostInspector.send('HeapProfiler.enable'); await extHostInspector.send('Profiler.enable'); await extHostInspector.send('Profiler.start'); extHostHeapBefore = await extHostInspector.send('Runtime.getHeapUsage'); if (verbose && extHostHeapBefore) { console.log(` [ext-host] Heap before: ${Math.round(extHostHeapBefore.usedSize / 1024 / 1024)}MB`); } } catch (err) { if (verbose) { console.log(` [ext-host] Could not connect to inspector: ${err}`); } } // Wait for model resolution await new Promise(r => setTimeout(r, 3000)); await dismissDialog(); // Focus input await window.click(chatEditorSel); const focusStart = Date.now(); while (Date.now() - focusStart < 5_000) { const focused = await window.evaluate((sel) => { const el = document.querySelector(sel); return el && (el.classList.contains('focused') || el.contains(document.activeElement)); }, chatEditorSel).catch(() => false); if (focused) { break; } await new Promise(r => setTimeout(r, 50)); } // Type message — use the smoke-test driver's typeInEditor when available // (dev builds), fall back to pressSequentially for stable/insiders builds. const chatMessage = `[scenario:${scenario}] Explain how this code works`; const actualInputSelector = await window.evaluate((editorSel) => { const editor = document.querySelector(editorSel); if (!editor) { throw new Error('Chat editor not found'); } return editor.querySelector('.native-edit-context') ? editorSel + ' .native-edit-context' : editorSel + ' textarea'; }, chatEditorSel); const hasDriver = await window.evaluate(() => // @ts-ignore !!globalThis.driver?.typeInEditor ).catch(() => false); if (hasDriver) { await window.evaluate(({ selector, text }) => { // @ts-ignore return globalThis.driver.typeInEditor(selector, text); }, { selector: actualInputSelector, text: chatMessage }); } else { // Fallback: click the input element and use pressSequentially await window.click(actualInputSelector); await new Promise(r => setTimeout(r, 200)); await window.locator(actualInputSelector).pressSequentially(chatMessage, { delay: 0 }); } // Start CPU profiler to capture call stacks during the interaction await cdp.send('Profiler.enable'); await cdp.send('Profiler.start'); // Submit const completionsBefore = mockServer.completionCount(); submitTime = Date.now(); await window.keyboard.press('Enter'); // Wait for mock server to serve the response try { await mockServer.waitForCompletion(completionsBefore + 1, 60_000); } catch { } firstResponseTime = Date.now(); // Wait for DOM response to settle await dismissDialog(); const responseSelector = `${CHAT_VIEW} .interactive-item-container.interactive-response`; await window.waitForFunction( (sel) => { const responses = document.querySelectorAll(sel); if (responses.length === 0) { return false; } return !responses[responses.length - 1].classList.contains('chat-response-loading'); }, responseSelector, { timeout: 30_000 }, ); responseCompleteTime = Date.now(); // -- User turn injection loop ----------------------------------------- // For multi-turn scenarios with user follow-ups, type each follow-up // message and wait for the model's response to settle. const userTurns = getUserTurns(scenario); for (let ut = 0; ut < userTurns.length; ut++) { const userTurn = userTurns[ut]; if (verbose) { console.log(` [debug] User follow-up ${ut + 1}/${userTurns.length}: "${userTurn.message}"`); } // Brief pause to let the UI settle between turns await new Promise(r => setTimeout(r, 500)); // Focus the chat input await window.click(chatEditorSel); const utFocusStart = Date.now(); while (Date.now() - utFocusStart < 3_000) { const focused = await window.evaluate((sel) => { const el = document.querySelector(sel); return el && (el.classList.contains('focused') || el.contains(document.activeElement)); }, chatEditorSel).catch(() => false); if (focused) { break; } await new Promise(r => setTimeout(r, 50)); } // Type the follow-up message if (hasDriver) { await window.evaluate(({ selector, text }) => { // @ts-ignore return globalThis.driver.typeInEditor(selector, text); }, { selector: actualInputSelector, text: userTurn.message }); } else { await window.click(actualInputSelector); await new Promise(r => setTimeout(r, 200)); await window.locator(actualInputSelector).pressSequentially(userTurn.message, { delay: 0 }); } // Submit follow-up const utCompBefore = mockServer.completionCount(); await window.keyboard.press('Enter'); // Wait for mock server to serve the response for this turn try { await mockServer.waitForCompletion(utCompBefore + 1, 60_000); } catch { } // Wait for the new response to finish rendering. // The chat list is virtualized — old response elements are // recycled out of the DOM as new ones appear, so we cannot // rely on counting DOM elements. Instead, scroll to the // bottom and wait for no response to be in loading state. await dismissDialog(); await window.evaluate((chatViewSel) => { const input = document.querySelector(chatViewSel + ' .interactive-input-part'); if (input) { input.scrollIntoView({ block: 'end' }); } }, CHAT_VIEW); await new Promise(r => setTimeout(r, 200)); await window.waitForFunction( (sel) => { const responses = document.querySelectorAll(sel); if (responses.length === 0) { return false; } return !responses[responses.length - 1].classList.contains('chat-response-loading'); }, responseSelector, { timeout: 30_000 }, ); responseCompleteTime = Date.now(); if (verbose) { const utResponseInfo = await window.evaluate((sel) => { const responses = document.querySelectorAll(sel); const last = responses[responses.length - 1]; return last ? (last.textContent || '').substring(0, 150) : '(empty)'; }, responseSelector); console.log(` [debug] Follow-up response (first 150 chars): ${utResponseInfo}`); } } // Stop CPU profiler and save the profile const { profile } = /** @type {any} */ (await cdp.send('Profiler.stop')); const profilePath = path.join(runDiagDir, 'profile.cpuprofile'); fs.writeFileSync(profilePath, JSON.stringify(profile)); if (verbose) { console.log(` [debug] CPU profile saved to ${profilePath}`); } const responseInfo = await window.evaluate((sel) => { const responses = document.querySelectorAll(sel); const last = responses[responses.length - 1]; if (!last) { return { hasContent: false, text: '' }; } const text = last.textContent || ''; return { hasContent: text.trim().length > 0, text: text.substring(0, 200) }; }, responseSelector); if (verbose) { console.log(` [debug] Response content (first 200 chars): ${responseInfo.text}`); console.log(` [debug] Client-side timing: firstResponse=${firstResponseTime - submitTime}ms, complete=${responseCompleteTime - submitTime}ms`); } // Wait for the typewriter animation to finish rendering. // The chat UI animates streamed content word-by-word after the // response stream completes. We need to wait until all content // is rendered before capturing layout/style metrics, otherwise // we miss the rendering phase where batching optimizations matter. await window.waitForFunction( (sel) => { const responses = document.querySelectorAll(sel); const last = responses[responses.length - 1]; if (!last) { return true; } // The typewriter animation is done when there are no // elements with the 'typewriter' or 'animating' class, // and no pending cursor animations. const hasAnimating = last.querySelector('.chat-animated-word, .chat-typewriter-cursor'); return !hasAnimating; }, responseSelector, { timeout: 30_000 }, ).catch(() => { // Fallback: if the selector-based check doesn't work (e.g. // the CSS classes differ across versions), wait for content // to stabilize by polling textContent. }); // Additional stabilization: poll until textContent stops changing. // This catches any remaining animation regardless of CSS class names. { let prev = ''; let stableCount = 0; const stabilizeStart = Date.now(); while (stableCount < 3 && Date.now() - stabilizeStart < 10_000) { const current = await window.evaluate((sel) => { const responses = document.querySelectorAll(sel); const last = responses[responses.length - 1]; return last ? (last.textContent || '') : ''; }, responseSelector).catch(() => ''); if (current === prev) { stableCount++; } else { stableCount = 0; prev = current; } await new Promise(r => setTimeout(r, 100)); } } renderCompleteTime = Date.now(); if (verbose) { console.log(` [debug] Render stabilized: ${renderCompleteTime - responseCompleteTime}ms after stream complete`); } const heapAfter = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); const metricsAfter = await cdp.send('Performance.getMetrics'); // -- Extension host metrics (non-snapshot) --------------------------- let extHostHeapUsedBefore = -1; let extHostHeapUsedAfter = -1; let extHostHeapDelta = -1; let extHostHeapDeltaPostGC = -1; let extHostProfilePath = ''; let extHostSnapshotPath = ''; if (extHostInspector && extHostHeapBefore) { try { extHostHeapUsedBefore = Math.round(extHostHeapBefore.usedSize / 1024 / 1024); // Stop CPU profiler and save const extProfile = await extHostInspector.send('Profiler.stop'); extHostProfilePath = path.join(runDiagDir, 'exthost-profile.cpuprofile'); fs.writeFileSync(extHostProfilePath, JSON.stringify(extProfile.profile)); if (verbose) { console.log(` [ext-host] CPU profile saved to ${extHostProfilePath}`); } // Heap usage after interaction const extHostHeapAfter = await extHostInspector.send('Runtime.getHeapUsage'); extHostHeapUsedAfter = Math.round(extHostHeapAfter.usedSize / 1024 / 1024); extHostHeapDelta = extHostHeapUsedAfter - extHostHeapUsedBefore; // Force GC and measure retained heap try { await extHostInspector.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true }); await new Promise(r => setTimeout(r, 200)); const extHostHeapPostGC = await extHostInspector.send('Runtime.getHeapUsage'); extHostHeapDeltaPostGC = Math.round(extHostHeapPostGC.usedSize / 1024 / 1024) - extHostHeapUsedBefore; } catch { extHostHeapDeltaPostGC = -1; } if (verbose) { console.log(` [ext-host] Heap: before=${extHostHeapUsedBefore}MB, after=${extHostHeapUsedAfter}MB, delta=${extHostHeapDelta}MB, deltaPostGC=${extHostHeapDeltaPostGC}MB`); } } catch (err) { if (verbose) { console.log(` [ext-host] Error collecting metrics: ${err}`); } } } // -- Heap snapshots (opt-in, parallelized) --------------------------- let snapshotPath = ''; if (takeHeapSnapshots) { const snapshotPromises = []; // Renderer snapshot snapshotPromises.push((async () => { const p = path.join(runDiagDir, 'heap.heapsnapshot'); await cdp.send('HeapProfiler.enable'); const chunks = /** @type {string[]} */ ([]); cdp.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => { chunks.push(params.chunk); }); await cdp.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false }); fs.writeFileSync(p, chunks.join('')); return p; })()); // Extension host snapshot (parallel with renderer) if (extHostInspector && extHostHeapBefore) { snapshotPromises.push((async () => { const p = path.join(runDiagDir, 'exthost-heap.heapsnapshot'); const chunks = /** @type {string[]} */ ([]); extHostInspector.on('HeapProfiler.addHeapSnapshotChunk', (/** @type {any} */ params) => { chunks.push(params.chunk); }); await extHostInspector.send('HeapProfiler.takeHeapSnapshot', { reportProgress: false }); fs.writeFileSync(p, chunks.join('')); return p; })()); } const snapshotResults = await Promise.all(snapshotPromises); snapshotPath = snapshotResults[0]; if (snapshotResults.length > 1) { extHostSnapshotPath = snapshotResults[1]; } if (verbose) { console.log(` [debug] Renderer snapshot saved to ${snapshotPath}`); if (extHostSnapshotPath) { console.log(` [ext-host] Snapshot saved to ${extHostSnapshotPath}`); } } } // Close ext host inspector now that snapshots (if any) are done if (extHostInspector) { extHostInspector.close(); } // Store partial metrics here so we can combine with trace data after close. /** @param {any} r @param {string} name */ function getMetric(r, name) { const e = r.metrics?.find((/** @type {any} */ m) => m.name === name); return e ? e.value : 0; } partialMetrics = { heapUsedBefore: Math.round(heapBefore.usedSize / 1024 / 1024), heapUsedAfter: Math.round(heapAfter.usedSize / 1024 / 1024), heapDelta: Math.round((heapAfter.usedSize - heapBefore.usedSize) / 1024 / 1024), heapDeltaPostGC: await (async () => { // Force a full GC then measure heap to get deterministic retained-memory delta. // --js-flags=--expose-gc is not required: CDP's Runtime.evaluate can call gc() // when includeCommandLineAPI is true. try { await cdp.send('Runtime.evaluate', { expression: 'gc()', awaitPromise: false, includeCommandLineAPI: true }); await new Promise(r => setTimeout(r, 200)); const heapPostGC = /** @type {any} */ (await cdp.send('Runtime.getHeapUsage')); return Math.round((heapPostGC.usedSize - heapBefore.usedSize) / 1024 / 1024); } catch { return -1; // gc() not available in this build } })(), layoutCount: getMetric(metricsAfter, 'LayoutCount') - getMetric(metricsBefore, 'LayoutCount'), recalcStyleCount: getMetric(metricsAfter, 'RecalcStyleCount') - getMetric(metricsBefore, 'RecalcStyleCount'), forcedReflowCount: getMetric(metricsAfter, 'ForcedStyleRecalcs') - getMetric(metricsBefore, 'ForcedStyleRecalcs'), frameCount: getMetric(metricsAfter, 'FrameCount') - getMetric(metricsBefore, 'FrameCount'), compositeLayers: getMetric(metricsAfter, 'CompositeLayers') - getMetric(metricsBefore, 'CompositeLayers'), paintCount: getMetric(metricsAfter, 'PaintCount') - getMetric(metricsBefore, 'PaintCount'), responseHasContent: responseInfo.hasContent, profilePath, tracePath, snapshotPath, extHostHeapUsedBefore, extHostHeapUsedAfter, extHostHeapDelta, extHostHeapDeltaPostGC, extHostProfilePath, extHostSnapshotPath, }; } finally { if (extHostInspector) { try { extHostInspector.close(); } catch { } } activeVSCode = null; await vscode.close(); } // Read the trace file written by VS Code on exit via --trace-startup-file /** @type {Array} */ let traceEvents = []; try { const traceData = JSON.parse(fs.readFileSync(tracePath, 'utf-8')); traceEvents = traceData.traceEvents || []; } catch { // Trace file may not exist if VS Code crashed before shutdown } // Extract code/chat/* perf marks from blink.user_timing trace events. // These appear as instant ('R' or 'I') events with timestamps in microseconds. const chatMarks = traceEvents .filter(e => e.cat === 'blink.user_timing' && e.name && e.name.startsWith('code/chat/')) .map(e => ({ name: e.name, startTime: e.ts / 1000 })); if (verbose && chatMarks.length > 0) { console.log(` [trace] chatMarks (${chatMarks.length}): ${chatMarks.map((/** @type {any} */ m) => m.name.split('/').slice(-1)[0]).join(', ')}`); } // Parse timing — prefer internal code/chat/* marks (precise, in-process) // with client-side Date.now() as fallback for older builds without marks. const timeToUIUpdated = markDuration(chatMarks, 'request/start', 'request/uiUpdated'); const internalFirstToken = markDuration(chatMarks, 'request/start', 'request/firstToken'); const timeToFirstToken = internalFirstToken >= 0 ? internalFirstToken : (firstResponseTime - submitTime); const timeToComplete = responseCompleteTime - submitTime; const timeToRenderComplete = renderCompleteTime - submitTime; const instructionCollectionTime = markDuration(chatMarks, 'request/willCollectInstructions', 'request/didCollectInstructions'); const agentInvokeTime = markDuration(chatMarks, 'agent/willInvoke', 'agent/didInvoke'); // Parse GC events from trace. // Use the trace-event category and phase fields which are stable // across V8 versions, rather than matching event name substrings. let majorGCs = 0, minorGCs = 0, gcDurationMs = 0; for (const event of traceEvents) { const isGC = event.cat === 'v8.gc' || event.cat === 'devtools.timeline,v8' || (typeof event.cat === 'string' && event.cat.split(',').some((/** @type {string} */ c) => { const t = c.trim(); return t === 'v8.gc' || t === 'disabled-by-default-v8.gc' || t === 'disabled-by-default-v8.gc_stats'; })); if (!isGC) { continue; } // Only count complete ('X') or duration-begin ('B') events to // avoid double-counting begin/end pairs. if (event.ph && event.ph !== 'X' && event.ph !== 'B') { continue; } const name = event.name || ''; if (/Major|MarkCompact|MSC|MC|IncrementalMarking|FinalizeMC/i.test(name)) { majorGCs++; } else if (/Minor|Scaveng/i.test(name)) { minorGCs++; } else { minorGCs++; } // default unknown GC events to minor if (event.dur) { gcDurationMs += event.dur / 1000; } } // Parse Layout duration from devtools.timeline trace events. let layoutDurationMs = 0; for (const event of traceEvents) { if (event.name === 'Layout' && event.ph === 'X' && event.dur) { layoutDurationMs += event.dur / 1000; } } let longTaskCount = 0; for (const event of traceEvents) { if (event.name === 'RunTask' && event.dur && event.dur > 50_000) { longTaskCount++; } } // Parse Long Animation Frame (LoAF) events from devtools.timeline trace. // AnimationFrame events use async flow pairs (ph:'s' start, ph:'f' finish) // with matching ids. Compute duration from each s→f pair. let longAnimationFrameCount = 0; let longAnimationFrameTotalMs = 0; { /** @type {Map} */ const frameStarts = new Map(); for (const event of traceEvents) { if (event.cat === 'devtools.timeline' && event.name === 'AnimationFrame') { if (event.ph === 's') { frameStarts.set(event.id, event.ts); } else if (event.ph === 'f' && frameStarts.has(event.id)) { const durationMs = (event.ts - /** @type {number} */(frameStarts.get(event.id))) / 1000; frameStarts.delete(event.id); if (durationMs > 50) { longAnimationFrameCount++; longAnimationFrameTotalMs += durationMs; } } } } } return { ...partialMetrics, timeToUIUpdated, timeToFirstToken, timeToComplete, timeToRenderComplete, instructionCollectionTime, agentInvokeTime, hasInternalMarks: chatMarks.length > 0, internalFirstToken, majorGCs, minorGCs, gcDurationMs: Math.round(gcDurationMs * 100) / 100, layoutDurationMs: Math.round(layoutDurationMs * 100) / 100, longTaskCount, longAnimationFrameCount, longAnimationFrameTotalMs: Math.round(longAnimationFrameTotalMs * 100) / 100, }; } // -- CI summary generation --------------------------------------------------- const GITHUB_REPO = 'https://github.com/microsoft/vscode'; /** * Format a build identifier as a Markdown link when possible. * - Commit SHAs link to the commit page. * - Semver versions link to the release tag page. * - Everything else (e.g. "baseline", "dev (local)") is returned as inline code. * @param {string} label * @returns {string} */ function formatBuildLink(label) { if (/^[0-9a-f]{7,40}$/.test(label)) { const short = label.substring(0, 7); return `[\`${short}\`](${GITHUB_REPO}/commit/${label})`; } if (/^\d+\.\d+\.\d+/.test(label)) { return `[\`${label}\`](${GITHUB_REPO}/releases/tag/${label})`; } return `\`${label}\``; } /** * Build a GitHub compare link between two build identifiers, if both are * commit-like or version-like references. Returns empty string otherwise. * @param {string} base * @param {string} test * @returns {string} */ function formatCompareLink(base, test) { const isRef = (/** @type {string} */ v) => /^[0-9a-f]{7,40}$/.test(v) || /^\d+\.\d+\.\d+/.test(v); if (!isRef(base) || !isRef(test)) { return ''; } return `[compare](${GITHUB_REPO}/compare/${base}...${test})`; } /** * Generate a detailed Markdown summary table for CI. * Printed to stdout and written to ci-summary.md. * * @param {Record} jsonReport * @param {Record | null} baseline * @param {{ threshold: number, metricThresholds?: Record, runs: number, baselineBuild?: string, build?: string }} opts */ function generateCISummary(jsonReport, baseline, opts) { const baseLabel = opts.baselineBuild || 'baseline'; const testBuildMode = jsonReport.buildMode || 'dev'; const testLabel = testBuildMode === 'dev' ? 'dev (local)' : testBuildMode === 'production' ? 'production (local)' : opts.build || testBuildMode; const baseLink = formatBuildLink(baseLabel); const testLink = formatBuildLink(testLabel); const compareLink = formatCompareLink(baseLabel, testLabel); const allMetrics = [ ['timeToFirstToken', 'timing', 'ms'], ['timeToComplete', 'timing', 'ms'], ['layoutCount', 'rendering', ''], ['recalcStyleCount', 'rendering', ''], ['forcedReflowCount', 'rendering', ''], ['longTaskCount', 'rendering', ''], ['longAnimationFrameCount', 'rendering', ''], ['longAnimationFrameTotalMs', 'rendering', 'ms'], ['frameCount', 'rendering', ''], ['compositeLayers', 'rendering', ''], ['paintCount', 'rendering', ''], ['heapDelta', 'memory', 'MB'], ['heapDeltaPostGC', 'memory', 'MB'], ['gcDurationMs', 'memory', 'ms'], ['extHostHeapDelta', 'extHost', 'MB'], ['extHostHeapDeltaPostGC', 'extHost', 'MB'], ]; const regressionMetricNames = new Set(['timeToFirstToken', 'timeToComplete', 'forcedReflowCount', 'longTaskCount', 'longAnimationFrameCount']); const lines = []; const scenarios = Object.keys(jsonReport.scenarios); // -- Collect verdicts per scenario/metric -------------------------------- /** @type {Map} */ const scenarioVerdicts = new Map(); let totalRegressions = 0; let totalImprovements = 0; for (const scenario of scenarios) { const current = jsonReport.scenarios[scenario]; const base = baseline?.scenarios?.[scenario]; /** @type {{ metric: string, verdict: string, change: number, pValue: string, basStr: string, curStr: string }[]} */ const verdicts = []; if (base) { for (const [metric, group, unit] of allMetrics) { const cur = current[group]?.[metric]; const bas = base[group]?.[metric]; if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; } const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0; const isRegressionMetric = regressionMetricNames.has(metric); const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); const ttest = welchTTest(basRaw, curRaw); const pStr = ttest ? `${ttest.pValue}` : 'n/a'; const metricThreshold = getMetricThreshold(opts, metric); const absoluteDelta = cur.median - bas.median; let verdict = ''; if (isRegressionMetric) { if (exceedsThreshold(metricThreshold, change, absoluteDelta)) { if (!ttest || ttest.significant) { verdict = 'REGRESSION'; totalRegressions++; } else { verdict = 'noise'; } } else if (exceedsThreshold(metricThreshold, -change, -absoluteDelta) && ttest?.significant) { verdict = 'improved'; totalImprovements++; } else { verdict = 'ok'; } } else { verdict = 'info'; } const basStr = `${bas.median}${unit} \xb1${bas.stddev}${unit}`; const curStr = `${cur.median}${unit} \xb1${cur.stddev}${unit}`; verdicts.push({ metric, verdict, change, pValue: pStr, basStr, curStr }); } } scenarioVerdicts.set(scenario, verdicts); } // -- Header with verdict up front ---------------------------------------- const hasRegressions = totalRegressions > 0; const verdictIcon = hasRegressions ? '\u274C' : '\u2705'; const verdictText = hasRegressions ? `${totalRegressions} regression(s) detected` : totalImprovements > 0 ? `No regressions \u2014 ${totalImprovements} improvement(s)` : 'No significant changes'; lines.push(`# ${verdictIcon} Chat Performance: ${verdictText}`); lines.push(''); lines.push(`| | |`); lines.push(`|---|---|`); lines.push(`| **Baseline** | ${baseLink} |`); lines.push(`| **Test** | ${testLink} |`); if (compareLink) { lines.push(`| **Diff** | ${compareLink} |`); } lines.push(`| **Runs per scenario** | ${opts.runs} |`); const overrides = Object.entries(opts.metricThresholds || {}).filter(([, v]) => { const parsed = parseMetricThreshold(v); return parsed.type !== 'fraction' || parsed.value !== opts.threshold; }); if (overrides.length > 0) { const overrideStr = overrides.map(([k, v]) => { const parsed = parseMetricThreshold(v); return `${k}: ${parsed.type === 'absolute' ? `${parsed.value}${k.includes('Ms') || k.includes('Time') || k.includes('time') ? 'ms' : ''}` : `${(parsed.value * 100).toFixed(0)}%`}`; }).join(', '); lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% (${overrideStr}) |`); } else { lines.push(`| **Regression threshold** | ${(opts.threshold * 100).toFixed(0)}% |`); } lines.push(`| **Scenarios** | ${scenarios.length} |`); lines.push(`| **Platform** | ${process.platform} / ${process.arch} |`); if (jsonReport.buildMode) { lines.push(`| **Build mode** | ${jsonReport.buildMode} |`); } lines.push(''); if (jsonReport.mismatchedBuildMode) { lines.push('> **⚠ Build mode mismatch:** The test and baseline builds use different build modes.'); lines.push('> Results may not be directly comparable. For apples-to-apples comparisons,'); lines.push('> use the same build type for both (e.g. `--production-build` with a local'); lines.push('> baseline path, or two version strings).'); lines.push(''); } // -- At-a-glance overview table: one row per scenario -------------------- lines.push(`## Overview`); lines.push(''); lines.push('| Scenario | Description | TTFT | Complete | Layouts | Styles | LoAF | Verdict |'); lines.push('|----------|-------------|-----:|---------:|--------:|-------:|-----:|:-------:|'); for (const scenario of scenarios) { const verdicts = scenarioVerdicts.get(scenario) || []; const get = (/** @type {string} */ m) => verdicts.find(v => v.metric === m); const ttft = get('timeToFirstToken'); const complete = get('timeToComplete'); const layouts = get('layoutCount'); const styles = get('recalcStyleCount'); const loaf = get('longAnimationFrameCount'); const fmtCell = (/** @type {{ change: number, verdict: string } | undefined} */ v) => { if (!v) { return '\u2014'; } const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(0)}%`; return pct; }; const fmtVerdict = (/** @type {{ verdict: string, change: number }[]} */ vs) => { const hasRegression = vs.some(v => v.verdict === 'REGRESSION'); const hasImproved = vs.some(v => v.verdict === 'improved'); if (hasRegression) { return '\u274C Regressed'; } if (hasImproved) { return '\u2B06\uFE0F Improved'; } return '\u2705 OK'; }; const keyVerdicts = [ttft, complete, layouts, styles, loaf].filter(Boolean); const rowVerdict = fmtVerdict(/** @type {any[]} */(keyVerdicts)); lines.push(`| ${scenario} | ${getScenarioDescription(scenario)} | ${fmtCell(ttft)} | ${fmtCell(complete)} | ${fmtCell(layouts)} | ${fmtCell(styles)} | ${fmtCell(loaf)} | ${rowVerdict} |`); } lines.push(''); // -- Regressions & improvements detail section --------------------------- const hasNotable = [...scenarioVerdicts.values()].some(vs => vs.some(v => v.verdict === 'REGRESSION' || v.verdict === 'improved')); if (hasNotable) { lines.push('## Regressions & Improvements'); lines.push(''); lines.push('Only metrics that regressed or improved significantly are shown below.'); lines.push(''); for (const scenario of scenarios) { const verdicts = scenarioVerdicts.get(scenario) || []; const notable = verdicts.filter(v => v.verdict === 'REGRESSION' || v.verdict === 'improved'); if (notable.length === 0) { continue; } const icon = notable.some(v => v.verdict === 'REGRESSION') ? '\u274C' : '\u2B06\uFE0F'; lines.push(`### ${icon} ${scenario}`); lines.push(''); lines.push('| Metric | Baseline | Test | Change | p-value | Verdict |'); lines.push('|--------|----------|------|--------|---------|---------|'); for (const v of notable) { const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`; const verdictIcon = v.verdict === 'REGRESSION' ? '\u274C' : '\u2B06\uFE0F'; lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictIcon} ${v.verdict} |`); } lines.push(''); } } // -- Full metric tables in collapsible section --------------------------- lines.push('
Full metric details per scenario'); lines.push(''); for (const scenario of scenarios) { const verdicts = scenarioVerdicts.get(scenario) || []; const base = baseline?.scenarios?.[scenario]; lines.push(`### ${scenario}`); lines.push(''); if (!base) { const current = jsonReport.scenarios[scenario]; lines.push('> No baseline data for this scenario.'); lines.push(''); lines.push('| Metric | Value | StdDev | CV | n |'); lines.push('|--------|------:|-------:|---:|--:|'); for (const [metric, group, unit] of allMetrics) { const cur = current[group]?.[metric]; if (!cur) { continue; } lines.push(`| ${metric} | ${cur.median}${unit} | \xb1${cur.stddev}${unit} | ${(cur.cv * 100).toFixed(0)}% | ${cur.n} |`); } lines.push(''); continue; } lines.push(`| Metric | Baseline | Test | Change | p-value | Verdict |`); lines.push(`|--------|----------|------|--------|---------|---------|`); for (const v of verdicts) { const pct = `${v.change > 0 ? '+' : ''}${(v.change * 100).toFixed(1)}%`; let verdictDisplay = v.verdict; if (v.verdict === 'REGRESSION') { verdictDisplay = '\u274C REGRESSION'; } else if (v.verdict === 'improved') { verdictDisplay = '\u2B06\uFE0F improved'; } else if (v.verdict === 'ok') { verdictDisplay = '\u2705 ok'; } else if (v.verdict === 'noise') { verdictDisplay = '\uD83C\uDF2B\uFE0F noise'; } else if (v.verdict === 'info') { verdictDisplay = '\u2139\uFE0F'; } lines.push(`| ${v.metric} | ${v.basStr} | ${v.curStr} | ${pct} | ${v.pValue} | ${verdictDisplay} |`); } lines.push(''); } lines.push('
'); lines.push(''); // -- Raw run data in collapsible section --------------------------------- lines.push('
Raw run data'); lines.push(''); for (const scenario of scenarios) { const current = jsonReport.scenarios[scenario]; lines.push(`### ${scenario}`); lines.push(''); lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |'); lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|'); const runs = current.rawRuns || []; for (let i = 0; i < runs.length; i++) { const r = runs[i]; const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100; lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`); } lines.push(''); } if (baseline) { for (const scenario of scenarios) { const base = baseline.scenarios?.[scenario]; if (!base) { continue; } lines.push(`### ${scenario} (baseline)`); lines.push(''); lines.push('| Run | TTFT (ms) | Complete (ms) | Layouts | Style Recalcs | LoAF Count | LoAF (ms) | Frames | Heap Delta (MB) | Internal Marks |'); lines.push('|----:|----------:|--------------:|--------:|--------------:|-----------:|----------:|-------:|----------------:|:--------------:|'); const runs = base.rawRuns || []; for (let i = 0; i < runs.length; i++) { const r = runs[i]; const round2 = (/** @type {number} */ v) => Math.round(v * 100) / 100; lines.push(`| ${i + 1} | ${round2(r.timeToFirstToken)} | ${r.timeToComplete} | ${r.layoutCount} | ${r.recalcStyleCount} | ${r.longAnimationFrameCount ?? '-'} | ${r.longAnimationFrameTotalMs !== null && r.longAnimationFrameTotalMs !== undefined ? round2(r.longAnimationFrameTotalMs) : '-'} | ${r.frameCount ?? '-'} | ${r.heapDelta} | ${r.hasInternalMarks ? 'yes' : 'no'} |`); } lines.push(''); } } lines.push('
'); lines.push(''); return lines.join('\n'); } // -- Cleanup on SIGINT/SIGTERM ----------------------------------------------- /** @type {{ close: () => Promise } | null} */ let activeVSCode = null; /** @type {{ close: () => Promise } | null} */ let activeMockServer = null; function installSignalHandlers() { const cleanup = async () => { console.log('\n[chat-simulation] Caught interrupt, cleaning up...'); try { await activeVSCode?.close(); } catch { } try { await activeMockServer?.close(); } catch { } process.exit(130); }; process.on('SIGINT', cleanup); process.on('SIGTERM', cleanup); } // -- Diagnostic cleanup ------------------------------------------------------ /** * Remove large diagnostic files (heap snapshots, CPU profiles, traces) from * a run's metrics to free disk space. Keeps the JSON results data intact. * @param {RunMetrics} metrics */ function cleanupRunDiagnostics(metrics) { const filesToDelete = [ metrics.profilePath, metrics.tracePath, metrics.snapshotPath, metrics.extHostProfilePath, metrics.extHostSnapshotPath, ]; for (const filePath of filesToDelete) { if (filePath && fs.existsSync(filePath)) { try { fs.rmSync(filePath, { force: true }); } catch { // Ignore cleanup errors } } } } /** * Clean up diagnostics for all scenarios that did NOT regress. * Keeps diagnostics for regressed scenarios so they can be investigated. * @param {Record} allResults - test results by scenario * @param {Set} regressedScenarios - scenarios that regressed */ function cleanupNonRegressedDiagnostics(allResults, regressedScenarios) { for (const [scenario, runs] of Object.entries(allResults)) { if (regressedScenarios.has(scenario)) { continue; } for (const metrics of runs) { cleanupRunDiagnostics(metrics); } } } // -- Main -------------------------------------------------------------------- async function main() { registerPerfScenarios(); const opts = parseArgs(); installSignalHandlers(); const { startServer } = require('./common/mock-llm-server'); const mockServer = await startServer(0); activeMockServer = mockServer; console.log(`[chat-simulation] Mock LLM server: ${mockServer.url}`); // -- Resume mode -------------------------------------------------------- if (opts.resume) { if (!fs.existsSync(opts.resume)) { console.error(`[chat-simulation] Resume file not found: ${opts.resume}`); process.exit(1); } const prevResults = JSON.parse(fs.readFileSync(opts.resume, 'utf-8')); const prevDir = path.dirname(opts.resume); // Find the associated baseline JSON in the same directory const baselineFiles = fs.readdirSync(prevDir).filter((/** @type {string} */ f) => f.startsWith('baseline-') && f.endsWith('.json')); const baselineFile = baselineFiles.length > 0 ? path.join(prevDir, baselineFiles[0]) : null; const prevBaseline = baselineFile ? JSON.parse(fs.readFileSync(baselineFile, 'utf-8')) : null; // Determine which scenarios to resume (default: all from previous run) const resumeScenarios = opts.scenarios.length > 0 ? opts.scenarios.filter(s => prevResults.scenarios?.[s]) : Object.keys(prevResults.scenarios || {}); if (resumeScenarios.length === 0) { console.error('[chat-simulation] No matching scenarios found in previous results'); process.exit(1); } const testElectron = await resolveBuild(opts.build); const baselineVersion = prevBaseline?.baselineBuildVersion; const baselineElectron = baselineVersion ? await resolveBuild(baselineVersion) : null; const runsToAdd = opts.runs; console.log(`[chat-simulation] Resuming from: ${opts.resume}`); console.log(`[chat-simulation] Adding ${runsToAdd} runs per scenario`); console.log(`[chat-simulation] Scenarios: ${resumeScenarios.join(', ')}`); if (prevBaseline) { console.log(`[chat-simulation] Baseline: ${baselineVersion} (${prevBaseline.scenarios?.[resumeScenarios[0]]?.rawRuns?.length || 0} existing runs)`); } console.log(''); for (const scenario of resumeScenarios) { console.log(`[chat-simulation] === Resuming: ${scenario} ===`); const prevTestRuns = prevResults.scenarios[scenario]?.rawRuns || []; const prevBaseRuns = prevBaseline?.scenarios?.[scenario]?.rawRuns || []; // Run additional test iterations console.log(`[chat-simulation] Test build (${prevTestRuns.length} existing + ${runsToAdd} new)`); for (let i = 0; i < runsToAdd; i++) { const runIdx = `${scenario}-resume-${prevTestRuns.length + i}`; console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`); try { const m = await runOnce(testElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'test', { ...opts.settingsOverrides, ...opts.testSettingsOverrides }, { heapSnapshots: opts.heapSnapshots }); // Clean up previous run's diagnostics to bound disk usage; keep the latest if (opts.cleanupDiagnostics && prevTestRuns.length > 0) { cleanupRunDiagnostics(prevTestRuns[prevTestRuns.length - 1]); } prevTestRuns.push(m); if (opts.verbose) { const src = m.hasInternalMarks ? 'internal' : 'client-side'; console.log(` [${src}] firstToken=${m.timeToFirstToken}ms, complete=${m.timeToComplete}ms`); } } catch (err) { console.error(` Run ${i + 1} failed: ${err}`); } } // Run additional baseline iterations if (baselineElectron && prevBaseline?.scenarios?.[scenario]) { console.log(`[chat-simulation] Baseline build (${prevBaseRuns.length} existing + ${runsToAdd} new)`); for (let i = 0; i < runsToAdd; i++) { const runIdx = `baseline-${scenario}-resume-${prevBaseRuns.length + i}`; console.log(`[chat-simulation] Run ${i + 1}/${runsToAdd}...`); try { const m = await runOnce(baselineElectron, scenario, mockServer, opts.verbose, runIdx, prevDir, 'baseline', { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides }, { heapSnapshots: opts.heapSnapshots }); // Clean up previous run's diagnostics to bound disk usage; keep the latest if (opts.cleanupDiagnostics && prevBaseRuns.length > 0) { cleanupRunDiagnostics(prevBaseRuns[prevBaseRuns.length - 1]); } prevBaseRuns.push(m); } catch (err) { console.error(` Run ${i + 1} failed: ${err}`); } } } // Recompute stats with merged data const sd = /** @type {any} */ ({ runs: prevTestRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevTestRuns }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(prevTestRuns.map((/** @type {any} */ r) => r[metric])); } prevResults.scenarios[scenario] = sd; if (prevBaseline?.scenarios?.[scenario]) { const bsd = /** @type {any} */ ({ runs: prevBaseRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: prevBaseRuns }); for (const [metric, group] of METRIC_DEFS) { bsd[group][metric] = robustStats(prevBaseRuns.map((/** @type {any} */ r) => r[metric])); } prevBaseline.scenarios[scenario] = bsd; } console.log(`[chat-simulation] Merged: test n=${prevTestRuns.length}${prevBaseRuns.length > 0 ? `, baseline n=${prevBaseRuns.length}` : ''}`); console.log(''); } // Write updated files back prevResults.runsPerScenario = Math.max(prevResults.runsPerScenario || 0, ...Object.values(prevResults.scenarios).map((/** @type {any} */ s) => s.runs)); prevResults.lastResumed = new Date().toISOString(); fs.writeFileSync(opts.resume, JSON.stringify(prevResults, null, 2)); console.log(`[chat-simulation] Updated results: ${opts.resume}`); if (prevBaseline && baselineFile) { prevBaseline.lastResumed = new Date().toISOString(); fs.writeFileSync(baselineFile, JSON.stringify(prevBaseline, null, 2)); // Also update cached baseline const cachedPath = path.join(DATA_DIR, path.basename(baselineFile)); fs.writeFileSync(cachedPath, JSON.stringify(prevBaseline, null, 2)); console.log(`[chat-simulation] Updated baseline: ${baselineFile}`); } // -- Re-run comparison with merged data -------------------------------- opts.baseline = baselineFile || undefined; const jsonReport = prevResults; jsonReport._resultsPath = opts.resume; // Fall through to comparison logic below await printComparison(jsonReport, opts); await mockServer.close(); return; } // -- Normal (non-resume) flow ------------------------------------------- // --production-build: build a local bundled (non-dev) package from the // current source tree using `gulp vscode`. This produces the same // packaging as a release build (bundled JS, no VSCODE_DEV) while still // testing your local changes. if (opts.productionBuild && !opts.build) { const prodBuildPath = buildProductionBuild(); opts.build = prodBuildPath; console.log(`[chat-simulation] --production-build: using local production build at ${prodBuildPath}`); } const electronPath = await resolveBuild(opts.build); if (!fs.existsSync(electronPath)) { console.error(`Electron not found at: ${electronPath}`); console.error('Run "node build/lib/preLaunch.ts" first, or pass --build '); process.exit(1); } // Detect build modes for both test and baseline builds const testBuildMode = detectBuildMode(electronPath); // Resolve the baseline build path early so we can detect its mode. // For version strings this downloads; for local paths it resolves directly. const isBaselineVersionString = opts.baselineBuild && isVersionString(opts.baselineBuild); const isBaselineLocalPath = opts.baselineBuild && !isBaselineVersionString; /** @type {string | undefined} */ let baselineElectronPath; if (isBaselineLocalPath) { baselineElectronPath = await resolveBuild(opts.baselineBuild); if (!fs.existsSync(baselineElectronPath)) { console.error(`Baseline build not found at: ${baselineElectronPath}`); process.exit(1); } } const baselineBuildMode = opts.baselineBuild ? (isBaselineVersionString ? 'release' : detectBuildMode(baselineElectronPath || '')) : undefined; const isMismatchedBuildMode = baselineBuildMode !== undefined && testBuildMode !== baselineBuildMode; // Create a timestamped run directory for all output const runTimestamp = new Date().toISOString().replace(/[:.]/g, '-').slice(0, 19); const runDir = path.join(DATA_DIR, runTimestamp); fs.mkdirSync(runDir, { recursive: true }); console.log(`[chat-simulation] Output: ${runDir}`); // Compute effective settings per role const testSettings = { ...opts.settingsOverrides, ...opts.testSettingsOverrides }; const baselineSettings = { ...opts.settingsOverrides, ...opts.baselineSettingsOverrides }; // -- Baseline build -------------------------------------------------- if (opts.baselineBuild) { // Use a sanitized label for file names — replace path separators for local paths const baselineLabel = isBaselineLocalPath ? path.basename(path.resolve(opts.baselineBuild)) : opts.baselineBuild; const baselineJsonPath = path.join(runDir, `baseline-${baselineLabel}.json`); // Local paths: always run fresh (no caching — the build may have changed) // Version strings: use caching as before const cachedPath = isBaselineLocalPath ? null : path.join(DATA_DIR, `baseline-${baselineLabel}.json`); const cachedBaseline = cachedPath && !opts.noCache && fs.existsSync(cachedPath) ? JSON.parse(fs.readFileSync(cachedPath, 'utf-8')) : null; if (cachedBaseline?.baselineBuildVersion === opts.baselineBuild) { // Check if the cache covers all requested scenarios const cachedScenarios = new Set(Object.keys(cachedBaseline.scenarios || {})); const missingScenarios = opts.scenarios.filter((/** @type {string} */ s) => !cachedScenarios.has(s)); // Also check if cached scenarios have fewer runs than requested const shortScenarios = opts.scenarios.filter((/** @type {string} */ s) => { const cached = cachedBaseline.scenarios?.[s]; return cached && (cached.rawRuns?.length || 0) < opts.runs; }); if (missingScenarios.length === 0 && shortScenarios.length === 0) { console.log(`[chat-simulation] Using cached baseline for ${opts.baselineBuild}`); fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2)); opts.baseline = baselineJsonPath; } else { const scenariosToRun = [...new Set([...missingScenarios, ...shortScenarios])]; if (missingScenarios.length > 0) { console.log(`[chat-simulation] Cached baseline missing scenarios: ${missingScenarios.join(', ')}`); } if (shortScenarios.length > 0) { console.log(`[chat-simulation] Cached baseline needs more runs for: ${shortScenarios.map((/** @type {string} */ s) => `${s} (${cachedBaseline.scenarios[s].rawRuns?.length || 0}/${opts.runs})`).join(', ')}`); } console.log(`[chat-simulation] Running baseline for ${scenariosToRun.length} scenario(s)...`); const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild); for (const scenario of scenariosToRun) { const existingRuns = cachedBaseline.scenarios?.[scenario]?.rawRuns || []; const runsNeeded = opts.runs - existingRuns.length; /** @type {RunMetrics[]} */ const newResults = []; for (let i = 0; i < runsNeeded; i++) { try { const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${existingRuns.length + i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots }); // Clean up previous run's diagnostics to bound disk usage; keep the latest if (opts.cleanupDiagnostics && newResults.length > 0) { cleanupRunDiagnostics(newResults[newResults.length - 1]); } newResults.push(m); } catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); } } const allRuns = [...existingRuns, ...newResults]; if (allRuns.length > 0) { const sd = /** @type {any} */ ({ runs: allRuns.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: allRuns }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(allRuns.map((/** @type {any} */ r) => r[metric])); } cachedBaseline.scenarios[scenario] = sd; } } cachedBaseline.runsPerScenario = opts.runs; fs.writeFileSync(baselineJsonPath, JSON.stringify(cachedBaseline, null, 2)); if (cachedPath) { fs.writeFileSync(cachedPath, JSON.stringify(cachedBaseline, null, 2)); } opts.baseline = baselineJsonPath; } } else { const baselineExePath = baselineElectronPath || await resolveBuild(opts.baselineBuild); console.log(`[chat-simulation] Benchmarking baseline build (${baselineLabel})...`); /** @type {Record} */ const baselineResults = {}; for (const scenario of opts.scenarios) { /** @type {RunMetrics[]} */ const results = []; for (let i = 0; i < opts.runs; i++) { try { const m = await runOnce(baselineExePath, scenario, mockServer, opts.verbose, `baseline-${scenario}-${i}`, runDir, 'baseline', baselineSettings, { heapSnapshots: opts.heapSnapshots }); // Clean up previous run's diagnostics to bound disk usage; keep the latest if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); } results.push(m); } catch (err) { console.error(`[chat-simulation] Baseline run ${i + 1} failed: ${err}`); } } if (results.length > 0) { baselineResults[scenario] = results; } } const baselineReport = { timestamp: new Date().toISOString(), baselineBuildVersion: opts.baselineBuild, platform: process.platform, runsPerScenario: opts.runs, scenarios: /** @type {Record} */ ({}), }; for (const [scenario, results] of Object.entries(baselineResults)) { const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } baselineReport.scenarios[scenario] = sd; } fs.writeFileSync(baselineJsonPath, JSON.stringify(baselineReport, null, 2)); // Cache at the top level for reuse across runs (version strings only) if (cachedPath) { fs.writeFileSync(cachedPath, JSON.stringify(baselineReport, null, 2)); } opts.baseline = baselineJsonPath; } console.log(''); } // -- Run benchmarks -------------------------------------------------- console.log(`[chat-simulation] Electron: ${electronPath}`); console.log(`[chat-simulation] Build mode: ${buildModeLabel(testBuildMode)}`); if (baselineBuildMode) { console.log(`[chat-simulation] Baseline mode: ${buildModeLabel(baselineBuildMode)}`); } console.log(`[chat-simulation] Runs per scenario: ${opts.runs}`); console.log(`[chat-simulation] Scenarios: ${opts.scenarios.join(', ')}`); if (Object.keys(opts.settingsOverrides).length > 0) { console.log(`[chat-simulation] Settings overrides (all): ${JSON.stringify(opts.settingsOverrides)}`); } if (Object.keys(opts.testSettingsOverrides).length > 0) { console.log(`[chat-simulation] Settings overrides (test): ${JSON.stringify(opts.testSettingsOverrides)}`); } if (Object.keys(opts.baselineSettingsOverrides).length > 0) { console.log(`[chat-simulation] Settings overrides (baseline): ${JSON.stringify(opts.baselineSettingsOverrides)}`); } if (isMismatchedBuildMode) { console.log(''); console.log(`[chat-simulation] ⚠ WARNING: Build mode mismatch — test is ${testBuildMode}, baseline is ${baselineBuildMode}.`); console.log('[chat-simulation] Results may not be directly comparable. For apples-to-apples'); console.log('[chat-simulation] comparisons, use the same build type for both.'); if (testBuildMode === 'dev') { console.log('[chat-simulation] To use a local production build instead:'); console.log('[chat-simulation] npm run perf:chat -- --production-build'); } if (!opts.ci && !opts.force) { const readline = require('readline'); const rl = readline.createInterface({ input: process.stdin, output: process.stdout }); const answer = await new Promise(resolve => rl.question('[chat-simulation] Continue anyway? [y/N] ', resolve)); rl.close(); if (String(answer).toLowerCase() !== 'y') { console.log('[chat-simulation] Aborted.'); await mockServer.close(); process.exit(0); } } } console.log(''); /** @type {Record} */ const allResults = {}; let anyFailed = false; for (const scenario of opts.scenarios) { console.log(`[chat-simulation] === Scenario: ${scenario} ===`); /** @type {RunMetrics[]} */ const results = []; for (let i = 0; i < opts.runs; i++) { console.log(`[chat-simulation] Run ${i + 1}/${opts.runs}...`); try { const metrics = await runOnce(electronPath, scenario, mockServer, opts.verbose, `${scenario}-${i}`, runDir, 'test', testSettings, { heapSnapshots: opts.heapSnapshots }); // Clean up previous run's diagnostics to bound disk usage; keep the latest if (opts.cleanupDiagnostics && results.length > 0) { cleanupRunDiagnostics(results[results.length - 1]); } results.push(metrics); if (opts.verbose) { const src = metrics.hasInternalMarks ? 'internal' : 'client-side'; console.log(` [${src}] firstToken=${metrics.timeToFirstToken}ms, complete=${metrics.timeToComplete}ms, heap=delta${metrics.heapDelta}MB, longTasks=${metrics.longTaskCount}${metrics.hasInternalMarks ? `, internalTTFT=${metrics.internalFirstToken}ms` : ''}`); } } catch (err) { console.error(` Run ${i + 1} failed: ${err}`); } } if (results.length === 0) { console.error(`[chat-simulation] All runs failed for scenario: ${scenario}`); anyFailed = true; } else { allResults[scenario] = results; } console.log(''); } // -- Summary --------------------------------------------------------- console.log('[chat-simulation] ======================= Summary ======================='); for (const [scenario, results] of Object.entries(allResults)) { console.log(''); console.log(` -- ${scenario} (${results.length} runs) --`); console.log(''); console.log(' Timing:'); console.log(summarize(results.map(r => r.timeToFirstToken), ' Request → First token ', 'ms')); console.log(summarize(results.map(r => r.timeToComplete), ' Request → Complete ', 'ms')); console.log(summarize(results.map(r => r.timeToRenderComplete), ' Request → Rendered ', 'ms')); console.log(''); console.log(' Rendering:'); console.log(summarize(results.map(r => r.layoutCount), ' Layouts ', '')); console.log(summarize(results.map(r => r.layoutDurationMs), ' Layout duration ', 'ms')); console.log(summarize(results.map(r => r.recalcStyleCount), ' Style recalcs ', '')); console.log(summarize(results.map(r => r.forcedReflowCount), ' Forced reflows ', '')); console.log(summarize(results.map(r => r.longTaskCount), ' Long tasks (>50ms) ', '')); console.log(summarize(results.map(r => r.longAnimationFrameCount), ' Long anim. frames ', '')); console.log(summarize(results.map(r => r.longAnimationFrameTotalMs), ' LoAF total duration ', 'ms')); console.log(summarize(results.map(r => r.frameCount), ' Frames ', '')); console.log(summarize(results.map(r => r.compositeLayers), ' Composite layers ', '')); console.log(summarize(results.map(r => r.paintCount), ' Paints ', '')); console.log(''); console.log(' Memory:'); console.log(summarize(results.map(r => r.heapDelta), ' Heap delta ', 'MB')); console.log(summarize(results.map(r => r.heapDeltaPostGC), ' Heap delta (post-GC) ', 'MB')); console.log(summarize(results.map(r => r.gcDurationMs), ' GC duration ', 'ms')); if (results.some(r => r.extHostHeapDelta >= 0)) { console.log(''); console.log(' Extension Host:'); console.log(summarize(results.map(r => r.extHostHeapUsedBefore), ' Heap before ', 'MB')); console.log(summarize(results.map(r => r.extHostHeapUsedAfter), ' Heap after ', 'MB')); console.log(summarize(results.map(r => r.extHostHeapDelta), ' Heap delta ', 'MB')); console.log(summarize(results.map(r => r.extHostHeapDeltaPostGC), ' Heap delta (post-GC) ', 'MB')); } } // -- JSON output ----------------------------------------------------- const jsonPath = path.join(runDir, 'results.json'); const jsonReport = /** @type {{ timestamp: string, platform: NodeJS.Platform, runsPerScenario: number, buildMode: string, mismatchedBuildMode: boolean, scenarios: Record, _resultsPath?: string }} */ ({ timestamp: new Date().toISOString(), platform: process.platform, runsPerScenario: opts.runs, buildMode: testBuildMode, mismatchedBuildMode: !!isMismatchedBuildMode, scenarios: /** @type {Record} */ ({}), }); for (const [scenario, results] of Object.entries(allResults)) { const sd = /** @type {any} */ ({ runs: results.length, timing: {}, memory: {}, rendering: {}, extHost: {}, rawRuns: results }); for (const [metric, group] of METRIC_DEFS) { sd[group][metric] = robustStats(results.map(r => /** @type {any} */(r)[metric])); } jsonReport.scenarios[scenario] = sd; } fs.writeFileSync(jsonPath, JSON.stringify(jsonReport, null, 2)); jsonReport._resultsPath = jsonPath; console.log(''); console.log(`[chat-simulation] Results written to ${jsonPath}`); // -- Save baseline --------------------------------------------------- if (opts.saveBaseline) { if (!opts.baseline) { console.error('[chat-simulation] --save-baseline requires --baseline '); process.exit(1); } fs.writeFileSync(opts.baseline, JSON.stringify(jsonReport, null, 2)); console.log(`[chat-simulation] Baseline saved to ${opts.baseline}`); } // -- Baseline comparison --------------------------------------------- const regressedScenarios = await printComparison(jsonReport, opts); // Clean up diagnostics for scenarios that did not regress if (opts.cleanupDiagnostics) { cleanupNonRegressedDiagnostics(allResults, regressedScenarios); } if (anyFailed) { process.exit(1); } await mockServer.close(); } /** * Print baseline comparison and exit with code 1 if regressions found. * Returns the set of scenario IDs that regressed. * @param {Record} jsonReport * @param {{ threshold: number, metricThresholds?: Record, baseline?: string, ci?: boolean, resume?: string, build?: string, baselineBuild?: string, runs: number, cleanupDiagnostics?: boolean }} opts * @returns {Promise>} */ async function printComparison(jsonReport, opts) { let regressionFound = false; let inconclusiveFound = false; /** @type {Set} */ const regressedScenarios = new Set(); if (opts.baseline && fs.existsSync(opts.baseline)) { const baseline = JSON.parse(fs.readFileSync(opts.baseline, 'utf-8')); console.log(''); console.log(`[chat-simulation] =========== Baseline Comparison (threshold: ${(opts.threshold * 100).toFixed(0)}%) ===========`); console.log(`[chat-simulation] Baseline: ${baseline.baselineBuildVersion || baseline.timestamp}`); if (jsonReport.mismatchedBuildMode) { console.log(`[chat-simulation] ⚠ Note: build mode mismatch — test is ${jsonReport.buildMode}, baseline differs.`); console.log('[chat-simulation] Results may not be directly comparable.'); } console.log(''); // Metrics that trigger regression failure when they exceed the threshold const regressionMetrics = [ // [metric, group, unit] ['timeToFirstToken', 'timing', 'ms'], ['timeToComplete', 'timing', 'ms'], ['layoutCount', 'rendering', ''], ['recalcStyleCount', 'rendering', ''], ['forcedReflowCount', 'rendering', ''], ['longTaskCount', 'rendering', ''], ]; // Informational metrics — shown in comparison but don't trigger failure const infoMetrics = [ ['heapDelta', 'memory', 'MB'], ['gcDurationMs', 'memory', 'ms'], ['extHostHeapDelta', 'extHost', 'MB'], ['extHostHeapDeltaPostGC', 'extHost', 'MB'], ]; for (const scenario of Object.keys(jsonReport.scenarios)) { const current = jsonReport.scenarios[scenario]; const base = baseline.scenarios?.[scenario]; if (!base) { console.log(` ${scenario}: (no baseline)`); continue; } /** @type {string[]} */ const diffs = []; let scenarioRegression = false; for (const [metric, group, unit] of regressionMetrics) { const cur = current[group]?.[metric]; const bas = base[group]?.[metric]; if (!cur || !bas || !bas.median) { continue; } const change = (cur.median - bas.median) / bas.median; const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`; // Statistical significance via Welch's t-test on raw run values const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); const ttest = welchTTest(basRaw, curRaw); const metricThreshold = getMetricThreshold(opts, metric); const absoluteDelta = cur.median - bas.median; let flag = ''; if (exceedsThreshold(metricThreshold, change, absoluteDelta)) { if (!ttest) { flag = ' ← possible regression (n too small for significance test)'; inconclusiveFound = true; } else if (ttest.significant) { flag = ` ← REGRESSION (p=${ttest.pValue}, ${ttest.confidence} confidence)`; scenarioRegression = true; regressionFound = true; } else { flag = ` (likely noise — p=${ttest.pValue}, not significant)`; inconclusiveFound = true; } } else if (ttest && change > 0 && ttest.significant && ttest.confidence === 'high') { flag = ` (significant increase, p=${ttest.pValue})`; } diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct})${flag}`); } for (const [metric, group, unit] of infoMetrics) { const cur = current[group]?.[metric]; const bas = base[group]?.[metric]; if (!cur || !bas || bas.median === null || bas.median === undefined) { continue; } const change = bas.median !== 0 ? (cur.median - bas.median) / bas.median : 0; const pct = `${change > 0 ? '+' : ''}${(change * 100).toFixed(1)}%`; diffs.push(` ${metric}: ${bas.median}${unit} → ${cur.median}${unit} (${pct}) [info]`); } console.log(` ${scenario}: ${scenarioRegression ? 'FAIL' : 'OK'}`); if (scenarioRegression) { regressedScenarios.add(scenario); } diffs.forEach(d => console.log(d)); } console.log(''); console.log(regressionFound ? `[chat-simulation] REGRESSION DETECTED — exceeded ${(opts.threshold * 100).toFixed(0)}% threshold with statistical significance` : `[chat-simulation] All metrics within ${(opts.threshold * 100).toFixed(0)}% of baseline (or not statistically significant)`); if (inconclusiveFound && !regressionFound) { // Find the results.json path to suggest in the hint const resultsPath = Object.keys(jsonReport.scenarios).length > 0 ? (jsonReport._resultsPath || opts.resume || 'path/to/results.json') : 'path/to/results.json'; // Estimate required runs from the observed effect size and variance // using power analysis for Welch's t-test (alpha=0.05, 80% power). // n_per_group = 2 * ((z_alpha/2 + z_beta) / d)^2 where d = Cohen's d let maxNeeded = 0; for (const scenario of Object.keys(jsonReport.scenarios)) { const current = jsonReport.scenarios[scenario]; const base = baseline.scenarios?.[scenario]; if (!base) { continue; } for (const [metric, group] of [['timeToFirstToken', 'timing'], ['timeToComplete', 'timing'], ['layoutCount', 'rendering'], ['recalcStyleCount', 'rendering']]) { const curRaw = (current.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); const basRaw = (base.rawRuns || []).map((/** @type {any} */ r) => r[metric]).filter((/** @type {any} */ v) => v >= 0); if (curRaw.length < 2 || basRaw.length < 2) { continue; } const meanA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / basRaw.length; const meanB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + v, 0) / curRaw.length; const varA = basRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanA) ** 2, 0) / (basRaw.length - 1); const varB = curRaw.reduce((/** @type {number} */ s, /** @type {number} */ v) => s + (v - meanB) ** 2, 0) / (curRaw.length - 1); const pooledSD = Math.sqrt((varA + varB) / 2); if (pooledSD === 0) { continue; } const d = Math.abs(meanB - meanA) / pooledSD; if (d === 0) { continue; } // z_0.025 = 1.96, z_0.2 = 0.842 const nPerGroup = Math.ceil(2 * ((1.96 + 0.842) / d) ** 2); const currentN = Math.min(curRaw.length, basRaw.length); maxNeeded = Math.max(maxNeeded, nPerGroup - currentN); } } const suggestedRuns = Math.max(1, Math.min(maxNeeded, 20)); console.log(''); console.log('[chat-simulation] Some metrics exceeded the threshold but were not statistically significant.'); console.log('[chat-simulation] To increase confidence, add more runs with --resume:'); console.log(`[chat-simulation] npm run perf:chat -- --resume ${resultsPath} --runs ${suggestedRuns}`); } } // -- CI summary ------------------------------------------------------ if (opts.ci) { const ciBaseline = opts.baseline && fs.existsSync(opts.baseline) ? JSON.parse(fs.readFileSync(opts.baseline, 'utf-8')) : null; const summary = generateCISummary(jsonReport, ciBaseline, { threshold: opts.threshold, metricThresholds: opts.metricThresholds, runs: jsonReport.runsPerScenario || opts.runs, baselineBuild: ciBaseline?.baselineBuildVersion || opts.baselineBuild, build: opts.build, }); // Write to file for GitHub Actions $GITHUB_STEP_SUMMARY const summaryPath = path.join(DATA_DIR, 'ci-summary.md'); fs.writeFileSync(summaryPath, summary); console.log(`[chat-simulation] CI summary written to ${summaryPath}`); // Also print the full summary table to stdout console.log(''); console.log('=================================================================='); console.log(' CHAT PERF COMPARISON RESULTS '); console.log('=================================================================='); console.log(''); console.log(summary); } if (regressionFound) { process.exit(1); } return regressedScenarios; } main().catch(err => { console.error(err); process.exit(1); });