Files
Giuseppe Cianci dd5c74f230 Simulation workbench: show error states and improve rerun UX (#4319)
* fix simulate task for windows

* fix ui for errors

* update

* address copilot review
2026-03-10 17:22:16 +00:00

569 lines
21 KiB
TypeScript
Raw Permalink Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/*---------------------------------------------------------------------------------------------
* Copyright (c) Microsoft Corporation. All rights reserved.
* Licensed under the MIT License. See License.txt in the project root for license information.
*--------------------------------------------------------------------------------------------*/
import * as path from 'path';
import { IPromptWorkspaceLabels, PromptWorkspaceLabels } from '../src/extension/context/node/resolvers/promptWorkspaceLabels';
import { INewWorkspacePreviewContentManager, NewWorkspacePreviewContentManagerImpl } from '../src/extension/intents/node/newIntent';
import { IntentError } from '../src/extension/prompt/node/intents';
import { ISimulationModelConfig } from '../src/extension/test/node/services';
import { IToolsService } from '../src/extension/tools/common/toolsService';
import { TestToolsService } from '../src/extension/tools/node/test/testToolsService';
import { IEndpointProvider } from '../src/platform/endpoint/common/endpointProvider';
import { TestEndpointProvider } from '../src/platform/endpoint/test/node/testEndpointProvider';
import { ConsoleLog, ILogService, LogServiceImpl } from '../src/platform/log/common/logService';
import { APIUsage } from '../src/platform/networking/common/openai';
import { ISimulationTestContext } from '../src/platform/simulationTestContext/common/simulationTestContext';
import { ITasksService } from '../src/platform/tasks/common/tasksService';
import { TestTasksService } from '../src/platform/tasks/common/testTasksService';
import { TestingServiceCollection } from '../src/platform/test/node/services';
import { ITokenizerProvider } from '../src/platform/tokenizer/node/tokenizer';
import { count } from '../src/util/common/arrays';
import { WellKnownLanguageId } from '../src/util/common/languages';
import { groupBy } from '../src/util/vs/base/common/collections';
import { BugIndicatingError } from '../src/util/vs/base/common/errors';
import { Lazy } from '../src/util/vs/base/common/lazy';
import { safeStringify } from '../src/util/vs/base/common/objects';
import { SyncDescriptor } from '../src/util/vs/platform/instantiation/common/descriptors';
import { SimulationExtHostToolsService } from './base/extHostContext/simulationExtHostToolsService';
import { SimulationBaseline, TestBaselineComparison } from './base/simulationBaseline';
import { CacheMode, createSimulationAccessor, CurrentTestRunInfo, SimulationServicesOptions } from './base/simulationContext';
import { ISimulationEndpointHealth } from './base/simulationEndpointHealth';
import { SimulationOptions } from './base/simulationOptions';
import { ISimulationOutcome } from './base/simulationOutcome';
import { FetchRequestCollector } from './base/spyingChatMLFetcher';
import { ISimulationTestRuntime, SimulationTest, SimulationTestRuntime, toDirname } from './base/stest';
import { IJSONOutputPrinter } from './jsonOutputPrinter';
import { green, red, violet, yellow } from './outputColorer';
import { ExternalSimulationTestRuntime } from './simulation/externalScenarios';
import * as shared from './simulation/shared/sharedTypes';
import { ITestSnapshots, TestSnapshotsImpl } from './simulation/testSnapshot';
import { TaskRunner } from './taskRunner';
import { TestExecutionInExtension } from './testExecutionInExtension';
import { createScoreRenderer, printTime } from './util';
/**
 * Represents outcome of N runs of a scenario.
 */
export interface ITestResult {
	/** Fully qualified test name. */
	test: string;
	/** Directory with per-run artifacts, relative to the simulation output path. */
	outcomeDirectory: string;
	/** Optional path to a conversation file associated with the test. */
	conversationPath?: string;
	/** Score for the test: sum of per-run scores (pass = explicit score or 1, fail = 0) divided by run count. */
	score: number;
	/** Token usage summed over all runs. */
	usage: APIUsage;
	// FIXME@ulugbekna: specify when the outcome is undefined
	outcomes: (shared.SimulationTestOutcome | undefined)[];
	/** Total duration in milliseconds, summed across all runs. */
	duration: number;
	/** Per-run cache info (one entry per run). */
	cacheInfo: TestRunCacheInfo[];
	/** The raw per-run results that back the aggregates above. */
	originalResults: ITestRunResult[];
}
interface ITestRunResultCommon {
	/** Number of model responses in this run affected by the content filter. */
	contentFilterCount: number;
	/** Token usage for this single run. */
	usage: APIUsage;
	/** Cache entries touched by this run. */
	cacheInfo: TestRunCacheInfo;
	/** Whether any request in this run missed the cache. */
	hasCacheMiss: boolean;
}
interface ITestRunResultPass extends ITestRunResultCommon {
	kind: 'pass';
	/** Score explicitly reported by the test runtime; `undefined` means a default score of 1 is used. */
	explicitScore: number | undefined;
	/** Run duration in milliseconds. */
	duration: number;
	outcome: shared.SimulationTestOutcome | undefined;
}
interface ITestRunResultFail extends ITestRunResultCommon {
	kind: 'fail';
	/** Failure message (usually an error stack). */
	message: string;
	/** Run duration in milliseconds. */
	duration: number;
	outcome: shared.SimulationTestOutcome;
}
/**
 * Represents outcome of a single run of a scenario.
 */
export type ITestRunResult = ITestRunResultPass | ITestRunResultFail;
export type CacheInfo = { type: 'request'; key: string }; // TODO: add other caches here
export type TestRunCacheInfo = CacheInfo[];
/** Everything a test execution needs: options, baseline, output sinks and shared services. */
export interface SimulationTestContext {
	opts: SimulationOptions;
	/** Baseline to compare current scores against. */
	baseline: SimulationBaseline;
	canUseBaseline: boolean;
	jsonOutputPrinter: IJSONOutputPrinter;
	/** Root directory where run artifacts are written. */
	outputPath: string;
	externalScenariosPath?: string;
	modelConfig: ISimulationModelConfig;
	simulationEndpointHealth: ISimulationEndpointHealth;
	simulationServicesOptions: SimulationServicesOptions;
	simulationOutcome: ISimulationOutcome;
	tokenizerProvider: ITokenizerProvider;
}
/** suite name -> language -> model -> scores of individual test runs. */
export type GroupedScores = Map<string, Map<WellKnownLanguageId | undefined, Map<string | undefined, number[]>>>;
/**
 * Merges `from` into `into` in place: suite and language maps missing from
 * `into` are adopted wholesale; score arrays for the same model are concatenated.
 */
function mergeGroupedScopes(into: GroupedScores, from: GroupedScores) {
	for (const [suiteName, languageMap] of from) {
		const targetLanguages = into.get(suiteName);
		if (targetLanguages === undefined) {
			// Suite unseen so far — adopt the whole language map.
			into.set(suiteName, languageMap);
			continue;
		}
		for (const [language, modelScores] of languageMap) {
			const targetScores = targetLanguages.get(language);
			if (targetScores === undefined) {
				// Language unseen for this suite — adopt the whole model map.
				targetLanguages.set(language, modelScores);
				continue;
			}
			for (const [model, scores] of modelScores) {
				const existing = targetScores.get(model);
				targetScores.set(model, existing ? [...existing, ...scores] : scores);
			}
		}
	}
}
/** Result of scheduling a batch of tests: per-test promises plus a lazy score aggregation. */
export type ExecuteTestResult = {
	testResultsPromises: Promise<ITestResult>[];
	/** Resolves once all scheduled tests finished; returns the aggregated grouped scores. */
	getGroupedScores(): Promise<GroupedScores>;
};
export async function executeTests(ctx: SimulationTestContext, testsToRun: readonly SimulationTest[]): Promise<ExecuteTestResult> {
const location = groupBy(testsToRun as SimulationTest[], test => (test.suite.extHost ?? ctx.opts.inExtensionHost) ? 'extHost' : 'local');
const extensionRunner = new Lazy(() => TestExecutionInExtension.create(ctx));
const [extHost, local] = await Promise.all([
executeTestsUsing(ctx, location['extHost'] ?? [], (...args) => extensionRunner.value.then(e => e.executeTest(...args))),
executeTestsUsing(ctx, location['local'] ?? [], executeTestOnce),
]);
return {
testResultsPromises: [...extHost.testResultsPromises, ...local.testResultsPromises],
getGroupedScores: async () => {
const [fromExtHost, fromLocal] = await Promise.all([extHost.getGroupedScores(), local.getGroupedScores()]);
await extensionRunner.rawValue?.then(r => r.dispose());
mergeGroupedScopes(fromLocal, fromExtHost);
return fromLocal;
},
};
}
async function executeTestsUsing(ctx: SimulationTestContext, testsToRun: readonly SimulationTest[], executeTestFn: ExecuteTestOnceFn): Promise<ExecuteTestResult> {
const { opts, jsonOutputPrinter } = ctx;
const groupedScores: Map<string, Map<WellKnownLanguageId | undefined, Map<string | undefined, number[]>>> = new Map();
const taskRunner = new TaskRunner(opts.parallelism);
const testResultsPromises: Promise<ITestResult>[] = [];
for (const test of testsToRun) {
if (test.options.optional && (test.options.skip(ctx.opts) || opts.ci)) { // CI never runs optional stests
// Avoid spamming the console, we now have very many skipped stests
// console.log(` Skipping ${test.fullName}`);
ctx.baseline.setSkippedTest(test.fullName);
jsonOutputPrinter.print({ type: shared.OutputType.skippedTest, name: test.fullName });
continue;
}
const testRun = executeTestNTimes(ctx, taskRunner, test, groupedScores, executeTestFn);
testResultsPromises.push(testRun);
if (opts.parallelism === 1) {
await testRun;
}
}
return {
testResultsPromises,
getGroupedScores: async () => {
await Promise.all(testResultsPromises);
return groupedScores;
},
};
}
/** Runs a single scenario `nRuns` times. */
async function executeTestNTimes(
	ctx: SimulationTestContext,
	taskRunner: TaskRunner,
	test: SimulationTest,
	groupedScores: Map<string, Map<WellKnownLanguageId | undefined, Map<string | undefined, number[]>>>,
	executeTestFn: ExecuteTestOnceFn
): Promise<ITestResult> {
	const { opts } = ctx;
	// Each test gets its own directory for per-run artifacts.
	const outcomeDirectory = path.join(ctx.outputPath, toDirname(test.fullName));
	const testStartTime = Date.now();
	// Schedule all N runs on the shared task runner; they may execute concurrently
	// depending on the runner's parallelism.
	const scheduledTestRuns: Promise<ITestRunResult>[] = [];
	for (let kthRun = 0; kthRun < opts.nRuns; kthRun++) {
		scheduledTestRuns.push(taskRunner.run(() => executeTestFn(ctx, taskRunner.parallelism, outcomeDirectory, test, kthRun)));
	}
	const runResults: ITestRunResult[] = await Promise.all(scheduledTestRuns);
	const testElapsedTime = Date.now() - testStartTime;
	const testSummary = {
		results: runResults,
		hasCacheMisses: runResults.some(x => x.hasCacheMiss),
		contentFilterCount: runResults.filter(x => x.contentFilterCount > 0).length,
	};
	// External scenarios are not persisted to the simulation outcome store.
	if (!opts.externalScenarios) {
		await ctx.simulationOutcome.set(test, testSummary.results);
	}
	// A passing run scores its explicit score (default 1); a failing run scores 0.
	const testResultToScore = (result: ITestRunResult) => result.kind === 'pass' ? (result.explicitScore ?? 1) : 0;
	// Round the total to 3 decimal places to avoid floating-point noise in reports.
	const scoreTotal = Math.round(testSummary.results.reduce((total, result) => total + testResultToScore(result), 0) * 1000) / 1000;
	const currentScore = scoreTotal / testSummary.results.length;
	const currentPassCount = count(testSummary.results, s => s.kind === 'pass');
	// Record against the baseline first so the CLI printout below can show the comparison.
	const baselineComparison = ctx.baseline.setCurrentResult({
		name: test.fullName,
		optional: test.options.optional ? true : undefined,
		contentFilterCount: testSummary.contentFilterCount,
		passCount: currentPassCount,
		failCount: testSummary.results.length - currentPassCount,
		score: currentScore,
		attributes: test.attributes
	});
	printTestRunResultsToCli({ testSummary, ctx, test, currentScore, testElapsedTime, baselineComparison, });
	if (opts.verbose !== undefined) {
		printVerbose(opts, testSummary);
	}
	updateGroupedScores({ test, currentScore, groupedScores });
	// Aggregate duration and token usage across all runs.
	const duration = testSummary.results.reduce((acc, c) => acc + c.duration, 0);
	const initial: APIUsage = { completion_tokens: 0, prompt_tokens: 0, total_tokens: 0, prompt_tokens_details: { cached_tokens: 0 } };
	const usage: APIUsage = testSummary.results.reduce((acc, c): APIUsage => {
		// Guard: a run may carry no usage at all (e.g. it failed before any request) — skip it.
		if (c.usage === undefined) { return acc; }
		const { completion_tokens, prompt_tokens, total_tokens, prompt_tokens_details } = c.usage;
		return {
			completion_tokens: acc.completion_tokens + completion_tokens,
			prompt_tokens: acc.prompt_tokens + prompt_tokens,
			total_tokens: acc.total_tokens + total_tokens,
			prompt_tokens_details: {
				// `prompt_tokens_details` may be absent on either side — treat as 0 cached tokens.
				cached_tokens: (acc.prompt_tokens_details?.cached_tokens ?? 0) + (prompt_tokens_details?.cached_tokens ?? 0),
			}
		} satisfies APIUsage;
	}, initial);
	return {
		test: test.fullName,
		outcomeDirectory: path.relative(ctx.outputPath, outcomeDirectory),
		conversationPath: test.options.conversationPath,
		score: currentScore,
		duration,
		usage,
		outcomes: testSummary.results.map(r => r.outcome),
		cacheInfo: testSummary.results.map(r => r.cacheInfo),
		originalResults: testSummary.results,
	};
}
/**
 * Prints a one-line CLI summary for a test: baseline status icon, score
 * (or score transition), name, and optional time/cache/content-filter notes.
 */
function printTestRunResultsToCli({ testSummary, ctx, test, currentScore, testElapsedTime, baselineComparison }: {
	testSummary: {
		contentFilterCount: number;
		results: ITestRunResult[];
		hasCacheMisses: boolean;
	};
	ctx: SimulationTestContext;
	test: SimulationTest;
	currentScore: number;
	testElapsedTime: number;
	baselineComparison: TestBaselineComparison;
}) {
	const renderScore = createScoreRenderer(ctx.opts, ctx.canUseBaseline);
	// When the score moved against the baseline, show "old -> new"; otherwise just the current score.
	const scoreChanged = !baselineComparison.isNew && baselineComparison.prevScore !== baselineComparison.currScore;
	const scoreText = scoreChanged
		? `${renderScore(baselineComparison.prevScore)} -> ${renderScore(baselineComparison.currScore)}`
		: `${renderScore(currentScore)}`;
	// Pick icon + color by baseline status: new (violet), improved (green), worsened (red), unchanged ('=', no color).
	let statusIcon = '=';
	let colorize = (x: string | number) => x;
	if (baselineComparison.isNew) {
		statusIcon = '◆';
		colorize = violet;
	} else if (baselineComparison.isImproved) {
		statusIcon = '▲';
		colorize = green;
	} else if (baselineComparison.isWorsened) {
		statusIcon = '▼';
		colorize = red;
	}
	// Timing is only meaningful when tests ran strictly sequentially.
	// NOTE(review): the yellow-highlight threshold is 10 *milliseconds*, which flags
	// almost every run — presumably a larger threshold was intended; confirm.
	const timeText = ctx.opts.parallelism === 1 ? ` (${(testElapsedTime > 10 ? yellow(printTime(testElapsedTime)) : printTime(testElapsedTime))})` : '';
	const contentFilterText = (testSummary.contentFilterCount ? yellow(` (⚠️ content filter affected ${testSummary.contentFilterCount} runs)`) : '');
	const cacheMissText = testSummary.hasCacheMisses ? yellow(' (️💸 cache miss)') : '';
	console.log(` ${colorize(statusIcon)} [${colorize(scoreText)}] ${colorize(test.fullName)}${timeText}${cacheMissText}${contentFilterText}`);
}
/**
 * Prints a per-run pass/fail line for each result; for failures, also prints
 * the indented failure message unless verbosity is explicitly 0.
 */
function printVerbose(
	opts: SimulationOptions,
	testSummary: {
		contentFilterCount: number;
		results: ITestRunResult[];
	}
) {
	testSummary.results.forEach((runResult, index) => {
		const coloredKind = runResult.kind === 'pass' ? green(runResult.kind) : red(runResult.kind);
		console.log(` ${index + 1} - ${coloredKind}`);
		if (runResult.kind === 'fail' && runResult.message && opts.verbose !== 0) {
			// Indent every line of the message, then print it.
			const indented = runResult.message.split(/\r\n|\r|\n/g).map(line => ` ${line}`).join('\n');
			console.error(indented);
		}
	});
}
/**
 * Records `currentScore` into the nested suite -> language -> model score map,
 * creating the intermediate maps on first use.
 *
 * Rewritten from a nested has/get pattern riddled with non-null assertions
 * (and one inconsistent `?.`) into a straightforward get-or-create chain;
 * behavior is unchanged.
 */
function updateGroupedScores({ test, currentScore, groupedScores }: {
	test: SimulationTest;
	currentScore: number;
	groupedScores: Map<string, Map<string | undefined, Map<string | undefined, number[]>>>;
}) {
	const suiteName = test.suite.fullName;
	const model = test.model;
	// Get-or-create the per-suite map.
	let scoresPerSuite = groupedScores.get(suiteName);
	if (!scoresPerSuite) {
		scoresPerSuite = new Map();
		groupedScores.set(suiteName, scoresPerSuite);
	}
	// Get-or-create the per-language map.
	let scoresPerLanguage = scoresPerSuite.get(test.language);
	if (!scoresPerLanguage) {
		scoresPerLanguage = new Map();
		scoresPerSuite.set(test.language, scoresPerLanguage);
	}
	// Append the score to the model's list (copying, as the original did).
	const existing = scoresPerLanguage.get(model);
	scoresPerLanguage.set(model, existing ? [...existing, currentScore] : [currentScore]);
}
/**
 * Executes one run of one test. Implemented by in-process `executeTestOnce`
 * and, in `executeTests`, by a wrapper that delegates to a real extension host.
 */
type ExecuteTestOnceFn = (
	ctx: SimulationTestContext,
	parallelism: number,
	outcomeDirectory: string,
	test: SimulationTest,
	runNumber: number,
) => Promise<ITestRunResult>;
/**
 * Runs one test once, in-process: builds a per-run testing service collection,
 * executes the test, and reports the result both as the returned
 * `ITestRunResult` and through the JSON output printer (start/end events).
 */
export const executeTestOnce = async (
	ctx: SimulationTestContext,
	parallelism: number,
	outcomeDirectory: string,
	test: SimulationTest,
	runNumber: number,
	isInRealExtensionHost = false,
) => {
	const { opts, jsonOutputPrinter } = ctx;
	// Collects every model request made during this run (for logging and usage accounting).
	const fetchRequestCollector = new FetchRequestCollector();
	const currentTestRunInfo: CurrentTestRunInfo = {
		test,
		testRunNumber: runNumber,
		fetchRequestCollector: fetchRequestCollector,
		isInRealExtensionHost,
	};
	let testingServiceCollection: TestingServiceCollection;
	try {
		testingServiceCollection = await createSimulationAccessor(
			ctx.modelConfig,
			ctx.simulationServicesOptions,
			currentTestRunInfo
		);
	} catch (e) {
		// Service setup failed before the test could even start: emit a synthetic
		// start/end event pair so consumers still see a complete run record, and
		// report the failure as a critical failed outcome.
		const msg = e instanceof Error ? (e.stack ?? e.message) : String(e);
		console.error(`Error in createSimulationAccessor`, e);
		jsonOutputPrinter.print({ type: shared.OutputType.testRunStart, name: test.fullName, runNumber } satisfies shared.ITestRunStartOutput);
		jsonOutputPrinter.print({
			type: shared.OutputType.testRunEnd,
			name: test.fullName,
			runNumber,
			duration: 0,
			writtenFiles: [],
			error: msg,
			pass: false,
			explicitScore: undefined,
			annotations: undefined,
			averageRequestDuration: undefined,
			requestCount: 0,
			hasCacheMiss: false,
		} satisfies shared.ITestRunEndOutput);
		return {
			kind: 'fail',
			message: msg,
			contentFilterCount: 0,
			duration: 0,
			usage: { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 },
			outcome: { kind: 'failed', error: msg, hitContentFilter: false, critical: true },
			cacheInfo: [],
			hasCacheMiss: false,
		} satisfies ITestRunResultFail;
	}
	// Wire the shared (per-simulation) services into this run's collection.
	testingServiceCollection.define(ISimulationOutcome, ctx.simulationOutcome);
	testingServiceCollection.define(ITokenizerProvider, ctx.tokenizerProvider);
	testingServiceCollection.define(ISimulationEndpointHealth, ctx.simulationEndpointHealth);
	testingServiceCollection.define(IJSONOutputPrinter, ctx.jsonOutputPrinter);
	testingServiceCollection.define(ITasksService, new TestTasksService());
	if (test.model || test.embeddingType) {
		// We prefer opts that come from the CLI over test specific args since Opts are global and must apply to the entire simulation
		const smartChatModel = (opts.smartChatModel ?? opts.chatModel) ?? test.model;
		const fastChatModel = (opts.fastChatModel ?? opts.chatModel) ?? test.model;
		const fastRewriteModel = (opts.fastRewriteModel ?? opts.chatModel) ?? test.model;
		testingServiceCollection.define(IEndpointProvider, new SyncDescriptor(TestEndpointProvider, [smartChatModel, fastChatModel, fastRewriteModel, currentTestRunInfo, opts.modelCacheMode === CacheMode.Disable, undefined]));
	}
	// External scenarios get a runtime variant that understands their on-disk layout.
	const simulationTestRuntime = (ctx.externalScenariosPath !== undefined)
		? new ExternalSimulationTestRuntime(ctx.outputPath, outcomeDirectory, runNumber)
		: new SimulationTestRuntime(ctx.outputPath, outcomeDirectory, runNumber);
	testingServiceCollection.define(ISimulationTestRuntime, simulationTestRuntime);
	testingServiceCollection.define(ISimulationTestContext, simulationTestRuntime);
	testingServiceCollection.define(ILogService, new SyncDescriptor(LogServiceImpl, [[new ConsoleLog(`🪵 ${currentTestRunInfo.test.fullName} (Run #${currentTestRunInfo.testRunNumber + 1}):\n`), simulationTestRuntime]]));
	testingServiceCollection.define(INewWorkspacePreviewContentManager, new SyncDescriptor(NewWorkspacePreviewContentManagerImpl));
	// Snapshots are only set up for tests that declare a source location.
	let snapshots: TestSnapshotsImpl | undefined;
	if (test.options.location) {
		snapshots = new TestSnapshotsImpl(test.options.location.path, test.fullName, runNumber);
		testingServiceCollection.define(ITestSnapshots, snapshots);
	}
	testingServiceCollection.define(IPromptWorkspaceLabels, new SyncDescriptor(PromptWorkspaceLabels));
	// Tools service differs depending on whether we run inside a real extension host.
	if (isInRealExtensionHost) {
		testingServiceCollection.define(IToolsService, new SyncDescriptor(SimulationExtHostToolsService, [ctx.simulationServicesOptions.disabledTools]));
	} else {
		testingServiceCollection.define(IToolsService, new SyncDescriptor(TestToolsService, [ctx.simulationServicesOptions.disabledTools]));
	}
	jsonOutputPrinter.print({ type: shared.OutputType.testRunStart, name: test.fullName, runNumber } satisfies shared.ITestRunStartOutput);
	// Show a transient progress line when running sequentially on a TTY (cleared in `finally`).
	if (process.stdout.isTTY && parallelism === 1) {
		process.stdout.write(` Running scenario: ${test.fullName} - ${runNumber + 1}/${opts.nRuns}`.substring(0, process.stdout.columns - 1));
	}
	const testStartTime = Date.now();
	let pass = true;
	let err: unknown | undefined;
	try {
		await test.run(testingServiceCollection);
		await snapshots?.dispose();
		await fetchRequestCollector.complete();
		const result: ITestRunResultPass = {
			kind: 'pass',
			explicitScore: simulationTestRuntime.getExplicitScore(),
			usage: fetchRequestCollector.usage,
			contentFilterCount: fetchRequestCollector.contentFilterCount,
			duration: Date.now() - testStartTime,
			outcome: simulationTestRuntime.getOutcome(),
			cacheInfo: fetchRequestCollector.cacheInfo,
			hasCacheMiss: fetchRequestCollector.hasCacheMiss,
		};
		return result;
	} catch (e) {
		pass = false;
		err = e;
		let msg = err instanceof Error ? (err.stack ? err.stack : err.message) : safeStringify(err);
		// Finish request collection even on failure so usage/cache data is accurate.
		await fetchRequestCollector.complete();
		// Programming errors — and explicit CriticalError — are marked critical (fails stest CI).
		let critical = false;
		if (e instanceof BugIndicatingError || e instanceof TypeError) {
			critical = true;
		}
		if (e instanceof CriticalError) {
			critical = true;
			msg = e.message;
		}
		const result: ITestRunResultFail = {
			kind: 'fail',
			message: msg,
			contentFilterCount: fetchRequestCollector.contentFilterCount,
			duration: Date.now() - testStartTime,
			usage: fetchRequestCollector.usage,
			outcome: {
				kind: 'failed',
				error: msg,
				hitContentFilter: fetchRequestCollector.contentFilterCount > 0,
				critical,
			},
			cacheInfo: fetchRequestCollector.cacheInfo,
			hasCacheMiss: fetchRequestCollector.hasCacheMiss,
		};
		return result;
	} finally {
		// (context.safeGet(ILanguageFeaturesService) as { dispose?: () => Promise<void> })?.dispose?.();
		// Persist all intercepted model requests for later inspection.
		await simulationTestRuntime.writeFile(shared.SIMULATION_REQUESTS_FILENAME, JSON.stringify(fetchRequestCollector.interceptedRequests.map(r => r.toJSON()), undefined, 2), shared.REQUESTS_TAG);
		if (err) {
			simulationTestRuntime.log(`Scenario failed due to an error:`, err);
			if ((<any>err).code !== 'ERR_ASSERTION' && !(err instanceof IntentError)) {
				// Make visible to the console unexpected errors
				console.log(`Scenario ${test.fullName} failed due to an error:`);
				console.log(err);
			}
		}
		await simulationTestRuntime.flushLogs();
		// Always emit the run-end event, for both pass and fail paths.
		jsonOutputPrinter.print({
			type: shared.OutputType.testRunEnd,
			name: test.fullName,
			runNumber,
			duration: Date.now() - testStartTime,
			writtenFiles: simulationTestRuntime.getWrittenFiles(),
			error: err instanceof Error ? `${err.message}\n${err.stack}` : JSON.stringify(err),
			pass,
			explicitScore: simulationTestRuntime.getExplicitScore(),
			annotations: simulationTestRuntime.getOutcome()?.annotations,
			averageRequestDuration: fetchRequestCollector.averageRequestDuration,
			requestCount: fetchRequestCollector.interceptedRequests.length,
			hasCacheMiss: fetchRequestCollector.hasCacheMiss,
		} satisfies shared.ITestRunEndOutput);
		// Clear the transient progress line written above (carriage return + erase-line).
		if (process.stdout.isTTY && parallelism === 1) {
			process.stdout.write('\r\x1b[K');
		}
		testingServiceCollection.dispose();
	}
};
/**
 * An error that, when thrown from a test run, is always treated as a critical
 * failure — failing stest CI.
 */
export class CriticalError extends Error {
	constructor(message: string) {
		super(message);
		// Replace the default 'Error' name so reports can distinguish critical failures.
		this.name = 'CriticalError';
	}
}