/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import * as path from 'path';
import { IPromptWorkspaceLabels, PromptWorkspaceLabels } from '../src/extension/context/node/resolvers/promptWorkspaceLabels';
import { INewWorkspacePreviewContentManager, NewWorkspacePreviewContentManagerImpl } from '../src/extension/intents/node/newIntent';
import { IntentError } from '../src/extension/prompt/node/intents';
import { ISimulationModelConfig } from '../src/extension/test/node/services';
import { IToolsService } from '../src/extension/tools/common/toolsService';
import { TestToolsService } from '../src/extension/tools/node/test/testToolsService';
import { IEndpointProvider } from '../src/platform/endpoint/common/endpointProvider';
import { TestEndpointProvider } from '../src/platform/endpoint/test/node/testEndpointProvider';
import { ConsoleLog, ILogService, LogServiceImpl } from '../src/platform/log/common/logService';
import { APIUsage } from '../src/platform/networking/common/openai';
import { ISimulationTestContext } from '../src/platform/simulationTestContext/common/simulationTestContext';
import { ITasksService } from '../src/platform/tasks/common/tasksService';
import { TestTasksService } from '../src/platform/tasks/common/testTasksService';
import { TestingServiceCollection } from '../src/platform/test/node/services';
import { ITokenizerProvider } from '../src/platform/tokenizer/node/tokenizer';
import { count } from '../src/util/common/arrays';
import { WellKnownLanguageId } from '../src/util/common/languages';
import { groupBy } from '../src/util/vs/base/common/collections';
import { BugIndicatingError } from '../src/util/vs/base/common/errors';
import { Lazy } from '../src/util/vs/base/common/lazy';
import { safeStringify } from '../src/util/vs/base/common/objects';
import { SyncDescriptor } from '../src/util/vs/platform/instantiation/common/descriptors';
import { SimulationExtHostToolsService } from './base/extHostContext/simulationExtHostToolsService';
import { SimulationBaseline, TestBaselineComparison } from './base/simulationBaseline';
import { CacheMode, createSimulationAccessor, CurrentTestRunInfo, SimulationServicesOptions } from './base/simulationContext';
import { ISimulationEndpointHealth } from './base/simulationEndpointHealth';
import { SimulationOptions } from './base/simulationOptions';
import { ISimulationOutcome } from './base/simulationOutcome';
import { FetchRequestCollector } from './base/spyingChatMLFetcher';
import { ISimulationTestRuntime, SimulationTest, SimulationTestRuntime, toDirname } from './base/stest';
import { IJSONOutputPrinter } from './jsonOutputPrinter';
import { green, red, violet, yellow } from './outputColorer';
import { ExternalSimulationTestRuntime } from './simulation/externalScenarios';
import * as shared from './simulation/shared/sharedTypes';
import { ITestSnapshots, TestSnapshotsImpl } from './simulation/testSnapshot';
import { TaskRunner } from './taskRunner';
import { TestExecutionInExtension } from './testExecutionInExtension';
import { createScoreRenderer, printTime } from './util';

/**
 * Represents outcome of N runs of a scenario.
 */
export interface ITestResult {
	/** Full name of the scenario. */
	test: string;
	/** Outcome directory, relative to the simulation output path. */
	outcomeDirectory: string;
	conversationPath?: string;
	/** Sum of the per-run scores divided by the number of runs. */
	score: number;
	/** Token usage summed over all runs. */
	usage: APIUsage;
	// FIXME@ulugbekna: specify when the outcome is undefined
	outcomes: (shared.SimulationTestOutcome | undefined)[];
	/** Sum of the per-run durations (not wall-clock time of the whole batch). */
	duration: number;
	cacheInfo: TestRunCacheInfo[];
	originalResults: ITestRunResult[];
}

/** Fields shared by passing and failing run results. */
interface ITestRunResultCommon {
	contentFilterCount: number;
	usage: APIUsage;
	cacheInfo: TestRunCacheInfo;
	hasCacheMiss: boolean;
}

/** Result of a run whose `test.run(...)` completed without throwing. */
interface ITestRunResultPass extends ITestRunResultCommon {
	kind: 'pass';
	/** Score reported by the test runtime; when undefined the run counts as 1. */
	explicitScore: number | undefined;
	duration: number;
	outcome: shared.SimulationTestOutcome | undefined;
}

/** Result of a run that threw (or whose service setup threw). */
interface ITestRunResultFail extends ITestRunResultCommon {
	kind: 'fail';
	message: string;
	duration: number;
	outcome: shared.SimulationTestOutcome;
}

/**
 * Represents outcome of a single run of a scenario.
 */
export type ITestRunResult = ITestRunResultPass | ITestRunResultFail;

export type CacheInfo = { type: 'request'; key: string }; // TODO: add other caches here

export type TestRunCacheInfo = CacheInfo[];

/** Everything a test execution needs from the surrounding simulation run. */
export interface SimulationTestContext {
	opts: SimulationOptions;
	baseline: SimulationBaseline;
	canUseBaseline: boolean;
	jsonOutputPrinter: IJSONOutputPrinter;
	outputPath: string;
	externalScenariosPath?: string;
	modelConfig: ISimulationModelConfig;
	simulationEndpointHealth: ISimulationEndpointHealth;
	simulationServicesOptions: SimulationServicesOptions;
	simulationOutcome: ISimulationOutcome;
	tokenizerProvider: ITokenizerProvider;
}

/**
 * Scores grouped by suite name, then by language, then by model.
 *
 * NOTE(review): the key types are reconstructed from how the map is populated in
 * `updateGroupedScores` (`test.suite.fullName`, `test.language`, `test.model`) — confirm
 * against the declarations of `SimulationTest`.
 */
export type GroupedScores = Map<string, Map<WellKnownLanguageId | undefined, Map<string | undefined, number[]>>>;

/**
 * Merges `from` into `into`, mutating `into`.
 *
 * Score arrays that exist on both sides for the same suite/language/model are concatenated
 * (`into` first). Entries that exist only in `from` are inserted by reference, so `into`
 * may share inner maps and arrays with `from` afterwards.
 */
function mergeGroupedScopes(into: GroupedScores, from: GroupedScores): void {
	for (const [key, value] of from) {
		const intoValue = into.get(key);
		if (!intoValue) {
			into.set(key, value);
			continue;
		}
		for (const [language, scores] of value) {
			const intoScores = intoValue.get(language);
			if (!intoScores) {
				intoValue.set(language, scores);
				continue;
			}
			for (const [model, score] of scores) {
				const existing = intoScores.get(model);
				intoScores.set(model, existing ? [...existing, ...score] : score);
			}
		}
	}
}

export type ExecuteTestResult = {
	/** One promise per scheduled (non-skipped) test. */
	testResultsPromises: Promise<ITestResult>[];
	/** Resolves with the grouped scores once every scheduled test has finished. */
	getGroupedScores(): Promise<GroupedScores>;
};

/**
 * Schedules `testsToRun`, splitting them between the extension host runner and the local
 * runner: a test goes to the extension host when `test.suite.extHost` (falling back to
 * `ctx.opts.inExtensionHost`) is truthy.
 *
 * The extension host runner is created lazily on first use and disposed by the returned
 * `getGroupedScores()` after both groups have completed.
 */
export async function executeTests(ctx: SimulationTestContext, testsToRun: readonly SimulationTest[]): Promise<ExecuteTestResult> {
	const location = groupBy(testsToRun as SimulationTest[], test => (test.suite.extHost ?? ctx.opts.inExtensionHost) ? 'extHost' : 'local');
	const extensionRunner = new Lazy(() => TestExecutionInExtension.create(ctx));

	const [extHost, local] = await Promise.all([
		executeTestsUsing(ctx, location['extHost'] ?? [], (...args) => extensionRunner.value.then(e => e.executeTest(...args))),
		executeTestsUsing(ctx, location['local'] ?? [], executeTestOnce),
	]);

	return {
		testResultsPromises: [...extHost.testResultsPromises, ...local.testResultsPromises],
		getGroupedScores: async () => {
			const [fromExtHost, fromLocal] = await Promise.all([extHost.getGroupedScores(), local.getGroupedScores()]);
			// `rawValue` is only set if the lazy runner was actually created.
			await extensionRunner.rawValue?.then(r => r.dispose());
			mergeGroupedScopes(fromLocal, fromExtHost);
			return fromLocal;
		},
	};
}

/**
 * Schedules every test in `testsToRun` through `executeTestFn`, sharing one `TaskRunner`
 * bounded by `opts.parallelism`.
 *
 * Optional tests are recorded as skipped (baseline + JSON output) when their `skip`
 * predicate returns true or when running in CI. With `parallelism === 1` each test is
 * awaited before the next one is scheduled.
 */
async function executeTestsUsing(ctx: SimulationTestContext, testsToRun: readonly SimulationTest[], executeTestFn: ExecuteTestOnceFn): Promise<ExecuteTestResult> {
	const { opts, jsonOutputPrinter } = ctx;
	const groupedScores: GroupedScores = new Map();
	const taskRunner = new TaskRunner(opts.parallelism);
	const testResultsPromises: Promise<ITestResult>[] = [];

	for (const test of testsToRun) {
		if (test.options.optional && (test.options.skip(ctx.opts) || opts.ci)) { // CI never runs optional stests
			// Avoid spamming the console, we now have very many skipped stests
			// console.log(` Skipping ${test.fullName}`);
			ctx.baseline.setSkippedTest(test.fullName);
			jsonOutputPrinter.print({ type: shared.OutputType.skippedTest, name: test.fullName });
			continue;
		}

		const testRun = executeTestNTimes(ctx, taskRunner, test, groupedScores, executeTestFn);
		testResultsPromises.push(testRun);
		if (opts.parallelism === 1) {
			await testRun;
		}
	}

	return {
		testResultsPromises,
		getGroupedScores: async () => {
			await Promise.all(testResultsPromises);
			return groupedScores;
		},
	};
}

/**
 * Runs a single scenario `nRuns` times.
 *
 * Besides aggregating the run results into an `ITestResult`, this records the results in
 * `ctx.simulationOutcome` (unless external scenarios are used), updates the baseline,
 * prints a summary line to the console and appends the score to `groupedScores`.
 */
async function executeTestNTimes(
	ctx: SimulationTestContext,
	taskRunner: TaskRunner,
	test: SimulationTest,
	groupedScores: GroupedScores,
	executeTestFn: ExecuteTestOnceFn
): Promise<ITestResult> {
	const { opts } = ctx;
	const outcomeDirectory = path.join(ctx.outputPath, toDirname(test.fullName));

	const testStartTime = Date.now();
	const scheduledTestRuns: Promise<ITestRunResult>[] = [];
	for (let kthRun = 0; kthRun < opts.nRuns; kthRun++) {
		scheduledTestRuns.push(taskRunner.run(() => executeTestFn(ctx, taskRunner.parallelism, outcomeDirectory, test, kthRun)));
	}
	const runResults: ITestRunResult[] = await Promise.all(scheduledTestRuns);
	const testElapsedTime = Date.now() - testStartTime;

	const testSummary = {
		results: runResults,
		hasCacheMisses: runResults.some(x => x.hasCacheMiss),
		// Number of runs that hit the content filter at least once (not the total hit count).
		contentFilterCount: runResults.filter(x => x.contentFilterCount > 0).length,
	};

	if (!opts.externalScenarios) {
		await ctx.simulationOutcome.set(test, testSummary.results);
	}

	// A passing run without an explicit score counts as 1; a failing run counts as 0.
	const testResultToScore = (result: ITestRunResult) => result.kind === 'pass' ? (result.explicitScore ?? 1) : 0;
	// The total is rounded to three decimals before averaging.
	const scoreTotal = Math.round(testSummary.results.reduce((total, result) => total + testResultToScore(result), 0) * 1000) / 1000;
	const currentScore = scoreTotal / testSummary.results.length;
	const currentPassCount = count(testSummary.results, s => s.kind === 'pass');

	const baselineComparison = ctx.baseline.setCurrentResult({
		name: test.fullName,
		optional: test.options.optional ? true : undefined,
		contentFilterCount: testSummary.contentFilterCount,
		passCount: currentPassCount,
		failCount: testSummary.results.length - currentPassCount,
		score: currentScore,
		attributes: test.attributes
	});

	printTestRunResultsToCli({
		testSummary,
		ctx,
		test,
		currentScore,
		testElapsedTime,
		baselineComparison,
	});

	if (opts.verbose !== undefined) {
		printVerbose(opts, testSummary);
	}

	updateGroupedScores({ test, currentScore, groupedScores });

	const duration = testSummary.results.reduce((acc, c) => acc + c.duration, 0);

	const initial: APIUsage = { completion_tokens: 0, prompt_tokens: 0, total_tokens: 0, prompt_tokens_details: { cached_tokens: 0 } };
	const usage: APIUsage = testSummary.results.reduce((acc, c): APIUsage => {
		// Runs without usage information contribute nothing.
		if (c.usage === undefined) {
			return acc;
		}
		const { completion_tokens, prompt_tokens, total_tokens, prompt_tokens_details } = c.usage;
		return {
			completion_tokens: acc.completion_tokens + completion_tokens,
			prompt_tokens: acc.prompt_tokens + prompt_tokens,
			total_tokens: acc.total_tokens + total_tokens,
			prompt_tokens_details: {
				cached_tokens: (acc.prompt_tokens_details?.cached_tokens ?? 0) + (prompt_tokens_details?.cached_tokens ?? 0),
			}
		} satisfies APIUsage;
	}, initial);

	return {
		test: test.fullName,
		outcomeDirectory: path.relative(ctx.outputPath, outcomeDirectory),
		conversationPath: test.options.conversationPath,
		score: currentScore,
		duration,
		usage,
		outcomes: testSummary.results.map(r => r.outcome),
		cacheInfo: testSummary.results.map(r => r.cacheInfo),
		originalResults: testSummary.results,
	};
}

/**
 * Prints the one-line console summary for a scenario: a baseline icon, the score (or
 * `prev -> curr` when it changed against the baseline), the scenario name and optional
 * elapsed-time, cache-miss and content-filter markers.
 */
function printTestRunResultsToCli({ testSummary, ctx, test, currentScore, testElapsedTime, baselineComparison }: {
	testSummary: {
		contentFilterCount: number;
		results: ITestRunResult[];
		hasCacheMisses: boolean;
	};
	ctx: SimulationTestContext;
	test: SimulationTest;
	currentScore: number;
	testElapsedTime: number;
	baselineComparison: TestBaselineComparison;
}) {
	const scoreToString = createScoreRenderer(ctx.opts, ctx.canUseBaseline);
	const didScoreChange = !baselineComparison.isNew && baselineComparison.prevScore !== baselineComparison.currScore;
	const prettyScoreValue = didScoreChange
		? `${scoreToString(baselineComparison.prevScore)} -> ${scoreToString(baselineComparison.currScore)}`
		: `${scoreToString(currentScore)}`;

	let icon = '=';
	let color = (x: string | number) => x;
	if (baselineComparison.isNew) {
		icon = '◆';
		color = violet;
	} else if (baselineComparison.isImproved) {
		icon = '▲';
		color = green;
	} else if (baselineComparison.isWorsened) {
		icon = '▼';
		color = red;
	}

	// The elapsed time is only shown for sequential runs.
	// NOTE(review): the caller passes a `Date.now()` delta, i.e. milliseconds, so `> 10`
	// highlights nearly every test — confirm whether a larger threshold was intended.
	const prettyTestTime = ctx.opts.parallelism === 1
		? ` (${(testElapsedTime > 10 ? yellow(printTime(testElapsedTime)) : printTime(testElapsedTime))})`
		: '';
	const prettyContentFilter = (testSummary.contentFilterCount ? yellow(` (⚠️ content filter affected ${testSummary.contentFilterCount} runs)`) : '');
	const hadCacheMisses = testSummary.hasCacheMisses ? yellow(' (️️️💸 cache miss)') : '';

	console.log(` ${color(icon)} [${color(prettyScoreValue)}] ${color(test.fullName)}${prettyTestTime}${hadCacheMisses}${prettyContentFilter}`);
}

/**
 * Prints one line per run with its pass/fail kind. For failing runs the (indented) failure
 * message is written to stderr unless `opts.verbose` is `0`.
 */
function printVerbose(
	opts: SimulationOptions,
	testSummary: {
		contentFilterCount: number;
		results: ITestRunResult[];
	}
) {
	for (let i = 0; i < testSummary.results.length; i++) {
		const result = testSummary.results[i];
		console.log(` ${i + 1} - ${result.kind === 'pass' ? green(result.kind) : red(result.kind)}`);
		if (result.kind === 'fail' && result.message && opts.verbose !== 0) {
			// indent the message and print
			console.error(result.message.split(/\r\n|\r|\n/g).map(line => ` ${line}`).join('\n'));
		}
	}
}

/**
 * Appends `currentScore` to the score list stored under
 * suite name -> language -> model, creating the intermediate maps on first use.
 */
function updateGroupedScores({ test, currentScore, groupedScores }: {
	test: SimulationTest;
	currentScore: number;
	groupedScores: GroupedScores;
}) {
	const suiteName = test.suite.fullName;
	const model = test.model;

	let scoresPerSuite = groupedScores.get(suiteName);
	if (!scoresPerSuite) {
		scoresPerSuite = new Map();
		groupedScores.set(suiteName, scoresPerSuite);
	}

	let scoresPerLanguage = scoresPerSuite.get(test.language);
	if (!scoresPerLanguage) {
		scoresPerLanguage = new Map();
		scoresPerSuite.set(test.language, scoresPerLanguage);
	}

	const previousScores = scoresPerLanguage.get(model);
	scoresPerLanguage.set(model, previousScores ? [...previousScores, currentScore] : [currentScore]);
}

/** Signature of a function that performs one run of one scenario. */
type ExecuteTestOnceFn = (
	ctx: SimulationTestContext,
	parallelism: number,
	outcomeDirectory: string,
	test: SimulationTest,
	runNumber: number,
) => Promise<ITestRunResult>;

/**
 * Performs a single run of `test`.
 *
 * Builds a fresh service collection for the run, executes `test.run(...)` and converts the
 * outcome into an `ITestRunResult`; exceptions thrown by the test are captured as a
 * `'fail'` result rather than propagated. A `testRunStart` / `testRunEnd` pair is emitted
 * on the JSON output printer, the intercepted requests are written to the run's output and
 * the service collection is disposed before returning.
 *
 * @param parallelism Used only to decide whether to draw the transient "Running scenario"
 * progress line (TTY and `parallelism === 1`).
 * @param runNumber Zero-based index of this run.
 * @param isInRealExtensionHost Selects the extension-host tools service instead of the test one.
 */
export const executeTestOnce = async (
	ctx: SimulationTestContext,
	parallelism: number,
	outcomeDirectory: string,
	test: SimulationTest,
	runNumber: number,
	isInRealExtensionHost = false,
): Promise<ITestRunResult> => {
	const { opts, jsonOutputPrinter } = ctx;
	const fetchRequestCollector = new FetchRequestCollector();

	const currentTestRunInfo: CurrentTestRunInfo = {
		test,
		testRunNumber: runNumber,
		fetchRequestCollector: fetchRequestCollector,
		isInRealExtensionHost,
	};

	let testingServiceCollection: TestingServiceCollection;
	try {
		testingServiceCollection = await createSimulationAccessor(
			ctx.modelConfig,
			ctx.simulationServicesOptions,
			currentTestRunInfo
		);
	} catch (e) {
		// Setup failed before the test could start: report a start/end pair anyway so the
		// JSON output stays balanced, and mark the failure as critical.
		const msg = e instanceof Error ? (e.stack ?? e.message) : String(e);
		console.error(`Error in createSimulationAccessor`, e);
		jsonOutputPrinter.print({ type: shared.OutputType.testRunStart, name: test.fullName, runNumber } satisfies shared.ITestRunStartOutput);
		jsonOutputPrinter.print({
			type: shared.OutputType.testRunEnd,
			name: test.fullName,
			runNumber,
			duration: 0,
			writtenFiles: [],
			error: msg,
			pass: false,
			explicitScore: undefined,
			annotations: undefined,
			averageRequestDuration: undefined,
			requestCount: 0,
			hasCacheMiss: false,
		} satisfies shared.ITestRunEndOutput);
		return {
			kind: 'fail',
			message: msg,
			contentFilterCount: 0,
			duration: 0,
			usage: { prompt_tokens: 0, completion_tokens: 0, total_tokens: 0 },
			outcome: { kind: 'failed', error: msg, hitContentFilter: false, critical: true },
			cacheInfo: [],
			hasCacheMiss: false,
		} satisfies ITestRunResultFail;
	}

	testingServiceCollection.define(ISimulationOutcome, ctx.simulationOutcome);
	testingServiceCollection.define(ITokenizerProvider, ctx.tokenizerProvider);
	testingServiceCollection.define(ISimulationEndpointHealth, ctx.simulationEndpointHealth);
	testingServiceCollection.define(IJSONOutputPrinter, ctx.jsonOutputPrinter);
	testingServiceCollection.define(ITasksService, new TestTasksService());

	if (test.model || test.embeddingType) {
		// We prefer opts that come from the CLI over test specific args since Opts are global and must apply to the entire simulation
		const smartChatModel = (opts.smartChatModel ?? opts.chatModel) ?? test.model;
		const fastChatModel = (opts.fastChatModel ?? opts.chatModel) ?? test.model;
		const fastRewriteModel = (opts.fastRewriteModel ?? opts.chatModel) ?? test.model;
		testingServiceCollection.define(IEndpointProvider, new SyncDescriptor(TestEndpointProvider, [smartChatModel, fastChatModel, fastRewriteModel, currentTestRunInfo, opts.modelCacheMode === CacheMode.Disable, undefined]));
	}

	const simulationTestRuntime = (ctx.externalScenariosPath !== undefined)
		? new ExternalSimulationTestRuntime(ctx.outputPath, outcomeDirectory, runNumber)
		: new SimulationTestRuntime(ctx.outputPath, outcomeDirectory, runNumber);
	testingServiceCollection.define(ISimulationTestRuntime, simulationTestRuntime);
	testingServiceCollection.define(ISimulationTestContext, simulationTestRuntime);
	testingServiceCollection.define(ILogService, new SyncDescriptor(LogServiceImpl, [[new ConsoleLog(`🪵 ${currentTestRunInfo.test.fullName} (Run #${currentTestRunInfo.testRunNumber + 1}):\n`), simulationTestRuntime]]));
	testingServiceCollection.define(INewWorkspacePreviewContentManager, new SyncDescriptor(NewWorkspacePreviewContentManagerImpl));

	let snapshots: TestSnapshotsImpl | undefined;
	if (test.options.location) {
		snapshots = new TestSnapshotsImpl(test.options.location.path, test.fullName, runNumber);
		testingServiceCollection.define(ITestSnapshots, snapshots);
	}

	testingServiceCollection.define(IPromptWorkspaceLabels, new SyncDescriptor(PromptWorkspaceLabels));

	if (isInRealExtensionHost) {
		testingServiceCollection.define(IToolsService, new SyncDescriptor(SimulationExtHostToolsService, [ctx.simulationServicesOptions.disabledTools]));
	} else {
		testingServiceCollection.define(IToolsService, new SyncDescriptor(TestToolsService, [ctx.simulationServicesOptions.disabledTools]));
	}

	jsonOutputPrinter.print({ type: shared.OutputType.testRunStart, name: test.fullName, runNumber } satisfies shared.ITestRunStartOutput);

	if (process.stdout.isTTY && parallelism === 1) {
		// Transient progress line, truncated to the terminal width; erased again at the end of the run.
		process.stdout.write(` Running scenario: ${test.fullName} - ${runNumber + 1}/${opts.nRuns}`.substring(0, process.stdout.columns - 1));
	}

	const testStartTime = Date.now();
	let pass = true;
	let err: unknown;
	try {
		await test.run(testingServiceCollection);
		// Snapshots are only disposed on the success path.
		await snapshots?.dispose();
		await fetchRequestCollector.complete();
		const result: ITestRunResultPass = {
			kind: 'pass',
			explicitScore: simulationTestRuntime.getExplicitScore(),
			usage: fetchRequestCollector.usage,
			contentFilterCount: fetchRequestCollector.contentFilterCount,
			duration: Date.now() - testStartTime,
			outcome: simulationTestRuntime.getOutcome(),
			cacheInfo: fetchRequestCollector.cacheInfo,
			hasCacheMiss: fetchRequestCollector.hasCacheMiss,
		};
		return result;
	} catch (e) {
		pass = false;
		err = e;
		let msg = err instanceof Error ? (err.stack ? err.stack : err.message) : safeStringify(err);
		await fetchRequestCollector.complete();

		// Programming errors and explicit CriticalErrors are flagged as critical.
		let critical = false;
		if (e instanceof BugIndicatingError || e instanceof TypeError) {
			critical = true;
		}
		if (e instanceof CriticalError) {
			critical = true;
			// For CriticalError only the message is reported, not the stack.
			msg = e.message;
		}

		const result: ITestRunResultFail = {
			kind: 'fail',
			message: msg,
			contentFilterCount: fetchRequestCollector.contentFilterCount,
			duration: Date.now() - testStartTime,
			usage: fetchRequestCollector.usage,
			outcome: {
				kind: 'failed',
				error: msg,
				hitContentFilter: fetchRequestCollector.contentFilterCount > 0,
				critical,
			},
			cacheInfo: fetchRequestCollector.cacheInfo,
			hasCacheMiss: fetchRequestCollector.hasCacheMiss,
		};
		return result;
	} finally {
		try {
			// (context.safeGet(ILanguageFeaturesService) as { dispose?: () => Promise<void> })?.dispose?.();
			await simulationTestRuntime.writeFile(shared.SIMULATION_REQUESTS_FILENAME, JSON.stringify(fetchRequestCollector.interceptedRequests.map(r => r.toJSON()), undefined, 2), shared.REQUESTS_TAG);

			// Keyed off `pass` rather than the truthiness of `err`, so that a failure caused by a
			// falsy thrown value (e.g. a promise rejected with `undefined`) is still logged.
			if (!pass) {
				simulationTestRuntime.log(`Scenario failed due to an error:`, err);
				const errorCode = (err as { code?: unknown } | null | undefined)?.code;
				if (errorCode !== 'ERR_ASSERTION' && !(err instanceof IntentError)) {
					// Make visible to the console unexpected errors
					console.log(`Scenario ${test.fullName} failed due to an error:`);
					console.log(err);
				}
			}

			await simulationTestRuntime.flushLogs();

			// Non-Error values go through `safeStringify`, the serializer the catch branch already
			// applied to the same value, instead of a bare `JSON.stringify` that could throw here.
			const reportedError = pass
				? undefined
				: (err instanceof Error ? `${err.message}\n${err.stack}` : safeStringify(err));

			jsonOutputPrinter.print({
				type: shared.OutputType.testRunEnd,
				name: test.fullName,
				runNumber,
				duration: Date.now() - testStartTime,
				writtenFiles: simulationTestRuntime.getWrittenFiles(),
				error: reportedError,
				pass,
				explicitScore: simulationTestRuntime.getExplicitScore(),
				annotations: simulationTestRuntime.getOutcome()?.annotations,
				averageRequestDuration: fetchRequestCollector.averageRequestDuration,
				requestCount: fetchRequestCollector.interceptedRequests.length,
				hasCacheMiss: fetchRequestCollector.hasCacheMiss,
			} satisfies shared.ITestRunEndOutput);
		} finally {
			// Always erase the progress line and release the run's services, even when
			// writing the request log, flushing logs or printing the end record throws.
			if (process.stdout.isTTY && parallelism === 1) {
				process.stdout.write('\r\x1b[K');
			}
			testingServiceCollection.dispose();
		}
	}
};

/**
 * When thrown, fails stest CI.
 */
export class CriticalError extends Error {
	constructor(message: string) {
		super(message);
		this.name = 'CriticalError';
	}
}