/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/

import { promises as fs } from 'fs';
import * as path from 'path';
import * as readline from 'readline';

// Edit tool names we're tracking
const EDIT_TOOL_NAMES = ['insert_edit_into_file', 'replace_string_in_file', 'multi_replace_string_in_file', 'apply_patch'];

// Tool names that indicate a continuation/retry attempt
const CONTINUATION_TOOL_NAMES = ['read_file'];

// Text an edit tool's response contains when the edit was applied
const EDIT_SUCCESS_MARKER = 'successfully edited';

// Exclusive end offset of the retry look-ahead: after a failed edit at index i, only the calls at
// indices i + 1 .. i + RETRY_LOOKAHEAD_END - 1 are inspected.
const RETRY_LOOKAHEAD_END = 10;

// Directory scanned for runs when no --basePath=<dir> argument is given
const DEFAULT_BASE_PATH = '/Users/connor/Github/vscode-copilot-evaluation/.msbenchRun';

// Characters that must be replaced before text is placed into HTML content or attribute values
const HTML_ESCAPES: Record<string, string> = {
	'&': '&amp;',
	'<': '&lt;',
	'>': '&gt;',
	'"': '&quot;',
	'\'': '&#39;'
};

/** One entry of a conversation's `trajectory.json`. */
interface ToolCall {
	tool: string;
	input_tokens?: number;
	cached_input_tokens?: number;
	output_tokens?: number;
	response: string | string[];
	edits?: Array<{
		path: string;
		edits: {
			replacements: Array<{
				replaceRange: { start: number; endExclusive: number };
				newText: string;
			}>;
		};
	}>;
}

/** A single edit-tool invocation and what is known about its outcome. */
interface EditOperation {
	toolName: string;
	timestamp: string;
	success: boolean;
	filePath?: string;
	turnIndex: number;
	isRetry: boolean;
	retrySucceeded?: boolean;
}

/** Edit statistics for one conversation directory. */
interface ConversationAnalysis {
	conversationPath: string;
	edits: EditOperation[];
	totalEdits: number;
	successfulEdits: number;
	failedEdits: number;
	successfulEditsWithRetries: number;
	totalUniqueEdits: number;
	modelName?: string;
}

/** Edit statistics aggregated over every conversation of a run. */
interface RunAnalysis {
	runId: string;
	conversations: ConversationAnalysis[];
	totalEdits: number;
	successRate: number;
	successRateWithRetries: number;
	totalUniqueEdits: number;
	modelName?: string;
}

/** A counted transition between two named nodes of the flow diagram. */
interface Flow {
	source: string;
	target: string;
	value: number;
}

/**
 * Lists the run directories found directly under `amlOutPath`.
 *
 * Only directories whose name consists solely of digits are treated as runs.
 *
 * @param amlOutPath Directory that contains one sub-directory per run.
 * @returns Run ids sorted by numeric value, largest first.
 * @throws If `amlOutPath` cannot be read.
 */
async function listRuns(amlOutPath: string): Promise<string[]> {
	const entries = await fs.readdir(amlOutPath, { withFileTypes: true });

	return entries
		.filter(entry => entry.isDirectory() && /^\d+$/.test(entry.name))
		.map(entry => entry.name)
		.sort((a, b) => parseInt(b, 10) - parseInt(a, 10)); // Sort descending (newest first)
}

/**
 * Prints the first ten runs and asks on stdin which one to analyze.
 *
 * The answer is a 1-based position in `runs`. An empty or out-of-range answer selects the first
 * entry of `runs`.
 *
 * @param runs Run ids, expected newest first. Must not be empty.
 * @returns The selected run id.
 * @throws If `runs` is empty.
 */
async function promptUserForRun(runs: string[]): Promise<string> {
	const mostRecent = runs[0];
	if (mostRecent === undefined) {
		throw new Error('No runs to choose from');
	}

	console.log('\nAvailable test runs (newest first):');
	runs.slice(0, 10).forEach((run, i) => {
		console.log(`  ${i + 1}. ${run}`);
	});
	if (runs.length > 10) {
		console.log(`  ... and ${runs.length - 10} more`);
	}

	const rl = readline.createInterface({
		input: process.stdin,
		output: process.stdout
	});

	return new Promise<string>(resolve => {
		rl.question('\nEnter run number (or press Enter for the most recent): ', answer => {
			rl.close();

			const choice = answer.trim();
			if (choice === '') {
				resolve(mostRecent);
				return;
			}

			// A non-numeric answer parses to NaN and falls through to the "invalid" branch.
			const selected = runs[parseInt(choice, 10) - 1];
			if (selected !== undefined) {
				resolve(selected);
				return;
			}

			console.log('Invalid selection, using most recent run.');
			resolve(mostRecent);
		});
	});
}

/**
 * Checks that a parsed trajectory entry is an object with a string `tool` name.
 *
 * Only `tool` is verified; `response` and `edits` are read defensively by the callers.
 */
function isToolCall(value: unknown): value is ToolCall {
	if (typeof value !== 'object' || value === null) {
		return false;
	}
	return typeof (value as { tool?: unknown }).tool === 'string';
}

/** Returns whether a tool response (or the first element of a response array) reports a successful edit. */
function isSuccessfulResponse(response: string | string[]): boolean {
	const first: unknown = Array.isArray(response) ? response[0] : response;
	return typeof first === 'string' && first.includes(EDIT_SUCCESS_MARKER);
}

/**
 * Looks for the pattern "failed edit -> continuation tool -> another edit" after a failed edit.
 *
 * @param toolCalls All tool calls of the conversation.
 * @param failedIndex Index of the failed edit.
 * @returns The outcome of the retry edit, or `undefined` when the next edit inside the look-ahead
 * window is not preceded by a continuation tool (or there is no further edit in the window).
 */
function findRetryOutcome(toolCalls: readonly ToolCall[], failedIndex: number): boolean | undefined {
	let sawContinuationTool = false;

	for (const candidate of toolCalls.slice(failedIndex + 1, failedIndex + RETRY_LOOKAHEAD_END)) {
		if (CONTINUATION_TOOL_NAMES.includes(candidate.tool)) {
			sawContinuationTool = true;
		} else if (EDIT_TOOL_NAMES.includes(candidate.tool)) {
			// The first edit after the failure decides: it is a retry only if a continuation
			// tool ran in between.
			return sawContinuationTool ? isSuccessfulResponse(candidate.response) : undefined;
		}
	}

	return undefined;
}

/** Extracts one `EditOperation` per edit-tool call, in trajectory order. */
function collectEdits(toolCalls: readonly ToolCall[]): EditOperation[] {
	const edits: EditOperation[] = [];

	toolCalls.forEach((toolCall, index) => {
		if (!EDIT_TOOL_NAMES.includes(toolCall.tool)) {
			return;
		}

		const success = isSuccessfulResponse(toolCall.response);
		const retrySucceeded = success ? undefined : findRetryOutcome(toolCalls, index);

		edits.push({
			toolName: toolCall.tool,
			timestamp: new Date().toISOString(), // Trajectory doesn't have timestamps, use current time
			success,
			filePath: toolCall.edits?.[0]?.path,
			turnIndex: edits.length, // position among the edit calls, not among all tool calls
			isRetry: retrySucceeded !== undefined,
			retrySucceeded
		});
	});

	return edits;
}

/** Builds the result used when a conversation has no usable trajectory. */
function emptyConversationAnalysis(conversationPath: string): ConversationAnalysis {
	return {
		conversationPath,
		edits: [],
		totalEdits: 0,
		successfulEdits: 0,
		failedEdits: 0,
		successfulEditsWithRetries: 0,
		totalUniqueEdits: 0
	};
}

/**
 * Reads `<conversationPath>/trajectories/trajectory.json` and derives edit statistics from it.
 *
 * A missing, unreadable, unparsable or non-array trajectory produces a warning and an empty
 * analysis instead of an exception, so one bad conversation cannot abort a whole run. Entries
 * that are not objects with a string `tool` are ignored.
 *
 * @param conversationPath Directory of a single conversation.
 * @returns The edits found and their success counts.
 */
async function analyzeConversation(conversationPath: string): Promise<ConversationAnalysis> {
	const trajectoryPath = path.join(conversationPath, 'trajectories', 'trajectory.json');

	let parsed: unknown;
	try {
		parsed = JSON.parse(await fs.readFile(trajectoryPath, 'utf-8'));
	} catch {
		console.warn(`Could not read trajectory file: ${trajectoryPath}`);
		return emptyConversationAnalysis(conversationPath);
	}

	if (!Array.isArray(parsed)) {
		console.warn(`Unexpected trajectory format (expected an array of tool calls): ${trajectoryPath}`);
		return emptyConversationAnalysis(conversationPath);
	}

	const edits = collectEdits(parsed.filter(isToolCall));
	const successfulEdits = edits.filter(e => e.success).length;

	// Calculate success rate accounting for retries (final outcome only)
	const retriedFailures = edits.filter(e => !e.success && e.isRetry);
	const retriedSuccesses = retriedFailures.filter(e => e.retrySucceeded).length;

	// Retried failures without a recorded retry outcome are left out of the denominator.
	// NOTE(review): a retry is only flagged together with its outcome, so this currently always
	// equals edits.length - confirm whether retried failures were meant to be collapsed instead.
	const retriedWithoutOutcome = retriedFailures.filter(e => e.retrySucceeded === undefined).length;

	return {
		conversationPath,
		edits,
		totalEdits: edits.length,
		successfulEdits,
		failedEdits: edits.length - successfulEdits,
		successfulEditsWithRetries: successfulEdits + retriedSuccesses,
		totalUniqueEdits: edits.length - retriedWithoutOutcome,
		// NOTE(review): nothing in the trajectory handling above provides a model name yet.
		modelName: undefined
	};
}

/**
 * Analyzes every conversation directory of a run and aggregates the results.
 *
 * Conversations without any edit are left out. If the run directory cannot be read the error is
 * logged and the conversations gathered so far (possibly none) are aggregated.
 *
 * @param runId Name of the run directory.
 * @param basePath Directory that contains the run directories.
 * @returns Aggregated statistics; both rates are 0 when there are no edits.
 */
async function analyzeRun(runId: string, basePath: string): Promise<RunAnalysis> {
	const runPath = path.join(basePath, runId);
	const conversations: ConversationAnalysis[] = [];

	try {
		const entries = await fs.readdir(runPath, { withFileTypes: true });

		for (const entry of entries) {
			if (!entry.isDirectory()) {
				continue;
			}
			const analysis = await analyzeConversation(path.join(runPath, entry.name));
			if (analysis.totalEdits > 0) {
				conversations.push(analysis);
			}
		}
	} catch (error) {
		console.error(`Error reading run directory: ${error}`);
	}

	const sumOf = (pick: (c: ConversationAnalysis) => number): number =>
		conversations.reduce((sum, c) => sum + pick(c), 0);

	const totalEdits = sumOf(c => c.totalEdits);
	const totalSuccessful = sumOf(c => c.successfulEdits);
	const totalSuccessfulWithRetries = sumOf(c => c.successfulEditsWithRetries);
	const totalUniqueEdits = sumOf(c => c.totalUniqueEdits);

	// Get model name from first conversation that has one
	const modelName = conversations.find(c => c.modelName)?.modelName;

	return {
		runId,
		conversations,
		totalEdits,
		successRate: totalEdits > 0 ? totalSuccessful / totalEdits : 0,
		successRateWithRetries: totalUniqueEdits > 0 ? totalSuccessfulWithRetries / totalUniqueEdits : 0,
		totalUniqueEdits,
		modelName
	};
}

/** Escapes text so it can be placed safely into HTML element content or attribute values. */
function escapeHtml(value: string): string {
	return value.replace(/[&<>"']/g, ch => HTML_ESCAPES[ch] ?? ch);
}

/**
 * Counts the transitions shown in the flow diagram, in first-seen order.
 *
 * Every edit contributes `tool -> Success|Failed`. When `includeRetries` is set, a failed edit
 * with a known retry outcome contributes the chain
 * `tool -> Failed (will retry) -> read_file -> tool (retry) -> Success|Failed` instead.
 */
function collectFlows(analysis: RunAnalysis, includeRetries: boolean): Flow[] {
	const flows = new Map<string, Flow>();

	const count = (source: string, target: string): void => {
		// A structured key cannot be confused by node names that contain a separator.
		const key = JSON.stringify([source, target]);
		const existing = flows.get(key);
		if (existing) {
			existing.value++;
		} else {
			flows.set(key, { source, target, value: 1 });
		}
	};

	for (const conv of analysis.conversations) {
		for (const edit of conv.edits) {
			const toolNode = edit.toolName;

			if (includeRetries && !edit.success && edit.isRetry && edit.retrySucceeded !== undefined) {
				const failedNode = 'Failed (will retry)';
				const readFileNode = 'read_file';
				const retryEditNode = `${toolNode} (retry)`;

				count(toolNode, failedNode);
				count(failedNode, readFileNode);
				count(readFileNode, retryEditNode);
				count(retryEditNode, edit.retrySucceeded ? 'Success' : 'Failed');
				continue;
			}

			count(toolNode, edit.success ? 'Success' : 'Failed');
		}
	}

	return [...flows.values()];
}

/** Converts named flows into Sankey data: a node label list plus links that refer to nodes by index. */
function toSankeyData(flows: readonly Flow[]): { nodes: string[]; links: Array<{ source: number; target: number; value: number }> } {
	const nodes: string[] = [];
	const nodeIndexByName = new Map<string, number>();

	const getNodeIndex = (name: string): number => {
		let index = nodeIndexByName.get(name);
		if (index === undefined) {
			index = nodes.length;
			nodeIndexByName.set(name, index);
			nodes.push(name);
		}
		return index;
	};

	const links = flows.map(flow => ({
		source: getNodeIndex(flow.source),
		target: getNodeIndex(flow.target),
		value: flow.value
	}));

	return { nodes, links };
}

/** Text of the "Retry" table column for one edit. */
function describeRetry(edit: EditOperation): string {
	if (!edit.isRetry) {
		return '-';
	}
	if (edit.retrySucceeded === true) {
		return '✓ Retry Success';
	}
	if (edit.retrySucceeded === false) {
		return '✗ Retry Failed';
	}
	return 'Retry Pending';
}

/**
 * Renders the analysis as a self-contained, static HTML page.
 *
 * The page shows the headline numbers, the counted flows, and one table row per edit. The flow
 * data is additionally embedded as JSON (`#sankey-data`, `{ nodes, links }`) so a chart script
 * can consume it. All values taken from the analysis are HTML-escaped.
 *
 * NOTE(review): this markup is a dependency-free reconstruction; confirm it against the intended
 * page design (in particular how the Sankey chart should be drawn).
 *
 * @param analysis Aggregated run statistics to render.
 * @param outputPath Where the caller will write the page. Not used while rendering.
 * @param includeRetries Whether failed-then-retried edits are drawn as a retry chain in the flows.
 * @returns The HTML document.
 */
function generateHTML(analysis: RunAnalysis, outputPath: string, includeRetries: boolean = false): string {
	const flows = collectFlows(analysis, includeRetries);

	// "<" is escaped so that no value can terminate the surrounding <script> element.
	const sankeyJson = JSON.stringify(toSankeyData(flows)).replace(/</g, '\\u003c');

	const title = escapeHtml(`Run ${analysis.runId}${analysis.modelName ? ' - ' + analysis.modelName : ''}`);

	const flowRows = flows.map(flow => `
			<tr><td>${escapeHtml(flow.source)}</td><td>${escapeHtml(flow.target)}</td><td>${flow.value}</td></tr>`).join('');

	const editRows = analysis.conversations.flatMap(conv =>
		conv.edits.map(edit => `
			<tr>
				<td>${escapeHtml(path.basename(conv.conversationPath))}</td>
				<td><code>${escapeHtml(edit.toolName)}</code></td>
				<td>${edit.turnIndex}</td>
				<td>${escapeHtml(edit.filePath || '-')}</td>
				<td class="${edit.success ? 'success' : 'failure'}">${edit.success ? '✓ Success' : '✗ Failed'}</td>
				<td>${describeRetry(edit)}</td>
			</tr>`)
	).join('');

	return `<!DOCTYPE html>
<html lang="en">
<head>
	<meta charset="utf-8">
	<title>${title}</title>
	<style>
		body { font-family: sans-serif; margin: 2rem; }
		.subtitle { color: #666; }
		.stats { display: flex; gap: 1rem; margin: 1rem 0; }
		.stat { border: 1px solid #ddd; border-radius: 4px; padding: 0.75rem 1rem; }
		.stat .value { font-size: 1.5rem; font-weight: bold; }
		table { border-collapse: collapse; margin: 1rem 0; }
		th, td { border: 1px solid #ddd; padding: 0.25rem 0.5rem; text-align: left; }
		.success { color: #1a7f37; }
		.failure { color: #cf222e; }
	</style>
</head>
<body>
	<h1>🔧 ${title}</h1>
	<p class="subtitle">Analysis of edit tool operations and success rates</p>

	<div class="stats">
		<div class="stat"><div class="label">Total Edits</div><div class="value">${analysis.totalEdits}</div></div>
		<div class="stat"><div class="label">Success Rate</div><div class="value">${(analysis.successRate * 100).toFixed(1)}%</div></div>
		<div class="stat"><div class="label">Conversations</div><div class="value">${analysis.conversations.length}</div></div>
	</div>

	<label><input type="checkbox" disabled${includeRetries ? ' checked' : ''}> Include retries (show re-evaluate → retry flows)</label>

	<h2>Edit Flows</h2>
	<table>
		<thead>
			<tr><th>From</th><th>To</th><th>Count</th></tr>
		</thead>
		<tbody>${flowRows}
		</tbody>
	</table>
	<script type="application/json" id="sankey-data">${sankeyJson}</script>

	<h2>Edit Operations</h2>
	<table>
		<thead>
			<tr><th>Conversation</th><th>Tool</th><th>Turn</th><th>File</th><th>Status</th><th>Retry</th></tr>
		</thead>
		<tbody>${editRows}
		</tbody>
	</table>
</body>
</html>
`;
}

/**
 * Returns the value of a `--<name>=<value>` command line argument.
 *
 * Everything after the first `=` is the value, so values may themselves contain `=`.
 * A missing argument and an empty value both yield `undefined`.
 */
function readOption(args: readonly string[], name: string): string | undefined {
	const prefix = `--${name}=`;
	const value = args.find(arg => arg.startsWith(prefix))?.slice(prefix.length);
	return value ? value : undefined;
}

/**
 * Command line entry point.
 *
 * Options:
 *  - `--runId=<id>`       run to analyze; when absent the run is chosen interactively
 *  - `--basePath=<dir>`   directory containing the runs; defaults to `DEFAULT_BASE_PATH`
 *  - `--includeRetries`   draw failed-then-retried edits as retry chains in the flows
 *
 * Writes `edit-analysis.html` into the run directory. Exits with code 1 when no run is found.
 */
async function main(): Promise<void> {
	const args = process.argv.slice(2);
	const basePath = readOption(args, 'basePath') ?? path.join(DEFAULT_BASE_PATH);
	const includeRetries = args.includes('--includeRetries');

	let runId = readOption(args, 'runId');
	if (runId !== undefined) {
		console.log(`Using run ID: ${runId}`);
	} else {
		const runs = await listRuns(basePath);
		if (runs.length === 0) {
			console.error('No test runs found in', basePath);
			process.exit(1);
		}
		runId = await promptUserForRun(runs);
		console.log(`Selected run: ${runId}`);
	}

	console.log('\nAnalyzing run...');
	const analysis = await analyzeRun(runId, basePath);

	console.log(`\nFound ${analysis.conversations.length} conversations with edits`);
	console.log(`Total edits: ${analysis.totalEdits}`);
	console.log(`Success rate: ${(analysis.successRate * 100).toFixed(1)}%`);

	const outputPath = path.join(basePath, runId, 'edit-analysis.html');
	const html = generateHTML(analysis, outputPath, includeRetries);

	await fs.writeFile(outputPath, html, 'utf-8');
	console.log(`\n✓ Analysis complete! Generated: ${outputPath}`);
}

main().catch(console.error);