mirror of
https://github.com/microsoft/vscode.git
synced 2026-07-03 13:06:06 +01:00
6bd7400f1c
* utils: document binarySearch
* nes-datagen: generate training data from continuous recordings
Continuous enhanced telemetry now ships sliding-window recordings that, unlike per-request alternative-action recordings, carry no requestTime. The datagen pipeline needs a point to split each recording into edit history before/after, so this adds a pluggable pivot strategy (starting with Random, selectable via --pivot-strategy) and a new continuous/ pipeline module that replays a recording at the chosen pivot to produce a processed row.
Along the way this consolidates the pipeline's error and index handling: a shared WithRowIndex<T> replaces the ad-hoc { originalRowIndex, ... } pairs, per-record processing returns Result<IProcessedRow, Error> instead of field-presence unions, and failures surface as original Error objects (no string round-tripping). The telemetry sender's continuous payload is now the documented IContinuousRecording type.
Co-authored-by: Copilot App <223556219+Copilot@users.noreply.github.com>
* nes-datagen: label alt-action replay errors by originalRowIndex
Address PR review: the alternative-action path mislabeled diagnostics when
earlier records failed to parse.
- processAllRows: push replay errors with the row's true `originalRowIndex`
instead of its position in the filtered `rows` array (parse failures make
`rows` sparse, so the two diverge).
- loadAndProduceProcessedRows: resolve `languageForRow` via an
`originalRowIndex`-keyed Map rather than positional `rows[i]`, matching how
callers pass `e.originalRowIndex`.
- Clarify the `recordCount` doc: it counts successfully-parsed records (parse
failures are counted separately in `parseErrors`).
- Add a regression spec asserting replay errors carry the row index, not the
array position.
Co-authored-by: Copilot App <223556219+Copilot@users.noreply.github.com>
---------
Co-authored-by: Copilot App <223556219+Copilot@users.noreply.github.com>
468 lines
23 KiB
TypeScript
468 lines
23 KiB
TypeScript
/*---------------------------------------------------------------------------------------------
 *  Copyright (c) Microsoft Corporation. All rights reserved.
 *  Licensed under the MIT License. See License.txt in the project root for license information.
 *--------------------------------------------------------------------------------------------*/
|
|
import minimist from 'minimist';
|
|
import { EmbeddingType } from '../../src/platform/embeddings/common/embeddingsComputer';
|
|
import { CacheMode } from './simulationContext';
|
|
|
|
/**
 * Number of runs that are stored in baseline.json.
 *
 * Also used as the run count when `--update-baseline` or `--ci` is passed
 * without an explicit numeric `--n`.
 */
export const BASELINE_RUN_COUNT = 10;
|
|
|
|
/**
 * Which training target the nes-datagen subcommand generates
 * (selected with `--sample-task`, defaults to {@link NesDatagenSampleTask.Xtab}).
 */
export enum NesDatagenSampleTask {
	/** Edit-prediction sample (assistant = an edit). */
	Xtab = 'xtab',
	/** Next-cursor-line sample restricted to the active file. */
	CursorSameFile = 'cursor-same-file',
	/** Next-cursor-line sample for a jump to another file. */
	CursorCrossFile = 'cursor-cross-file',
	/** Tries same-file first, falls back to cross-file (one sample per row). */
	CursorBoth = 'cursor-both',
}
|
|
|
|
/**
 * Shape of the recordings in the nes-datagen input file
 * (selected with `--input-format`, defaults to
 * {@link NesDatagenInputFormat.AlternativeAction}).
 */
export enum NesDatagenInputFormat {
	/** Per-request "alternative action" recordings bookmarked at the NES request time. */
	AlternativeAction = 'alternative-action',
	/** Continuous enhanced-telemetry slices with no request bookmark; a pivot is synthesized. */
	Continuous = 'continuous',
}
|
|
|
|
/**
 * How to choose the pivot in a continuous recording (only meaningful when
 * `--input-format=continuous`). The pivot splits the timeline into context and
 * the oracle (next user edit).
 *
 * Selected with `--pivot-strategy`; defaults to {@link PivotStrategy.Random}.
 */
export enum PivotStrategy {
	/** Pick a single eligible pivot uniformly at random. */
	Random = 'random',
}
|
|
|
|
/**
 * Options of the `nes-datagen` subcommand, as resolved from the command line.
 */
export type NesDatagen = {
	/** Value of `--input`: path to the file with the training data recordings. */
	readonly input: string;
	/** Value of `--out`: output path, or `undefined` when the flag is omitted. */
	readonly output: string | undefined;
	/** Value of `--row-offset`; 0 when the flag is omitted or not numeric. */
	readonly rowOffset: number;
	/** Value of `--worker`; false when the flag is omitted. */
	readonly workerMode: boolean;
	/** Which training target to generate (`--sample-task`). */
	readonly sampleTask: NesDatagenSampleTask;
	/** Shape of the input recordings. */
	readonly inputFormat: NesDatagenInputFormat;
	/** Pivot selection strategy for continuous recordings. Ignored for alternative-action input. */
	readonly pivotStrategy: PivotStrategy;
	/**
	 * Seed for the continuous pivot RNG. Resolved once (random when `--seed` is
	 * omitted) so it can be propagated to all parallel workers for reproducible
	 * output. Ignored for alternative-action input.
	 */
	readonly seed: number;
	/** Minimum same-file lines above the request cursor for a move to count as a jump. */
	readonly sameFileJumpMinAbove: number;
	/** Minimum same-file lines below the request cursor for a move to count as a jump. */
	readonly sameFileJumpMinBelow: number;
};
|
|
|
|
/**
 * Command-line options of the simulation runner, parsed once with `minimist`
 * and exposed as read-only fields.
 *
 * Instances are created through {@link SimulationOptions.fromProcessArgs} or
 * {@link SimulationOptions.fromArray}; the constructor throws an `Error` when
 * an option has an invalid value or an invalid combination of options is used.
 */
export class SimulationOptions {

	/** Parses the options of the current process (`process.argv`). */
	public static fromProcessArgs(): SimulationOptions {
		return new SimulationOptions(process.argv);
	}

	/**
	 * Parses the options from an explicit argument vector.
	 *
	 * @param argv Argument vector in `process.argv` layout: the first two
	 * entries are skipped, the options start at index 2.
	 */
	public static fromArray(argv: readonly string[]): SimulationOptions {
		return new SimulationOptions(argv);
	}

	/** Raw `minimist` parse result the fields below are derived from. */
	private readonly argv: minimist.ParsedArgs;

	/** `--help` */
	public readonly help: boolean;
	/** `--list-models` */
	public readonly listModels: boolean;
	/** `--list-tests` */
	public readonly listTests: boolean;
	/** `--list-suites` */
	public readonly listSuites: boolean;
	/** `--json` */
	public readonly jsonOutput: boolean;
	/** `--n`: number of times each scenario is run. */
	public readonly nRuns: number;
	/** `--model` */
	public readonly chatModel: string | undefined;
	/** `--smart-model` */
	public readonly smartChatModel: string | undefined;
	/** `--fast-model` */
	public readonly fastChatModel: string | undefined;
	/** `--fast-rewrite-model` */
	public readonly fastRewriteModel: string | undefined;
	/** `--summarize-history` (defaults to true) */
	public readonly summarizeHistory: boolean;
	/** `--swebench-prompt` */
	public readonly swebenchPrompt: boolean;
	/** `--embedding-model`, resolved to a well-known embedding type. */
	public readonly embeddingType: EmbeddingType | undefined;
	/** `--boost` */
	public readonly boost: boolean;
	/** `--parallelism` / `-p` (defaults to 20) */
	public readonly parallelism: number;
	/** Disabled by `--skip-cache`, required by `--require-cache`. */
	public readonly lmCacheMode: CacheMode;
	/** Disabled by `--skip-model-cache`. */
	public readonly modelCacheMode: CacheMode;
	/** Disabled by `--skip-resources-cache`. */
	public readonly resourcesCacheMode: CacheMode;
	/** `--cache-location` */
	public readonly cachePath: string | undefined;
	/** `--external-baseline`; only valid together with `--external-scenarios`. */
	public readonly externalBaseline: string | undefined;
	/** `--external-scenarios` */
	public readonly externalScenarios: string | undefined;
	/** `--output` */
	public readonly output: string | undefined;
	/** `--inline` */
	public readonly inline: boolean;
	/** `--sidebar` */
	public readonly sidebar: boolean;
	/** `--apply-chat-code-blocks` */
	public readonly applyChatCodeBlocks: boolean;
	/** `--stage-cache-entries` */
	public readonly stageCacheEntries: boolean;
	/** `--ci` */
	public readonly ci: boolean;
	/** `--gc` */
	public readonly gc: boolean;
	/** `--external-cache-layers-path` */
	public readonly externalCacheLayersPath: string | undefined;
	/** `--verbose` */
	public readonly verbose: number | boolean | undefined;
	/** `--grep` */
	public readonly grep: string[] | string | undefined;
	/** `--omit-grep` */
	public readonly omitGrep: string | undefined;
	/** `--heap-snapshots` */
	public readonly heapSnapshots: boolean | string | undefined;
	/** --scenario-test, --scenarioTest Run tests from provided scenario test file name */
	public readonly scenarioTest: string | undefined;
	/** `--update-baseline` / `-u` */
	public readonly isUpdateBaseline: boolean;
	/** True when `--no-fetch` was passed. */
	public readonly noFetch: boolean;
	/** True when `--no-cache-pointer` was passed. */
	public readonly noCachePointer: boolean;
	/**
	 * A label for the current simulation run, to be displayed in the UI for distinguishing between runs.
	 */
	public readonly label: string;
	/** `--runServerPoweredNesProvider` */
	public readonly runServerPoweredNesProvider: boolean;
	/** `--nes`; a bare `--nes` is treated as `'external'`. */
	public readonly nes: 'external' | 'coffe' | undefined;
	/** `--nes-url`; must be combined with `--nes-api-key`. */
	public readonly nesUrl: string | undefined;
	/** `--nes-api-key`; must be combined with `--nes-url`. */
	public readonly nesApiKey: string | undefined;

	/** Options of the `nes-datagen` subcommand; set only when the subcommand and `--input` are both present. */
	public readonly nesDatagen: NesDatagen | undefined;

	/** The subcommand found among the positional arguments, if any. */
	public readonly subcommand: 'nes-datagen' | undefined;

	/** `--disable-tools`: comma-separated list of tools to disable. */
	public readonly disabledTools: Set<string>;

	/** If true, all tests are run in the extension host */
	public readonly inExtensionHost: boolean;
	/** Extensions to ensure are available in the extension host */
	public readonly installExtensions: string[];
	/** Whether to run headless (defaults to true) */
	public readonly headless: boolean;
	/** @internal Only run a single test number */
	public readonly runNumber: number;
	/** `--scenario-workspace-folder`: if true, runs the stest in the scenario's workspace folder */
	public readonly useScenarioWorkspace: boolean;

	/** If true, will try to use code search using our service. */
	public readonly useExperimentalCodeSearchService: boolean;

	/** `--config-file` */
	public readonly configFile: string | undefined;

	/** `--model-config-file` */
	public readonly modelConfigFile: string | undefined;

	/**
	 * Path to a JSON file describing an adhoc chat request to send (used by the
	 * simulation workbench "Adhoc request sender" mode). The file contains
	 * `{ system: string; user: string; model: string }`.
	 */
	public readonly adhocRequestFile: string | undefined;

	/**
	 * @param processArgv Argument vector in `process.argv` layout; the first
	 * two entries are dropped before parsing.
	 * @throws Error when an option value or option combination is invalid.
	 */
	protected constructor(processArgv: readonly string[]) {
		const argv = minimist(processArgv.slice(2));
		this.argv = argv;

		this.help = boolean(argv['help'], false);
		this.listModels = boolean(argv['list-models'], false);
		this.listTests = boolean(argv['list-tests'], false);
		this.listSuites = boolean(argv['list-suites'], false);
		this.jsonOutput = boolean(argv['json'], false);
		this.isUpdateBaseline = boolean(argv['update-baseline'] ?? argv['u'], false);
		this.boost = boolean(argv['boost'], false);
		// `--no-fetch` becomes argv[`fetch`] because of how minimist works
		this.noFetch = !boolean(argv['fetch'], true);
		// `--no-cache-pointer` becomes argv[`cache-pointer`] because of how minimist works
		this.noCachePointer = !boolean(argv['cache-pointer'], true);
		// An explicit numeric --n always wins over the baseline/CI run count.
		this.nRuns = SimulationOptions.numberOption(
			argv['n'],
			(this.isUpdateBaseline || argv['ci'] ? BASELINE_RUN_COUNT : 10)
		);

		this.chatModel = argv['model'];
		this.smartChatModel = argv['smart-model'];
		this.fastChatModel = argv['fast-model'];
		this.fastRewriteModel = argv['fast-rewrite-model'];
		this.summarizeHistory = boolean(argv['summarize-history'], true);
		this.swebenchPrompt = boolean(argv['swebench-prompt'], false);
		this.embeddingType = cliOptionsToWellKnownEmbeddingsType(argv['embedding-model']);
		this.parallelism = argv['parallelism'] ?? argv['p'] ?? 20;

		this.modelCacheMode = argv['skip-model-cache'] ? CacheMode.Disable : CacheMode.Default;
		// --skip-cache takes precedence over --require-cache.
		if (argv['skip-cache']) {
			this.lmCacheMode = CacheMode.Disable;
		} else if (argv['require-cache']) {
			this.lmCacheMode = CacheMode.Require;
		} else {
			this.lmCacheMode = CacheMode.Default;
		}
		this.resourcesCacheMode = argv['skip-resources-cache'] ? CacheMode.Disable : CacheMode.Default;

		this.externalScenarios = argv['external-scenarios'];
		this.externalBaseline = argv['external-baseline']; // must be set after `externalScenarios`
		this.validateExternalBaseline();

		this.output = argv['output'];
		this.cachePath = argv['cache-location'];
		this.inline = boolean(argv['inline'], false);
		this.sidebar = boolean(argv['sidebar'], false);
		this.applyChatCodeBlocks = boolean(argv['apply-chat-code-blocks'], false);
		this.stageCacheEntries = boolean(argv['stage-cache-entries'], false);
		this.ci = boolean(argv['ci'], false);
		this.gc = boolean(argv['gc'], false);
		this.externalCacheLayersPath = argv['external-cache-layers-path'];
		this.verbose = argv['verbose'];
		this.grep = argv['grep'];
		this.omitGrep = argv['omit-grep'];
		this.heapSnapshots = argv['heap-snapshots'];
		this.scenarioTest = argv['scenarioTest'] ?? argv['scenario-test'];
		this.label = argv['label'] ?? '';

		this.inExtensionHost = boolean(argv['in-extension-host'], false);
		this.installExtensions = SimulationOptions.parseCommaSeparatedList('--install-extension', argv['install-extension']);
		this.headless = boolean(argv['headless'], true);
		this.runNumber = Number(argv['run-number']) || 0;

		this.runServerPoweredNesProvider = boolean(argv['runServerPoweredNesProvider'], false);

		this.nes = SimulationOptions.validateNesArgument(argv['nes']);

		this.nesUrl = argv['nes-url'];
		// [SuppressMessage("Microsoft.Security", "CS002:SecretInNextLine", Justification="used for local simulation tests")]
		this.nesApiKey = argv['nes-api-key'];
		SimulationOptions.validateNesUrlOverride(this.nesUrl, this.nesApiKey);

		this.disabledTools = new Set(SimulationOptions.parseCommaSeparatedList('--disable-tools', argv['disable-tools']));
		this.useScenarioWorkspace = boolean(argv['scenario-workspace-folder'], false);

		this.useExperimentalCodeSearchService = boolean(argv['use-experimental-code-search-service'], false);

		// The subcommand is a positional argument; its options are only
		// materialized when `--input` is present as well.
		const isNesDatagen = (argv._ as string[]).includes('nes-datagen');
		this.subcommand = isNesDatagen ? 'nes-datagen' : undefined;
		this.nesDatagen = isNesDatagen && argv['input']
			? {
				input: argv['input'],
				output: argv['out'],
				rowOffset: SimulationOptions.numberOption(argv['row-offset'], 0),
				workerMode: boolean(argv['worker'], false),
				sampleTask: SimulationOptions.validateSampleTask(argv['sample-task']),
				inputFormat: SimulationOptions.validateInputFormat(argv['input-format']),
				pivotStrategy: SimulationOptions.validatePivotStrategy(argv['pivot-strategy']),
				seed: SimulationOptions.resolveSeed(argv['seed']),
				sameFileJumpMinAbove: SimulationOptions.numberOption(argv['same-file-jump-min-above'], 2),
				sameFileJumpMinBelow: SimulationOptions.numberOption(argv['same-file-jump-min-below'], 5),
			}
			: undefined;

		this.configFile = argv['config-file'];
		this.modelConfigFile = argv['model-config-file'];
		this.adhocRequestFile = argv['adhoc-request-file'];
	}

	/** Prints the usage text of the simulation runner to the console. */
	public printHelp(): void {
		console.log([
			`Example usages: `,
			` npm run simulate`,
			` npm run simulate -- --external-scenarios=<path> --inline --output=<path>`,
			` npm run simulate -- --external-scenarios=<path> --sidebar --output=<path>`,
			` npm run simulate -- --external-scenarios=<path> --nes --output=<path>`,
			` npm run simulate -- --update-baseline`,
			``,
			` -u, --update-baseline Updates scores in baseline.json if they change as a result of your changes to prompts sent to the model`,
			` --external-scenarios Path to a directory containing scenarios to run`,
			` --inline Run inline chat external scenarios`,
			` --sidebar Run sidebar chat external scenarios`,
			` --nes Run NES external scenarios`,
			` --output Path to a directory where to generate output`,
			` --n Run each scenario N times`,
			` --ci Equivalent to --n=${BASELINE_RUN_COUNT} but throws if the baseline is not up-to-date`,
			` --gc Used with --require-cache to compact cache layers into the baseline cache`,
			` --external-cache-layers-path Used to specify the path to the external cache layers`,
			` --grep Run a test which contains the passed-in string`,
			` --omit-grep Run a test which does not contain the passed-in string`,
			` --embedding-model Specify the model to use for the embedding endpoint (default: ada)`,
			` Values: text3small, metis`,
			` --list-models List available chat models`,
			` --model Specify the model to use for the chat endpoint (use --list-models to see valid options)`,
			` --smart-model Specify the model to use in place of the smarter slower model, i.e GPT 4o`,
			` --fast-model Specify the model to use in place of the faster / less smart model, i.e GPT 4o mini`,
			` --fast-rewrite-model [experimental] Specify the model to use for the fast rewrite endpoint`,
			` -p, --parallelism [experimental] Run tests in parallel (default: 20)`,
			` --skip-cache [experimental] Do not use the cache for language model requests`,
			` --require-cache [experimental] Require cache hits, fail on cache misses`,
			` --regenerate-cache [experimental] Fetch all responses and refresh the cache`,
			` --skip-resources-cache [experimental] Do not use the cache for computed resources`,
			` --skip-model-cache [experimental] Do not use the cache for model metadata`,
			` --stage-cache-entries [experimental] Stage cache files that were used in current simulation run`,
			` --list-tests List tests without running them`,
			` --json Print output in JSONL format`,
			` --verbose Print more information about test and assertion failures`,
			` --scenario-test Run tests from provided scenario test file name, e.g., 'docComment.stest' or 'docComment.stest.ts' (--scenarioTest is supported but will be deprecated in future)`,
			` --no-fetch Do not send requests to the model endpoint (uses cache but doesn't write to it) (useful to make sure prompts are unchanged by observing cache misses)`,
			` --no-cache-pointer [experimental] Do not write files to outcome/`,
			` --label A label for the current simulation run, to be displayed in the UI for distinguishing between runs`,
			` --nes-url To override endpoint URL for NES (must be used with --nes-api-key)`,
			` --nes-api-key API key for endpoint URL provided via NES (must be used with --nes-url)`,
			` --runServerPoweredNesProvider Run stests against the http server powered NES provider (server must be run at port 8001)`,
			` --disable-tools A comma-separated list of tools to disable`,
			` --swebench-prompt Use the headless swebench prompt for agent mode`,
			` --summarize-history Enable experimental conversation history summarization in agent mode`,
			` --scenario-workspace-folder If true, runs the stest inline in the scenario's workspace folder`,
			` --config-file Path to a JSON file containing configuration options`,
			` --model-config-file Path to a JSON file containing model configuration options`,
			``,
			`Subcommands:`,
			` nes-datagen Generate training data from alternative action recordings`,
			` Run 'npm run simulate -- nes-datagen --help' for options`,
			``,
		].join('\n'));
	}

	/** Prints the usage text of the `nes-datagen` subcommand to the console. */
	public printTrainHelp(): void {
		console.log([
			`Usage: npm run simulate -- --config-file=<path> [global options] nes-datagen --input=<path> [options]`,
			``,
			`Generate training data by replaying alternative action recordings through the NES prompt pipeline.`,
			`The prompting strategy is read from the model configuration in --config-file.`,
			``,
			`Options:`,
			` --input Path to a JSON or JSON Lines file with training data recordings (required)`,
			` Format is inferred from the extension: .jsonl/.ndjson → JSON Lines, otherwise JSON array`,
			` --out Output path for the JSON Lines file. Default: <input-path>_output.jsonl`,
			` --input-format Shape of the input recordings (default: alternative-action)`,
			` Values: alternative-action, continuous`,
			` alternative-action → per-request recordings bookmarked at the NES request time`,
			` continuous → continuous enhanced-telemetry slices; a pivot is synthesized`,
			` --pivot-strategy How to pick the pivot in a continuous recording (default: random; only for --input-format=continuous)`,
			` Values: random`,
			` random → pick a single eligible pivot uniformly at random`,
			` --seed Integer seed for the continuous pivot RNG (default: random, logged for reproducibility)`,
			` --sample-task Which target to generate (default: xtab)`,
			` Values: xtab, cursor-same-file, cursor-cross-file, cursor-both`,
			` xtab → edit-prediction sample (assistant = an edit)`,
			` cursor-same-file → next-cursor-line sample restricted to the active file`,
			` cursor-cross-file → next-cursor-line sample for a jump to another file`,
			` cursor-both → tries same-file first, falls back to cross-file (one sample per row)`,
			` --same-file-jump-min-above Minimum lines above request cursor for a same-file move to count as a jump (default: 2)`,
			` --same-file-jump-min-below Minimum lines below request cursor for a same-file move to count as a jump (default: 5)`,
			``,
			`Global options (placed before 'nes-datagen'):`,
			` --config-file Path to a JSON config file (required for nes-datagen)`,
			` Must include "github.copilot.chat.inlineEdits.xtabProvider.modelConfiguration"`,
			` with at least { "modelName", "promptingStrategy", "includeTagsInCurrentFile" }`,
			` -p, --parallelism Number of parallel workers (default: 20)`,
			` --verbose Print detailed progress and error information`,
			` --help Show this help message`,
			``,
			`Examples:`,
			` npm run simulate -- --config-file=config.json nes-datagen --input=data.json`,
			` npm run simulate -- --config-file=config.json --parallelism=10 --verbose nes-datagen --input=data.json`,
			` npm run simulate -- --config-file=config.json nes-datagen --input=data.json --sample-task=cursor-same-file`,
			` npm run simulate -- --config-file=config.json nes-datagen --input=data.json --sample-task=cursor-cross-file`,
			` npm run simulate -- --config-file=config.json nes-datagen --input=data.json --sample-task=cursor-both --same-file-jump-min-above=8 --same-file-jump-min-below=8`,
			` npm run simulate -- --config-file=config.json nes-datagen --input=continuous.jsonl --input-format=continuous`,
			` npm run simulate -- --config-file=config.json nes-datagen --input=continuous.jsonl --input-format=continuous --pivot-strategy=random --seed=42`,
			``,
		].join('\n'));
	}

	/**
	 * Ensures `--external-baseline` is only used together with `--external-scenarios`.
	 *
	 * @throws Error when a baseline is given without scenarios.
	 */
	private validateExternalBaseline() {
		if (this.externalBaseline && !this.externalScenarios) {
			throw new Error('External scenarios must be provided for external baseline to work.');
		}
	}

	/**
	 * Returns `value` when minimist parsed it as a number, `fallback` otherwise.
	 */
	private static numberOption(value: unknown, fallback: number): number {
		return typeof value === 'number' ? value : fallback;
	}

	/**
	 * Splits the value of a comma-separated list option into its entries.
	 *
	 * @param flag Flag name used in the error message, e.g. `--disable-tools`.
	 * @param value Raw minimist value. Falsy values yield an empty list; an
	 * array (the flag was repeated) is flattened entry by entry.
	 * @throws Error when the value is neither falsy, a string, nor an array of those.
	 */
	private static parseCommaSeparatedList(flag: string, value: unknown): string[] {
		if (!value) {
			return [];
		}
		if (Array.isArray(value)) {
			return value.flatMap(entry => SimulationOptions.parseCommaSeparatedList(flag, entry));
		}
		if (typeof value !== 'string') {
			// e.g. a bare `--flag` is parsed as `true`; report it instead of failing on `.split`
			throw new Error(`${flag} must be a comma-separated string, but got: ${typeof value}`);
		}
		return value.split(',');
	}

	/**
	 * Validates the value of `--nes`.
	 *
	 * @returns `undefined` when the flag is absent, `'external'` for a bare
	 * boolean flag, otherwise the given provider name.
	 * @throws Error when the value is not a string or not a known provider.
	 */
	private static validateNesArgument(nes: unknown): 'external' | 'coffe' | undefined {
		if (nes === undefined || nes === null) {
			return undefined;
		}
		// Backward compatibility: it used to be possible to just pass `--nes`
		// to run external stests against NES.
		if (typeof nes === 'boolean') {
			return 'external';
		}
		if (typeof nes !== 'string') {
			throw new Error(`--nes must be a string, but got: ${typeof nes}`);
		}
		if (nes === 'external' || nes === 'coffe') {
			return nes;
		}
		throw new Error(`--nes can only be 'external' or 'coffe', but got: ${nes}`);
	}

	/**
	 * Ensures `--nes-url` and `--nes-api-key` are either both set or both omitted.
	 *
	 * @throws Error when exactly one of the two is provided.
	 */
	private static validateNesUrlOverride(nesUrl: string | undefined, nesApiKey: string | undefined): void {
		if (nesUrl !== undefined && nesApiKey === undefined) {
			throw new Error(`--nes-api-key must be provided when --nes-url is set`);
		}
		if (nesUrl === undefined && nesApiKey !== undefined) {
			throw new Error(`--nes-url must be provided when --nes-api-key is set`);
		}
	}

	/**
	 * Shared validation of string-enum options.
	 *
	 * @param flag Flag name used in error messages.
	 * @param value Raw minimist value.
	 * @param allowed Accepted values.
	 * @param fallback Value returned when the flag is absent.
	 * @throws Error when the value is not a string or not one of `allowed`.
	 */
	private static validateEnumOption<T extends string>(flag: string, value: unknown, allowed: readonly T[], fallback: T): T {
		if (value === undefined || value === null) {
			return fallback;
		}
		if (typeof value !== 'string') {
			throw new Error(`${flag} must be a string, but got: ${typeof value}`);
		}
		const match = allowed.find(candidate => candidate === value);
		if (match === undefined) {
			throw new Error(`${flag} must be one of [${allowed.join(', ')}], but got: ${value}`);
		}
		return match;
	}

	/** Validates `--sample-task`; defaults to `xtab`. */
	private static validateSampleTask(value: unknown): NesDatagenSampleTask {
		return SimulationOptions.validateEnumOption('--sample-task', value, Object.values(NesDatagenSampleTask), NesDatagenSampleTask.Xtab);
	}

	/** Validates `--input-format`; defaults to `alternative-action`. */
	private static validateInputFormat(value: unknown): NesDatagenInputFormat {
		return SimulationOptions.validateEnumOption('--input-format', value, Object.values(NesDatagenInputFormat), NesDatagenInputFormat.AlternativeAction);
	}

	/** Validates `--pivot-strategy`; defaults to `random`. */
	private static validatePivotStrategy(value: unknown): PivotStrategy {
		return SimulationOptions.validateEnumOption('--pivot-strategy', value, Object.values(PivotStrategy), PivotStrategy.Random);
	}

	/**
	 * Resolve the continuous pivot seed. When `--seed` is omitted a random
	 * 32-bit seed is generated so that the parent can log it and propagate it to
	 * every worker, keeping output reproducible.
	 *
	 * @returns The seed as an unsigned 32-bit integer.
	 * @throws Error when the value is present but not an integer number.
	 */
	private static resolveSeed(value: unknown): number {
		if (value === undefined || value === null) {
			return Math.floor(Math.random() * 0x100000000);
		}
		if (typeof value !== 'number' || !Number.isInteger(value)) {
			throw new Error(`--seed must be an integer, but got: ${value}`);
		}
		// `>>> 0` wraps the integer into the unsigned 32-bit range.
		return value >>> 0;
	}
}
|
|
|
|
/**
 * Maps the value of `--embedding-model` to a well-known embedding type.
 *
 * Accepts the short aliases `text3small` and `metis` as well as the `id` of
 * the corresponding {@link EmbeddingType}.
 *
 * @param model Raw option value; `undefined` when the flag is omitted.
 * @returns The matching embedding type, or `undefined` when no model was given.
 * @throws Error when the model is not one of the accepted values.
 */
function cliOptionsToWellKnownEmbeddingsType(model: string | undefined): EmbeddingType | undefined {
	switch (model) {
		case undefined:
			return undefined;

		case 'text3small':
		case EmbeddingType.text3small_512.id:
			return EmbeddingType.text3small_512;

		case 'metis':
		case EmbeddingType.metis_1024_I16_Binary.id:
			return EmbeddingType.metis_1024_I16_Binary;

		default:
			throw new Error(`Unknown embedding model: ${model}`);
	}
}
|
|
|
|
/**
 * Interprets a raw command-line value as a boolean flag.
 *
 * @param value Raw option value as produced by the argument parser.
 * @param defaultValue Result when the option is absent (`undefined`).
 * @returns `defaultValue` for `undefined`, `false` for the string `'false'`,
 * otherwise the truthiness of `value`.
 */
function boolean(value: unknown, defaultValue: boolean): boolean {
	if (value === undefined) {
		return defaultValue;
	}
	// treat the string 'false' as false (a non-empty string would otherwise be truthy)
	if (value === 'false') {
		return false;
	}
	return Boolean(value);
}
|