Add basic github lexical search

Fixes #312210

This is using the old search endpoint. We'll start switching over the new one once it's ready

Co-authored-by: Copilot <copilot@github.com>
This commit is contained in:
Matt Bierner
2026-04-23 14:16:49 -07:00
parent f84260a508
commit cf82f857de
13 changed files with 492 additions and 48 deletions
@@ -13,19 +13,20 @@ import { URI } from '../../../util/vs/base/common/uri';
import { Range } from '../../../util/vs/editor/common/core/range';
import { createDecorator, IInstantiationService } from '../../../util/vs/platform/instantiation/common/instantiation';
import { IAuthenticationService } from '../../authentication/common/authentication';
import { FileChunkAndScore } from '../../chunking/common/chunk';
import { FileChunk, FileChunkAndScore } from '../../chunking/common/chunk';
import { stripChunkTextMetadata, truncateToMaxUtf8Length } from '../../chunking/common/chunkingStringUtils';
import { EmbeddingType } from '../../embeddings/common/embeddingsComputer';
import { ICAPIClientService } from '../../endpoint/common/capiClient';
import { IEnvService } from '../../env/common/envService';
import { GithubRepoId, toGithubNwo } from '../../git/common/gitService';
import { makeGitHubAPIRequest } from '../../github/common/githubAPI';
import { getGithubMetadataHeaders } from '../../github/common/githubApiFetcherService';
import { IIgnoreService } from '../../ignore/common/ignoreService';
import { ILogService } from '../../log/common/logService';
import { Response } from '../../networking/common/fetcherService';
import { IFetcherService, Response } from '../../networking/common/fetcherService';
import { postRequest } from '../../networking/common/networking';
import { ITelemetryService } from '../../telemetry/common/telemetry';
import { CodeSearchOptions, CodeSearchResult, RemoteCodeSearchError, RemoteCodeSearchIndexState, RemoteCodeSearchIndexStatus } from './remoteCodeSearch';
import { CodeSearchOptions, LexicalCodeSearchResult, RemoteCodeSearchError, RemoteCodeSearchIndexState, RemoteCodeSearchIndexStatus, SemanticCodeSearchResult } from './remoteCodeSearch';
interface ResponseShape {
@@ -46,6 +47,7 @@ type SemanticSearchResult = {
location: {
path: string; // file path
commit_sha: string;
ref_name: string;
repo: {
nwo: string;
url: string;
@@ -54,11 +56,19 @@ type SemanticSearchResult = {
};
export interface GithubCodeSearchRepoInfo {
readonly kind: 'repo';
readonly githubRepoId: GithubRepoId;
readonly localRepoRoot: URI | undefined;
readonly indexedCommit: string | undefined;
}
export interface GithubCodeSearchOrgInfo {
readonly kind: 'org';
readonly org: string;
}
export type GithubCodeSearchScope = GithubCodeSearchRepoInfo | GithubCodeSearchOrgInfo;
export const IGithubCodeSearchService = createDecorator('IGithubCodeSearchService');
export interface IGithubCodeSearchService {
@@ -89,16 +99,29 @@ export interface IGithubCodeSearchService {
*
* The repo must have been indexed first. Make sure to check {@link getRemoteIndexState} or call {@link triggerIndexing}.
*/
searchRepo(
semanticSearch(
authOptions: { readonly silent: boolean },
embeddingType: EmbeddingType,
repo: GithubCodeSearchRepoInfo,
scope: GithubCodeSearchRepoInfo,
query: string,
maxResults: number,
options: CodeSearchOptions,
telemetryInfo: TelemetryCorrelationId,
token: CancellationToken,
): Promise<CodeSearchResult>;
): Promise<SemanticCodeSearchResult>;
/**
* Lexical searches a given github repo or org for relevant code snippets
*/
lexicalSearch(
authOptions: { readonly silent: boolean },
scope: GithubCodeSearchScope,
query: string,
maxResults: number,
options: CodeSearchOptions,
telemetryInfo: TelemetryCorrelationId,
token: CancellationToken,
): Promise<LexicalCodeSearchResult>;
}
export class GithubCodeSearchService implements IGithubCodeSearchService {
@@ -109,6 +132,7 @@ export class GithubCodeSearchService implements IGithubCodeSearchService {
@IAuthenticationService private readonly _authenticationService: IAuthenticationService,
@ICAPIClientService private readonly _capiClientService: ICAPIClientService,
@IEnvService private readonly _envService: IEnvService,
@IFetcherService private readonly _fetcherService: IFetcherService,
@IIgnoreService private readonly _ignoreService: IIgnoreService,
@ILogService private readonly _logService: ILogService,
@ITelemetryService private readonly _telemetryService: ITelemetryService,
@@ -252,7 +276,7 @@ export class GithubCodeSearchService implements IGithubCodeSearchService {
return Result.ok(true);
}
async searchRepo(
async semanticSearch(
auth: { readonly silent: boolean },
embeddingType: EmbeddingType,
repo: GithubCodeSearchRepoInfo,
@@ -261,7 +285,7 @@ export class GithubCodeSearchService implements IGithubCodeSearchService {
options: CodeSearchOptions,
telemetryInfo: TelemetryCorrelationId,
token: CancellationToken
): Promise<CodeSearchResult> {
): Promise<SemanticCodeSearchResult> {
const authToken = await this.getGithubAccessToken(auth.silent);
if (!authToken) {
throw new Error('No valid auth token');
@@ -342,6 +366,80 @@ export class GithubCodeSearchService implements IGithubCodeSearchService {
return result;
}
async lexicalSearch(
auth: { readonly silent: boolean },
scope: GithubCodeSearchScope,
query: string,
maxResults: number,
options: CodeSearchOptions,
telemetryInfo: TelemetryCorrelationId,
token: CancellationToken
): Promise<LexicalCodeSearchResult> {
const authToken = await this.getGithubAccessToken(auth.silent);
if (!authToken) {
throw new Error('No valid auth token');
}
const scopeQualifier = scope.kind === 'org' ? `org:${scope.org}` : `repo:${toGithubNwo(scope.githubRepoId)}`;
const searchQuery = `${query} ${scopeQualifier}`;
const routeSlug = `search/code?q=${encodeURIComponent(searchQuery)}&per_page=${maxResults}`;
const body = await raceCancellationError(makeGitHubAPIRequest(
this._fetcherService,
this._logService,
this._telemetryService,
this._capiClientService.dotcomAPIURL,
routeSlug,
'GET',
authToken,
{
accept: 'application/vnd.github.text-match+json',
additionalHeaders: getGithubMetadataHeaders(telemetryInfo.callTracker, this._envService),
callSite: 'github-code-search-lexical',
},
), token);
if (!body) {
/* __GDPR__
"githubCodeSearch.lexicalSearch.error" : {
"owner": "mjbvz",
"comment": "Information about failed lexical code searches",
"workspaceSearchSource": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Caller of the search" },
"workspaceSearchCorrelationId": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Correlation id for the search" }
}
*/
this._telemetryService.sendMSFTTelemetryEvent('githubCodeSearch.lexicalSearch.error', {
workspaceSearchSource: telemetryInfo.callTracker.toString(),
workspaceSearchCorrelationId: telemetryInfo.correlationId,
});
throw new Error(`Code search lexical search failed`);
}
if (!Array.isArray(body.items)) {
throw new Error(`Code search lexical search unexpected response json shape`);
}
const result = await raceCancellationError(parseLexicalSearchResponse(body, scope, options, this._ignoreService), token);
/* __GDPR__
"githubCodeSearch.lexicalSearch.success" : {
"owner": "mjbvz",
"comment": "Information about successful lexical code searches",
"workspaceSearchSource": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Caller of the search" },
"workspaceSearchCorrelationId": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "comment": "Correlation id for the search" },
"resultCount": { "classification": "SystemMetaData", "purpose": "FeatureInsight", "isMeasurement": true, "comment": "Total number of returned items from the search" }
}
*/
this._telemetryService.sendMSFTTelemetryEvent('githubCodeSearch.lexicalSearch.success', {
workspaceSearchSource: telemetryInfo.callTracker.toString(),
workspaceSearchCorrelationId: telemetryInfo.correlationId,
}, {
resultCount: body.items.length,
});
return result;
}
private async getGithubAccessToken(silent: boolean) {
return (await this._authenticationService.getGitHubSession('permissive', { silent }))?.accessToken
?? (await this._authenticationService.getGitHubSession('any', { silent }))?.accessToken;
@@ -370,7 +468,7 @@ export class GithubCodeSearchService implements IGithubCodeSearchService {
}
}
export async function parseGithubCodeSearchResponse(body: ResponseShape, repo: GithubCodeSearchRepoInfo, options: CodeSearchOptions & { skipVerifyRepo?: boolean }, ignoreService: IIgnoreService): Promise<CodeSearchResult> {
export async function parseGithubCodeSearchResponse(body: ResponseShape, repo: GithubCodeSearchRepoInfo, options: CodeSearchOptions & { skipVerifyRepo?: boolean }, ignoreService: IIgnoreService): Promise<SemanticCodeSearchResult> {
let outOfSync = false;
const outChunks: FileChunkAndScore[] = [];
@@ -415,5 +513,106 @@ export async function parseGithubCodeSearchResponse(body: ResponseShape, repo: G
});
}));
return { chunks: outChunks, outOfSync };
// Extract the remote URL and ref name from the first result
const firstResult = body.results[0];
let remoteUrl: string | undefined;
let refName: string | undefined;
if (firstResult) {
// Derive the web URL from the API URL (e.g. https://api.github.com/repos/o/r -> https://github.com/o/r)
const apiUrl = firstResult.location.repo.url;
const nwo = firstResult.location.repo.nwo;
try {
const parsed = URI.parse(apiUrl);
const host = parsed.authority === 'api.github.com' ? 'github.com' : parsed.authority.replace(/^api\./, '');
remoteUrl = `https://${host}/${nwo}`;
} catch {
// Fall back to constructing from nwo
remoteUrl = `https://github.com/${nwo}`;
}
// Extract branch name from ref_name (e.g. "refs/heads/main" -> "main")
const rawRef = firstResult.location.ref_name;
if (rawRef?.startsWith('refs/heads/')) {
refName = rawRef.slice('refs/heads/'.length);
} else if (rawRef) {
refName = rawRef;
}
}
return { chunks: outChunks, outOfSync, remoteUrl, refName };
}
interface LexicalSearchResponseShape {
readonly total_count: number;
readonly incomplete_results: boolean;
readonly items: readonly LexicalSearchItem[];
}
type LexicalSearchItem = {
readonly path: string;
readonly repository: {
readonly full_name: string;
};
readonly text_matches?: readonly {
readonly fragment: string;
readonly matches: readonly { readonly text: string; readonly indices: readonly [number, number] }[];
readonly object_type: string;
readonly property: string;
}[];
readonly score: number;
};
export async function parseLexicalSearchResponse(body: LexicalSearchResponseShape, scope: GithubCodeSearchScope & { skipVerifyRepo?: boolean }, options: CodeSearchOptions & { skipVerifyRepo?: boolean }, ignoreService: IIgnoreService): Promise<LexicalCodeSearchResult> {
const outChunks: FileChunk[] = [];
await Promise.all(body.items.map(async (item): Promise<void> => {
if (!options.skipVerifyRepo && scope.kind === 'repo' && item.repository.full_name.toLowerCase() !== toGithubNwo(scope.githubRepoId)) {
return;
}
if (!options.skipVerifyRepo && scope.kind === 'org' && item.repository.full_name.toLowerCase().split('/')[0] !== scope.org.toLowerCase()) {
return;
}
const localRepoRoot = scope.kind === 'repo' ? scope.localRepoRoot : undefined;
let fileUri: URI;
if (localRepoRoot) {
fileUri = URI.joinPath(localRepoRoot, item.path);
if (await ignoreService.isCopilotIgnored(fileUri)) {
return;
}
} else {
fileUri = URI.from({
scheme: 'githubRepoResult',
path: '/' + item.repository.full_name + '/' + item.path
});
}
if (!shouldInclude(fileUri, options.globPatterns)) {
return;
}
const textMatches = item.text_matches?.filter(m => m.property === 'content');
if (textMatches && textMatches.length > 0) {
for (const match of textMatches) {
outChunks.push({
file: fileUri,
text: match.fragment,
rawText: undefined,
range: new Range(0, 0, 0, 0),
isFullFile: false,
});
}
} else {
// No text matches, include the file as a whole-file result
outChunks.push({
file: fileUri,
text: '',
rawText: undefined,
range: new Range(0, 0, 0, 0),
isFullFile: true,
});
}
}));
return { chunks: outChunks, outOfSync: false };
}