From a55012c839ec21185ddb9bdaaecfbdfbaa4ff8a9 Mon Sep 17 00:00:00 2001
From: Oleg Solomko
Date: Mon, 31 Mar 2025 07:40:09 -0700
Subject: [PATCH] implement front matter parser

---
 .../parsers/frontMatterHeader.ts              | 395 ++++++++++++++++++
 .../tokens/frontMatterHeaderToken.ts          |  75 ++++
 .../tokens/markdownExtensionsToken.ts         |  11 +
 3 files changed, 481 insertions(+)
 create mode 100644 src/vs/editor/common/codecs/markdownExtensionsCodec/parsers/frontMatterHeader.ts
 create mode 100644 src/vs/editor/common/codecs/markdownExtensionsCodec/tokens/frontMatterHeaderToken.ts
 create mode 100644 src/vs/editor/common/codecs/markdownExtensionsCodec/tokens/markdownExtensionsToken.ts

diff --git a/src/vs/editor/common/codecs/markdownExtensionsCodec/parsers/frontMatterHeader.ts b/src/vs/editor/common/codecs/markdownExtensionsCodec/parsers/frontMatterHeader.ts
new file mode 100644
index 00000000000..27b5b889fc8
--- /dev/null
+++ b/src/vs/editor/common/codecs/markdownExtensionsCodec/parsers/frontMatterHeader.ts
@@ -0,0 +1,395 @@
+/*---------------------------------------------------------------------------------------------
+ *  Copyright (c) Microsoft Corporation. All rights reserved.
+ *  Licensed under the MIT License. See License.txt in the project root for license information.
+ *--------------------------------------------------------------------------------------------*/
+
+import { BaseToken } from '../../baseToken.js';
+import { Dash } from '../../simpleCodec/tokens/dash.js';
+import { NewLine } from '../../linesCodec/tokens/newLine.js';
+import { assertDefined } from '../../../../../base/common/types.js';
+import { TSimpleDecoderToken } from '../../simpleCodec/simpleDecoder.js';
+import { assert, assertNever } from '../../../../../base/common/assert.js';
+import { CarriageReturn } from '../../linesCodec/tokens/carriageReturn.js';
+import { FrontMatterHeaderToken } from '../tokens/frontMatterHeaderToken.js';
+import { assertNotConsumed, ParserBase, TAcceptTokenResult } from '../../simpleCodec/parserBase.js';
+
+/**
+ * A complete Front Matter marker line: dash tokens, an optional CR token and a terminating new line token.
+ */
+class FrontMatterMarker {
+	/**
+	 * Number of dash tokens in the marker.
+	 */
+	public readonly dashCount: number;
+
+	/**
+	 * Full range of the token.
+	 */
+	public get range() {
+		return BaseToken.fullRange(this.tokens);
+	}
+
+	constructor(
+		public readonly tokens: readonly TMarkerToken[],
+	) {
+		const lastToken = tokens[tokens.length - 1];
+
+		assert(
+			lastToken instanceof NewLine,
+			`Front Matter header must end with a new line token, got '${lastToken}'.`,
+		);
+
+		this.dashCount = this.tokens
+			.filter((token) => { return token instanceof Dash; })
+			.length;
+	}
+
+	/**
+	 * Returns a string representation of the token.
+	 */
+	public toString(): string {
+		return `frontmatter-marker(${this.dashCount}:${this.range})`;
+	}
+}
+
+/**
+ * Tokens that a Front Matter marker line can consist of.
+ */
+type TMarkerToken = Dash | CarriageReturn | NewLine;
+
+/**
+ * Parser for the start marker of a Front Matter header - a dash sequence that begins at line 1, column 1.
+ */
+export class PartialFrontMatterStartMarker extends ParserBase {
+	constructor(token: Dash) {
+		const { range } = token;
+
+		assert(
+			range.startLineNumber === 1,
+			`Front Matter header must start at the first line, but it starts at line #${range.startLineNumber}.`,
+		);
+
+		assert(
+			range.startColumn === 1,
+			`Front Matter header must start at the beginning of the line, but it starts at ${range.startColumn}.`,
+		);
+
+		super([token]);
+	}
+
+	@assertNotConsumed
+	public accept(token: TSimpleDecoderToken): TAcceptTokenResult {
+		const previousToken = this.currentTokens[this.currentTokens.length - 1];
+
+		// collect a sequence of dash tokens that may end with a CR token
+		// TODO: @legomushroom - include `Space` token?
+		if ((token instanceof Dash) || (token instanceof CarriageReturn)) {
+			// a dash or CR tokens can go only after another dash token
+			if ((previousToken instanceof Dash) === false) {
+				this.isConsumed = true;
+
+				return {
+					result: 'failure',
+					wasTokenConsumed: false,
+				};
+			}
+
+
+			this.currentTokens.push(token);
+
+			return {
+				result: 'success',
+				wasTokenConsumed: true,
+				nextParser: this,
+			};
+		}
+
+		// stop collecting dash tokens when a new line token is encountered
+		if (token instanceof NewLine) {
+			this.isConsumed = true;
+
+			return {
+				result: 'success',
+				wasTokenConsumed: true,
+				nextParser: new PartialFrontMatterHeader(
+					new FrontMatterMarker([
+						...this.currentTokens,
+						token,
+					]),
+				),
+			};
+		}
+
+		// any other token is invalid for the `start marker`
+		this.isConsumed = true;
+		return {
+			result: 'failure',
+			wasTokenConsumed: false,
+		};
+	}
+}
+
+/**
+ * Parser for the Front Matter header contents that follow a complete start marker, up to a matching end marker.
+ */
+export class PartialFrontMatterHeader extends ParserBase {
+	/**
+	 * Parser of a possible end marker that is currently being read, if any.
+	 */
+	private partialEndMarker?: PartialFrontMatterEndMarker;
+
+	/**
+	 * Completed end marker, set once a marker with a matching dash count is found.
+	 */
+	private maybeEndMarker?: FrontMatterMarker;
+
+	constructor(
+		public readonly startMarker: FrontMatterMarker,
+	) {
+		super([]);
+	}
+
+	public override get tokens(): readonly TSimpleDecoderToken[] {
+		return [
+			...this.startMarker.tokens,
+			...this.currentTokens,
+			...((this.maybeEndMarker ?? this.partialEndMarker)?.tokens ?? []), // completed or in-flight end marker
+		];
+	}
+
+	/**
+	 * Convert the current token sequence into a {@link FrontMatterHeaderToken}.
+	 *
+	 * Returns `null` if no end marker is being parsed at the moment, or if its
+	 * dash count differs from the dash count of the start marker.
+	 *
+	 * Note! on success this method marks the current parser object as "consumed"
+	 * hence it should not be used after this method is called.
+	 */
+	public asFrontMatterHeader(): FrontMatterHeaderToken | null {
+		if (this.partialEndMarker === undefined) {
+			return null;
+		}
+
+		if (this.partialEndMarker.dashCount !== this.startMarker.dashCount) {
+			return null;
+		}
+
+		this.isConsumed = true;
+
+		return FrontMatterHeaderToken.fromTokens(
+			this.startMarker.tokens,
+			this.currentTokens,
+			this.partialEndMarker.tokens,
+		);
+	}
+
+	@assertNotConsumed
+	public accept(token: TSimpleDecoderToken): TAcceptTokenResult {
+
+		// if in the mode of parsing the end marker sequence, forward
+		// the token to the current end marker parser instance
+		if (this.partialEndMarker !== undefined) {
+			return this.acceptEndMarkerToken(token);
+		}
+
+
+		// collect all tokens until a `dash token at the beginning of a line` is found
+		if (((token instanceof Dash) === false) || (token.range.startColumn !== 1)) {
+			this.currentTokens.push(token);
+
+			return {
+				result: 'success',
+				wasTokenConsumed: true,
+				nextParser: this,
+			};
+		}
+
+		// a dash token at the beginning of the line might be a start of the `end marker`
+		// sequence of the front matter header, hence initialize appropriate parser object
+		assert(
+			this.partialEndMarker === undefined,
+			`End marker parser must not be present.`,
+		);
+		this.partialEndMarker = new PartialFrontMatterEndMarker(token);
+
+		return {
+			result: 'success',
+			wasTokenConsumed: true,
+			nextParser: this,
+		};
+	}
+
+	/**
+	 * Forward a token to the end marker parser and handle the outcome.
+	 * @throws if the end marker parser is not initialized.
+	 */
+	private acceptEndMarkerToken(
+		token: TSimpleDecoderToken,
+	): TAcceptTokenResult {
+		assertDefined(
+			this.partialEndMarker,
+			`Partial end marker parser must be initialized.`,
+		);
+
+		// if we have a partial end marker, we are in the process of parsing
+		// the end marker, so just pass the token to it and return
+		const acceptResult = this.partialEndMarker.accept(token);
+		const { result, wasTokenConsumed } = acceptResult;
+
+		// the end marker parser accepted the token, check if the marker is complete
+		if (result === 'success') {
+			const { nextParser } = acceptResult;
+			const endMarkerParsingComplete = (nextParser instanceof FrontMatterMarker);
+
+			if (endMarkerParsingComplete === false) {
+				return {
+					result: 'success',
+					wasTokenConsumed,
+					nextParser: this,
+				};
+			}
+
+			const endMarker = nextParser;
+
+			// start and end markers must have the same number of dashes, hence
+			// if they don't match, we would like to continue parsing the header
+			// until we find an end marker with the same number of dashes
+			if (endMarker.dashCount !== this.startMarker.dashCount) {
+				return this.handleEndMarkerParsingFailure(
+					endMarker.tokens,
+					wasTokenConsumed,
+					token,
+				);
+			}
+
+			// found a valid end marker, so the parsing process is complete
+			this.maybeEndMarker = endMarker;
+			delete this.partialEndMarker;
+
+			this.isConsumed = true;
+			return {
+				result: 'success',
+				wasTokenConsumed: true,
+				nextParser: FrontMatterHeaderToken.fromTokens(
+					this.startMarker.tokens,
+					this.currentTokens,
+					this.maybeEndMarker.tokens,
+				),
+			};
+		}
+
+		// if failed to parse the end marker, we would like to continue parsing
+		// the header until we find a valid end marker
+		if (result === 'failure') {
+			return this.handleEndMarkerParsingFailure(
+				this.partialEndMarker.tokens,
+				wasTokenConsumed,
+				token,
+			);
+		}
+
+		assertNever(
+			result,
+			`Unexpected result '${result}' while parsing the end marker.`,
+		);
+	}
+
+	/**
+	 * Treat a rejected end marker candidate as regular header contents and keep parsing.
+	 */
+	private handleEndMarkerParsingFailure(
+		accumulatedTokens: readonly TSimpleDecoderToken[],
+		wasTokenConsumed: boolean,
+		token: TSimpleDecoderToken,
+	): TAcceptTokenResult {
+		this.currentTokens.push(...accumulatedTokens);
+
+		if (wasTokenConsumed === false) {
+			this.currentTokens.push(token);
+		}
+
+		delete this.partialEndMarker;
+		delete this.maybeEndMarker;
+
+		return {
+			result: 'success',
+			wasTokenConsumed: true,
+			nextParser: this,
+		};
+	}
+}
+
+/**
+ * Parser for a possible end marker of a Front Matter header - a dash sequence at the beginning of a line.
+ */
+class PartialFrontMatterEndMarker extends ParserBase {
+	constructor(token: Dash) {
+		const { range } = token;
+
+		assert(
+			range.startColumn === 1,
+			`Front Matter end marker must start at the beginning of the line, but it starts at ${range.startColumn}.`,
+		);
+
+		super([token]);
+	}
+
+	/**
+	 * Number of dash tokens collected so far.
+	 */
+	public get dashCount(): number {
+		return this.tokens
+			.filter((token) => { return token instanceof Dash; })
+			.length;
+	}
+
+	@assertNotConsumed
+	public accept(token: TSimpleDecoderToken): TAcceptTokenResult {
+		const previousToken = this.currentTokens[this.currentTokens.length - 1];
+
+		// collect a sequence of dash tokens that may end with a CR token
+		// TODO: @legomushroom - include `Space` token?
+		if ((token instanceof Dash) || (token instanceof CarriageReturn)) {
+			// a dash or CR tokens can go only after another dash token
+			if ((previousToken instanceof Dash) === false) {
+				this.isConsumed = true;
+
+				return {
+					result: 'failure',
+					wasTokenConsumed: false,
+				};
+			}
+
+
+			this.currentTokens.push(token);
+
+			return {
+				result: 'success',
+				wasTokenConsumed: true,
+				nextParser: this,
+			};
+		}
+
+		// stop collecting dash tokens when a new line token is encountered
+		if (token instanceof NewLine) {
+			this.isConsumed = true;
+
+			return {
+				result: 'success',
+				wasTokenConsumed: true,
+				nextParser: new FrontMatterMarker([
+					...this.currentTokens,
+					token,
+				]),
+			};
+		}
+
+		// any other token is invalid for the `end marker`
+		this.isConsumed = true;
+		return {
+			result: 'failure',
+			wasTokenConsumed: false,
+		};
+	}
+}
diff --git a/src/vs/editor/common/codecs/markdownExtensionsCodec/tokens/frontMatterHeaderToken.ts b/src/vs/editor/common/codecs/markdownExtensionsCodec/tokens/frontMatterHeaderToken.ts
new file mode 100644
index 00000000000..83d4e4529b0
--- /dev/null
+++ b/src/vs/editor/common/codecs/markdownExtensionsCodec/tokens/frontMatterHeaderToken.ts
@@ -0,0 +1,75 @@
+/*---------------------------------------------------------------------------------------------
+ *  Copyright (c) Microsoft Corporation. All rights reserved.
+ *  Licensed under the MIT License. See License.txt in the project root for license information.
+ *--------------------------------------------------------------------------------------------*/
+
+import { Range } from '../../../core/range.js';
+import { BaseToken } from '../../baseToken.js';
+import { MarkdownExtensionsToken } from './markdownExtensionsToken.js';
+import { TSimpleDecoderToken } from '../../simpleCodec/simpleDecoder.js';
+
+/**
+ * Token that represents a Front Matter header: its start marker, contents and end marker.
+ */
+export class FrontMatterHeaderToken extends MarkdownExtensionsToken {
+	constructor(
+		range: Range,
+		public readonly startMarker: string,
+		public readonly contents: string,
+		public readonly endMarker: string,
+	) {
+		// TODO: @legomushroom - validate text?
+
+		super(range);
+	}
+
+	/**
+	 * Full text of the header, both markers included.
+	 */
+	public get text(): string {
+		return [
+			this.startMarker,
+			this.contents,
+			this.endMarker,
+		].join('');
+	}
+
+	/**
+	 * Check if this token is equal to another one.
+	 */
+	public override equals<T extends BaseToken>(other: T): boolean {
+		if (!super.sameRange(other.range)) {
+			return false;
+		}
+
+		if (!(other instanceof FrontMatterHeaderToken)) {
+			return false;
+		}
+
+		return this.text === other.text;
+	}
+
+	/**
+	 * Create a header token from the token sequences of its three parts.
+	 */
+	public static fromTokens(
+		startMarkerTokens: readonly TSimpleDecoderToken[],
+		contentTokens: readonly TSimpleDecoderToken[],
+		endMarkerTokens: readonly TSimpleDecoderToken[],
+	): FrontMatterHeaderToken {
+		return new FrontMatterHeaderToken(
+			BaseToken.fullRange([...startMarkerTokens, ...endMarkerTokens]),
+			BaseToken.render(startMarkerTokens),
+			BaseToken.render(contentTokens),
+			BaseToken.render(endMarkerTokens),
+		);
+	}
+
+	/**
+	 * Returns a string representation of the token.
+	 */
+	public override toString(): string {
+		// TODO: @legomushroom - add an utility to truncate strings
+		return `frontmatter("${this.text.slice(0, 16)}")${this.range}`;
+	}
+}
diff --git a/src/vs/editor/common/codecs/markdownExtensionsCodec/tokens/markdownExtensionsToken.ts b/src/vs/editor/common/codecs/markdownExtensionsCodec/tokens/markdownExtensionsToken.ts
new file mode 100644
index 00000000000..ea31355e8ce
--- /dev/null
+++ b/src/vs/editor/common/codecs/markdownExtensionsCodec/tokens/markdownExtensionsToken.ts
@@ -0,0 +1,11 @@
+/*---------------------------------------------------------------------------------------------
+ *  Copyright (c) Microsoft Corporation. All rights reserved.
+ *  Licensed under the MIT License. See License.txt in the project root for license information.
+ *--------------------------------------------------------------------------------------------*/
+
+import { MarkdownToken } from '../../markdownCodec/tokens/markdownToken.js';
+
+/**
+ * Base class for tokens produced by the markdown extensions codec.
+ */
+export abstract class MarkdownExtensionsToken extends MarkdownToken { }