import {
    DecodingMode,
    EntityDecoder,
    htmlDecodeTree,
    xmlDecodeTree,
} from "entities/decode";

/** Character codes the tokenizer compares input against. */
const enum CharCodes {
    Tab = 0x9, // "\t"
    NewLine = 0xa, // "\n"
    FormFeed = 0xc, // "\f"
    CarriageReturn = 0xd, // "\r"
    Space = 0x20, // " "
    ExclamationMark = 0x21, // "!"
    Number = 0x23, // "#"
    Amp = 0x26, // "&"
    SingleQuote = 0x27, // "'"
    DoubleQuote = 0x22, // '"'
    Dash = 0x2d, // "-"
    Slash = 0x2f, // "/"
    Zero = 0x30, // "0"
    Nine = 0x39, // "9"
    Semi = 0x3b, // ";"
    Lt = 0x3c, // "<"
    Eq = 0x3d, // "="
    Gt = 0x3e, // ">"
    Questionmark = 0x3f, // "?"
    UpperA = 0x41, // "A"
    LowerA = 0x61, // "a"
    UpperF = 0x46, // "F"
    LowerF = 0x66, // "f"
    UpperZ = 0x5a, // "Z"
    LowerZ = 0x7a, // "z"
    LowerX = 0x78, // "x"
    OpeningSquareBracket = 0x5b, // "["
}

/**
 * All the states the tokenizer can be in.
 *
 * Members are numbered consecutively starting at 1, so their order matters.
 */
const enum State {
    Text = 1,
    BeforeTagName, // After <
    InTagName,
    InSelfClosingTag,
    BeforeClosingTagName,
    InClosingTagName,
    AfterClosingTagName,

    // Attributes
    BeforeAttributeName,
    InAttributeName,
    AfterAttributeName,
    BeforeAttributeValue,
    InAttributeValueDq, // "
    InAttributeValueSq, // '
    InAttributeValueNq,

    // Declarations
    BeforeDeclaration, // !
    InDeclaration,

    // Processing instructions
    InProcessingInstruction, // ?

    // Comments & CDATA
    BeforeComment,
    CDATASequence,
    DeclarationSequence,
    InSpecialComment,
    InCommentLike,

    // Special tags
    SpecialStartSequence,
    InSpecialTag,
    InPlainText,

    InEntity,
}

/**
 * Check whether a character code is one of the five whitespace characters
 * the tokenizer recognizes: space, line feed, tab, form feed, carriage return.
 * @param c Character code to test.
 * @returns `true` if `c` is whitespace.
 */
function isWhitespace(c: number): boolean {
    switch (c) {
        case CharCodes.Space:
        case CharCodes.NewLine:
        case CharCodes.Tab:
        case CharCodes.FormFeed:
        case CharCodes.CarriageReturn: {
            return true;
        }
        default: {
            return false;
        }
    }
}

/**
 * Check whether a character code terminates a section of a tag:
 * `/`, `>`, or any whitespace character.
 * @param c Character code to test.
 * @returns `true` if `c` ends the current tag section.
 */
function isEndOfTagSection(c: number): boolean {
    if (c === CharCodes.Slash || c === CharCodes.Gt) {
        return true;
    }

    return isWhitespace(c);
}

/**
 * Check whether a character code is an ASCII letter (`a`-`z` or `A`-`Z`).
 * @param c Character code to test.
 * @returns `true` if `c` is an ASCII letter.
 */
function isASCIIAlpha(c: number): boolean {
    const isLowercase = c >= CharCodes.LowerA && c <= CharCodes.LowerZ;
    const isUppercase = c >= CharCodes.UpperA && c <= CharCodes.UpperZ;

    return isLowercase || isUppercase;
}

/**
 * Quote style used for parsed attributes.
*/ export enum QuoteType { NoValue = 0, Unquoted = 1, Single = 2, Double = 3, } /** * Low-level tokenizer callback interface. */ export interface Callbacks { onattribdata(start: number, endIndex: number): void; onattribentity(codepoint: number): void; onattribend(quote: QuoteType, endIndex: number): void; onattribname(start: number, endIndex: number): void; oncdata(start: number, endIndex: number, endOffset: number): void; onclosetag(start: number, endIndex: number): void; oncomment(start: number, endIndex: number, endOffset: number): void; ondeclaration(start: number, endIndex: number): void; onend(): void; onopentagend(endIndex: number): void; onopentagname(start: number, endIndex: number): void; onprocessinginstruction(start: number, endIndex: number): void; onselfclosingtag(endIndex: number): void; ontext(start: number, endIndex: number): void; ontextentity(codepoint: number, endIndex: number): void; isInForeignContext?(): boolean; } /** * Sequences used to match longer strings. * * We don't have `Script`, `Style`, or `Title` here. Instead, we re-use the *End * sequences with an increased offset. */ const Sequences = { Empty: new Uint8Array(0), Cdata: new Uint8Array([0x43, 0x44, 0x41, 0x54, 0x41, 0x5b]), // CDATA[ CdataEnd: new Uint8Array([0x5d, 0x5d, 0x3e]), // ]]> CommentEnd: new Uint8Array([0x2d, 0x2d, 0x21, 0x3e]), // `--!>` Doctype: new Uint8Array([0x64, 0x6f, 0x63, 0x74, 0x79, 0x70, 0x65]), // `doctype` IframeEnd: new Uint8Array([0x3c, 0x2f, 0x69, 0x66, 0x72, 0x61, 0x6d, 0x65]), // `([ [Sequences.IframeEnd[2], Sequences.IframeEnd], [Sequences.NoembedEnd[2], Sequences.NoembedEnd], [Sequences.Plaintext[2], Sequences.Plaintext], [Sequences.ScriptEnd[2], Sequences.ScriptEnd], [Sequences.TitleEnd[2], Sequences.TitleEnd], [Sequences.XmpEnd[2], Sequences.XmpEnd], ]); /** * Tokenizer implementation used by `Parser`. */ export default class Tokenizer { /** The current state the tokenizer is in. */ private state = State.Text; /** The read buffer. 
*/ private buffer = ""; /** The beginning of the section that is currently being read. */ private sectionStart = 0; /** The index within the buffer that we are currently looking at. */ private index = 0; /** The start of the last entity. */ private entityStart = 0; /** Some behavior, eg. when decoding entities, is done while we are in another state. This keeps track of the other state type. */ private baseState = State.Text; /** For special parsing behavior inside of script and style tags. */ private isSpecial = false; /** Indicates whether the tokenizer has been paused. */ running = true; /** The offset of the current buffer. */ private offset = 0; private readonly xmlMode: boolean; private readonly decodeEntities: boolean; private readonly recognizeSelfClosing: boolean; private readonly entityDecoder: EntityDecoder; constructor( { xmlMode = false, decodeEntities = true, recognizeSelfClosing = xmlMode, }: { xmlMode?: boolean; decodeEntities?: boolean; recognizeSelfClosing?: boolean; }, private readonly cbs: Callbacks, ) { this.xmlMode = xmlMode; this.decodeEntities = decodeEntities; this.recognizeSelfClosing = recognizeSelfClosing; this.entityDecoder = new EntityDecoder( xmlMode ? 
xmlDecodeTree : htmlDecodeTree, (cp, consumed) => this.emitCodePoint(cp, consumed), ); } reset(): void { this.state = State.Text; this.buffer = ""; this.sectionStart = 0; this.index = 0; this.baseState = State.Text; this.isSpecial = false; this.currentSequence = Sequences.Empty; this.sequenceIndex = 0; this.running = true; this.offset = 0; } write(chunk: string): void { this.offset += this.buffer.length; this.buffer = chunk; this.parse(); } end(): void { if (this.running) this.finish(); } pause(): void { this.running = false; } resume(): void { this.running = true; if (this.index < this.buffer.length + this.offset) { this.parse(); } } private stateText(c: number): void { if ( c === CharCodes.Lt || (!this.decodeEntities && this.fastForwardTo(CharCodes.Lt)) ) { if (this.index > this.sectionStart) { this.cbs.ontext(this.sectionStart, this.index); } this.state = State.BeforeTagName; this.sectionStart = this.index; } else if (this.decodeEntities && c === CharCodes.Amp) { this.startEntity(); } } private currentSequence: Uint8Array = Sequences.Empty; private sequenceIndex = 0; private enterTagBody(): void { if (this.currentSequence === Sequences.Plaintext) { this.currentSequence = Sequences.Empty; this.state = State.InPlainText; } else if (this.isSpecial) { this.state = State.InSpecialTag; this.sequenceIndex = 0; } else { this.state = State.Text; } } /** * Match the opening tag name against an HTML text-only tag sequence. * * Some tags share an initial prefix (`script`/`style`, `title`/`textarea`, * `noembed`/`noframes`), so we may switch to an alternate sequence at the * first distinguishing byte. On a successful full match we fall back to * the normal tag-name state; a later `>` will enter raw-text, RCDATA, or * plaintext mode based on `currentSequence` / `isSpecial`. * @param c Current character code point. */ private stateSpecialStartSequence(c: number): void { const lower = c | 0x20; // Still matching — check for an alternate sequence at branch points. 
if (this.sequenceIndex < this.currentSequence.length) { if (lower === this.currentSequence[this.sequenceIndex]) { this.sequenceIndex++; return; } if (this.sequenceIndex === 3) { if ( this.currentSequence === Sequences.ScriptEnd && lower === Sequences.StyleEnd[3] ) { this.currentSequence = Sequences.StyleEnd; this.sequenceIndex = 4; return; } if ( this.currentSequence === Sequences.TitleEnd && lower === Sequences.TextareaEnd[3] ) { this.currentSequence = Sequences.TextareaEnd; this.sequenceIndex = 4; return; } } else if ( this.sequenceIndex === 4 && this.currentSequence === Sequences.NoembedEnd && lower === Sequences.NoframesEnd[4] ) { this.currentSequence = Sequences.NoframesEnd; this.sequenceIndex = 5; return; } } else if (isEndOfTagSection(c)) { // Full match on a valid tag boundary — keep the sequence. this.sequenceIndex = 0; this.state = State.InTagName; this.stateInTagName(c); return; } // No match — abandon special-tag detection. this.isSpecial = false; this.currentSequence = Sequences.Empty; this.sequenceIndex = 0; this.state = State.InTagName; this.stateInTagName(c); } private stateCDATASequence(c: number): void { if (c === Sequences.Cdata[this.sequenceIndex]) { if (++this.sequenceIndex === Sequences.Cdata.length) { this.state = State.InCommentLike; this.currentSequence = Sequences.CdataEnd; this.sequenceIndex = 0; this.sectionStart = this.index + 1; } } else { this.sequenceIndex = 0; if (this.xmlMode) { this.state = State.InDeclaration; this.stateInDeclaration(c); // Reconsume the character } else { this.state = State.InSpecialComment; this.stateInSpecialComment(c); // Reconsume the character } } } /** * When we wait for one specific character, we can speed things up * by skipping through the buffer until we find it. * @param c Current character code point. * @returns Whether the character was found. 
*/ private fastForwardTo(c: number): boolean { while (++this.index < this.buffer.length + this.offset) { if (this.buffer.charCodeAt(this.index - this.offset) === c) { return true; } } /* * We increment the index at the end of the `parse` loop, * so set it to `buffer.length - 1` here. * * TODO: Refactor `parse` to increment index before calling states. */ this.index = this.buffer.length + this.offset - 1; return false; } /** * Emit a comment token and return to the text state. * @param offset Number of characters in the end sequence that have already been matched. */ private emitComment(offset: number): void { this.cbs.oncomment(this.sectionStart, this.index, offset); this.sequenceIndex = 0; this.sectionStart = this.index + 1; this.state = State.Text; } /** * Comments and CDATA end with `-->` and `]]>`. * * Their common qualities are: * - Their end sequences have a distinct character they start with. * - That character is then repeated, so we have to check multiple repeats. * - All characters but the start character of the sequence can be skipped. * @param c Current character code point. */ private stateInCommentLike(c: number): void { if ( !this.xmlMode && this.currentSequence === Sequences.CommentEnd && this.sequenceIndex <= 1 && /* * We're still at the very start of the comment: the only * characters consumed since ``, 1 for ``). */ this.index === this.sectionStart + this.sequenceIndex && c === CharCodes.Gt ) { // Abruptly closed empty HTML comment. this.emitComment(this.sequenceIndex); } else if ( this.currentSequence === Sequences.CommentEnd && this.sequenceIndex === 2 && c === CharCodes.Gt ) { // `!` is optional here, so the same sequence also accepts `-->`. 
this.emitComment(2); } else if ( this.currentSequence === Sequences.CommentEnd && this.sequenceIndex === this.currentSequence.length - 1 && c !== CharCodes.Gt ) { this.sequenceIndex = Number(c === CharCodes.Dash); } else if (c === this.currentSequence[this.sequenceIndex]) { if (++this.sequenceIndex === this.currentSequence.length) { if (this.currentSequence === Sequences.CdataEnd) { this.cbs.oncdata(this.sectionStart, this.index, 2); } else { this.cbs.oncomment(this.sectionStart, this.index, 3); } this.sequenceIndex = 0; this.sectionStart = this.index + 1; this.state = State.Text; } } else if (this.sequenceIndex === 0) { // Fast-forward to the first character of the sequence if (this.fastForwardTo(this.currentSequence[0])) { this.sequenceIndex = 1; } } else if (c !== this.currentSequence[this.sequenceIndex - 1]) { // Allow long sequences, eg. --->, ]]]> this.sequenceIndex = 0; } } /** * HTML only allows ASCII alpha characters (a-z and A-Z) at the beginning of a tag name. * * XML allows a lot more characters here (@see https://www.w3.org/TR/REC-xml/#NT-NameStartChar). * We allow anything that wouldn't end the tag. * @param c Current character code point. */ private isTagStartChar(c: number) { return this.xmlMode ? !isEndOfTagSection(c) : isASCIIAlpha(c); } /** * Scan raw-text / RCDATA content for the matching end tag. * * For RCDATA tags (`