import { strict as assert } from "node:assert";
import { Parser as HTMLParser2 } from "htmlparser2";
import { WritableStream } from "htmlparser2/WritableStream";
import { Duplex, Readable, Writable } from 'node:stream';
import duplexify from "duplexify";

// Types derived from the htmlparser2 `Parser` constructor signature so they
// stay in sync with whatever version of the library is installed.
type HTMLParser2CBs = ConstructorParameters<typeof HTMLParser2>[0];
type HTMLParser2Opts = ConstructorParameters<typeof HTMLParser2>[1];
type HTMLParser2Args = ConstructorParameters<typeof HTMLParser2>;

/** Tag names serialized as self-closing by `openTag` and given no closing tag by `closeTag`. */
const htmlVoidElements = [
  'area',
  'base',
  'basefont',
  'bgsound',
  'br',
  'col',
  'command',
  'embed',
  'frame',
  'hr',
  'image',
  'img',
  'input',
  'isindex',
  'keygen',
  'link',
  'menuitem',
  'meta',
  'nextid',
  'param',
  'source',
  'track',
  'wbr',
];

/**
 * Serializes an opening tag.
 *
 * @param tagName - Name of the element.
 * @param attributes - Attribute name/value pairs; values are emitted inside
 *   double quotes with any embedded `"` escaped as `&quot;`.
 * @returns The tag text, e.g. `<a href="x">`. Void elements are self-closed
 *   (`<br />`).
 */
export function openTag(tagName: string, attributes: {[k: string]: string}): string {
  // Transform attributes into string
  let attrs = Object.entries(attributes)
    .map(([k, v]) => {
      // If the HTML coming through uses single quotes for the attribute, it
      // can contain a double quote. The value is re-emitted inside double
      // quotes, so those must be escaped. Markdown-it generates this if you
      // use a " in an alt tag.
      const escaped = v.replace(/"/g, '&quot;');
      return `${k}="${escaped}"`;
    })
    .join(' ');
  attrs = attrs ? ' ' + attrs : '';

  // Self-close void elements: JSX requires every tag to be closed.
  const selfClosing = htmlVoidElements.includes(tagName) ? ' /' : '';
  return `<${tagName}${attrs}${selfClosing}>`;
}

/**
 * Serializes a closing tag.
 *
 * @param tagName - Name of the element.
 * @returns `</tagName>`, or the empty string for void elements (which
 *   `openTag` already self-closed).
 */
export function closeTag(tagName: string): string {
  if (htmlVoidElements.includes(tagName)) {
    // No closing tag
    return '';
  }
  return `</${tagName}>`;
}

/** Content collected for one matched element. */
interface HTMLChunk {
  /** Text found inside the element; each `<br>` contributes a newline. */
  innerText: string;
  /** Re-serialized markup (descendant tags and escaped text) inside the element. */
  innerHTML: string;
}

/** One entry per currently open element. */
interface TagState {
  tag: string;
  attrs: { [s: string]: string };
}

/** Stack entry for the element selected by the matcher; doubles as the emitted chunk. */
interface MarkedTagState extends TagState, HTMLChunk {
  marked: true;
}

/** Escapes the characters that would otherwise be read back as markup. */
function escapeHtmlText(text: string): string {
  return text
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}

/**
 * Builds htmlparser2 callbacks that collect the contents of every element
 * accepted by `matcher` and hand each finished element to `cb`.
 *
 * @param matcher - Called for every opening tag; return true to start
 *   collecting that element.
 * @param cb - Called when a collected element closes.
 * @returns Callback object suitable for the htmlparser2 parser constructor.
 * @throws AssertionError (from the `onopentag` callback) if `matcher` accepts
 *   an element nested inside one that is already being collected.
 */
function htmlSelectorChunker(
  matcher: (tag: string, attrs: { [s: string]: string }) => boolean,
  cb: (chunk: HTMLChunk) => void
): HTMLParser2CBs {
  const tagStateStack: TagState[] = [];
  // The element currently being collected, if any. Tracked directly so each
  // parser event is O(1) instead of scanning the stack.
  let marked: MarkedTagState | undefined;

  const htmlParser2CBs: HTMLParser2CBs = {
    onopentag(tag, attrs) {
      // Captured before a possible new mark so that a freshly marked element
      // does not append its own opening tag to its own innerHTML.
      const enclosing = marked;
      let tagStackItem: TagState = { tag, attrs };
      if (matcher(tag, attrs)) {
        assert(!enclosing, "Nested tag marking encountered, not implemented/no sane implementation");
        marked = { tag, attrs, marked: true, innerText: "", innerHTML: "" };
        tagStackItem = marked;
      }
      tagStateStack.push(tagStackItem);
      if (enclosing) {
        enclosing.innerHTML += openTag(tag, attrs);
        if (tag === "br") {
          enclosing.innerText += "\n";
        }
      }
    },
    ontext(text) {
      if (!marked) {
        return; // nothing to do
      }
      marked.innerText += text;
      // NOTE(review): assumes the parser delivers entity-decoded text (the
      // htmlparser2 default, and no options are passed in this file), so it
      // is re-escaped before being placed back into markup.
      marked.innerHTML += escapeHtmlText(text);
    },
    onclosetag(tag) {
      // Always pop so the stack mirrors the open elements even when nothing
      // is being collected; otherwise it grows for the whole document.
      const popped = tagStateStack.pop();
      if (!marked) {
        return;
      }
      if (popped === marked) {
        // The collected element itself closed: its own closing tag is not
        // part of its inner HTML. Clear state before calling out so a
        // throwing callback cannot leave a stale mark behind.
        const finished = marked;
        marked = undefined;
        cb(finished);
        return;
      }
      marked.innerHTML += closeTag(tag);
    },
  };
  return htmlParser2CBs;
}

/** Readable whose data is supplied from outside via `push()`; `_read` is intentionally a no-op. */
class ExternalReadable extends Readable {
  _read() { }
}

/**
 * Creates a duplex stream: HTML written to it is parsed, and for every
 * element accepted by `matcher` the string returned by `postProcess` is
 * pushed to the readable side.
 *
 * @param matcher - Selects which elements to collect (see `htmlSelectorChunker`).
 * @param postProcess - Converts a collected chunk into the output string.
 * @returns A duplex stream combining the parser (writable side) and the
 *   processed chunks (readable side).
 */
export function htmlSelectorChunkedDuplex(
  matcher: (tag: string, attrs: { [s: string]: string }) => boolean,
  postProcess: (chunk: HTMLChunk) => string
): Duplex {
  const readable = new ExternalReadable();
  const cbs = htmlSelectorChunker(
    matcher,
    (chunk) => {
      const out = postProcess(chunk);
      readable.push(out);
    }
  );
  const writable = new WritableStream({
    ...cbs,
    onerror(error) {
      // Surface parser errors on the readable side.
      readable.emit("error", error);
    },
    onend() {
      // End of input: signal end-of-stream, then emit "close" on the readable.
      readable.push(null);
      readable.emit("close");
    }
  });
  return duplexify(writable, readable);
}