152 lines
3.8 KiB
TypeScript
152 lines
3.8 KiB
TypeScript
import { strict as assert } from "node:assert";
|
|
import { Parser as HTMLParser2 } from "htmlparser2";
|
|
import { WritableStream } from "htmlparser2/WritableStream";
|
|
import { Duplex, Readable, Writable } from 'node:stream';
|
|
import duplexify from "duplexify";
|
|
type HTMLParser2CBs = ConstructorParameters<typeof HTMLParser2>[0];
|
|
type HTMLParser2Opts = ConstructorParameters<typeof HTMLParser2>[1];
|
|
type HTMLParser2Args = ConstructorParameters<typeof HTMLParser2>;
|
|
|
|
|
|
/**
 * Tag names treated as void elements by the serializers in this file:
 * `openTag` writes them self-closed (`<br />`) and `closeTag` emits nothing
 * for them. The list includes obsolete elements in addition to current ones.
 */
const htmlVoidElements = [
  'area', 'base', 'basefont', 'bgsound',
  'br', 'col', 'command', 'embed',
  'frame', 'hr', 'image', 'img',
  'input', 'isindex', 'keygen', 'link',
  'menuitem', 'meta', 'nextid', 'param',
  'source', 'track', 'wbr',
];
|
|
|
|
/**
 * Serializes an opening tag, e.g. `<a href="/x">` or `<br />`.
 *
 * @param tagName - Element name, written out verbatim.
 * @param attributes - Attribute name/value pairs, emitted in the object's own
 *   key order as `name="value"` and separated by single spaces.
 * @returns The tag text. Names listed in `htmlVoidElements` are self-closed
 *   (` /` before `>`) because JSX requires every tag to be closed.
 */
export function openTag(tagName: string, attributes: {[k: string]: string}): string {
  const serializedAttrs = Object.entries(attributes)
    .map(([name, value]) => {
      // Values are always written inside double quotes. The source HTML may
      // have used single quotes around a value that contains a double quote
      // (markdown-it generates this for a " inside alt text), so a literal "
      // must become an entity or it would terminate the attribute early.
      const escaped = value.replace(/"/g, '&quot;');
      return `${name}="${escaped}"`;
    })
    .join(' ');

  // Only put a space after the tag name when there is something to separate.
  const attrPart = serializedAttrs ? ' ' + serializedAttrs : '';

  // Self-close void elements: JSX has no equivalent of a bare HTML <br>.
  const selfClosing = htmlVoidElements.includes(tagName) ? ' /' : '';

  return `<${tagName}${attrPart}${selfClosing}>`;
}
|
|
|
|
/**
 * Serializes a closing tag.
 *
 * @param tagName - Element name being closed.
 * @returns `</tagName>`, or an empty string when the name is listed in
 *   `htmlVoidElements` (those are already self-closed by `openTag`).
 */
export function closeTag(tagName: string) {
  const isVoidElement = htmlVoidElements.includes(tagName);
  return isVoidElement ? '' : `</${tagName}>`;
}
|
|
|
|
/**
 * Content collected for one matched element, handed to the chunk callback
 * when that element closes.
 */
interface HTMLChunk {
  /** Markup accumulated while the matched element was open. */
  innerHTML: string;
  /** Text accumulated while the matched element was open. */
  innerText: string;
}
|
|
|
|
/**
 * Builds htmlparser2 callbacks that collect the content of every element
 * accepted by `matcher` and report it through `cb` when that element closes.
 *
 * While a matched ("marked") element is open:
 *  - each descendant opening tag is appended to `innerHTML` via `openTag`,
 *    and a `<br>` additionally appends "\n" to `innerText`;
 *  - each text node is appended to `innerText`;
 *  - each closing tag is appended to `innerHTML` via `closeTag`.
 *
 * NOTE(review): text nodes are never appended to `innerHTML`, and the marked
 * element's own closing tag IS appended to it just before `cb` fires. This
 * mirrors the existing behaviour; confirm whether callers rely on it before
 * changing it.
 *
 * @param matcher - Returns true for an element whose content should be
 *   collected. Called once per opening tag.
 * @param cb - Receives the collected state object when a marked element
 *   closes. The object also carries `tag`, `attrs` and `marked: true`.
 * @returns A callbacks object suitable for an htmlparser2 parser.
 * @throws AssertionError (from `onopentag`) when `matcher` accepts an element
 *   while another marked element is still open; nesting is unsupported.
 */
function htmlSelectorChunker(
  matcher: (tag: string, attrs: { [s: string]: string }) => boolean,
  cb: (chunk: HTMLChunk) => void
): HTMLParser2CBs {
  interface TagState {
    tag: string;
    attrs: { [s: string]: string };
    marked?: boolean;
    innerText?: string;
    innerHTML?: string;
  }
  type MarkedTagState = TagState & HTMLChunk & { marked: true };

  // One entry per currently open element, innermost last.
  const tagStateStack: TagState[] = [];
  // The marked element that is currently open, if any. At most one can exist
  // because nested marking is rejected, so tracking it directly avoids
  // rescanning the stack on every parser event.
  let openMarked: MarkedTagState | undefined;

  const htmlParser2CBs: HTMLParser2CBs = {
    onopentag(tag, attrs) {
      // Captured before this element can become the marked one, so a marked
      // element never records its own opening tag.
      const enclosing = openMarked;

      if (matcher(tag, attrs)) {
        assert(!enclosing, "Nested tag marking encountered, not implemented/no sane implementation");
        const state: MarkedTagState = {
          tag,
          attrs,
          marked: true,
          innerText: "",
          innerHTML: "",
        };
        openMarked = state;
        tagStateStack.push(state);
      } else {
        tagStateStack.push({ tag, attrs });
      }

      if (enclosing) {
        enclosing.innerHTML += openTag(tag, attrs);
        // A line break is the only tag that contributes to the text form.
        enclosing.innerText += tag === "br" ? "\n" : "";
      }
    },

    ontext(text) {
      if (!openMarked) {
        return; // outside any marked element: nothing to collect
      }
      openMarked.innerText += text;
    },

    onclosetag(tag) {
      // Always unwind the stack. Previously elements closed outside a marked
      // region were never popped, so the stack grew for the whole document.
      // Assumes the parser reports one close per open (including void and
      // implicitly closed elements) — TODO confirm against htmlparser2.
      const popped = tagStateStack.pop();

      const current = openMarked;
      if (!current) {
        return;
      }

      current.innerHTML += closeTag(tag);

      if (popped === current) {
        // The marked element itself just closed: stop collecting, then report.
        openMarked = undefined;
        cb(current);
      }
    },
  };

  return htmlParser2CBs;
}
|
|
|
|
/**
 * A Readable whose `_read` does nothing: data only enters the stream when
 * outside code calls `push()` on the instance.
 */
class ExternalReadable extends Readable {
  _read() { /* intentionally empty: data is pushed from outside */ }
}
|
|
|
|
/**
 * Creates a Duplex stream: HTML written to it is parsed, and for every
 * element accepted by `matcher` the string returned by `postProcess` is
 * pushed to the readable side.
 *
 * @param matcher - Selects the elements whose content should be collected.
 * @param postProcess - Converts a collected chunk into the string to emit.
 * @returns The parser's writable side and the output readable joined with
 *   `duplexify`.
 */
export function htmlSelectorChunkedDuplex(
  matcher: (tag: string, attrs:{ [s: string]: string })=>boolean,
  postProcess: (chunk: HTMLChunk)=>string
): Duplex {
  const output = new ExternalReadable();

  // Each completed chunk is transformed and handed to the readable side.
  const emitChunk = (chunk: HTMLChunk): void => {
    output.push(postProcess(chunk));
  };

  const parserSink = new WritableStream({
    ...htmlSelectorChunker(matcher, emitChunk),
    // Parser failures surface as errors on the readable side.
    onerror: (error) => {
      output.emit("error", error);
    },
    // End of input: signal EOF on the readable side, then emit "close".
    onend: () => {
      output.push(null);
      output.emit("close");
    },
  });

  return duplexify(parserSink, output);
}
|