// base-data-manager/data-export/html.ts
//
// 152 lines
// 3.8 KiB
// TypeScript

import { strict as assert } from "node:assert";
import { Parser as HTMLParser2 } from "htmlparser2";
import { WritableStream } from "htmlparser2/WritableStream";
import { Duplex, Readable, Writable } from 'node:stream';
import duplexify from "duplexify";
// Aliases derived from the htmlparser2 `Parser` constructor signature so this
// file does not have to restate those types by hand.
// First constructor parameter (the handler-callbacks object built further down).
type HTMLParser2CBs = ConstructorParameters<typeof HTMLParser2>[0];
// Second constructor parameter (parser options, going by the alias name).
type HTMLParser2Opts = ConstructorParameters<typeof HTMLParser2>[1];
// The full constructor argument tuple.
type HTMLParser2Args = ConstructorParameters<typeof HTMLParser2>;
/**
 * Tag names that are emitted as self-closing (`<br />`) by `openTag` and for
 * which `closeTag` produces no closing tag. Kept in alphabetical order.
 */
const htmlVoidElements = [
  'area', 'base', 'basefont', 'bgsound', 'br', 'col',
  'command', 'embed', 'frame', 'hr', 'image', 'img',
  'input', 'isindex', 'keygen', 'link', 'menuitem', 'meta',
  'nextid', 'param', 'source', 'track', 'wbr',
];
/**
 * Serializes an opening tag.
 *
 * @param tagName Element name, emitted verbatim.
 * @param attributes Attribute name/value pairs, emitted in entry order as
 *   `name="value"`.
 * @returns The tag text, e.g. `<a href="x">`, or `<br />` for void elements.
 */
export function openTag(tagName: string, attributes: {[k: string]: string}) {
  // Everything that goes between "<" and ">" (or " />"), joined by single spaces.
  const pieces: string[] = [tagName];
  for (const [name, value] of Object.entries(attributes)) {
    // Values are always written inside double quotes. A value that came from a
    // single-quoted attribute may itself contain a double quote (markdown-it
    // produces this for a " in an alt text), so those are escaped.
    const escaped = value.replace(/"/g, '&quot;');
    pieces.push(`${name}="${escaped}"`);
  }
  // Void elements are self-closed because JSX requires every tag to be
  // closed; a bare HTML <br> is not acceptable there.
  const terminator = htmlVoidElements.includes(tagName) ? ' />' : '>';
  return `<${pieces.join(' ')}${terminator}`;
}
/**
 * Serializes a closing tag.
 *
 * @param tagName Element name.
 * @returns `</tagName>`, or the empty string for void elements, which have no
 *   closing tag.
 */
export function closeTag(tagName: string) {
  return htmlVoidElements.includes(tagName) ? '' : `</${tagName}>`;
}
/**
 * Content collected for one matched element; this is what the chunker hands to
 * its callback.
 */
interface HTMLChunk {
/** Text accumulated from inside the matched element. */
innerText: string;
/** Markup accumulated from inside the matched element. */
innerHTML: string;
}
/**
 * Builds htmlparser2 handler callbacks that collect the contents of every
 * element accepted by `matcher` and pass each finished element to `cb`.
 *
 * For a matched element the chunk contains:
 * - `innerText`: all text inside the element, with a "\n" added for every
 *   `<br>` encountered.
 * - `innerHTML`: the serialized contents of the element. Descendant tags are
 *   written with `openTag`/`closeTag` and text is included with `&`, `<` and
 *   `>` escaped. The matched element's own tags are not part of it.
 *
 * The object passed to `cb` also carries the matched element's `tag` and
 * `attrs` and a `marked: true` flag.
 *
 * State is kept only while inside a matched element, so memory use is bounded
 * by the nesting depth of that element rather than by the size of the
 * document.
 *
 * @param matcher Called once for every opening tag; return true to start
 *   collecting that element.
 * @param cb Called when a matched element is closed.
 * @returns Callbacks (`onopentag`, `ontext`, `onclosetag`) for the parser.
 * @throws AssertionError (from the callbacks) if `matcher` accepts an element
 *   nested inside an element that is already being collected.
 */
function htmlSelectorChunker(matcher: (tag: string, attrs:{ [s: string]: string })=>boolean, cb: (chunk: HTMLChunk)=>void): HTMLParser2CBs {
  // The element currently being collected, if any.
  interface MarkedState extends HTMLChunk {
    tag: string;
    attrs: { [s: string]: string };
    marked: true;
  }
  let active: MarkedState | undefined;

  // Names of the elements currently open inside the collected region; the
  // first entry is the matched element itself. Empty when nothing is active.
  const openTags: string[] = [];

  // Elements whose text content is raw text in HTML and must not be escaped
  // when written back out.
  const rawTextTags = new Set(["script", "style"]);

  // The parser hands over text with entities already decoded (its default),
  // so markup-significant characters are re-escaped for innerHTML.
  const escapeText = (text: string): string =>
    text.replace(/&/g, "&amp;").replace(/</g, "&lt;").replace(/>/g, "&gt;");

  const htmlParser2CBs: HTMLParser2CBs = {
    onopentag(tag: string, attrs: { [s: string]: string }) {
      // The matcher is consulted for every tag, also inside an active
      // element, so that nested matches are detected instead of ignored.
      const matched = matcher(tag, attrs);
      const current = active;
      if (current) {
        assert(!matched, "Nested tag marking encountered, not implemented/no sane implementation");
        openTags.push(tag);
        current.innerHTML += openTag(tag, attrs);
        if (tag === "br") {
          current.innerText += "\n";
        }
        return;
      }
      if (matched) {
        active = { tag, attrs, marked: true, innerText: "", innerHTML: "" };
        openTags.push(tag);
      }
    },
    ontext(text: string) {
      const current = active;
      if (!current) {
        return; // text outside any matched element is not collected
      }
      current.innerText += text;
      const parent = openTags[openTags.length - 1];
      const isRawText = parent !== undefined && rawTextTags.has(parent);
      current.innerHTML += isRawText ? text : escapeText(text);
    },
    onclosetag(tag: string) {
      const current = active;
      if (!current) {
        return;
      }
      openTags.pop();
      if (openTags.length > 0) {
        // A descendant closed; the matched element is still open.
        current.innerHTML += closeTag(tag);
        return;
      }
      // The matched element itself closed. Reset state before calling out so
      // a throwing callback cannot leave the chunker stuck in the active state.
      active = undefined;
      cb(current);
    },
  };
  return htmlParser2CBs;
}
/**
 * A Readable whose data is supplied from the outside by calling `push()` on
 * the instance, rather than being produced on demand.
 */
class ExternalReadable extends Readable {
  _read(): void {
    // Deliberately a no-op: there is nothing to fetch when the consumer asks
    // for more; data arrives whenever the owner of this stream pushes it.
  }
}
/**
 * Creates a Duplex stream: HTML is written in, and for every element accepted
 * by `matcher` the string returned by `postProcess` is pushed to the readable
 * side.
 *
 * @param matcher Decides, per opening tag, whether that element is collected.
 * @param postProcess Turns a collected chunk into the string to emit.
 * @returns The writable parser stream and the readable output stream combined
 *   with `duplexify`.
 */
export function htmlSelectorChunkedDuplex(
  matcher: (tag: string, attrs:{ [s: string]: string })=>boolean,
  postProcess: (chunk: HTMLChunk)=>string
): Duplex {
  const output = new ExternalReadable();

  // Every finished chunk is post-processed and pushed to the output side.
  const emitChunk = (chunk: HTMLChunk): void => {
    output.push(postProcess(chunk));
  };

  const handlers = {
    ...htmlSelectorChunker(matcher, emitChunk),
    // Parser errors are re-emitted on the output side.
    onerror(error: Error) {
      output.emit("error", error);
    },
    // End of input: signal end-of-data on the output side, then emit "close".
    onend() {
      output.push(null);
      output.emit("close");
    },
  };

  const input = new WritableStream(handlers);
  return duplexify(input, output);
}