Have you tried using an XML parser instead?

This commit is contained in:
Cynthia Foxwell 2025-04-05 10:39:37 -06:00
parent 48e92d30f5
commit c6bbdaa69c
Signed by: Cynosphere
SSH key fingerprint: SHA256:H3SM8ufP/uxqLwKSH7xY89TDnbR9uOHzjLoBr0tlajk
3 changed files with 104 additions and 1 deletions

View file

@ -22,6 +22,7 @@
"@peertube/http-signature": "^1.7.0",
"@projectdysnomia/dysnomia": "github:projectdysnomia/dysnomia#dev",
"dumpy": "github:Cynosphere/dumpy.js",
"fast-xml-parser": "^5.2.0",
"google-images": "^2.1.0",
"jimp": "^0.22.7",
"murmurhash": "^2.0.1",

16
pnpm-lock.yaml generated
View file

@ -20,6 +20,9 @@ importers:
dumpy:
specifier: github:Cynosphere/dumpy.js
version: https://codeload.github.com/Cynosphere/dumpy.js/tar.gz/5fc22353cdcb97084bab572266390e780d9f7a7b(encoding@0.1.13)
fast-xml-parser:
specifier: ^5.2.0
version: 5.2.0
google-images:
specifier: ^2.1.0
version: 2.1.0
@ -814,6 +817,10 @@ packages:
fast-levenshtein@2.0.6:
resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==}
fast-xml-parser@5.2.0:
resolution: {integrity: sha512-Uw9+Mjt4SBRud1IcaYuW/O0lW8SKKdMl5g7g24HiIuyH5fQSD+AVLybSlJtqLYEbytVFjWQa5DMGcNgeksdRBg==}
hasBin: true
fastq@1.15.0:
resolution: {integrity: sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==}
@ -1445,6 +1452,9 @@ packages:
resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
engines: {node: '>=8'}
strnum@2.0.5:
resolution: {integrity: sha512-YAT3K/sgpCUxhxNMrrdhtod3jckkpYwH6JAuwmUdXZsmzH1wUyzTMrrK2wYCEEqlKwrWDd35NeuUkbBy/1iK+Q==}
strtok3@6.3.0:
resolution: {integrity: sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==}
engines: {node: '>=10'}
@ -2542,6 +2552,10 @@ snapshots:
fast-levenshtein@2.0.6: {}
fast-xml-parser@5.2.0:
dependencies:
strnum: 2.0.5
fastq@1.15.0:
dependencies:
reusify: 1.0.4
@ -3244,6 +3258,8 @@ snapshots:
strip-json-comments@3.1.1: {}
strnum@2.0.5: {}
strtok3@6.3.0:
dependencies:
'@tokenizer/token': 0.3.0

View file

@ -1,3 +1,5 @@
const {XMLParser} = require("fast-xml-parser");
// https://stackoverflow.com/a/39243641
const htmlEntities = {
nbsp: " ",
@ -30,7 +32,91 @@ function parseHtmlEntities(str) {
});
}
// Shared parser used by htmlToMarkdown to turn an HTML fragment into the
// ordered tree that whenYouWalking consumes.
const parser = new XMLParser({
  // Attributes are needed for links and images (href/src/alt/title).
  ignoreAttributes: false,
  // Produce an ordered array of nodes so text and child tags keep their
  // document order; attributes end up under the ":@" key of each node.
  preserveOrder: true,
  // HTML void elements that never have a closing tag.
  unpairedTags: ["hr", "br", "link", "meta"],
  // Contents of these tags are kept as raw text instead of being parsed.
  stopNodes: ["*.pre", "*.script", "*.code"],
  processEntities: true,
  htmlEntities: true,
  // whenYouWalking looks attributes up as $href, $src, $alt and $title.
  attributeNamePrefix: "$",
  alwaysCreateTextNode: true,
  // Whitespace is significant when rebuilding the text as Markdown.
  trimValues: false,
  // Keep text content as strings. With the default (true) text such as
  // "123", "true" or "0x10" is converted to a number/boolean, which breaks
  // the string handling in whenYouWalking and alters the rendered text.
  parseTagValue: false,
});

// Tags whose contents must never be rendered by whenYouWalking's fallback
// branch for otherwise unhandled elements.
const tagBlacklist = ["script", "style"];
/**
 * Recursively renders a parsed HTML tree as Discord-style Markdown.
 *
 * `tree` is the ordered node array produced by this module's XMLParser
 * (`preserveOrder: true`): every node carries one tag key whose value is the
 * array of child nodes, or a `#text` key holding text content, plus an
 * optional `":@"` object with `$`-prefixed attributes.
 *
 * @param {Array<object>} tree - Ordered nodes to render.
 * @param {boolean} [images=true] - Render `<img>` as a Markdown link. When
 *   false, images are treated like any other unhandled tag.
 * @param {boolean} [embed=true] - When false, URLs are wrapped in `<...>`
 *   (presumably to suppress link previews - confirm with callers).
 * @returns {string} The Markdown representation of `tree`.
 */
function whenYouWalking(tree, images = true, embed = true) {
  let str = "";

  for (const elem of tree) {
    const text = elem["#text"];
    const attrs = elem[":@"];

    if (text != null) {
      // The parser may hand back numbers/booleans for text such as "123" or
      // "false"; coerce so falsy or non-string values neither crash nor get
      // dropped. Markdown control characters are escaped and "||" is split
      // with a zero-width space so it cannot open a spoiler.
      str += String(text).replaceAll(/[\\*\-_~#<[]/g, "\\$&").replaceAll("||", "|\u200b|");
    } else if (elem.a) {
      const content = whenYouWalking(elem.a, images, embed);
      const link = elem.$href ?? attrs?.$href;
      const alt = elem.$alt ?? elem.$title ?? attrs?.$alt ?? attrs?.$title;

      if (typeof link !== "string" || !link.startsWith("http")) {
        // Missing, non-string or non-http(s) targets: keep only the label.
        str += content;
      } else if (content === link || link.replace(/^https?:\/\//, "") === content) {
        // The label is just the URL itself; emit a bare link.
        str += embed ? link : `<${link}>`;
      } else {
        str += `[${content}](${embed ? "" : "<"}${link}${embed ? "" : ">"}${alt != null ? ` "${alt}"` : ""})`;
      }
    } else if (elem.b) {
      str += `**${whenYouWalking(elem.b, images, embed)}**`;
    } else if (elem.blockquote) {
      str += `> ${whenYouWalking(elem.blockquote, images, embed).replaceAll(/\n/g, "\n> ")}`;
    } else if (elem.br) {
      str += `\n${whenYouWalking(elem.br, images, embed)}`;
    } else if (elem.code) {
      str += `\`${whenYouWalking(elem.code, images, embed)}\``;
    } else if (elem.dd) {
      // Definition descriptions are indented with two ideographic spaces.
      str += `\u3000\u3000${whenYouWalking(elem.dd, images, embed).replaceAll(/\n/g, "\n\u3000\u3000")}`;
    } else if (elem.em) {
      str += `_${whenYouWalking(elem.em, images, embed)}_`;
    } else if (elem.i) {
      str += `_${whenYouWalking(elem.i, images, embed)}_`;
    } else if (elem.img && images) {
      const link = elem.$src ?? attrs?.$src;
      const alt = elem.$alt ?? elem.$title ?? attrs?.$alt ?? attrs?.$title;
      str += `[${alt ?? "<Image>"}](${embed ? "" : "<"}${link}${embed ? "" : ">"})`;
    } else if (elem.ol) {
      str += `1. ${whenYouWalking(elem.ol, images, embed)}`;
    } else if (elem.p) {
      str += `\n${whenYouWalking(elem.p, images, embed)}\n`;
    } else if (elem.pre) {
      str += `\`\`\`\n${whenYouWalking(elem.pre, images, embed)}\`\`\``;
    } else if (elem.s) {
      str += `~~${whenYouWalking(elem.s, images, embed)}~~`;
    } else if (elem.small) {
      // Walk the <small> children themselves (previously read elem.s, which
      // is undefined on a <small> node and made the recursion throw).
      str += `-# ${whenYouWalking(elem.small, images, embed)}`;
    } else if (elem.u) {
      str += `__${whenYouWalking(elem.u, images, embed)}__`;
    } else if (elem.ul) {
      str += `- ${whenYouWalking(elem.ul, images, embed)}`;
    } else {
      // Unhandled tag: render its children transparently unless the tag is
      // blacklisted. Only the first non-attribute key is considered.
      for (const [key, val] of Object.entries(elem)) {
        if (key !== ":@" && !key.startsWith("$") && !tagBlacklist.includes(key)) {
          str += whenYouWalking(val, images, embed);
          break;
        }
      }
    }
  }

  return str;
}
/**
 * Converts an HTML fragment to Markdown by parsing it and walking the tree.
 *
 * @param {string} str - HTML to convert; wrapped in `<body>` unless it
 *   already starts with that tag, so the parser always yields one root.
 * @param {boolean} [images=true] - Forwarded to whenYouWalking.
 * @param {boolean} [embed=true] - Forwarded to whenYouWalking.
 * @returns {string} The Markdown produced by whenYouWalking.
 */
function htmlToMarkdown(str, images = true, embed = true) {
  const wrapped = str.startsWith("<body>") ? str : `<body>${str}</body>`;
  const parsed = parser.parse(wrapped);
  const root = parsed[0];
  return whenYouWalking(root.body, images, embed);
}
function htmlToMarkdownOld(str, images = true, embed = true) {
str = str.replaceAll("\\", "\\\\");
str = str.replace(/<style(\s*[^>]+)?>(.|\n)*?<\/style>/gi, "");
str = str.replace(/<a (\s*[^>]+)?href="([^"]+?)"(\s*[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
@ -89,4 +175,4 @@ function htmlToMarkdown(str, images = true, embed = true) {
return str;
}
module.exports = {parseHtmlEntities, htmlToMarkdown};
module.exports = {parseHtmlEntities, htmlToMarkdown, htmlToMarkdownOld};