Have you tried using an XML parser instead?
This commit is contained in:
parent
48e92d30f5
commit
c6bbdaa69c
3 changed files with 104 additions and 1 deletions
|
@ -22,6 +22,7 @@
|
|||
"@peertube/http-signature": "^1.7.0",
|
||||
"@projectdysnomia/dysnomia": "github:projectdysnomia/dysnomia#dev",
|
||||
"dumpy": "github:Cynosphere/dumpy.js",
|
||||
"fast-xml-parser": "^5.2.0",
|
||||
"google-images": "^2.1.0",
|
||||
"jimp": "^0.22.7",
|
||||
"murmurhash": "^2.0.1",
|
||||
|
|
16
pnpm-lock.yaml
generated
16
pnpm-lock.yaml
generated
|
@ -20,6 +20,9 @@ importers:
|
|||
dumpy:
|
||||
specifier: github:Cynosphere/dumpy.js
|
||||
version: https://codeload.github.com/Cynosphere/dumpy.js/tar.gz/5fc22353cdcb97084bab572266390e780d9f7a7b(encoding@0.1.13)
|
||||
fast-xml-parser:
|
||||
specifier: ^5.2.0
|
||||
version: 5.2.0
|
||||
google-images:
|
||||
specifier: ^2.1.0
|
||||
version: 2.1.0
|
||||
|
@ -814,6 +817,10 @@ packages:
|
|||
fast-levenshtein@2.0.6:
|
||||
resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==}
|
||||
|
||||
fast-xml-parser@5.2.0:
|
||||
resolution: {integrity: sha512-Uw9+Mjt4SBRud1IcaYuW/O0lW8SKKdMl5g7g24HiIuyH5fQSD+AVLybSlJtqLYEbytVFjWQa5DMGcNgeksdRBg==}
|
||||
hasBin: true
|
||||
|
||||
fastq@1.15.0:
|
||||
resolution: {integrity: sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==}
|
||||
|
||||
|
@ -1445,6 +1452,9 @@ packages:
|
|||
resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
|
||||
engines: {node: '>=8'}
|
||||
|
||||
strnum@2.0.5:
|
||||
resolution: {integrity: sha512-YAT3K/sgpCUxhxNMrrdhtod3jckkpYwH6JAuwmUdXZsmzH1wUyzTMrrK2wYCEEqlKwrWDd35NeuUkbBy/1iK+Q==}
|
||||
|
||||
strtok3@6.3.0:
|
||||
resolution: {integrity: sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==}
|
||||
engines: {node: '>=10'}
|
||||
|
@ -2542,6 +2552,10 @@ snapshots:
|
|||
|
||||
fast-levenshtein@2.0.6: {}
|
||||
|
||||
fast-xml-parser@5.2.0:
|
||||
dependencies:
|
||||
strnum: 2.0.5
|
||||
|
||||
fastq@1.15.0:
|
||||
dependencies:
|
||||
reusify: 1.0.4
|
||||
|
@ -3244,6 +3258,8 @@ snapshots:
|
|||
|
||||
strip-json-comments@3.1.1: {}
|
||||
|
||||
strnum@2.0.5: {}
|
||||
|
||||
strtok3@6.3.0:
|
||||
dependencies:
|
||||
'@tokenizer/token': 0.3.0
|
||||
|
|
|
@ -1,3 +1,5 @@
|
|||
const {XMLParser} = require("fast-xml-parser");
|
||||
|
||||
// https://stackoverflow.com/a/39243641
|
||||
const htmlEntities = {
|
||||
nbsp: " ",
|
||||
|
@ -30,7 +32,91 @@ function parseHtmlEntities(str) {
|
|||
});
|
||||
}
|
||||
|
||||
// Shared XMLParser instance used to turn HTML fragments into an ordered node
// tree. Option semantics are defined by fast-xml-parser; see its documentation
// before changing any of them.
const parser = new XMLParser({
  // Output shape: ordered child arrays, with text always in its own node.
  preserveOrder: true,
  alwaysCreateTextNode: true,
  trimValues: false,

  // Attributes are kept and exposed under "$"-prefixed keys (e.g. "$href").
  ignoreAttributes: false,
  attributeNamePrefix: "$",

  // Entity handling.
  processEntities: true,
  htmlEntities: true,

  // HTML-specific tag handling: tags without a closing tag, and tags whose
  // content should not be parsed as markup.
  unpairedTags: ["hr", "br", "link", "meta"],
  stopNodes: ["*.pre", "*.script", "*.code"],
});
|
||||
|
||||
// Tags whose content is dropped entirely by the generic fallback branch.
const tagBlacklist = ["script", "style"];

/**
 * Recursively renders a parsed node tree as Markdown.
 *
 * Each node is expected to be an object with either a "#text" value or a
 * single tag key holding an array of child nodes; attributes are read from
 * "$"-prefixed keys, either directly on the node or under its ":@" entry.
 *
 * @param {Iterable<object>} tree - Sibling nodes to render, in order.
 * @param {boolean} [images=true] - When false, `img` nodes are not rendered as image links.
 * @param {boolean} [embed=true] - When false, URLs are wrapped in `<...>`.
 * @returns {string} The concatenated Markdown for all nodes in `tree`.
 */
function whenYouWalking(tree, images = true, embed = true) {
  let str = "";

  for (const elem of tree) {
    const text = elem["#text"];

    if (text != null) {
      // Coerce before escaping: a non-string text value (e.g. a number, if the
      // parser converts numeric-looking text) has no `replaceAll`, and a falsy
      // one such as 0 would otherwise reach the fallback branch and be iterated.
      str += String(text)
        .replaceAll(/[\\*\-_~#<[]/g, "\\$&")
        // Break up "||" with a zero-width space.
        .replaceAll("||", "|\u200b|");
    } else if (elem.a) {
      const content = whenYouWalking(elem.a, images, embed);
      const link = elem.$href ?? elem[":@"]?.$href;
      const alt = elem.$alt ?? elem.$title ?? elem[":@"]?.$alt ?? elem[":@"]?.$title;

      if (link == null || !link.startsWith("http")) {
        // Missing or non-http(s) target: keep only the link text.
        str += content;
      } else if (content === link || link.replace(/^https?:\/\//, "") === content) {
        // The visible text is just the URL itself: emit a bare link.
        str += embed ? link : `<${link}>`;
      } else {
        str += `[${content}](${embed ? "" : "<"}${link}${embed ? "" : ">"}${alt != null ? ` "${alt}"` : ""})`;
      }
    } else if (elem.b) {
      str += `**${whenYouWalking(elem.b, images, embed)}**`;
    } else if (elem.blockquote) {
      str += `> ${whenYouWalking(elem.blockquote, images, embed).replaceAll(/\n/g, "\n> ")}`;
    } else if (elem.br) {
      str += `\n${whenYouWalking(elem.br, images, embed)}`;
    } else if (elem.code) {
      str += `\`${whenYouWalking(elem.code, images, embed)}\``;
    } else if (elem.dd) {
      // Indent every line with two ideographic spaces.
      str += `\u3000\u3000${whenYouWalking(elem.dd, images, embed).replaceAll(/\n/g, "\n\u3000\u3000")}`;
    } else if (elem.em) {
      str += `_${whenYouWalking(elem.em, images, embed)}_`;
    } else if (elem.i) {
      str += `_${whenYouWalking(elem.i, images, embed)}_`;
    } else if (elem.img && images) {
      const link = elem.$src ?? elem[":@"]?.$src;
      const alt = elem.$alt ?? elem.$title ?? elem[":@"]?.$alt ?? elem[":@"]?.$title;

      str += `[${alt ?? "<Image>"}](${embed ? "" : "<"}${link}${embed ? "" : ">"})`;
    } else if (elem.ol) {
      str += `1. ${whenYouWalking(elem.ol, images, embed)}`;
    } else if (elem.p) {
      str += `\n${whenYouWalking(elem.p, images, embed)}\n`;
    } else if (elem.pre) {
      str += `\`\`\`\n${whenYouWalking(elem.pre, images, embed)}\`\`\``;
    } else if (elem.s) {
      str += `~~${whenYouWalking(elem.s, images, embed)}~~`;
    } else if (elem.small) {
      // Fixed: this previously recursed into `elem.s`, which is undefined for a
      // `small` node and made the recursive call throw.
      str += `-# ${whenYouWalking(elem.small, images, embed)}`;
    } else if (elem.u) {
      str += `__${whenYouWalking(elem.u, images, embed)}__`;
    } else if (elem.ul) {
      str += `- ${whenYouWalking(elem.ul, images, embed)}`;
    } else {
      // Unknown tag: render the children of the first key that is neither an
      // attribute entry nor a blacklisted tag, without any wrapper markup.
      for (const [key, val] of Object.entries(elem)) {
        if (key !== ":@" && !key.startsWith("$") && !tagBlacklist.includes(key)) {
          str += whenYouWalking(val, images, embed);
          break;
        }
      }
    }
  }

  return str;
}
|
||||
|
||||
/**
 * Converts an HTML string to Markdown.
 *
 * Input that does not already start with "<body>" is wrapped in a body
 * element so the parsed result always has a single `body` root to walk.
 *
 * @param {string} str - HTML source.
 * @param {boolean} [images=true] - Forwarded to the tree walker.
 * @param {boolean} [embed=true] - Forwarded to the tree walker.
 * @returns {string} The Markdown produced by walking the parsed body.
 */
function htmlToMarkdown(str, images = true, embed = true) {
  const html = str.startsWith("<body>") ? str : `<body>${str}</body>`;
  const bodyChildren = parser.parse(html)[0].body;

  return whenYouWalking(bodyChildren, images, embed);
}
|
||||
|
||||
function htmlToMarkdownOld(str, images = true, embed = true) {
|
||||
str = str.replaceAll("\\", "\\\\");
|
||||
str = str.replace(/<style(\s*[^>]+)?>(.|\n)*?<\/style>/gi, "");
|
||||
str = str.replace(/<a (\s*[^>]+)?href="([^"]+?)"(\s*[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
|
||||
|
@ -89,4 +175,4 @@ function htmlToMarkdown(str, images = true, embed = true) {
|
|||
return str;
|
||||
}
|
||||
|
||||
module.exports = {parseHtmlEntities, htmlToMarkdown};
|
||||
module.exports = {parseHtmlEntities, htmlToMarkdown, htmlToMarkdownOld};
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue