Have you tried using an XML parser instead?
This commit is contained in:
parent
48e92d30f5
commit
c6bbdaa69c
3 changed files with 104 additions and 1 deletions
|
@ -22,6 +22,7 @@
|
||||||
"@peertube/http-signature": "^1.7.0",
|
"@peertube/http-signature": "^1.7.0",
|
||||||
"@projectdysnomia/dysnomia": "github:projectdysnomia/dysnomia#dev",
|
"@projectdysnomia/dysnomia": "github:projectdysnomia/dysnomia#dev",
|
||||||
"dumpy": "github:Cynosphere/dumpy.js",
|
"dumpy": "github:Cynosphere/dumpy.js",
|
||||||
|
"fast-xml-parser": "^5.2.0",
|
||||||
"google-images": "^2.1.0",
|
"google-images": "^2.1.0",
|
||||||
"jimp": "^0.22.7",
|
"jimp": "^0.22.7",
|
||||||
"murmurhash": "^2.0.1",
|
"murmurhash": "^2.0.1",
|
||||||
|
|
16
pnpm-lock.yaml
generated
16
pnpm-lock.yaml
generated
|
@ -20,6 +20,9 @@ importers:
|
||||||
dumpy:
|
dumpy:
|
||||||
specifier: github:Cynosphere/dumpy.js
|
specifier: github:Cynosphere/dumpy.js
|
||||||
version: https://codeload.github.com/Cynosphere/dumpy.js/tar.gz/5fc22353cdcb97084bab572266390e780d9f7a7b(encoding@0.1.13)
|
version: https://codeload.github.com/Cynosphere/dumpy.js/tar.gz/5fc22353cdcb97084bab572266390e780d9f7a7b(encoding@0.1.13)
|
||||||
|
fast-xml-parser:
|
||||||
|
specifier: ^5.2.0
|
||||||
|
version: 5.2.0
|
||||||
google-images:
|
google-images:
|
||||||
specifier: ^2.1.0
|
specifier: ^2.1.0
|
||||||
version: 2.1.0
|
version: 2.1.0
|
||||||
|
@ -814,6 +817,10 @@ packages:
|
||||||
fast-levenshtein@2.0.6:
|
fast-levenshtein@2.0.6:
|
||||||
resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==}
|
resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==}
|
||||||
|
|
||||||
|
fast-xml-parser@5.2.0:
|
||||||
|
resolution: {integrity: sha512-Uw9+Mjt4SBRud1IcaYuW/O0lW8SKKdMl5g7g24HiIuyH5fQSD+AVLybSlJtqLYEbytVFjWQa5DMGcNgeksdRBg==}
|
||||||
|
hasBin: true
|
||||||
|
|
||||||
fastq@1.15.0:
|
fastq@1.15.0:
|
||||||
resolution: {integrity: sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==}
|
resolution: {integrity: sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==}
|
||||||
|
|
||||||
|
@ -1445,6 +1452,9 @@ packages:
|
||||||
resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
|
resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==}
|
||||||
engines: {node: '>=8'}
|
engines: {node: '>=8'}
|
||||||
|
|
||||||
|
strnum@2.0.5:
|
||||||
|
resolution: {integrity: sha512-YAT3K/sgpCUxhxNMrrdhtod3jckkpYwH6JAuwmUdXZsmzH1wUyzTMrrK2wYCEEqlKwrWDd35NeuUkbBy/1iK+Q==}
|
||||||
|
|
||||||
strtok3@6.3.0:
|
strtok3@6.3.0:
|
||||||
resolution: {integrity: sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==}
|
resolution: {integrity: sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==}
|
||||||
engines: {node: '>=10'}
|
engines: {node: '>=10'}
|
||||||
|
@ -2542,6 +2552,10 @@ snapshots:
|
||||||
|
|
||||||
fast-levenshtein@2.0.6: {}
|
fast-levenshtein@2.0.6: {}
|
||||||
|
|
||||||
|
fast-xml-parser@5.2.0:
|
||||||
|
dependencies:
|
||||||
|
strnum: 2.0.5
|
||||||
|
|
||||||
fastq@1.15.0:
|
fastq@1.15.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
reusify: 1.0.4
|
reusify: 1.0.4
|
||||||
|
@ -3244,6 +3258,8 @@ snapshots:
|
||||||
|
|
||||||
strip-json-comments@3.1.1: {}
|
strip-json-comments@3.1.1: {}
|
||||||
|
|
||||||
|
strnum@2.0.5: {}
|
||||||
|
|
||||||
strtok3@6.3.0:
|
strtok3@6.3.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
'@tokenizer/token': 0.3.0
|
'@tokenizer/token': 0.3.0
|
||||||
|
|
|
@ -1,3 +1,5 @@
|
||||||
|
const {XMLParser} = require("fast-xml-parser");
|
||||||
|
|
||||||
// https://stackoverflow.com/a/39243641
|
// https://stackoverflow.com/a/39243641
|
||||||
const htmlEntities = {
|
const htmlEntities = {
|
||||||
nbsp: " ",
|
nbsp: " ",
|
||||||
|
@ -30,7 +32,91 @@ function parseHtmlEntities(str) {
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Shared fast-xml-parser instance used to read HTML fragments as an ordered
// tree. With `preserveOrder`, every node is an object holding one tag key whose
// value is the array of children, plus an optional ":@" object for attributes
// (attribute names are prefixed with "$").
const parser = new XMLParser({
  ignoreAttributes: false,
  preserveOrder: true,
  // HTML void elements never have a closing tag. Any of them missing from this
  // list would swallow the siblings that follow it as children when written
  // without a trailing slash (e.g. `<img src="...">`).
  unpairedTags: [
    "area",
    "base",
    "br",
    "col",
    "embed",
    "hr",
    "img",
    "input",
    "link",
    "meta",
    "source",
    "track",
    "wbr",
  ],
  // Content of these tags is kept as raw text instead of being parsed.
  stopNodes: ["*.pre", "*.script", "*.code"],
  processEntities: true,
  htmlEntities: true,
  attributeNamePrefix: "$",
  alwaysCreateTextNode: true,
  // Keep text nodes as strings. The default (true) turns text such as "2024",
  // "007" or "true" into numbers/booleans, which loses the original formatting
  // and breaks consumers that call string methods on "#text".
  parseTagValue: false,
  // Whitespace is significant when the text is re-emitted as Markdown.
  trimValues: false,
});
|
||||||
|
|
||||||
|
// Tags whose content must never reach the Markdown output.
const tagBlacklist = ["script", "style"];

// Tags rendered by wrapping their content in the same delimiter on both sides.
// A Map is used so odd tag names such as "constructor" cannot hit Object.prototype.
const symmetricWrappers = new Map([
  ["b", "**"],
  ["strong", "**"],
  ["em", "_"],
  ["i", "_"],
  ["s", "~~"],
  ["del", "~~"],
  ["strike", "~~"],
  ["u", "__"],
]);

/**
 * Escapes Markdown control characters in a text node and breaks up "||" so it
 * cannot open a spoiler.
 *
 * @param {*} value - text node value; coerced to a string because parsers may
 *   hand back numbers or booleans for numeric-looking text
 * @returns {string} escaped text
 */
function escapeMarkdownText(value) {
  return String(value)
    .replaceAll(/[\\*\-_~#<[]/g, "\\$&")
    .replaceAll("||", "|\u200b|");
}

/**
 * Returns the key holding a node's children: the first key that is neither the
 * ":@" attribute bag nor a "$"-prefixed attribute.
 *
 * @param {object} elem - tree node
 * @returns {string|undefined} tag name, or undefined when the node has none
 */
function findTagName(elem) {
  return Object.keys(elem).find((key) => key !== ":@" && !key.startsWith("$"));
}

/**
 * Looks up the first present attribute among `names`, checking attributes set
 * directly on the node before those inside its ":@" bag.
 *
 * @param {object} elem - tree node
 * @param {...string} names - attribute names without the "$" prefix
 * @returns {*} attribute value, or undefined when none is present
 */
function readAttribute(elem, ...names) {
  for (const source of [elem, elem[":@"]]) {
    if (source == null) continue;
    for (const name of names) {
      const value = source[`$${name}`];
      if (value != null) return value;
    }
  }
  return undefined;
}

/**
 * Concatenates the unescaped text of a subtree, skipping blacklisted tags.
 *
 * @param {Array} tree - child nodes
 * @returns {string} raw text content
 */
function collectPlainText(tree) {
  if (!Array.isArray(tree)) return "";

  let text = "";
  for (const elem of tree) {
    if (elem == null || typeof elem !== "object") continue;

    if (elem["#text"] != null) {
      text += String(elem["#text"]);
      continue;
    }

    const tag = findTagName(elem);
    if (tag != null && !tagBlacklist.includes(tag)) {
      text += collectPlainText(elem[tag]);
    }
  }
  return text;
}

/**
 * Converts an ordered element tree (objects with one tag key, an optional ":@"
 * attribute bag and "#text" leaves) into Markdown.
 *
 * @param {Array} tree - child nodes to render; anything that is not an array
 *   renders as an empty string
 * @param {boolean} [images=true] - render `img` nodes as image links
 * @param {boolean} [embed=true] - when false, URLs are wrapped in `<...>`
 * @returns {string} Markdown text
 */
function whenYouWalking(tree, images = true, embed = true) {
  // Iterating a string here would recurse forever on its characters.
  if (!Array.isArray(tree)) return "";

  const walk = (children) => whenYouWalking(children, images, embed);
  const wrapTarget = (url) => (embed ? url : `<${url}>`);
  let str = "";

  for (const elem of tree) {
    if (elem == null || typeof elem !== "object") continue;

    const text = elem["#text"];
    if (text != null) {
      str += escapeMarkdownText(text);
      continue;
    }

    const tag = findTagName(elem);
    if (tag == null || tagBlacklist.includes(tag)) continue;

    const children = elem[tag];
    const wrapper = symmetricWrappers.get(tag);
    if (wrapper != null) {
      str += `${wrapper}${walk(children)}${wrapper}`;
      continue;
    }

    switch (tag) {
      case "a": {
        const content = walk(children);
        const link = readAttribute(elem, "href");
        const alt = readAttribute(elem, "alt", "title");
        // Compare against the unescaped text: `content` has Markdown escapes
        // applied, so a URL containing "-", "_", "#" or "~" would never match.
        const plain = collectPlainText(children);

        if (link == null || !String(link).startsWith("http")) {
          str += content;
        } else if (plain === link || link.replace(/^https?:\/\//, "") === plain) {
          str += wrapTarget(link);
        } else {
          str += `[${content}](${wrapTarget(link)}${alt != null ? ` "${alt}"` : ""})`;
        }
        break;
      }
      case "blockquote":
        str += `> ${walk(children).replaceAll(/\n/g, "\n> ")}`;
        break;
      case "br":
        str += `\n${walk(children)}`;
        break;
      case "code":
        // Backslash escapes are rendered literally inside code, so emit raw text.
        str += `\`${collectPlainText(children)}\``;
        break;
      case "dd":
        str += `\u3000\u3000${walk(children).replaceAll(/\n/g, "\n\u3000\u3000")}`;
        break;
      case "img": {
        if (!images) {
          str += walk(children);
          break;
        }
        const link = readAttribute(elem, "src");
        const alt = readAttribute(elem, "alt", "title");
        // An image without a source has nothing to link to.
        if (link != null) {
          str += `[${alt ?? "<Image>"}](${wrapTarget(link)})`;
        }
        break;
      }
      case "ol":
        str += `1. ${walk(children)}`;
        break;
      case "p":
        str += `\n${walk(children)}\n`;
        break;
      case "pre":
        // Same as "code": content of a fenced block must not be escaped.
        str += `\`\`\`\n${collectPlainText(children)}\`\`\``;
        break;
      case "small":
        str += `-# ${walk(children)}`;
        break;
      case "ul":
        str += `- ${walk(children)}`;
        break;
      default:
        // Unknown tags are transparent: only their content is rendered.
        str += walk(children);
        break;
    }
  }

  return str;
}
|
||||||
|
|
||||||
function htmlToMarkdown(str, images = true, embed = true) {
|
/**
 * Converts an HTML fragment to Markdown by parsing it into an ordered tree and
 * walking that tree.
 *
 * @param {string} str - HTML source; wrapped in `<body>` unless it already
 *   starts with that tag, so the parser always yields a single root
 * @param {boolean} [images=true] - forwarded to the tree walker
 * @param {boolean} [embed=true] - forwarded to the tree walker
 * @returns {string} Markdown text
 */
function htmlToMarkdown(str, images = true, embed = true) {
  const wrapped = str.startsWith("<body>") ? str : `<body>${str}</body>`;
  const [root] = parser.parse(wrapped);

  return whenYouWalking(root.body, images, embed);
}
|
||||||
|
|
||||||
|
function htmlToMarkdownOld(str, images = true, embed = true) {
|
||||||
str = str.replaceAll("\\", "\\\\");
|
str = str.replaceAll("\\", "\\\\");
|
||||||
str = str.replace(/<style(\s*[^>]+)?>(.|\n)*?<\/style>/gi, "");
|
str = str.replace(/<style(\s*[^>]+)?>(.|\n)*?<\/style>/gi, "");
|
||||||
str = str.replace(/<a (\s*[^>]+)?href="([^"]+?)"(\s*[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
|
str = str.replace(/<a (\s*[^>]+)?href="([^"]+?)"(\s*[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
|
||||||
|
@ -89,4 +175,4 @@ function htmlToMarkdown(str, images = true, embed = true) {
|
||||||
return str;
|
return str;
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {parseHtmlEntities, htmlToMarkdown};
|
module.exports = {parseHtmlEntities, htmlToMarkdown, htmlToMarkdownOld};
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue