diff --git a/package.json b/package.json index 649fac0..b3a2db6 100644 --- a/package.json +++ b/package.json @@ -22,6 +22,7 @@ "@peertube/http-signature": "^1.7.0", "@projectdysnomia/dysnomia": "github:projectdysnomia/dysnomia#dev", "dumpy": "github:Cynosphere/dumpy.js", + "fast-xml-parser": "^5.2.0", "google-images": "^2.1.0", "jimp": "^0.22.7", "murmurhash": "^2.0.1", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index c803142..e8963a9 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -20,6 +20,9 @@ importers: dumpy: specifier: github:Cynosphere/dumpy.js version: https://codeload.github.com/Cynosphere/dumpy.js/tar.gz/5fc22353cdcb97084bab572266390e780d9f7a7b(encoding@0.1.13) + fast-xml-parser: + specifier: ^5.2.0 + version: 5.2.0 google-images: specifier: ^2.1.0 version: 2.1.0 @@ -814,6 +817,10 @@ packages: fast-levenshtein@2.0.6: resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==} + fast-xml-parser@5.2.0: + resolution: {integrity: sha512-Uw9+Mjt4SBRud1IcaYuW/O0lW8SKKdMl5g7g24HiIuyH5fQSD+AVLybSlJtqLYEbytVFjWQa5DMGcNgeksdRBg==} + hasBin: true + fastq@1.15.0: resolution: {integrity: sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==} @@ -1445,6 +1452,9 @@ packages: resolution: {integrity: sha512-6fPc+R4ihwqP6N/aIv2f1gMH8lOVtWQHoqC4yK6oSDVVocumAsfCqjkXnqiYMhmMwS/mEHLp7Vehlt3ql6lEig==} engines: {node: '>=8'} + strnum@2.0.5: + resolution: {integrity: sha512-YAT3K/sgpCUxhxNMrrdhtod3jckkpYwH6JAuwmUdXZsmzH1wUyzTMrrK2wYCEEqlKwrWDd35NeuUkbBy/1iK+Q==} + strtok3@6.3.0: resolution: {integrity: sha512-fZtbhtvI9I48xDSywd/somNqgUHl2L2cstmXCCif0itOf96jeW18MBSyrLuNicYQVkvpOxkZtkzujiTJ9LW5Jw==} engines: {node: '>=10'} @@ -2542,6 +2552,10 @@ snapshots: fast-levenshtein@2.0.6: {} + fast-xml-parser@5.2.0: + dependencies: + strnum: 2.0.5 + fastq@1.15.0: dependencies: reusify: 1.0.4 @@ -3244,6 +3258,8 @@ snapshots: strip-json-comments@3.1.1: {} + strnum@2.0.5: {} + 
strtok3@6.3.0: dependencies: '@tokenizer/token': 0.3.0
diff --git a/src/util/html.js b/src/util/html.js
index be61121..c54c777 100644
--- a/src/util/html.js
+++ b/src/util/html.js
@@ -1,3 +1,5 @@
+const {XMLParser} = require("fast-xml-parser");
+
 // https://stackoverflow.com/a/39243641
 const htmlEntities = {
   nbsp: " ",
@@ -30,7 +32,126 @@ function parseHtmlEntities(str) {
   });
 }
 
+// Shared parser for HTML fragments. preserveOrder keeps sibling nodes in
+// document order, which is the shape the tree walker below iterates over.
+const parser = new XMLParser({
+  ignoreAttributes: false,
+  preserveOrder: true,
+  unpairedTags: ["hr", "br", "link", "meta"],
+  stopNodes: ["*.pre", "*.script", "*.code"],
+  processEntities: true,
+  htmlEntities: true,
+  attributeNamePrefix: "$",
+  alwaysCreateTextNode: true,
+  // Keep text as strings: by default text such as "42" or "true" is coerced
+  // to a number/boolean, which the walker cannot escape.
+  parseTagValue: false,
+  trimValues: false,
+});
+
+// Tags whose children the walker's fallback branch refuses to descend into.
+const tagBlacklist = ["script", "style"];
+
+/**
+ * Converts a parsed node list into markdown text, recursing into children.
+ *
+ * @param {object[]} tree - Sibling nodes in the parser's preserveOrder shape.
+ * @param {boolean} [images=true] - Whether <img> tags are emitted as links.
+ * @param {boolean} [embed=true] - When false, URLs are wrapped in <...>.
+ * @returns {string} Markdown for every node in tree.
+ */
+function whenYouWalking(tree, images = true, embed = true) {
+  let str = "";
+
+  for (const elem of tree) {
+    const text = elem["#text"];
+    const attrs = elem[":@"];
+
+    if (text != null) {
+      // String() keeps escaping safe even if a non-string text value arrives.
+      str += String(text)
+        .replaceAll(/[\\*\-_~#<[]/g, "\\$&")
+        .replaceAll("||", "|\u200b|");
+    } else if (elem.a) {
+      const content = whenYouWalking(elem.a, images, embed);
+      const link = elem.$href ?? attrs?.$href;
+      const alt = elem.$alt ?? elem.$title ?? attrs?.$alt ?? attrs?.$title;
+
+      if (link == null || !link.startsWith("http")) {
+        str += content;
+      } else if (content === link || link.replace(/^https?:\/\//, "") === content) {
+        str += embed ? link : `<${link}>`;
+      } else {
+        str += `[${content}](${embed ? "" : "<"}${link}${embed ? "" : ">"}${alt != null ? ` "${alt}"` : ""})`;
+      }
+    } else if (elem.b) {
+      str += `**${whenYouWalking(elem.b, images, embed)}**`;
+    } else if (elem.blockquote) {
+      str += `> ${whenYouWalking(elem.blockquote, images, embed).replaceAll(/\n/g, "\n> ")}`;
+    } else if (elem.br) {
+      str += `\n${whenYouWalking(elem.br, images, embed)}`;
+    } else if (elem.code) {
+      str += `\`${whenYouWalking(elem.code, images, embed)}\``;
+    } else if (elem.dd) {
+      str += `\u3000\u3000${whenYouWalking(elem.dd, images, embed).replaceAll(/\n/g, "\n\u3000\u3000")}`;
+    } else if (elem.em) {
+      str += `_${whenYouWalking(elem.em, images, embed)}_`;
+    } else if (elem.i) {
+      str += `_${whenYouWalking(elem.i, images, embed)}_`;
+    } else if (elem.img && images) {
+      const link = elem.$src ?? attrs?.$src;
+      const alt = elem.$alt ?? elem.$title ?? attrs?.$alt ?? attrs?.$title;
+
+      // An <img> without a src has nothing to link to, so emit nothing.
+      if (link != null) {
+        str += `[${alt ?? ""}](${embed ? "" : "<"}${link}${embed ? "" : ">"})`;
+      }
+    } else if (elem.ol) {
+      str += `1. ${whenYouWalking(elem.ol, images, embed)}`;
+    } else if (elem.p) {
+      str += `\n${whenYouWalking(elem.p, images, embed)}\n`;
+    } else if (elem.pre) {
+      str += `\`\`\`\n${whenYouWalking(elem.pre, images, embed)}\`\`\``;
+    } else if (elem.s) {
+      str += `~~${whenYouWalking(elem.s, images, embed)}~~`;
+    } else if (elem.small) {
+      // Must recurse into elem.small; elem.s is undefined for a <small> node.
+      str += `-# ${whenYouWalking(elem.small, images, embed)}`;
+    } else if (elem.u) {
+      str += `__${whenYouWalking(elem.u, images, embed)}__`;
+    } else if (elem.ul) {
+      str += `- ${whenYouWalking(elem.ul, images, embed)}`;
+    } else {
+      // Unhandled tag: descend into its children unless it is blacklisted.
+      for (const [key, val] of Object.entries(elem)) {
+        if (key !== ":@" && !key.startsWith("$") && !tagBlacklist.includes(key)) {
+          str += whenYouWalking(val, images, embed);
+          break;
+        }
+      }
+    }
+  }
+
+  return str;
+}
+
+/**
+ * Converts an HTML fragment to markdown via the XML parser and tree walker.
+ *
+ * @param {string} str - HTML source; wrapped in <body> unless it starts with it.
+ * @param {boolean} [images=true] - Passed through to the walker.
+ * @param {boolean} [embed=true] - Passed through to the walker.
+ * @returns {string} Markdown text.
+ */
 function htmlToMarkdown(str, images = true, embed = true) {
+  // Wrap bare fragments so the parse result always has a body node to read.
+  if (!str.startsWith("<body>")) str = `<body>${str}</body>`;
+  const tree = parser.parse(str)[0].body;
+
+  return whenYouWalking(tree, images, embed);
+}
+
+function htmlToMarkdownOld(str, images = true, embed = true) {
   str = str.replaceAll("\\", "\\\\");
   str = str.replace(/]+)?>(.|\n)*?<\/style>/gi, "");
   str = str.replace(/]+)?href="([^"]+?)"(\s*[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
@@ -89,4 +210,4 @@ function htmlToMarkdown(str, images = true, embed = true) {
   return str;
 }
 
-module.exports = {parseHtmlEntities, htmlToMarkdown};
+module.exports = {parseHtmlEntities, htmlToMarkdown, htmlToMarkdownOld};