From 1e5c56c0d652ab4e663d0af7763a3d051f17893e Mon Sep 17 00:00:00 2001
From: Cynthia Foxwell
Date: Fri, 28 Mar 2025 09:32:48 -0600
Subject: [PATCH] html to markdown fixes

---
 src/util/html.js | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/src/util/html.js b/src/util/html.js
index 85bb5ec..092a0f0 100644
--- a/src/util/html.js
+++ b/src/util/html.js
@@ -32,14 +32,14 @@ function parseHtmlEntities(str) {
 
 function htmlToMarkdown(str, images = true, embed = true) {
   str = str.replaceAll("\\", "\\\\");
-  str = str.replace(/<style(\s+[^>]+)?>(.|\n)*?<\/style>/gi, "");
-  str = str.replace(/<a(\s+[^>]+)?href="([^"]+?)"(\s+[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
+  str = str.replace(/<style(\s*[^>]+)?>(.|\n)*?<\/style>/gi, "");
+  str = str.replace(/<a(\s*[^>]+)?href="([^"]+?)"(\s*[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
     url = url.replace(/^\/\//, "https://").replace("\\#", "#");
     return url == text ? url : `[${text}](${embed ? "" : "<"}${url}${embed ? "" : ">"})`;
   });
   if (images)
     str = str.replace(
-      /<img(\s+[^>]+)?src="([^"]+?)"(\s+[^>]+)?(alt|title)="([^"]+?)"(\s+[^>]+)?\/>/gi,
+      /<img(\s*[^>]+)?src="([^"]+?)"(\s*[^>]+)?(alt|title)="([^"]+?)"(\s*[^>]+)?\/>/gi,
       `[$5](${embed ? "" : "<"}$2${embed ? "" : ">"})`
     );
   str = str.replace(/<\/?\s*br\s*\/?>/gi, "\n");
@@ -49,7 +49,7 @@ function htmlToMarkdown(str, images = true, embed = true) {
   );
   str = str.replace(/<\/?p>/gi, "\n");
   str = str.replace(/<dd>((.|\n)*?)<\/dd>/gi, (_, inner) => "\u3000\u3000" + inner.split("\n").join("\n\u3000\u3000"));
-  str = str.replace(/<ol(\s+[^>]+)?>((.|\n)*?)<\/ol>/gi, (_, __, inner) => {
+  str = str.replace(/<ol(\s*[^>]+)?>((.|\n)*?)<\/ol>/gi, (_, __, inner) => {
     let index = 0;
     return inner
       .replace(/<li>/gi, () => {
@@ -59,7 +59,7 @@ function htmlToMarkdown(str, images = true, embed = true) {
       .replace(/<\/li>/gi, "\n")
       .replaceAll("\n\n", "\n");
   });
-  str = str.replace(/<ul(\s+[^>]+)?>((.|\n)*?)<\/ul>/gi, (_, __, inner) => {
+  str = str.replace(/<ul(\s*[^>]+)?>((.|\n)*?)<\/ul>/gi, (_, __, inner) => {
     let index = 0;
     return inner
       .replace(/<li>/gi, () => {
@@ -69,17 +69,17 @@ function htmlToMarkdown(str, images = true, embed = true) {
       .replace(/<\/li>/gi, "\n")
       .replaceAll("\n\n", "\n");
   });
-  str = str.replace(/<\/?code(\s+[^>]+)?>/gi, "`");
-  str = str.replace(/<\/?em(\s+[^>]+)?>/gi, "_");
-  str = str.replace(/<\/?i(\s+[^>]+)?>/gi, "_");
-  str = str.replace(/<\/?b(\s+[^>]+)?>/gi, "**");
-  str = str.replace(/<\/?u(\s+[^>]+)?>/gi, "__");
-  str = str.replace(/<\/?s(\s+[^>]+)?>/gi, "~~");
-  str = str.replace(/<h1(\s+[^>]+)?>/gi, "# ");
-  str = str.replace(/<h2(\s+[^>]+)?>/gi, "## ");
-  str = str.replace(/<h3(\s+[^>]+)?>/gi, "### ");
-  str = str.replace(/<\/?h4(\s+[^>]+)?>/gi, "**");
-  str = str.replace(/<(math|noscript)(\s+[^>]+)?>((.|\n)*?)<\/(math|noscript)>/gi, "");
+  str = str.replace(/<\/?code(\s*[^>]+)?>/gi, "`");
+  str = str.replace(/<\/?em(\s*[^>]+)?>/gi, "_");
+  str = str.replace(/<\/?i(\s*[^>]+)?>/gi, "_");
+  str = str.replace(/<\/?b(\s*[^>]+)?>/gi, "**");
+  str = str.replace(/<\/?u(\s*[^>]+)?>/gi, "__");
+  str = str.replace(/<\/?s(\s*[^>]+)?>/gi, "~~");
+  str = str.replace(/<h1(\s*[^>]+)?>/gi, "# ");
+  str = str.replace(/<h2(\s*[^>]+)?>/gi, "## ");
+  str = str.replace(/<h3(\s*[^>]+)?>/gi, "### ");
+  str = str.replace(/<\/?h4(\s*[^>]+)?>/gi, "**");
+  str = str.replace(/<(math|noscript)(\s*[^>]+)?>((.|\n)*?)<\/(math|noscript)>/gi, "");
   str = str.replace(/<[^>]+?>/gi, "");
   str = parseHtmlEntities(str);
   // whyyyyyyyyyyyy