HTML-to-Markdown fixes: relax tag-attribute regexes from `\s+` to `\s*`

This commit is contained in:
Cynthia Foxwell 2025-03-28 09:32:48 -06:00
parent 2388f0cc47
commit 1e5c56c0d6
Signed by: Cynosphere
SSH key fingerprint: SHA256:H3SM8ufP/uxqLwKSH7xY89TDnbR9uOHzjLoBr0tlajk

View file

@ -32,14 +32,14 @@ function parseHtmlEntities(str) {
function htmlToMarkdown(str, images = true, embed = true) {
str = str.replaceAll("\\", "\\\\");
str = str.replace(/<style(\s+[^>]+)?>(.|\n)*?<\/style>/gi, "");
str = str.replace(/<a (\s+[^>]+)?href="([^"]+?)"(\s+[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
str = str.replace(/<style(\s*[^>]+)?>(.|\n)*?<\/style>/gi, "");
str = str.replace(/<a (\s*[^>]+)?href="([^"]+?)"(\s*[^>]+)?>(.+?)<\/a>/gi, (_, __, url, ___, text) => {
url = url.replace(/^\/\//, "https://").replace("\\#", "#");
return url == text ? url : `[${text}](${embed ? "" : "&lt;"}${url}${embed ? "" : "&gt;"})`;
});
if (images)
str = str.replace(
/<img (\s+[^>]+)?src="([^"]+?)"(\s+[^>]+)?(alt|title)="([^"]+?)"(\s+[^>]+)?\/>/gi,
/<img (\s*[^>]+)?src="([^"]+?)"(\s*[^>]+)?(alt|title)="([^"]+?)"(\s*[^>]+)?\/>/gi,
`[$5](${embed ? "" : "&lt;"}$2${embed ? "" : "&gt;"})`
);
str = str.replace(/<\/?\s*br\s*\/?>/gi, "\n");
@ -49,7 +49,7 @@ function htmlToMarkdown(str, images = true, embed = true) {
);
str = str.replace(/<\/?p>/gi, "\n");
str = str.replace(/<dd>((.|\n)*?)<\/dd>/gi, (_, inner) => "\u3000\u3000" + inner.split("\n").join("\n\u3000\u3000"));
str = str.replace(/<ol(\s+[^>]+)?>((.|\n)*?)<\/ol>/gi, (_, __, inner) => {
str = str.replace(/<ol(\s*[^>]+)?>((.|\n)*?)<\/ol>/gi, (_, __, inner) => {
let index = 0;
return inner
.replace(/<li>/gi, () => {
@ -59,7 +59,7 @@ function htmlToMarkdown(str, images = true, embed = true) {
.replace(/<\/li>/gi, "\n")
.replaceAll("\n\n", "\n");
});
str = str.replace(/<ul(\s+[^>]+)?>((.|\n)*?)<\/ul>/gi, (_, __, inner) => {
str = str.replace(/<ul(\s*[^>]+)?>((.|\n)*?)<\/ul>/gi, (_, __, inner) => {
let index = 0;
return inner
.replace(/<li>/gi, () => {
@ -69,17 +69,17 @@ function htmlToMarkdown(str, images = true, embed = true) {
.replace(/<\/li>/gi, "\n")
.replaceAll("\n\n", "\n");
});
str = str.replace(/<\/?code(\s+[^>]+)?>/gi, "`");
str = str.replace(/<\/?em(\s+[^>]+)?>/gi, "_");
str = str.replace(/<\/?i(\s+[^>]+)?>/gi, "_");
str = str.replace(/<\/?b(\s+[^>]+)?>/gi, "**");
str = str.replace(/<\/?u(\s+[^>]+)?>/gi, "__");
str = str.replace(/<\/?s(\s+[^>]+)?>/gi, "~~");
str = str.replace(/<h1(\s+[^>]+)?>/gi, "# ");
str = str.replace(/<h2(\s+[^>]+)?>/gi, "## ");
str = str.replace(/<h3(\s+[^>]+)?>/gi, "### ");
str = str.replace(/<\/?h4(\s+[^>]+)?>/gi, "**");
str = str.replace(/<(math|noscript)(\s+[^>]+)?>((.|\n)*?)<\/(math|noscript)>/gi, "");
str = str.replace(/<\/?code(\s*[^>]+)?>/gi, "`");
str = str.replace(/<\/?em(\s*[^>]+)?>/gi, "_");
str = str.replace(/<\/?i(\s*[^>]+)?>/gi, "_");
str = str.replace(/<\/?b(\s*[^>]+)?>/gi, "**");
str = str.replace(/<\/?u(\s*[^>]+)?>/gi, "__");
str = str.replace(/<\/?s(\s*[^>]+)?>/gi, "~~");
str = str.replace(/<h1(\s*[^>]+)?>/gi, "# ");
str = str.replace(/<h2(\s*[^>]+)?>/gi, "## ");
str = str.replace(/<h3(\s*[^>]+)?>/gi, "### ");
str = str.replace(/<\/?h4(\s*[^>]+)?>/gi, "**");
str = str.replace(/<(math|noscript)(\s*[^>]+)?>((.|\n)*?)<\/(math|noscript)>/gi, "");
str = str.replace(/<[^>]+?>/gi, "");
str = parseHtmlEntities(str);
// whyyyyyyyyyyyy