add turndown for m->d formatting

This commit is contained in:
Cadence Ember 2023-08-26 01:43:17 +12:00
parent 27b8c547e3
commit 8c4e16e255
5 changed files with 297 additions and 16 deletions

View file

@ -2,19 +2,41 @@
const Ty = require("../../types")
const DiscordTypes = require("discord-api-types/v10")
const markdown = require("discord-markdown")
const chunk = require("chunk-text")
const TurndownService = require("turndown")
const passthrough = require("../../passthrough")
const { sync, db, discord } = passthrough
/** @type {import("../../matrix/file")} */
const file = sync.require("../../matrix/file")
/**
 * Upper-case tag names that count as block-level HTML elements when deciding
 * whether a literal newline next to a tag should become a <br>.
 * Reference list in turndown:
 * https://github.com/mixmark-io/turndown/blob/97e4535ca76bb2e70d9caa2aa4d4686956b06d44/src/utilities.js#L26C28-L33C2
 */
const BLOCK_ELEMENTS = [
	"ADDRESS", "ARTICLE", "ASIDE", "AUDIO",
	"BLOCKQUOTE", "BODY",
	"CANVAS", "CENTER",
	"DD", "DETAILS", "DIR", "DIV", "DL", "DT",
	"FIELDSET", "FIGCAPTION", "FIGURE", "FOOTER", "FORM", "FRAMESET",
	"H1", "H2", "H3", "H4", "H5", "H6", "HEADER", "HGROUP", "HR", "HTML",
	"ISINDEX",
	"LI",
	"MAIN", "MENU",
	"NAV", "NOFRAMES", "NOSCRIPT",
	"OL", "OUTPUT",
	"P", "PRE",
	"SECTION", "SUMMARY",
	"TABLE", "TBODY", "TD", "TFOOT", "TH", "THEAD", "TR",
	"UL"
]
// Shared HTML -> markdown converter for Matrix formatted_body content.
// Turndown's defaults target CommonMark; the options below steer it towards
// the subset of markdown that Discord actually renders.
const turndownService = new TurndownService({
	hr: "----",
	// Turndown defaults to setext (underlined) headings, which Discord shows as plain text plus a line of ===/---.
	headingStyle: "atx",
	// Turndown defaults to indented code blocks, which Discord does not render as code; only ``` fences work.
	codeBlockStyle: "fenced"
})

// Core turndown ships no strikethrough rule, so map <del>/<s>/<strike> to ~~text~~ here.
turndownService.addRule("strikethrough", {
	filter: ["del", "s", "strike"],
	replacement: function (content) {
		return "~~" + content + "~~"
	}
})
/**
* @param {Ty.Event.Outer<Ty.Event.M_Room_Message>} event
*/
function eventToMessage(event) {
/** @type {(DiscordTypes.RESTPostAPIWebhookWithTokenJSONBody & {files?: {name: string, file: Buffer}[]})[]} */
const messages = []
let messages = []
let displayName = event.sender
let avatarURL = undefined
@ -24,20 +46,51 @@ function eventToMessage(event) {
// TODO: get the media repo domain and the avatar url from the matrix member event
}
if (event.content.msgtype === "m.text") {
messages.push({
content: event.content.body,
username: displayName,
avatar_url: avatarURL
})
} else if (event.content.msgtype === "m.emote") {
messages.push({
content: `\* _${displayName} ${event.content.body}_`,
username: displayName,
avatar_url: avatarURL
// Convert content depending on what the message is
let content = event.content.body // ultimate fallback
if (event.content.format === "org.matrix.custom.html" && event.content.formatted_body) {
let input = event.content.formatted_body
if (event.content.msgtype === "m.emote") {
input = `* ${displayName} ${input}`
}
// Note: Element's renderers on Web and Android currently collapse whitespace, like the browser does. Turndown also collapses whitespace which is good for me.
// If later I'm using a client that doesn't collapse whitespace and I want turndown to follow suit, uncomment the following line of code, and it Just Works:
// input = input.replace(/ /g, "&nbsp;")
// There is also a corresponding test to uncomment, named "event2message: whitespace is retained"
// The matrix spec hasn't decided whether \n counts as a newline or not, but I'm going to count it, because if it's in the data it's there for a reason.
// But I should not count it if it's between block elements.
input = input.replace(/(<\/?([^ >]+)[^>]*>)?\n(<\/?([^ >]+)[^>]*>)?/g, (whole, beforeContext, beforeTag, afterContext, afterTag) => {
if (typeof beforeTag !== "string" && typeof afterTag !== "string") {
return "<br>"
}
beforeContext = beforeContext || ""
beforeTag = beforeTag || ""
afterContext = afterContext || ""
afterTag = afterTag || ""
if (!BLOCK_ELEMENTS.includes(beforeTag.toUpperCase()) && !BLOCK_ELEMENTS.includes(afterTag.toUpperCase())) {
return beforeContext + "<br>" + afterContext
} else {
return whole
}
})
// @ts-ignore
content = turndownService.turndown(input)
// It's optimised for commonmark, we need to replace the space-space-newline with just newline
content = content.replace(/ \n/g, "\n")
}
// Split into 2000 character chunks
const chunks = chunk(content, 2000)
messages = messages.concat(chunks.map(content => ({
content,
username: displayName,
avatar_url: avatarURL
})))
return messages
}

View file

@ -4,6 +4,12 @@ const {test} = require("supertape")
const {eventToMessage} = require("./event-to-message")
const data = require("../../test/data")
/**
 * Assert that the first message in `a` and the first message in `b` have the same content.
 * Both contents are JSON-encoded before being handed to t.equal, so whitespace
 * such as newlines is spelled out as escape sequences in the compared strings.
 * @param {{equal: (actual: any, expected: any) => any}} t test handle providing `equal`
 * @param {{content?: any}[]} a messages produced by the code under test
 * @param {{content?: any}[]} b expected messages
 */
function sameFirstContentAndWhitespace(t, a, b) {
	const [actual, expected] = [a, b].map(messages => JSON.stringify(messages[0].content))
	t.equal(actual, expected)
}
test("event2message: janky test", t => {
t.deepEqual(
eventToMessage({
@ -28,6 +34,165 @@ test("event2message: janky test", t => {
)
})
// Inline formatting tags (<strong>, <em>, <del>) in formatted_body must come out as markdown,
// and the plain-text body must be ignored when formatted_body is present.
test("event2message: basic html is converted to markdown", t => {
	const messages = eventToMessage({
		content: {
			msgtype: "m.text",
			body: "wrong body",
			format: "org.matrix.custom.html",
			formatted_body: "this <strong>is</strong> a <strong><em>test</em></strong> of <del>formatting</del>"
		},
		event_id: "$g07oYSZFWBkxohNEfywldwgcWj1hbhDzQ1sBAKvqOOU",
		origin_server_ts: 1688301929913,
		room_id: "!kLRqKKUQXcibIMtOpl:cadence.moe",
		sender: "@cadence:cadence.moe",
		type: "m.room.message",
		unsigned: {
			age: 405299
		}
	})

	t.deepEqual(messages, [{
		username: "cadence",
		content: "this **is** a **_test_** of ~~formatting~~",
		avatar_url: undefined
	}])
})
// Literal markdown characters typed in the HTML (asterisks, backslashes) must be escaped
// so they show up verbatim, while real HTML formatting is still converted.
test("event2message: markdown syntax is escaped", t => {
	const messages = eventToMessage({
		content: {
			msgtype: "m.text",
			body: "wrong body",
			format: "org.matrix.custom.html",
			formatted_body: "this **is** an <strong><em>extreme</em></strong> \\*test\\* of"
		},
		event_id: "$g07oYSZFWBkxohNEfywldwgcWj1hbhDzQ1sBAKvqOOU",
		origin_server_ts: 1688301929913,
		room_id: "!kLRqKKUQXcibIMtOpl:cadence.moe",
		sender: "@cadence:cadence.moe",
		type: "m.room.message",
		unsigned: {
			age: 405299
		}
	})

	t.deepEqual(messages, [{
		username: "cadence",
		content: "this \\*\\*is\\*\\* an **_extreme_** \\\\\\*test\\\\\\* of",
		avatar_url: undefined
	}])
})
// Mixes <br>, literal \n and <p> boundaries: every line break inside text must survive as a newline,
// and paragraph boundaries must become a blank line.
test("event2message: html lines are bridged correctly", t => {
	const messages = eventToMessage({
		content: {
			msgtype: "m.text",
			body: "wrong body",
			format: "org.matrix.custom.html",
			formatted_body: "<p>paragraph one<br>line <em>two</em><br>line three<br><br>paragraph two\nline <em>two</em>\nline three\n\nparagraph three</p><p>paragraph four\nline two<br>line three\nline four</p>paragraph five"
		},
		event_id: "$g07oYSZFWBkxohNEfywldwgcWj1hbhDzQ1sBAKvqOOU",
		origin_server_ts: 1688301929913,
		room_id: "!kLRqKKUQXcibIMtOpl:cadence.moe",
		sender: "@cadence:cadence.moe",
		type: "m.room.message",
		unsigned: {
			age: 405299
		}
	})

	t.deepEqual(messages, [{
		username: "cadence",
		content: "paragraph one\nline _two_\nline three\n\nparagraph two\nline _two_\nline three\n\nparagraph three\n\nparagraph four\nline two\nline three\nline four\n\nparagraph five",
		avatar_url: undefined
	}])
})
/*test("event2message: whitespace is retained", t => {
t.deepEqual(
eventToMessage({
content: {
msgtype: "m.text",
body: "wrong body",
format: "org.matrix.custom.html",
formatted_body: "line one: test test<br>line two: <strong>test</strong> <strong>test</strong><br>line three: <strong>test test</strong><br>line four: test<strong> </strong>test<br> line five"
},
event_id: "$g07oYSZFWBkxohNEfywldwgcWj1hbhDzQ1sBAKvqOOU",
origin_server_ts: 1688301929913,
room_id: "!kLRqKKUQXcibIMtOpl:cadence.moe",
sender: "@cadence:cadence.moe",
type: "m.room.message",
unsigned: {
age: 405299
}
}),
[{
username: "cadence",
content: "line one: test test\nline two: **test** **test**\nline three: **test test**\nline four: test test\n line five",
avatar_url: undefined
}]
)
})*/
// Only the first message's content is compared (via its JSON encoding); username and avatar_url are not checked here.
test("event2message: whitespace is collapsed", t => {
	const messages = eventToMessage({
		content: {
			msgtype: "m.text",
			body: "wrong body",
			format: "org.matrix.custom.html",
			formatted_body: "line one: test test<br>line two: <strong>test</strong> <strong>test</strong><br>line three: <strong>test test</strong><br>line four: test<strong> </strong>test<br> line five"
		},
		event_id: "$g07oYSZFWBkxohNEfywldwgcWj1hbhDzQ1sBAKvqOOU",
		origin_server_ts: 1688301929913,
		room_id: "!kLRqKKUQXcibIMtOpl:cadence.moe",
		sender: "@cadence:cadence.moe",
		type: "m.room.message",
		unsigned: {
			age: 405299
		}
	})

	sameFirstContentAndWhitespace(t, messages, [{
		username: "cadence",
		content: "line one: test test\nline two: **test** **test**\nline three: **test test**\nline four: test test\nline five",
		avatar_url: undefined
	}])
})
// Nested <ul> with newlines between the list tags: those newlines sit next to block elements
// and must not turn into extra line breaks in the markdown list.
// Only the first message's content is compared (via its JSON encoding).
test("event2message: lists are bridged correctly", t => {
	const messages = eventToMessage({
		type: "m.room.message",
		sender: "@cadence:cadence.moe",
		content: {
			msgtype: "m.text",
			body: "* line one\n* line two\n* line three\n * nested one\n * nested two\n* line four",
			format: "org.matrix.custom.html",
			formatted_body: "<ul>\n<li>line one</li>\n<li>line two</li>\n<li>line three\n<ul>\n<li>nested one</li>\n<li>nested two</li>\n</ul>\n</li>\n<li>line four</li>\n</ul>\n"
		},
		origin_server_ts: 1692967314062,
		unsigned: {
			age: 112,
			transaction_id: "m1692967313951.441"
		},
		event_id: "$l-xQPY5vNJo3SNxU9d8aOWNVD1glMslMyrp4M_JEF70",
		room_id: "!BpMdOUkWWhFxmTrENV:cadence.moe"
	})

	sameFirstContentAndWhitespace(t, messages, [{
		username: "cadence",
		content: "* line one\n* line two\n* line three\n * nested one\n * nested two\n* line four",
		avatar_url: undefined
	}])
})
test("event2message: long messages are split", t => {
t.deepEqual(
eventToMessage({
@ -55,3 +220,29 @@ test("event2message: long messages are split", t => {
}]
)
})
// For m.emote the "* <display name> " prefix is prepended before conversion,
// so its leading asterisk must be escaped along with the markdown characters in the message itself.
test("event2message: m.emote markdown syntax is escaped", t => {
	const messages = eventToMessage({
		content: {
			msgtype: "m.emote",
			body: "wrong body",
			format: "org.matrix.custom.html",
			formatted_body: "shows you **her** <strong><em>extreme</em></strong> \\*test\\* of"
		},
		event_id: "$g07oYSZFWBkxohNEfywldwgcWj1hbhDzQ1sBAKvqOOU",
		origin_server_ts: 1688301929913,
		room_id: "!kLRqKKUQXcibIMtOpl:cadence.moe",
		sender: "@cadence:cadence.moe",
		type: "m.room.message",
		unsigned: {
			age: 405299
		}
	})

	t.deepEqual(messages, [{
		username: "cadence",
		content: "\\* cadence shows you \\*\\*her\\*\\* **_extreme_** \\\\\\*test\\\\\\* of",
		avatar_url: undefined
	}])
})

37
package-lock.json generated
View file

@ -10,6 +10,7 @@
"license": "MIT",
"dependencies": {
"better-sqlite3": "^8.3.0",
"chunk-text": "^2.0.1",
"cloudstorm": "^0.8.0",
"discord-markdown": "git+https://git.sr.ht/~cadence/nodejs-discord-markdown#440130ef343c8183a81c7c09809731484aa3a182",
"heatsync": "^2.4.1",
@ -20,7 +21,8 @@
"node-fetch": "^2.6.7",
"prettier-bytes": "^1.0.4",
"snowtransfer": "^0.8.0",
"try-to-catch": "^3.0.1"
"try-to-catch": "^3.0.1",
"turndown": "^7.1.2"
},
"devDependencies": {
"@types/node": "^18.16.0",
@ -732,6 +734,18 @@
"resolved": "https://registry.npmjs.org/chownr/-/chownr-1.1.4.tgz",
"integrity": "sha512-jJ0bqzaylmJtVnNgzTeSOs8DPavpbYgEr/b0YL8/2GO3xJEhInFmhKMUnEJQjZumK7KXGFhUy89PrsJWlakBVg=="
},
"node_modules/chunk-text": {
"version": "2.0.1",
"resolved": "https://registry.npmjs.org/chunk-text/-/chunk-text-2.0.1.tgz",
"integrity": "sha512-ER6TSpe2DT4wjOVOKJ3FFAYv7wE77HA/Ztz88Peiv3lq/2oVMsItYJJsVVI0xNZM8cdImOOTNqlw+LQz7gYdJg==",
"dependencies": {
"runes": "^0.4.3"
},
"bin": {
"chunk": "bin/server.js",
"chunk-text": "bin/server.js"
}
},
"node_modules/ci-info": {
"version": "3.8.0",
"resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.8.0.tgz",
@ -1057,6 +1071,11 @@
"simple-markdown": "^0.7.2"
}
},
"node_modules/domino": {
"version": "2.1.6",
"resolved": "https://registry.npmjs.org/domino/-/domino-2.1.6.tgz",
"integrity": "sha512-3VdM/SXBZX2omc9JF9nOPCtDaYQ67BGp5CoLpIQlO2KCAPETs8TcDHacF26jXadGbvUteZzRTeos2fhID5+ucQ=="
},
"node_modules/ee-first": {
"version": "1.1.1",
"resolved": "https://registry.npmjs.org/ee-first/-/ee-first-1.1.1.tgz",
@ -2646,6 +2665,14 @@
"node": "*"
}
},
"node_modules/runes": {
"version": "0.4.3",
"resolved": "https://registry.npmjs.org/runes/-/runes-0.4.3.tgz",
"integrity": "sha512-K6p9y4ZyL9wPzA+PMDloNQPfoDGTiFYDvdlXznyGKgD10BJpcAosvATKrExRKOrNLgD8E7Um7WGW0lxsnOuNLg==",
"engines": {
"node": ">=4.0.0"
}
},
"node_modules/safe-buffer": {
"version": "5.2.1",
"resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.2.1.tgz",
@ -3216,6 +3243,14 @@
"node": "*"
}
},
"node_modules/turndown": {
"version": "7.1.2",
"resolved": "https://registry.npmjs.org/turndown/-/turndown-7.1.2.tgz",
"integrity": "sha512-ntI9R7fcUKjqBP6QU8rBK2Ehyt8LAzt3UBT9JR9tgo6GtuKvyUzpayWmeMKJw1DPdXzktvtIT8m2mVXz+bL/Qg==",
"dependencies": {
"domino": "^2.1.6"
}
},
"node_modules/type-is": {
"version": "1.6.18",
"resolved": "https://registry.npmjs.org/type-is/-/type-is-1.6.18.tgz",

View file

@ -16,6 +16,7 @@
"license": "MIT",
"dependencies": {
"better-sqlite3": "^8.3.0",
"chunk-text": "^2.0.1",
"cloudstorm": "^0.8.0",
"discord-markdown": "git+https://git.sr.ht/~cadence/nodejs-discord-markdown#440130ef343c8183a81c7c09809731484aa3a182",
"heatsync": "^2.4.1",
@ -26,7 +27,8 @@
"node-fetch": "^2.6.7",
"prettier-bytes": "^1.0.4",
"snowtransfer": "^0.8.0",
"try-to-catch": "^3.0.1"
"try-to-catch": "^3.0.1",
"turndown": "^7.1.2"
},
"devDependencies": {
"@types/node": "^18.16.0",

2
types.d.ts vendored
View file

@ -67,7 +67,7 @@ export namespace Event {
}
export type M_Room_Message = {
msgtype: "m.text"
msgtype: "m.text" | "m.emote"
body: string
format?: "org.matrix.custom.html"
formatted_body?: string