Parsing HTML with regular expressions (GONE WRONG)

This commit is contained in:
Cadence Ember 2024-08-20 03:15:33 +12:00
parent 2c93ff1e6d
commit 2a080b737c
2 changed files with 37 additions and 3 deletions

View file

@ -329,7 +329,7 @@ async function uploadEndOfMessageSpriteSheet(content, attachments, pendingFiles,
*/ */
async function handleRoomOrMessageLinks(input, di) { async function handleRoomOrMessageLinks(input, di) {
let offset = 0 let offset = 0
for (const match of [...input.matchAll(/("?https:\/\/matrix.to\/#\/(![^"/, ?)]+)(?:\/(\$[^"/ ?)]+))?(?:\?[^",:!? )]*)?)(">|[, )]|$)/g)]) { for (const match of [...input.matchAll(/("?https:\/\/matrix.to\/#\/(![^"/, ?)]+)(?:\/(\$[^"/ ?)]+))?(?:\?[^",:!? )]*?)?)(">|[,<\n )]|$)/g)]) {
assert(typeof match.index === "number") assert(typeof match.index === "number")
const [_, attributeValue, roomID, eventID, endMarker] = match const [_, attributeValue, roomID, eventID, endMarker] = match
let result let result
@ -726,7 +726,7 @@ async function eventToMessage(event, guild, di) {
content = turndownService.turndown(root) content = turndownService.turndown(root)
// Put < > around any surviving matrix.to links to hide the URL previews // Put < > around any surviving matrix.to links to hide the URL previews
content = content.replace(/\bhttps?:\/\/matrix\.to\/[^ )]*/g, "<$&>") content = content.replace(/\bhttps?:\/\/matrix\.to\/[^<>\n )]*/g, "<$&>")
// It's designed for commonmark, we need to replace the space-space-newline with just newline // It's designed for commonmark, we need to replace the space-space-newline with just newline
content = content.replace(/ \n/g, "\n") content = content.replace(/ \n/g, "\n")
@ -745,7 +745,7 @@ async function eventToMessage(event, guild, di) {
} }
content = await handleRoomOrMessageLinks(content, di) // Replace matrix.to links with discord.com equivalents where possible content = await handleRoomOrMessageLinks(content, di) // Replace matrix.to links with discord.com equivalents where possible
content = content.replace(/\bhttps?:\/\/matrix\.to\/[^ )]*/, "<$&>") // Put < > around any surviving matrix.to links to hide the URL previews content = content.replace(/\bhttps?:\/\/matrix\.to\/[^<>\n )]*/, "<$&>") // Put < > around any surviving matrix.to links to hide the URL previews
const result = await checkWrittenMentions(content, event.sender, event.room_id, guild, di) const result = await checkWrittenMentions(content, event.sender, event.room_id, guild, di)
if (result) { if (result) {

View file

@ -3025,6 +3025,40 @@ test("event2message: mentioning known bridged events works (formatted body)", as
) )
}) })
// Covers a formatted_body in which a bare matrix.to event link (not wrapped in an <a> tag)
// is followed directly by an <a href> user mention. The expected Discord content is a
// discord.com message link followed by a <@...> user mention.
test("event2message: mentioning known bridged events followed by line break and user mention works (partially formatted body)", async t => {
	// NOTE(review): the test title mentions a line break, but neither the formatted_body
	// nor the expected content below contains one as shown here - confirm against the repository.
	const matrixEvent = {
		content: {
			msgtype: "m.text",
			// The plain body is deliberately different so the result can only come from formatted_body.
			body: "wrong body",
			format: "org.matrix.custom.html",
			formatted_body: `https://matrix.to/#/!CzvdIdUQXgUjDVKxeU:cadence.moe/$zXSlyI78DQqQwwfPUSzZ1b-nXzbUrCDljJgnGDdoI10?via=cadence.moe<a href="https://matrix.to/#/@_ooye_extremity:cadence.moe">extremity</a>`
		},
		event_id: "$g07oYSZFWBkxohNEfywldwgcWj1hbhDzQ1sBAKvqOOU",
		origin_server_ts: 1688301929913,
		room_id: "!kLRqKKUQXcibIMtOpl:cadence.moe",
		sender: "@cadence:cadence.moe",
		type: "m.room.message",
		unsigned: {
			age: 405299
		}
	}

	// A single message to send; nothing to join, edit or delete.
	const expectedResult = {
		ensureJoined: [],
		messagesToDelete: [],
		messagesToEdit: [],
		messagesToSend: [{
			username: "cadence [they]",
			content: "https://discord.com/channels/497159726455455754/497161350934560778/1141619794500649020<@114147806469554185>",
			avatar_url: undefined,
			allowed_mentions: {
				parse: ["users", "roles"]
			}
		}]
	}

	const actualResult = await eventToMessage(matrixEvent)
	t.deepEqual(actualResult, expectedResult)
})
test("event2message: mentioning unknown bridged events can approximate with timestamps", async t => { test("event2message: mentioning unknown bridged events can approximate with timestamps", async t => {
let called = 0 let called = 0
t.deepEqual( t.deepEqual(