New unicode emoji processor

This commit is contained in:
Cadence Ember 2025-01-17 18:05:34 +13:00
parent 14574b4e2c
commit f42eb6495f
7 changed files with 4015 additions and 46 deletions

View file

@ -0,0 +1,77 @@
// @ts-check
const fs = require("fs")
const {join} = require("path")
// Every line of the emoji dataset, percent-encoded, so the lengths and indexOf positions used below are measured in encoded characters.
const s = fs.readFileSync(join(__dirname, "..", "src", "m2d", "converters", "emojis.txt"), "utf8").split("\n").map(x => encodeURIComponent(x))
// Percent-encoded UTF-8 bytes of U+FE0F (variation selector-16), the sequence whose position inside each emoji is charted.
const searchPattern = "%EF%B8%8F"
/**
 * Bucket values by a derived key. Keys keep first-seen order and each bucket keeps the input order.
 * (adapted from es.map.group-by.js in core-js)
 * @template K,V
 * @param {V[]} items values to bucket
 * @param {(item: V) => K} fn computes the bucket key for one value
 * @returns {Map<K, V[]>} key -> every value that produced that key
 */
function groupBy(items, fn) {
	/** @type {Map<K, V[]>} */
	const buckets = new Map()
	for (const item of items) {
		const key = fn(item)
		const bucket = buckets.get(key)
		if (bucket === undefined) {
			// First value seen for this key starts a new bucket.
			buckets.set(key, [item])
		} else {
			bucket.push(item)
		}
	}
	return buckets
}
/**
 * Render numbers as horizontal bars, scaled so that the largest value spans `width - 1` cells.
 * Each bar is a run of full blocks followed by one partial-block character, then padded with spaces up to `width`.
 * @param {number[]} items values to draw, one bar each (expected to be non-negative)
 * @param {number} width length of each returned string
 * @returns {string[]} one bar per item, in input order
 */
function xhistogram(items, width) {
	// Index 0 is a blank; later entries are progressively wider partial blocks.
	const chars = " ▏▎▍▌▋▊▉"
	const max = items.reduce((a, c) => c > a ? c : a, 0)
	return items.map(v => {
		// If every value is 0 there is nothing to scale against: v / max would be NaN and a NaN bar length throws.
		// Draw an empty bar in that case.
		const p = max === 0 ? 0 : v / max * (width-1)
		const whole = "█".repeat(Math.floor(p)) /* whole part */
		const partial = chars[Math.ceil((p % 1) * (chars.length-1))] /* decimal part */
		return (whole + partial).padEnd(width)
	})
}
/**
* @param {number[]} items
* @param {[number, number]} xrange
*/
function yhistogram(items, xrange, printHeader = false) {
const chars = "░▁_▂▃▄▅▆▇█"
const ones = "₀₁₂₃₄₅₆₇₈₉"
const tens = "0123456789"
const xy = []
let max = 0
/** value (x) -> frequency (y) */
const grouped = groupBy(items, x => x)
for (let i = xrange[0]; i <= xrange[1]; i++) {
if (printHeader) {
if (i === -1) process.stdout.write("-")
else if (i.toString().at(-1) === "0") process.stdout.write(tens[i/10])
else process.stdout.write(ones[i%10])
}
const y = grouped.get(i)?.length ?? 0
if (y > max) max = y
xy.push(y)
}
if (printHeader) console.log()
return xy.map(y => chars[Math.ceil(y / max * (chars.length-1))]).join("")
}
// Bucket the encoded emojis by encoded length and report the longest bucket first.
const byEncodedLength = groupBy(s, x => x.length)
const buckets = Array.from(byEncodedLength.entries()).sort(([a], [b]) => b - a)
// One horizontal bar per bucket showing how many emojis share that length.
const sizeBars = xhistogram(buckets.map(([, members]) => members.length), 10)
// Width of the position-chart column; fixed by the first (longest) row so later rows line up.
let chartColumnWidth = 0
buckets.forEach(([encodedLength, members], row) => {
	const isFirstRow = row === 0
	// Where the search pattern first appears in each emoji (-1 when absent). The ruler is only printed above the first row.
	const positions = members.map(x => x.indexOf(searchPattern))
	const positionChart = yhistogram(positions, [-1, encodedLength - searchPattern.length], isFirstRow)
	if (isFirstRow) chartColumnWidth = positionChart.length + 1
	// Alternate the padding character so the eye can follow a row across the gap.
	const filler = row % 2 === 0 ? "⸱" : " "
	console.log(`${positionChart.padEnd(chartColumnWidth, filler)}length ${encodedLength.toString().padEnd(3)} ${sizeBars[row]} ${members.length}`)
})

View file

@ -53,7 +53,7 @@ async function removeReaction(data, reactions) {
*/
async function removeEmojiReaction(data, reactions) {
const key = await emojiToKey.emojiToKey(data.emoji)
const discordPreferredEncoding = emoji.encodeEmoji(key, undefined)
const discordPreferredEncoding = await emoji.encodeEmoji(key, undefined)
db.prepare("DELETE FROM reaction WHERE message_id = ? AND encoded_emoji = ?").run(data.message_id, discordPreferredEncoding)
return converter.removeEmojiReaction(data, reactions, key)

View file

@ -20,7 +20,7 @@ async function addReaction(event) {
if (!messageID) return // Nothing can be done if the parent message was never bridged.
const key = event.content["m.relates_to"].key
const discordPreferredEncoding = emoji.encodeEmoji(key, event.content.shortcode)
const discordPreferredEncoding = await emoji.encodeEmoji(key, event.content.shortcode)
if (!discordPreferredEncoding) return
await discord.snow.channel.createReaction(channelID, messageID, discordPreferredEncoding) // acting as the discord bot itself

View file

@ -1,58 +1,98 @@
// @ts-check
const assert = require("assert").strict
const Ty = require("../../types")
const fsp = require("fs").promises
const {join} = require("path")
const emojisp = fsp.readFile(join(__dirname, "emojis.txt"), "utf8").then(content => content.split("\n"))
const passthrough = require("../../passthrough")
const {sync, select} = passthrough
const {select} = passthrough
/**
* @param {string} input
* @param {string | null | undefined} shortcode
* @returns {string?}
*/
function encodeEmoji(input, shortcode) {
let discordPreferredEncoding
if (input.startsWith("mxc://")) {
// Custom emoji
let row = select("emoji", ["emoji_id", "name"], {mxc_url: input}).get()
if (!row && shortcode) {
// Use the name to try to find a known emoji with the same name.
const name = shortcode.replace(/^:|:$/g, "")
row = select("emoji", ["emoji_id", "name"], {name: name}).get()
}
if (!row) {
// We don't have this emoji and there's no realistic way to just-in-time upload a new emoji somewhere.
// Sucks!
return null
}
// Cool, we got an exact or a candidate emoji.
discordPreferredEncoding = encodeURIComponent(`${row.name}:${row.emoji_id}`)
} else {
// Default emoji
// https://github.com/discord/discord-api-docs/issues/2723#issuecomment-807022205 ????????????
const encoded = encodeURIComponent(input)
const encodedTrimmed = encoded.replace(/%EF%B8%8F/g, "")
const forceTrimmedList = [
"%F0%9F%91%8D", // 👍
"%F0%9F%91%8E", // 👎️
"%E2%AD%90", // ⭐
"%F0%9F%90%88", // 🐈
"%E2%9D%93", // ❓
"%F0%9F%8F%86", // 🏆️
"%F0%9F%93%9A", // 📚️
"%F0%9F%90%9F", // 🐟️
]
discordPreferredEncoding =
( forceTrimmedList.includes(encodedTrimmed) ? encodedTrimmed
: encodedTrimmed !== encoded && [...input].length === 2 ? encoded
: encodedTrimmed)
console.log("add reaction from matrix:", input, encoded, encodedTrimmed, "chosen:", discordPreferredEncoding)
function encodeCustomEmoji(input, shortcode) {
	// Custom emoji
	// First choice: the exact emoji, recognised by its mxc URL.
	const exact = select("emoji", ["emoji_id", "name"], {mxc_url: input}).get()
	// Second choice (only when a shortcode was supplied): any known emoji whose name equals the shortcode without its surrounding colons.
	const candidate = (exact || !shortcode)
		? exact
		: select("emoji", ["emoji_id", "name"], {name: shortcode.replace(/^:|:$/g, "")}).get()
	// We don't have this emoji and there's no realistic way to just-in-time upload a new emoji somewhere. Sucks!
	if (!candidate) return null
	// Exact or candidate emoji found: encode it as name:id.
	return encodeURIComponent(`${candidate.name}:${candidate.emoji_id}`)
}
/**
 * Find the spelling of a Unicode emoji that appears in the emoji dataset, and URL-encode it.
 *
 * The input is tried as-is, then with every U+FE0E/U+FE0F variation selector removed, then with U+FE0F put back
 * in the places where known emoji shapes need one. The first spelling found in the dataset wins.
 * @param {string} input
 * @returns {Promise<string?>} URL encoded! null when no spelling of the input is in the dataset.
 */
async function encodeDefaultEmoji(input) {
	const vs16 = "\ufe0f"
	// Default emoji
	// Shortcut: If there are ASCII letters then it's not an emoji, it's a freeform Matrix text reaction.
	// (Regional indicator letters are not ASCII. ASCII digits might be part of an emoji.)
	if (/[A-Za-z]/.test(input)) return null
	const trimmed = input.replace(/\ufe0e|\ufe0f/g, "")
	// An empty string, or nothing but variation selectors, is never an emoji. Bail out before the dataset lookup so a blank line in the dataset cannot match.
	if (trimmed === "") return null

	// Check against the dataset
	const emojis = await emojisp
	// Candidate spellings, best first:
	// 1. an exact replica of a valid emoji;
	// 2. the input minus extraneous \ufe0f or \ufe0e (at the end or in the middle);
	// 3. the bare emoji plus exactly one trailing \ufe0f. This also covers inputs that arrived with \ufe0e or a doubled \ufe0f.
	const candidates = [input, trimmed, trimmed + vs16]

	// Adding or removing at the end may not be enough; some shapes need one in the middle.
	// These heuristics come from executing scripts/emoji-surrogates-statistics.js.
	// "->N" is the URL-encoded length of the valid spelling once the selectors are restored.
	const trimmedEncodedLength = encodeURIComponent(trimmed).length
	if (trimmedEncodedLength <= 21 && /^[*#0-9]/.test(trimmed)) { // ->19: keycap (digit, * or # followed by U+20E3)
		candidates.push(trimmed[0] + vs16 + trimmed.slice(1))
	} else if (trimmedEncodedLength === 27 && trimmed[0] === "\u26f9") { // ->45: person bouncing ball (U+26F9) with a gender sign
		candidates.push(trimmed[0] + vs16 + trimmed.slice(1) + vs16)
	} else if (trimmedEncodedLength === 30) {
		// ->39: a one-unit base, ZWJ, then an astral emoji (broken chain, mending heart, heart on fire)
		candidates.push(trimmed[0] + vs16 + trimmed.slice(1))
		// ->48: an astral (two UTF-16 unit) base, ZWJ, then a one-unit sign (transgender flag, golfer/detective/weightlifter gender variants)
		candidates.push(trimmed.slice(0, 2) + vs16 + trimmed.slice(2) + vs16)
	} else if (trimmedEncodedLength === 51 && trimmed[3] === "\u2764") { // ->60: couple with heart (person, ZWJ, U+2764, ZWJ, person)
		candidates.push(trimmed.slice(0, 3) + "\u2764" + vs16 + trimmed.slice(4))
	}
	// there are a few more longer ones but I got bored

	const valid = candidates.find(candidate => emojis.includes(candidate))
	return valid === undefined ? null : encodeURIComponent(valid)
}
/**
 * URL-encode a Matrix reaction key. Keys starting with mxc:// are resolved as custom emojis;
 * every other key is resolved as a default (Unicode) emoji.
 * @param {string} input
 * @param {string | null | undefined} shortcode only passed on for custom emojis
 * @returns {Promise<string?>} null when the key could not be resolved to an emoji
 */
async function encodeEmoji(input, shortcode) {
	// Both paths return directly, so nothing may follow them. The unreachable trailing return of an undeclared variable is gone.
	if (input.startsWith("mxc://")) return encodeCustomEmoji(input, shortcode)
	return encodeDefaultEmoji(input)
}
module.exports.encodeEmoji = encodeEmoji

View file

@ -0,0 +1,52 @@
// @ts-check
const {test} = require("supertape")
const {encodeEmoji} = require("./emoji")
// Each case feeds encodeEmoji a reaction key and compares against the percent-encoded result (or null for a rejected key).
// NOTE(review): several inputs differ only by invisible code points (U+FE0F, U+200D). Edit them with an editor that shows them.

// Already a valid emoji: encoded unchanged.
test("emoji: valid", async t => {
t.equal(await encodeEmoji("🦄", null), "%F0%9F%A6%84")
})
// Contains ASCII letters, so it is treated as freeform text rather than an emoji.
test("emoji: freeform text", async t => {
t.equal(await encodeEmoji("ha", null), null)
})
// No ASCII letters, but presumably not in the emoji dataset either, so it is rejected.
test("emoji: suspicious unicode", async t => {
t.equal(await encodeEmoji("Ⓐ", null), null)
})
// Expected value ends in %EF%B8%8F, i.e. a trailing U+FE0F was appended.
test("emoji: needs u+fe0f added", async t => {
t.equal(await encodeEmoji("☺", null), "%E2%98%BA%EF%B8%8F")
})
// Expected value has no %EF%B8%8F, i.e. the U+FE0F in the input was dropped.
test("emoji: needs u+fe0f removed", async t => {
t.equal(await encodeEmoji("⭐️", null), "%E2%AD%90")
})
// Keycaps: U+FE0F belongs between the key character and the combining keycap (%E2%83%A3).
test("emoji: number key needs u+fe0f in the middle", async t => {
t.equal(await encodeEmoji("3⃣", null), "3%EF%B8%8F%E2%83%A3")
})
test("emoji: hash key needs u+fe0f in the middle", async t => {
t.equal(await encodeEmoji("#⃣", null), "%23%EF%B8%8F%E2%83%A3")
})
// ZWJ (%E2%80%8D) sequences: U+FE0F is expected after the first emoji only.
test("emoji: broken chains needs u+fe0f in the middle", async t => {
t.equal(await encodeEmoji("⛓‍💥", null), "%E2%9B%93%EF%B8%8F%E2%80%8D%F0%9F%92%A5")
})
// ZWJ sequences: U+FE0F is expected after the first emoji and again at the end.
test("emoji: balling needs u+fe0f in the middle", async t => {
t.equal(await encodeEmoji("⛹‍♀", null), "%E2%9B%B9%EF%B8%8F%E2%80%8D%E2%99%80%EF%B8%8F")
})
test("emoji: trans flag needs u+fe0f in the middle", async t => {
t.equal(await encodeEmoji("🏳‍⚧", null), "%F0%9F%8F%B3%EF%B8%8F%E2%80%8D%E2%9A%A7%EF%B8%8F")
})
test("emoji: spy needs u+fe0f in the middle", async t => {
t.equal(await encodeEmoji("🕵‍♀", null), "%F0%9F%95%B5%EF%B8%8F%E2%80%8D%E2%99%80%EF%B8%8F")
})
// Couple with heart: U+FE0F is expected right after the heart (%E2%9D%A4) in the middle of the sequence.
test("emoji: couple needs u+fe0f in the middle", async t => {
t.equal(await encodeEmoji("👩‍❤‍👩", null), "%F0%9F%91%A9%E2%80%8D%E2%9D%A4%EF%B8%8F%E2%80%8D%F0%9F%91%A9")
})

File diff suppressed because it is too large Load diff

View file

@ -141,6 +141,7 @@ file._actuallyUploadDiscordFileToMxc = function(url, res) { throw new Error(`Not
require("../src/d2m/converters/user-to-mxid.test")
require("../src/m2d/converters/diff-pins.test")
require("../src/m2d/converters/event-to-message.test")
require("../src/m2d/converters/emoji.test")
require("../src/m2d/converters/utils.test")
require("../src/m2d/converters/emoji-sheet.test")
require("../src/discord/interactions/invite.test")