New unicode emoji processor
This commit is contained in:
		
							parent
							
								
									14574b4e2c
								
							
						
					
					
						commit
						f42eb6495f
					
				
					 7 changed files with 4015 additions and 46 deletions
				
			
		
							
								
								
									
										77
									
								
								scripts/emoji-surrogates-statistics.js
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										77
									
								
								scripts/emoji-surrogates-statistics.js
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,77 @@ | |||
| // @ts-check
 | ||||
| 
 | ||||
| const fs = require("fs") | ||||
| const {join} = require("path") | ||||
// Load the emoji dataset (one emoji per line) and percent-encode every entry,
// so that all lengths and offsets below are measured on the URL-encoded form.
const s = fs.readFileSync(join(__dirname, "..", "src", "m2d", "converters", "emojis.txt"), "utf8")
	.split("\n")
	.filter(line => line !== "") // a trailing newline would otherwise be counted as a zero-length "emoji"
	.map(x => encodeURIComponent(x))
// Percent-encoded U+FE0F (variation selector-16); the statistics report where it occurs inside each emoji.
const searchPattern = "%EF%B8%8F"
| 
 | ||||
/**
 * Group items into a Map keyed by the result of `fn`.
 * Keys appear in the order they are first produced; items keep their input order within each group.
 * Adapted from es.map.group-by.js in core-js.
 * @template K,V
 * @param {V[]} items values to group
 * @param {(item: V) => K} fn computes the grouping key of one item
 * @returns {Map<K, V[]>} every key mapped to the items that produced it
 */
function groupBy(items, fn) {
	/** @type {Map<K, V[]>} */
	const map = new Map()
	for (const value of items) {
		const key = fn(value)
		// Single lookup: a group, once created, is always an array and therefore truthy.
		const group = map.get(key)
		if (group) group.push(value)
		else map.set(key, [value])
	}
	return map
}
| 
 | ||||
/**
 * Render each value as a horizontal bar, scaled so that the largest value fills `width - 1` cells.
 * The fractional remainder of a bar is drawn with a partial-block character.
 * @param {number[]} items non-negative values, one bar per value
 * @param {number} width total width that every returned string is padded to
 * @returns {string[]} one bar per item, in the same order
 */
function xhistogram(items, width) {
	const chars = " ▏▎▍▌▋▊▉"
	const max = items.reduce((a, c) => c > a ? c : a, 0)
	return items.map(v => {
		// When every value is 0 there is nothing to scale against; draw empty bars instead of dividing by zero.
		const p = max > 0 ? Math.max(0, v / max * (width-1)) : 0
		return (
			"█".repeat(Math.floor(p)) /* whole part */
			+ chars[Math.ceil((p % 1) * (chars.length-1))] /* decimal part */
		).padEnd(width)
	})
}
| 
 | ||||
/**
 * Render the frequency of each integer in `xrange` as one row of vertical-bar characters,
 * scaled so that the most frequent value gets a full block.
 * @param {number[]} items observed values (x); values outside `xrange` are ignored
 * @param {[number, number]} xrange inclusive [first, last] x value to draw a column for
 * @param {boolean} [printHeader] when true, first prints a ruler line to stdout:
 *   "-" for negative x, a full-size tens digit at multiples of ten, a subscript ones digit otherwise
 * @returns {string} one character per x value in `xrange`
 */
function yhistogram(items, xrange, printHeader = false) {
	const chars = "░▁_▂▃▄▅▆▇█"
	const ones = "₀₁₂₃₄₅₆₇₈₉"
	const tens = "0123456789"
	const [first, last] = xrange

	/** value (x) -> frequency (y) */
	const frequency = new Map()
	for (const x of items) frequency.set(x, (frequency.get(x) ?? 0) + 1)

	const xy = []
	let header = ""
	let max = 0
	for (let i = first; i <= last; i++) {
		if (printHeader) {
			if (i < 0) header += "-"
			// Wrap the tens digit so that x >= 100 still yields a character instead of undefined.
			else if (i % 10 === 0) header += tens[Math.floor(i / 10) % 10]
			else header += ones[i % 10]
		}
		const y = frequency.get(i) ?? 0
		if (y > max) max = y
		xy.push(y)
	}
	if (printHeader) console.log(header)
	// With no hits in range (max === 0) draw the empty glyph so that the row keeps its width.
	return xy.map(y => chars[max > 0 ? Math.ceil(y / max * (chars.length-1)) : 0]).join("")
}
| 
 | ||||
// Bucket the encoded emojis by encoded length and print one row per bucket, longest first.
// Each row shows: where the search pattern occurs (-1 = absent), the length, a bar for the bucket size, and the size.
const byLength = groupBy(s, encoded => encoded.length)
const rows = Array.from(byLength).sort(([a], [b]) => b - a)
const sizeBars = xhistogram(rows.map(([, members]) => members.length), 10)
let columnWidth = 0
rows.forEach(([encodedLength, members], index) => {
	const isFirstRow = index === 0
	const positions = members.map(encoded => encoded.indexOf(searchPattern))
	// Only the first (longest) row prints the ruler, since it spans the widest range.
	const positionBars = yhistogram(positions, [-1, encodedLength - searchPattern.length], isFirstRow)
	if (isFirstRow) columnWidth = positionBars.length + 1
	// Alternate the padding character so that neighbouring rows are easy to follow across.
	const filler = index % 2 === 0 ? "⸱" : " "
	console.log(`${positionBars.padEnd(columnWidth, filler)}length ${encodedLength.toString().padEnd(3)} ${sizeBars[index]} ${members.length}`)
})
|  | @ -53,7 +53,7 @@ async function removeReaction(data, reactions) { | |||
|  */ | ||||
| async function removeEmojiReaction(data, reactions) { | ||||
| 	const key = await emojiToKey.emojiToKey(data.emoji) | ||||
| 	const discordPreferredEncoding = emoji.encodeEmoji(key, undefined) | ||||
| 	const discordPreferredEncoding = await emoji.encodeEmoji(key, undefined) | ||||
| 	db.prepare("DELETE FROM reaction WHERE message_id = ? AND encoded_emoji = ?").run(data.message_id, discordPreferredEncoding) | ||||
| 
 | ||||
| 	return converter.removeEmojiReaction(data, reactions, key) | ||||
|  |  | |||
|  | @ -20,7 +20,7 @@ async function addReaction(event) { | |||
| 	if (!messageID) return // Nothing can be done if the parent message was never bridged.
 | ||||
| 
 | ||||
| 	const key = event.content["m.relates_to"].key | ||||
| 	const discordPreferredEncoding = emoji.encodeEmoji(key, event.content.shortcode) | ||||
| 	const discordPreferredEncoding = await emoji.encodeEmoji(key, event.content.shortcode) | ||||
| 	if (!discordPreferredEncoding) return | ||||
| 
 | ||||
| 	await discord.snow.channel.createReaction(channelID, messageID, discordPreferredEncoding) // acting as the discord bot itself
 | ||||
|  |  | |||
|  | @ -1,58 +1,98 @@ | |||
| // @ts-check
 | ||||
| 
 | ||||
| const assert = require("assert").strict | ||||
| const Ty = require("../../types") | ||||
| const fsp = require("fs").promises | ||||
| const {join} = require("path") | ||||
| const emojisp = fsp.readFile(join(__dirname, "emojis.txt"), "utf8").then(content => content.split("\n")) | ||||
| 
 | ||||
| const passthrough = require("../../passthrough") | ||||
| const {sync, select} = passthrough | ||||
| const {select} = passthrough | ||||
| 
 | ||||
| 
 | ||||
| /** | ||||
|  * @param {string} input | ||||
|  * @param {string | null | undefined} shortcode | ||||
|  * @returns {string?} | ||||
|  */ | ||||
| function encodeEmoji(input, shortcode) { | ||||
| 	let discordPreferredEncoding | ||||
| 	if (input.startsWith("mxc://")) { | ||||
| 		// Custom emoji
 | ||||
| 		let row = select("emoji", ["emoji_id", "name"], {mxc_url: input}).get() | ||||
| 		if (!row && shortcode) { | ||||
| 			// Use the name to try to find a known emoji with the same name.
 | ||||
| 			const name = shortcode.replace(/^:|:$/g, "") | ||||
| 			row = select("emoji", ["emoji_id", "name"], {name: name}).get() | ||||
| 		} | ||||
| 		if (!row) { | ||||
| 			// We don't have this emoji and there's no realistic way to just-in-time upload a new emoji somewhere.
 | ||||
| 			// Sucks!
 | ||||
| 			return null | ||||
| 		} | ||||
| 		// Cool, we got an exact or a candidate emoji.
 | ||||
| 		discordPreferredEncoding = encodeURIComponent(`${row.name}:${row.emoji_id}`) | ||||
| 	} else { | ||||
| 		// Default emoji
 | ||||
| 		// https://github.com/discord/discord-api-docs/issues/2723#issuecomment-807022205 ????????????
 | ||||
| 		const encoded = encodeURIComponent(input) | ||||
| 		const encodedTrimmed = encoded.replace(/%EF%B8%8F/g, "") | ||||
| 
 | ||||
| 		const forceTrimmedList = [ | ||||
| 			"%F0%9F%91%8D", // 👍
 | ||||
| 			"%F0%9F%91%8E", // 👎️
 | ||||
| 			"%E2%AD%90", // ⭐
 | ||||
| 			"%F0%9F%90%88", // 🐈
 | ||||
| 			"%E2%9D%93", // ❓
 | ||||
| 			"%F0%9F%8F%86", // 🏆️
 | ||||
| 			"%F0%9F%93%9A", // 📚️
 | ||||
| 			"%F0%9F%90%9F", // 🐟️
 | ||||
| 		] | ||||
| 
 | ||||
| 		discordPreferredEncoding = | ||||
| 			( forceTrimmedList.includes(encodedTrimmed) ? encodedTrimmed | ||||
| 			: encodedTrimmed !== encoded && [...input].length === 2 ? encoded | ||||
| 			: encodedTrimmed) | ||||
| 
 | ||||
| 		console.log("add reaction from matrix:", input, encoded, encodedTrimmed, "chosen:", discordPreferredEncoding) | ||||
/**
 * Resolve a custom emoji (given by its mxc:// URL) to a URL-encoded `name:emoji_id` string.
 * The emoji is looked up by URL first; if that finds nothing and a shortcode was given,
 * it is looked up again by name (the shortcode without its surrounding colons).
 * @param {string} input mxc:// URL of the emoji
 * @param {string | null | undefined} shortcode optional shortcode such as ":name:"
 * @returns {string?} URL-encoded `name:emoji_id`, or null when no matching emoji row exists
 */
function encodeCustomEmoji(input, shortcode) {
	/** @param {object} where */
	const lookup = where => select("emoji", ["emoji_id", "name"], where).get()

	let match = lookup({mxc_url: input})
	if (!match) {
		if (shortcode) {
			// No exact match: fall back to any known emoji that has the same name.
			match = lookup({name: shortcode.replace(/^:|:$/g, "")})
		}
	}
	// We don't have this emoji and there's no realistic way to just-in-time upload a new emoji somewhere.
	if (!match) return null

	return encodeURIComponent(`${match.name}:${match.emoji_id}`)
}
| 
 | ||||
/**
 * Resolve a default (Unicode) emoji to the URL-encoded form of the matching dataset entry.
 * The input is matched exactly first, then with its variation selectors removed, then with
 * U+FE0F appended, and finally with U+FE0F inserted at the positions used by a few known emoji shapes.
 * @param {string} input reaction key as received; may be free-form text or malformed
 * @returns {Promise<string?>} URL encoded! null when no dataset entry could be matched
 */
async function encodeDefaultEmoji(input) {
	// Shortcut: If there are ASCII letters then it's not an emoji, it's a freeform Matrix text reaction.
	// (Regional indicator letters are not ASCII. ASCII digits might be part of an emoji.)
	if (/[A-Za-z]/.test(input)) return null

	const emojis = await emojisp
	/**
	 * Only dataset members are ever encoded here, so this cannot throw on malformed input.
	 * @param {string} candidate
	 * @returns {string?}
	 */
	const encodeIfKnown = candidate => emojis.includes(candidate) ? encodeURIComponent(candidate) : null

	const trimmed = input.replace(/\ufe0e|\ufe0f/g, "")
	// Nothing but variation selectors (or nothing at all). A blank line in the dataset must not make this "valid".
	if (trimmed === "") return null

	// Best case scenario: they reacted with an exact replica of a valid emoji.
	const exact = encodeIfKnown(input)
	if (exact !== null) return exact

	// Maybe it has some extraneous \ufe0f or \ufe0e (at the end or in the middle), and it'll be valid if they're removed.
	if (trimmed !== input) {
		const stripped = encodeIfKnown(trimmed)
		if (stripped !== null) return stripped
	}

	// Maybe it actually needs a \ufe0f at the end. This is tried on the trimmed form so that it also
	// covers input that carried the wrong selector (e.g. \ufe0e) or had one in the wrong place.
	const appended = encodeIfKnown(trimmed + "\ufe0f")
	if (appended !== null) return appended

	// The heuristics below look at the encoded length. Untrusted input may contain lone surrogates,
	// which encodeURIComponent rejects with URIError; such input cannot be an emoji.
	let trimmedEncoded
	try {
		trimmedEncoded = encodeURIComponent(trimmed)
	} catch (e) {
		if (e instanceof URIError) return null
		throw e
	}

	// Hmm, so adding or removing that from the end didn't help, but maybe there needs to be one in the middle?
	// These heuristics come from executing scripts/emoji-surrogates-statistics.js.
	if (trimmedEncoded.length <= 21 && /^[*#0-9]/.test(trimmed)) {
		// ->19: keycaps: [0-9*#] + U+FE0F + U+20E3
		return encodeIfKnown(trimmed[0] + "\ufe0f" + trimmed.slice(1))
	} else if (trimmedEncoded.length === 27 && trimmed[0] === "\u26f9") {
		// ->45: person bouncing ball (U+26F9) + U+FE0F + ZWJ + gender sign + U+FE0F
		return encodeIfKnown(trimmed[0] + "\ufe0f" + trimmed.slice(1) + "\ufe0f")
	} else if (trimmedEncoded.length === 30) {
		// ->39: BMP base + U+FE0F + ZWJ + astral emoji (broken chain, mending heart, heart on fire)
		// or ->48: astral base + U+FE0F + ZWJ + BMP sign + U+FE0F (trans flag, golfer/detective/weight lifter with gender)
		return encodeIfKnown(trimmed[0] + "\ufe0f" + trimmed.slice(1))
			?? encodeIfKnown(trimmed.slice(0, 2) + "\ufe0f" + trimmed.slice(2) + "\ufe0f")
	} else if (trimmedEncoded.length === 51 && trimmed[3] === "\u2764") {
		// ->60: couple with heart: person + ZWJ + heart (U+2764) + U+FE0F + ZWJ + person
		return encodeIfKnown(trimmed.slice(0, 3) + "\u2764\ufe0f" + trimmed.slice(4))
	}

	// there are a few more longer ones but I got bored
	return null
}
| 
 | ||||
/**
 * Encode a reaction key: mxc:// URLs are handled as custom emojis,
 * everything else as a default (Unicode) emoji.
 * @param {string} input reaction key: either an mxc:// URL or a Unicode string
 * @param {string | null | undefined} shortcode optional shortcode, only used for custom emojis
 * @returns {Promise<string?>} the encoded emoji, or null when it cannot be resolved
 */
async function encodeEmoji(input, shortcode) {
	if (input.startsWith("mxc://")) {
		return encodeCustomEmoji(input, shortcode)
	}
	return encodeDefaultEmoji(input)
}
| 
 | ||||
| module.exports.encodeEmoji = encodeEmoji | ||||
|  |  | |||
							
								
								
									
										52
									
								
								src/m2d/converters/emoji.test.js
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										52
									
								
								src/m2d/converters/emoji.test.js
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,52 @@ | |||
| // @ts-check
 | ||||
| 
 | ||||
const {test} = require("supertape")
const {encodeEmoji} = require("./emoji")

// Inputs are written with explicit escapes because they depend on invisible characters
// (U+200D zero-width joiner, U+FE0F variation selector-16) that are easily lost in editors and diffs.
/** @type {[name: string, input: string, expected: string | null][]} */
const cases = [
	["emoji: valid", "\u{1f984}", "%F0%9F%A6%84"],
	["emoji: freeform text", "ha", null],
	["emoji: suspicious unicode", "\u24b6", null],
	["emoji: needs u+fe0f added", "\u263a", "%E2%98%BA%EF%B8%8F"],
	["emoji: needs u+fe0f removed", "\u2b50\ufe0f", "%E2%AD%90"],
	["emoji: number key needs u+fe0f in the middle", "3\u20e3", "3%EF%B8%8F%E2%83%A3"],
	["emoji: hash key needs u+fe0f in the middle", "#\u20e3", "%23%EF%B8%8F%E2%83%A3"],
	["emoji: broken chains needs u+fe0f in the middle", "\u26d3\u200d\u{1f4a5}", "%E2%9B%93%EF%B8%8F%E2%80%8D%F0%9F%92%A5"],
	["emoji: balling needs u+fe0f in the middle", "\u26f9\u200d\u2640", "%E2%9B%B9%EF%B8%8F%E2%80%8D%E2%99%80%EF%B8%8F"],
	["emoji: trans flag needs u+fe0f in the middle", "\u{1f3f3}\u200d\u26a7", "%F0%9F%8F%B3%EF%B8%8F%E2%80%8D%E2%9A%A7%EF%B8%8F"],
	["emoji: spy needs u+fe0f in the middle", "\u{1f575}\u200d\u2640", "%F0%9F%95%B5%EF%B8%8F%E2%80%8D%E2%99%80%EF%B8%8F"],
	["emoji: couple needs u+fe0f in the middle", "\u{1f469}\u200d\u2764\u200d\u{1f469}", "%F0%9F%91%A9%E2%80%8D%E2%9D%A4%EF%B8%8F%E2%80%8D%F0%9F%91%A9"],
]

for (const [name, input, expected] of cases) {
	test(name, async t => {
		t.equal(await encodeEmoji(input, null), expected)
	})
}
							
								
								
									
										3799
									
								
								src/m2d/converters/emojis.txt
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										3799
									
								
								src/m2d/converters/emojis.txt
									
										
									
									
									
										Normal file
									
								
							
										
											
												File diff suppressed because it is too large
												Load diff
											
										
									
								
							|  | @ -141,6 +141,7 @@ file._actuallyUploadDiscordFileToMxc = function(url, res) { throw new Error(`Not | |||
| 	require("../src/d2m/converters/user-to-mxid.test") | ||||
| 	require("../src/m2d/converters/diff-pins.test") | ||||
| 	require("../src/m2d/converters/event-to-message.test") | ||||
| 	require("../src/m2d/converters/emoji.test") | ||||
| 	require("../src/m2d/converters/utils.test") | ||||
| 	require("../src/m2d/converters/emoji-sheet.test") | ||||
| 	require("../src/discord/interactions/invite.test") | ||||
|  |  | |||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue