Compare commits

..

4 commits

60 changed files with 5399 additions and 1421 deletions

3
.gitignore vendored
View file

@ -1,4 +1,5 @@
node_modules/
your.db
*.db
your.csv
.gitSAFE
*.DELETE-THIS-HAS-PII

25
README.md Normal file
View file

@ -0,0 +1,25 @@
# base-data-manager
A TypeScript project for parsing many types of data exports into tabular formats
**This is heavily WIP, and mostly just a toy for myself**
### Installation
* Install `jq`
* Install sqlite `csv.so` extension (Hardcoded to `/home/cobertos/sqlite-files/` currently)
* Install `node` + `pnpm i`
* See `main.ts` for current example usage
### Proposed Architecture
The architecture runs in 2 steps.
The first step is unopinionated in its output format. It's meant to take the source data exactly as-is and output it as csv. All source data should pass through, but will be normalized in csv
**TODO: It's not completely unopinionated, there is some normalization for names of columns I think we want to apply? Or maybe we apply that later...**
An optional second step combines everything into a single SQLite database. From here we normalize many different types of data across multiple exports into a single opinionated output. For example, message threads/channels should all have the same table format, or end up in the same table
**TODO: No idea if the second part should be a part of this project... but it currently is**

File diff suppressed because it is too large Load diff

View file

@ -1,15 +1,16 @@
import { TaskTargetPipelineHelper } from "./task.ts";
import { pipe, branch, cmd, assignMeta, cd, glob, read, branchGen, type PipelineOp } from "./task.ts";
import { htmlSelectorChunkedDuplex } from "./html.ts";
export function google(this: TaskTargetPipelineHelper){
const p = this.assignMeta({ idValue: t=>`Google - ${t.basename}` }); // Generic ID for everything in here
const col: Set<TaskTargetPipelineHelper> = new Set();
export function google(){
return pipe(
// Generic ID for everything in here
assignMeta({ idValue: t=>`Google - ${t.basename}` }),
branchGen(function*() {
// TODO: There is a root takeout folder
p.collect(col).cd('Access Log Activity/Activities - A list of Google services accessed by.csv').read()
p.collect(col).cd('Devices - A list of devices (i.e. Nest, Pixel, iPh.csv').read()
yield pipe(cd('Access Log Activity/Activities - A list of Google services accessed by.csv'), read())
yield pipe(cd('Devices - A list of devices (i.e. Nest, Pixel, iPh.csv'), read())
// Assignments - data was empty
// Business messages - GMB messages, there's some but so far outside of what I want
@ -21,46 +22,55 @@ export function google(this: TaskTargetPipelineHelper){
// TODO: Device Information.json
// TODO: Dictionary.csv
// TODO: ...
p.collect(col).cd('Chrome/History.json')
.read()
yield pipe(
cd('Chrome/History.json'),
read(),
// TODO: Typed Url", no data
// TODO: "session", complex data
// Omitted .ptoken and .client_id for now. I think ptoken is maybe for the history API? client_id is base64 something...
// TODO: time_usec IS WRONG!! Needs to be ms
.cmd(["jq", "-r", `["favicon_url","page_transition","title","url","time_usec"],
cmd(["jq", "-r", `["favicon_url","page_transition","title","url","time_usec"],
(
."Browser History"[]
| [.favicon_url, .page_transition, .title, .url, (.time_usec | todateiso8601)]
)
| @csv`])
| @csv
`])
);
// TODO: Contactss, exports an .vcf
// TODO: ...
// a = t.fork().cd(`Google Pay`)
p.collect(col).cd(`Google Pay/Google transactions`).glob(`transactions_*.csv`)
.read()
yield pipe(
cd(`Google Pay/Google transactions`),
glob(`transactions_*.csv`),
read(),
// .fork("a").cd(`Money sends and requests`)
// .fork().cd(`Money sends and requests.csv`)
// .read()
// .cmd(t=>["sqlite-utils", "insert", "your.db", t.basename, "-", "--csv", "--detect-types"])
// TODO: One more folder, and it only has a pdf
);
// TODO: Google Play Movies _ TV - no data
// TODO: ...
p.collect(col).cd("Location History/Location History.json")
.read()
yield pipe(
cd("Location History/Location History.json"),
read(),
// TODO: This is missing
// "altitude" : 158,
// "verticalAccuracy" : 68
// and the activity models. I had no idea google tries to determine if I'm "tilting"
.cmd(["jq", "-r", `["timestamp","latitudeE7","longitudeE7","accuracy"],
cmd(["jq", "-r", `["timestamp","latitudeE7","longitudeE7","accuracy"],
(
.locations[]
| [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy]
)
| @csv`])
| @csv
`])
);
// There's also the semantic history but that's an entire nother can of worms
// it seems like
@ -99,7 +109,7 @@ export function google(this: TaskTargetPipelineHelper){
// TODO: News
// TODO: Profile
// TODO: Tasks - No data
return Array.from(col);
})
);
};

52
data-export/io.ts Normal file
View file

@ -0,0 +1,52 @@
import fs from 'node:fs/promises';
import fsSync from 'node:fs';
import { DatabaseSync } from "node:sqlite";
import { type ProcessOutputAggregate, type RunOutput, TaskTarget, runAll, type ProcessOutputSimple } from "./task.ts";
import { ProcessOutput } from 'zx';
/**
 * Load a single pipeline result (CSV text on stdout) into `db` as a new
 * table named after the target's id.
 *
 * The csv.so virtual table reads from a filename, so the CSV is written to a
 * scratch file in /tmp first, copied into a real table, and the scratch
 * virtual table is dropped. The scratch file is removed afterwards.
 *
 * @param db Must be a DatabaseSync with the csv.so extension enabled
 * @param target Supplies the id used as the table name — assumed to already be
 *   SQL-identifier-safe (TODO confirm `id` is sanitized upstream, e.g. by safe())
 * @param result Pipeline output whose stdout holds the CSV body (header row included)
 */
async function loadCSVTable(
    db: DatabaseSync,
    target: TaskTarget,
    result: ProcessOutput | ProcessOutputAggregate | ProcessOutputSimple
) {
    const table = target.id;
    const tmpPath = `/tmp/${table}.csv`;
    // One call instead of the open()/writeFile(fd)/close() dance
    await fs.writeFile(tmpPath, result.stdout, { encoding: 'utf8' });
    try {
        // Escape single quotes so a quote in the path can't break the statement
        const tmpPathSql = tmpPath.replace(/'/g, "''");
        db.exec(`CREATE VIRTUAL TABLE temp.intermediate USING csv(filename='${tmpPathSql}', header);`);
        db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`);
        db.exec(`DROP TABLE IF EXISTS intermediate;`);
    } finally {
        // Don't leave scratch CSVs behind in /tmp, even if the SQL failed
        await fs.unlink(tmpPath).catch(() => {});
    }
}
// Public alias for task.ts's runAll so callers can depend on io.ts alone.
// TODO: This should really have the same name throughout the codebase?
export const runPipeline = runAll;
/**
 * Load every pipeline result into `db`, creating one table per target.
 * Tables are created one at a time, in the order the results were given.
 * @param db Must be a DatabaseSync with the csv.so extension enabled
 * @param runOutput Results as produced by runPipeline/runAll
 */
export async function loadIntoDb(db: DatabaseSync, runOutput: RunOutput[]) {
    // Deliberately sequential: each load creates and drops the same scratch
    // virtual table, so overlapping loads would collide with each other.
    for (const entry of runOutput) {
        await loadCSVTable(db, entry.target, entry.result);
    }
}
/**
 * Open an in-memory SQLite database with the csv virtual-table extension
 * loaded, then disable further extension loading.
 *
 * @param csvExtensionPath Filesystem path to the compiled csv.so extension.
 *   Defaults to the previously hard-coded location so existing callers are
 *   unaffected (the README notes installation is currently machine-specific).
 * @returns A DatabaseSync ready for loadCSVTable/loadIntoDb
 */
export function getDefaultDB(csvExtensionPath: string = "/home/cobertos/sqlite-files/csv.so"): DatabaseSync {
    const db = new DatabaseSync(":memory:", { allowExtension: true });
    db.loadExtension(csvExtensionPath);
    // Re-disable so later SQL can't load arbitrary extensions
    db.enableLoadExtension(false);
    return db;
}
/**
 * Persist the (typically in-memory) database to `dumpPath` with VACUUM INTO,
 * which writes a compact copy of the whole database.
 *
 * @param db An open DatabaseSync
 * @param dumpPath Destination file; unlinked first because VACUUM INTO
 *   refuses to overwrite an existing file
 */
export async function dumpDBToDisk(db: DatabaseSync, dumpPath: string) {
    if (fsSync.existsSync(dumpPath)) {
        await fs.unlink(dumpPath); // unlink the old
    }
    // Escape single quotes so a path containing ' can't break the statement
    db.exec(`VACUUM main INTO '${dumpPath.replace(/'/g, "''")}'`);
}

View file

@ -1,15 +1,18 @@
import { $, type ProcessOutput } from 'zx';
import os from 'os';
import { type TaskTarget, run } from "./task.ts";
$.verbose = false;
export async function parallel(
targets: TaskTarget[],
/**Generic parallel runner with optional logging
* Runs `targets` with `runFn` up to a maximum of `maxConcurrency` amount at a time
* Shaped in a way that expects generally something that returns zx.ProcessOutput (or
* something with .duration and .ok built-in to the return)
* @param runFn Should NOT throw. Return { ok: false } instead
*/
export async function parallel<T, R extends { duration: number, ok: boolean }>(
targets: T[],
runFn: (t: T)=>Promise<R>,
quiet: boolean = false,
maxConcurrency: number = os.cpus().length
): Promise<ProcessOutput[]> {
const resultMap = new Map<string, ProcessOutput>();
): Promise<R[]> {
const resultMap = new Map<T, R>();
const total = targets.length;
let completed = 0;
@ -40,14 +43,14 @@ export async function parallel(
process.stderr.write(`\r${formatEta()}`.padEnd(80));
}
async function runJob(t: TaskTarget): Promise<void> {
async function runJob(t: T): Promise<void> {
running++;
printStatus();
const result = await run(t);
const result = await runFn(t);
completionTimes.push(result.duration);
resultMap.set(t.id, result);
resultMap.set(t, result);
running--;
completed++;
@ -76,13 +79,15 @@ export async function parallel(
process.stderr.write('\n');
const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1);
const failed = Array.from(resultMap.values().filter(p => !p.ok));
if (!quiet) {
process.stderr.write(
`\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n`
);
}
const output = targets
.map(t => {
const r = resultMap.get(t.id)!;
const r = resultMap.get(t)!;
return r;
});

View file

@ -3,7 +3,10 @@ import fs from 'node:fs';
import { strict as assert } from "node:assert";
import { ZipFS } from "./zipFs.ts";
import { globSync } from "glob";
import { $, ProcessPromise, quote } from "zx";
import { $, ProcessOutput, quote } from "zx";
import { parallel } from "./parallel.ts";
$.verbose = false;
type FSImpl = {
isZip?: boolean;
@ -38,19 +41,20 @@ function safe(s: string) {
interface TaskTargetOp {
type: "read" | "mid";
toShell(target: TaskTarget): string;
toShell(target: TaskTarget): string | undefined;
clone(): TaskTargetOp;
}
class TaskTargetRead implements TaskTargetOp {
get type(){ return "read" as const; }
toShell(target: TaskTarget) {
if (target.fsImpl.isZip) {
// Read the file to stdout from the target inside the zip file
// This relies on the internals of fsImpl a bit to have the path to
// the root zip so we can create a command against it
assert(target.fsImpl.zipPath, "Should have a zipPath");
// We need to be able to do this
return `7z x ${quote(target.fsImpl.zipPath)} -so ${quote(target.path)}`;
}
// TODO : Implement when reading from a zip file
return `cat ${quote(target.path)}`;
}
clone() {
@ -115,19 +119,10 @@ export const COLUMN_TYPES = {
"TODO": {}
};
// // if (type === "numeric") {
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
// // }
// // else {
// // queryLine = `count(*) as n`;
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
// // }
/**Column metadata. Just a string into the TYPES*/
type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined);
// Make non-optional version of just the metadata values of TaskTarget
type TaskTargetMeta = Required<Pick<TaskTarget, "idValue" | "perRowDescription" | "perRowTags" | "columnMeta">>;
type TaskTargetMeta = Required<Pick<TaskTarget, "idValue" | "perRowDescription" | "perRowTags" | "columnMeta" | "aggregate" | "metaIdValue" | "aggregateColumns">>;
export class TaskTarget {
/**The current path pointed to by this TaskTarget*/
@ -149,15 +144,16 @@ export class TaskTarget {
* you might do something like '"{3}" sent from {2} to {1}'
* */
perRowDescription?: string;
/**For every output CSV, this defines a SQL expression evaluated per-row that
* returns a comma-separated string of tags to assign to that row.
* Use the items {0}, {1} to template column values, same as perRowDescription.
* Example: A static set of tags: "'me,facebook'"
* Example: Tags derived from a column: "'facebook,' || {2}"
* */
/**A CSV of tags that is added to every row of the table (TODO: no template functionality currently)*/
perRowTags?: string;
/**Metadata about the columns*/
columnMeta?: ColumnMeta[];
/**Whether or not to aggregate to a single task (everything with the id value idValue)*/
aggregate?: boolean;
/**Names of the columns to aggregate with*/
aggregateColumns?: string[];
/**A metadata TaskTarget for this TaskTarget, if one exists*/
metaIdValue?: ValidId;
constructor(path: string){
this.path = path;
@ -194,6 +190,15 @@ export class TaskTarget {
}
return safe(this.idValue);
}
get metaId() {
if (!this.metaIdValue) {
return undefined;
}
if (typeof this.metaIdValue === "function") {
return safe(this.metaIdValue(this));
}
return safe(this.metaIdValue);
}
/**Changes the current directory of the target*/
cd(path: string): TaskTarget {
@ -233,6 +238,9 @@ export class TaskTarget {
t.perRowDescription = this.perRowDescription;
t.perRowTags = this.perRowTags;
t.columnMeta = this.columnMeta?.slice();
t.metaIdValue = this.metaIdValue;
t.aggregate = this.aggregate;
t.aggregateColumns = this.aggregateColumns?.slice();
return t;
}
@ -247,6 +255,7 @@ export class TaskTarget {
toShell() {
const shell = this.pipeline
.map(p => p.toShell(this))
.filter(p => !!p) // remove empty strings and undefined
.join(" | ")
return shell;
}
@ -269,42 +278,72 @@ export class TaskTarget {
}
}
export function each(targets: TaskTarget[], fn: (t: TaskTarget)=>void) {
for (const t of targets) {
fn(t);
export interface PipelineOp {
(targets: TaskTarget[]): TaskTarget[] | Promise<TaskTarget[]>;
}
export function cd(path: string): PipelineOp {
return (targets: TaskTarget[]) => targets.map(t => t.clone().cd(path));
}
export function map(targets: TaskTarget[], fn: (t: TaskTarget)=>TaskTarget) {
const newTargets = [];
for (const t of targets) {
newTargets.push(fn(t));
export function glob(globPath: string): PipelineOp {
return (targets: TaskTarget[]) => targets.map(t => t.glob(globPath)).flat();
}
return newTargets;
export function unzip(): PipelineOp {
return async (targets: TaskTarget[]) => Promise.all(targets.map(t => t.unzip()));
}
export function cd(targets: TaskTarget[], path: string): TaskTarget[] {
return targets.map(t => t.clone().cd(path));
export function read(): PipelineOp {
return (targets: TaskTarget[]) => targets.map(t => t.clone().read())
}
export function glob(targets: TaskTarget[], globPath: string): TaskTarget[] {
return targets.map(t => t.glob(globPath)).flat();
export function cmd(cmd: ValidCmd): PipelineOp {
return (targets: TaskTarget[]) => targets.map(t => t.clone().cmd(cmd))
}
export async function unzip(targets: TaskTarget[]): Promise<TaskTarget[]> {
return Promise.all(targets.map(t => t.unzip()));
export function assignMeta(meta: Partial<TaskTargetMeta>): PipelineOp {
return (targets: TaskTarget[]) => targets.map(t => t.clone().assignMeta(meta))
}
export function read(targets: TaskTarget[]): TaskTarget[] {
return targets.map(t => t.clone().read())
export function each(fn: (t: TaskTarget)=>TaskTarget): PipelineOp {
return (targets: TaskTarget[])=> targets.map(fn);
}
export function cmd(targets: TaskTarget[], cmd: ValidCmd): TaskTarget[] {
return targets.map(t => t.clone().cmd(cmd))
export function pipe(...ops: PipelineOp[]): PipelineOp {
return async (targets: TaskTarget[]) => {
for (const op of ops) {
targets = await op(targets);
}
export function assignMeta(targets: TaskTarget[], meta: Partial<TaskTargetMeta>): TaskTarget[] {
return targets.map(t => t.clone().assignMeta(meta))
return targets;
};
}
export function branch(...ops: PipelineOp[]): PipelineOp {
return async (targets: TaskTarget[]) => {
const targetsArrays = await Promise.all(ops.map(op => op(targets)));
return targetsArrays.flat();
};
}
export function branchGen(genFn: ()=>Generator<PipelineOp>): PipelineOp {
const opsToBranch = Array.from(genFn());
return (targets: TaskTarget[]) => {
return branch(...opsToBranch)(targets);
};
}
export async function execPaths(entries: ({path: string, op: PipelineOp })[]) {
return (await Promise.all(
// Map every entry path into a TaskTarget and run the PipelineOp with
// that TaskTarget
entries
.map(async ({path,op})=>{
const targets = [new TaskTarget(path)];
return await op(targets);
})
)).flat();
}
/**Verify, anything that fails is skipped and throws an error*/
export async function verify(targets: TaskTarget[]) {
const outTargets: TaskTarget[] = [];
for (const t of targets) {
// Make sure fsImpl is ready
// TODO: DO NOT PUT THIS IN VERIFY, this should go somewhere in the task building stuff...
if ("ready" in t.fsImpl && !t.fsImpl.ready && t.fsImpl.init) {
await t.fsImpl.init();
}
@ -319,78 +358,133 @@ export async function verify(targets: TaskTarget[]) {
outTargets.push(t);
}
return outTargets;
}
function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) {
if (!a.__collection) {
return;
/**
 * Result of several process outputs merged into one logical output
 * (see combineProcessOutputAggregate): stdout/stderr concatenated,
 * durations summed, ok AND-ed, and one exit code recorded per process.
 */
export interface ProcessOutputAggregate {
    stdout: string;
    stderr: string;
    exitCodes: (number | null)[];
    duration: number;
    ok: boolean;
}
/**
 * Minimal zx.ProcessOutput-shaped result for synthetic outputs that never
 * ran a process (e.g. the generated metadata table in runAll).
 */
export interface ProcessOutputSimple {
    stdout: string;
    stderr: string;
    exitCode: number;
    duration: number;
    ok: boolean;
}
// Remove a, add b
const collection = a.__collection;
delete a.__collection;
collection.delete(a);
b.__collection = collection;
collection.add(b);
function combineProcessOutputAggregate(poa: ProcessOutputAggregate | undefined, t: TaskTarget, po: ProcessOutput) {
if (!poa) {
assert(t.aggregateColumns, "aggregate TaskTarget must have aggregateColumns");
const headers = t.aggregateColumns.join(",") + "\n";
return {
stdout: headers + po.stdout,
stderr: po.stderr,
exitCodes: [po.exitCode],
duration: po.duration,
ok: po.ok
};
}
export class TaskTargetPipelineHelper extends Array<TaskTarget> {
__collection?: Set<TaskTargetPipelineHelper>;
static pipeline(t: TaskTarget[]): TaskTargetPipelineHelper {
if (Object.getPrototypeOf(t) === TaskTargetPipelineHelper.prototype) {
return t as any; // Already done
}
Object.setPrototypeOf(t, TaskTargetPipelineHelper.prototype);
return t as any;
// Comes with a builtin "\n" from jq on stdout and stderr, no need to add
// a trailing one
poa.stdout += po.stdout;
poa.stderr += po.stderr;
poa.exitCodes.push(po.exitCode);
poa.duration += po.duration;
poa.ok &&= po.ok;
return poa;
}
_fn(fn: (t: TaskTarget[])=>TaskTarget[]): TaskTargetPipelineHelper {
const p = TaskTargetPipelineHelper.pipeline(this);
const t = fn(p);
const p2 = TaskTargetPipelineHelper.pipeline(t);
collectionSwap(p, p2); // Move collection pointer to the new item, ends always end up in the collection
return p2;
}
async _afn(fn: (t: TaskTarget[])=>Promise<TaskTarget[]>): Promise<TaskTargetPipelineHelper> {
const p = TaskTargetPipelineHelper.pipeline(this);
const t = await fn(p);
const p2 = TaskTargetPipelineHelper.pipeline(t);
collectionSwap(p, p2); // Move collection pointer to the new item, ends always end up in the collection
return p2;
export interface RunOutput {
target: TaskTarget,
result: ProcessOutput | ProcessOutputAggregate | ProcessOutputSimple
}
cd(path: string): TaskTargetPipelineHelper {
return this._fn(t => cd(t, path));
}
glob(globPath: string): TaskTargetPipelineHelper {
return this._fn(t => glob(t, globPath));
}
async unzip(): Promise<TaskTargetPipelineHelper> {
return this._afn(unzip);
}
read(): TaskTargetPipelineHelper {
return this._fn(read);
}
cmd(_cmd: ValidCmd): TaskTargetPipelineHelper {
return this._fn(t => cmd(t, _cmd));
}
assignMeta(meta: Partial<TaskTargetMeta>): TaskTargetPipelineHelper {
return this._fn(t => assignMeta(t, meta));
}
/**
* @todo Nested versions of this don't currently work, but they could if we
* turn __collection into an array of collections
*/
collect(_c: Set<TaskTargetPipelineHelper>) {
this.__collection = _c;
return this;
}
}
export async function run(target: TaskTarget): Promise<ProcessPromise> {
export async function run(target: TaskTarget): Promise<ProcessOutput> {
const command = target.toShell();
return await $({ nothrow: true })`bash -c ${command}`;
}
/**
 * Verify all targets, run their shell pipelines in parallel, and shape the
 * results for loading into the database:
 *  - targets with .aggregate are merged per-id into a single combined result
 *  - non-aggregate targets pass through one result each
 *  - a synthetic "base_data_manager_metadata" CSV target is appended,
 *    describing each non-aggregate target's per-table metadata
 * @param targets Built TaskTargets (e.g. from execPaths)
 * @returns Aggregated results, then non-aggregate results, then the metadata result
 */
export async function runAll(targets: TaskTarget[]): Promise<RunOutput[]> {
    const finalTargets = await verify(targets);
    // quiet=true: suppress the per-job progress/summary output
    const results = await parallel(finalTargets, run, true);
    const nonAggregateTargets: TaskTarget[] = finalTargets.filter(t => !t.aggregate);
    const nonAggregateResults: RunOutput[] = [];
    const aggregateResultsMap: Record<string, RunOutput> = {};
    // == Aggregate tables ==
    // Some TaskTargets have .aggregate: true, which means they should all be combined
    // into a single task with the id of the .id property
    // results[idx] corresponds to finalTargets[idx] (parallel preserves order)
    for (const [idx, r] of results.entries()) {
        const t = finalTargets[idx];
        if (!t.aggregate) {
            nonAggregateResults.push({
                target: t,
                result: r
            });
            continue;
        }
        const aggregateId = t.id;
        const prevResult = aggregateResultsMap[aggregateId]?.result;
        aggregateResultsMap[aggregateId] = {
            target: t, // Use target t for metadata, so it will use the last target
            result: combineProcessOutputAggregate(prevResult as (ProcessOutputAggregate | undefined), t, r)
        };
    }
    // == Metadata table ==
    // Each TaskTarget has things like perRowDescription and other things we want to store
    // and output. this creates a single TaskTarget for all that perTable metadata
    // Minimal RFC-4180-style escaping: quote only when the value contains
    // a quote, comma, or newline; double any embedded quotes
    function csvEscape(s: string | undefined) {
        if (s === undefined) {
            return "";
        }
        if (s.includes("\"") || s.includes(",") || s.includes("\n")) {
            return `"${s.replace(/\"/g, "\"\"")}"`;
        }
        return s;
    }
    // NOTE(review): only non-aggregate targets get metadata rows here —
    // confirm aggregate tables are intentionally excluded
    let metadataCSV = "id,perRowDescription,perRowTags,columnMeta,metaId\n";
    for (const t of nonAggregateTargets) {
        const tableNamePart = t.id;
        const perRowDescriptionPart = t.perRowDescription;
        const perRowTagsPart = t.perRowTags;
        const columnMetaPart = t.columnMeta?.join(",") ?? "";
        const metaIdPart = t.metaId;
        metadataCSV += [
            csvEscape(tableNamePart),
            csvEscape(perRowDescriptionPart),
            csvEscape(perRowTagsPart),
            csvEscape(columnMetaPart),
            csvEscape(metaIdPart)
        ].join(",") + "\n";
    }
    // Won't be removed by verify() because we're adding it after that's used
    // TODO: Would be nice to bake this into TaskTarget/verify for tasks that dont point
    // to a real path
    const metadataTarget = new TaskTarget("<none>");
    metadataTarget
        // id, perRowDescription, perRowTags, columnMeta, metaId
        .assignMeta({
            idValue: "base_data_manager_metadata",
            columnMeta: ["any", "any", "any", "any", "any"],
            perRowTags: "internal",
        });
    // Synthetic ProcessOutputSimple — no process ran, so exitCode 0 / empty stderr
    const metadataResult= {
        stdout: metadataCSV,
        stderr: "",
        exitCode: 0,
        duration: 0, // TODO
        ok: true
    };
    const metadataRunOutput: RunOutput = { target: metadataTarget, result: metadataResult };
    const aggregateResults: RunOutput[] = Object.values(aggregateResultsMap);
    return aggregateResults.concat(nonAggregateResults).concat(metadataRunOutput);
}

176
main.ts
View file

@ -1,158 +1,56 @@
import fs from 'node:fs/promises';
import fsSync from 'node:fs';
import nodePath from "node:path";
import { DatabaseSync } from "node:sqlite";
import "./data-export/facebook.ts";
import { type DatabaseSync } from "node:sqlite";
import { fileURLToPath } from "node:url";
import { google } from "./data-export/google.ts";
import { TaskTargetPipelineHelper, TaskTarget, verify } from "./data-export/task.ts";
import { parallel } from "./data-export/parallel.ts";
import { ProcessOutput } from 'zx';
import { facebook, facebook_v2 } from "./data-export/facebook.ts";
import { type TaskTarget, execPaths } from "./data-export/task.ts";
import * as DataIO from "./data-export/io.ts";
declare module "./data-export/task.ts" {
interface TaskTargetPipelineHelper {
google: typeof google;
}
}
const __filename = fileURLToPath(import.meta.url);
Object.assign(TaskTargetPipelineHelper.prototype, {
google
});
export const startTime = Date.now();
export const elapsed = ()=>`${((Date.now() - startTime) / 1000).toFixed(2)}s`;
async function loadCSVTable(
db: DatabaseSync,
target: TaskTarget,
result: ProcessOutput
) {
const id = target.id;
const table = id;
const tmpPath = `/tmp/${id}.csv`;
console.log(`Writing ${tmpPath}`);
const fd = await fs.open(tmpPath, 'w');
await fs.writeFile(fd, result.stdout, { encoding: 'utf8' });
await fd.close();
console.log(`Loading ${tmpPath} → table ${table}`);
export async function loadTaskInNewDb(targets: TaskTarget[]): Promise<DatabaseSync> {
console.log(`${elapsed()} - Run all targets`);
const out = await DataIO.runPipeline(targets);
console.log(`${elapsed()} - Final targets exported to CSV. Got ${out.length} targets`);
// const headers = lines[0].split(",");
// const columnsSql = headers.map(h => `"${h}" TEXT`).join(", ");
db.exec(`CREATE VIRTUAL TABLE temp.tmp_${table} USING csv(filename='${tmpPath}');`);
// db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`);
// db.exec(`DROP TABLE IF EXISTS intermediate;`);
return `tmp_${table}`;
}
function getColumnNames(db: DatabaseSync, tableName: string) {
return db.prepare(`PRAGMA table_info(${tableName})`).all().map(c => c.name) as string[];
}
function templateToSql(template: string, columns: string[]) {
// Convert '{0}, {1}' to '%s, %s'
const args: string[] = [];
const sqlTemplate = template.replace(/\{(\d+)\}/g, (match, index) => {
args.push(columns[parseInt(index)]);
return '%s';
});
return `printf('${sqlTemplate}', ${args.join(', ')})`;
}
function templateToSqlExpr(template: string, columns: string[]) {
// perRowTags is already a SQL expression; just substitute {N} with column names
return template.replace(/\{(\d+)\}/g, (_match, index) => columns[parseInt(index)]);
// TODO: Add an option to output everything plainly as CSV in a single directory
console.log(`${elapsed()} - Building combined database table in :memory:`);
const db = DataIO.getDefaultDB();
await DataIO.loadIntoDb(db, out);
const tableCount = db.prepare(`SELECT COUNT(*) as count FROM base_data_manager_metadata`).get()!.count;
console.log(`${elapsed()} - Single database built with ${tableCount} tables`);
return db;
}
async function main() {
// Configurable stuff
const sqlitePath = 'your.db';
const t = TaskTargetPipelineHelper;
const targets = TaskTargetPipelineHelper.pipeline([
// new TaskTarget("/home/cobertos/Seafile/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01"),
new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json"),
//new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2();
//new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001").facebook_v2();
])
.facebook();
// .facebook_v2();
// .google();
console.log(`${elapsed()} - Building targets`);
const targets = await execPaths([
{path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
// {path: "/home/cobertos/Seafile/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01", op: facebook()}
// {path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip", op: pipe(unzip(), facebook_v2())}
// {path: "/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001", op: facebook_v2()}
]);
console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);
// TODO: Make this less painful in task.ts
// let zipTask = t.fork().zip("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-DEADNAME-May2021-json.zip");
// await (zipTask.fsImpl as any).init();
const db = await loadTaskInNewDb(targets);
const finalTargets = await verify(targets);
const results = await parallel(finalTargets, true);
if (fsSync.existsSync(sqlitePath)) {
await fs.unlink(sqlitePath); // unlink the old
}
// Open an in-memory db for speed
const db = new DatabaseSync(":memory:", { allowExtension: true });
db.loadExtension("/home/cobertos/sqlite-files/csv.so")
db.enableLoadExtension(false);
// New output table
db.exec(`CREATE TABLE combined (timestamp TEXT, description TEXT, sender TEXT, receiver TEXT, tags TEXT, lat REAL, lng REAL);`);
for (const [idx, target] of targets.entries()) {
const result = results[idx];
if (!target.columnMeta) {
continue; // No column information
}
const tableName = await loadCSVTable(db, target, result);
const columnNames = getColumnNames(db, tableName);
// Now find what to insert into each row of the combined
let descriptionPart = `'An entry from the ${tableName} table'`; // Default is just kinda garbo...
if (target.perRowDescription) {
descriptionPart = templateToSql(target.perRowDescription, columnNames);
}
let timestampPart: string | undefined;
let senderPart = 'NULL';
let receiverPart = 'NULL';
let latPart = 'NULL';
let lngPart = 'NULL';
for (const [idx, col] of target.columnMeta.entries()) {
const columnName = columnNames[idx];
if (col === "isodatetime") {
timestampPart = columnName;
} else if (col === "sender") {
senderPart = columnName;
} else if (col === "receiver") {
receiverPart = columnName;
} else if (col === "lat") {
latPart = columnName;
} else if (col === "lng") {
lngPart = columnName;
}
}
if (!timestampPart) {
continue;
}
let tagsPart = 'NULL';
if (target.perRowTags) {
tagsPart = templateToSqlExpr(target.perRowTags, columnNames);
}
// OFFSET + LIMIT to ignore the CSV headers
db.exec(`INSERT INTO combined SELECT ${timestampPart}, ${descriptionPart}, ${senderPart}, ${receiverPart}, ${tagsPart}, ${latPart}, ${lngPart} FROM ${tableName} LIMIT -1 OFFSET 1;`);
}
// Dump it all to the path specified
db.exec(`VACUUM main INTO '${sqlitePath}'`);
// Now dump it as a CSV
const rows = db.prepare(`
SELECT timestamp || ',' || '"' || replace(description, '"', '""') || '"' as row FROM combined
`)
.all()
.map(r => r.row)
.join('\n');
db.close();
await fs.writeFile('your.csv', rows, { encoding: "utf8" });
console.log(`${elapsed()} - Writing database to disk at "${sqlitePath}"`);
DataIO.dumpDBToDisk(db, sqlitePath);
console.log(`${elapsed()} - Database written to disk`);
}
if (process.argv[1] === __filename) {
main();
}
// TODO: Move this into here
// csvSink(

View file

@ -27,6 +27,7 @@
},
"devDependencies": {
"@types/node": "^24.1.0",
"csv-parse": "^6.1.0",
"typescript": "^5.9.3"
}
}

8
pnpm-lock.yaml generated
View file

@ -33,6 +33,9 @@ importers:
'@types/node':
specifier: ^24.1.0
version: 24.10.0
csv-parse:
specifier: ^6.1.0
version: 6.1.0
typescript:
specifier: ^5.9.3
version: 5.9.3
@ -59,6 +62,9 @@ packages:
buffer-crc32@0.2.13:
resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==}
csv-parse@6.1.0:
resolution: {integrity: sha512-CEE+jwpgLn+MmtCpVcPtiCZpVtB6Z2OKPTr34pycYYoL7sxdOkXDdQ4lRiw6ioC0q6BLqhc6cKweCVvral8yhw==}
dom-serializer@2.0.0:
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
@ -176,6 +182,8 @@ snapshots:
buffer-crc32@0.2.13: {}
csv-parse@6.1.0: {}
dom-serializer@2.0.0:
dependencies:
domelementtype: 2.3.0

View file

@ -1,9 +1,11 @@
import test from "node:test";
import nodePath from "node:path";
import { strict as assert } from "node:assert";
import { TaskTargetPipelineHelper, TaskTarget, verify, run } from "../data-export/task.ts";
import { TaskTarget, verify, run, unzip, pipe } from "../data-export/task.ts";
import { parallel } from "../data-export/parallel.ts";
import "../data-export/facebook.ts";
import { facebook, facebook_v2 } from "../data-export/facebook.ts";
import * as DataIO from "../data-export/io.ts";
import { parse } from "csv-parse/sync"; // For better diffs + error checking of CSV output
const THIS_FILE = import.meta.dirname;
const FACEBOOK_V1_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01');
@ -11,67 +13,56 @@ const FACEBOOK_V1_ZIPPED = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021
const FACEBOOK_V2_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2025-11-29');
test("facebook: Can load the 2021 export", async (t) => {
const targets = TaskTargetPipelineHelper.pipeline([
const targets = [
new TaskTarget(FACEBOOK_V1_DIR)
])
.facebook();
const finalTargets = await verify(targets);
const result = await parallel(finalTargets, true);
]
const builtTargets = await facebook()(targets);
const out = await DataIO.runPipeline(builtTargets);
const idAndCSVs: [string, string][] = [];
for (const [idx, r] of result.entries()) {
const target = finalTargets[idx];
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(r.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, r.stdout]);
for (const {target, result} of out) {
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(result.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, result.stdout]);
}
const csvs = idAndCSVs
.sort() // Keep stable ordering for snapshots
.map(v => v[1])
.map(v => parse(v[1]))
t.assert.snapshot(csvs);
});
test("facebook: Can load the 2021 export zipped", async (t) => {
const targets = await TaskTargetPipelineHelper.pipeline([
const targets = [
new TaskTarget(FACEBOOK_V1_ZIPPED)
])
.unzip();
const targets2 = targets
.facebook();
const finalTargets = await verify(targets2);
const result = await parallel(finalTargets, true);
];
const builtTargets = await pipe(unzip(), facebook())(targets);
const out = await DataIO.runPipeline(builtTargets);
const idAndCSVs: [string, string][] = [];
for (const [idx, r] of result.entries()) {
const target = finalTargets[idx];
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(r.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, r.stdout]);
for (const {target, result} of out) {
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(result.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, result.stdout]);
}
const csvs = idAndCSVs
.sort() // Keep stable ordering for snapshots
.map(v => v[1])
.map(v => parse(v[1]))
t.assert.snapshot(csvs);
});
test("facebook: Can load the 2025 export", async (t) => {
const targets = TaskTargetPipelineHelper.pipeline([
const targets = [
new TaskTarget(FACEBOOK_V2_DIR)
])
.facebook_v2();
const finalTargets = await verify(targets);
const result = await parallel(finalTargets, true);
]
const builtTargets = await facebook_v2()(targets);
const out = await DataIO.runPipeline(builtTargets);
const idAndCSVs: [string, string][] = [];
for (const [idx, r] of result.entries()) {
const target = finalTargets[idx];
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(r.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, r.stdout]);
for (const {target, result} of out) {
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(result.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, result.stdout]);
}
const csvs = idAndCSVs
.sort() // Keep stable ordering for snapshots
.map(v => v[1])
.map(v => parse(v[1]))
t.assert.snapshot(csvs);
});

File diff suppressed because it is too large Load diff

View file

@ -11,3 +11,6 @@
* `facebook-json-2021-05-01` - Facebook JSON export
* `facebook-json-2025-11-29` - Facebook JSON export
* [`discord-chat-exporter-2026-02`](./discord-chat-exporter-2026-02.md) - Discord export with [DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter) sometime around Feb 2026
* [`discord-json-2021-01`](./discord-json-2021-01.md) - Discord JSON export
* [`snapchat-2023-11`](./snapchat-2023-11.md) - Snapchat JSON + HTML export

View file

@ -0,0 +1,25 @@
# discord-chat-exporter-2026-02
An export from `DiscordChatExporter`, a comprehensive third-party Discord chat export tool
## Export methodology
This uses the version of `DiscordChatExporter` that existed at the top of the releases tab on GitHub around `2026 February`. **TODO: figure out version**
This export used a command something like the following to try to get _everything_ `dotnet DiscordChatExporter.Cli.dll export -t xxx -o ~/DiscordChatExporter -f json --media --reuse-media --include-threads -c xxx`
* It uses `export` command and `-c` but it's the same for `exportguild` and `-g`
* `-f json` so only the json export
* `--media` download all media
* `--reuse-media` not quite sure what this does because it puts it in a folder per channel...
* `--include-threads` to get any threads
## Manual edits
* Lots of image replacing + placeholders
* Had to rename the folders
## Notes
The export format has files and folders with similar, information-dense names. I tried to preserve that as that's the only way to correlate between the folder and the file name
* No exif on any media files
* There's embeds, thumbnails in the example chat messages but I have no other specimen

View file

@ -0,0 +1,145 @@
{
"guild": {
"id": "111111111111111111",
"name": "xxxxxxxx",
"iconUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
},
"channel": {
"id": "111111111111111111",
"type": "xxxxxxxxxxxxx",
"categoryId": "111111111111111111",
"category": "xxxxxxxxxxxxx",
"name": "xxxxxxx",
"topic": null
},
"dateRange": {
"after": null,
"before": null
},
"exportedAt": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"messages": [
{
"id": "111111111111111111",
"type": "xxxxxxxxxxxxxxx",
"timestamp": "2020-04-13T10:09:08.000000+00:00",
"timestampEdited": null,
"callEndedTimestamp": null,
"isPinned": false,
"content": "xxxxxxxxxxxxxxxxxx",
"author": {
"id": "111111111111111111",
"name": "xxxxxxxx",
"discriminator": "1111",
"nickname": "xxxxxxxx",
"color": null,
"isBot": false,
"roles": [],
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
},
"attachments": [],
"embeds": [],
"stickers": [],
"reactions": [],
"mentions": [],
"inlineEmojis": []
},
{
"id": "111111111111111111",
"type": "xxxxxxx",
"timestamp": "2020-04-13T10:09:08.000000+00:00",
"timestampEdited": null,
"callEndedTimestamp": null,
"isPinned": false,
"content": "xxxxxxxxx",
"author": {
"id": "111111111111111111",
"name": "xxxxxxxx",
"discriminator": "1111",
"nickname": "xxxxxxxx",
"color": null,
"isBot": false,
"roles": [],
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
},
"attachments": [],
"embeds": [],
"stickers": [],
"reactions": [],
"mentions": [],
"inlineEmojis": []
},
{
"id": "111111111111111111",
"type": "xxxxxxx",
"timestamp": "2020-04-13T10:09:08.000000+00:00",
"timestampEdited": null,
"callEndedTimestamp": null,
"isPinned": false,
"content": "https://example.com/example.png",
"author": {
"id": "111111111111111111",
"name": "xxxxxxxx",
"discriminator": "1111",
"nickname": "xxxxxxxx",
"color": null,
"isBot": false,
"roles": [],
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
},
"attachments": [],
"embeds": [
{
"title": "",
"url": "https://example.com/example.png",
"timestamp": null,
"description": "",
"thumbnail": {
"url": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/example.png",
"width": 111,
"height": 111
},
"images": [],
"fields": [],
"inlineEmojis": []
}
],
"stickers": [],
"reactions": [],
"mentions": [],
"inlineEmojis": []
},
{
"id": "111111111111111111",
"type": "xxxxxxx",
"timestamp": "2020-04-13T10:09:08.000000+00:00",
"timestampEdited": null,
"callEndedTimestamp": null,
"isPinned": false,
"content": "xxx",
"author": {
"id": "111111111111111111",
"name": "xxxxxxxx",
"discriminator": "1111",
"nickname": "xxxxxxxx",
"color": null,
"isBot": false,
"roles": [],
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
},
"attachments": [
{
"id": "111111111111111111",
"url": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/unknown-SUFFIX.png",
"fileName": "unknown.png",
"fileSizeBytes": 111111
}
],
"embeds": [],
"stickers": [],
"reactions": [],
"mentions": [],
"inlineEmojis": []
}
],
"messageCount": 111
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.2 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.3 KiB

41
test/fixtures/discord-json-2021-01.md vendored Normal file
View file

@ -0,0 +1,41 @@
# discord-json-2021-01
## Manual edits
* images -> placeholders
* `accounts/avatar.png`
* manually scrub folder names
* `account/applications/0000000000000`
## Notes about files
* `activity/`
* All the .json are NDJSON so some json tools don't like them
* _Massive_ files. They hang scrub.ts for a long long time (had to run these piecemeal)
* These files also have an _incredible_ amount of shapes and variance.
* Instead of outputting all the shapes I made a sort of "super-object" to capture the shape with `jq -n '[inputs] | add' events-2021-00000-of-00001.json.tmp > unique_shape.json` and then scrubbing `unique_shape.json`
* `messages/`
* I hand did these to keep all the ids the same
* There are multiple types of chats. DMs, guild channels, etc
* I hand did the csvs as I have no scrubber for that
* These are only **THE EXPORTING USERS MESSAGES**, no other user, just fyi
* Ids in `messages.csv` are just the id of the message, not of any user
* There is the potential to derive missing info from a channel via `@` tags sent or possibly via attachments. Maybe...
* `11111111111111111`
* This one has a shorter id (it's an older one)
* Has `type: 0` but there's no guild information in `channel.json`
* The user name was `null` in `index.json`
* It's a really odd one
* `222222222222222222`
* This was a dm channel (said `direct message with xxx#7777` in index.json)
* Has `type: 1` and there are two recipients (just the ids) in `channel.json`
* Unfortunately that's all the info in the export
* `333333333333333333`
* This was a normal guild channel
* `type: 0` and there's guild information in `channel.json`
* I kept a good set of messages around from this one to show how attachments and other stuff works
* The last message seemed to be a link not as an attachment. Links just seem to be normal text
* `programs/`
* was empty...
* `servers/`
* Info about _some_ of the guilds we have ids for
* guild.json didn't really contain anything except the name
* I kept around the only guild I noticed an audit-log.json with info in it

View file

@ -0,0 +1,26 @@
__ __ ___ _ _ ___ ___ ___ _____ ___ _
\ \ / / / _ \ | | | | | _ \ o O O | \ / \ |_ _| / \ | |
\ V / | (_) | | |_| | | / o | |) | | - | | | | - | |_|
_|_|_ \___/ \___/ |_|_\ TS__[O] |___/ |_|_| _|_|_ |_|_| _(_)_
_| """ |_|"""""|_|"""""|_|"""""| <======|_|"""""|_|"""""|_|"""""|_|"""""|_| """ |
"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'./o--000'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'
___ ___ _ _ ___ ___ ___ _ _ _
|_ _| / __| o O O | || | | __| | _ \ | __| | | | | | |
| | \__ \ o | __ | | _| | / | _| |_| |_| |_|
|___| |___/ TS__[O] |_||_| |___| |_|_\ |___| _(_)_ _(_)_ _(_)_
_|"""""|_|"""""| <======|_|"""""|_|"""""|_|"""""|_|"""""|_| """ |_| """ |_| """ |
"`-0-0-'"`-0-0-'./o--000'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'
Welcome to your Discord Data Package!
Inside, you'll find a few JSON (JavaScript Object Notation) and CSV (Comma Separated Values) files
of the data we use to provide Discord's service to you. We've chosen these formats for ease of
processing. Furthermore, the files have been organized into logical groups to make it easy to
understand and work with (at least, we hope so)!
For more information, you can view our in-depth help article at the following URL:
https://support.discord.com/hc/articles/360004957991
All the best,
Discord Team

View file

@ -0,0 +1,16 @@
{
"id": "111111111111111111",
"name": "xxxxxxx",
"icon": null,
"description": "",
"summary": "",
"hook": false,
"verify_key": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"flags": 1,
"secret": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"redirect_uris": [],
"rpc_application_state": 1,
"store_application_state": 1,
"verification_state": 1,
"interactions_endpoint_url": null
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.7 KiB

View file

@ -0,0 +1,399 @@
{
"id": "111111111111111111",
"username": "xxxxxxxx",
"discriminator": 1111,
"email": "not_a_real_email@example.com",
"verified": false,
"avatar_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"has_mobile": false,
"needs_email_verification": false,
"premium_until": "2020-04-13T10:09:08.000000+00:00",
"flags": 11111111111111,
"phone": "xxxxxxxxxxxx",
"temp_banned_until": null,
"ip": "1.1.1.1",
"settings": {
"locale": "xxxxx",
"show_current_game": false,
"restricted_guilds": [],
"default_guilds_restricted": false,
"inline_attachment_media": false,
"inline_embed_media": false,
"gif_auto_play": false,
"render_embeds": false,
"render_reactions": false,
"animate_emoji": false,
"enable_tts_command": false,
"message_display_compact": false,
"convert_emoticons": false,
"explicit_content_filter": 1,
"disable_games_tab": false,
"theme": "xxxx",
"developer_mode": false,
"guild_positions": [
"111111111111111111",
"111111111111111111"
],
"detect_platform_accounts": false,
"status": "xxxxxx",
"afk_timeout": 111,
"timezone_offset": 111,
"stream_notifications_enabled": false,
"allow_accessibility_detection": false,
"contact_sync_enabled": false,
"native_phone_integration_enabled": false,
"animate_stickers": 1,
"friend_source_flags": {
"all": false
},
"guild_folders": [
{
"guild_ids": [
"111111111111111111"
],
"id": null,
"name": null,
"color": null
},
{
"guild_ids": [
"111111111111111111"
],
"id": null,
"name": null,
"color": null
}
],
"custom_status": null
},
"connections": [
{
"type": "xxxxxxxxx",
"id": "xxxxxxxxxxx",
"name": "xxxxxxxxxxx",
"revoked": false,
"visibility": 1,
"friend_sync": false,
"show_activity": false,
"verified": false
},
{
"type": "xxxxxxx",
"id": "xxxxxxxx",
"name": "xxxxxxxx",
"revoked": false,
"visibility": 1,
"friend_sync": false,
"show_activity": false,
"verified": false
}
],
"external_friends_lists": [
{
"user_id": "111111111111111111",
"platform_type": "xxxxx",
"name": "xxxxxxxx",
"id_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"friend_id_hashes": [
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1"
]
},
{
"user_id": "111111111111111111",
"platform_type": "xxxxxxxxx",
"name": "xxxxxxxxxxx",
"id_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"friend_id_hashes": [
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1"
]
}
],
"friend_suggestions": [],
"mfa_sessions": [],
"relationships": [
{
"id": "11111111111111111",
"type": 1,
"nickname": null,
"user": {
"id": "11111111111111111",
"username": "xxxxxxxxxxxx",
"avatar": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"discriminator": "1111",
"public_flags": 1
}
},
{
"id": "11111111111111111",
"type": 1,
"nickname": null,
"user": {
"id": "11111111111111111",
"username": "xxxx",
"avatar": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"discriminator": "1111",
"public_flags": 111
}
}
],
"payments": [
{
"id": "111111111111111111",
"created_at": "2020-04-13T10:09:08.000000+00:00",
"currency": "xxx",
"tax": 111,
"tax_inclusive": false,
"amount": 1111,
"amount_refunded": 1,
"status": 1,
"description": "xxxxxxxxxxxxxxxxxxxx",
"flags": 1,
"subscription": {
"id": "111111111111111111",
"type": 1,
"current_period_start": "2020-04-13T10:09:08.000000+00:00",
"current_period_end": "2020-04-13T10:09:08.000000+00:00",
"payment_gateway": null,
"payment_gateway_plan_id": "xxxxxxxxxxxxxxxxxxx",
"currency": "xxx",
"plan_id": "111111111111111111",
"items": [
{
"id": "111111111111111111",
"plan_id": "111111111111111111",
"quantity": 1
}
]
},
"payment_source": {
"id": "111111111111111111",
"type": 1,
"invalid": false,
"brand": "xxxx",
"last_4": "1111",
"expires_month": 11,
"expires_year": 1111,
"billing_address": {
"name": "xxxxxxxxxxxxx",
"line_1": "xxxxxxxxxxxxxxxxx",
"line_2": null,
"city": "xxxxxxxx",
"state": "xx",
"country": "xx",
"postal_code": "11111"
},
"country": "xx"
},
"sku_id": "111111111111111111",
"sku_price": 1111,
"sku_subscription_plan_id": "111111111111111111"
},
{
"id": "111111111111111111",
"created_at": "2020-04-13T10:09:08.000000+00:00",
"currency": "xxx",
"tax": 111,
"tax_inclusive": false,
"amount": 1111,
"amount_refunded": 1,
"status": 1,
"description": "xxxxxxxxxxxxxxxxxxxx",
"flags": 1,
"subscription": {
"id": "111111111111111111",
"type": 1,
"current_period_start": "2020-04-13T10:09:08.000000+00:00",
"current_period_end": "2020-04-13T10:09:08.000000+00:00",
"payment_gateway": null,
"payment_gateway_plan_id": "xxxxxxxxxxxxxxxxxxx",
"currency": "xxx",
"plan_id": "111111111111111111",
"items": [
{
"id": "111111111111111111",
"plan_id": "111111111111111111",
"quantity": 1
}
]
},
"payment_source": {
"id": "111111111111111111",
"type": 1,
"invalid": false,
"brand": "xxxx",
"last_4": "1111",
"expires_month": 11,
"expires_year": 1111,
"billing_address": {
"name": "xxxxxxxxxxxxx",
"line_1": "xxxxxxxxxxxxxxxxxx",
"line_2": null,
"city": "xxxxxxxxxx",
"state": "xx",
"country": "xx",
"postal_code": "11111"
},
"country": "xx"
},
"sku_id": "111111111111111111",
"sku_price": 1111,
"sku_subscription_plan_id": "111111111111111111"
}
],
"payment_sources": [
{
"id": "111111111111111111",
"type": 1,
"invalid": false,
"brand": "xxxx",
"last_4": "1111",
"expires_month": 11,
"expires_year": 1111,
"billing_address": {
"name": "xxxxxxxxxxxxx",
"line_1": "xxxxxxxxxxxxxxxxx",
"line_2": null,
"city": "xxxxxxxx",
"state": "xx",
"country": "xx",
"postal_code": "11111"
},
"country": "xx"
}
],
"guild_settings": [
{
"guild_id": null,
"suppress_everyone": false,
"suppress_roles": false,
"message_notifications": 1,
"mobile_push": false,
"muted": false,
"mute_config": null,
"channel_overrides": [
{
"channel_id": "111111111111111111",
"message_notifications": 1,
"muted": false,
"mute_config": null
}
],
"version": 11
},
{
"guild_id": "11111111111111111",
"suppress_everyone": false,
"suppress_roles": false,
"message_notifications": 1,
"mobile_push": false,
"muted": false,
"mute_config": null,
"channel_overrides": [
{
"channel_id": "111111111111111111",
"message_notifications": 1,
"muted": false,
"mute_config": null
},
{
"channel_id": "111111111111111111",
"message_notifications": 1,
"muted": false,
"mute_config": null
}
],
"version": 1
}
],
"library_applications": [
{
"application": {
"id": "111111111111111111",
"name": "xxxxxxxxxxxx",
"icon": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"description": "xxxxxxxxxxxxxxxxxxxxx",
"summary": "xxxxxxxxxxxxxxxxxxxxx",
"primary_sku_id": "111111111111111111",
"hook": false,
"slug": "xxxxxxxxxxxx",
"guild_id": "111111111111111111",
"verify_key": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
"publishers": [
{
"id": "111111111111111111",
"name": "xxxxxxxxxxx"
}
],
"developers": [
{
"id": "111111111111111111",
"name": "xxxxxxxxxxx"
},
{
"id": "111111111111111111",
"name": "xxxxxxxxxxxxxxxxxxxxxxxx"
}
]
},
"branch_id": "111111111111111111",
"sku_id": "111111111111111111",
"sku": {
"id": "111111111111111111",
"type": 1,
"premium": false,
"preorder_release_at": null,
"preorder_approximate_release_date": null
},
"flags": 1,
"created_at": "2020-04-13T10:09:08.000000+00:00",
"entitlements": [
{
"id": "111111111111111111",
"sku_id": "111111111111111111",
"application_id": "111111111111111111",
"user_id": "111111111111111111",
"type": 1,
"deleted": false,
"gift_code_flags": 1,
"branches": [
"111111111111111111"
]
}
]
}
],
"entitlements": [
{
"id": "111111111111111111",
"sku_id": "111111111111111111",
"application_id": "111111111111111111",
"user_id": "111111111111111111",
"type": 1,
"deleted": false,
"gift_code_flags": 1,
"branches": [
"111111111111111111"
],
"sku_name": "xxxxxxxxxxxx"
}
],
"user_activity_application_statistics": [
{
"application_id": "111111111111111111",
"last_played_at": "2020-04-13T10:09:08.000000+00:00",
"total_duration": 1111,
"total_discord_sku_duration": 1
},
{
"application_id": "111111111111111111",
"last_played_at": "2020-04-13T10:09:08.000000+00:00",
"total_duration": 111111,
"total_discord_sku_duration": 1
}
],
"notes": {
"111111111111111111": "xxxx"
}
}

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View file

@ -0,0 +1 @@
{"id": "11111111111111111", "type": 0}

View file

@ -0,0 +1,2 @@
ID,Timestamp,Contents,Attachments
8888888888,2022-02-22 22:22:22.222222+00:00,Heyo,
1 ID Timestamp Contents Attachments
2 8888888888 2022-02-22 22:22:22.222222+00:00 Heyo

View file

@ -0,0 +1 @@
{"id": "222222222222222222", "type": 1, "recipients": ["00000000000000000", "1111111111111111"]}

View file

@ -0,0 +1,2 @@
ID,Timestamp,Contents,Attachments
2222222222222,2022-22-22 22:22:22.22222+00:00,Heyo,
1 ID Timestamp Contents Attachments
2 2222222222222 2022-22-22 22:22:22.22222+00:00 Heyo

View file

@ -0,0 +1 @@
{"id": "333333333333333333", "type": 0, "name": "generalchat", "guild": {"id": "333333333333333332", "name": "xxx"}}

View file

@ -0,0 +1,6 @@
ID,Timestamp,Contents,Attachments
000000000000000005,2011-02-02 02:05:02.000000+00:00,Huh what the heck is this message,
000000000000000004,2011-02-02 02:04:02.000000+00:00,<:thonk:000000000000000000><:thonk:000000000000000000><:thonk:000000000000000000>,
000000000000000003,2011-02-02 02:03:02.000000+00:00,"(so <@00000000000000000> who are you)",
000000000000000002,2011-02-02 02:02:02.000000+00:00,,https://cdn.discordapp.com/attachments/000000000000000000/000000000000000000/image.png
000000000000000001,2011-02-02 02:01:02.000000+00:00,https://google.com/whatever,
1 ID Timestamp Contents Attachments
2 000000000000000005 2011-02-02 02:05:02.000000+00:00 Huh what the heck is this message
3 000000000000000004 2011-02-02 02:04:02.000000+00:00 <:thonk:000000000000000000><:thonk:000000000000000000><:thonk:000000000000000000>
4 000000000000000003 2011-02-02 02:03:02.000000+00:00 (so <@00000000000000000> who are you)
5 000000000000000002 2011-02-02 02:02:02.000000+00:00 https://cdn.discordapp.com/attachments/000000000000000000/000000000000000000/image.png
6 000000000000000001 2011-02-02 02:01:02.000000+00:00 https://google.com/whatever

View file

@ -0,0 +1,5 @@
{
"11111111111111111": null,
"222222222222222222": "Direct Message with xxx#7777",
"333333333333333333": "generalchat"
}

View file

@ -0,0 +1,18 @@
[
{
"id": "111111111111111111",
"user_id": "111111111111111111",
"action_type": 11,
"changes": [
{
"key": "xxxx",
"new_value": [
{
"name": "xxxxxxxxxx",
"id": "111111111111111111"
}
]
}
]
}
]

View file

@ -0,0 +1,4 @@
{
"id": "444444444444444444",
"name": "xxx"
}

View file

@ -0,0 +1,3 @@
{
"444444444444444444": "xxx"
}

9
test/fixtures/facebook-json.md vendored Normal file
View file

@ -0,0 +1,9 @@
# facebook-json exports
## `facebook-json-2021-05-01`
* Manual edits of images -> placeholders, folder names, key names (in support cases specifically)
* This was one of the first few datasets I scrubbed so a lot of manual work was done. Should be easier now
* I went poking around this one and there was no exif on any of the images I looked at, only in the json was there exif
## `facebook-json-2025-11-29`
* Manual edits of images -> placeholders, folder names, key names
* This was one of the first few datasets I scrubbed so a lot of manual work was done. Should be easier now

83
test/fixtures/snapchat-2023-11.md vendored Normal file
View file

@ -0,0 +1,83 @@
# Snapchat
Exported from the web exporter
## Manual Edits
* memories and chat_media placeholders
* Snapchat seemed to have events exported where the `+` in emails broke my parsing and the email contained a ' ' instead, so I fixed that
* Keys use unique dates in `json/in_app_surveys.json`
* Keys in `json/chat_history.json` use user ids, had to manually truncate and edit
## Notes
* `memories/`
* No exif data
* Does not seem to have any correlating .json file. It's just a dump to the disk
* files are like `2020-01-01_aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa-main.jpg`
* Date has no time, just date
* `aaaaa...` seems to be a guid
* `main` | `overlay` at the end, with the same guid
* `main` is just the image
* `overlay` looks to be like a filter or some other applied thing that was saved with the memory
* Images may be rotated
* `chat_media/`
* No exif
* files are like `2020-01-01_b~xxxx.jpeg`
* sometimes they have `main` | `overlay` or something
* No idea what the `b~` means or if the xxx is an id or what. Perhaps base64 encoded protobuf, but nothing I decoded seemed to correlate to any identifier in the export
* Only referenced from ... oh... it's broken. The `type: "MEDIA"` entries in Snapchat's exporter have all-empty "content" fields. Amazing... So this will have to be pieced together some other way
* This will most likely have to be manually repaired
* `json/`
* Scrubbed
* See manual changes
* Comes with both an html and json export (I will only keep the json after deduping)
* NOTE: That the html export has explanations which might be useful to explain some of these fields...
* I compared all .html to .json side by side (browser <-> text editor) and all of them were present in both and had the same data except `snap_history.html` (was empty in .html) and `faq.html` (just informational)
* I noticed on chat history html pages it puts _every_ category, not just the ones I have. Might be useful future reference
```
Frequently Asked Questions
Login History and Account Information
Snap History Metadata
Chat History Metadata
My AI
Our Story & Spotlight Content
Spotlight Replies
Purchase History
Snapchat Support History
User Profile
Public Profiles
Friends
Ranking
Story History
Account History
Location
Search History
Terms History
Subscriptions
Bitmoji
In-app Surveys
Reported Content
Bitmoji Kit
Connected Apps
Talk History
Ads Manager
My Lenses
Memories
Cameos
Email Campaign History
Snap Tokens
Payouts
Orders
Snap Map Places
Shopping Favorites
Payments
My Sounds
Photoshoot Snaps
Feature Emails
AI Selfies
```

View file

@ -0,0 +1,38 @@
{
"Basic Information": {
"Username": "xxxxxxxxx",
"Name": "xxxxx",
"Creation Date": "2020-04-13 10:09:08 UTC",
"Registration IP": "",
"Country": ""
},
"Device Information": {
"Make": "",
"Model ID": "",
"Model Name": "",
"Language": "",
"OS Type": "",
"OS Version": "",
"Connection Type": ""
},
"Device History": [],
"Privacy Policy and Terms of Service Acceptance History": [],
"Custom Creative Tools Terms": [],
"Login History": [
{
"IP": "1.1.1.1",
"Country": "xx",
"Created": "2020-04-13 10:09:08 UTC",
"Status": "xxxxxxx",
"Device": "some/path"
},
{
"IP": "1.1.1.1",
"Country": "xx",
"Created": "2020-04-13 10:09:08 UTC",
"Status": "xxxxxxx",
"Device": "some/path"
}
],
"Family Center": []
}

View file

@ -0,0 +1,47 @@
{
"Display Name Change": [
{
"Date": "2020-04-13 10:09:08 UTC",
"Display Name": "xxxxx"
},
{
"Date": "",
"Display Name": "xxxxxx"
}
],
"Email Change": [
{
"Date": "2020-04-13 10:09:08 UTC",
"Email Address": "not_a_real_email@example.com"
}
],
"Mobile Number Change": [],
"Password Change": [
{
"Date": "2020-04-13 10:09:08 UTC"
},
{
"Date": "2020-04-13 10:09:08 UTC"
}
],
"Snapchat Linked to Bitmoji": [
{
"Date": "2020-04-13 10:09:08 UTC"
}
],
"Spectacles": [],
"Two-Factor Authentication": [],
"Account deactivated / reactivated": [],
"Download My Data Reports": [
{
"Date": "2020-04-13 10:09:08 UTC",
"Status": "xxxxxxx",
"Email Address": "not_a_real_email@example.com"
},
{
"Date": "2020-04-13 10:09:08 UTC",
"Status": "xxxxxxxxx",
"Email Address": "not_a_real_email@example.com"
}
]
}

View file

@ -0,0 +1,31 @@
{
"Basic Information": {
"First Name": "",
"Last Name": "",
"Email": "",
"Phone Number": "",
"Account Creation Date": "2020-04-13 10:09:08 UTC",
"Account Creation User Agent": ""
},
"Analytics": {
"App Open Count": 1,
"Avatar Gender": "xxxx",
"Outfit Save Count": 1,
"Share Count": 1
},
"Terms of Service Acceptance History": [
{
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"Acceptance Date": "2020-04-13 10:09:08"
},
{
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"Acceptance Date": "2020-04-13 10:09:08"
}
],
"Search History": [],
"Support Cases": [],
"Selfies": [],
"Keyboard Enable Full Access History (iOS only)": [],
"Connected Apps": []
}

View file

@ -0,0 +1,8 @@
{
"Cameos Selfie": {
"Cameos Body Selected": "xxxxxxxxxxxx",
"Hairstyle": "xxxxxxxxxxxx",
"Use My Cameos Selfie": "xxxxxxx"
},
"Cameos Stories": []
}

View file

@ -0,0 +1,42 @@
{
"some_friend": [
{
"From": "xxxxxxxxx",
"Media Type": "xxxxx",
"Created": "2020-04-13 10:09:08 UTC",
"Content": "",
"Conversation Title": null,
"IsSender": false,
"Created(microseconds)": 1111111111111
},
{
"From": "xxxxxxxxx",
"Media Type": "xxxx",
"Created": "2020-04-13 10:09:08 UTC",
"Content": "xxxxxxxxxxxxxxxxxx",
"Conversation Title": null,
"IsSender": false,
"Created(microseconds)": 1111111111111
}
],
"some_friend_too": [
{
"From": "xxxxxxxxxxxxxx",
"Media Type": "xxxxx",
"Created": "2020-04-13 10:09:08 UTC",
"Content": "",
"Conversation Title": "xxxxxxxxxxxxxxxx",
"IsSender": false,
"Created(microseconds)": 1111111111111
},
{
"From": "xxxxxxxxxxxxx",
"Media Type": "xxxx",
"Created": "2020-04-13 10:09:08 UTC",
"Content": "xxxxxxxxxxxxxxxxxxxxxx",
"Conversation Title": "xxxxxxxxxxxxxxxx",
"IsSender": false,
"Created(microseconds)": 1111111111111
}
]
}

View file

@ -0,0 +1,11 @@
{
"Login History": [],
"Permissions": [
{
"App": "xxxxxxx",
"Time": "2020-04-13 10:09:08 UTC",
"Type": "xxxxxxx"
}
],
"Connected Applications": []
}

View file

@ -0,0 +1,13 @@
{
"Email Campaign Subscriptions": [
{
"Email Campaign": "xxxxxxxxxxxxxxxx",
"Opt Out Status": "xxxxxxxxxxxx"
},
{
"Email Campaign": "xxxxxxxxxxxxxxx",
"Opt Out Status": "xxxxxxxxxxxx"
}
],
"Email Campaign History": []
}

View file

@ -0,0 +1,100 @@
{
"Friends": [
{
"Username": "xxxxxxxxxxxxx",
"Display Name": "xxxxxxxxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxx"
},
{
"Username": "xxxxxxxxxxxxxxx",
"Display Name": "xxxxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxxxx"
}
],
"Friend Requests Sent": [
{
"Username": "xxxxxxxxxx",
"Display Name": "xxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
},
{
"Username": "xxxxxxxxx",
"Display Name": "xxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
}
],
"Blocked Users": [
{
"Username": "xxxxxxxxxxxxxx",
"Display Name": "xxxxxxxxxxxxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
},
{
"Username": "xxxxxxxxxxxxxx",
"Display Name": "xxxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxx"
}
],
"Deleted Friends": [
{
"Username": "xxxxxx",
"Display Name": "xxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxx"
},
{
"Username": "xxxxxxxxxxxxxxx",
"Display Name": "xxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxx"
}
],
"Hidden Friend Suggestions": [],
"Ignored Snapchatters": [
{
"Username": "xxxxxxxxx",
"Display Name": "xxxxxxxxxxxxxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxx"
},
{
"Username": "xxxxxxxx",
"Display Name": "xxxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxx"
}
],
"Pending Requests": [
{
"Username": "xxxxxxxxxxxxxxx",
"Display Name": "xxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxx"
},
{
"Username": "xxxxxxxxxxxxxx",
"Display Name": "xxxxxxxxxxxxx",
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
"Source": "xxxxxxxxxxxxxxxx"
}
],
"Shortcuts": []
}

View file

@ -0,0 +1,26 @@
{
"Survey 2020/04/12": [
{
"Time": "xxxxxxxxxxxx",
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"Survey Response": "xxxxxxxxxx"
},
{
"Time": "xxxxxxxxxxxx",
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"Survey Response": "xxx"
}
],
"Survey 2020/04/13": [
{
"Time": "xxxxxxxxxxxx",
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"Survey Response": "xxxxxxxxxxxxxx"
},
{
"Time": "xxxxxxxxxxxx",
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"Survey Response": "some/path"
}
]
}

View file

@ -0,0 +1,23 @@
{
"Frequent Locations": [],
"Latest Location": [
{
"City": "",
"Country": "",
"Region": ""
}
],
"Home & Work": {},
"Daily Top Locations": [],
"Top Locations Per Six-Day Period": [],
"Location History": [],
"Businesses and public places you may have visited": [],
"Areas you may have visited in the last two years": [
{
"Time": "some/path",
"City": "xxxxxx",
"Region": "xxxxxxxx",
"Postal Code": "11111"
}
]
}

View file

@ -0,0 +1,6 @@
{
"Number of Stories Viewed": [
1
],
"Content Interests": []
}

View file

@ -0,0 +1,11 @@
{
"Shared Story": [],
"Spotlight History": [
{
"Story Date": "2020-04-13 10:09:08 UTC",
"Story URL": "url://somewhere",
"Action Type": "xxxx",
"View Time": "xxxxxxxxxxxxx"
}
]
}

View file

@ -0,0 +1,4 @@
{
"My AI Content": [],
"My AI Memory": []
}

View file

@ -0,0 +1,10 @@
{
"Public Users": [
"xxxxxxxxxxxxxxx"
],
"Publishers": [],
"Stories": [],
"Last Active Timezone": "some/path",
"Push Notifications": [],
"Hidden Category Sections": []
}

View file

@ -0,0 +1,15 @@
{
"Snap Inc. Terms of Service": [
{
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"Acceptance Date": "2020-04-13 10:09:08 UTC"
},
{
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxx",
"Acceptance Date": "2020-04-13 10:09:08 UTC"
}
],
"Custom Creative Tools Terms": [],
"Business Services Terms": [],
"Games Terms": []
}

View file

@ -0,0 +1,39 @@
{
"App Profile": {
"Country": "xx",
"Creation Time": "2020-04-13 10:09:08 UTC",
"Account Creation Country": "xxxxxxx",
"Platform Version": "xxxxxxx",
"In-app Language": "xx"
},
"Demographics": {
"Cohort Age": "",
"Derived Ad Demographic": ""
},
"Subscriptions": [],
"Engagement": [],
"Discover Channels Viewed": [],
"Breakdown of Time Spent on App": [],
"Ads You Interacted With": [],
"Interest Categories": [
"xxxxxx",
"xxxxxxxxxxxxxxxxxxx"
],
"Content Categories": [
"xxxxxxxxxxxxxxxxxxxxxxxxxxxx",
"some/path"
],
"Geographic Information": [],
"Interactions": {
"Web Interactions": [
"xxxxxxxxxxxxx",
"xxxxxxxxxxxxxxxxxxxxxx"
],
"App Interactions": [
"url://somewhere",
"url://somewhere"
]
},
"Off-Platform Sharing": [],
"Mobile Ad Id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
}

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.2 KiB

View file

@ -9,7 +9,6 @@ import {
cmd,
assignMeta,
verify,
TaskTargetPipelineHelper,
} from "../data-export/task.ts";
const THIS_FILE = import.meta.dirname;
@ -92,7 +91,7 @@ test("TaskTarget: pushToPipeline throws if read is not the first op", () => {
test("TaskTarget: clone produces an independent copy", () => {
const t = new TaskTarget("/foo").assignMeta({
idValue: "orig",
columnMeta: ["yeag"]
columnMeta: ["any"]
});
t.read();
const c = t.clone();
@ -155,41 +154,41 @@ test("toShell: cmd with function resolves at shell-generation time", () => {
// -- module-level functions ---------------------------------------------------
test("cd: clones and changes directory of each target", () => {
test("cd: clones and changes directory of each target", async () => {
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
const result = cd(targets, "sub");
const result = await cd("sub")(targets);
assert.equal(result[0].path, "/a/sub");
assert.equal(result[1].path, "/b/sub");
assert.equal(targets[0].path, "/a"); // originals unchanged
});
test("read: clones and adds a read op to each target", () => {
test("read: clones and adds a read op to each target", async () => {
const targets = [new TaskTarget("/a.txt"), new TaskTarget("/b.txt")];
const result = read(targets);
const result = await read()(targets);
assert.equal(result[0].pipeline[0].type, "read");
assert.equal(result[1].pipeline[0].type, "read");
assert.equal(targets[0].pipeline.length, 0); // originals unchanged
});
test("cmd: clones and appends a cmd op to each target", () => {
test("cmd: clones and appends a cmd op to each target", async () => {
const targets = [new TaskTarget("/a.txt")];
targets[0].read();
const result = cmd(targets, "jq .");
const result = await cmd("jq .")(targets);
assert.equal(result[0].pipeline.length, 2);
assert.equal(targets[0].pipeline.length, 1); // original unchanged
});
test("assignMeta: clones and sets meta on each target", () => {
test("assignMeta: clones and sets meta on each target", async () => {
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
const result = assignMeta(targets, { idValue: "myid" });
const result = await assignMeta({ idValue: "myid" })(targets);
assert.equal(result[0].id, "myid");
assert.equal(result[1].id, "myid");
assert.throws(() => targets[0].id); // originals have no id
});
test("taskGlob: returns matching targets across all input targets", () => {
test("taskGlob: returns matching targets across all input targets", async () => {
const targets = [new TaskTarget(FIXTURE_DIR)];
const result = taskGlob(targets, "friends/*.json");
const result = await taskGlob("friends/*.json")(targets);
assert.ok(result.length > 0);
assert.ok(result.every(r => r.path.endsWith(".json")));
});
@ -226,75 +225,3 @@ test("verify: filters a mixed list to only valid targets", async () => {
assert.equal(result[0], good);
});
// -- TaskTargetPipelineHelper -------------------------------------------------
test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => {
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a")]);
assert.ok(p instanceof TaskTargetPipelineHelper);
});
test("TaskTargetPipelineHelper: pipeline() is idempotent", () => {
const arr = [new TaskTarget("/a")];
const p1 = TaskTargetPipelineHelper.pipeline(arr);
const p2 = TaskTargetPipelineHelper.pipeline(p1);
assert.equal(p1, p2);
});
test("TaskTargetPipelineHelper: cd returns a new helper with paths changed", () => {
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a"), new TaskTarget("/b")]);
const p2 = p.cd("sub");
assert.ok(p2 instanceof TaskTargetPipelineHelper);
assert.equal(p2[0].path, "/a/sub");
assert.equal(p2[1].path, "/b/sub");
});
test("TaskTargetPipelineHelper: read returns a new helper with read ops added", () => {
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]);
const p2 = p.read();
assert.ok(p2 instanceof TaskTargetPipelineHelper);
assert.equal(p2[0].pipeline[0].type, "read");
});
test("TaskTargetPipelineHelper: cmd returns a new helper with cmd ops added", () => {
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]);
const p2 = p.read().cmd("jq .");
assert.equal(p2[0].toShell(), "cat /a.txt | jq .");
});
// -- collect ------------------------------------------------------------------
test("collect: the final end of a chain is added to the collection set", () => {
const collection = new Set<TaskTargetPipelineHelper>();
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/foo")]);
p.collect(collection);
const p2 = p.cd("sub");
assert.equal(collection.size, 1);
assert.ok(collection.has(p2));
});
test("collect: moving the chain end removes the old element and adds the new one", () => {
const collection = new Set<TaskTargetPipelineHelper>();
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/foo")]);
p.collect(collection);
const p2 = p.cd("sub");
const p3 = p2.read();
assert.equal(collection.size, 1);
assert.ok(collection.has(p3));
assert.ok(!collection.has(p2));
});
test("collect: gathers the ends of multiple independent pipeline branches", () => {
const collection = new Set<TaskTargetPipelineHelper>();
const b1 = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]).collect(collection).read();
const b2 = TaskTargetPipelineHelper.pipeline([new TaskTarget("/b.txt")]).collect(collection).read();
assert.equal(collection.size, 2);
assert.ok(collection.has(b1));
assert.ok(collection.has(b2));
const allTargets = [...collection].flat();
assert.equal(allTargets.length, 2);
});

225
timelinize.ts Normal file
View file

@ -0,0 +1,225 @@
import { type SQLOutputValue, type DatabaseSync } from "node:sqlite";
import { createWriteStream } from 'node:fs';
import { fileURLToPath } from "node:url";
import "./data-export/facebook.ts";
import { facebook } from "./data-export/facebook.ts";
import { execPaths, COLUMN_TYPES } from "./data-export/task.ts";
import * as DataIO from "./data-export/io.ts";
import {
startTime,
elapsed,
loadTaskInNewDb
} from "./main.ts";
const __filename = fileURLToPath(import.meta.url);
/**
 * Streams every row of `tableName` into `outputFile` as CSV (header + rows).
 *
 * NOTE(review): column names are discovered from the first row's keys, so an
 * empty table produces an empty file with no header row.
 * `tableName` is interpolated directly into the SQL; it is bracket-quoted so
 * keyword-like names work, but callers must not pass untrusted table names.
 */
function dumpDBTableToCSV(db: DatabaseSync, tableName: string, outputFile: string) {
  const stream = createWriteStream(outputFile);
  // RFC 4180-style cell: wrap in quotes, double any embedded quotes.
  // null/undefined become the empty string.
  const csvCell = (v: unknown) => `"${String(v ?? '').replace(/"/g, '""')}"`;
  // Bracket-quote the table name so names like "order" don't break the query
  const stmt = db.prepare(`SELECT * FROM [${tableName}]`);
  let headerWritten = false;
  for (const row of stmt.iterate()) {
    if (!headerWritten) {
      // Quote header cells too — a column name containing a comma or quote
      // would otherwise corrupt the CSV structure
      stream.write(Object.keys(row).map(csvCell).join(',') + '\n');
      headerWritten = true;
    }
    stream.write(Object.values(row).map(csvCell).join(',') + '\n');
  }
  stream.end();
}
/** Returns the column names of `tableName`, in declared order, via PRAGMA table_info. */
function getColumnNames(db: DatabaseSync, tableName: string) {
  const info = db.prepare(`PRAGMA table_info(${tableName})`).all();
  return info.map(column => column.name) as string[];
}
/**
 * Converts a '{0} did {1}'-style template into a SQLite printf() expression,
 * e.g. templateToSql('{0}, {1}', ['a','b']) => "printf('%s, %s', [a], [b])".
 *
 * Fixes over the previous version:
 * - literal text is escaped for single quotes ('' doubling), so an apostrophe
 *   in the template no longer breaks the generated SQL;
 * - out-of-range {n} indexes throw instead of silently emitting "undefined";
 * - column names are bracket-quoted, matching metaParts(), so reserved words
 *   like "from" are safe.
 *
 * NOTE(review): a literal '%' in the template would be interpreted by
 * printf(); templates are assumed not to contain one — TODO confirm.
 *
 * @param template Description template with {n} placeholders
 * @param columns  Column names, indexed by the placeholder number
 * @returns A SQL expression string suitable for a SELECT list
 * @throws RangeError when a placeholder index has no matching column
 */
function templateToSql(template: string, columns: string[]): string {
  const args: string[] = [];
  // Convert '{0}, {1}' to '%s, %s', collecting the referenced columns
  const sqlTemplate = template.replace(/\{(\d+)\}/g, (_match, index: string) => {
    const column = columns[Number.parseInt(index, 10)];
    if (column === undefined) {
      throw new RangeError(`Template index {${index}} out of range for ${columns.length} column(s)`);
    }
    // Wrap in brackets so column names like "from" don't cause any issues
    args.push(`[${column}]`);
    return '%s';
  });
  // The literal text lives inside a single-quoted SQL string literal
  const escaped = sqlTemplate.replace(/'/g, "''");
  return `printf('${escaped}', ${args.join(', ')})`;
}
/**
 * Renders a JS string as a SQL string literal; null/undefined become NULL.
 * Embedded single quotes are doubled per SQL escaping rules.
 */
function sqlLiteral(str: string | undefined | null): string {
  if (str == null) {
    return 'NULL';
  }
  return `'${str.replace(/'/g, "''")}'`;
}
/**
 * Builds a single "combined" timeline table out of every table produced by
 * the data-export pipeline, then dumps it to disk as SQLite and as CSV.
 *
 * Flow: run the facebook() pipeline → load results into a DB via
 * loadTaskInNewDb → iterate base_data_manager_metadata to classify each
 * table → INSERT rows into `combined` → write the debug .db and your.csv.
 *
 * Fix over the previous version: the fallback insert used type 'node',
 * which is not in the allowed type list documented below; it is now 'note',
 * the documented default. Also renames the internal helper
 * verifyMetdataRow → verifyMetadataRow (typo; used only inside main).
 */
async function main() {
  // Configure the tasks to run
  console.log(`${elapsed()} - Building targets`);
  const targets = await execPaths([
    {path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
  ]);
  console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);
  const db = await loadTaskInNewDb(targets);

  // New output tables
  db.exec(`CREATE TABLE combined (timestamp TEXT, description TEXT, type TEXT, sender TEXT, receiver TEXT, lat REAL, lng REAL, tags TEXT);`);
  // Allowed values for the `type` column:
  //(message, email, note,
  // social, location, media, event, document,
  // bookmark; defaults to note)

  type ColumnMetaType = (keyof typeof COLUMN_TYPES);
  interface MetadataRow {
    id: string,                    // also the name of the table holding the rows
    perRowDescription?: string,    // optional '{0}'-style template for per-row descriptions
    perRowTags?: string,           // CSV string of tags applied to every row of the table
    columnMeta: ColumnMetaType[],  // semantic type of each column, parallel to columnNames
    columnNames: string[],         // actual column names of the table `id`
    metaId?: string                // table holding per-table metadata (e.g. chat thread titles)
  }

  /**
   * Validates one raw row from base_data_manager_metadata.
   * Returns undefined (after logging) on any problem so the caller can skip it.
   */
  function verifyMetadataRow(input: Record<string, SQLOutputValue>): undefined | MetadataRow {
    const { id, perRowDescription, perRowTags, columnMeta: columnMetaCSV, metaId } = input;
    if (!id) {
      console.error("Row did not have id/tableName, skipping");
      return undefined;
    }
    if (typeof id !== "string") {
      console.error(`Id must be string, got ${typeof id}, ${id}`);
      return undefined;
    }
    if (!columnMetaCSV) {
      console.warn(`${id} did not have columnMeta, nothing to do. Skipping`);
      return undefined; // No column information
    }
    if (typeof columnMetaCSV !== "string") {
      console.warn(`${id} did not have columnMeta of type string. Skipping`);
      return undefined;
    }
    const columnMeta = columnMetaCSV.split(",") as ColumnMetaType[];

    // Get the column names from the table id; the two lists must be parallel
    const columnNames = getColumnNames(db, id);
    if (columnNames.length !== columnMeta.length) {
      console.error(`columnNames and columnMeta did not have same length. skipping`);
      return undefined;
    }

    if (typeof perRowDescription !== "string" && perRowDescription !== undefined && perRowDescription !== null) {
      console.warn(`Invalid typeof perRowDescription, was ${typeof perRowDescription}, value ${perRowDescription}`);
      return undefined;
    }
    if (typeof perRowTags !== "string" && perRowTags !== undefined && perRowTags !== null) {
      console.warn(`Invalid typeof perRowTags, was ${typeof perRowTags}, value ${perRowTags}`);
      return undefined;
    }
    if (typeof metaId !== "string" && metaId !== undefined && metaId !== null) {
      console.warn(`Invalid typeof metaId, was ${typeof metaId}, value ${metaId}`);
      return undefined;
    }
    return {
      id,
      perRowDescription: perRowDescription ?? undefined,
      perRowTags: perRowTags ?? undefined,
      columnMeta,
      columnNames,
      metaId: metaId ?? undefined
    };
  }

  /**Maps columnMeta names to the column names*/
  function metaToNames(meta: MetadataRow): Partial<Record<ColumnMetaType, string>> {
    const out: Partial<Record<ColumnMetaType, string>> = {};
    for (const [idx, name] of meta.columnNames.entries()) {
      const metaName = meta.columnMeta[idx];
      if (out[metaName]) {
        // First occurrence wins; duplicates are dropped with a warning
        console.warn(`Duplicate column with metaName "${metaName}". The current one which will be used is "${out[metaName]}". Skipping the duplicate.`);
        continue;
      }
      out[metaName] = name;
    }
    return out;
  }

  /**
   * For every known meta type, produce the SELECT-list fragment: the
   * bracket-quoted column name when the table has it, otherwise NULL.
   */
  function metaParts(metaNameToColumnName: Partial<Record<ColumnMetaType, string>>): Record<ColumnMetaType, string> {
    const out: Record<ColumnMetaType, string> = {} as any;
    for (const type of Object.keys(COLUMN_TYPES) as ColumnMetaType[]) {
      if (!metaNameToColumnName[type]) {
        out[type] = "NULL";
        continue;
      }
      // Wrap in brackets so column names like "from" don't cause any issues
      out[type] = `[${metaNameToColumnName[type]}]`
    }
    return out;
  }

  // Iterate over all the tables and their metadata
  const statement = db.prepare(`SELECT id, perRowDescription, perRowTags, columnMeta, metaId FROM base_data_manager_metadata`);
  for (const row of statement.iterate()) {
    const verified = verifyMetadataRow(row);
    if (!verified) {
      continue;
    }
    const { id, perRowDescription, perRowTags, columnMeta, columnNames, metaId } = verified;
    const metaNameToColumnName = metaToNames(verified);
    const part = metaParts(metaNameToColumnName);

    // Now find what to insert into each row of the combined
    // Per row tags is a string of csv'd items but needs to be made a literal
    // TODO: Make this either a template string or have jq do something
    // tagsPart = templateToSqlExpr(target.perRowTags, columnNames);
    const tagsPart = sqlLiteral(perRowTags);

    // Choose what to do with this table based on what meta is present
    if (
      !!metaNameToColumnName.sender
      && !!metaNameToColumnName.isodatetime
    ) {
      // A table with a sender column is treated as a chat/message table;
      // the conversation title lives in the metaId table
      if (!metaId) {
        console.warn(`Chat ${id} with .sender but no .metaId. Skipping`);
        continue;
      }
      // First pull the name of the conversation out of the metaId
      const receiverThreadTitle = db.prepare(`SELECT title FROM ${metaId} WHERE (id=${sqlLiteral(id)})`).get()?.title;
      if (!receiverThreadTitle || typeof receiverThreadTitle !== "string") {
        console.warn(`Chat ${id} with .metaId ${metaId} returned invalid receiverThreadTitle ${typeof receiverThreadTitle}. Skipping`);
        continue;
      }
      const receiverPart = sqlLiteral(receiverThreadTitle);

      // Put this table into the combined table
      db.exec(`INSERT INTO combined SELECT ${part.isodatetime}, ${part.text}, 'message', ${part.sender}, ${receiverPart}, ${part.lat}, ${part.lng}, ${tagsPart} FROM ${id};`);
    }
    else if (!!metaNameToColumnName.isodatetime) {
      // Put this table into the combined table.
      const descriptionPart = perRowDescription
        ? templateToSql(perRowDescription, columnNames)
        : `'An entry from the ${id} table'`; // Default is just kinda garbo...
      // 'note' (not 'node') — the documented default type, see list above
      db.exec(`INSERT INTO combined SELECT ${part.isodatetime}, ${descriptionPart}, 'note', NULL, NULL, ${part.lat}, ${part.lng}, ${tagsPart} FROM ${id};`);
    }
    else {
      console.warn(`Table with id ${id} had no isodatetime or anything else of value, skipping...`);
    }
  }
  const count = db.prepare(`SELECT COUNT(*) as count FROM combined`).get()?.count;
  console.log(`${elapsed()} - Combined database built with ${count} rows`);

  // Dump it to the disk for debugging
  const sqlitePath = "debug_your.csv.db";
  console.log(`${elapsed()} - Writing database to disk at "${sqlitePath}"`);
  await DataIO.dumpDBToDisk(db, sqlitePath);
  console.log(`${elapsed()} - Database written to disk`);

  // Dump it all to the path specified
  dumpDBTableToCSV(db, "combined", "your.csv");
  console.log(`${elapsed()} - Combined database written to disk as CSV`);

  db.close();
}
// Run main() only when this file is executed directly, not when imported
if (process.argv[1] === __filename) {
  void main();
}

View file

@ -3,14 +3,24 @@
# fd -t f .json -0 | xargs -I % -0 -- jq -f scrub.jq "%" > "%"
# (Though you should remove the end `> "%"` first to get just the output without
# persisting to be sure it's what you want first)
def scrub:
walk(
def scrub_key:
if test("^[0-9]+$") then
("1" * length)
else
.
end;
def scrub_primitive:
if type == "string" then
if test("^(([0-9]{1,3}\\.){3}[0-9]{1,3})$") then
# IPv4
"1.1.1.1"
elif test("^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$") then
# IPv6
"2000:0000:0000:0000:0000:0000:0000:0000"
elif test("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$") then
# Email-like
"not_a_real_email@example.com"
elif test("\\.(jpg|jpeg|png|gif|bmp|webp|svg|ico|tiff|mp3|wav|flac|aac|ogg|wma|m4a|mp4|avi|mkv|mov|wmv|flv|webm)$"; "i") then
# Leave these alone, you will have to manually go through these later and replace with
@ -22,27 +32,60 @@ def scrub:
"url://somewhere"
elif test("/") then
"some/path"
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}[+\\-][0-9]{2}:[0-9]{2}$") then
# iso date time without millis with timezone
"2020-04-13T10:09:08+00:00"
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?[+\\-][0-9]{2}:[0-9]{2}$") then
# iso date time with millis with timezone
"2020-04-13T10:09:08.000000+00:00"
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} UTC") then
# Date format from snapchat export
"2020-04-13 10:09:08 UTC"
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}") then
# Date format from snapchat export
"2020-04-13 10:09:08"
elif test("^[0-9]+$") then
# preserve length of the string
"1" * length
elif test("^[0-9a-fA-F]+$") then #hexadecimal string
# repeat the hex pattern and truncate to original length
("a1" * length)[:length]
elif . == "" then
# prevents empty string from just returning null instead of empty string
""
else
"xxx"
# Preserve string length for other strings
"x" * length
end
elif type == "number" then
if 946702800 <= . and . <= 1893474000 then
# Take modulo 1 year to get variance in the output, then add offset to bring to ~2024
((((. % 31557600) + 1704067200) / 5000 | floor) * 5000)
elif . == (. | floor) then
# Integer - preserve digit count
(tostring | length) as $len | ("1" * $len) | tonumber
else
69
end
elif type == "array" then
# Keep only 2 elements, but scrub *those* elements
if length > 1 then
[ (.[0] | scrub), (.[1] | scrub) ]
elif length > 0 then
[ (.[0] | scrub) ]
else
[]
8.08
end
elif type == "boolean" then
# Replace all booleans with false, this can give sensitive info away based
# on what the key was in the data
false
else
.
end
);
end;
def scrub:
if type == "object" then
# Apply scrubbing to both keys and values
with_entries(.key |= scrub_key | .value |= scrub)
elif type == "array" then
# Keep only 2 elements, but scrub *those* elements
.[:2] | map(scrub)
else
# Scrub a primitive value
scrub_primitive
end;
# Call scrub
scrub

View file

@ -27,9 +27,6 @@ assert(targetDir, "Usage: ./scrub.ts <directory>");
const targetPath = path.resolve(targetDir);
// const stat = await fs.stat(targetPath);
// assert(stat.isDirectory(), "");
const [notADir] = await ptry($`test -d ${targetPath}`);
assert(!notADir, `Error: '${targetPath}' is not a directory`);
@ -49,12 +46,16 @@ console.log("filePaths", filePaths);
for (const file of filePaths) {
console.log(`Processing: ${file}`);
const tmpFile = `${file}.tmp`;
const piiFile = `${file}.DELETE-THIS-HAS-PII`;
const [jqErr] = await ptry($`jq -f ${scrubJq} ${file} > ${tmpFile}`);
assert(!jqErr, `Error processing ${file}: ${jqErr}`);
const [mvErr] = await ptry($`mv ${tmpFile} ${file}`);
assert(!mvErr, `Error moving ${tmpFile} to ${file}: ${mvErr}`);
const [mvErr] = await ptry($`mv ${file} ${piiFile}`);
assert(!mvErr, `Error moving ${file} to ${piiFile}: ${mvErr}`);
const [mv2Err] = await ptry($`mv ${tmpFile} ${file}`);
assert(!mv2Err, `Error moving ${tmpFile} to ${file}: ${mv2Err}`);
}
console.log();