Compare commits
No commits in common. "c093fbfceeeef52ec6ffcb52f4b2e71360a2b9cd" and "9c3bdaa10001072ed6475c99d72a80d66a43fe95" have entirely different histories.
c093fbfcee
...
9c3bdaa100
60 changed files with 1422 additions and 5400 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -1,5 +1,4 @@
|
||||||
node_modules/
|
node_modules/
|
||||||
*.db
|
your.db
|
||||||
your.csv
|
your.csv
|
||||||
.gitSAFE
|
.gitSAFE
|
||||||
*.DELETE-THIS-HAS-PII
|
|
||||||
25
README.md
25
README.md
|
|
@ -1,25 +0,0 @@
|
||||||
# base-data-manager
|
|
||||||
|
|
||||||
A Typescript project for parsing through many types of data exports to tabular formats
|
|
||||||
|
|
||||||
** This is heavily WIP, and mostly just a toy for myself **
|
|
||||||
|
|
||||||
### Installation
|
|
||||||
|
|
||||||
* Install `jq`
|
|
||||||
* Install sqlite `csv.so` extension (Hardcoded to `/home/cobertos/sqlite-files/` currently)
|
|
||||||
* Install `node` + `pnpm i`
|
|
||||||
* See `main.ts` for current example usage
|
|
||||||
|
|
||||||
|
|
||||||
### Proposed Architecture
|
|
||||||
|
|
||||||
The architecture runs in 2 steps.
|
|
||||||
|
|
||||||
The first step is unopinionated in it's output format. It's meant to take the source data exactly as-is and output it as csv. All source data should pass through, but will be normalized in csv
|
|
||||||
|
|
||||||
**TODO: It's not completely unopinionated, there is some normalization for names of columns I think we want to apply? Or maybe we apply that later...**
|
|
||||||
|
|
||||||
An optional second step combines everything into a single SQLite database. From here we normalize many different types of data across multiple exports into a single opinionated output. For example, message threads/channels should all have the same table format, or end up in the same table
|
|
||||||
|
|
||||||
**TODO: No idea if the second part should be a part of this project... but it currently is**
|
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,115 +1,105 @@
|
||||||
import { pipe, branch, cmd, assignMeta, cd, glob, read, branchGen, type PipelineOp } from "./task.ts";
|
import { TaskTargetPipelineHelper } from "./task.ts";
|
||||||
import { htmlSelectorChunkedDuplex } from "./html.ts";
|
import { htmlSelectorChunkedDuplex } from "./html.ts";
|
||||||
|
|
||||||
export function google(){
|
export function google(this: TaskTargetPipelineHelper){
|
||||||
return pipe(
|
const p = this.assignMeta({ idValue: t=>`Google - ${t.basename}` }); // Generic ID for everything in here
|
||||||
// Generic ID for everything in here
|
const col: Set<TaskTargetPipelineHelper> = new Set();
|
||||||
assignMeta({ idValue: t=>`Google - ${t.basename}` }),
|
|
||||||
branchGen(function*() {
|
// TODO: There is a root takeout folder
|
||||||
// TODO: There is a root takeout folder
|
|
||||||
|
|
||||||
|
|
||||||
yield pipe(cd('Access Log Activity/Activities - A list of Google services accessed by.csv'), read())
|
p.collect(col).cd('Access Log Activity/Activities - A list of Google services accessed by.csv').read()
|
||||||
yield pipe(cd('Devices - A list of devices (i.e. Nest, Pixel, iPh.csv'), read())
|
p.collect(col).cd('Devices - A list of devices (i.e. Nest, Pixel, iPh.csv').read()
|
||||||
|
|
||||||
// Assignments - data was empty
|
// Assignments - data was empty
|
||||||
// Business messages - GMB messages, there's some but so far outside of what I want
|
// Business messages - GMB messages, there's some but so far outside of what I want
|
||||||
// TODO: Calendar, exports an .ics
|
// TODO: Calendar, exports an .ics
|
||||||
|
|
||||||
// a = t.fork().cd(`Chrome`)
|
// a = t.fork().cd(`Chrome`)
|
||||||
// TODO: Assersses and mode.json
|
// TODO: Assersses and mode.json
|
||||||
// TODO: Bookmarks.csv
|
// TODO: Bookmarks.csv
|
||||||
// TODO: Device Information.json
|
// TODO: Device Information.json
|
||||||
// TODO: Dictionary.csv
|
// TODO: Dictionary.csv
|
||||||
// TODO: ...
|
// TODO: ...
|
||||||
yield pipe(
|
p.collect(col).cd('Chrome/History.json')
|
||||||
cd('Chrome/History.json'),
|
.read()
|
||||||
read(),
|
// TODO: Typed Url", no data
|
||||||
// TODO: Typed Url", no data
|
// TODO: "session", complex data
|
||||||
// TODO: "session", complex data
|
// Omitted .ptoken and .client_id for now. I think ptoken is maybe for the history API? client_id is base64 something...
|
||||||
// Omitted .ptoken and .client_id for now. I think ptoken is maybe for the history API? client_id is base64 something...
|
// TODO: time_usec IS WRONG!! Needs to be ms
|
||||||
// TODO: time_usec IS WRONG!! Needs to be ms
|
.cmd(["jq", "-r", `["favicon_url","page_transition","title","url","time_usec"],
|
||||||
cmd(["jq", "-r", `["favicon_url","page_transition","title","url","time_usec"],
|
(
|
||||||
(
|
."Browser History"[]
|
||||||
."Browser History"[]
|
| [.favicon_url, .page_transition, .title, .url, (.time_usec | todateiso8601)]
|
||||||
| [.favicon_url, .page_transition, .title, .url, (.time_usec | todateiso8601)]
|
)
|
||||||
)
|
| @csv`])
|
||||||
| @csv
|
|
||||||
`])
|
|
||||||
);
|
|
||||||
|
|
||||||
// TODO: Contactss, exports an .vcf
|
// TODO: Contactss, exports an .vcf
|
||||||
// TODO: ...
|
// TODO: ...
|
||||||
|
|
||||||
// a = t.fork().cd(`Google Pay`)
|
// a = t.fork().cd(`Google Pay`)
|
||||||
yield pipe(
|
p.collect(col).cd(`Google Pay/Google transactions`).glob(`transactions_*.csv`)
|
||||||
cd(`Google Pay/Google transactions`),
|
.read()
|
||||||
glob(`transactions_*.csv`),
|
// .fork("a").cd(`Money sends and requests`)
|
||||||
read(),
|
// .fork().cd(`Money sends and requests.csv`)
|
||||||
// .fork("a").cd(`Money sends and requests`)
|
// .read()
|
||||||
// .fork().cd(`Money sends and requests.csv`)
|
// .cmd(t=>["sqlite-utils", "insert", "your.db", t.basename, "-", "--csv", "--detect-types"])
|
||||||
// .read()
|
// TODO: One more folder, and it only has a pdf
|
||||||
// .cmd(t=>["sqlite-utils", "insert", "your.db", t.basename, "-", "--csv", "--detect-types"])
|
|
||||||
// TODO: One more folder, and it only has a pdf
|
|
||||||
);
|
|
||||||
|
|
||||||
// TODO: Google Play Movies _ TV - no data
|
// TODO: Google Play Movies _ TV - no data
|
||||||
// TODO: ...
|
// TODO: ...
|
||||||
|
|
||||||
yield pipe(
|
p.collect(col).cd("Location History/Location History.json")
|
||||||
cd("Location History/Location History.json"),
|
.read()
|
||||||
read(),
|
// TODO: This is missing
|
||||||
// TODO: This is missing
|
// "altitude" : 158,
|
||||||
// "altitude" : 158,
|
// "verticalAccuracy" : 68
|
||||||
// "verticalAccuracy" : 68
|
// and the activity models. I had no idea google tries to determine if I'm "tilting"
|
||||||
// and the activity models. I had no idea google tries to determine if I'm "tilting"
|
.cmd(["jq", "-r", `["timestamp","latitudeE7","longitudeE7","accuracy"],
|
||||||
cmd(["jq", "-r", `["timestamp","latitudeE7","longitudeE7","accuracy"],
|
(
|
||||||
(
|
.locations[]
|
||||||
.locations[]
|
| [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy]
|
||||||
| [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy]
|
)
|
||||||
)
|
| @csv`])
|
||||||
| @csv
|
// There's also the semantic history but that's an entire nother can of worms
|
||||||
`])
|
// it seems like
|
||||||
);
|
|
||||||
// There's also the semantic history but that's an entire nother can of worms
|
|
||||||
// it seems like
|
|
||||||
|
|
||||||
// TODO: Needs no-headers!
|
// TODO: Needs no-headers!
|
||||||
// a = t.fork().cd(`My Activity`)
|
// a = t.fork().cd(`My Activity`)
|
||||||
// a.fork().glob(`**/MyActivity.html`)
|
// a.fork().glob(`**/MyActivity.html`)
|
||||||
// .setId(t=>`Google - ${t.basenameN(2)}`)
|
// .setId(t=>`Google - ${t.basenameN(2)}`)
|
||||||
// .read()
|
// .read()
|
||||||
// .pipe(()=>{
|
// .pipe(()=>{
|
||||||
// // Parses the MyActivity format, chunking it into pieces of HTML text
|
// // Parses the MyActivity format, chunking it into pieces of HTML text
|
||||||
// // and then parsing out the text
|
// // and then parsing out the text
|
||||||
// const dup = htmlSelectorChunkedDuplex(
|
// const dup = htmlSelectorChunkedDuplex(
|
||||||
// (tag, attrs)=>{
|
// (tag, attrs)=>{
|
||||||
// // TODO: We also probably want to get and parse each
|
// // TODO: We also probably want to get and parse each
|
||||||
// // ".content-cell.mdl-typography--caption" as well (it
|
// // ".content-cell.mdl-typography--caption" as well (it
|
||||||
// // has location for websearches and sometimes a details field)
|
// // has location for websearches and sometimes a details field)
|
||||||
// // but then we have to get ".mdl-grid" and parse it
|
// // but then we have to get ".mdl-grid" and parse it
|
||||||
// return attrs.class?.includes("content-cell")
|
// return attrs.class?.includes("content-cell")
|
||||||
// && attrs.class?.includes("mdl-typography--body-1")
|
// && attrs.class?.includes("mdl-typography--body-1")
|
||||||
// && !attrs.class?.includes("mdl-typography--text-right")
|
// && !attrs.class?.includes("mdl-typography--text-right")
|
||||||
// },
|
// },
|
||||||
// (chunk)=>{
|
// (chunk)=>{
|
||||||
// const text = chunk.innerText;
|
// const text = chunk.innerText;
|
||||||
// const split = text.split("\n");
|
// const split = text.split("\n");
|
||||||
// const timestamp = split.pop(); // TODO: need to parse this
|
// const timestamp = split.pop(); // TODO: need to parse this
|
||||||
// const rest = split.join("\n");
|
// const rest = split.join("\n");
|
||||||
// // TODO: Escape instead of replace
|
// // TODO: Escape instead of replace
|
||||||
// const restSafe = rest.replace(/"/g, "'").replace(/\n/g,"\\n"); // escape newlines and quotes
|
// const restSafe = rest.replace(/"/g, "'").replace(/\n/g,"\\n"); // escape newlines and quotes
|
||||||
// // Return a CSV
|
// // Return a CSV
|
||||||
// return `"${restSafe}","${timestamp}"\n`;
|
// return `"${restSafe}","${timestamp}"\n`;
|
||||||
// }
|
// }
|
||||||
// );
|
// );
|
||||||
// return dup;
|
// return dup;
|
||||||
// })
|
// })
|
||||||
|
|
||||||
// TODO: News
|
// TODO: News
|
||||||
// TODO: Profile
|
// TODO: Profile
|
||||||
// TODO: Tasks - No data
|
// TODO: Tasks - No data
|
||||||
})
|
|
||||||
);
|
return Array.from(col);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,52 +0,0 @@
|
||||||
import fs from 'node:fs/promises';
|
|
||||||
import fsSync from 'node:fs';
|
|
||||||
import { DatabaseSync } from "node:sqlite";
|
|
||||||
import { type ProcessOutputAggregate, type RunOutput, TaskTarget, runAll, type ProcessOutputSimple } from "./task.ts";
|
|
||||||
import { ProcessOutput } from 'zx';
|
|
||||||
|
|
||||||
|
|
||||||
async function loadCSVTable(
|
|
||||||
db: DatabaseSync,
|
|
||||||
target: TaskTarget,
|
|
||||||
result: ProcessOutput | ProcessOutputAggregate | ProcessOutputSimple
|
|
||||||
) {
|
|
||||||
const id = target.id;
|
|
||||||
const table = id;
|
|
||||||
const tmpPath = `/tmp/${id}.csv`;
|
|
||||||
// console.log(`Writing ${tmpPath}`);
|
|
||||||
const fd = await fs.open(tmpPath, 'w');
|
|
||||||
await fs.writeFile(fd, result.stdout, { encoding: 'utf8' });
|
|
||||||
await fd.close();
|
|
||||||
// console.log(`Loading ${tmpPath} → table ${table}`);
|
|
||||||
|
|
||||||
db.exec(`CREATE VIRTUAL TABLE temp.intermediate USING csv(filename='${tmpPath}', header);`);
|
|
||||||
db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`);
|
|
||||||
db.exec(`DROP TABLE IF EXISTS intermediate;`);
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: This should really have the same name throughout the codebase?
|
|
||||||
export const runPipeline = runAll;
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @param db Must be a DatabaseSync with the csv.so extension enabled
|
|
||||||
*/
|
|
||||||
export async function loadIntoDb(db: DatabaseSync, runOutput: RunOutput[]) {
|
|
||||||
for (const {result, target} of runOutput) {
|
|
||||||
await loadCSVTable(db, target, result);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
export function getDefaultDB(): DatabaseSync {
|
|
||||||
const db = new DatabaseSync(":memory:", { allowExtension: true });
|
|
||||||
db.loadExtension("/home/cobertos/sqlite-files/csv.so")
|
|
||||||
db.enableLoadExtension(false);
|
|
||||||
return db;
|
|
||||||
}
|
|
||||||
export async function dumpDBToDisk(db: DatabaseSync, dumpPath: string) {
|
|
||||||
if (fsSync.existsSync(dumpPath)) {
|
|
||||||
await fs.unlink(dumpPath); // unlink the old
|
|
||||||
}
|
|
||||||
|
|
||||||
// Dump it all to the path specified
|
|
||||||
db.exec(`VACUUM main INTO '${dumpPath}'`);
|
|
||||||
}
|
|
||||||
|
|
@ -1,18 +1,15 @@
|
||||||
|
import { $, type ProcessOutput } from 'zx';
|
||||||
import os from 'os';
|
import os from 'os';
|
||||||
|
import { type TaskTarget, run } from "./task.ts";
|
||||||
|
|
||||||
/**Generic parallel runner with optional logging
|
$.verbose = false;
|
||||||
* Runs `targets` with `runFn` up to a maximum of `maxConcurrency` amount at a time
|
|
||||||
* Shaped in a way that expects generally something that returns zx.ProcessOutput (or
|
export async function parallel(
|
||||||
* something with .duration and .ok built-in to the return)
|
targets: TaskTarget[],
|
||||||
* @param runFn Should NOT throw. Return { ok: false } instead
|
|
||||||
*/
|
|
||||||
export async function parallel<T, R extends { duration: number, ok: boolean }>(
|
|
||||||
targets: T[],
|
|
||||||
runFn: (t: T)=>Promise<R>,
|
|
||||||
quiet: boolean = false,
|
quiet: boolean = false,
|
||||||
maxConcurrency: number = os.cpus().length
|
maxConcurrency: number = os.cpus().length
|
||||||
): Promise<R[]> {
|
): Promise<ProcessOutput[]> {
|
||||||
const resultMap = new Map<T, R>();
|
const resultMap = new Map<string, ProcessOutput>();
|
||||||
|
|
||||||
const total = targets.length;
|
const total = targets.length;
|
||||||
let completed = 0;
|
let completed = 0;
|
||||||
|
|
@ -43,14 +40,14 @@ export async function parallel<T, R extends { duration: number, ok: boolean }>(
|
||||||
process.stderr.write(`\r${formatEta()}`.padEnd(80));
|
process.stderr.write(`\r${formatEta()}`.padEnd(80));
|
||||||
}
|
}
|
||||||
|
|
||||||
async function runJob(t: T): Promise<void> {
|
async function runJob(t: TaskTarget): Promise<void> {
|
||||||
running++;
|
running++;
|
||||||
printStatus();
|
printStatus();
|
||||||
|
|
||||||
const result = await runFn(t);
|
const result = await run(t);
|
||||||
completionTimes.push(result.duration);
|
completionTimes.push(result.duration);
|
||||||
|
|
||||||
resultMap.set(t, result);
|
resultMap.set(t.id, result);
|
||||||
|
|
||||||
running--;
|
running--;
|
||||||
completed++;
|
completed++;
|
||||||
|
|
@ -79,15 +76,13 @@ export async function parallel<T, R extends { duration: number, ok: boolean }>(
|
||||||
process.stderr.write('\n');
|
process.stderr.write('\n');
|
||||||
const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1);
|
const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1);
|
||||||
const failed = Array.from(resultMap.values().filter(p => !p.ok));
|
const failed = Array.from(resultMap.values().filter(p => !p.ok));
|
||||||
if (!quiet) {
|
process.stderr.write(
|
||||||
process.stderr.write(
|
`\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n`
|
||||||
`\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n`
|
);
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
const output = targets
|
const output = targets
|
||||||
.map(t => {
|
.map(t => {
|
||||||
const r = resultMap.get(t)!;
|
const r = resultMap.get(t.id)!;
|
||||||
return r;
|
return r;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,10 +3,7 @@ import fs from 'node:fs';
|
||||||
import { strict as assert } from "node:assert";
|
import { strict as assert } from "node:assert";
|
||||||
import { ZipFS } from "./zipFs.ts";
|
import { ZipFS } from "./zipFs.ts";
|
||||||
import { globSync } from "glob";
|
import { globSync } from "glob";
|
||||||
import { $, ProcessOutput, quote } from "zx";
|
import { $, ProcessPromise, quote } from "zx";
|
||||||
import { parallel } from "./parallel.ts";
|
|
||||||
|
|
||||||
$.verbose = false;
|
|
||||||
|
|
||||||
type FSImpl = {
|
type FSImpl = {
|
||||||
isZip?: boolean;
|
isZip?: boolean;
|
||||||
|
|
@ -41,20 +38,19 @@ function safe(s: string) {
|
||||||
|
|
||||||
interface TaskTargetOp {
|
interface TaskTargetOp {
|
||||||
type: "read" | "mid";
|
type: "read" | "mid";
|
||||||
toShell(target: TaskTarget): string | undefined;
|
toShell(target: TaskTarget): string;
|
||||||
clone(): TaskTargetOp;
|
clone(): TaskTargetOp;
|
||||||
}
|
}
|
||||||
class TaskTargetRead implements TaskTargetOp {
|
class TaskTargetRead implements TaskTargetOp {
|
||||||
get type(){ return "read" as const; }
|
get type(){ return "read" as const; }
|
||||||
toShell(target: TaskTarget) {
|
toShell(target: TaskTarget) {
|
||||||
if (target.fsImpl.isZip) {
|
if (target.fsImpl.isZip) {
|
||||||
// Read the file to stdout from the target inside the zip file
|
|
||||||
// This relies on the internals of fsImpl a bit to have the path to
|
|
||||||
// the root zip so we can create a command against it
|
|
||||||
assert(target.fsImpl.zipPath, "Should have a zipPath");
|
assert(target.fsImpl.zipPath, "Should have a zipPath");
|
||||||
|
// We need to be able to do this
|
||||||
return `7z x ${quote(target.fsImpl.zipPath)} -so ${quote(target.path)}`;
|
return `7z x ${quote(target.fsImpl.zipPath)} -so ${quote(target.path)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// TODO : Implement when reading from a zip file
|
||||||
return `cat ${quote(target.path)}`;
|
return `cat ${quote(target.path)}`;
|
||||||
}
|
}
|
||||||
clone() {
|
clone() {
|
||||||
|
|
@ -119,10 +115,19 @@ export const COLUMN_TYPES = {
|
||||||
"TODO": {}
|
"TODO": {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// // if (type === "numeric") {
|
||||||
|
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
||||||
|
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
||||||
|
// // }
|
||||||
|
// // else {
|
||||||
|
// // queryLine = `count(*) as n`;
|
||||||
|
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
||||||
|
// // }
|
||||||
|
|
||||||
/**Column metadata. Just a string into the TYPES*/
|
/**Column metadata. Just a string into the TYPES*/
|
||||||
type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined);
|
type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined);
|
||||||
// Make non-optional version of just the metadata values of TaskTarget
|
// Make non-optional version of just the metadata values of TaskTarget
|
||||||
type TaskTargetMeta = Required<Pick<TaskTarget, "idValue" | "perRowDescription" | "perRowTags" | "columnMeta" | "aggregate" | "metaIdValue" | "aggregateColumns">>;
|
type TaskTargetMeta = Required<Pick<TaskTarget, "idValue" | "perRowDescription" | "perRowTags" | "columnMeta">>;
|
||||||
|
|
||||||
export class TaskTarget {
|
export class TaskTarget {
|
||||||
/**The current path pointed to by this TaskTarget*/
|
/**The current path pointed to by this TaskTarget*/
|
||||||
|
|
@ -144,16 +149,15 @@ export class TaskTarget {
|
||||||
* you might do something like '"{3}" sent from {2} to {1}'
|
* you might do something like '"{3}" sent from {2} to {1}'
|
||||||
* */
|
* */
|
||||||
perRowDescription?: string;
|
perRowDescription?: string;
|
||||||
/**A CSV of tags that is added to every row of the table (TODO: no template functionality currently)*/
|
/**For every output CSV, this defines a SQL expression evaluated per-row that
|
||||||
|
* returns a comma-separated string of tags to assign to that row.
|
||||||
|
* Use the items {0}, {1} to template column values, same as perRowDescription.
|
||||||
|
* Example: A static set of tags: "'me,facebook'"
|
||||||
|
* Example: Tags derived from a column: "'facebook,' || {2}"
|
||||||
|
* */
|
||||||
perRowTags?: string;
|
perRowTags?: string;
|
||||||
/**Metadata about the columns*/
|
/**Metadata about the columns*/
|
||||||
columnMeta?: ColumnMeta[];
|
columnMeta?: ColumnMeta[];
|
||||||
/**Whether or not to aggregate to a single task (everything with the id value idValue)*/
|
|
||||||
aggregate?: boolean;
|
|
||||||
/**Names of the columns to aggregate with*/
|
|
||||||
aggregateColumns?: string[];
|
|
||||||
/**A metadata TaskTarget for this TaskTarget, if one exists*/
|
|
||||||
metaIdValue?: ValidId;
|
|
||||||
|
|
||||||
constructor(path: string){
|
constructor(path: string){
|
||||||
this.path = path;
|
this.path = path;
|
||||||
|
|
@ -190,15 +194,6 @@ export class TaskTarget {
|
||||||
}
|
}
|
||||||
return safe(this.idValue);
|
return safe(this.idValue);
|
||||||
}
|
}
|
||||||
get metaId() {
|
|
||||||
if (!this.metaIdValue) {
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
if (typeof this.metaIdValue === "function") {
|
|
||||||
return safe(this.metaIdValue(this));
|
|
||||||
}
|
|
||||||
return safe(this.metaIdValue);
|
|
||||||
}
|
|
||||||
|
|
||||||
/**Changes the current directory of the target*/
|
/**Changes the current directory of the target*/
|
||||||
cd(path: string): TaskTarget {
|
cd(path: string): TaskTarget {
|
||||||
|
|
@ -238,9 +233,6 @@ export class TaskTarget {
|
||||||
t.perRowDescription = this.perRowDescription;
|
t.perRowDescription = this.perRowDescription;
|
||||||
t.perRowTags = this.perRowTags;
|
t.perRowTags = this.perRowTags;
|
||||||
t.columnMeta = this.columnMeta?.slice();
|
t.columnMeta = this.columnMeta?.slice();
|
||||||
t.metaIdValue = this.metaIdValue;
|
|
||||||
t.aggregate = this.aggregate;
|
|
||||||
t.aggregateColumns = this.aggregateColumns?.slice();
|
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -255,7 +247,6 @@ export class TaskTarget {
|
||||||
toShell() {
|
toShell() {
|
||||||
const shell = this.pipeline
|
const shell = this.pipeline
|
||||||
.map(p => p.toShell(this))
|
.map(p => p.toShell(this))
|
||||||
.filter(p => !!p) // remove empty strings and undefined
|
|
||||||
.join(" | ")
|
.join(" | ")
|
||||||
return shell;
|
return shell;
|
||||||
}
|
}
|
||||||
|
|
@ -278,72 +269,42 @@ export class TaskTarget {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface PipelineOp {
|
export function each(targets: TaskTarget[], fn: (t: TaskTarget)=>void) {
|
||||||
(targets: TaskTarget[]): TaskTarget[] | Promise<TaskTarget[]>;
|
for (const t of targets) {
|
||||||
|
fn(t);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
export function map(targets: TaskTarget[], fn: (t: TaskTarget)=>TaskTarget) {
|
||||||
export function cd(path: string): PipelineOp {
|
const newTargets = [];
|
||||||
return (targets: TaskTarget[]) => targets.map(t => t.clone().cd(path));
|
for (const t of targets) {
|
||||||
|
newTargets.push(fn(t));
|
||||||
|
}
|
||||||
|
return newTargets;
|
||||||
}
|
}
|
||||||
export function glob(globPath: string): PipelineOp {
|
export function cd(targets: TaskTarget[], path: string): TaskTarget[] {
|
||||||
return (targets: TaskTarget[]) => targets.map(t => t.glob(globPath)).flat();
|
return targets.map(t => t.clone().cd(path));
|
||||||
}
|
}
|
||||||
export function unzip(): PipelineOp {
|
export function glob(targets: TaskTarget[], globPath: string): TaskTarget[] {
|
||||||
return async (targets: TaskTarget[]) => Promise.all(targets.map(t => t.unzip()));
|
return targets.map(t => t.glob(globPath)).flat();
|
||||||
}
|
}
|
||||||
export function read(): PipelineOp {
|
export async function unzip(targets: TaskTarget[]): Promise<TaskTarget[]> {
|
||||||
return (targets: TaskTarget[]) => targets.map(t => t.clone().read())
|
return Promise.all(targets.map(t => t.unzip()));
|
||||||
}
|
}
|
||||||
export function cmd(cmd: ValidCmd): PipelineOp {
|
export function read(targets: TaskTarget[]): TaskTarget[] {
|
||||||
return (targets: TaskTarget[]) => targets.map(t => t.clone().cmd(cmd))
|
return targets.map(t => t.clone().read())
|
||||||
}
|
}
|
||||||
export function assignMeta(meta: Partial<TaskTargetMeta>): PipelineOp {
|
export function cmd(targets: TaskTarget[], cmd: ValidCmd): TaskTarget[] {
|
||||||
return (targets: TaskTarget[]) => targets.map(t => t.clone().assignMeta(meta))
|
return targets.map(t => t.clone().cmd(cmd))
|
||||||
}
|
}
|
||||||
|
export function assignMeta(targets: TaskTarget[], meta: Partial<TaskTargetMeta>): TaskTarget[] {
|
||||||
export function each(fn: (t: TaskTarget)=>TaskTarget): PipelineOp {
|
return targets.map(t => t.clone().assignMeta(meta))
|
||||||
return (targets: TaskTarget[])=> targets.map(fn);
|
|
||||||
}
|
}
|
||||||
export function pipe(...ops: PipelineOp[]): PipelineOp {
|
|
||||||
return async (targets: TaskTarget[]) => {
|
|
||||||
for (const op of ops) {
|
|
||||||
targets = await op(targets);
|
|
||||||
}
|
|
||||||
return targets;
|
|
||||||
};
|
|
||||||
}
|
|
||||||
export function branch(...ops: PipelineOp[]): PipelineOp {
|
|
||||||
return async (targets: TaskTarget[]) => {
|
|
||||||
const targetsArrays = await Promise.all(ops.map(op => op(targets)));
|
|
||||||
return targetsArrays.flat();
|
|
||||||
};
|
|
||||||
}
|
|
||||||
export function branchGen(genFn: ()=>Generator<PipelineOp>): PipelineOp {
|
|
||||||
const opsToBranch = Array.from(genFn());
|
|
||||||
return (targets: TaskTarget[]) => {
|
|
||||||
return branch(...opsToBranch)(targets);
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
export async function execPaths(entries: ({path: string, op: PipelineOp })[]) {
|
|
||||||
return (await Promise.all(
|
|
||||||
// Map every entry path into a TaskTarget and run the PipelineOp with
|
|
||||||
// that TaskTarget
|
|
||||||
entries
|
|
||||||
.map(async ({path,op})=>{
|
|
||||||
const targets = [new TaskTarget(path)];
|
|
||||||
return await op(targets);
|
|
||||||
})
|
|
||||||
)).flat();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/**Verify, anything that fails is skipped and throws an error*/
|
/**Verify, anything that fails is skipped and throws an error*/
|
||||||
export async function verify(targets: TaskTarget[]) {
|
export async function verify(targets: TaskTarget[]) {
|
||||||
const outTargets: TaskTarget[] = [];
|
const outTargets: TaskTarget[] = [];
|
||||||
for (const t of targets) {
|
for (const t of targets) {
|
||||||
// Make sure fsImpl is ready
|
// Make sure fsImpl is ready
|
||||||
// TODO: DO NOT PUT THIS IN VERIFY, this should go somewhere in the task building stuff...
|
|
||||||
if ("ready" in t.fsImpl && !t.fsImpl.ready && t.fsImpl.init) {
|
if ("ready" in t.fsImpl && !t.fsImpl.ready && t.fsImpl.init) {
|
||||||
await t.fsImpl.init();
|
await t.fsImpl.init();
|
||||||
}
|
}
|
||||||
|
|
@ -358,133 +319,78 @@ export async function verify(targets: TaskTarget[]) {
|
||||||
|
|
||||||
outTargets.push(t);
|
outTargets.push(t);
|
||||||
}
|
}
|
||||||
|
|
||||||
return outTargets;
|
return outTargets;
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface ProcessOutputAggregate {
|
function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) {
|
||||||
stdout: string;
|
if (!a.__collection) {
|
||||||
stderr: string;
|
return;
|
||||||
exitCodes: (number | null)[];
|
|
||||||
duration: number;
|
|
||||||
ok: boolean;
|
|
||||||
}
|
|
||||||
export interface ProcessOutputSimple {
|
|
||||||
stdout: string;
|
|
||||||
stderr: string;
|
|
||||||
exitCode: number;
|
|
||||||
duration: number;
|
|
||||||
ok: boolean;
|
|
||||||
}
|
|
||||||
|
|
||||||
function combineProcessOutputAggregate(poa: ProcessOutputAggregate | undefined, t: TaskTarget, po: ProcessOutput) {
|
|
||||||
if (!poa) {
|
|
||||||
assert(t.aggregateColumns, "aggregate TaskTarget must have aggregateColumns");
|
|
||||||
const headers = t.aggregateColumns.join(",") + "\n";
|
|
||||||
return {
|
|
||||||
stdout: headers + po.stdout,
|
|
||||||
stderr: po.stderr,
|
|
||||||
exitCodes: [po.exitCode],
|
|
||||||
duration: po.duration,
|
|
||||||
ok: po.ok
|
|
||||||
};
|
|
||||||
}
|
}
|
||||||
|
|
||||||
// Comes with a builtin "\n" from jq on stdout and stderr, no need to add
|
// Remove a, add b
|
||||||
// a trailing one
|
const collection = a.__collection;
|
||||||
poa.stdout += po.stdout;
|
delete a.__collection;
|
||||||
poa.stderr += po.stderr;
|
collection.delete(a);
|
||||||
poa.exitCodes.push(po.exitCode);
|
b.__collection = collection;
|
||||||
poa.duration += po.duration;
|
collection.add(b);
|
||||||
poa.ok &&= po.ok;
|
|
||||||
return poa;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export interface RunOutput {
|
export class TaskTargetPipelineHelper extends Array<TaskTarget> {
|
||||||
target: TaskTarget,
|
__collection?: Set<TaskTargetPipelineHelper>;
|
||||||
result: ProcessOutput | ProcessOutputAggregate | ProcessOutputSimple
|
|
||||||
|
static pipeline(t: TaskTarget[]): TaskTargetPipelineHelper {
|
||||||
|
if (Object.getPrototypeOf(t) === TaskTargetPipelineHelper.prototype) {
|
||||||
|
return t as any; // Already done
|
||||||
|
}
|
||||||
|
Object.setPrototypeOf(t, TaskTargetPipelineHelper.prototype);
|
||||||
|
return t as any;
|
||||||
|
}
|
||||||
|
|
||||||
|
_fn(fn: (t: TaskTarget[])=>TaskTarget[]): TaskTargetPipelineHelper {
|
||||||
|
const p = TaskTargetPipelineHelper.pipeline(this);
|
||||||
|
const t = fn(p);
|
||||||
|
const p2 = TaskTargetPipelineHelper.pipeline(t);
|
||||||
|
collectionSwap(p, p2); // Move collection pointer to the new item, ends always end up in the collection
|
||||||
|
return p2;
|
||||||
|
}
|
||||||
|
async _afn(fn: (t: TaskTarget[])=>Promise<TaskTarget[]>): Promise<TaskTargetPipelineHelper> {
|
||||||
|
const p = TaskTargetPipelineHelper.pipeline(this);
|
||||||
|
const t = await fn(p);
|
||||||
|
const p2 = TaskTargetPipelineHelper.pipeline(t);
|
||||||
|
collectionSwap(p, p2); // Move collection pointer to the new item, ends always end up in the collection
|
||||||
|
return p2;
|
||||||
|
}
|
||||||
|
|
||||||
|
cd(path: string): TaskTargetPipelineHelper {
|
||||||
|
return this._fn(t => cd(t, path));
|
||||||
|
}
|
||||||
|
glob(globPath: string): TaskTargetPipelineHelper {
|
||||||
|
return this._fn(t => glob(t, globPath));
|
||||||
|
}
|
||||||
|
async unzip(): Promise<TaskTargetPipelineHelper> {
|
||||||
|
return this._afn(unzip);
|
||||||
|
}
|
||||||
|
read(): TaskTargetPipelineHelper {
|
||||||
|
return this._fn(read);
|
||||||
|
}
|
||||||
|
cmd(_cmd: ValidCmd): TaskTargetPipelineHelper {
|
||||||
|
return this._fn(t => cmd(t, _cmd));
|
||||||
|
}
|
||||||
|
assignMeta(meta: Partial<TaskTargetMeta>): TaskTargetPipelineHelper {
|
||||||
|
return this._fn(t => assignMeta(t, meta));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @todo Nested versions of this don't currently work, but they could if we
|
||||||
|
* turn __collection into an array of collections
|
||||||
|
*/
|
||||||
|
collect(_c: Set<TaskTargetPipelineHelper>) {
|
||||||
|
this.__collection = _c;
|
||||||
|
return this;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function run(target: TaskTarget): Promise<ProcessOutput> {
|
export async function run(target: TaskTarget): Promise<ProcessPromise> {
|
||||||
const command = target.toShell();
|
const command = target.toShell();
|
||||||
return await $({ nothrow: true })`bash -c ${command}`;
|
return await $({ nothrow: true })`bash -c ${command}`;
|
||||||
}
|
|
||||||
|
|
||||||
export async function runAll(targets: TaskTarget[]): Promise<RunOutput[]> {
|
|
||||||
const finalTargets = await verify(targets);
|
|
||||||
const results = await parallel(finalTargets, run, true);
|
|
||||||
|
|
||||||
const nonAggregateTargets: TaskTarget[] = finalTargets.filter(t => !t.aggregate);
|
|
||||||
const nonAggregateResults: RunOutput[] = [];
|
|
||||||
const aggregateResultsMap: Record<string, RunOutput> = {};
|
|
||||||
|
|
||||||
// == Aggregate tables ==
|
|
||||||
// Some TaskTargets have .aggregate: true, which means they should all be combined
|
|
||||||
// into a single task with the id of the .id property
|
|
||||||
for (const [idx, r] of results.entries()) {
|
|
||||||
const t = finalTargets[idx];
|
|
||||||
if (!t.aggregate) {
|
|
||||||
nonAggregateResults.push({
|
|
||||||
target: t,
|
|
||||||
result: r
|
|
||||||
});
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const aggregateId = t.id;
|
|
||||||
const prevResult = aggregateResultsMap[aggregateId]?.result;
|
|
||||||
aggregateResultsMap[aggregateId] = {
|
|
||||||
target: t, // Use target t for metadata, so it will use the last target
|
|
||||||
result: combineProcessOutputAggregate(prevResult as (ProcessOutputAggregate | undefined), t, r)
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
// == Metadata table ==
|
|
||||||
// Each TaskTarget has things like perRowDescription and other things we want to store
|
|
||||||
// and output. this creates a single TaskTarget for all that perTable metadata
|
|
||||||
function csvEscape(s: string | undefined) {
|
|
||||||
if (s === undefined) {
|
|
||||||
return "";
|
|
||||||
}
|
|
||||||
if (s.includes("\"") || s.includes(",") || s.includes("\n")) {
|
|
||||||
return `"${s.replace(/\"/g, "\"\"")}"`;
|
|
||||||
}
|
|
||||||
return s;
|
|
||||||
}
|
|
||||||
let metadataCSV = "id,perRowDescription,perRowTags,columnMeta,metaId\n";
|
|
||||||
for (const t of nonAggregateTargets) {
|
|
||||||
const tableNamePart = t.id;
|
|
||||||
const perRowDescriptionPart = t.perRowDescription;
|
|
||||||
const perRowTagsPart = t.perRowTags;
|
|
||||||
const columnMetaPart = t.columnMeta?.join(",") ?? "";
|
|
||||||
const metaIdPart = t.metaId;
|
|
||||||
metadataCSV += [
|
|
||||||
csvEscape(tableNamePart),
|
|
||||||
csvEscape(perRowDescriptionPart),
|
|
||||||
csvEscape(perRowTagsPart),
|
|
||||||
csvEscape(columnMetaPart),
|
|
||||||
csvEscape(metaIdPart)
|
|
||||||
].join(",") + "\n";
|
|
||||||
}
|
|
||||||
// Won't be removed by verify() because we're adding it after that's used
|
|
||||||
// TODO: Would be nice to bake this into TaskTarget/verify for tasks that dont point
|
|
||||||
// to a real path
|
|
||||||
const metadataTarget = new TaskTarget("<none>");
|
|
||||||
metadataTarget
|
|
||||||
// id, perRowDescription, perRowTags, columnMeta, metaId
|
|
||||||
.assignMeta({
|
|
||||||
idValue: "base_data_manager_metadata",
|
|
||||||
columnMeta: ["any", "any", "any", "any", "any"],
|
|
||||||
perRowTags: "internal",
|
|
||||||
});
|
|
||||||
const metadataResult= {
|
|
||||||
stdout: metadataCSV,
|
|
||||||
stderr: "",
|
|
||||||
exitCode: 0,
|
|
||||||
duration: 0, // TODO
|
|
||||||
ok: true
|
|
||||||
};
|
|
||||||
const metadataRunOutput: RunOutput = { target: metadataTarget, result: metadataResult };
|
|
||||||
|
|
||||||
const aggregateResults: RunOutput[] = Object.values(aggregateResultsMap);
|
|
||||||
return aggregateResults.concat(nonAggregateResults).concat(metadataRunOutput);
|
|
||||||
}
|
}
|
||||||
242
main.ts
242
main.ts
|
|
@ -1,90 +1,192 @@
|
||||||
import { type DatabaseSync } from "node:sqlite";
|
import fs from 'node:fs/promises';
|
||||||
import { fileURLToPath } from "node:url";
|
import fsSync from 'node:fs';
|
||||||
|
import nodePath from "node:path";
|
||||||
|
import { DatabaseSync } from "node:sqlite";
|
||||||
|
import "./data-export/facebook.ts";
|
||||||
import { google } from "./data-export/google.ts";
|
import { google } from "./data-export/google.ts";
|
||||||
import { facebook, facebook_v2 } from "./data-export/facebook.ts";
|
import { TaskTargetPipelineHelper, TaskTarget, verify } from "./data-export/task.ts";
|
||||||
import { type TaskTarget, execPaths } from "./data-export/task.ts";
|
import { parallel } from "./data-export/parallel.ts";
|
||||||
import * as DataIO from "./data-export/io.ts";
|
import { ProcessOutput } from 'zx';
|
||||||
|
|
||||||
const __filename = fileURLToPath(import.meta.url);
|
declare module "./data-export/task.ts" {
|
||||||
|
interface TaskTargetPipelineHelper {
|
||||||
|
google: typeof google;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
export const startTime = Date.now();
|
Object.assign(TaskTargetPipelineHelper.prototype, {
|
||||||
export const elapsed = ()=>`${((Date.now() - startTime) / 1000).toFixed(2)}s`;
|
google
|
||||||
|
});
|
||||||
|
|
||||||
export async function loadTaskInNewDb(targets: TaskTarget[]): Promise<DatabaseSync> {
|
async function loadCSVTable(
|
||||||
console.log(`${elapsed()} - Run all targets`);
|
db: DatabaseSync,
|
||||||
const out = await DataIO.runPipeline(targets);
|
target: TaskTarget,
|
||||||
console.log(`${elapsed()} - Final targets exported to CSV. Got ${out.length} targets`);
|
result: ProcessOutput
|
||||||
|
) {
|
||||||
|
const id = target.id;
|
||||||
|
const table = id;
|
||||||
|
const tmpPath = `/tmp/${id}.csv`;
|
||||||
|
console.log(`Writing ${tmpPath}`);
|
||||||
|
const fd = await fs.open(tmpPath, 'w');
|
||||||
|
await fs.writeFile(fd, result.stdout, { encoding: 'utf8' });
|
||||||
|
await fd.close();
|
||||||
|
console.log(`Loading ${tmpPath} → table ${table}`);
|
||||||
|
|
||||||
// TODO: Add an option to output everything plainly as CSV in a single directory
|
// const headers = lines[0].split(",");
|
||||||
|
// const columnsSql = headers.map(h => `"${h}" TEXT`).join(", ");
|
||||||
console.log(`${elapsed()} - Building combined database table in :memory:`);
|
db.exec(`CREATE VIRTUAL TABLE temp.tmp_${table} USING csv(filename='${tmpPath}');`);
|
||||||
const db = DataIO.getDefaultDB();
|
// db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`);
|
||||||
await DataIO.loadIntoDb(db, out);
|
// db.exec(`DROP TABLE IF EXISTS intermediate;`);
|
||||||
|
return `tmp_${table}`;
|
||||||
const tableCount = db.prepare(`SELECT COUNT(*) as count FROM base_data_manager_metadata`).get()!.count;
|
}
|
||||||
console.log(`${elapsed()} - Single database built with ${tableCount} tables`);
|
function getColumnNames(db: DatabaseSync, tableName: string) {
|
||||||
|
return db.prepare(`PRAGMA table_info(${tableName})`).all().map(c => c.name) as string[];
|
||||||
return db;
|
}
|
||||||
|
function templateToSql(template: string, columns: string[]) {
|
||||||
|
// Convert '{0}, {1}' to '%s, %s'
|
||||||
|
const args: string[] = [];
|
||||||
|
const sqlTemplate = template.replace(/\{(\d+)\}/g, (match, index) => {
|
||||||
|
args.push(columns[parseInt(index)]);
|
||||||
|
return '%s';
|
||||||
|
});
|
||||||
|
return `printf('${sqlTemplate}', ${args.join(', ')})`;
|
||||||
|
}
|
||||||
|
function templateToSqlExpr(template: string, columns: string[]) {
|
||||||
|
// perRowTags is already a SQL expression; just substitute {N} with column names
|
||||||
|
return template.replace(/\{(\d+)\}/g, (_match, index) => columns[parseInt(index)]);
|
||||||
}
|
}
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
// Configurable stuff
|
|
||||||
const sqlitePath = 'your.db';
|
const sqlitePath = 'your.db';
|
||||||
|
|
||||||
console.log(`${elapsed()} - Building targets`);
|
const t = TaskTargetPipelineHelper;
|
||||||
const targets = await execPaths([
|
const targets = TaskTargetPipelineHelper.pipeline([
|
||||||
{path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
|
// new TaskTarget("/home/cobertos/Seafile/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01"),
|
||||||
// {path: "/home/cobertos/Seafile/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01", op: facebook()}
|
new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json"),
|
||||||
// {path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip", op: pipe(unzip(), facebook_v2())}
|
//new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2();
|
||||||
// {path: "/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001", op: facebook_v2()}
|
//new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001").facebook_v2();
|
||||||
]);
|
])
|
||||||
console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);
|
.facebook();
|
||||||
|
// .facebook_v2();
|
||||||
|
// .google();
|
||||||
|
|
||||||
const db = await loadTaskInNewDb(targets);
|
// TODO: Make this less painful in task.ts
|
||||||
|
// let zipTask = t.fork().zip("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-DEADNAME-May2021-json.zip");
|
||||||
|
// await (zipTask.fsImpl as any).init();
|
||||||
|
|
||||||
console.log(`${elapsed()} - Writing database to disk at "${sqlitePath}"`);
|
const finalTargets = await verify(targets);
|
||||||
DataIO.dumpDBToDisk(db, sqlitePath);
|
const results = await parallel(finalTargets, true);
|
||||||
|
|
||||||
|
if (fsSync.existsSync(sqlitePath)) {
|
||||||
|
await fs.unlink(sqlitePath); // unlink the old
|
||||||
|
}
|
||||||
|
// Open an in-memory db for speed
|
||||||
|
const db = new DatabaseSync(":memory:", { allowExtension: true });
|
||||||
|
db.loadExtension("/home/cobertos/sqlite-files/csv.so")
|
||||||
|
db.enableLoadExtension(false);
|
||||||
|
|
||||||
|
// New output table
|
||||||
|
db.exec(`CREATE TABLE combined (timestamp TEXT, description TEXT, sender TEXT, receiver TEXT, tags TEXT, lat REAL, lng REAL);`);
|
||||||
|
|
||||||
|
for (const [idx, target] of targets.entries()) {
|
||||||
|
const result = results[idx];
|
||||||
|
|
||||||
|
if (!target.columnMeta) {
|
||||||
|
continue; // No column information
|
||||||
|
}
|
||||||
|
|
||||||
|
const tableName = await loadCSVTable(db, target, result);
|
||||||
|
const columnNames = getColumnNames(db, tableName);
|
||||||
|
|
||||||
|
// Now find what to insert into each row of the combined
|
||||||
|
let descriptionPart = `'An entry from the ${tableName} table'`; // Default is just kinda garbo...
|
||||||
|
if (target.perRowDescription) {
|
||||||
|
descriptionPart = templateToSql(target.perRowDescription, columnNames);
|
||||||
|
}
|
||||||
|
|
||||||
|
let timestampPart: string | undefined;
|
||||||
|
let senderPart = 'NULL';
|
||||||
|
let receiverPart = 'NULL';
|
||||||
|
let latPart = 'NULL';
|
||||||
|
let lngPart = 'NULL';
|
||||||
|
for (const [idx, col] of target.columnMeta.entries()) {
|
||||||
|
const columnName = columnNames[idx];
|
||||||
|
if (col === "isodatetime") {
|
||||||
|
timestampPart = columnName;
|
||||||
|
} else if (col === "sender") {
|
||||||
|
senderPart = columnName;
|
||||||
|
} else if (col === "receiver") {
|
||||||
|
receiverPart = columnName;
|
||||||
|
} else if (col === "lat") {
|
||||||
|
latPart = columnName;
|
||||||
|
} else if (col === "lng") {
|
||||||
|
lngPart = columnName;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if (!timestampPart) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
let tagsPart = 'NULL';
|
||||||
|
if (target.perRowTags) {
|
||||||
|
tagsPart = templateToSqlExpr(target.perRowTags, columnNames);
|
||||||
|
}
|
||||||
|
|
||||||
|
// OFFSET + LIMIT to ignore the CSV headers
|
||||||
|
db.exec(`INSERT INTO combined SELECT ${timestampPart}, ${descriptionPart}, ${senderPart}, ${receiverPart}, ${tagsPart}, ${latPart}, ${lngPart} FROM ${tableName} LIMIT -1 OFFSET 1;`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump it all to the path specified
|
||||||
|
db.exec(`VACUUM main INTO '${sqlitePath}'`);
|
||||||
|
|
||||||
|
// Now dump it as a CSV
|
||||||
|
const rows = db.prepare(`
|
||||||
|
SELECT timestamp || ',' || '"' || replace(description, '"', '""') || '"' as row FROM combined
|
||||||
|
`)
|
||||||
|
.all()
|
||||||
|
.map(r => r.row)
|
||||||
|
.join('\n');
|
||||||
|
db.close();
|
||||||
|
|
||||||
|
await fs.writeFile('your.csv', rows, { encoding: "utf8" });
|
||||||
|
|
||||||
console.log(`${elapsed()} - Database written to disk`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (process.argv[1] === __filename) {
|
main();
|
||||||
main();
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO: Move this into here
|
// TODO: Move this into here
|
||||||
// csvSink(
|
// csvSink(
|
||||||
// summarization?: [string, string][]
|
// summarization?: [string, string][]
|
||||||
// ) {
|
// ) {
|
||||||
// // TODO:
|
// // TODO:
|
||||||
// return this;
|
// return this;
|
||||||
|
|
||||||
// // Ingest this csv into the database at the given id
|
// // Ingest this csv into the database at the given id
|
||||||
// // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
|
// // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
|
||||||
// // Add a post processing function for these targets that prints out the summarization
|
// // Add a post processing function for these targets that prints out the summarization
|
||||||
// // stats
|
// // stats
|
||||||
// // this.post(async (t: TaskTarget)=>{
|
// // this.post(async (t: TaskTarget)=>{
|
||||||
// // // We only do the first one so far for the summarization
|
// // // We only do the first one so far for the summarization
|
||||||
// // let queryLine: string;
|
// // let queryLine: string;
|
||||||
// // let formatFn: (r: any)=>string;
|
// // let formatFn: (r: any)=>string;
|
||||||
// // const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
|
// // const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
|
||||||
// // if (type === "numeric") {
|
// // if (type === "numeric") {
|
||||||
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
||||||
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
||||||
// // }
|
// // }
|
||||||
// // else {
|
// // else {
|
||||||
// // queryLine = `count(*) as n`;
|
// // queryLine = `count(*) as n`;
|
||||||
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
||||||
// // }
|
// // }
|
||||||
|
|
||||||
// // const cmd = "sqlite-utils";
|
// // const cmd = "sqlite-utils";
|
||||||
// // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
|
// // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
|
||||||
// // const { stdout, stderr } = await execFile(cmd, args);
|
// // const { stdout, stderr } = await execFile(cmd, args);
|
||||||
// // const results = JSON.parse(stdout);
|
// // const results = JSON.parse(stdout);
|
||||||
// // const result = results[0]; // should only be one result in the array for this type of query
|
// // const result = results[0]; // should only be one result in the array for this type of query
|
||||||
// // const logLine = formatFn(result);
|
// // const logLine = formatFn(result);
|
||||||
// // (t as any).log = logLine;
|
// // (t as any).log = logLine;
|
||||||
// // });
|
// // });
|
||||||
|
|
||||||
// // return this;
|
// // return this;
|
||||||
// }
|
// }
|
||||||
|
|
@ -27,7 +27,6 @@
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^24.1.0",
|
"@types/node": "^24.1.0",
|
||||||
"csv-parse": "^6.1.0",
|
|
||||||
"typescript": "^5.9.3"
|
"typescript": "^5.9.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
8
pnpm-lock.yaml
generated
8
pnpm-lock.yaml
generated
|
|
@ -33,9 +33,6 @@ importers:
|
||||||
'@types/node':
|
'@types/node':
|
||||||
specifier: ^24.1.0
|
specifier: ^24.1.0
|
||||||
version: 24.10.0
|
version: 24.10.0
|
||||||
csv-parse:
|
|
||||||
specifier: ^6.1.0
|
|
||||||
version: 6.1.0
|
|
||||||
typescript:
|
typescript:
|
||||||
specifier: ^5.9.3
|
specifier: ^5.9.3
|
||||||
version: 5.9.3
|
version: 5.9.3
|
||||||
|
|
@ -62,9 +59,6 @@ packages:
|
||||||
buffer-crc32@0.2.13:
|
buffer-crc32@0.2.13:
|
||||||
resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==}
|
resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==}
|
||||||
|
|
||||||
csv-parse@6.1.0:
|
|
||||||
resolution: {integrity: sha512-CEE+jwpgLn+MmtCpVcPtiCZpVtB6Z2OKPTr34pycYYoL7sxdOkXDdQ4lRiw6ioC0q6BLqhc6cKweCVvral8yhw==}
|
|
||||||
|
|
||||||
dom-serializer@2.0.0:
|
dom-serializer@2.0.0:
|
||||||
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
||||||
|
|
||||||
|
|
@ -182,8 +176,6 @@ snapshots:
|
||||||
|
|
||||||
buffer-crc32@0.2.13: {}
|
buffer-crc32@0.2.13: {}
|
||||||
|
|
||||||
csv-parse@6.1.0: {}
|
|
||||||
|
|
||||||
dom-serializer@2.0.0:
|
dom-serializer@2.0.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
domelementtype: 2.3.0
|
domelementtype: 2.3.0
|
||||||
|
|
|
||||||
|
|
@ -1,11 +1,9 @@
|
||||||
import test from "node:test";
|
import test from "node:test";
|
||||||
import nodePath from "node:path";
|
import nodePath from "node:path";
|
||||||
import { strict as assert } from "node:assert";
|
import { strict as assert } from "node:assert";
|
||||||
import { TaskTarget, verify, run, unzip, pipe } from "../data-export/task.ts";
|
import { TaskTargetPipelineHelper, TaskTarget, verify, run } from "../data-export/task.ts";
|
||||||
import { parallel } from "../data-export/parallel.ts";
|
import { parallel } from "../data-export/parallel.ts";
|
||||||
import { facebook, facebook_v2 } from "../data-export/facebook.ts";
|
import "../data-export/facebook.ts";
|
||||||
import * as DataIO from "../data-export/io.ts";
|
|
||||||
import { parse } from "csv-parse/sync"; // For better diffs + error checking of CSV output
|
|
||||||
|
|
||||||
const THIS_FILE = import.meta.dirname;
|
const THIS_FILE = import.meta.dirname;
|
||||||
const FACEBOOK_V1_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01');
|
const FACEBOOK_V1_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01');
|
||||||
|
|
@ -13,56 +11,67 @@ const FACEBOOK_V1_ZIPPED = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021
|
||||||
const FACEBOOK_V2_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2025-11-29');
|
const FACEBOOK_V2_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2025-11-29');
|
||||||
|
|
||||||
test("facebook: Can load the 2021 export", async (t) => {
|
test("facebook: Can load the 2021 export", async (t) => {
|
||||||
const targets = [
|
const targets = TaskTargetPipelineHelper.pipeline([
|
||||||
new TaskTarget(FACEBOOK_V1_DIR)
|
new TaskTarget(FACEBOOK_V1_DIR)
|
||||||
]
|
])
|
||||||
const builtTargets = await facebook()(targets);
|
.facebook();
|
||||||
const out = await DataIO.runPipeline(builtTargets);
|
|
||||||
|
const finalTargets = await verify(targets);
|
||||||
|
const result = await parallel(finalTargets, true);
|
||||||
const idAndCSVs: [string, string][] = [];
|
const idAndCSVs: [string, string][] = [];
|
||||||
for (const {target, result} of out) {
|
for (const [idx, r] of result.entries()) {
|
||||||
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
|
const target = finalTargets[idx];
|
||||||
assert.ok(result.ok, `Task ${target.id} should be okay`);
|
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
||||||
idAndCSVs.push([target.id, result.stdout]);
|
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
||||||
|
idAndCSVs.push([target.id, r.stdout]);
|
||||||
}
|
}
|
||||||
const csvs = idAndCSVs
|
const csvs = idAndCSVs
|
||||||
.sort() // Keep stable ordering for snapshots
|
.sort() // Keep stable ordering for snapshots
|
||||||
.map(v => parse(v[1]))
|
.map(v => v[1])
|
||||||
|
|
||||||
t.assert.snapshot(csvs);
|
t.assert.snapshot(csvs);
|
||||||
});
|
});
|
||||||
test("facebook: Can load the 2021 export zipped", async (t) => {
|
test("facebook: Can load the 2021 export zipped", async (t) => {
|
||||||
const targets = [
|
const targets = await TaskTargetPipelineHelper.pipeline([
|
||||||
new TaskTarget(FACEBOOK_V1_ZIPPED)
|
new TaskTarget(FACEBOOK_V1_ZIPPED)
|
||||||
];
|
])
|
||||||
const builtTargets = await pipe(unzip(), facebook())(targets);
|
.unzip();
|
||||||
const out = await DataIO.runPipeline(builtTargets);
|
const targets2 = targets
|
||||||
|
.facebook();
|
||||||
|
|
||||||
|
const finalTargets = await verify(targets2);
|
||||||
|
const result = await parallel(finalTargets, true);
|
||||||
const idAndCSVs: [string, string][] = [];
|
const idAndCSVs: [string, string][] = [];
|
||||||
for (const {target, result} of out) {
|
for (const [idx, r] of result.entries()) {
|
||||||
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
|
const target = finalTargets[idx];
|
||||||
assert.ok(result.ok, `Task ${target.id} should be okay`);
|
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
||||||
idAndCSVs.push([target.id, result.stdout]);
|
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
||||||
|
idAndCSVs.push([target.id, r.stdout]);
|
||||||
}
|
}
|
||||||
const csvs = idAndCSVs
|
const csvs = idAndCSVs
|
||||||
.sort() // Keep stable ordering for snapshots
|
.sort() // Keep stable ordering for snapshots
|
||||||
.map(v => parse(v[1]))
|
.map(v => v[1])
|
||||||
|
|
||||||
t.assert.snapshot(csvs);
|
t.assert.snapshot(csvs);
|
||||||
});
|
});
|
||||||
test("facebook: Can load the 2025 export", async (t) => {
|
test("facebook: Can load the 2025 export", async (t) => {
|
||||||
const targets = [
|
const targets = TaskTargetPipelineHelper.pipeline([
|
||||||
new TaskTarget(FACEBOOK_V2_DIR)
|
new TaskTarget(FACEBOOK_V2_DIR)
|
||||||
]
|
])
|
||||||
const builtTargets = await facebook_v2()(targets);
|
.facebook_v2();
|
||||||
const out = await DataIO.runPipeline(builtTargets);
|
|
||||||
|
const finalTargets = await verify(targets);
|
||||||
|
const result = await parallel(finalTargets, true);
|
||||||
const idAndCSVs: [string, string][] = [];
|
const idAndCSVs: [string, string][] = [];
|
||||||
for (const {target, result} of out) {
|
for (const [idx, r] of result.entries()) {
|
||||||
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
|
const target = finalTargets[idx];
|
||||||
assert.ok(result.ok, `Task ${target.id} should be okay`);
|
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
||||||
idAndCSVs.push([target.id, result.stdout]);
|
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
||||||
|
idAndCSVs.push([target.id, r.stdout]);
|
||||||
}
|
}
|
||||||
const csvs = idAndCSVs
|
const csvs = idAndCSVs
|
||||||
.sort() // Keep stable ordering for snapshots
|
.sort() // Keep stable ordering for snapshots
|
||||||
.map(v => parse(v[1]))
|
.map(v => v[1])
|
||||||
|
|
||||||
t.assert.snapshot(csvs);
|
t.assert.snapshot(csvs);
|
||||||
});
|
});
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
3
test/fixtures/README.md
vendored
3
test/fixtures/README.md
vendored
|
|
@ -11,6 +11,3 @@
|
||||||
|
|
||||||
* `facebook-json-2021-05-01` - Facebook JSON export
|
* `facebook-json-2021-05-01` - Facebook JSON export
|
||||||
* `facebook-json-2025-11-29` - Facebook JSON export
|
* `facebook-json-2025-11-29` - Facebook JSON export
|
||||||
* [`discord-chat-exporter-2026-02`](./discord-chat-exporter-2026-02.md) - Discord export with [DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter) sometime around Feb 2026
|
|
||||||
* [`discord-json-2021-01`](./discord-json-2021-01.md) - Discord JSON export
|
|
||||||
* [`snapchat-2023-11`](./snapchat-2023-11.md) - Snapchat JSON + HTML export
|
|
||||||
|
|
|
||||||
25
test/fixtures/discord-chat-exporter-2026-02.md
vendored
25
test/fixtures/discord-chat-exporter-2026-02.md
vendored
|
|
@ -1,25 +0,0 @@
|
||||||
# discord-chat-exporter-2026-02
|
|
||||||
|
|
||||||
An export from `DiscordChatExporter`, a comprehensive DiscordChatExporter
|
|
||||||
|
|
||||||
## Export methodology
|
|
||||||
|
|
||||||
This uses the version of `DiscordChatExporter` that existed at the top of the releases tab on GitHub around `2026 February`. **TODO: figure out version**
|
|
||||||
|
|
||||||
This export used a command something like the following to try to get _everything_ `dotnet DiscordChatExporter.Cli.dll export -t xxx -o ~/DiscordChatExporter -f json --media --reuse-media --include-threads -c xxx`
|
|
||||||
|
|
||||||
* It uses `export` command and `-c` but it's the same for `exportguild` and `-g`
|
|
||||||
* `-f json` so only the json export
|
|
||||||
* `--media` download all media
|
|
||||||
* `--reuse-media` not quite sure what this does because it puts it in a folder per channel...
|
|
||||||
* `--include-threads` to get any threads
|
|
||||||
|
|
||||||
## Manual edits
|
|
||||||
* Lots of image replacing + placeholders
|
|
||||||
* Had to rename the folders
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
The export format has files and folders with similar, information-dense names. I tried to preserve that as that's the only way to correlate between the folder and the file name
|
|
||||||
|
|
||||||
* No exif on any media files
|
|
||||||
* There's embeds, thumbnails in the example chat messages but I have no other specimen
|
|
||||||
|
|
@ -1,145 +0,0 @@
|
||||||
{
|
|
||||||
"guild": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxx",
|
|
||||||
"iconUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
|
||||||
},
|
|
||||||
"channel": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": "xxxxxxxxxxxxx",
|
|
||||||
"categoryId": "111111111111111111",
|
|
||||||
"category": "xxxxxxxxxxxxx",
|
|
||||||
"name": "xxxxxxx",
|
|
||||||
"topic": null
|
|
||||||
},
|
|
||||||
"dateRange": {
|
|
||||||
"after": null,
|
|
||||||
"before": null
|
|
||||||
},
|
|
||||||
"exportedAt": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"messages": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": "xxxxxxxxxxxxxxx",
|
|
||||||
"timestamp": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"timestampEdited": null,
|
|
||||||
"callEndedTimestamp": null,
|
|
||||||
"isPinned": false,
|
|
||||||
"content": "xxxxxxxxxxxxxxxxxx",
|
|
||||||
"author": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxx",
|
|
||||||
"discriminator": "1111",
|
|
||||||
"nickname": "xxxxxxxx",
|
|
||||||
"color": null,
|
|
||||||
"isBot": false,
|
|
||||||
"roles": [],
|
|
||||||
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
|
||||||
},
|
|
||||||
"attachments": [],
|
|
||||||
"embeds": [],
|
|
||||||
"stickers": [],
|
|
||||||
"reactions": [],
|
|
||||||
"mentions": [],
|
|
||||||
"inlineEmojis": []
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": "xxxxxxx",
|
|
||||||
"timestamp": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"timestampEdited": null,
|
|
||||||
"callEndedTimestamp": null,
|
|
||||||
"isPinned": false,
|
|
||||||
"content": "xxxxxxxxx",
|
|
||||||
"author": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxx",
|
|
||||||
"discriminator": "1111",
|
|
||||||
"nickname": "xxxxxxxx",
|
|
||||||
"color": null,
|
|
||||||
"isBot": false,
|
|
||||||
"roles": [],
|
|
||||||
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
|
||||||
},
|
|
||||||
"attachments": [],
|
|
||||||
"embeds": [],
|
|
||||||
"stickers": [],
|
|
||||||
"reactions": [],
|
|
||||||
"mentions": [],
|
|
||||||
"inlineEmojis": []
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": "xxxxxxx",
|
|
||||||
"timestamp": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"timestampEdited": null,
|
|
||||||
"callEndedTimestamp": null,
|
|
||||||
"isPinned": false,
|
|
||||||
"content": "https://example.com/example.png",
|
|
||||||
"author": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxx",
|
|
||||||
"discriminator": "1111",
|
|
||||||
"nickname": "xxxxxxxx",
|
|
||||||
"color": null,
|
|
||||||
"isBot": false,
|
|
||||||
"roles": [],
|
|
||||||
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
|
||||||
},
|
|
||||||
"attachments": [],
|
|
||||||
"embeds": [
|
|
||||||
{
|
|
||||||
"title": "",
|
|
||||||
"url": "https://example.com/example.png",
|
|
||||||
"timestamp": null,
|
|
||||||
"description": "",
|
|
||||||
"thumbnail": {
|
|
||||||
"url": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/example.png",
|
|
||||||
"width": 111,
|
|
||||||
"height": 111
|
|
||||||
},
|
|
||||||
"images": [],
|
|
||||||
"fields": [],
|
|
||||||
"inlineEmojis": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"stickers": [],
|
|
||||||
"reactions": [],
|
|
||||||
"mentions": [],
|
|
||||||
"inlineEmojis": []
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": "xxxxxxx",
|
|
||||||
"timestamp": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"timestampEdited": null,
|
|
||||||
"callEndedTimestamp": null,
|
|
||||||
"isPinned": false,
|
|
||||||
"content": "xxx",
|
|
||||||
"author": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxx",
|
|
||||||
"discriminator": "1111",
|
|
||||||
"nickname": "xxxxxxxx",
|
|
||||||
"color": null,
|
|
||||||
"isBot": false,
|
|
||||||
"roles": [],
|
|
||||||
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
|
||||||
},
|
|
||||||
"attachments": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"url": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/unknown-SUFFIX.png",
|
|
||||||
"fileName": "unknown.png",
|
|
||||||
"fileSizeBytes": 111111
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"embeds": [],
|
|
||||||
"stickers": [],
|
|
||||||
"reactions": [],
|
|
||||||
"mentions": [],
|
|
||||||
"inlineEmojis": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"messageCount": 111
|
|
||||||
}
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 1.2 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 1.3 KiB |
Binary file not shown.
|
Before Width: | Height: | Size: 1.3 KiB |
41
test/fixtures/discord-json-2021-01.md
vendored
41
test/fixtures/discord-json-2021-01.md
vendored
|
|
@ -1,41 +0,0 @@
|
||||||
# discord-json-2021-01
|
|
||||||
|
|
||||||
## Manual edits
|
|
||||||
* images -> placeholders
|
|
||||||
* `accounts/avatar.png`
|
|
||||||
* manually scrub folder names
|
|
||||||
* `account/applications/0000000000000`
|
|
||||||
|
|
||||||
## Notes about files
|
|
||||||
* `activity/`
|
|
||||||
* All the .json are NDJSON so some json tools don't like them
|
|
||||||
* _Massive_ files. They hang scrub.ts for a long long time (had to run these piecemeal)
|
|
||||||
* These files also have an _incredible_ amount of shapes and variance.
|
|
||||||
* Instead of outputing all the shapes I made a sort of "super-object" to capture the shape with `jq -n '[inputs] | add' events-2021-00000-of-00001.json.tmp > unique_shape.json` and then scrubbing `unique_shape.json`
|
|
||||||
* `messages/`
|
|
||||||
* I hand did these to keep all the ids the same
|
|
||||||
* There are multiple types of chats. DMs, guild channels, etc
|
|
||||||
* I hand did the csvs as I have no scrubber for that
|
|
||||||
* These are only **THE EXPORTING USERS MESSAGES**, no other user, just fyi
|
|
||||||
* Ids in `messages.csv` are just the id of the message, not of any user
|
|
||||||
* There is the potential to derive missing info from a channel via `@` tags sent or possibly via attachments. Maybe...
|
|
||||||
* `11111111111111111`
|
|
||||||
* This one has a shorter id (it's an older one)
|
|
||||||
* Has `type: 0` but there's no guild information in `channel.json`
|
|
||||||
* The user name was `null` in `index.json`
|
|
||||||
* It's a really odd one
|
|
||||||
* `222222222222222222`
|
|
||||||
* This was a dm channel (said `direct message with xxx#7777` in index.json)
|
|
||||||
* Has `type: 1` and there are two recipients (just the ids) in `channel.json`
|
|
||||||
* Unfortunately that's all the info in the export
|
|
||||||
* `333333333333333333`
|
|
||||||
* This was a normal guild channel
|
|
||||||
* `type: 0` and there's guild information in `channel.json`
|
|
||||||
* I kept a good set of messages around from this one to show how attachements and other stuff works
|
|
||||||
* The last message seemed to be a link not as an attachment. Links just seem to be normal text
|
|
||||||
* `programs/`
|
|
||||||
* was empty...
|
|
||||||
* `servers/``
|
|
||||||
* Info about _some_ of the guilds we have ids for
|
|
||||||
* guild.json didn't really contain anything except the name
|
|
||||||
* I kept around the only guild I noticed an audit-log.json with info in it
|
|
||||||
26
test/fixtures/discord-json-2021-01/README.txt
vendored
26
test/fixtures/discord-json-2021-01/README.txt
vendored
|
|
@ -1,26 +0,0 @@
|
||||||
__ __ ___ _ _ ___ ___ ___ _____ ___ _
|
|
||||||
\ \ / / / _ \ | | | | | _ \ o O O | \ / \ |_ _| / \ | |
|
|
||||||
\ V / | (_) | | |_| | | / o | |) | | - | | | | - | |_|
|
|
||||||
_|_|_ \___/ \___/ |_|_\ TS__[O] |___/ |_|_| _|_|_ |_|_| _(_)_
|
|
||||||
_| """ |_|"""""|_|"""""|_|"""""| <======|_|"""""|_|"""""|_|"""""|_|"""""|_| """ |
|
|
||||||
"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'./o--000'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'
|
|
||||||
___ ___ _ _ ___ ___ ___ _ _ _
|
|
||||||
|_ _| / __| o O O | || | | __| | _ \ | __| | | | | | |
|
|
||||||
| | \__ \ o | __ | | _| | / | _| |_| |_| |_|
|
|
||||||
|___| |___/ TS__[O] |_||_| |___| |_|_\ |___| _(_)_ _(_)_ _(_)_
|
|
||||||
_|"""""|_|"""""| <======|_|"""""|_|"""""|_|"""""|_|"""""|_| """ |_| """ |_| """ |
|
|
||||||
"`-0-0-'"`-0-0-'./o--000'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'
|
|
||||||
|
|
||||||
Welcome to your Discord Data Package!
|
|
||||||
|
|
||||||
Inside, you'll find a few JSON (JavaScript Object Notation) and CSV (Comma Separated Values) files
|
|
||||||
of the data we use to provide Discord's service to you. We've chosen these formats for ease of
|
|
||||||
processing. Furthermore, the files have been organized into logical groups to make it easy to
|
|
||||||
understand and work with (at least, we hope so)!
|
|
||||||
|
|
||||||
For more information, you can view our in-depth help article at the following URL:
|
|
||||||
|
|
||||||
https://support.discord.com/hc/articles/360004957991
|
|
||||||
|
|
||||||
All the best,
|
|
||||||
Discord Team
|
|
||||||
|
|
@ -1,16 +0,0 @@
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxx",
|
|
||||||
"icon": null,
|
|
||||||
"description": "",
|
|
||||||
"summary": "",
|
|
||||||
"hook": false,
|
|
||||||
"verify_key": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"flags": 1,
|
|
||||||
"secret": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"redirect_uris": [],
|
|
||||||
"rpc_application_state": 1,
|
|
||||||
"store_application_state": 1,
|
|
||||||
"verification_state": 1,
|
|
||||||
"interactions_endpoint_url": null
|
|
||||||
}
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 1.7 KiB |
399
test/fixtures/discord-json-2021-01/account/user.json
vendored
399
test/fixtures/discord-json-2021-01/account/user.json
vendored
|
|
@ -1,399 +0,0 @@
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"username": "xxxxxxxx",
|
|
||||||
"discriminator": 1111,
|
|
||||||
"email": "not_a_real_email@example.com",
|
|
||||||
"verified": false,
|
|
||||||
"avatar_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"has_mobile": false,
|
|
||||||
"needs_email_verification": false,
|
|
||||||
"premium_until": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"flags": 11111111111111,
|
|
||||||
"phone": "xxxxxxxxxxxx",
|
|
||||||
"temp_banned_until": null,
|
|
||||||
"ip": "1.1.1.1",
|
|
||||||
"settings": {
|
|
||||||
"locale": "xxxxx",
|
|
||||||
"show_current_game": false,
|
|
||||||
"restricted_guilds": [],
|
|
||||||
"default_guilds_restricted": false,
|
|
||||||
"inline_attachment_media": false,
|
|
||||||
"inline_embed_media": false,
|
|
||||||
"gif_auto_play": false,
|
|
||||||
"render_embeds": false,
|
|
||||||
"render_reactions": false,
|
|
||||||
"animate_emoji": false,
|
|
||||||
"enable_tts_command": false,
|
|
||||||
"message_display_compact": false,
|
|
||||||
"convert_emoticons": false,
|
|
||||||
"explicit_content_filter": 1,
|
|
||||||
"disable_games_tab": false,
|
|
||||||
"theme": "xxxx",
|
|
||||||
"developer_mode": false,
|
|
||||||
"guild_positions": [
|
|
||||||
"111111111111111111",
|
|
||||||
"111111111111111111"
|
|
||||||
],
|
|
||||||
"detect_platform_accounts": false,
|
|
||||||
"status": "xxxxxx",
|
|
||||||
"afk_timeout": 111,
|
|
||||||
"timezone_offset": 111,
|
|
||||||
"stream_notifications_enabled": false,
|
|
||||||
"allow_accessibility_detection": false,
|
|
||||||
"contact_sync_enabled": false,
|
|
||||||
"native_phone_integration_enabled": false,
|
|
||||||
"animate_stickers": 1,
|
|
||||||
"friend_source_flags": {
|
|
||||||
"all": false
|
|
||||||
},
|
|
||||||
"guild_folders": [
|
|
||||||
{
|
|
||||||
"guild_ids": [
|
|
||||||
"111111111111111111"
|
|
||||||
],
|
|
||||||
"id": null,
|
|
||||||
"name": null,
|
|
||||||
"color": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"guild_ids": [
|
|
||||||
"111111111111111111"
|
|
||||||
],
|
|
||||||
"id": null,
|
|
||||||
"name": null,
|
|
||||||
"color": null
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"custom_status": null
|
|
||||||
},
|
|
||||||
"connections": [
|
|
||||||
{
|
|
||||||
"type": "xxxxxxxxx",
|
|
||||||
"id": "xxxxxxxxxxx",
|
|
||||||
"name": "xxxxxxxxxxx",
|
|
||||||
"revoked": false,
|
|
||||||
"visibility": 1,
|
|
||||||
"friend_sync": false,
|
|
||||||
"show_activity": false,
|
|
||||||
"verified": false
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"type": "xxxxxxx",
|
|
||||||
"id": "xxxxxxxx",
|
|
||||||
"name": "xxxxxxxx",
|
|
||||||
"revoked": false,
|
|
||||||
"visibility": 1,
|
|
||||||
"friend_sync": false,
|
|
||||||
"show_activity": false,
|
|
||||||
"verified": false
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"external_friends_lists": [
|
|
||||||
{
|
|
||||||
"user_id": "111111111111111111",
|
|
||||||
"platform_type": "xxxxx",
|
|
||||||
"name": "xxxxxxxx",
|
|
||||||
"id_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"friend_id_hashes": [
|
|
||||||
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"user_id": "111111111111111111",
|
|
||||||
"platform_type": "xxxxxxxxx",
|
|
||||||
"name": "xxxxxxxxxxx",
|
|
||||||
"id_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"friend_id_hashes": [
|
|
||||||
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"friend_suggestions": [],
|
|
||||||
"mfa_sessions": [],
|
|
||||||
"relationships": [
|
|
||||||
{
|
|
||||||
"id": "11111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"nickname": null,
|
|
||||||
"user": {
|
|
||||||
"id": "11111111111111111",
|
|
||||||
"username": "xxxxxxxxxxxx",
|
|
||||||
"avatar": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"discriminator": "1111",
|
|
||||||
"public_flags": 1
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "11111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"nickname": null,
|
|
||||||
"user": {
|
|
||||||
"id": "11111111111111111",
|
|
||||||
"username": "xxxx",
|
|
||||||
"avatar": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"discriminator": "1111",
|
|
||||||
"public_flags": 111
|
|
||||||
}
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"payments": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"created_at": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"currency": "xxx",
|
|
||||||
"tax": 111,
|
|
||||||
"tax_inclusive": false,
|
|
||||||
"amount": 1111,
|
|
||||||
"amount_refunded": 1,
|
|
||||||
"status": 1,
|
|
||||||
"description": "xxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"flags": 1,
|
|
||||||
"subscription": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"current_period_start": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"current_period_end": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"payment_gateway": null,
|
|
||||||
"payment_gateway_plan_id": "xxxxxxxxxxxxxxxxxxx",
|
|
||||||
"currency": "xxx",
|
|
||||||
"plan_id": "111111111111111111",
|
|
||||||
"items": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"plan_id": "111111111111111111",
|
|
||||||
"quantity": 1
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"payment_source": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"invalid": false,
|
|
||||||
"brand": "xxxx",
|
|
||||||
"last_4": "1111",
|
|
||||||
"expires_month": 11,
|
|
||||||
"expires_year": 1111,
|
|
||||||
"billing_address": {
|
|
||||||
"name": "xxxxxxxxxxxxx",
|
|
||||||
"line_1": "xxxxxxxxxxxxxxxxx",
|
|
||||||
"line_2": null,
|
|
||||||
"city": "xxxxxxxx",
|
|
||||||
"state": "xx",
|
|
||||||
"country": "xx",
|
|
||||||
"postal_code": "11111"
|
|
||||||
},
|
|
||||||
"country": "xx"
|
|
||||||
},
|
|
||||||
"sku_id": "111111111111111111",
|
|
||||||
"sku_price": 1111,
|
|
||||||
"sku_subscription_plan_id": "111111111111111111"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"created_at": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"currency": "xxx",
|
|
||||||
"tax": 111,
|
|
||||||
"tax_inclusive": false,
|
|
||||||
"amount": 1111,
|
|
||||||
"amount_refunded": 1,
|
|
||||||
"status": 1,
|
|
||||||
"description": "xxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"flags": 1,
|
|
||||||
"subscription": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"current_period_start": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"current_period_end": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"payment_gateway": null,
|
|
||||||
"payment_gateway_plan_id": "xxxxxxxxxxxxxxxxxxx",
|
|
||||||
"currency": "xxx",
|
|
||||||
"plan_id": "111111111111111111",
|
|
||||||
"items": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"plan_id": "111111111111111111",
|
|
||||||
"quantity": 1
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"payment_source": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"invalid": false,
|
|
||||||
"brand": "xxxx",
|
|
||||||
"last_4": "1111",
|
|
||||||
"expires_month": 11,
|
|
||||||
"expires_year": 1111,
|
|
||||||
"billing_address": {
|
|
||||||
"name": "xxxxxxxxxxxxx",
|
|
||||||
"line_1": "xxxxxxxxxxxxxxxxxx",
|
|
||||||
"line_2": null,
|
|
||||||
"city": "xxxxxxxxxx",
|
|
||||||
"state": "xx",
|
|
||||||
"country": "xx",
|
|
||||||
"postal_code": "11111"
|
|
||||||
},
|
|
||||||
"country": "xx"
|
|
||||||
},
|
|
||||||
"sku_id": "111111111111111111",
|
|
||||||
"sku_price": 1111,
|
|
||||||
"sku_subscription_plan_id": "111111111111111111"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"payment_sources": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"invalid": false,
|
|
||||||
"brand": "xxxx",
|
|
||||||
"last_4": "1111",
|
|
||||||
"expires_month": 11,
|
|
||||||
"expires_year": 1111,
|
|
||||||
"billing_address": {
|
|
||||||
"name": "xxxxxxxxxxxxx",
|
|
||||||
"line_1": "xxxxxxxxxxxxxxxxx",
|
|
||||||
"line_2": null,
|
|
||||||
"city": "xxxxxxxx",
|
|
||||||
"state": "xx",
|
|
||||||
"country": "xx",
|
|
||||||
"postal_code": "11111"
|
|
||||||
},
|
|
||||||
"country": "xx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"guild_settings": [
|
|
||||||
{
|
|
||||||
"guild_id": null,
|
|
||||||
"suppress_everyone": false,
|
|
||||||
"suppress_roles": false,
|
|
||||||
"message_notifications": 1,
|
|
||||||
"mobile_push": false,
|
|
||||||
"muted": false,
|
|
||||||
"mute_config": null,
|
|
||||||
"channel_overrides": [
|
|
||||||
{
|
|
||||||
"channel_id": "111111111111111111",
|
|
||||||
"message_notifications": 1,
|
|
||||||
"muted": false,
|
|
||||||
"mute_config": null
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"version": 11
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"guild_id": "11111111111111111",
|
|
||||||
"suppress_everyone": false,
|
|
||||||
"suppress_roles": false,
|
|
||||||
"message_notifications": 1,
|
|
||||||
"mobile_push": false,
|
|
||||||
"muted": false,
|
|
||||||
"mute_config": null,
|
|
||||||
"channel_overrides": [
|
|
||||||
{
|
|
||||||
"channel_id": "111111111111111111",
|
|
||||||
"message_notifications": 1,
|
|
||||||
"muted": false,
|
|
||||||
"mute_config": null
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"channel_id": "111111111111111111",
|
|
||||||
"message_notifications": 1,
|
|
||||||
"muted": false,
|
|
||||||
"mute_config": null
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"version": 1
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"library_applications": [
|
|
||||||
{
|
|
||||||
"application": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxxxxxx",
|
|
||||||
"icon": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"description": "xxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"summary": "xxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"primary_sku_id": "111111111111111111",
|
|
||||||
"hook": false,
|
|
||||||
"slug": "xxxxxxxxxxxx",
|
|
||||||
"guild_id": "111111111111111111",
|
|
||||||
"verify_key": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
|
||||||
"publishers": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"developers": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"name": "xxxxxxxxxxxxxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"branch_id": "111111111111111111",
|
|
||||||
"sku_id": "111111111111111111",
|
|
||||||
"sku": {
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"premium": false,
|
|
||||||
"preorder_release_at": null,
|
|
||||||
"preorder_approximate_release_date": null
|
|
||||||
},
|
|
||||||
"flags": 1,
|
|
||||||
"created_at": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"entitlements": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"sku_id": "111111111111111111",
|
|
||||||
"application_id": "111111111111111111",
|
|
||||||
"user_id": "111111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"deleted": false,
|
|
||||||
"gift_code_flags": 1,
|
|
||||||
"branches": [
|
|
||||||
"111111111111111111"
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"entitlements": [
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"sku_id": "111111111111111111",
|
|
||||||
"application_id": "111111111111111111",
|
|
||||||
"user_id": "111111111111111111",
|
|
||||||
"type": 1,
|
|
||||||
"deleted": false,
|
|
||||||
"gift_code_flags": 1,
|
|
||||||
"branches": [
|
|
||||||
"111111111111111111"
|
|
||||||
],
|
|
||||||
"sku_name": "xxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"user_activity_application_statistics": [
|
|
||||||
{
|
|
||||||
"application_id": "111111111111111111",
|
|
||||||
"last_played_at": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"total_duration": 1111,
|
|
||||||
"total_discord_sku_duration": 1
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"application_id": "111111111111111111",
|
|
||||||
"last_played_at": "2020-04-13T10:09:08.000000+00:00",
|
|
||||||
"total_duration": 111111,
|
|
||||||
"total_discord_sku_duration": 1
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"notes": {
|
|
||||||
"111111111111111111": "xxxx"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
|
|
@ -1 +0,0 @@
|
||||||
{"id": "11111111111111111", "type": 0}
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
ID,Timestamp,Contents,Attachments
|
|
||||||
8888888888,2022-02-22 22:22:22.222222+00:00,Heyo,
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
{"id": "222222222222222222", "type": 1, "recipients": ["00000000000000000", "1111111111111111"]}
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
ID,Timestamp,Contents,Attachments
|
|
||||||
2222222222222,2022-22-22 22:22:22.22222+00:00,Heyo,
|
|
||||||
|
|
|
@ -1 +0,0 @@
|
||||||
{"id": "333333333333333333", "type": 0, "name": "generalchat", "guild": {"id": "333333333333333332", "name": "xxx"}}
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
ID,Timestamp,Contents,Attachments
|
|
||||||
000000000000000005,2011-02-02 02:05:02.000000+00:00,Huh what the heck is this message,
|
|
||||||
000000000000000004,2011-02-02 02:04:02.000000+00:00,<:thonk:000000000000000000><:thonk:000000000000000000><:thonk:000000000000000000>,
|
|
||||||
000000000000000003,2011-02-02 02:03:02.000000+00:00,"(so <@00000000000000000> who are you)",
|
|
||||||
000000000000000002,2011-02-02 02:02:02.000000+00:00,,https://cdn.discordapp.com/attachments/000000000000000000/000000000000000000/image.png
|
|
||||||
000000000000000001,2011-02-02 02:01:02.000000+00:00,https://google.com/whatever,
|
|
||||||
|
|
|
@ -1,5 +0,0 @@
|
||||||
{
|
|
||||||
"11111111111111111": null,
|
|
||||||
"222222222222222222": "Direct Message with xxx#7777",
|
|
||||||
"333333333333333333": "generalchat"
|
|
||||||
}
|
|
||||||
|
|
@ -1,18 +0,0 @@
|
||||||
[
|
|
||||||
{
|
|
||||||
"id": "111111111111111111",
|
|
||||||
"user_id": "111111111111111111",
|
|
||||||
"action_type": 11,
|
|
||||||
"changes": [
|
|
||||||
{
|
|
||||||
"key": "xxxx",
|
|
||||||
"new_value": [
|
|
||||||
{
|
|
||||||
"name": "xxxxxxxxxx",
|
|
||||||
"id": "111111111111111111"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
]
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
{
|
|
||||||
"id": "444444444444444444",
|
|
||||||
"name": "xxx"
|
|
||||||
}
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
{
|
|
||||||
"444444444444444444": "xxx"
|
|
||||||
}
|
|
||||||
9
test/fixtures/facebook-json.md
vendored
9
test/fixtures/facebook-json.md
vendored
|
|
@ -1,9 +0,0 @@
|
||||||
# facebook-json exports
|
|
||||||
|
|
||||||
## `facebook-json-2021-05-01`
|
|
||||||
* Manual edits of images -> placeholders, folder names, key names (in support cases specifically)
|
|
||||||
* This was one of the first few datasets I scrubbed so a lot of manual work was done. Should be easier now
|
|
||||||
* I went poking around this one and there was no exif on any of the images I looked at, only in the json was there exif
|
|
||||||
## `facebook-json-2025-11-29`
|
|
||||||
* Manual edits of images -> placeholders, folder names, key names
|
|
||||||
* This was one of the first few datasets I scrubbed so a lot of manual work was done. Should be easier now
|
|
||||||
83
test/fixtures/snapchat-2023-11.md
vendored
83
test/fixtures/snapchat-2023-11.md
vendored
|
|
@ -1,83 +0,0 @@
|
||||||
# Snapchat
|
|
||||||
|
|
||||||
Exported from the web exporter
|
|
||||||
|
|
||||||
## Manual Edits
|
|
||||||
|
|
||||||
* memories and chat_media placeholders
|
|
||||||
* Snapchat seemed to have events exported where the `+` in emails broke my parsing and the email contained a ' ' instead, so I fixed that
|
|
||||||
* Keys use unique dates in `json/in_app_surveys.json`
|
|
||||||
* Keys in `json/chat_history.json` use user ids, had to manually truncate and edit
|
|
||||||
|
|
||||||
## Notes
|
|
||||||
|
|
||||||
* `memories/`
|
|
||||||
* No exif data
|
|
||||||
* Does not seem to have any correlating .json file. It's just a dump to the disk
|
|
||||||
* files are like `2020-01-01_aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa-main.jpg`
|
|
||||||
* Date has no time, just date
|
|
||||||
* `aaaaa...` seems to be a guid
|
|
||||||
* `main` | `overlay` at the end, with the same guid
|
|
||||||
* `main` is just the image
|
|
||||||
* `overlay` looks to be like a filter or some other applied thing that was saved with the memory
|
|
||||||
* Images may be rotated
|
|
||||||
* `chat_media/`
|
|
||||||
* No exif
|
|
||||||
* files are like `2020-01-01_b~xxxx.jpeg`
|
|
||||||
* sometimes they have `main` | `overlay` or something
|
|
||||||
* No idea what the `b~` means or if the xxx is an id or what. Perhaps base64 encoded protobuf, but nothing I decoded seemed to correlate to any identifier in the export
|
|
||||||
* Only referenced from ... oh... it's broken. The `type: "MEDIA"` in snapchats exporter has all empty "content" fields. Amazing... So this will have to be pieced together some other way
|
|
||||||
* This will most likely have to be manually repaired
|
|
||||||
* `json/`
|
|
||||||
* Scrubbed
|
|
||||||
* See manual changes
|
|
||||||
|
|
||||||
|
|
||||||
* Comes with both an html and json export (I will only keep the json after deduping)
|
|
||||||
* NOTE: That the html export has explanations which might be useful to explain some of these fields...
|
|
||||||
* I compared all .html to .json side by side (browser <-> text editor) and all of them were present in both and had the same data except `snap_history.html` (was empty in .html) and `faq.html` (just informational)
|
|
||||||
* I noticed on chat history html pages it puts _every_ category, not just the ones I have. Might be useful future reference
|
|
||||||
|
|
||||||
```
|
|
||||||
Frequently Asked Questions
|
|
||||||
Login History and Account Information
|
|
||||||
Snap History Metadata
|
|
||||||
Chat History Metadata
|
|
||||||
My AI
|
|
||||||
Our Story & Spotlight Content
|
|
||||||
Spotlight Replies
|
|
||||||
Purchase History
|
|
||||||
Snapchat Support History
|
|
||||||
User Profile
|
|
||||||
Public Profiles
|
|
||||||
Friends
|
|
||||||
Ranking
|
|
||||||
Story History
|
|
||||||
Account History
|
|
||||||
Location
|
|
||||||
Search History
|
|
||||||
Terms History
|
|
||||||
Subscriptions
|
|
||||||
Bitmoji
|
|
||||||
In-app Surveys
|
|
||||||
Reported Content
|
|
||||||
Bitmoji Kit
|
|
||||||
Connected Apps
|
|
||||||
Talk History
|
|
||||||
Ads Manager
|
|
||||||
My Lenses
|
|
||||||
Memories
|
|
||||||
Cameos
|
|
||||||
Email Campaign History
|
|
||||||
Snap Tokens
|
|
||||||
Payouts
|
|
||||||
Orders
|
|
||||||
Snap Map Places
|
|
||||||
Shopping Favorites
|
|
||||||
Payments
|
|
||||||
My Sounds
|
|
||||||
Photoshoot Snaps
|
|
||||||
Feature Emails
|
|
||||||
AI Selfies
|
|
||||||
```
|
|
||||||
|
|
||||||
38
test/fixtures/snapchat-2023-11/json/account.json
vendored
38
test/fixtures/snapchat-2023-11/json/account.json
vendored
|
|
@ -1,38 +0,0 @@
|
||||||
{
|
|
||||||
"Basic Information": {
|
|
||||||
"Username": "xxxxxxxxx",
|
|
||||||
"Name": "xxxxx",
|
|
||||||
"Creation Date": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Registration IP": "",
|
|
||||||
"Country": ""
|
|
||||||
},
|
|
||||||
"Device Information": {
|
|
||||||
"Make": "",
|
|
||||||
"Model ID": "",
|
|
||||||
"Model Name": "",
|
|
||||||
"Language": "",
|
|
||||||
"OS Type": "",
|
|
||||||
"OS Version": "",
|
|
||||||
"Connection Type": ""
|
|
||||||
},
|
|
||||||
"Device History": [],
|
|
||||||
"Privacy Policy and Terms of Service Acceptance History": [],
|
|
||||||
"Custom Creative Tools Terms": [],
|
|
||||||
"Login History": [
|
|
||||||
{
|
|
||||||
"IP": "1.1.1.1",
|
|
||||||
"Country": "xx",
|
|
||||||
"Created": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Status": "xxxxxxx",
|
|
||||||
"Device": "some/path"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"IP": "1.1.1.1",
|
|
||||||
"Country": "xx",
|
|
||||||
"Created": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Status": "xxxxxxx",
|
|
||||||
"Device": "some/path"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Family Center": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,47 +0,0 @@
|
||||||
{
|
|
||||||
"Display Name Change": [
|
|
||||||
{
|
|
||||||
"Date": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Display Name": "xxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Date": "",
|
|
||||||
"Display Name": "xxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Email Change": [
|
|
||||||
{
|
|
||||||
"Date": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Email Address": "not_a_real_email@example.com"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Mobile Number Change": [],
|
|
||||||
"Password Change": [
|
|
||||||
{
|
|
||||||
"Date": "2020-04-13 10:09:08 UTC"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Date": "2020-04-13 10:09:08 UTC"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Snapchat Linked to Bitmoji": [
|
|
||||||
{
|
|
||||||
"Date": "2020-04-13 10:09:08 UTC"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Spectacles": [],
|
|
||||||
"Two-Factor Authentication": [],
|
|
||||||
"Account deactivated / reactivated": [],
|
|
||||||
"Download My Data Reports": [
|
|
||||||
{
|
|
||||||
"Date": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Status": "xxxxxxx",
|
|
||||||
"Email Address": "not_a_real_email@example.com"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Date": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Status": "xxxxxxxxx",
|
|
||||||
"Email Address": "not_a_real_email@example.com"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
31
test/fixtures/snapchat-2023-11/json/bitmoji.json
vendored
31
test/fixtures/snapchat-2023-11/json/bitmoji.json
vendored
|
|
@ -1,31 +0,0 @@
|
||||||
{
|
|
||||||
"Basic Information": {
|
|
||||||
"First Name": "",
|
|
||||||
"Last Name": "",
|
|
||||||
"Email": "",
|
|
||||||
"Phone Number": "",
|
|
||||||
"Account Creation Date": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Account Creation User Agent": ""
|
|
||||||
},
|
|
||||||
"Analytics": {
|
|
||||||
"App Open Count": 1,
|
|
||||||
"Avatar Gender": "xxxx",
|
|
||||||
"Outfit Save Count": 1,
|
|
||||||
"Share Count": 1
|
|
||||||
},
|
|
||||||
"Terms of Service Acceptance History": [
|
|
||||||
{
|
|
||||||
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Acceptance Date": "2020-04-13 10:09:08"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Acceptance Date": "2020-04-13 10:09:08"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Search History": [],
|
|
||||||
"Support Cases": [],
|
|
||||||
"Selfies": [],
|
|
||||||
"Keyboard Enable Full Access History (iOS only)": [],
|
|
||||||
"Connected Apps": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,8 +0,0 @@
|
||||||
{
|
|
||||||
"Cameos Selfie": {
|
|
||||||
"Cameos Body Selected": "xxxxxxxxxxxx",
|
|
||||||
"Hairstyle": "xxxxxxxxxxxx",
|
|
||||||
"Use My Cameos Selfie": "xxxxxxx"
|
|
||||||
},
|
|
||||||
"Cameos Stories": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,42 +0,0 @@
|
||||||
{
|
|
||||||
"some_friend": [
|
|
||||||
{
|
|
||||||
"From": "xxxxxxxxx",
|
|
||||||
"Media Type": "xxxxx",
|
|
||||||
"Created": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Content": "",
|
|
||||||
"Conversation Title": null,
|
|
||||||
"IsSender": false,
|
|
||||||
"Created(microseconds)": 1111111111111
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"From": "xxxxxxxxx",
|
|
||||||
"Media Type": "xxxx",
|
|
||||||
"Created": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Content": "xxxxxxxxxxxxxxxxxx",
|
|
||||||
"Conversation Title": null,
|
|
||||||
"IsSender": false,
|
|
||||||
"Created(microseconds)": 1111111111111
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"some_friend_too": [
|
|
||||||
{
|
|
||||||
"From": "xxxxxxxxxxxxxx",
|
|
||||||
"Media Type": "xxxxx",
|
|
||||||
"Created": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Content": "",
|
|
||||||
"Conversation Title": "xxxxxxxxxxxxxxxx",
|
|
||||||
"IsSender": false,
|
|
||||||
"Created(microseconds)": 1111111111111
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"From": "xxxxxxxxxxxxx",
|
|
||||||
"Media Type": "xxxx",
|
|
||||||
"Created": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Content": "xxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Conversation Title": "xxxxxxxxxxxxxxxx",
|
|
||||||
"IsSender": false,
|
|
||||||
"Created(microseconds)": 1111111111111
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
{
|
|
||||||
"Login History": [],
|
|
||||||
"Permissions": [
|
|
||||||
{
|
|
||||||
"App": "xxxxxxx",
|
|
||||||
"Time": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Type": "xxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Connected Applications": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,13 +0,0 @@
|
||||||
{
|
|
||||||
"Email Campaign Subscriptions": [
|
|
||||||
{
|
|
||||||
"Email Campaign": "xxxxxxxxxxxxxxxx",
|
|
||||||
"Opt Out Status": "xxxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Email Campaign": "xxxxxxxxxxxxxxx",
|
|
||||||
"Opt Out Status": "xxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Email Campaign History": []
|
|
||||||
}
|
|
||||||
100
test/fixtures/snapchat-2023-11/json/friends.json
vendored
100
test/fixtures/snapchat-2023-11/json/friends.json
vendored
|
|
@ -1,100 +0,0 @@
|
||||||
{
|
|
||||||
"Friends": [
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Friend Requests Sent": [
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Blocked Users": [
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Deleted Friends": [
|
|
||||||
{
|
|
||||||
"Username": "xxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Hidden Friend Suggestions": [],
|
|
||||||
"Ignored Snapchatters": [
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Pending Requests": [
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Username": "xxxxxxxxxxxxxx",
|
|
||||||
"Display Name": "xxxxxxxxxxxxx",
|
|
||||||
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Source": "xxxxxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Shortcuts": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,26 +0,0 @@
|
||||||
{
|
|
||||||
"Survey 2020/04/12": [
|
|
||||||
{
|
|
||||||
"Time": "xxxxxxxxxxxx",
|
|
||||||
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Survey Response": "xxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Time": "xxxxxxxxxxxx",
|
|
||||||
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Survey Response": "xxx"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Survey 2020/04/13": [
|
|
||||||
{
|
|
||||||
"Time": "xxxxxxxxxxxx",
|
|
||||||
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Survey Response": "xxxxxxxxxxxxxx"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Time": "xxxxxxxxxxxx",
|
|
||||||
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Survey Response": "some/path"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
@ -1,23 +0,0 @@
|
||||||
{
|
|
||||||
"Frequent Locations": [],
|
|
||||||
"Latest Location": [
|
|
||||||
{
|
|
||||||
"City": "",
|
|
||||||
"Country": "",
|
|
||||||
"Region": ""
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Home & Work": {},
|
|
||||||
"Daily Top Locations": [],
|
|
||||||
"Top Locations Per Six-Day Period": [],
|
|
||||||
"Location History": [],
|
|
||||||
"Businesses and public places you may have visited": [],
|
|
||||||
"Areas you may have visited in the last two years": [
|
|
||||||
{
|
|
||||||
"Time": "some/path",
|
|
||||||
"City": "xxxxxx",
|
|
||||||
"Region": "xxxxxxxx",
|
|
||||||
"Postal Code": "11111"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
@ -1,6 +0,0 @@
|
||||||
{
|
|
||||||
"Number of Stories Viewed": [
|
|
||||||
1
|
|
||||||
],
|
|
||||||
"Content Interests": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
{
|
|
||||||
"Shared Story": [],
|
|
||||||
"Spotlight History": [
|
|
||||||
{
|
|
||||||
"Story Date": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Story URL": "url://somewhere",
|
|
||||||
"Action Type": "xxxx",
|
|
||||||
"View Time": "xxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
{
|
|
||||||
"My AI Content": [],
|
|
||||||
"My AI Memory": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
{
|
|
||||||
"Public Users": [
|
|
||||||
"xxxxxxxxxxxxxxx"
|
|
||||||
],
|
|
||||||
"Publishers": [],
|
|
||||||
"Stories": [],
|
|
||||||
"Last Active Timezone": "some/path",
|
|
||||||
"Push Notifications": [],
|
|
||||||
"Hidden Category Sections": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
{
|
|
||||||
"Snap Inc. Terms of Service": [
|
|
||||||
{
|
|
||||||
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Acceptance Date": "2020-04-13 10:09:08 UTC"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"Acceptance Date": "2020-04-13 10:09:08 UTC"
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"Custom Creative Tools Terms": [],
|
|
||||||
"Business Services Terms": [],
|
|
||||||
"Games Terms": []
|
|
||||||
}
|
|
||||||
|
|
@ -1,39 +0,0 @@
|
||||||
{
|
|
||||||
"App Profile": {
|
|
||||||
"Country": "xx",
|
|
||||||
"Creation Time": "2020-04-13 10:09:08 UTC",
|
|
||||||
"Account Creation Country": "xxxxxxx",
|
|
||||||
"Platform Version": "xxxxxxx",
|
|
||||||
"In-app Language": "xx"
|
|
||||||
},
|
|
||||||
"Demographics": {
|
|
||||||
"Cohort Age": "",
|
|
||||||
"Derived Ad Demographic": ""
|
|
||||||
},
|
|
||||||
"Subscriptions": [],
|
|
||||||
"Engagement": [],
|
|
||||||
"Discover Channels Viewed": [],
|
|
||||||
"Breakdown of Time Spent on App": [],
|
|
||||||
"Ads You Interacted With": [],
|
|
||||||
"Interest Categories": [
|
|
||||||
"xxxxxx",
|
|
||||||
"xxxxxxxxxxxxxxxxxxx"
|
|
||||||
],
|
|
||||||
"Content Categories": [
|
|
||||||
"xxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
|
||||||
"some/path"
|
|
||||||
],
|
|
||||||
"Geographic Information": [],
|
|
||||||
"Interactions": {
|
|
||||||
"Web Interactions": [
|
|
||||||
"xxxxxxxxxxxxx",
|
|
||||||
"xxxxxxxxxxxxxxxxxxxxxx"
|
|
||||||
],
|
|
||||||
"App Interactions": [
|
|
||||||
"url://somewhere",
|
|
||||||
"url://somewhere"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
"Off-Platform Sharing": [],
|
|
||||||
"Mobile Ad Id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
|
||||||
}
|
|
||||||
Binary file not shown.
|
Before Width: | Height: | Size: 2.2 KiB |
95
test/task.ts
95
test/task.ts
|
|
@ -9,6 +9,7 @@ import {
|
||||||
cmd,
|
cmd,
|
||||||
assignMeta,
|
assignMeta,
|
||||||
verify,
|
verify,
|
||||||
|
TaskTargetPipelineHelper,
|
||||||
} from "../data-export/task.ts";
|
} from "../data-export/task.ts";
|
||||||
|
|
||||||
const THIS_FILE = import.meta.dirname;
|
const THIS_FILE = import.meta.dirname;
|
||||||
|
|
@ -91,7 +92,7 @@ test("TaskTarget: pushToPipeline throws if read is not the first op", () => {
|
||||||
test("TaskTarget: clone produces an independent copy", () => {
|
test("TaskTarget: clone produces an independent copy", () => {
|
||||||
const t = new TaskTarget("/foo").assignMeta({
|
const t = new TaskTarget("/foo").assignMeta({
|
||||||
idValue: "orig",
|
idValue: "orig",
|
||||||
columnMeta: ["any"]
|
columnMeta: ["yeag"]
|
||||||
});
|
});
|
||||||
t.read();
|
t.read();
|
||||||
const c = t.clone();
|
const c = t.clone();
|
||||||
|
|
@ -154,41 +155,41 @@ test("toShell: cmd with function resolves at shell-generation time", () => {
|
||||||
|
|
||||||
// -- module-level functions ---------------------------------------------------
|
// -- module-level functions ---------------------------------------------------
|
||||||
|
|
||||||
test("cd: clones and changes directory of each target", async () => {
|
test("cd: clones and changes directory of each target", () => {
|
||||||
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
||||||
const result = await cd("sub")(targets);
|
const result = cd(targets, "sub");
|
||||||
assert.equal(result[0].path, "/a/sub");
|
assert.equal(result[0].path, "/a/sub");
|
||||||
assert.equal(result[1].path, "/b/sub");
|
assert.equal(result[1].path, "/b/sub");
|
||||||
assert.equal(targets[0].path, "/a"); // originals unchanged
|
assert.equal(targets[0].path, "/a"); // originals unchanged
|
||||||
});
|
});
|
||||||
|
|
||||||
test("read: clones and adds a read op to each target", async () => {
|
test("read: clones and adds a read op to each target", () => {
|
||||||
const targets = [new TaskTarget("/a.txt"), new TaskTarget("/b.txt")];
|
const targets = [new TaskTarget("/a.txt"), new TaskTarget("/b.txt")];
|
||||||
const result = await read()(targets);
|
const result = read(targets);
|
||||||
assert.equal(result[0].pipeline[0].type, "read");
|
assert.equal(result[0].pipeline[0].type, "read");
|
||||||
assert.equal(result[1].pipeline[0].type, "read");
|
assert.equal(result[1].pipeline[0].type, "read");
|
||||||
assert.equal(targets[0].pipeline.length, 0); // originals unchanged
|
assert.equal(targets[0].pipeline.length, 0); // originals unchanged
|
||||||
});
|
});
|
||||||
|
|
||||||
test("cmd: clones and appends a cmd op to each target", async () => {
|
test("cmd: clones and appends a cmd op to each target", () => {
|
||||||
const targets = [new TaskTarget("/a.txt")];
|
const targets = [new TaskTarget("/a.txt")];
|
||||||
targets[0].read();
|
targets[0].read();
|
||||||
const result = await cmd("jq .")(targets);
|
const result = cmd(targets, "jq .");
|
||||||
assert.equal(result[0].pipeline.length, 2);
|
assert.equal(result[0].pipeline.length, 2);
|
||||||
assert.equal(targets[0].pipeline.length, 1); // original unchanged
|
assert.equal(targets[0].pipeline.length, 1); // original unchanged
|
||||||
});
|
});
|
||||||
|
|
||||||
test("assignMeta: clones and sets meta on each target", async () => {
|
test("assignMeta: clones and sets meta on each target", () => {
|
||||||
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
||||||
const result = await assignMeta({ idValue: "myid" })(targets);
|
const result = assignMeta(targets, { idValue: "myid" });
|
||||||
assert.equal(result[0].id, "myid");
|
assert.equal(result[0].id, "myid");
|
||||||
assert.equal(result[1].id, "myid");
|
assert.equal(result[1].id, "myid");
|
||||||
assert.throws(() => targets[0].id); // originals have no id
|
assert.throws(() => targets[0].id); // originals have no id
|
||||||
});
|
});
|
||||||
|
|
||||||
test("taskGlob: returns matching targets across all input targets", async () => {
|
test("taskGlob: returns matching targets across all input targets", () => {
|
||||||
const targets = [new TaskTarget(FIXTURE_DIR)];
|
const targets = [new TaskTarget(FIXTURE_DIR)];
|
||||||
const result = await taskGlob("friends/*.json")(targets);
|
const result = taskGlob(targets, "friends/*.json");
|
||||||
assert.ok(result.length > 0);
|
assert.ok(result.length > 0);
|
||||||
assert.ok(result.every(r => r.path.endsWith(".json")));
|
assert.ok(result.every(r => r.path.endsWith(".json")));
|
||||||
});
|
});
|
||||||
|
|
@ -225,3 +226,75 @@ test("verify: filters a mixed list to only valid targets", async () => {
|
||||||
assert.equal(result[0], good);
|
assert.equal(result[0], good);
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// -- TaskTargetPipelineHelper -------------------------------------------------
|
||||||
|
|
||||||
|
test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => {
|
||||||
|
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a")]);
|
||||||
|
assert.ok(p instanceof TaskTargetPipelineHelper);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("TaskTargetPipelineHelper: pipeline() is idempotent", () => {
|
||||||
|
const arr = [new TaskTarget("/a")];
|
||||||
|
const p1 = TaskTargetPipelineHelper.pipeline(arr);
|
||||||
|
const p2 = TaskTargetPipelineHelper.pipeline(p1);
|
||||||
|
assert.equal(p1, p2);
|
||||||
|
});
|
||||||
|
|
||||||
|
test("TaskTargetPipelineHelper: cd returns a new helper with paths changed", () => {
|
||||||
|
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a"), new TaskTarget("/b")]);
|
||||||
|
const p2 = p.cd("sub");
|
||||||
|
assert.ok(p2 instanceof TaskTargetPipelineHelper);
|
||||||
|
assert.equal(p2[0].path, "/a/sub");
|
||||||
|
assert.equal(p2[1].path, "/b/sub");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("TaskTargetPipelineHelper: read returns a new helper with read ops added", () => {
|
||||||
|
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]);
|
||||||
|
const p2 = p.read();
|
||||||
|
assert.ok(p2 instanceof TaskTargetPipelineHelper);
|
||||||
|
assert.equal(p2[0].pipeline[0].type, "read");
|
||||||
|
});
|
||||||
|
|
||||||
|
test("TaskTargetPipelineHelper: cmd returns a new helper with cmd ops added", () => {
|
||||||
|
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]);
|
||||||
|
const p2 = p.read().cmd("jq .");
|
||||||
|
assert.equal(p2[0].toShell(), "cat /a.txt | jq .");
|
||||||
|
});
|
||||||
|
|
||||||
|
// -- collect ------------------------------------------------------------------
|
||||||
|
|
||||||
|
test("collect: the final end of a chain is added to the collection set", () => {
|
||||||
|
const collection = new Set<TaskTargetPipelineHelper>();
|
||||||
|
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/foo")]);
|
||||||
|
p.collect(collection);
|
||||||
|
|
||||||
|
const p2 = p.cd("sub");
|
||||||
|
assert.equal(collection.size, 1);
|
||||||
|
assert.ok(collection.has(p2));
|
||||||
|
});
|
||||||
|
|
||||||
|
test("collect: moving the chain end removes the old element and adds the new one", () => {
|
||||||
|
const collection = new Set<TaskTargetPipelineHelper>();
|
||||||
|
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/foo")]);
|
||||||
|
p.collect(collection);
|
||||||
|
|
||||||
|
const p2 = p.cd("sub");
|
||||||
|
const p3 = p2.read();
|
||||||
|
assert.equal(collection.size, 1);
|
||||||
|
assert.ok(collection.has(p3));
|
||||||
|
assert.ok(!collection.has(p2));
|
||||||
|
});
|
||||||
|
|
||||||
|
test("collect: gathers the ends of multiple independent pipeline branches", () => {
|
||||||
|
const collection = new Set<TaskTargetPipelineHelper>();
|
||||||
|
|
||||||
|
const b1 = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]).collect(collection).read();
|
||||||
|
const b2 = TaskTargetPipelineHelper.pipeline([new TaskTarget("/b.txt")]).collect(collection).read();
|
||||||
|
|
||||||
|
assert.equal(collection.size, 2);
|
||||||
|
assert.ok(collection.has(b1));
|
||||||
|
assert.ok(collection.has(b2));
|
||||||
|
|
||||||
|
const allTargets = [...collection].flat();
|
||||||
|
assert.equal(allTargets.length, 2);
|
||||||
|
});
|
||||||
|
|
|
||||||
225
timelinize.ts
225
timelinize.ts
|
|
@ -1,225 +0,0 @@
|
||||||
import { type SQLOutputValue, type DatabaseSync } from "node:sqlite";
|
|
||||||
import { createWriteStream } from 'node:fs';
|
|
||||||
import { fileURLToPath } from "node:url";
|
|
||||||
import "./data-export/facebook.ts";
|
|
||||||
import { facebook } from "./data-export/facebook.ts";
|
|
||||||
import { execPaths, COLUMN_TYPES } from "./data-export/task.ts";
|
|
||||||
import * as DataIO from "./data-export/io.ts";
|
|
||||||
import {
|
|
||||||
startTime,
|
|
||||||
elapsed,
|
|
||||||
loadTaskInNewDb
|
|
||||||
} from "./main.ts";
|
|
||||||
|
|
||||||
const __filename = fileURLToPath(import.meta.url);
|
|
||||||
|
|
||||||
function dumpDBTableToCSV(db: DatabaseSync, tableName: string, outputFile: string) {
|
|
||||||
const stream = createWriteStream(outputFile);
|
|
||||||
const stmt = db.prepare(`SELECT * FROM ${tableName}`);
|
|
||||||
|
|
||||||
let headerWritten = false;
|
|
||||||
for (const row of stmt.iterate()) {
|
|
||||||
if (!headerWritten) {
|
|
||||||
stream.write(Object.keys(row).join(',') + '\n');
|
|
||||||
headerWritten = true;
|
|
||||||
}
|
|
||||||
stream.write(Object.values(row).map(v => `"${String(v ?? '').replace(/"/g, '""')}"`).join(',') + '\n');
|
|
||||||
}
|
|
||||||
|
|
||||||
stream.end();
|
|
||||||
}
|
|
||||||
function getColumnNames(db: DatabaseSync, tableName: string) {
|
|
||||||
return db.prepare(`PRAGMA table_info(${tableName})`).all().map(c => c.name) as string[];
|
|
||||||
}
|
|
||||||
function templateToSql(template: string, columns: string[]) {
|
|
||||||
// Convert '{0}, {1}' to '%s, %s'
|
|
||||||
const args: string[] = [];
|
|
||||||
const sqlTemplate = template.replace(/\{(\d+)\}/g, (match, index) => {
|
|
||||||
args.push(columns[parseInt(index)]);
|
|
||||||
return '%s';
|
|
||||||
});
|
|
||||||
return `printf('${sqlTemplate}', ${args.join(', ')})`;
|
|
||||||
}
|
|
||||||
function sqlLiteral(str: string | undefined | null): string {
|
|
||||||
if (str === null || str === undefined) {
|
|
||||||
return 'NULL';
|
|
||||||
}
|
|
||||||
|
|
||||||
// Escape single quotes by doubling them
|
|
||||||
const escaped = str.replace(/'/g, "''");
|
|
||||||
|
|
||||||
// Wrap in single quotes
|
|
||||||
return `'${escaped}'`;
|
|
||||||
}
|
|
||||||
|
|
||||||
async function main() {
|
|
||||||
// Configure the tasks to run
|
|
||||||
console.log(`${elapsed()} - Building targets`);
|
|
||||||
const targets = await execPaths([
|
|
||||||
{path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
|
|
||||||
]);
|
|
||||||
console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);
|
|
||||||
const db = await loadTaskInNewDb(targets);
|
|
||||||
|
|
||||||
// New output tables
|
|
||||||
db.exec(`CREATE TABLE combined (timestamp TEXT, description TEXT, type TEXT, sender TEXT, receiver TEXT, lat REAL, lng REAL, tags TEXT);`);
|
|
||||||
|
|
||||||
//(message, email, note,
|
|
||||||
// social, location, media, event, document,
|
|
||||||
// bookmark; defaults to note)
|
|
||||||
|
|
||||||
type ColumnMetaType = (keyof typeof COLUMN_TYPES);
|
|
||||||
interface MetadataRow {
|
|
||||||
id: string,
|
|
||||||
perRowDescription?: string,
|
|
||||||
perRowTags?: string,
|
|
||||||
columnMeta: ColumnMetaType[],
|
|
||||||
columnNames: string[],
|
|
||||||
metaId?: string
|
|
||||||
}
|
|
||||||
function verifyMetdataRow(input: Record<string, SQLOutputValue>): undefined | MetadataRow {
|
|
||||||
const { id, perRowDescription, perRowTags, columnMeta: columnMetaCSV, metaId } = input;
|
|
||||||
if (!id) {
|
|
||||||
console.error("Row did not have id/tableName, skipping");
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
if (typeof id !== "string") {
|
|
||||||
console.error(`Id must be string, got ${typeof id}, ${id}`);
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
if (!columnMetaCSV) {
|
|
||||||
console.warn(`${id} did not have columnMeta, nothing to do. Skipping`);
|
|
||||||
return undefined; // No column information
|
|
||||||
}
|
|
||||||
if (typeof columnMetaCSV !== "string") {
|
|
||||||
console.warn(`${id} did not have columnMeta of type string. Skipping`);
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
const columnMeta = columnMetaCSV.split(",") as ColumnMetaType[];
|
|
||||||
|
|
||||||
// Get the column names from the table id
|
|
||||||
const columnNames = getColumnNames(db, id);
|
|
||||||
if (columnNames.length !== columnMeta.length) {
|
|
||||||
console.error(`columnNames and columnMeta did not have same length. skipping`);
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (typeof perRowDescription !== "string" && perRowDescription !== undefined && perRowDescription !== null) {
|
|
||||||
console.warn(`Invalid typeof perRowDescription, was ${typeof perRowDescription}, value ${perRowDescription}`);
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
if (typeof perRowTags !== "string" && perRowTags !== undefined && perRowTags !== null) {
|
|
||||||
console.warn(`Invalid typeof perRowTags, was ${typeof perRowTags}, value ${perRowTags}`);
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
if (typeof metaId !== "string" && metaId !== undefined && metaId !== null) {
|
|
||||||
console.warn(`Invalid typeof metaId, was ${typeof metaId}, value ${metaId}`);
|
|
||||||
return undefined;
|
|
||||||
}
|
|
||||||
|
|
||||||
return {
|
|
||||||
id,
|
|
||||||
perRowDescription: perRowDescription ?? undefined,
|
|
||||||
perRowTags: perRowTags ?? undefined,
|
|
||||||
columnMeta,
|
|
||||||
columnNames,
|
|
||||||
metaId: metaId ?? undefined
|
|
||||||
};
|
|
||||||
}
|
|
||||||
|
|
||||||
/**Maps columnMeta names to the column names*/
|
|
||||||
function metaToNames(meta: MetadataRow): Partial<Record<ColumnMetaType, string>> {
|
|
||||||
const out: Partial<Record<ColumnMetaType, string>> = {};
|
|
||||||
for (const [idx, name] of meta.columnNames.entries()) {
|
|
||||||
const metaName = meta.columnMeta[idx];
|
|
||||||
if (out[metaName]) {
|
|
||||||
console.warn(`Duplicate column with metaName "${metaName}". The current one which will be used is "${out[metaName]}". Skipping the duplicate.`);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
out[metaName] = name;
|
|
||||||
}
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
function metaParts(metaNameToColumnName: Partial<Record<ColumnMetaType, string>>): Record<ColumnMetaType, string> {
|
|
||||||
const out: Record<ColumnMetaType, string> = {} as any;
|
|
||||||
for (const type of Object.keys(COLUMN_TYPES) as ColumnMetaType[]) {
|
|
||||||
if (!metaNameToColumnName[type]) {
|
|
||||||
out[type] = "NULL";
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
// Wrap in brackets so column names like "from" don't cause any issues
|
|
||||||
out[type] = `[${metaNameToColumnName[type]}]`
|
|
||||||
}
|
|
||||||
return out;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Iterate over all the tables and their metadata
|
|
||||||
const statement = db.prepare(`SELECT id, perRowDescription, perRowTags, columnMeta, metaId FROM base_data_manager_metadata`);
|
|
||||||
for (const row of statement.iterate()) {
|
|
||||||
const verified = verifyMetdataRow(row);
|
|
||||||
if (!verified) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const { id, perRowDescription, perRowTags, columnMeta, columnNames, metaId } = verified;
|
|
||||||
const metaNameToColumnName = metaToNames(verified);
|
|
||||||
const part = metaParts(metaNameToColumnName);
|
|
||||||
|
|
||||||
// Now find what to insert into each row of the combined
|
|
||||||
// Per row tags is an string of csv'd items but needs to be made a literal
|
|
||||||
// TODO: Make this either a template string or have jq do something
|
|
||||||
// tagsPart = templateToSqlExpr(target.perRowTags, columnNames);
|
|
||||||
const tagsPart = sqlLiteral(perRowTags);
|
|
||||||
|
|
||||||
// Choose what to do with this table based on what meta is present
|
|
||||||
if (
|
|
||||||
!!metaNameToColumnName.sender
|
|
||||||
&& !!metaNameToColumnName.isodatetime
|
|
||||||
) {
|
|
||||||
if (!metaId) {
|
|
||||||
console.warn(`Chat ${id} with .sender but no .metaId. Skipping`);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
// First pull the name of the conversation out of the metaId
|
|
||||||
const receiverThreadTitle = db.prepare(`SELECT title FROM ${metaId} WHERE (id=${sqlLiteral(id)})`).get()?.title;
|
|
||||||
if (!receiverThreadTitle || typeof receiverThreadTitle !== "string") {
|
|
||||||
console.warn(`Chat ${id} with .metaId ${metaId} returned invalid receiverThreadTitle ${typeof receiverThreadTitle}. Skipping`);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
const receiverPart = sqlLiteral(receiverThreadTitle);
|
|
||||||
|
|
||||||
// Put this table into the combined table
|
|
||||||
db.exec(`INSERT INTO combined SELECT ${part.isodatetime}, ${part.text}, 'message', ${part.sender}, ${receiverPart}, ${part.lat}, ${part.lng}, ${tagsPart} FROM ${id};`);
|
|
||||||
}
|
|
||||||
else if (!!metaNameToColumnName.isodatetime) {
|
|
||||||
// Put this table into the combined table
|
|
||||||
let descriptionPart = perRowDescription
|
|
||||||
? templateToSql(perRowDescription, columnNames)
|
|
||||||
: `'An entry from the ${id} table'`; // Default is just kinda garbo...
|
|
||||||
db.exec(`INSERT INTO combined SELECT ${part.isodatetime}, ${descriptionPart}, 'node', NULL, NULL, ${part.lat}, ${part.lng}, ${tagsPart} FROM ${id};`);
|
|
||||||
}
|
|
||||||
else {
|
|
||||||
console.warn(`Table with id ${id} had no isodatetime or anything else of value, skipping...`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const count = db.prepare(`SELECT COUNT(*) as count FROM combined`).get()?.count;
|
|
||||||
console.log(`${elapsed()} - Combined database built with ${count} rows`);
|
|
||||||
|
|
||||||
// Dump it to the disk for debugging
|
|
||||||
const sqlitePath = "debug_your.csv.db";
|
|
||||||
console.log(`${elapsed()} - Writing database to disk at "${sqlitePath}"`);
|
|
||||||
await DataIO.dumpDBToDisk(db, sqlitePath);
|
|
||||||
|
|
||||||
console.log(`${elapsed()} - Database written to disk`);
|
|
||||||
|
|
||||||
// Dump it all to the path specified
|
|
||||||
dumpDBTableToCSV(db, "combined", "your.csv");
|
|
||||||
console.log(`${elapsed()} - Combined database written to disk as CSV`);
|
|
||||||
db.close();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (process.argv[1] === __filename) {
|
|
||||||
main();
|
|
||||||
}
|
|
||||||
|
|
||||||
127
util/scrub.jq
127
util/scrub.jq
|
|
@ -3,89 +3,46 @@
|
||||||
# fd -t f .json -0 | xargs -I % -0 -- jq -f scrub.jq "%" > "%"
|
# fd -t f .json -0 | xargs -I % -0 -- jq -f scrub.jq "%" > "%"
|
||||||
# (Though you should remove the end `> "%"` first to get just the output without
|
# (Though you should remove the end `> "%"` first to get just the output without
|
||||||
# persisting to be sure it's what you want first)
|
# persisting to be sure it's what you want first)
|
||||||
|
|
||||||
def scrub_key:
|
|
||||||
if test("^[0-9]+$") then
|
|
||||||
("1" * length)
|
|
||||||
else
|
|
||||||
.
|
|
||||||
end;
|
|
||||||
|
|
||||||
def scrub_primitive:
|
|
||||||
if type == "string" then
|
|
||||||
if test("^(([0-9]{1,3}\\.){3}[0-9]{1,3})$") then
|
|
||||||
# IPv4
|
|
||||||
"1.1.1.1"
|
|
||||||
elif test("^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$") then
|
|
||||||
# IPv6
|
|
||||||
"2000:0000:0000:0000:0000:0000:0000:0000"
|
|
||||||
elif test("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$") then
|
|
||||||
# Email-like
|
|
||||||
"not_a_real_email@example.com"
|
|
||||||
elif test("\\.(jpg|jpeg|png|gif|bmp|webp|svg|ico|tiff|mp3|wav|flac|aac|ogg|wma|m4a|mp4|avi|mkv|mov|wmv|flv|webm)$"; "i") then
|
|
||||||
# Leave these alone, you will have to manually go through these later and replace with
|
|
||||||
# placeholders
|
|
||||||
# TODO: jq 1.7 adds debug(), use this instead when I can upgrade jq, otherwise
|
|
||||||
# you need to manually grep for MANUAL REPAIR NEEDED for now
|
|
||||||
("MANUAL REPAIR NEEDED: \(.)" | stderr) | .
|
|
||||||
elif test("://") then
|
|
||||||
"url://somewhere"
|
|
||||||
elif test("/") then
|
|
||||||
"some/path"
|
|
||||||
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}[+\\-][0-9]{2}:[0-9]{2}$") then
|
|
||||||
# iso date time without millis with timezone
|
|
||||||
"2020-04-13T10:09:08+00:00"
|
|
||||||
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?[+\\-][0-9]{2}:[0-9]{2}$") then
|
|
||||||
# iso date time with millis with timezone
|
|
||||||
"2020-04-13T10:09:08.000000+00:00"
|
|
||||||
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} UTC") then
|
|
||||||
# Date format from snapchat export
|
|
||||||
"2020-04-13 10:09:08 UTC"
|
|
||||||
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}") then
|
|
||||||
# Date format from snapchat export
|
|
||||||
"2020-04-13 10:09:08"
|
|
||||||
elif test("^[0-9]+$") then
|
|
||||||
# preserve length of the string
|
|
||||||
"1" * length
|
|
||||||
elif test("^[0-9a-fA-F]+$") then #hexadecimal string
|
|
||||||
# repeat the hex pattern and truncate to original length
|
|
||||||
("a1" * length)[:length]
|
|
||||||
elif . == "" then
|
|
||||||
# prevents empty string from just returning null instead of empty string
|
|
||||||
""
|
|
||||||
else
|
|
||||||
# Preserve string length for other strings
|
|
||||||
"x" * length
|
|
||||||
end
|
|
||||||
elif type == "number" then
|
|
||||||
if 946702800 <= . and . <= 1893474000 then
|
|
||||||
# Take modulo 1 year to get variance in the output, then add offset to bring to ~2024
|
|
||||||
((((. % 31557600) + 1704067200) / 5000 | floor) * 5000)
|
|
||||||
elif . == (. | floor) then
|
|
||||||
# Integer - preserve digit count
|
|
||||||
(tostring | length) as $len | ("1" * $len) | tonumber
|
|
||||||
else
|
|
||||||
8.08
|
|
||||||
end
|
|
||||||
elif type == "boolean" then
|
|
||||||
# Replace all booleans with false, this can give sensative info away based
|
|
||||||
# on what the key was in the data
|
|
||||||
false
|
|
||||||
else
|
|
||||||
.
|
|
||||||
end;
|
|
||||||
|
|
||||||
def scrub:
|
def scrub:
|
||||||
if type == "object" then
|
walk(
|
||||||
# Apply scrubbing to both keys and values
|
if type == "string" then
|
||||||
with_entries(.key |= scrub_key | .value |= scrub)
|
if test("^(([0-9]{1,3}\\.){3}[0-9]{1,3})$") then
|
||||||
elif type == "array" then
|
"1.1.1.1"
|
||||||
# Keep only 2 elements, but scrub *those* elements
|
elif test("^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$") then
|
||||||
.[:2] | map(scrub)
|
"2000:0000:0000:0000:0000:0000:0000:0000"
|
||||||
else
|
elif test("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$") then
|
||||||
# Scrub a primitive value
|
"not_a_real_email@example.com"
|
||||||
scrub_primitive
|
elif test("\\.(jpg|jpeg|png|gif|bmp|webp|svg|ico|tiff|mp3|wav|flac|aac|ogg|wma|m4a|mp4|avi|mkv|mov|wmv|flv|webm)$"; "i") then
|
||||||
end;
|
# Leave these alone, you will have to manually go through these later and replace with
|
||||||
|
# placeholders
|
||||||
# Call scrub
|
# TODO: jq 1.7 adds debug(), use this instead when I can upgrade jq, otherwise
|
||||||
scrub
|
# you need to manually grep for MANUAL REPAIR NEEDED for now
|
||||||
|
("MANUAL REPAIR NEEDED: \(.)" | stderr) | .
|
||||||
|
elif test("://") then
|
||||||
|
"url://somewhere"
|
||||||
|
elif test("/") then
|
||||||
|
"some/path"
|
||||||
|
else
|
||||||
|
"xxx"
|
||||||
|
end
|
||||||
|
elif type == "number" then
|
||||||
|
if 946702800 <= . and . <= 1893474000 then
|
||||||
|
# Take modulo 1 year to get variance in the output, then add offset to bring to ~2024
|
||||||
|
((((. % 31557600) + 1704067200) / 5000 | floor) * 5000)
|
||||||
|
else
|
||||||
|
69
|
||||||
|
end
|
||||||
|
elif type == "array" then
|
||||||
|
# Keep only 2 elements, but scrub *those* elements
|
||||||
|
if length > 1 then
|
||||||
|
[ (.[0] | scrub), (.[1] | scrub) ]
|
||||||
|
elif length > 0 then
|
||||||
|
[ (.[0] | scrub) ]
|
||||||
|
else
|
||||||
|
[]
|
||||||
|
end
|
||||||
|
else
|
||||||
|
.
|
||||||
|
end
|
||||||
|
);
|
||||||
|
scrub
|
||||||
|
|
@ -27,6 +27,9 @@ assert(targetDir, "Usage: ./scrub.ts <directory>");
|
||||||
|
|
||||||
const targetPath = path.resolve(targetDir);
|
const targetPath = path.resolve(targetDir);
|
||||||
|
|
||||||
|
// const stat = await fs.stat(targetPath);
|
||||||
|
// assert(stat.isDirectory(), "");
|
||||||
|
|
||||||
const [notADir] = await ptry($`test -d ${targetPath}`);
|
const [notADir] = await ptry($`test -d ${targetPath}`);
|
||||||
assert(!notADir, `Error: '${targetPath}' is not a directory`);
|
assert(!notADir, `Error: '${targetPath}' is not a directory`);
|
||||||
|
|
||||||
|
|
@ -46,16 +49,12 @@ console.log("filePaths", filePaths);
|
||||||
for (const file of filePaths) {
|
for (const file of filePaths) {
|
||||||
console.log(`Processing: ${file}`);
|
console.log(`Processing: ${file}`);
|
||||||
const tmpFile = `${file}.tmp`;
|
const tmpFile = `${file}.tmp`;
|
||||||
const piiFile = `${file}.DELETE-THIS-HAS-PII`;
|
|
||||||
|
|
||||||
const [jqErr] = await ptry($`jq -f ${scrubJq} ${file} > ${tmpFile}`);
|
const [jqErr] = await ptry($`jq -f ${scrubJq} ${file} > ${tmpFile}`);
|
||||||
assert(!jqErr, `Error processing ${file}: ${jqErr}`);
|
assert(!jqErr, `Error processing ${file}: ${jqErr}`);
|
||||||
|
|
||||||
const [mvErr] = await ptry($`mv ${file} ${piiFile}`);
|
const [mvErr] = await ptry($`mv ${tmpFile} ${file}`);
|
||||||
assert(!mvErr, `Error moving ${file} to ${piiFile}: ${mvErr}`);
|
assert(!mvErr, `Error moving ${tmpFile} to ${file}: ${mvErr}`);
|
||||||
|
|
||||||
const [mv2Err] = await ptry($`mv ${tmpFile} ${file}`);
|
|
||||||
assert(!mv2Err, `Error moving ${tmpFile} to ${file}: ${mv2Err}`);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log();
|
console.log();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue