Compare commits
4 commits
9c3bdaa100
...
c093fbfcee
| Author | SHA1 | Date | |
|---|---|---|---|
| c093fbfcee | |||
| 7d815833e6 | |||
| a4fbe1618d | |||
| f6d0427a45 |
60 changed files with 5399 additions and 1421 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
|
@ -1,4 +1,5 @@
|
||||||
node_modules/
|
node_modules/
|
||||||
your.db
|
*.db
|
||||||
your.csv
|
your.csv
|
||||||
.gitSAFE
|
.gitSAFE
|
||||||
|
*.DELETE-THIS-HAS-PII
|
||||||
25
README.md
Normal file
25
README.md
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
# base-data-manager
|
||||||
|
|
||||||
|
A Typescript project for parsing through many types of data exports to tabular formats
|
||||||
|
|
||||||
|
** This is heavily WIP, and mostly just a toy for myself **
|
||||||
|
|
||||||
|
### Installation
|
||||||
|
|
||||||
|
* Install `jq`
|
||||||
|
* Install sqlite `csv.so` extension (Hardcoded to `/home/cobertos/sqlite-files/` currently)
|
||||||
|
* Install `node` + `pnpm i`
|
||||||
|
* See `main.ts` for current example usage
|
||||||
|
|
||||||
|
|
||||||
|
### Proposed Architecture
|
||||||
|
|
||||||
|
The architecture runs in 2 steps.
|
||||||
|
|
||||||
|
The first step is unopinionated in it's output format. It's meant to take the source data exactly as-is and output it as csv. All source data should pass through, but will be normalized in csv
|
||||||
|
|
||||||
|
**TODO: It's not completely unopinionated, there is some normalization for names of columns I think we want to apply? Or maybe we apply that later...**
|
||||||
|
|
||||||
|
An optional second step combines everything into a single SQLite database. From here we normalize many different types of data across multiple exports into a single opinionated output. For example, message threads/channels should all have the same table format, or end up in the same table
|
||||||
|
|
||||||
|
**TODO: No idea if the second part should be a part of this project... but it currently is**
|
||||||
File diff suppressed because it is too large
Load diff
|
|
@ -1,105 +1,115 @@
|
||||||
import { TaskTargetPipelineHelper } from "./task.ts";
|
import { pipe, branch, cmd, assignMeta, cd, glob, read, branchGen, type PipelineOp } from "./task.ts";
|
||||||
import { htmlSelectorChunkedDuplex } from "./html.ts";
|
import { htmlSelectorChunkedDuplex } from "./html.ts";
|
||||||
|
|
||||||
export function google(this: TaskTargetPipelineHelper){
|
export function google(){
|
||||||
const p = this.assignMeta({ idValue: t=>`Google - ${t.basename}` }); // Generic ID for everything in here
|
return pipe(
|
||||||
const col: Set<TaskTargetPipelineHelper> = new Set();
|
// Generic ID for everything in here
|
||||||
|
assignMeta({ idValue: t=>`Google - ${t.basename}` }),
|
||||||
// TODO: There is a root takeout folder
|
branchGen(function*() {
|
||||||
|
// TODO: There is a root takeout folder
|
||||||
|
|
||||||
|
|
||||||
p.collect(col).cd('Access Log Activity/Activities - A list of Google services accessed by.csv').read()
|
yield pipe(cd('Access Log Activity/Activities - A list of Google services accessed by.csv'), read())
|
||||||
p.collect(col).cd('Devices - A list of devices (i.e. Nest, Pixel, iPh.csv').read()
|
yield pipe(cd('Devices - A list of devices (i.e. Nest, Pixel, iPh.csv'), read())
|
||||||
|
|
||||||
// Assignments - data was empty
|
// Assignments - data was empty
|
||||||
// Business messages - GMB messages, there's some but so far outside of what I want
|
// Business messages - GMB messages, there's some but so far outside of what I want
|
||||||
// TODO: Calendar, exports an .ics
|
// TODO: Calendar, exports an .ics
|
||||||
|
|
||||||
// a = t.fork().cd(`Chrome`)
|
// a = t.fork().cd(`Chrome`)
|
||||||
// TODO: Assersses and mode.json
|
// TODO: Assersses and mode.json
|
||||||
// TODO: Bookmarks.csv
|
// TODO: Bookmarks.csv
|
||||||
// TODO: Device Information.json
|
// TODO: Device Information.json
|
||||||
// TODO: Dictionary.csv
|
// TODO: Dictionary.csv
|
||||||
// TODO: ...
|
// TODO: ...
|
||||||
p.collect(col).cd('Chrome/History.json')
|
yield pipe(
|
||||||
.read()
|
cd('Chrome/History.json'),
|
||||||
// TODO: Typed Url", no data
|
read(),
|
||||||
// TODO: "session", complex data
|
// TODO: Typed Url", no data
|
||||||
// Omitted .ptoken and .client_id for now. I think ptoken is maybe for the history API? client_id is base64 something...
|
// TODO: "session", complex data
|
||||||
// TODO: time_usec IS WRONG!! Needs to be ms
|
// Omitted .ptoken and .client_id for now. I think ptoken is maybe for the history API? client_id is base64 something...
|
||||||
.cmd(["jq", "-r", `["favicon_url","page_transition","title","url","time_usec"],
|
// TODO: time_usec IS WRONG!! Needs to be ms
|
||||||
(
|
cmd(["jq", "-r", `["favicon_url","page_transition","title","url","time_usec"],
|
||||||
."Browser History"[]
|
(
|
||||||
| [.favicon_url, .page_transition, .title, .url, (.time_usec | todateiso8601)]
|
."Browser History"[]
|
||||||
)
|
| [.favicon_url, .page_transition, .title, .url, (.time_usec | todateiso8601)]
|
||||||
| @csv`])
|
)
|
||||||
|
| @csv
|
||||||
|
`])
|
||||||
|
);
|
||||||
|
|
||||||
// TODO: Contactss, exports an .vcf
|
// TODO: Contactss, exports an .vcf
|
||||||
// TODO: ...
|
// TODO: ...
|
||||||
|
|
||||||
// a = t.fork().cd(`Google Pay`)
|
// a = t.fork().cd(`Google Pay`)
|
||||||
p.collect(col).cd(`Google Pay/Google transactions`).glob(`transactions_*.csv`)
|
yield pipe(
|
||||||
.read()
|
cd(`Google Pay/Google transactions`),
|
||||||
// .fork("a").cd(`Money sends and requests`)
|
glob(`transactions_*.csv`),
|
||||||
// .fork().cd(`Money sends and requests.csv`)
|
read(),
|
||||||
// .read()
|
// .fork("a").cd(`Money sends and requests`)
|
||||||
// .cmd(t=>["sqlite-utils", "insert", "your.db", t.basename, "-", "--csv", "--detect-types"])
|
// .fork().cd(`Money sends and requests.csv`)
|
||||||
// TODO: One more folder, and it only has a pdf
|
// .read()
|
||||||
|
// .cmd(t=>["sqlite-utils", "insert", "your.db", t.basename, "-", "--csv", "--detect-types"])
|
||||||
|
// TODO: One more folder, and it only has a pdf
|
||||||
|
);
|
||||||
|
|
||||||
// TODO: Google Play Movies _ TV - no data
|
// TODO: Google Play Movies _ TV - no data
|
||||||
// TODO: ...
|
// TODO: ...
|
||||||
|
|
||||||
p.collect(col).cd("Location History/Location History.json")
|
yield pipe(
|
||||||
.read()
|
cd("Location History/Location History.json"),
|
||||||
// TODO: This is missing
|
read(),
|
||||||
// "altitude" : 158,
|
// TODO: This is missing
|
||||||
// "verticalAccuracy" : 68
|
// "altitude" : 158,
|
||||||
// and the activity models. I had no idea google tries to determine if I'm "tilting"
|
// "verticalAccuracy" : 68
|
||||||
.cmd(["jq", "-r", `["timestamp","latitudeE7","longitudeE7","accuracy"],
|
// and the activity models. I had no idea google tries to determine if I'm "tilting"
|
||||||
(
|
cmd(["jq", "-r", `["timestamp","latitudeE7","longitudeE7","accuracy"],
|
||||||
.locations[]
|
(
|
||||||
| [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy]
|
.locations[]
|
||||||
)
|
| [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy]
|
||||||
| @csv`])
|
)
|
||||||
// There's also the semantic history but that's an entire nother can of worms
|
| @csv
|
||||||
// it seems like
|
`])
|
||||||
|
);
|
||||||
|
// There's also the semantic history but that's an entire nother can of worms
|
||||||
|
// it seems like
|
||||||
|
|
||||||
// TODO: Needs no-headers!
|
// TODO: Needs no-headers!
|
||||||
// a = t.fork().cd(`My Activity`)
|
// a = t.fork().cd(`My Activity`)
|
||||||
// a.fork().glob(`**/MyActivity.html`)
|
// a.fork().glob(`**/MyActivity.html`)
|
||||||
// .setId(t=>`Google - ${t.basenameN(2)}`)
|
// .setId(t=>`Google - ${t.basenameN(2)}`)
|
||||||
// .read()
|
// .read()
|
||||||
// .pipe(()=>{
|
// .pipe(()=>{
|
||||||
// // Parses the MyActivity format, chunking it into pieces of HTML text
|
// // Parses the MyActivity format, chunking it into pieces of HTML text
|
||||||
// // and then parsing out the text
|
// // and then parsing out the text
|
||||||
// const dup = htmlSelectorChunkedDuplex(
|
// const dup = htmlSelectorChunkedDuplex(
|
||||||
// (tag, attrs)=>{
|
// (tag, attrs)=>{
|
||||||
// // TODO: We also probably want to get and parse each
|
// // TODO: We also probably want to get and parse each
|
||||||
// // ".content-cell.mdl-typography--caption" as well (it
|
// // ".content-cell.mdl-typography--caption" as well (it
|
||||||
// // has location for websearches and sometimes a details field)
|
// // has location for websearches and sometimes a details field)
|
||||||
// // but then we have to get ".mdl-grid" and parse it
|
// // but then we have to get ".mdl-grid" and parse it
|
||||||
// return attrs.class?.includes("content-cell")
|
// return attrs.class?.includes("content-cell")
|
||||||
// && attrs.class?.includes("mdl-typography--body-1")
|
// && attrs.class?.includes("mdl-typography--body-1")
|
||||||
// && !attrs.class?.includes("mdl-typography--text-right")
|
// && !attrs.class?.includes("mdl-typography--text-right")
|
||||||
// },
|
// },
|
||||||
// (chunk)=>{
|
// (chunk)=>{
|
||||||
// const text = chunk.innerText;
|
// const text = chunk.innerText;
|
||||||
// const split = text.split("\n");
|
// const split = text.split("\n");
|
||||||
// const timestamp = split.pop(); // TODO: need to parse this
|
// const timestamp = split.pop(); // TODO: need to parse this
|
||||||
// const rest = split.join("\n");
|
// const rest = split.join("\n");
|
||||||
// // TODO: Escape instead of replace
|
// // TODO: Escape instead of replace
|
||||||
// const restSafe = rest.replace(/"/g, "'").replace(/\n/g,"\\n"); // escape newlines and quotes
|
// const restSafe = rest.replace(/"/g, "'").replace(/\n/g,"\\n"); // escape newlines and quotes
|
||||||
// // Return a CSV
|
// // Return a CSV
|
||||||
// return `"${restSafe}","${timestamp}"\n`;
|
// return `"${restSafe}","${timestamp}"\n`;
|
||||||
// }
|
// }
|
||||||
// );
|
// );
|
||||||
// return dup;
|
// return dup;
|
||||||
// })
|
// })
|
||||||
|
|
||||||
// TODO: News
|
// TODO: News
|
||||||
// TODO: Profile
|
// TODO: Profile
|
||||||
// TODO: Tasks - No data
|
// TODO: Tasks - No data
|
||||||
|
})
|
||||||
return Array.from(col);
|
);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
||||||
52
data-export/io.ts
Normal file
52
data-export/io.ts
Normal file
|
|
@ -0,0 +1,52 @@
|
||||||
|
import fs from 'node:fs/promises';
|
||||||
|
import fsSync from 'node:fs';
|
||||||
|
import { DatabaseSync } from "node:sqlite";
|
||||||
|
import { type ProcessOutputAggregate, type RunOutput, TaskTarget, runAll, type ProcessOutputSimple } from "./task.ts";
|
||||||
|
import { ProcessOutput } from 'zx';
|
||||||
|
|
||||||
|
|
||||||
|
async function loadCSVTable(
|
||||||
|
db: DatabaseSync,
|
||||||
|
target: TaskTarget,
|
||||||
|
result: ProcessOutput | ProcessOutputAggregate | ProcessOutputSimple
|
||||||
|
) {
|
||||||
|
const id = target.id;
|
||||||
|
const table = id;
|
||||||
|
const tmpPath = `/tmp/${id}.csv`;
|
||||||
|
// console.log(`Writing ${tmpPath}`);
|
||||||
|
const fd = await fs.open(tmpPath, 'w');
|
||||||
|
await fs.writeFile(fd, result.stdout, { encoding: 'utf8' });
|
||||||
|
await fd.close();
|
||||||
|
// console.log(`Loading ${tmpPath} → table ${table}`);
|
||||||
|
|
||||||
|
db.exec(`CREATE VIRTUAL TABLE temp.intermediate USING csv(filename='${tmpPath}', header);`);
|
||||||
|
db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`);
|
||||||
|
db.exec(`DROP TABLE IF EXISTS intermediate;`);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
// TODO: This should really have the same name throughout the codebase?
|
||||||
|
export const runPipeline = runAll;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @param db Must be a DatabaseSync with the csv.so extension enabled
|
||||||
|
*/
|
||||||
|
export async function loadIntoDb(db: DatabaseSync, runOutput: RunOutput[]) {
|
||||||
|
for (const {result, target} of runOutput) {
|
||||||
|
await loadCSVTable(db, target, result);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
export function getDefaultDB(): DatabaseSync {
|
||||||
|
const db = new DatabaseSync(":memory:", { allowExtension: true });
|
||||||
|
db.loadExtension("/home/cobertos/sqlite-files/csv.so")
|
||||||
|
db.enableLoadExtension(false);
|
||||||
|
return db;
|
||||||
|
}
|
||||||
|
export async function dumpDBToDisk(db: DatabaseSync, dumpPath: string) {
|
||||||
|
if (fsSync.existsSync(dumpPath)) {
|
||||||
|
await fs.unlink(dumpPath); // unlink the old
|
||||||
|
}
|
||||||
|
|
||||||
|
// Dump it all to the path specified
|
||||||
|
db.exec(`VACUUM main INTO '${dumpPath}'`);
|
||||||
|
}
|
||||||
|
|
@ -1,15 +1,18 @@
|
||||||
import { $, type ProcessOutput } from 'zx';
|
|
||||||
import os from 'os';
|
import os from 'os';
|
||||||
import { type TaskTarget, run } from "./task.ts";
|
|
||||||
|
|
||||||
$.verbose = false;
|
/**Generic parallel runner with optional logging
|
||||||
|
* Runs `targets` with `runFn` up to a maximum of `maxConcurrency` amount at a time
|
||||||
export async function parallel(
|
* Shaped in a way that expects generally something that returns zx.ProcessOutput (or
|
||||||
targets: TaskTarget[],
|
* something with .duration and .ok built-in to the return)
|
||||||
|
* @param runFn Should NOT throw. Return { ok: false } instead
|
||||||
|
*/
|
||||||
|
export async function parallel<T, R extends { duration: number, ok: boolean }>(
|
||||||
|
targets: T[],
|
||||||
|
runFn: (t: T)=>Promise<R>,
|
||||||
quiet: boolean = false,
|
quiet: boolean = false,
|
||||||
maxConcurrency: number = os.cpus().length
|
maxConcurrency: number = os.cpus().length
|
||||||
): Promise<ProcessOutput[]> {
|
): Promise<R[]> {
|
||||||
const resultMap = new Map<string, ProcessOutput>();
|
const resultMap = new Map<T, R>();
|
||||||
|
|
||||||
const total = targets.length;
|
const total = targets.length;
|
||||||
let completed = 0;
|
let completed = 0;
|
||||||
|
|
@ -40,14 +43,14 @@ export async function parallel(
|
||||||
process.stderr.write(`\r${formatEta()}`.padEnd(80));
|
process.stderr.write(`\r${formatEta()}`.padEnd(80));
|
||||||
}
|
}
|
||||||
|
|
||||||
async function runJob(t: TaskTarget): Promise<void> {
|
async function runJob(t: T): Promise<void> {
|
||||||
running++;
|
running++;
|
||||||
printStatus();
|
printStatus();
|
||||||
|
|
||||||
const result = await run(t);
|
const result = await runFn(t);
|
||||||
completionTimes.push(result.duration);
|
completionTimes.push(result.duration);
|
||||||
|
|
||||||
resultMap.set(t.id, result);
|
resultMap.set(t, result);
|
||||||
|
|
||||||
running--;
|
running--;
|
||||||
completed++;
|
completed++;
|
||||||
|
|
@ -76,13 +79,15 @@ export async function parallel(
|
||||||
process.stderr.write('\n');
|
process.stderr.write('\n');
|
||||||
const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1);
|
const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1);
|
||||||
const failed = Array.from(resultMap.values().filter(p => !p.ok));
|
const failed = Array.from(resultMap.values().filter(p => !p.ok));
|
||||||
process.stderr.write(
|
if (!quiet) {
|
||||||
`\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n`
|
process.stderr.write(
|
||||||
);
|
`\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
const output = targets
|
const output = targets
|
||||||
.map(t => {
|
.map(t => {
|
||||||
const r = resultMap.get(t.id)!;
|
const r = resultMap.get(t)!;
|
||||||
return r;
|
return r;
|
||||||
});
|
});
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -3,7 +3,10 @@ import fs from 'node:fs';
|
||||||
import { strict as assert } from "node:assert";
|
import { strict as assert } from "node:assert";
|
||||||
import { ZipFS } from "./zipFs.ts";
|
import { ZipFS } from "./zipFs.ts";
|
||||||
import { globSync } from "glob";
|
import { globSync } from "glob";
|
||||||
import { $, ProcessPromise, quote } from "zx";
|
import { $, ProcessOutput, quote } from "zx";
|
||||||
|
import { parallel } from "./parallel.ts";
|
||||||
|
|
||||||
|
$.verbose = false;
|
||||||
|
|
||||||
type FSImpl = {
|
type FSImpl = {
|
||||||
isZip?: boolean;
|
isZip?: boolean;
|
||||||
|
|
@ -38,19 +41,20 @@ function safe(s: string) {
|
||||||
|
|
||||||
interface TaskTargetOp {
|
interface TaskTargetOp {
|
||||||
type: "read" | "mid";
|
type: "read" | "mid";
|
||||||
toShell(target: TaskTarget): string;
|
toShell(target: TaskTarget): string | undefined;
|
||||||
clone(): TaskTargetOp;
|
clone(): TaskTargetOp;
|
||||||
}
|
}
|
||||||
class TaskTargetRead implements TaskTargetOp {
|
class TaskTargetRead implements TaskTargetOp {
|
||||||
get type(){ return "read" as const; }
|
get type(){ return "read" as const; }
|
||||||
toShell(target: TaskTarget) {
|
toShell(target: TaskTarget) {
|
||||||
if (target.fsImpl.isZip) {
|
if (target.fsImpl.isZip) {
|
||||||
|
// Read the file to stdout from the target inside the zip file
|
||||||
|
// This relies on the internals of fsImpl a bit to have the path to
|
||||||
|
// the root zip so we can create a command against it
|
||||||
assert(target.fsImpl.zipPath, "Should have a zipPath");
|
assert(target.fsImpl.zipPath, "Should have a zipPath");
|
||||||
// We need to be able to do this
|
|
||||||
return `7z x ${quote(target.fsImpl.zipPath)} -so ${quote(target.path)}`;
|
return `7z x ${quote(target.fsImpl.zipPath)} -so ${quote(target.path)}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
// TODO : Implement when reading from a zip file
|
|
||||||
return `cat ${quote(target.path)}`;
|
return `cat ${quote(target.path)}`;
|
||||||
}
|
}
|
||||||
clone() {
|
clone() {
|
||||||
|
|
@ -115,19 +119,10 @@ export const COLUMN_TYPES = {
|
||||||
"TODO": {}
|
"TODO": {}
|
||||||
};
|
};
|
||||||
|
|
||||||
// // if (type === "numeric") {
|
|
||||||
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
|
||||||
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
|
||||||
// // }
|
|
||||||
// // else {
|
|
||||||
// // queryLine = `count(*) as n`;
|
|
||||||
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
|
||||||
// // }
|
|
||||||
|
|
||||||
/**Column metadata. Just a string into the TYPES*/
|
/**Column metadata. Just a string into the TYPES*/
|
||||||
type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined);
|
type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined);
|
||||||
// Make non-optional version of just the metadata values of TaskTarget
|
// Make non-optional version of just the metadata values of TaskTarget
|
||||||
type TaskTargetMeta = Required<Pick<TaskTarget, "idValue" | "perRowDescription" | "perRowTags" | "columnMeta">>;
|
type TaskTargetMeta = Required<Pick<TaskTarget, "idValue" | "perRowDescription" | "perRowTags" | "columnMeta" | "aggregate" | "metaIdValue" | "aggregateColumns">>;
|
||||||
|
|
||||||
export class TaskTarget {
|
export class TaskTarget {
|
||||||
/**The current path pointed to by this TaskTarget*/
|
/**The current path pointed to by this TaskTarget*/
|
||||||
|
|
@ -149,15 +144,16 @@ export class TaskTarget {
|
||||||
* you might do something like '"{3}" sent from {2} to {1}'
|
* you might do something like '"{3}" sent from {2} to {1}'
|
||||||
* */
|
* */
|
||||||
perRowDescription?: string;
|
perRowDescription?: string;
|
||||||
/**For every output CSV, this defines a SQL expression evaluated per-row that
|
/**A CSV of tags that is added to every row of the table (TODO: no template functionality currently)*/
|
||||||
* returns a comma-separated string of tags to assign to that row.
|
|
||||||
* Use the items {0}, {1} to template column values, same as perRowDescription.
|
|
||||||
* Example: A static set of tags: "'me,facebook'"
|
|
||||||
* Example: Tags derived from a column: "'facebook,' || {2}"
|
|
||||||
* */
|
|
||||||
perRowTags?: string;
|
perRowTags?: string;
|
||||||
/**Metadata about the columns*/
|
/**Metadata about the columns*/
|
||||||
columnMeta?: ColumnMeta[];
|
columnMeta?: ColumnMeta[];
|
||||||
|
/**Whether or not to aggregate to a single task (everything with the id value idValue)*/
|
||||||
|
aggregate?: boolean;
|
||||||
|
/**Names of the columns to aggregate with*/
|
||||||
|
aggregateColumns?: string[];
|
||||||
|
/**A metadata TaskTarget for this TaskTarget, if one exists*/
|
||||||
|
metaIdValue?: ValidId;
|
||||||
|
|
||||||
constructor(path: string){
|
constructor(path: string){
|
||||||
this.path = path;
|
this.path = path;
|
||||||
|
|
@ -194,6 +190,15 @@ export class TaskTarget {
|
||||||
}
|
}
|
||||||
return safe(this.idValue);
|
return safe(this.idValue);
|
||||||
}
|
}
|
||||||
|
get metaId() {
|
||||||
|
if (!this.metaIdValue) {
|
||||||
|
return undefined;
|
||||||
|
}
|
||||||
|
if (typeof this.metaIdValue === "function") {
|
||||||
|
return safe(this.metaIdValue(this));
|
||||||
|
}
|
||||||
|
return safe(this.metaIdValue);
|
||||||
|
}
|
||||||
|
|
||||||
/**Changes the current directory of the target*/
|
/**Changes the current directory of the target*/
|
||||||
cd(path: string): TaskTarget {
|
cd(path: string): TaskTarget {
|
||||||
|
|
@ -233,6 +238,9 @@ export class TaskTarget {
|
||||||
t.perRowDescription = this.perRowDescription;
|
t.perRowDescription = this.perRowDescription;
|
||||||
t.perRowTags = this.perRowTags;
|
t.perRowTags = this.perRowTags;
|
||||||
t.columnMeta = this.columnMeta?.slice();
|
t.columnMeta = this.columnMeta?.slice();
|
||||||
|
t.metaIdValue = this.metaIdValue;
|
||||||
|
t.aggregate = this.aggregate;
|
||||||
|
t.aggregateColumns = this.aggregateColumns?.slice();
|
||||||
return t;
|
return t;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -247,6 +255,7 @@ export class TaskTarget {
|
||||||
toShell() {
|
toShell() {
|
||||||
const shell = this.pipeline
|
const shell = this.pipeline
|
||||||
.map(p => p.toShell(this))
|
.map(p => p.toShell(this))
|
||||||
|
.filter(p => !!p) // remove empty strings and undefined
|
||||||
.join(" | ")
|
.join(" | ")
|
||||||
return shell;
|
return shell;
|
||||||
}
|
}
|
||||||
|
|
@ -269,42 +278,72 @@ export class TaskTarget {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export function each(targets: TaskTarget[], fn: (t: TaskTarget)=>void) {
|
export interface PipelineOp {
|
||||||
for (const t of targets) {
|
(targets: TaskTarget[]): TaskTarget[] | Promise<TaskTarget[]>;
|
||||||
fn(t);
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
export function map(targets: TaskTarget[], fn: (t: TaskTarget)=>TaskTarget) {
|
|
||||||
const newTargets = [];
|
export function cd(path: string): PipelineOp {
|
||||||
for (const t of targets) {
|
return (targets: TaskTarget[]) => targets.map(t => t.clone().cd(path));
|
||||||
newTargets.push(fn(t));
|
|
||||||
}
|
|
||||||
return newTargets;
|
|
||||||
}
|
}
|
||||||
export function cd(targets: TaskTarget[], path: string): TaskTarget[] {
|
export function glob(globPath: string): PipelineOp {
|
||||||
return targets.map(t => t.clone().cd(path));
|
return (targets: TaskTarget[]) => targets.map(t => t.glob(globPath)).flat();
|
||||||
}
|
}
|
||||||
export function glob(targets: TaskTarget[], globPath: string): TaskTarget[] {
|
export function unzip(): PipelineOp {
|
||||||
return targets.map(t => t.glob(globPath)).flat();
|
return async (targets: TaskTarget[]) => Promise.all(targets.map(t => t.unzip()));
|
||||||
}
|
}
|
||||||
export async function unzip(targets: TaskTarget[]): Promise<TaskTarget[]> {
|
export function read(): PipelineOp {
|
||||||
return Promise.all(targets.map(t => t.unzip()));
|
return (targets: TaskTarget[]) => targets.map(t => t.clone().read())
|
||||||
}
|
}
|
||||||
export function read(targets: TaskTarget[]): TaskTarget[] {
|
export function cmd(cmd: ValidCmd): PipelineOp {
|
||||||
return targets.map(t => t.clone().read())
|
return (targets: TaskTarget[]) => targets.map(t => t.clone().cmd(cmd))
|
||||||
}
|
}
|
||||||
export function cmd(targets: TaskTarget[], cmd: ValidCmd): TaskTarget[] {
|
export function assignMeta(meta: Partial<TaskTargetMeta>): PipelineOp {
|
||||||
return targets.map(t => t.clone().cmd(cmd))
|
return (targets: TaskTarget[]) => targets.map(t => t.clone().assignMeta(meta))
|
||||||
}
|
}
|
||||||
export function assignMeta(targets: TaskTarget[], meta: Partial<TaskTargetMeta>): TaskTarget[] {
|
|
||||||
return targets.map(t => t.clone().assignMeta(meta))
|
export function each(fn: (t: TaskTarget)=>TaskTarget): PipelineOp {
|
||||||
|
return (targets: TaskTarget[])=> targets.map(fn);
|
||||||
}
|
}
|
||||||
|
export function pipe(...ops: PipelineOp[]): PipelineOp {
|
||||||
|
return async (targets: TaskTarget[]) => {
|
||||||
|
for (const op of ops) {
|
||||||
|
targets = await op(targets);
|
||||||
|
}
|
||||||
|
return targets;
|
||||||
|
};
|
||||||
|
}
|
||||||
|
export function branch(...ops: PipelineOp[]): PipelineOp {
|
||||||
|
return async (targets: TaskTarget[]) => {
|
||||||
|
const targetsArrays = await Promise.all(ops.map(op => op(targets)));
|
||||||
|
return targetsArrays.flat();
|
||||||
|
};
|
||||||
|
}
|
||||||
|
export function branchGen(genFn: ()=>Generator<PipelineOp>): PipelineOp {
|
||||||
|
const opsToBranch = Array.from(genFn());
|
||||||
|
return (targets: TaskTarget[]) => {
|
||||||
|
return branch(...opsToBranch)(targets);
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function execPaths(entries: ({path: string, op: PipelineOp })[]) {
|
||||||
|
return (await Promise.all(
|
||||||
|
// Map every entry path into a TaskTarget and run the PipelineOp with
|
||||||
|
// that TaskTarget
|
||||||
|
entries
|
||||||
|
.map(async ({path,op})=>{
|
||||||
|
const targets = [new TaskTarget(path)];
|
||||||
|
return await op(targets);
|
||||||
|
})
|
||||||
|
)).flat();
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**Verify, anything that fails is skipped and throws an error*/
|
/**Verify, anything that fails is skipped and throws an error*/
|
||||||
export async function verify(targets: TaskTarget[]) {
|
export async function verify(targets: TaskTarget[]) {
|
||||||
const outTargets: TaskTarget[] = [];
|
const outTargets: TaskTarget[] = [];
|
||||||
for (const t of targets) {
|
for (const t of targets) {
|
||||||
// Make sure fsImpl is ready
|
// Make sure fsImpl is ready
|
||||||
|
// TODO: DO NOT PUT THIS IN VERIFY, this should go somewhere in the task building stuff...
|
||||||
if ("ready" in t.fsImpl && !t.fsImpl.ready && t.fsImpl.init) {
|
if ("ready" in t.fsImpl && !t.fsImpl.ready && t.fsImpl.init) {
|
||||||
await t.fsImpl.init();
|
await t.fsImpl.init();
|
||||||
}
|
}
|
||||||
|
|
@ -319,78 +358,133 @@ export async function verify(targets: TaskTarget[]) {
|
||||||
|
|
||||||
outTargets.push(t);
|
outTargets.push(t);
|
||||||
}
|
}
|
||||||
|
|
||||||
return outTargets;
|
return outTargets;
|
||||||
}
|
}
|
||||||
|
|
||||||
function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) {
|
export interface ProcessOutputAggregate {
|
||||||
if (!a.__collection) {
|
stdout: string;
|
||||||
return;
|
stderr: string;
|
||||||
}
|
exitCodes: (number | null)[];
|
||||||
|
duration: number;
|
||||||
// Remove a, add b
|
ok: boolean;
|
||||||
const collection = a.__collection;
|
}
|
||||||
delete a.__collection;
|
export interface ProcessOutputSimple {
|
||||||
collection.delete(a);
|
stdout: string;
|
||||||
b.__collection = collection;
|
stderr: string;
|
||||||
collection.add(b);
|
exitCode: number;
|
||||||
|
duration: number;
|
||||||
|
ok: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
export class TaskTargetPipelineHelper extends Array<TaskTarget> {
|
function combineProcessOutputAggregate(poa: ProcessOutputAggregate | undefined, t: TaskTarget, po: ProcessOutput) {
|
||||||
__collection?: Set<TaskTargetPipelineHelper>;
|
if (!poa) {
|
||||||
|
assert(t.aggregateColumns, "aggregate TaskTarget must have aggregateColumns");
|
||||||
static pipeline(t: TaskTarget[]): TaskTargetPipelineHelper {
|
const headers = t.aggregateColumns.join(",") + "\n";
|
||||||
if (Object.getPrototypeOf(t) === TaskTargetPipelineHelper.prototype) {
|
return {
|
||||||
return t as any; // Already done
|
stdout: headers + po.stdout,
|
||||||
}
|
stderr: po.stderr,
|
||||||
Object.setPrototypeOf(t, TaskTargetPipelineHelper.prototype);
|
exitCodes: [po.exitCode],
|
||||||
return t as any;
|
duration: po.duration,
|
||||||
|
ok: po.ok
|
||||||
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
_fn(fn: (t: TaskTarget[])=>TaskTarget[]): TaskTargetPipelineHelper {
|
// Comes with a builtin "\n" from jq on stdout and stderr, no need to add
|
||||||
const p = TaskTargetPipelineHelper.pipeline(this);
|
// a trailing one
|
||||||
const t = fn(p);
|
poa.stdout += po.stdout;
|
||||||
const p2 = TaskTargetPipelineHelper.pipeline(t);
|
poa.stderr += po.stderr;
|
||||||
collectionSwap(p, p2); // Move collection pointer to the new item, ends always end up in the collection
|
poa.exitCodes.push(po.exitCode);
|
||||||
return p2;
|
poa.duration += po.duration;
|
||||||
}
|
poa.ok &&= po.ok;
|
||||||
async _afn(fn: (t: TaskTarget[])=>Promise<TaskTarget[]>): Promise<TaskTargetPipelineHelper> {
|
return poa;
|
||||||
const p = TaskTargetPipelineHelper.pipeline(this);
|
|
||||||
const t = await fn(p);
|
|
||||||
const p2 = TaskTargetPipelineHelper.pipeline(t);
|
|
||||||
collectionSwap(p, p2); // Move collection pointer to the new item, ends always end up in the collection
|
|
||||||
return p2;
|
|
||||||
}
|
|
||||||
|
|
||||||
cd(path: string): TaskTargetPipelineHelper {
|
|
||||||
return this._fn(t => cd(t, path));
|
|
||||||
}
|
|
||||||
glob(globPath: string): TaskTargetPipelineHelper {
|
|
||||||
return this._fn(t => glob(t, globPath));
|
|
||||||
}
|
|
||||||
async unzip(): Promise<TaskTargetPipelineHelper> {
|
|
||||||
return this._afn(unzip);
|
|
||||||
}
|
|
||||||
read(): TaskTargetPipelineHelper {
|
|
||||||
return this._fn(read);
|
|
||||||
}
|
|
||||||
cmd(_cmd: ValidCmd): TaskTargetPipelineHelper {
|
|
||||||
return this._fn(t => cmd(t, _cmd));
|
|
||||||
}
|
|
||||||
assignMeta(meta: Partial<TaskTargetMeta>): TaskTargetPipelineHelper {
|
|
||||||
return this._fn(t => assignMeta(t, meta));
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* @todo Nested versions of this don't currently work, but they could if we
|
|
||||||
* turn __collection into an array of collections
|
|
||||||
*/
|
|
||||||
collect(_c: Set<TaskTargetPipelineHelper>) {
|
|
||||||
this.__collection = _c;
|
|
||||||
return this;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function run(target: TaskTarget): Promise<ProcessPromise> {
|
export interface RunOutput {
|
||||||
|
target: TaskTarget,
|
||||||
|
result: ProcessOutput | ProcessOutputAggregate | ProcessOutputSimple
|
||||||
|
}
|
||||||
|
|
||||||
|
export async function run(target: TaskTarget): Promise<ProcessOutput> {
|
||||||
const command = target.toShell();
|
const command = target.toShell();
|
||||||
return await $({ nothrow: true })`bash -c ${command}`;
|
return await $({ nothrow: true })`bash -c ${command}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
export async function runAll(targets: TaskTarget[]): Promise<RunOutput[]> {
|
||||||
|
const finalTargets = await verify(targets);
|
||||||
|
const results = await parallel(finalTargets, run, true);
|
||||||
|
|
||||||
|
const nonAggregateTargets: TaskTarget[] = finalTargets.filter(t => !t.aggregate);
|
||||||
|
const nonAggregateResults: RunOutput[] = [];
|
||||||
|
const aggregateResultsMap: Record<string, RunOutput> = {};
|
||||||
|
|
||||||
|
// == Aggregate tables ==
|
||||||
|
// Some TaskTargets have .aggregate: true, which means they should all be combined
|
||||||
|
// into a single task with the id of the .id property
|
||||||
|
for (const [idx, r] of results.entries()) {
|
||||||
|
const t = finalTargets[idx];
|
||||||
|
if (!t.aggregate) {
|
||||||
|
nonAggregateResults.push({
|
||||||
|
target: t,
|
||||||
|
result: r
|
||||||
|
});
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const aggregateId = t.id;
|
||||||
|
const prevResult = aggregateResultsMap[aggregateId]?.result;
|
||||||
|
aggregateResultsMap[aggregateId] = {
|
||||||
|
target: t, // Use target t for metadata, so it will use the last target
|
||||||
|
result: combineProcessOutputAggregate(prevResult as (ProcessOutputAggregate | undefined), t, r)
|
||||||
|
};
|
||||||
|
}
|
||||||
|
|
||||||
|
// == Metadata table ==
|
||||||
|
// Each TaskTarget has things like perRowDescription and other things we want to store
|
||||||
|
// and output. this creates a single TaskTarget for all that perTable metadata
|
||||||
|
function csvEscape(s: string | undefined) {
|
||||||
|
if (s === undefined) {
|
||||||
|
return "";
|
||||||
|
}
|
||||||
|
if (s.includes("\"") || s.includes(",") || s.includes("\n")) {
|
||||||
|
return `"${s.replace(/\"/g, "\"\"")}"`;
|
||||||
|
}
|
||||||
|
return s;
|
||||||
|
}
|
||||||
|
let metadataCSV = "id,perRowDescription,perRowTags,columnMeta,metaId\n";
|
||||||
|
for (const t of nonAggregateTargets) {
|
||||||
|
const tableNamePart = t.id;
|
||||||
|
const perRowDescriptionPart = t.perRowDescription;
|
||||||
|
const perRowTagsPart = t.perRowTags;
|
||||||
|
const columnMetaPart = t.columnMeta?.join(",") ?? "";
|
||||||
|
const metaIdPart = t.metaId;
|
||||||
|
metadataCSV += [
|
||||||
|
csvEscape(tableNamePart),
|
||||||
|
csvEscape(perRowDescriptionPart),
|
||||||
|
csvEscape(perRowTagsPart),
|
||||||
|
csvEscape(columnMetaPart),
|
||||||
|
csvEscape(metaIdPart)
|
||||||
|
].join(",") + "\n";
|
||||||
|
}
|
||||||
|
// Won't be removed by verify() because we're adding it after that's used
|
||||||
|
// TODO: Would be nice to bake this into TaskTarget/verify for tasks that dont point
|
||||||
|
// to a real path
|
||||||
|
const metadataTarget = new TaskTarget("<none>");
|
||||||
|
metadataTarget
|
||||||
|
// id, perRowDescription, perRowTags, columnMeta, metaId
|
||||||
|
.assignMeta({
|
||||||
|
idValue: "base_data_manager_metadata",
|
||||||
|
columnMeta: ["any", "any", "any", "any", "any"],
|
||||||
|
perRowTags: "internal",
|
||||||
|
});
|
||||||
|
const metadataResult= {
|
||||||
|
stdout: metadataCSV,
|
||||||
|
stderr: "",
|
||||||
|
exitCode: 0,
|
||||||
|
duration: 0, // TODO
|
||||||
|
ok: true
|
||||||
|
};
|
||||||
|
const metadataRunOutput: RunOutput = { target: metadataTarget, result: metadataResult };
|
||||||
|
|
||||||
|
const aggregateResults: RunOutput[] = Object.values(aggregateResultsMap);
|
||||||
|
return aggregateResults.concat(nonAggregateResults).concat(metadataRunOutput);
|
||||||
|
}
|
||||||
242
main.ts
242
main.ts
|
|
@ -1,192 +1,90 @@
|
||||||
import fs from 'node:fs/promises';
|
import { type DatabaseSync } from "node:sqlite";
|
||||||
import fsSync from 'node:fs';
|
import { fileURLToPath } from "node:url";
|
||||||
import nodePath from "node:path";
|
|
||||||
import { DatabaseSync } from "node:sqlite";
|
|
||||||
import "./data-export/facebook.ts";
|
|
||||||
import { google } from "./data-export/google.ts";
|
import { google } from "./data-export/google.ts";
|
||||||
import { TaskTargetPipelineHelper, TaskTarget, verify } from "./data-export/task.ts";
|
import { facebook, facebook_v2 } from "./data-export/facebook.ts";
|
||||||
import { parallel } from "./data-export/parallel.ts";
|
import { type TaskTarget, execPaths } from "./data-export/task.ts";
|
||||||
import { ProcessOutput } from 'zx';
|
import * as DataIO from "./data-export/io.ts";
|
||||||
|
|
||||||
declare module "./data-export/task.ts" {
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
interface TaskTargetPipelineHelper {
|
|
||||||
google: typeof google;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Object.assign(TaskTargetPipelineHelper.prototype, {
|
export const startTime = Date.now();
|
||||||
google
|
export const elapsed = ()=>`${((Date.now() - startTime) / 1000).toFixed(2)}s`;
|
||||||
});
|
|
||||||
|
|
||||||
async function loadCSVTable(
|
export async function loadTaskInNewDb(targets: TaskTarget[]): Promise<DatabaseSync> {
|
||||||
db: DatabaseSync,
|
console.log(`${elapsed()} - Run all targets`);
|
||||||
target: TaskTarget,
|
const out = await DataIO.runPipeline(targets);
|
||||||
result: ProcessOutput
|
console.log(`${elapsed()} - Final targets exported to CSV. Got ${out.length} targets`);
|
||||||
) {
|
|
||||||
const id = target.id;
|
|
||||||
const table = id;
|
|
||||||
const tmpPath = `/tmp/${id}.csv`;
|
|
||||||
console.log(`Writing ${tmpPath}`);
|
|
||||||
const fd = await fs.open(tmpPath, 'w');
|
|
||||||
await fs.writeFile(fd, result.stdout, { encoding: 'utf8' });
|
|
||||||
await fd.close();
|
|
||||||
console.log(`Loading ${tmpPath} → table ${table}`);
|
|
||||||
|
|
||||||
// const headers = lines[0].split(",");
|
// TODO: Add an option to output everything plainly as CSV in a single directory
|
||||||
// const columnsSql = headers.map(h => `"${h}" TEXT`).join(", ");
|
|
||||||
db.exec(`CREATE VIRTUAL TABLE temp.tmp_${table} USING csv(filename='${tmpPath}');`);
|
console.log(`${elapsed()} - Building combined database table in :memory:`);
|
||||||
// db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`);
|
const db = DataIO.getDefaultDB();
|
||||||
// db.exec(`DROP TABLE IF EXISTS intermediate;`);
|
await DataIO.loadIntoDb(db, out);
|
||||||
return `tmp_${table}`;
|
|
||||||
}
|
const tableCount = db.prepare(`SELECT COUNT(*) as count FROM base_data_manager_metadata`).get()!.count;
|
||||||
function getColumnNames(db: DatabaseSync, tableName: string) {
|
console.log(`${elapsed()} - Single database built with ${tableCount} tables`);
|
||||||
return db.prepare(`PRAGMA table_info(${tableName})`).all().map(c => c.name) as string[];
|
|
||||||
}
|
return db;
|
||||||
function templateToSql(template: string, columns: string[]) {
|
|
||||||
// Convert '{0}, {1}' to '%s, %s'
|
|
||||||
const args: string[] = [];
|
|
||||||
const sqlTemplate = template.replace(/\{(\d+)\}/g, (match, index) => {
|
|
||||||
args.push(columns[parseInt(index)]);
|
|
||||||
return '%s';
|
|
||||||
});
|
|
||||||
return `printf('${sqlTemplate}', ${args.join(', ')})`;
|
|
||||||
}
|
|
||||||
function templateToSqlExpr(template: string, columns: string[]) {
|
|
||||||
// perRowTags is already a SQL expression; just substitute {N} with column names
|
|
||||||
return template.replace(/\{(\d+)\}/g, (_match, index) => columns[parseInt(index)]);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
|
// Configurable stuff
|
||||||
const sqlitePath = 'your.db';
|
const sqlitePath = 'your.db';
|
||||||
|
|
||||||
const t = TaskTargetPipelineHelper;
|
console.log(`${elapsed()} - Building targets`);
|
||||||
const targets = TaskTargetPipelineHelper.pipeline([
|
const targets = await execPaths([
|
||||||
// new TaskTarget("/home/cobertos/Seafile/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01"),
|
{path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
|
||||||
new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json"),
|
// {path: "/home/cobertos/Seafile/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01", op: facebook()}
|
||||||
//new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2();
|
// {path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip", op: pipe(unzip(), facebook_v2())}
|
||||||
//new TaskTarget("/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001").facebook_v2();
|
// {path: "/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001", op: facebook_v2()}
|
||||||
])
|
]);
|
||||||
.facebook();
|
console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);
|
||||||
// .facebook_v2();
|
|
||||||
// .google();
|
|
||||||
|
|
||||||
// TODO: Make this less painful in task.ts
|
const db = await loadTaskInNewDb(targets);
|
||||||
// let zipTask = t.fork().zip("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-DEADNAME-May2021-json.zip");
|
|
||||||
// await (zipTask.fsImpl as any).init();
|
|
||||||
|
|
||||||
const finalTargets = await verify(targets);
|
console.log(`${elapsed()} - Writing database to disk at "${sqlitePath}"`);
|
||||||
const results = await parallel(finalTargets, true);
|
DataIO.dumpDBToDisk(db, sqlitePath);
|
||||||
|
|
||||||
if (fsSync.existsSync(sqlitePath)) {
|
|
||||||
await fs.unlink(sqlitePath); // unlink the old
|
|
||||||
}
|
|
||||||
// Open an in-memory db for speed
|
|
||||||
const db = new DatabaseSync(":memory:", { allowExtension: true });
|
|
||||||
db.loadExtension("/home/cobertos/sqlite-files/csv.so")
|
|
||||||
db.enableLoadExtension(false);
|
|
||||||
|
|
||||||
// New output table
|
|
||||||
db.exec(`CREATE TABLE combined (timestamp TEXT, description TEXT, sender TEXT, receiver TEXT, tags TEXT, lat REAL, lng REAL);`);
|
|
||||||
|
|
||||||
for (const [idx, target] of targets.entries()) {
|
|
||||||
const result = results[idx];
|
|
||||||
|
|
||||||
if (!target.columnMeta) {
|
|
||||||
continue; // No column information
|
|
||||||
}
|
|
||||||
|
|
||||||
const tableName = await loadCSVTable(db, target, result);
|
|
||||||
const columnNames = getColumnNames(db, tableName);
|
|
||||||
|
|
||||||
// Now find what to insert into each row of the combined
|
|
||||||
let descriptionPart = `'An entry from the ${tableName} table'`; // Default is just kinda garbo...
|
|
||||||
if (target.perRowDescription) {
|
|
||||||
descriptionPart = templateToSql(target.perRowDescription, columnNames);
|
|
||||||
}
|
|
||||||
|
|
||||||
let timestampPart: string | undefined;
|
|
||||||
let senderPart = 'NULL';
|
|
||||||
let receiverPart = 'NULL';
|
|
||||||
let latPart = 'NULL';
|
|
||||||
let lngPart = 'NULL';
|
|
||||||
for (const [idx, col] of target.columnMeta.entries()) {
|
|
||||||
const columnName = columnNames[idx];
|
|
||||||
if (col === "isodatetime") {
|
|
||||||
timestampPart = columnName;
|
|
||||||
} else if (col === "sender") {
|
|
||||||
senderPart = columnName;
|
|
||||||
} else if (col === "receiver") {
|
|
||||||
receiverPart = columnName;
|
|
||||||
} else if (col === "lat") {
|
|
||||||
latPart = columnName;
|
|
||||||
} else if (col === "lng") {
|
|
||||||
lngPart = columnName;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!timestampPart) {
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let tagsPart = 'NULL';
|
|
||||||
if (target.perRowTags) {
|
|
||||||
tagsPart = templateToSqlExpr(target.perRowTags, columnNames);
|
|
||||||
}
|
|
||||||
|
|
||||||
// OFFSET + LIMIT to ignore the CSV headers
|
|
||||||
db.exec(`INSERT INTO combined SELECT ${timestampPart}, ${descriptionPart}, ${senderPart}, ${receiverPart}, ${tagsPart}, ${latPart}, ${lngPart} FROM ${tableName} LIMIT -1 OFFSET 1;`);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Dump it all to the path specified
|
|
||||||
db.exec(`VACUUM main INTO '${sqlitePath}'`);
|
|
||||||
|
|
||||||
// Now dump it as a CSV
|
|
||||||
const rows = db.prepare(`
|
|
||||||
SELECT timestamp || ',' || '"' || replace(description, '"', '""') || '"' as row FROM combined
|
|
||||||
`)
|
|
||||||
.all()
|
|
||||||
.map(r => r.row)
|
|
||||||
.join('\n');
|
|
||||||
db.close();
|
|
||||||
|
|
||||||
await fs.writeFile('your.csv', rows, { encoding: "utf8" });
|
|
||||||
|
|
||||||
|
console.log(`${elapsed()} - Database written to disk`);
|
||||||
}
|
}
|
||||||
|
|
||||||
main();
|
if (process.argv[1] === __filename) {
|
||||||
|
main();
|
||||||
|
}
|
||||||
|
|
||||||
// TODO: Move this into here
|
// TODO: Move this into here
|
||||||
// csvSink(
|
// csvSink(
|
||||||
// summarization?: [string, string][]
|
// summarization?: [string, string][]
|
||||||
// ) {
|
// ) {
|
||||||
// // TODO:
|
// // TODO:
|
||||||
// return this;
|
// return this;
|
||||||
|
|
||||||
// // Ingest this csv into the database at the given id
|
// // Ingest this csv into the database at the given id
|
||||||
// // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
|
// // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
|
||||||
// // Add a post processing function for these targets that prints out the summarization
|
// // Add a post processing function for these targets that prints out the summarization
|
||||||
// // stats
|
// // stats
|
||||||
// // this.post(async (t: TaskTarget)=>{
|
// // this.post(async (t: TaskTarget)=>{
|
||||||
// // // We only do the first one so far for the summarization
|
// // // We only do the first one so far for the summarization
|
||||||
// // let queryLine: string;
|
// // let queryLine: string;
|
||||||
// // let formatFn: (r: any)=>string;
|
// // let formatFn: (r: any)=>string;
|
||||||
// // const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
|
// // const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
|
||||||
// // if (type === "numeric") {
|
// // if (type === "numeric") {
|
||||||
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
||||||
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
||||||
// // }
|
// // }
|
||||||
// // else {
|
// // else {
|
||||||
// // queryLine = `count(*) as n`;
|
// // queryLine = `count(*) as n`;
|
||||||
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
||||||
// // }
|
// // }
|
||||||
|
|
||||||
// // const cmd = "sqlite-utils";
|
// // const cmd = "sqlite-utils";
|
||||||
// // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
|
// // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
|
||||||
// // const { stdout, stderr } = await execFile(cmd, args);
|
// // const { stdout, stderr } = await execFile(cmd, args);
|
||||||
// // const results = JSON.parse(stdout);
|
// // const results = JSON.parse(stdout);
|
||||||
// // const result = results[0]; // should only be one result in the array for this type of query
|
// // const result = results[0]; // should only be one result in the array for this type of query
|
||||||
// // const logLine = formatFn(result);
|
// // const logLine = formatFn(result);
|
||||||
// // (t as any).log = logLine;
|
// // (t as any).log = logLine;
|
||||||
// // });
|
// // });
|
||||||
|
|
||||||
// // return this;
|
// // return this;
|
||||||
// }
|
// }
|
||||||
|
|
@ -27,6 +27,7 @@
|
||||||
},
|
},
|
||||||
"devDependencies": {
|
"devDependencies": {
|
||||||
"@types/node": "^24.1.0",
|
"@types/node": "^24.1.0",
|
||||||
|
"csv-parse": "^6.1.0",
|
||||||
"typescript": "^5.9.3"
|
"typescript": "^5.9.3"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
8
pnpm-lock.yaml
generated
8
pnpm-lock.yaml
generated
|
|
@ -33,6 +33,9 @@ importers:
|
||||||
'@types/node':
|
'@types/node':
|
||||||
specifier: ^24.1.0
|
specifier: ^24.1.0
|
||||||
version: 24.10.0
|
version: 24.10.0
|
||||||
|
csv-parse:
|
||||||
|
specifier: ^6.1.0
|
||||||
|
version: 6.1.0
|
||||||
typescript:
|
typescript:
|
||||||
specifier: ^5.9.3
|
specifier: ^5.9.3
|
||||||
version: 5.9.3
|
version: 5.9.3
|
||||||
|
|
@ -59,6 +62,9 @@ packages:
|
||||||
buffer-crc32@0.2.13:
|
buffer-crc32@0.2.13:
|
||||||
resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==}
|
resolution: {integrity: sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==}
|
||||||
|
|
||||||
|
csv-parse@6.1.0:
|
||||||
|
resolution: {integrity: sha512-CEE+jwpgLn+MmtCpVcPtiCZpVtB6Z2OKPTr34pycYYoL7sxdOkXDdQ4lRiw6ioC0q6BLqhc6cKweCVvral8yhw==}
|
||||||
|
|
||||||
dom-serializer@2.0.0:
|
dom-serializer@2.0.0:
|
||||||
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
resolution: {integrity: sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==}
|
||||||
|
|
||||||
|
|
@ -176,6 +182,8 @@ snapshots:
|
||||||
|
|
||||||
buffer-crc32@0.2.13: {}
|
buffer-crc32@0.2.13: {}
|
||||||
|
|
||||||
|
csv-parse@6.1.0: {}
|
||||||
|
|
||||||
dom-serializer@2.0.0:
|
dom-serializer@2.0.0:
|
||||||
dependencies:
|
dependencies:
|
||||||
domelementtype: 2.3.0
|
domelementtype: 2.3.0
|
||||||
|
|
|
||||||
|
|
@ -1,9 +1,11 @@
|
||||||
import test from "node:test";
|
import test from "node:test";
|
||||||
import nodePath from "node:path";
|
import nodePath from "node:path";
|
||||||
import { strict as assert } from "node:assert";
|
import { strict as assert } from "node:assert";
|
||||||
import { TaskTargetPipelineHelper, TaskTarget, verify, run } from "../data-export/task.ts";
|
import { TaskTarget, verify, run, unzip, pipe } from "../data-export/task.ts";
|
||||||
import { parallel } from "../data-export/parallel.ts";
|
import { parallel } from "../data-export/parallel.ts";
|
||||||
import "../data-export/facebook.ts";
|
import { facebook, facebook_v2 } from "../data-export/facebook.ts";
|
||||||
|
import * as DataIO from "../data-export/io.ts";
|
||||||
|
import { parse } from "csv-parse/sync"; // For better diffs + error checking of CSV output
|
||||||
|
|
||||||
const THIS_FILE = import.meta.dirname;
|
const THIS_FILE = import.meta.dirname;
|
||||||
const FACEBOOK_V1_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01');
|
const FACEBOOK_V1_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01');
|
||||||
|
|
@ -11,67 +13,56 @@ const FACEBOOK_V1_ZIPPED = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021
|
||||||
const FACEBOOK_V2_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2025-11-29');
|
const FACEBOOK_V2_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2025-11-29');
|
||||||
|
|
||||||
test("facebook: Can load the 2021 export", async (t) => {
|
test("facebook: Can load the 2021 export", async (t) => {
|
||||||
const targets = TaskTargetPipelineHelper.pipeline([
|
const targets = [
|
||||||
new TaskTarget(FACEBOOK_V1_DIR)
|
new TaskTarget(FACEBOOK_V1_DIR)
|
||||||
])
|
]
|
||||||
.facebook();
|
const builtTargets = await facebook()(targets);
|
||||||
|
const out = await DataIO.runPipeline(builtTargets);
|
||||||
const finalTargets = await verify(targets);
|
|
||||||
const result = await parallel(finalTargets, true);
|
|
||||||
const idAndCSVs: [string, string][] = [];
|
const idAndCSVs: [string, string][] = [];
|
||||||
for (const [idx, r] of result.entries()) {
|
for (const {target, result} of out) {
|
||||||
const target = finalTargets[idx];
|
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
|
||||||
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
assert.ok(result.ok, `Task ${target.id} should be okay`);
|
||||||
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
idAndCSVs.push([target.id, result.stdout]);
|
||||||
idAndCSVs.push([target.id, r.stdout]);
|
|
||||||
}
|
}
|
||||||
const csvs = idAndCSVs
|
const csvs = idAndCSVs
|
||||||
.sort() // Keep stable ordering for snapshots
|
.sort() // Keep stable ordering for snapshots
|
||||||
.map(v => v[1])
|
.map(v => parse(v[1]))
|
||||||
|
|
||||||
t.assert.snapshot(csvs);
|
t.assert.snapshot(csvs);
|
||||||
});
|
});
|
||||||
test("facebook: Can load the 2021 export zipped", async (t) => {
|
test("facebook: Can load the 2021 export zipped", async (t) => {
|
||||||
const targets = await TaskTargetPipelineHelper.pipeline([
|
const targets = [
|
||||||
new TaskTarget(FACEBOOK_V1_ZIPPED)
|
new TaskTarget(FACEBOOK_V1_ZIPPED)
|
||||||
])
|
];
|
||||||
.unzip();
|
const builtTargets = await pipe(unzip(), facebook())(targets);
|
||||||
const targets2 = targets
|
const out = await DataIO.runPipeline(builtTargets);
|
||||||
.facebook();
|
|
||||||
|
|
||||||
const finalTargets = await verify(targets2);
|
|
||||||
const result = await parallel(finalTargets, true);
|
|
||||||
const idAndCSVs: [string, string][] = [];
|
const idAndCSVs: [string, string][] = [];
|
||||||
for (const [idx, r] of result.entries()) {
|
for (const {target, result} of out) {
|
||||||
const target = finalTargets[idx];
|
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
|
||||||
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
assert.ok(result.ok, `Task ${target.id} should be okay`);
|
||||||
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
idAndCSVs.push([target.id, result.stdout]);
|
||||||
idAndCSVs.push([target.id, r.stdout]);
|
|
||||||
}
|
}
|
||||||
const csvs = idAndCSVs
|
const csvs = idAndCSVs
|
||||||
.sort() // Keep stable ordering for snapshots
|
.sort() // Keep stable ordering for snapshots
|
||||||
.map(v => v[1])
|
.map(v => parse(v[1]))
|
||||||
|
|
||||||
t.assert.snapshot(csvs);
|
t.assert.snapshot(csvs);
|
||||||
});
|
});
|
||||||
test("facebook: Can load the 2025 export", async (t) => {
|
test("facebook: Can load the 2025 export", async (t) => {
|
||||||
const targets = TaskTargetPipelineHelper.pipeline([
|
const targets = [
|
||||||
new TaskTarget(FACEBOOK_V2_DIR)
|
new TaskTarget(FACEBOOK_V2_DIR)
|
||||||
])
|
]
|
||||||
.facebook_v2();
|
const builtTargets = await facebook_v2()(targets);
|
||||||
|
const out = await DataIO.runPipeline(builtTargets);
|
||||||
const finalTargets = await verify(targets);
|
|
||||||
const result = await parallel(finalTargets, true);
|
|
||||||
const idAndCSVs: [string, string][] = [];
|
const idAndCSVs: [string, string][] = [];
|
||||||
for (const [idx, r] of result.entries()) {
|
for (const {target, result} of out) {
|
||||||
const target = finalTargets[idx];
|
assert.ok(!result.stderr, `Task ${target.id} should have no stderr output`);
|
||||||
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
assert.ok(result.ok, `Task ${target.id} should be okay`);
|
||||||
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
idAndCSVs.push([target.id, result.stdout]);
|
||||||
idAndCSVs.push([target.id, r.stdout]);
|
|
||||||
}
|
}
|
||||||
const csvs = idAndCSVs
|
const csvs = idAndCSVs
|
||||||
.sort() // Keep stable ordering for snapshots
|
.sort() // Keep stable ordering for snapshots
|
||||||
.map(v => v[1])
|
.map(v => parse(v[1]))
|
||||||
|
|
||||||
t.assert.snapshot(csvs);
|
t.assert.snapshot(csvs);
|
||||||
});
|
});
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
3
test/fixtures/README.md
vendored
3
test/fixtures/README.md
vendored
|
|
@ -11,3 +11,6 @@
|
||||||
|
|
||||||
* `facebook-json-2021-05-01` - Facebook JSON export
|
* `facebook-json-2021-05-01` - Facebook JSON export
|
||||||
* `facebook-json-2025-11-29` - Facebook JSON export
|
* `facebook-json-2025-11-29` - Facebook JSON export
|
||||||
|
* [`discord-chat-exporter-2026-02`](./discord-chat-exporter-2026-02.md) - Discord export with [DiscordChatExporter](https://github.com/Tyrrrz/DiscordChatExporter) sometime around Feb 2026
|
||||||
|
* [`discord-json-2021-01`](./discord-json-2021-01.md) - Discord JSON export
|
||||||
|
* [`snapchat-2023-11`](./snapchat-2023-11.md) - Snapchat JSON + HTML export
|
||||||
|
|
|
||||||
25
test/fixtures/discord-chat-exporter-2026-02.md
vendored
Normal file
25
test/fixtures/discord-chat-exporter-2026-02.md
vendored
Normal file
|
|
@ -0,0 +1,25 @@
|
||||||
|
# discord-chat-exporter-2026-02
|
||||||
|
|
||||||
|
An export from `DiscordChatExporter`, a comprehensive DiscordChatExporter
|
||||||
|
|
||||||
|
## Export methodology
|
||||||
|
|
||||||
|
This uses the version of `DiscordChatExporter` that existed at the top of the releases tab on GitHub around `2026 February`. **TODO: figure out version**
|
||||||
|
|
||||||
|
This export used a command something like the following to try to get _everything_ `dotnet DiscordChatExporter.Cli.dll export -t xxx -o ~/DiscordChatExporter -f json --media --reuse-media --include-threads -c xxx`
|
||||||
|
|
||||||
|
* It uses `export` command and `-c` but it's the same for `exportguild` and `-g`
|
||||||
|
* `-f json` so only the json export
|
||||||
|
* `--media` download all media
|
||||||
|
* `--reuse-media` not quite sure what this does because it puts it in a folder per channel...
|
||||||
|
* `--include-threads` to get any threads
|
||||||
|
|
||||||
|
## Manual edits
|
||||||
|
* Lots of image replacing + placeholders
|
||||||
|
* Had to rename the folders
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
The export format has files and folders with similar, information-dense names. I tried to preserve that as that's the only way to correlate between the folder and the file name
|
||||||
|
|
||||||
|
* No exif on any media files
|
||||||
|
* There's embeds, thumbnails in the example chat messages but I have no other specimen
|
||||||
|
|
@ -0,0 +1,145 @@
|
||||||
|
{
|
||||||
|
"guild": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxx",
|
||||||
|
"iconUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
||||||
|
},
|
||||||
|
"channel": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": "xxxxxxxxxxxxx",
|
||||||
|
"categoryId": "111111111111111111",
|
||||||
|
"category": "xxxxxxxxxxxxx",
|
||||||
|
"name": "xxxxxxx",
|
||||||
|
"topic": null
|
||||||
|
},
|
||||||
|
"dateRange": {
|
||||||
|
"after": null,
|
||||||
|
"before": null
|
||||||
|
},
|
||||||
|
"exportedAt": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"messages": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": "xxxxxxxxxxxxxxx",
|
||||||
|
"timestamp": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"timestampEdited": null,
|
||||||
|
"callEndedTimestamp": null,
|
||||||
|
"isPinned": false,
|
||||||
|
"content": "xxxxxxxxxxxxxxxxxx",
|
||||||
|
"author": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxx",
|
||||||
|
"discriminator": "1111",
|
||||||
|
"nickname": "xxxxxxxx",
|
||||||
|
"color": null,
|
||||||
|
"isBot": false,
|
||||||
|
"roles": [],
|
||||||
|
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
||||||
|
},
|
||||||
|
"attachments": [],
|
||||||
|
"embeds": [],
|
||||||
|
"stickers": [],
|
||||||
|
"reactions": [],
|
||||||
|
"mentions": [],
|
||||||
|
"inlineEmojis": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": "xxxxxxx",
|
||||||
|
"timestamp": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"timestampEdited": null,
|
||||||
|
"callEndedTimestamp": null,
|
||||||
|
"isPinned": false,
|
||||||
|
"content": "xxxxxxxxx",
|
||||||
|
"author": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxx",
|
||||||
|
"discriminator": "1111",
|
||||||
|
"nickname": "xxxxxxxx",
|
||||||
|
"color": null,
|
||||||
|
"isBot": false,
|
||||||
|
"roles": [],
|
||||||
|
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
||||||
|
},
|
||||||
|
"attachments": [],
|
||||||
|
"embeds": [],
|
||||||
|
"stickers": [],
|
||||||
|
"reactions": [],
|
||||||
|
"mentions": [],
|
||||||
|
"inlineEmojis": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": "xxxxxxx",
|
||||||
|
"timestamp": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"timestampEdited": null,
|
||||||
|
"callEndedTimestamp": null,
|
||||||
|
"isPinned": false,
|
||||||
|
"content": "https://example.com/example.png",
|
||||||
|
"author": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxx",
|
||||||
|
"discriminator": "1111",
|
||||||
|
"nickname": "xxxxxxxx",
|
||||||
|
"color": null,
|
||||||
|
"isBot": false,
|
||||||
|
"roles": [],
|
||||||
|
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
||||||
|
},
|
||||||
|
"attachments": [],
|
||||||
|
"embeds": [
|
||||||
|
{
|
||||||
|
"title": "",
|
||||||
|
"url": "https://example.com/example.png",
|
||||||
|
"timestamp": null,
|
||||||
|
"description": "",
|
||||||
|
"thumbnail": {
|
||||||
|
"url": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/example.png",
|
||||||
|
"width": 111,
|
||||||
|
"height": 111
|
||||||
|
},
|
||||||
|
"images": [],
|
||||||
|
"fields": [],
|
||||||
|
"inlineEmojis": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"stickers": [],
|
||||||
|
"reactions": [],
|
||||||
|
"mentions": [],
|
||||||
|
"inlineEmojis": []
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": "xxxxxxx",
|
||||||
|
"timestamp": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"timestampEdited": null,
|
||||||
|
"callEndedTimestamp": null,
|
||||||
|
"isPinned": false,
|
||||||
|
"content": "xxx",
|
||||||
|
"author": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxx",
|
||||||
|
"discriminator": "1111",
|
||||||
|
"nickname": "xxxxxxxx",
|
||||||
|
"color": null,
|
||||||
|
"isBot": false,
|
||||||
|
"roles": [],
|
||||||
|
"avatarUrl": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/avatar.png"
|
||||||
|
},
|
||||||
|
"attachments": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"url": "GuildName - Text Channels - ChannelName [0000000000000000].json_Files/unknown-SUFFIX.png",
|
||||||
|
"fileName": "unknown.png",
|
||||||
|
"fileSizeBytes": 111111
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"embeds": [],
|
||||||
|
"stickers": [],
|
||||||
|
"reactions": [],
|
||||||
|
"mentions": [],
|
||||||
|
"inlineEmojis": []
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"messageCount": 111
|
||||||
|
}
|
||||||
Binary file not shown.
|
After Width: | Height: | Size: 1.2 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 1.3 KiB |
Binary file not shown.
|
After Width: | Height: | Size: 1.3 KiB |
41
test/fixtures/discord-json-2021-01.md
vendored
Normal file
41
test/fixtures/discord-json-2021-01.md
vendored
Normal file
|
|
@ -0,0 +1,41 @@
|
||||||
|
# discord-json-2021-01
|
||||||
|
|
||||||
|
## Manual edits
|
||||||
|
* images -> placeholders
|
||||||
|
* `accounts/avatar.png`
|
||||||
|
* manually scrub folder names
|
||||||
|
* `account/applications/0000000000000`
|
||||||
|
|
||||||
|
## Notes about files
|
||||||
|
* `activity/`
|
||||||
|
* All the .json are NDJSON so some json tools don't like them
|
||||||
|
* _Massive_ files. They hang scrub.ts for a long long time (had to run these piecemeal)
|
||||||
|
* These files also have an _incredible_ amount of shapes and variance.
|
||||||
|
* Instead of outputing all the shapes I made a sort of "super-object" to capture the shape with `jq -n '[inputs] | add' events-2021-00000-of-00001.json.tmp > unique_shape.json` and then scrubbing `unique_shape.json`
|
||||||
|
* `messages/`
|
||||||
|
* I hand did these to keep all the ids the same
|
||||||
|
* There are multiple types of chats. DMs, guild channels, etc
|
||||||
|
* I hand did the csvs as I have no scrubber for that
|
||||||
|
* These are only **THE EXPORTING USERS MESSAGES**, no other user, just fyi
|
||||||
|
* Ids in `messages.csv` are just the id of the message, not of any user
|
||||||
|
* There is the potential to derive missing info from a channel via `@` tags sent or possibly via attachments. Maybe...
|
||||||
|
* `11111111111111111`
|
||||||
|
* This one has a shorter id (it's an older one)
|
||||||
|
* Has `type: 0` but there's no guild information in `channel.json`
|
||||||
|
* The user name was `null` in `index.json`
|
||||||
|
* It's a really odd one
|
||||||
|
* `222222222222222222`
|
||||||
|
* This was a dm channel (said `direct message with xxx#7777` in index.json)
|
||||||
|
* Has `type: 1` and there are two recipients (just the ids) in `channel.json`
|
||||||
|
* Unfortunately that's all the info in the export
|
||||||
|
* `333333333333333333`
|
||||||
|
* This was a normal guild channel
|
||||||
|
* `type: 0` and there's guild information in `channel.json`
|
||||||
|
* I kept a good set of messages around from this one to show how attachements and other stuff works
|
||||||
|
* The last message seemed to be a link not as an attachment. Links just seem to be normal text
|
||||||
|
* `programs/`
|
||||||
|
* was empty...
|
||||||
|
* `servers/``
|
||||||
|
* Info about _some_ of the guilds we have ids for
|
||||||
|
* guild.json didn't really contain anything except the name
|
||||||
|
* I kept around the only guild I noticed an audit-log.json with info in it
|
||||||
26
test/fixtures/discord-json-2021-01/README.txt
vendored
Normal file
26
test/fixtures/discord-json-2021-01/README.txt
vendored
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
__ __ ___ _ _ ___ ___ ___ _____ ___ _
|
||||||
|
\ \ / / / _ \ | | | | | _ \ o O O | \ / \ |_ _| / \ | |
|
||||||
|
\ V / | (_) | | |_| | | / o | |) | | - | | | | - | |_|
|
||||||
|
_|_|_ \___/ \___/ |_|_\ TS__[O] |___/ |_|_| _|_|_ |_|_| _(_)_
|
||||||
|
_| """ |_|"""""|_|"""""|_|"""""| <======|_|"""""|_|"""""|_|"""""|_|"""""|_| """ |
|
||||||
|
"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'./o--000'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'
|
||||||
|
___ ___ _ _ ___ ___ ___ _ _ _
|
||||||
|
|_ _| / __| o O O | || | | __| | _ \ | __| | | | | | |
|
||||||
|
| | \__ \ o | __ | | _| | / | _| |_| |_| |_|
|
||||||
|
|___| |___/ TS__[O] |_||_| |___| |_|_\ |___| _(_)_ _(_)_ _(_)_
|
||||||
|
_|"""""|_|"""""| <======|_|"""""|_|"""""|_|"""""|_|"""""|_| """ |_| """ |_| """ |
|
||||||
|
"`-0-0-'"`-0-0-'./o--000'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'"`-0-0-'
|
||||||
|
|
||||||
|
Welcome to your Discord Data Package!
|
||||||
|
|
||||||
|
Inside, you'll find a few JSON (JavaScript Object Notation) and CSV (Comma Separated Values) files
|
||||||
|
of the data we use to provide Discord's service to you. We've chosen these formats for ease of
|
||||||
|
processing. Furthermore, the files have been organized into logical groups to make it easy to
|
||||||
|
understand and work with (at least, we hope so)!
|
||||||
|
|
||||||
|
For more information, you can view our in-depth help article at the following URL:
|
||||||
|
|
||||||
|
https://support.discord.com/hc/articles/360004957991
|
||||||
|
|
||||||
|
All the best,
|
||||||
|
Discord Team
|
||||||
16
test/fixtures/discord-json-2021-01/account/applications/0000000000000000/application.json
vendored
Normal file
16
test/fixtures/discord-json-2021-01/account/applications/0000000000000000/application.json
vendored
Normal file
|
|
@ -0,0 +1,16 @@
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxx",
|
||||||
|
"icon": null,
|
||||||
|
"description": "",
|
||||||
|
"summary": "",
|
||||||
|
"hook": false,
|
||||||
|
"verify_key": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"flags": 1,
|
||||||
|
"secret": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"redirect_uris": [],
|
||||||
|
"rpc_application_state": 1,
|
||||||
|
"store_application_state": 1,
|
||||||
|
"verification_state": 1,
|
||||||
|
"interactions_endpoint_url": null
|
||||||
|
}
|
||||||
BIN
test/fixtures/discord-json-2021-01/account/avatar.png
vendored
Normal file
BIN
test/fixtures/discord-json-2021-01/account/avatar.png
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 1.7 KiB |
399
test/fixtures/discord-json-2021-01/account/user.json
vendored
Normal file
399
test/fixtures/discord-json-2021-01/account/user.json
vendored
Normal file
|
|
@ -0,0 +1,399 @@
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"username": "xxxxxxxx",
|
||||||
|
"discriminator": 1111,
|
||||||
|
"email": "not_a_real_email@example.com",
|
||||||
|
"verified": false,
|
||||||
|
"avatar_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"has_mobile": false,
|
||||||
|
"needs_email_verification": false,
|
||||||
|
"premium_until": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"flags": 11111111111111,
|
||||||
|
"phone": "xxxxxxxxxxxx",
|
||||||
|
"temp_banned_until": null,
|
||||||
|
"ip": "1.1.1.1",
|
||||||
|
"settings": {
|
||||||
|
"locale": "xxxxx",
|
||||||
|
"show_current_game": false,
|
||||||
|
"restricted_guilds": [],
|
||||||
|
"default_guilds_restricted": false,
|
||||||
|
"inline_attachment_media": false,
|
||||||
|
"inline_embed_media": false,
|
||||||
|
"gif_auto_play": false,
|
||||||
|
"render_embeds": false,
|
||||||
|
"render_reactions": false,
|
||||||
|
"animate_emoji": false,
|
||||||
|
"enable_tts_command": false,
|
||||||
|
"message_display_compact": false,
|
||||||
|
"convert_emoticons": false,
|
||||||
|
"explicit_content_filter": 1,
|
||||||
|
"disable_games_tab": false,
|
||||||
|
"theme": "xxxx",
|
||||||
|
"developer_mode": false,
|
||||||
|
"guild_positions": [
|
||||||
|
"111111111111111111",
|
||||||
|
"111111111111111111"
|
||||||
|
],
|
||||||
|
"detect_platform_accounts": false,
|
||||||
|
"status": "xxxxxx",
|
||||||
|
"afk_timeout": 111,
|
||||||
|
"timezone_offset": 111,
|
||||||
|
"stream_notifications_enabled": false,
|
||||||
|
"allow_accessibility_detection": false,
|
||||||
|
"contact_sync_enabled": false,
|
||||||
|
"native_phone_integration_enabled": false,
|
||||||
|
"animate_stickers": 1,
|
||||||
|
"friend_source_flags": {
|
||||||
|
"all": false
|
||||||
|
},
|
||||||
|
"guild_folders": [
|
||||||
|
{
|
||||||
|
"guild_ids": [
|
||||||
|
"111111111111111111"
|
||||||
|
],
|
||||||
|
"id": null,
|
||||||
|
"name": null,
|
||||||
|
"color": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"guild_ids": [
|
||||||
|
"111111111111111111"
|
||||||
|
],
|
||||||
|
"id": null,
|
||||||
|
"name": null,
|
||||||
|
"color": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"custom_status": null
|
||||||
|
},
|
||||||
|
"connections": [
|
||||||
|
{
|
||||||
|
"type": "xxxxxxxxx",
|
||||||
|
"id": "xxxxxxxxxxx",
|
||||||
|
"name": "xxxxxxxxxxx",
|
||||||
|
"revoked": false,
|
||||||
|
"visibility": 1,
|
||||||
|
"friend_sync": false,
|
||||||
|
"show_activity": false,
|
||||||
|
"verified": false
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "xxxxxxx",
|
||||||
|
"id": "xxxxxxxx",
|
||||||
|
"name": "xxxxxxxx",
|
||||||
|
"revoked": false,
|
||||||
|
"visibility": 1,
|
||||||
|
"friend_sync": false,
|
||||||
|
"show_activity": false,
|
||||||
|
"verified": false
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"external_friends_lists": [
|
||||||
|
{
|
||||||
|
"user_id": "111111111111111111",
|
||||||
|
"platform_type": "xxxxx",
|
||||||
|
"name": "xxxxxxxx",
|
||||||
|
"id_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"friend_id_hashes": [
|
||||||
|
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"user_id": "111111111111111111",
|
||||||
|
"platform_type": "xxxxxxxxx",
|
||||||
|
"name": "xxxxxxxxxxx",
|
||||||
|
"id_hash": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"friend_id_hashes": [
|
||||||
|
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"friend_suggestions": [],
|
||||||
|
"mfa_sessions": [],
|
||||||
|
"relationships": [
|
||||||
|
{
|
||||||
|
"id": "11111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"nickname": null,
|
||||||
|
"user": {
|
||||||
|
"id": "11111111111111111",
|
||||||
|
"username": "xxxxxxxxxxxx",
|
||||||
|
"avatar": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"discriminator": "1111",
|
||||||
|
"public_flags": 1
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "11111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"nickname": null,
|
||||||
|
"user": {
|
||||||
|
"id": "11111111111111111",
|
||||||
|
"username": "xxxx",
|
||||||
|
"avatar": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"discriminator": "1111",
|
||||||
|
"public_flags": 111
|
||||||
|
}
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"payments": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"created_at": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"currency": "xxx",
|
||||||
|
"tax": 111,
|
||||||
|
"tax_inclusive": false,
|
||||||
|
"amount": 1111,
|
||||||
|
"amount_refunded": 1,
|
||||||
|
"status": 1,
|
||||||
|
"description": "xxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"flags": 1,
|
||||||
|
"subscription": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"current_period_start": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"current_period_end": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"payment_gateway": null,
|
||||||
|
"payment_gateway_plan_id": "xxxxxxxxxxxxxxxxxxx",
|
||||||
|
"currency": "xxx",
|
||||||
|
"plan_id": "111111111111111111",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"plan_id": "111111111111111111",
|
||||||
|
"quantity": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"payment_source": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"invalid": false,
|
||||||
|
"brand": "xxxx",
|
||||||
|
"last_4": "1111",
|
||||||
|
"expires_month": 11,
|
||||||
|
"expires_year": 1111,
|
||||||
|
"billing_address": {
|
||||||
|
"name": "xxxxxxxxxxxxx",
|
||||||
|
"line_1": "xxxxxxxxxxxxxxxxx",
|
||||||
|
"line_2": null,
|
||||||
|
"city": "xxxxxxxx",
|
||||||
|
"state": "xx",
|
||||||
|
"country": "xx",
|
||||||
|
"postal_code": "11111"
|
||||||
|
},
|
||||||
|
"country": "xx"
|
||||||
|
},
|
||||||
|
"sku_id": "111111111111111111",
|
||||||
|
"sku_price": 1111,
|
||||||
|
"sku_subscription_plan_id": "111111111111111111"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"created_at": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"currency": "xxx",
|
||||||
|
"tax": 111,
|
||||||
|
"tax_inclusive": false,
|
||||||
|
"amount": 1111,
|
||||||
|
"amount_refunded": 1,
|
||||||
|
"status": 1,
|
||||||
|
"description": "xxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"flags": 1,
|
||||||
|
"subscription": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"current_period_start": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"current_period_end": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"payment_gateway": null,
|
||||||
|
"payment_gateway_plan_id": "xxxxxxxxxxxxxxxxxxx",
|
||||||
|
"currency": "xxx",
|
||||||
|
"plan_id": "111111111111111111",
|
||||||
|
"items": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"plan_id": "111111111111111111",
|
||||||
|
"quantity": 1
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"payment_source": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"invalid": false,
|
||||||
|
"brand": "xxxx",
|
||||||
|
"last_4": "1111",
|
||||||
|
"expires_month": 11,
|
||||||
|
"expires_year": 1111,
|
||||||
|
"billing_address": {
|
||||||
|
"name": "xxxxxxxxxxxxx",
|
||||||
|
"line_1": "xxxxxxxxxxxxxxxxxx",
|
||||||
|
"line_2": null,
|
||||||
|
"city": "xxxxxxxxxx",
|
||||||
|
"state": "xx",
|
||||||
|
"country": "xx",
|
||||||
|
"postal_code": "11111"
|
||||||
|
},
|
||||||
|
"country": "xx"
|
||||||
|
},
|
||||||
|
"sku_id": "111111111111111111",
|
||||||
|
"sku_price": 1111,
|
||||||
|
"sku_subscription_plan_id": "111111111111111111"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"payment_sources": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"invalid": false,
|
||||||
|
"brand": "xxxx",
|
||||||
|
"last_4": "1111",
|
||||||
|
"expires_month": 11,
|
||||||
|
"expires_year": 1111,
|
||||||
|
"billing_address": {
|
||||||
|
"name": "xxxxxxxxxxxxx",
|
||||||
|
"line_1": "xxxxxxxxxxxxxxxxx",
|
||||||
|
"line_2": null,
|
||||||
|
"city": "xxxxxxxx",
|
||||||
|
"state": "xx",
|
||||||
|
"country": "xx",
|
||||||
|
"postal_code": "11111"
|
||||||
|
},
|
||||||
|
"country": "xx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"guild_settings": [
|
||||||
|
{
|
||||||
|
"guild_id": null,
|
||||||
|
"suppress_everyone": false,
|
||||||
|
"suppress_roles": false,
|
||||||
|
"message_notifications": 1,
|
||||||
|
"mobile_push": false,
|
||||||
|
"muted": false,
|
||||||
|
"mute_config": null,
|
||||||
|
"channel_overrides": [
|
||||||
|
{
|
||||||
|
"channel_id": "111111111111111111",
|
||||||
|
"message_notifications": 1,
|
||||||
|
"muted": false,
|
||||||
|
"mute_config": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"version": 11
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"guild_id": "11111111111111111",
|
||||||
|
"suppress_everyone": false,
|
||||||
|
"suppress_roles": false,
|
||||||
|
"message_notifications": 1,
|
||||||
|
"mobile_push": false,
|
||||||
|
"muted": false,
|
||||||
|
"mute_config": null,
|
||||||
|
"channel_overrides": [
|
||||||
|
{
|
||||||
|
"channel_id": "111111111111111111",
|
||||||
|
"message_notifications": 1,
|
||||||
|
"muted": false,
|
||||||
|
"mute_config": null
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"channel_id": "111111111111111111",
|
||||||
|
"message_notifications": 1,
|
||||||
|
"muted": false,
|
||||||
|
"mute_config": null
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"version": 1
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"library_applications": [
|
||||||
|
{
|
||||||
|
"application": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxxxxxx",
|
||||||
|
"icon": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"description": "xxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"summary": "xxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"primary_sku_id": "111111111111111111",
|
||||||
|
"hook": false,
|
||||||
|
"slug": "xxxxxxxxxxxx",
|
||||||
|
"guild_id": "111111111111111111",
|
||||||
|
"verify_key": "a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1a1",
|
||||||
|
"publishers": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"developers": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"name": "xxxxxxxxxxxxxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"branch_id": "111111111111111111",
|
||||||
|
"sku_id": "111111111111111111",
|
||||||
|
"sku": {
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"premium": false,
|
||||||
|
"preorder_release_at": null,
|
||||||
|
"preorder_approximate_release_date": null
|
||||||
|
},
|
||||||
|
"flags": 1,
|
||||||
|
"created_at": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"entitlements": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"sku_id": "111111111111111111",
|
||||||
|
"application_id": "111111111111111111",
|
||||||
|
"user_id": "111111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"deleted": false,
|
||||||
|
"gift_code_flags": 1,
|
||||||
|
"branches": [
|
||||||
|
"111111111111111111"
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"entitlements": [
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"sku_id": "111111111111111111",
|
||||||
|
"application_id": "111111111111111111",
|
||||||
|
"user_id": "111111111111111111",
|
||||||
|
"type": 1,
|
||||||
|
"deleted": false,
|
||||||
|
"gift_code_flags": 1,
|
||||||
|
"branches": [
|
||||||
|
"111111111111111111"
|
||||||
|
],
|
||||||
|
"sku_name": "xxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"user_activity_application_statistics": [
|
||||||
|
{
|
||||||
|
"application_id": "111111111111111111",
|
||||||
|
"last_played_at": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"total_duration": 1111,
|
||||||
|
"total_discord_sku_duration": 1
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"application_id": "111111111111111111",
|
||||||
|
"last_played_at": "2020-04-13T10:09:08.000000+00:00",
|
||||||
|
"total_duration": 111111,
|
||||||
|
"total_discord_sku_duration": 1
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"notes": {
|
||||||
|
"111111111111111111": "xxxx"
|
||||||
|
}
|
||||||
|
}
|
||||||
2
test/fixtures/discord-json-2021-01/activity/analytics/events-2021-00000-of-00001.json
vendored
Normal file
2
test/fixtures/discord-json-2021-01/activity/analytics/events-2021-00000-of-00001.json
vendored
Normal file
File diff suppressed because one or more lines are too long
2
test/fixtures/discord-json-2021-01/activity/modeling/events-2021-00000-of-00001.json
vendored
Normal file
2
test/fixtures/discord-json-2021-01/activity/modeling/events-2021-00000-of-00001.json
vendored
Normal file
File diff suppressed because one or more lines are too long
2
test/fixtures/discord-json-2021-01/activity/reporting/events-2021-00000-of-00001.json
vendored
Normal file
2
test/fixtures/discord-json-2021-01/activity/reporting/events-2021-00000-of-00001.json
vendored
Normal file
File diff suppressed because one or more lines are too long
2
test/fixtures/discord-json-2021-01/activity/tns/events-2021-00000-of-00001.json
vendored
Normal file
2
test/fixtures/discord-json-2021-01/activity/tns/events-2021-00000-of-00001.json
vendored
Normal file
File diff suppressed because one or more lines are too long
1
test/fixtures/discord-json-2021-01/messages/11111111111111111/channel.json
vendored
Normal file
1
test/fixtures/discord-json-2021-01/messages/11111111111111111/channel.json
vendored
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
{"id": "11111111111111111", "type": 0}
|
||||||
2
test/fixtures/discord-json-2021-01/messages/11111111111111111/messages.csv
vendored
Normal file
2
test/fixtures/discord-json-2021-01/messages/11111111111111111/messages.csv
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
ID,Timestamp,Contents,Attachments
|
||||||
|
8888888888,2022-02-22 22:22:22.222222+00:00,Heyo,
|
||||||
|
1
test/fixtures/discord-json-2021-01/messages/222222222222222222/channel.json
vendored
Normal file
1
test/fixtures/discord-json-2021-01/messages/222222222222222222/channel.json
vendored
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
{"id": "222222222222222222", "type": 1, "recipients": ["00000000000000000", "1111111111111111"]}
|
||||||
2
test/fixtures/discord-json-2021-01/messages/222222222222222222/messages.csv
vendored
Normal file
2
test/fixtures/discord-json-2021-01/messages/222222222222222222/messages.csv
vendored
Normal file
|
|
@ -0,0 +1,2 @@
|
||||||
|
ID,Timestamp,Contents,Attachments
|
||||||
|
2222222222222,2022-02-22 22:22:22.222222+00:00,Heyo,
|
||||||
|
1
test/fixtures/discord-json-2021-01/messages/333333333333333333/channel.json
vendored
Normal file
1
test/fixtures/discord-json-2021-01/messages/333333333333333333/channel.json
vendored
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
{"id": "333333333333333333", "type": 0, "name": "generalchat", "guild": {"id": "333333333333333332", "name": "xxx"}}
|
||||||
6
test/fixtures/discord-json-2021-01/messages/333333333333333333/messages.csv
vendored
Normal file
6
test/fixtures/discord-json-2021-01/messages/333333333333333333/messages.csv
vendored
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
ID,Timestamp,Contents,Attachments
|
||||||
|
000000000000000005,2011-02-02 02:05:02.000000+00:00,Huh what the heck is this message,
|
||||||
|
000000000000000004,2011-02-02 02:04:02.000000+00:00,<:thonk:000000000000000000><:thonk:000000000000000000><:thonk:000000000000000000>,
|
||||||
|
000000000000000003,2011-02-02 02:03:02.000000+00:00,"(so <@00000000000000000> who are you)",
|
||||||
|
000000000000000002,2011-02-02 02:02:02.000000+00:00,,https://cdn.discordapp.com/attachments/000000000000000000/000000000000000000/image.png
|
||||||
|
000000000000000001,2011-02-02 02:01:02.000000+00:00,https://google.com/whatever,
|
||||||
|
5
test/fixtures/discord-json-2021-01/messages/index.json
vendored
Normal file
5
test/fixtures/discord-json-2021-01/messages/index.json
vendored
Normal file
|
|
@ -0,0 +1,5 @@
|
||||||
|
{
|
||||||
|
"11111111111111111": null,
|
||||||
|
"222222222222222222": "Direct Message with xxx#7777",
|
||||||
|
"333333333333333333": "generalchat"
|
||||||
|
}
|
||||||
18
test/fixtures/discord-json-2021-01/servers/444444444444444444/audit-log.json
vendored
Normal file
18
test/fixtures/discord-json-2021-01/servers/444444444444444444/audit-log.json
vendored
Normal file
|
|
@ -0,0 +1,18 @@
|
||||||
|
[
|
||||||
|
{
|
||||||
|
"id": "111111111111111111",
|
||||||
|
"user_id": "111111111111111111",
|
||||||
|
"action_type": 11,
|
||||||
|
"changes": [
|
||||||
|
{
|
||||||
|
"key": "xxxx",
|
||||||
|
"new_value": [
|
||||||
|
{
|
||||||
|
"name": "xxxxxxxxxx",
|
||||||
|
"id": "111111111111111111"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
4
test/fixtures/discord-json-2021-01/servers/444444444444444444/guild.json
vendored
Normal file
4
test/fixtures/discord-json-2021-01/servers/444444444444444444/guild.json
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
{
|
||||||
|
"id": "444444444444444444",
|
||||||
|
"name": "xxx"
|
||||||
|
}
|
||||||
3
test/fixtures/discord-json-2021-01/servers/index.json
vendored
Normal file
3
test/fixtures/discord-json-2021-01/servers/index.json
vendored
Normal file
|
|
@ -0,0 +1,3 @@
|
||||||
|
{
|
||||||
|
"444444444444444444": "xxx"
|
||||||
|
}
|
||||||
9
test/fixtures/facebook-json.md
vendored
Normal file
9
test/fixtures/facebook-json.md
vendored
Normal file
|
|
@ -0,0 +1,9 @@
|
||||||
|
# facebook-json exports
|
||||||
|
|
||||||
|
## `facebook-json-2021-05-01`
|
||||||
|
* Manual edits of images -> placeholders, folder names, key names (in support cases specifically)
|
||||||
|
* This was one of the first few datasets I scrubbed so a lot of manual work was done. Should be easier now
|
||||||
|
* I went poking around this one and there was no exif on any of the images I looked at, only in the json was there exif
|
||||||
|
## `facebook-json-2025-11-29`
|
||||||
|
* Manual edits of images -> placeholders, folder names, key names
|
||||||
|
* This was one of the first few datasets I scrubbed so a lot of manual work was done. Should be easier now
|
||||||
83
test/fixtures/snapchat-2023-11.md
vendored
Normal file
83
test/fixtures/snapchat-2023-11.md
vendored
Normal file
|
|
@ -0,0 +1,83 @@
|
||||||
|
# Snapchat
|
||||||
|
|
||||||
|
Exported from the web exporter
|
||||||
|
|
||||||
|
## Manual Edits
|
||||||
|
|
||||||
|
* memories and chat_media placeholders
|
||||||
|
* Snapchat seemed to have events exported where the `+` in emails broke my parsing and the email contained a ' ' instead, so I fixed that
|
||||||
|
* Keys use unique dates in `json/in_app_surveys.json`
|
||||||
|
* Keys in `json/chat_history.json` use user ids, had to manually truncate and edit
|
||||||
|
|
||||||
|
## Notes
|
||||||
|
|
||||||
|
* `memories/`
|
||||||
|
* No exif data
|
||||||
|
* Does not seem to have any correlating .json file. It's just a dump to the disk
|
||||||
|
* files are like `2020-01-01_aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa-main.jpg`
|
||||||
|
* Date has no time, just date
|
||||||
|
* `aaaaa...` seems to be a guid
|
||||||
|
* `main` | `overlay` at the end, with the same guid
|
||||||
|
* `main` is just the image
|
||||||
|
* `overlay` looks to be like a filter or some other applied thing that was saved with the memory
|
||||||
|
* Images may be rotated
|
||||||
|
* `chat_media/`
|
||||||
|
* No exif
|
||||||
|
* files are like `2020-01-01_b~xxxx.jpeg`
|
||||||
|
* sometimes they have `main` | `overlay` or something
|
||||||
|
* No idea what the `b~` means or if the xxx is an id or what. Perhaps base64 encoded protobuf, but nothing I decoded seemed to correlate to any identifier in the export
|
||||||
|
* Only referenced from ... oh... it's broken. The `type: "MEDIA"` in snapchats exporter has all empty "content" fields. Amazing... So this will have to be pieced together some other way
|
||||||
|
* This will most likely have to be manually repaired
|
||||||
|
* `json/`
|
||||||
|
* Scrubbed
|
||||||
|
* See manual changes
|
||||||
|
|
||||||
|
|
||||||
|
* Comes with both an html and json export (I will only keep the json after deduping)
|
||||||
|
* NOTE: That the html export has explanations which might be useful to explain some of these fields...
|
||||||
|
* I compared all .html to .json side by side (browser <-> text editor) and all of them were present in both and had the same data except `snap_history.html` (was empty in .html) and `faq.html` (just informational)
|
||||||
|
* I noticed on chat history html pages it puts _every_ category, not just the ones I have. Might be useful future reference
|
||||||
|
|
||||||
|
```
|
||||||
|
Frequently Asked Questions
|
||||||
|
Login History and Account Information
|
||||||
|
Snap History Metadata
|
||||||
|
Chat History Metadata
|
||||||
|
My AI
|
||||||
|
Our Story & Spotlight Content
|
||||||
|
Spotlight Replies
|
||||||
|
Purchase History
|
||||||
|
Snapchat Support History
|
||||||
|
User Profile
|
||||||
|
Public Profiles
|
||||||
|
Friends
|
||||||
|
Ranking
|
||||||
|
Story History
|
||||||
|
Account History
|
||||||
|
Location
|
||||||
|
Search History
|
||||||
|
Terms History
|
||||||
|
Subscriptions
|
||||||
|
Bitmoji
|
||||||
|
In-app Surveys
|
||||||
|
Reported Content
|
||||||
|
Bitmoji Kit
|
||||||
|
Connected Apps
|
||||||
|
Talk History
|
||||||
|
Ads Manager
|
||||||
|
My Lenses
|
||||||
|
Memories
|
||||||
|
Cameos
|
||||||
|
Email Campaign History
|
||||||
|
Snap Tokens
|
||||||
|
Payouts
|
||||||
|
Orders
|
||||||
|
Snap Map Places
|
||||||
|
Shopping Favorites
|
||||||
|
Payments
|
||||||
|
My Sounds
|
||||||
|
Photoshoot Snaps
|
||||||
|
Feature Emails
|
||||||
|
AI Selfies
|
||||||
|
```
|
||||||
|
|
||||||
38
test/fixtures/snapchat-2023-11/json/account.json
vendored
Normal file
38
test/fixtures/snapchat-2023-11/json/account.json
vendored
Normal file
|
|
@ -0,0 +1,38 @@
|
||||||
|
{
|
||||||
|
"Basic Information": {
|
||||||
|
"Username": "xxxxxxxxx",
|
||||||
|
"Name": "xxxxx",
|
||||||
|
"Creation Date": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Registration IP": "",
|
||||||
|
"Country": ""
|
||||||
|
},
|
||||||
|
"Device Information": {
|
||||||
|
"Make": "",
|
||||||
|
"Model ID": "",
|
||||||
|
"Model Name": "",
|
||||||
|
"Language": "",
|
||||||
|
"OS Type": "",
|
||||||
|
"OS Version": "",
|
||||||
|
"Connection Type": ""
|
||||||
|
},
|
||||||
|
"Device History": [],
|
||||||
|
"Privacy Policy and Terms of Service Acceptance History": [],
|
||||||
|
"Custom Creative Tools Terms": [],
|
||||||
|
"Login History": [
|
||||||
|
{
|
||||||
|
"IP": "1.1.1.1",
|
||||||
|
"Country": "xx",
|
||||||
|
"Created": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Status": "xxxxxxx",
|
||||||
|
"Device": "some/path"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"IP": "1.1.1.1",
|
||||||
|
"Country": "xx",
|
||||||
|
"Created": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Status": "xxxxxxx",
|
||||||
|
"Device": "some/path"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Family Center": []
|
||||||
|
}
|
||||||
47
test/fixtures/snapchat-2023-11/json/account_history.json
vendored
Normal file
47
test/fixtures/snapchat-2023-11/json/account_history.json
vendored
Normal file
|
|
@ -0,0 +1,47 @@
|
||||||
|
{
|
||||||
|
"Display Name Change": [
|
||||||
|
{
|
||||||
|
"Date": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Display Name": "xxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Date": "",
|
||||||
|
"Display Name": "xxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Email Change": [
|
||||||
|
{
|
||||||
|
"Date": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Email Address": "not_a_real_email@example.com"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Mobile Number Change": [],
|
||||||
|
"Password Change": [
|
||||||
|
{
|
||||||
|
"Date": "2020-04-13 10:09:08 UTC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Date": "2020-04-13 10:09:08 UTC"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Snapchat Linked to Bitmoji": [
|
||||||
|
{
|
||||||
|
"Date": "2020-04-13 10:09:08 UTC"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Spectacles": [],
|
||||||
|
"Two-Factor Authentication": [],
|
||||||
|
"Account deactivated / reactivated": [],
|
||||||
|
"Download My Data Reports": [
|
||||||
|
{
|
||||||
|
"Date": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Status": "xxxxxxx",
|
||||||
|
"Email Address": "not_a_real_email@example.com"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Date": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Status": "xxxxxxxxx",
|
||||||
|
"Email Address": "not_a_real_email@example.com"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
31
test/fixtures/snapchat-2023-11/json/bitmoji.json
vendored
Normal file
31
test/fixtures/snapchat-2023-11/json/bitmoji.json
vendored
Normal file
|
|
@ -0,0 +1,31 @@
|
||||||
|
{
|
||||||
|
"Basic Information": {
|
||||||
|
"First Name": "",
|
||||||
|
"Last Name": "",
|
||||||
|
"Email": "",
|
||||||
|
"Phone Number": "",
|
||||||
|
"Account Creation Date": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Account Creation User Agent": ""
|
||||||
|
},
|
||||||
|
"Analytics": {
|
||||||
|
"App Open Count": 1,
|
||||||
|
"Avatar Gender": "xxxx",
|
||||||
|
"Outfit Save Count": 1,
|
||||||
|
"Share Count": 1
|
||||||
|
},
|
||||||
|
"Terms of Service Acceptance History": [
|
||||||
|
{
|
||||||
|
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Acceptance Date": "2020-04-13 10:09:08"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Acceptance Date": "2020-04-13 10:09:08"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Search History": [],
|
||||||
|
"Support Cases": [],
|
||||||
|
"Selfies": [],
|
||||||
|
"Keyboard Enable Full Access History (iOS only)": [],
|
||||||
|
"Connected Apps": []
|
||||||
|
}
|
||||||
8
test/fixtures/snapchat-2023-11/json/cameos_metadata.json
vendored
Normal file
8
test/fixtures/snapchat-2023-11/json/cameos_metadata.json
vendored
Normal file
|
|
@ -0,0 +1,8 @@
|
||||||
|
{
|
||||||
|
"Cameos Selfie": {
|
||||||
|
"Cameos Body Selected": "xxxxxxxxxxxx",
|
||||||
|
"Hairstyle": "xxxxxxxxxxxx",
|
||||||
|
"Use My Cameos Selfie": "xxxxxxx"
|
||||||
|
},
|
||||||
|
"Cameos Stories": []
|
||||||
|
}
|
||||||
42
test/fixtures/snapchat-2023-11/json/chat_history.json
vendored
Normal file
42
test/fixtures/snapchat-2023-11/json/chat_history.json
vendored
Normal file
|
|
@ -0,0 +1,42 @@
|
||||||
|
{
|
||||||
|
"some_friend": [
|
||||||
|
{
|
||||||
|
"From": "xxxxxxxxx",
|
||||||
|
"Media Type": "xxxxx",
|
||||||
|
"Created": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Content": "",
|
||||||
|
"Conversation Title": null,
|
||||||
|
"IsSender": false,
|
||||||
|
"Created(microseconds)": 1111111111111
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"From": "xxxxxxxxx",
|
||||||
|
"Media Type": "xxxx",
|
||||||
|
"Created": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Content": "xxxxxxxxxxxxxxxxxx",
|
||||||
|
"Conversation Title": null,
|
||||||
|
"IsSender": false,
|
||||||
|
"Created(microseconds)": 1111111111111
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"some_friend_too": [
|
||||||
|
{
|
||||||
|
"From": "xxxxxxxxxxxxxx",
|
||||||
|
"Media Type": "xxxxx",
|
||||||
|
"Created": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Content": "",
|
||||||
|
"Conversation Title": "xxxxxxxxxxxxxxxx",
|
||||||
|
"IsSender": false,
|
||||||
|
"Created(microseconds)": 1111111111111
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"From": "xxxxxxxxxxxxx",
|
||||||
|
"Media Type": "xxxx",
|
||||||
|
"Created": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Content": "xxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Conversation Title": "xxxxxxxxxxxxxxxx",
|
||||||
|
"IsSender": false,
|
||||||
|
"Created(microseconds)": 1111111111111
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
11
test/fixtures/snapchat-2023-11/json/connected_apps.json
vendored
Normal file
11
test/fixtures/snapchat-2023-11/json/connected_apps.json
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
{
|
||||||
|
"Login History": [],
|
||||||
|
"Permissions": [
|
||||||
|
{
|
||||||
|
"App": "xxxxxxx",
|
||||||
|
"Time": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Type": "xxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Connected Applications": []
|
||||||
|
}
|
||||||
13
test/fixtures/snapchat-2023-11/json/email_campaign_history.json
vendored
Normal file
13
test/fixtures/snapchat-2023-11/json/email_campaign_history.json
vendored
Normal file
|
|
@ -0,0 +1,13 @@
|
||||||
|
{
|
||||||
|
"Email Campaign Subscriptions": [
|
||||||
|
{
|
||||||
|
"Email Campaign": "xxxxxxxxxxxxxxxx",
|
||||||
|
"Opt Out Status": "xxxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Email Campaign": "xxxxxxxxxxxxxxx",
|
||||||
|
"Opt Out Status": "xxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Email Campaign History": []
|
||||||
|
}
|
||||||
100
test/fixtures/snapchat-2023-11/json/friends.json
vendored
Normal file
100
test/fixtures/snapchat-2023-11/json/friends.json
vendored
Normal file
|
|
@ -0,0 +1,100 @@
|
||||||
|
{
|
||||||
|
"Friends": [
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Friend Requests Sent": [
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Blocked Users": [
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Deleted Friends": [
|
||||||
|
{
|
||||||
|
"Username": "xxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Hidden Friend Suggestions": [],
|
||||||
|
"Ignored Snapchatters": [
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Pending Requests": [
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Username": "xxxxxxxxxxxxxx",
|
||||||
|
"Display Name": "xxxxxxxxxxxxx",
|
||||||
|
"Creation Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Last Modified Timestamp": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Source": "xxxxxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Shortcuts": []
|
||||||
|
}
|
||||||
26
test/fixtures/snapchat-2023-11/json/in_app_surveys.json
vendored
Normal file
26
test/fixtures/snapchat-2023-11/json/in_app_surveys.json
vendored
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
{
|
||||||
|
"Survey 2020/04/12": [
|
||||||
|
{
|
||||||
|
"Time": "xxxxxxxxxxxx",
|
||||||
|
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Survey Response": "xxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Time": "xxxxxxxxxxxx",
|
||||||
|
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Survey Response": "xxx"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Survey 2020/04/13": [
|
||||||
|
{
|
||||||
|
"Time": "xxxxxxxxxxxx",
|
||||||
|
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Survey Response": "xxxxxxxxxxxxxx"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Time": "xxxxxxxxxxxx",
|
||||||
|
"Survey Question": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Survey Response": "some/path"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
23
test/fixtures/snapchat-2023-11/json/location_history.json
vendored
Normal file
23
test/fixtures/snapchat-2023-11/json/location_history.json
vendored
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
{
|
||||||
|
"Frequent Locations": [],
|
||||||
|
"Latest Location": [
|
||||||
|
{
|
||||||
|
"City": "",
|
||||||
|
"Country": "",
|
||||||
|
"Region": ""
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Home & Work": {},
|
||||||
|
"Daily Top Locations": [],
|
||||||
|
"Top Locations Per Six-Day Period": [],
|
||||||
|
"Location History": [],
|
||||||
|
"Businesses and public places you may have visited": [],
|
||||||
|
"Areas you may have visited in the last two years": [
|
||||||
|
{
|
||||||
|
"Time": "some/path",
|
||||||
|
"City": "xxxxxx",
|
||||||
|
"Region": "xxxxxxxx",
|
||||||
|
"Postal Code": "11111"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
6
test/fixtures/snapchat-2023-11/json/ranking.json
vendored
Normal file
6
test/fixtures/snapchat-2023-11/json/ranking.json
vendored
Normal file
|
|
@ -0,0 +1,6 @@
|
||||||
|
{
|
||||||
|
"Number of Stories Viewed": [
|
||||||
|
1
|
||||||
|
],
|
||||||
|
"Content Interests": []
|
||||||
|
}
|
||||||
11
test/fixtures/snapchat-2023-11/json/shared_story.json
vendored
Normal file
11
test/fixtures/snapchat-2023-11/json/shared_story.json
vendored
Normal file
|
|
@ -0,0 +1,11 @@
|
||||||
|
{
|
||||||
|
"Shared Story": [],
|
||||||
|
"Spotlight History": [
|
||||||
|
{
|
||||||
|
"Story Date": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Story URL": "url://somewhere",
|
||||||
|
"Action Type": "xxxx",
|
||||||
|
"View Time": "xxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
4
test/fixtures/snapchat-2023-11/json/snapchat_ai.json
vendored
Normal file
4
test/fixtures/snapchat-2023-11/json/snapchat_ai.json
vendored
Normal file
|
|
@ -0,0 +1,4 @@
|
||||||
|
{
|
||||||
|
"My AI Content": [],
|
||||||
|
"My AI Memory": []
|
||||||
|
}
|
||||||
10
test/fixtures/snapchat-2023-11/json/subscriptions.json
vendored
Normal file
10
test/fixtures/snapchat-2023-11/json/subscriptions.json
vendored
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
{
|
||||||
|
"Public Users": [
|
||||||
|
"xxxxxxxxxxxxxxx"
|
||||||
|
],
|
||||||
|
"Publishers": [],
|
||||||
|
"Stories": [],
|
||||||
|
"Last Active Timezone": "some/path",
|
||||||
|
"Push Notifications": [],
|
||||||
|
"Hidden Category Sections": []
|
||||||
|
}
|
||||||
15
test/fixtures/snapchat-2023-11/json/terms_history.json
vendored
Normal file
15
test/fixtures/snapchat-2023-11/json/terms_history.json
vendored
Normal file
|
|
@ -0,0 +1,15 @@
|
||||||
|
{
|
||||||
|
"Snap Inc. Terms of Service": [
|
||||||
|
{
|
||||||
|
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Acceptance Date": "2020-04-13 10:09:08 UTC"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"Version": "xxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"Acceptance Date": "2020-04-13 10:09:08 UTC"
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"Custom Creative Tools Terms": [],
|
||||||
|
"Business Services Terms": [],
|
||||||
|
"Games Terms": []
|
||||||
|
}
|
||||||
39
test/fixtures/snapchat-2023-11/json/user_profile.json
vendored
Normal file
39
test/fixtures/snapchat-2023-11/json/user_profile.json
vendored
Normal file
|
|
@ -0,0 +1,39 @@
|
||||||
|
{
|
||||||
|
"App Profile": {
|
||||||
|
"Country": "xx",
|
||||||
|
"Creation Time": "2020-04-13 10:09:08 UTC",
|
||||||
|
"Account Creation Country": "xxxxxxx",
|
||||||
|
"Platform Version": "xxxxxxx",
|
||||||
|
"In-app Language": "xx"
|
||||||
|
},
|
||||||
|
"Demographics": {
|
||||||
|
"Cohort Age": "",
|
||||||
|
"Derived Ad Demographic": ""
|
||||||
|
},
|
||||||
|
"Subscriptions": [],
|
||||||
|
"Engagement": [],
|
||||||
|
"Discover Channels Viewed": [],
|
||||||
|
"Breakdown of Time Spent on App": [],
|
||||||
|
"Ads You Interacted With": [],
|
||||||
|
"Interest Categories": [
|
||||||
|
"xxxxxx",
|
||||||
|
"xxxxxxxxxxxxxxxxxxx"
|
||||||
|
],
|
||||||
|
"Content Categories": [
|
||||||
|
"xxxxxxxxxxxxxxxxxxxxxxxxxxxx",
|
||||||
|
"some/path"
|
||||||
|
],
|
||||||
|
"Geographic Information": [],
|
||||||
|
"Interactions": {
|
||||||
|
"Web Interactions": [
|
||||||
|
"xxxxxxxxxxxxx",
|
||||||
|
"xxxxxxxxxxxxxxxxxxxxxx"
|
||||||
|
],
|
||||||
|
"App Interactions": [
|
||||||
|
"url://somewhere",
|
||||||
|
"url://somewhere"
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"Off-Platform Sharing": [],
|
||||||
|
"Mobile Ad Id": "xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
|
||||||
|
}
|
||||||
BIN
test/fixtures/snapchat-2023-11/memories/2020-01-01_aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa-main.jpg
vendored
Normal file
BIN
test/fixtures/snapchat-2023-11/memories/2020-01-01_aaaaaaaa-aaaa-aaaa-aaaa-aaaaaaaaaaaa-main.jpg
vendored
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 2.2 KiB |
95
test/task.ts
95
test/task.ts
|
|
@ -9,7 +9,6 @@ import {
|
||||||
cmd,
|
cmd,
|
||||||
assignMeta,
|
assignMeta,
|
||||||
verify,
|
verify,
|
||||||
TaskTargetPipelineHelper,
|
|
||||||
} from "../data-export/task.ts";
|
} from "../data-export/task.ts";
|
||||||
|
|
||||||
const THIS_FILE = import.meta.dirname;
|
const THIS_FILE = import.meta.dirname;
|
||||||
|
|
@ -92,7 +91,7 @@ test("TaskTarget: pushToPipeline throws if read is not the first op", () => {
|
||||||
test("TaskTarget: clone produces an independent copy", () => {
|
test("TaskTarget: clone produces an independent copy", () => {
|
||||||
const t = new TaskTarget("/foo").assignMeta({
|
const t = new TaskTarget("/foo").assignMeta({
|
||||||
idValue: "orig",
|
idValue: "orig",
|
||||||
columnMeta: ["yeag"]
|
columnMeta: ["any"]
|
||||||
});
|
});
|
||||||
t.read();
|
t.read();
|
||||||
const c = t.clone();
|
const c = t.clone();
|
||||||
|
|
@ -155,41 +154,41 @@ test("toShell: cmd with function resolves at shell-generation time", () => {
|
||||||
|
|
||||||
// -- module-level functions ---------------------------------------------------
|
// -- module-level functions ---------------------------------------------------
|
||||||
|
|
||||||
test("cd: clones and changes directory of each target", () => {
|
test("cd: clones and changes directory of each target", async () => {
|
||||||
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
||||||
const result = cd(targets, "sub");
|
const result = await cd("sub")(targets);
|
||||||
assert.equal(result[0].path, "/a/sub");
|
assert.equal(result[0].path, "/a/sub");
|
||||||
assert.equal(result[1].path, "/b/sub");
|
assert.equal(result[1].path, "/b/sub");
|
||||||
assert.equal(targets[0].path, "/a"); // originals unchanged
|
assert.equal(targets[0].path, "/a"); // originals unchanged
|
||||||
});
|
});
|
||||||
|
|
||||||
test("read: clones and adds a read op to each target", () => {
|
test("read: clones and adds a read op to each target", async () => {
|
||||||
const targets = [new TaskTarget("/a.txt"), new TaskTarget("/b.txt")];
|
const targets = [new TaskTarget("/a.txt"), new TaskTarget("/b.txt")];
|
||||||
const result = read(targets);
|
const result = await read()(targets);
|
||||||
assert.equal(result[0].pipeline[0].type, "read");
|
assert.equal(result[0].pipeline[0].type, "read");
|
||||||
assert.equal(result[1].pipeline[0].type, "read");
|
assert.equal(result[1].pipeline[0].type, "read");
|
||||||
assert.equal(targets[0].pipeline.length, 0); // originals unchanged
|
assert.equal(targets[0].pipeline.length, 0); // originals unchanged
|
||||||
});
|
});
|
||||||
|
|
||||||
test("cmd: clones and appends a cmd op to each target", () => {
|
test("cmd: clones and appends a cmd op to each target", async () => {
|
||||||
const targets = [new TaskTarget("/a.txt")];
|
const targets = [new TaskTarget("/a.txt")];
|
||||||
targets[0].read();
|
targets[0].read();
|
||||||
const result = cmd(targets, "jq .");
|
const result = await cmd("jq .")(targets);
|
||||||
assert.equal(result[0].pipeline.length, 2);
|
assert.equal(result[0].pipeline.length, 2);
|
||||||
assert.equal(targets[0].pipeline.length, 1); // original unchanged
|
assert.equal(targets[0].pipeline.length, 1); // original unchanged
|
||||||
});
|
});
|
||||||
|
|
||||||
test("assignMeta: clones and sets meta on each target", () => {
|
test("assignMeta: clones and sets meta on each target", async () => {
|
||||||
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
||||||
const result = assignMeta(targets, { idValue: "myid" });
|
const result = await assignMeta({ idValue: "myid" })(targets);
|
||||||
assert.equal(result[0].id, "myid");
|
assert.equal(result[0].id, "myid");
|
||||||
assert.equal(result[1].id, "myid");
|
assert.equal(result[1].id, "myid");
|
||||||
assert.throws(() => targets[0].id); // originals have no id
|
assert.throws(() => targets[0].id); // originals have no id
|
||||||
});
|
});
|
||||||
|
|
||||||
test("taskGlob: returns matching targets across all input targets", () => {
|
test("taskGlob: returns matching targets across all input targets", async () => {
|
||||||
const targets = [new TaskTarget(FIXTURE_DIR)];
|
const targets = [new TaskTarget(FIXTURE_DIR)];
|
||||||
const result = taskGlob(targets, "friends/*.json");
|
const result = await taskGlob("friends/*.json")(targets);
|
||||||
assert.ok(result.length > 0);
|
assert.ok(result.length > 0);
|
||||||
assert.ok(result.every(r => r.path.endsWith(".json")));
|
assert.ok(result.every(r => r.path.endsWith(".json")));
|
||||||
});
|
});
|
||||||
|
|
@ -226,75 +225,3 @@ test("verify: filters a mixed list to only valid targets", async () => {
|
||||||
assert.equal(result[0], good);
|
assert.equal(result[0], good);
|
||||||
});
|
});
|
||||||
|
|
||||||
// -- TaskTargetPipelineHelper -------------------------------------------------
|
|
||||||
|
|
||||||
test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => {
|
|
||||||
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a")]);
|
|
||||||
assert.ok(p instanceof TaskTargetPipelineHelper);
|
|
||||||
});
|
|
||||||
|
|
||||||
test("TaskTargetPipelineHelper: pipeline() is idempotent", () => {
|
|
||||||
const arr = [new TaskTarget("/a")];
|
|
||||||
const p1 = TaskTargetPipelineHelper.pipeline(arr);
|
|
||||||
const p2 = TaskTargetPipelineHelper.pipeline(p1);
|
|
||||||
assert.equal(p1, p2);
|
|
||||||
});
|
|
||||||
|
|
||||||
test("TaskTargetPipelineHelper: cd returns a new helper with paths changed", () => {
|
|
||||||
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a"), new TaskTarget("/b")]);
|
|
||||||
const p2 = p.cd("sub");
|
|
||||||
assert.ok(p2 instanceof TaskTargetPipelineHelper);
|
|
||||||
assert.equal(p2[0].path, "/a/sub");
|
|
||||||
assert.equal(p2[1].path, "/b/sub");
|
|
||||||
});
|
|
||||||
|
|
||||||
test("TaskTargetPipelineHelper: read returns a new helper with read ops added", () => {
|
|
||||||
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]);
|
|
||||||
const p2 = p.read();
|
|
||||||
assert.ok(p2 instanceof TaskTargetPipelineHelper);
|
|
||||||
assert.equal(p2[0].pipeline[0].type, "read");
|
|
||||||
});
|
|
||||||
|
|
||||||
test("TaskTargetPipelineHelper: cmd returns a new helper with cmd ops added", () => {
|
|
||||||
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]);
|
|
||||||
const p2 = p.read().cmd("jq .");
|
|
||||||
assert.equal(p2[0].toShell(), "cat /a.txt | jq .");
|
|
||||||
});
|
|
||||||
|
|
||||||
// -- collect ------------------------------------------------------------------
|
|
||||||
|
|
||||||
test("collect: the final end of a chain is added to the collection set", () => {
|
|
||||||
const collection = new Set<TaskTargetPipelineHelper>();
|
|
||||||
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/foo")]);
|
|
||||||
p.collect(collection);
|
|
||||||
|
|
||||||
const p2 = p.cd("sub");
|
|
||||||
assert.equal(collection.size, 1);
|
|
||||||
assert.ok(collection.has(p2));
|
|
||||||
});
|
|
||||||
|
|
||||||
test("collect: moving the chain end removes the old element and adds the new one", () => {
|
|
||||||
const collection = new Set<TaskTargetPipelineHelper>();
|
|
||||||
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/foo")]);
|
|
||||||
p.collect(collection);
|
|
||||||
|
|
||||||
const p2 = p.cd("sub");
|
|
||||||
const p3 = p2.read();
|
|
||||||
assert.equal(collection.size, 1);
|
|
||||||
assert.ok(collection.has(p3));
|
|
||||||
assert.ok(!collection.has(p2));
|
|
||||||
});
|
|
||||||
|
|
||||||
test("collect: gathers the ends of multiple independent pipeline branches", () => {
|
|
||||||
const collection = new Set<TaskTargetPipelineHelper>();
|
|
||||||
|
|
||||||
const b1 = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a.txt")]).collect(collection).read();
|
|
||||||
const b2 = TaskTargetPipelineHelper.pipeline([new TaskTarget("/b.txt")]).collect(collection).read();
|
|
||||||
|
|
||||||
assert.equal(collection.size, 2);
|
|
||||||
assert.ok(collection.has(b1));
|
|
||||||
assert.ok(collection.has(b2));
|
|
||||||
|
|
||||||
const allTargets = [...collection].flat();
|
|
||||||
assert.equal(allTargets.length, 2);
|
|
||||||
});
|
|
||||||
|
|
|
||||||
225
timelinize.ts
Normal file
225
timelinize.ts
Normal file
|
|
@ -0,0 +1,225 @@
|
||||||
|
import { type SQLOutputValue, type DatabaseSync } from "node:sqlite";
|
||||||
|
import { createWriteStream, writeFileSync } from 'node:fs';
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import "./data-export/facebook.ts";
|
||||||
|
import { facebook } from "./data-export/facebook.ts";
|
||||||
|
import { execPaths, COLUMN_TYPES } from "./data-export/task.ts";
|
||||||
|
import * as DataIO from "./data-export/io.ts";
|
||||||
|
import {
|
||||||
|
startTime,
|
||||||
|
elapsed,
|
||||||
|
loadTaskInNewDb
|
||||||
|
} from "./main.ts";
|
||||||
|
|
||||||
|
const __filename = fileURLToPath(import.meta.url);
|
||||||
|
|
||||||
|
function dumpDBTableToCSV(db: DatabaseSync, tableName: string, outputFile: string) {
|
||||||
|
const stream = createWriteStream(outputFile);
|
||||||
|
const stmt = db.prepare(`SELECT * FROM ${tableName}`);
|
||||||
|
|
||||||
|
let headerWritten = false;
|
||||||
|
for (const row of stmt.iterate()) {
|
||||||
|
if (!headerWritten) {
|
||||||
|
stream.write(Object.keys(row).join(',') + '\n');
|
||||||
|
headerWritten = true;
|
||||||
|
}
|
||||||
|
stream.write(Object.values(row).map(v => `"${String(v ?? '').replace(/"/g, '""')}"`).join(',') + '\n');
|
||||||
|
}
|
||||||
|
|
||||||
|
stream.end();
|
||||||
|
}
|
||||||
|
function getColumnNames(db: DatabaseSync, tableName: string) {
|
||||||
|
return db.prepare(`PRAGMA table_info(${tableName})`).all().map(c => c.name) as string[];
|
||||||
|
}
|
||||||
|
function templateToSql(template: string, columns: string[]) {
|
||||||
|
// Convert '{0}, {1}' to '%s, %s'
|
||||||
|
const args: string[] = [];
|
||||||
|
const sqlTemplate = template.replace(/\{(\d+)\}/g, (match, index) => {
|
||||||
|
args.push(columns[parseInt(index)]);
|
||||||
|
return '%s';
|
||||||
|
});
|
||||||
|
return `printf('${sqlTemplate}', ${args.join(', ')})`;
|
||||||
|
}
|
||||||
|
function sqlLiteral(str: string | undefined | null): string {
|
||||||
|
if (str === null || str === undefined) {
|
||||||
|
return 'NULL';
|
||||||
|
}
|
||||||
|
|
||||||
|
// Escape single quotes by doubling them
|
||||||
|
const escaped = str.replace(/'/g, "''");
|
||||||
|
|
||||||
|
// Wrap in single quotes
|
||||||
|
return `'${escaped}'`;
|
||||||
|
}
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
// Configure the tasks to run
|
||||||
|
console.log(`${elapsed()} - Building targets`);
|
||||||
|
const targets = await execPaths([
|
||||||
|
{path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
|
||||||
|
]);
|
||||||
|
console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);
|
||||||
|
const db = await loadTaskInNewDb(targets);
|
||||||
|
|
||||||
|
// New output tables
|
||||||
|
db.exec(`CREATE TABLE combined (timestamp TEXT, description TEXT, type TEXT, sender TEXT, receiver TEXT, lat REAL, lng REAL, tags TEXT);`);
|
||||||
|
|
||||||
|
//(message, email, note,
|
||||||
|
// social, location, media, event, document,
|
||||||
|
// bookmark; defaults to note)
|
||||||
|
|
||||||
|
type ColumnMetaType = (keyof typeof COLUMN_TYPES);
|
||||||
|
interface MetadataRow {
|
||||||
|
id: string,
|
||||||
|
perRowDescription?: string,
|
||||||
|
perRowTags?: string,
|
||||||
|
columnMeta: ColumnMetaType[],
|
||||||
|
columnNames: string[],
|
||||||
|
metaId?: string
|
||||||
|
}
|
||||||
|
/**
 * Validates one raw row from `base_data_manager_metadata` and narrows it
 * into a typed MetadataRow, or returns undefined (with a console message)
 * when any field fails validation. Closes over `db` to look up the table's
 * real column names so `columnMeta` can be length-checked against them.
 * NOTE(review): name has a typo ("Metdata") — kept as-is since call sites
 * use it.
 */
function verifyMetdataRow(input: Record<string, SQLOutputValue>): undefined | MetadataRow {
  const { id, perRowDescription, perRowTags, columnMeta: columnMetaCSV, metaId } = input;
  // id doubles as the SQLite table name; without it there is nothing to join
  if (!id) {
    console.error("Row did not have id/tableName, skipping");
    return undefined;
  }
  if (typeof id !== "string") {
    console.error(`Id must be string, got ${typeof id}, ${id}`);
    return undefined;
  }
  // columnMeta is stored as a CSV string of COLUMN_TYPES keys
  if (!columnMetaCSV) {
    console.warn(`${id} did not have columnMeta, nothing to do. Skipping`);
    return undefined; // No column information
  }
  if (typeof columnMetaCSV !== "string") {
    console.warn(`${id} did not have columnMeta of type string. Skipping`);
    return undefined;
  }
  const columnMeta = columnMetaCSV.split(",") as ColumnMetaType[];

  // Get the column names from the table id
  const columnNames = getColumnNames(db, id);
  // columnMeta is positional, so it must line up one-to-one with the columns
  if (columnNames.length !== columnMeta.length) {
    console.error(`columnNames and columnMeta did not have same length. skipping`);
    return undefined;
  }

  // The remaining fields are optional, but when present must be strings
  if (typeof perRowDescription !== "string" && perRowDescription !== undefined && perRowDescription !== null) {
    console.warn(`Invalid typeof perRowDescription, was ${typeof perRowDescription}, value ${perRowDescription}`);
    return undefined;
  }
  if (typeof perRowTags !== "string" && perRowTags !== undefined && perRowTags !== null) {
    console.warn(`Invalid typeof perRowTags, was ${typeof perRowTags}, value ${perRowTags}`);
    return undefined;
  }
  if (typeof metaId !== "string" && metaId !== undefined && metaId !== null) {
    console.warn(`Invalid typeof metaId, was ${typeof metaId}, value ${metaId}`);
    return undefined;
  }

  // Normalize SQL NULLs to undefined so the optional fields have one shape
  return {
    id,
    perRowDescription: perRowDescription ?? undefined,
    perRowTags: perRowTags ?? undefined,
    columnMeta,
    columnNames,
    metaId: metaId ?? undefined
  };
}
|
||||||
|
|
||||||
|
/**Maps columnMeta names to the column names*/
|
||||||
|
function metaToNames(meta: MetadataRow): Partial<Record<ColumnMetaType, string>> {
|
||||||
|
const out: Partial<Record<ColumnMetaType, string>> = {};
|
||||||
|
for (const [idx, name] of meta.columnNames.entries()) {
|
||||||
|
const metaName = meta.columnMeta[idx];
|
||||||
|
if (out[metaName]) {
|
||||||
|
console.warn(`Duplicate column with metaName "${metaName}". The current one which will be used is "${out[metaName]}". Skipping the duplicate.`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
out[metaName] = name;
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
function metaParts(metaNameToColumnName: Partial<Record<ColumnMetaType, string>>): Record<ColumnMetaType, string> {
|
||||||
|
const out: Record<ColumnMetaType, string> = {} as any;
|
||||||
|
for (const type of Object.keys(COLUMN_TYPES) as ColumnMetaType[]) {
|
||||||
|
if (!metaNameToColumnName[type]) {
|
||||||
|
out[type] = "NULL";
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Wrap in brackets so column names like "from" don't cause any issues
|
||||||
|
out[type] = `[${metaNameToColumnName[type]}]`
|
||||||
|
}
|
||||||
|
return out;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Iterate over all the tables and their metadata
|
||||||
|
const statement = db.prepare(`SELECT id, perRowDescription, perRowTags, columnMeta, metaId FROM base_data_manager_metadata`);
|
||||||
|
for (const row of statement.iterate()) {
|
||||||
|
const verified = verifyMetdataRow(row);
|
||||||
|
if (!verified) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const { id, perRowDescription, perRowTags, columnMeta, columnNames, metaId } = verified;
|
||||||
|
const metaNameToColumnName = metaToNames(verified);
|
||||||
|
const part = metaParts(metaNameToColumnName);
|
||||||
|
|
||||||
|
// Now find what to insert into each row of the combined
|
||||||
|
// Per row tags is an string of csv'd items but needs to be made a literal
|
||||||
|
// TODO: Make this either a template string or have jq do something
|
||||||
|
// tagsPart = templateToSqlExpr(target.perRowTags, columnNames);
|
||||||
|
const tagsPart = sqlLiteral(perRowTags);
|
||||||
|
|
||||||
|
// Choose what to do with this table based on what meta is present
|
||||||
|
if (
|
||||||
|
!!metaNameToColumnName.sender
|
||||||
|
&& !!metaNameToColumnName.isodatetime
|
||||||
|
) {
|
||||||
|
if (!metaId) {
|
||||||
|
console.warn(`Chat ${id} with .sender but no .metaId. Skipping`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// First pull the name of the conversation out of the metaId
|
||||||
|
const receiverThreadTitle = db.prepare(`SELECT title FROM ${metaId} WHERE (id=${sqlLiteral(id)})`).get()?.title;
|
||||||
|
if (!receiverThreadTitle || typeof receiverThreadTitle !== "string") {
|
||||||
|
console.warn(`Chat ${id} with .metaId ${metaId} returned invalid receiverThreadTitle ${typeof receiverThreadTitle}. Skipping`);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const receiverPart = sqlLiteral(receiverThreadTitle);
|
||||||
|
|
||||||
|
// Put this table into the combined table
|
||||||
|
db.exec(`INSERT INTO combined SELECT ${part.isodatetime}, ${part.text}, 'message', ${part.sender}, ${receiverPart}, ${part.lat}, ${part.lng}, ${tagsPart} FROM ${id};`);
|
||||||
|
}
|
||||||
|
else if (!!metaNameToColumnName.isodatetime) {
|
||||||
|
// Put this table into the combined table
|
||||||
|
let descriptionPart = perRowDescription
|
||||||
|
? templateToSql(perRowDescription, columnNames)
|
||||||
|
: `'An entry from the ${id} table'`; // Default is just kinda garbo...
|
||||||
|
db.exec(`INSERT INTO combined SELECT ${part.isodatetime}, ${descriptionPart}, 'node', NULL, NULL, ${part.lat}, ${part.lng}, ${tagsPart} FROM ${id};`);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
console.warn(`Table with id ${id} had no isodatetime or anything else of value, skipping...`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const count = db.prepare(`SELECT COUNT(*) as count FROM combined`).get()?.count;
|
||||||
|
console.log(`${elapsed()} - Combined database built with ${count} rows`);
|
||||||
|
|
||||||
|
// Dump it to the disk for debugging
|
||||||
|
const sqlitePath = "debug_your.csv.db";
|
||||||
|
console.log(`${elapsed()} - Writing database to disk at "${sqlitePath}"`);
|
||||||
|
await DataIO.dumpDBToDisk(db, sqlitePath);
|
||||||
|
|
||||||
|
console.log(`${elapsed()} - Database written to disk`);
|
||||||
|
|
||||||
|
// Dump it all to the path specified
|
||||||
|
dumpDBTableToCSV(db, "combined", "your.csv");
|
||||||
|
console.log(`${elapsed()} - Combined database written to disk as CSV`);
|
||||||
|
db.close();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (process.argv[1] === __filename) {
|
||||||
|
main();
|
||||||
|
}
|
||||||
|
|
||||||
123
util/scrub.jq
123
util/scrub.jq
|
|
@ -3,46 +3,89 @@
|
||||||
# fd -t f .json -0 | xargs -I % -0 -- jq -f scrub.jq "%" > "%"
|
# fd -t f .json -0 | xargs -I % -0 -- jq -f scrub.jq "%" > "%"
|
||||||
# (Though you should remove the end `> "%"` first to get just the output without
|
# (Though you should remove the end `> "%"` first to get just the output without
|
||||||
# persisting to be sure it's what you want first)
|
# persisting to be sure it's what you want first)
|
||||||
def scrub:
|
|
||||||
walk(
|
def scrub_key:
|
||||||
if type == "string" then
|
if test("^[0-9]+$") then
|
||||||
if test("^(([0-9]{1,3}\\.){3}[0-9]{1,3})$") then
|
("1" * length)
|
||||||
"1.1.1.1"
|
else
|
||||||
elif test("^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$") then
|
.
|
||||||
"2000:0000:0000:0000:0000:0000:0000:0000"
|
end;
|
||||||
elif test("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$") then
|
|
||||||
"not_a_real_email@example.com"
|
def scrub_primitive:
|
||||||
elif test("\\.(jpg|jpeg|png|gif|bmp|webp|svg|ico|tiff|mp3|wav|flac|aac|ogg|wma|m4a|mp4|avi|mkv|mov|wmv|flv|webm)$"; "i") then
|
if type == "string" then
|
||||||
# Leave these alone, you will have to manually go through these later and replace with
|
if test("^(([0-9]{1,3}\\.){3}[0-9]{1,3})$") then
|
||||||
# placeholders
|
# IPv4
|
||||||
# TODO: jq 1.7 adds debug(), use this instead when I can upgrade jq, otherwise
|
"1.1.1.1"
|
||||||
# you need to manually grep for MANUAL REPAIR NEEDED for now
|
elif test("^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$") then
|
||||||
("MANUAL REPAIR NEEDED: \(.)" | stderr) | .
|
# IPv6
|
||||||
elif test("://") then
|
"2000:0000:0000:0000:0000:0000:0000:0000"
|
||||||
"url://somewhere"
|
elif test("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$") then
|
||||||
elif test("/") then
|
# Email-like
|
||||||
"some/path"
|
"not_a_real_email@example.com"
|
||||||
else
|
elif test("\\.(jpg|jpeg|png|gif|bmp|webp|svg|ico|tiff|mp3|wav|flac|aac|ogg|wma|m4a|mp4|avi|mkv|mov|wmv|flv|webm)$"; "i") then
|
||||||
"xxx"
|
# Leave these alone, you will have to manually go through these later and replace with
|
||||||
end
|
# placeholders
|
||||||
elif type == "number" then
|
# TODO: jq 1.7 adds debug(), use this instead when I can upgrade jq, otherwise
|
||||||
if 946702800 <= . and . <= 1893474000 then
|
# you need to manually grep for MANUAL REPAIR NEEDED for now
|
||||||
# Take modulo 1 year to get variance in the output, then add offset to bring to ~2024
|
("MANUAL REPAIR NEEDED: \(.)" | stderr) | .
|
||||||
((((. % 31557600) + 1704067200) / 5000 | floor) * 5000)
|
elif test("://") then
|
||||||
else
|
"url://somewhere"
|
||||||
69
|
elif test("/") then
|
||||||
end
|
"some/path"
|
||||||
elif type == "array" then
|
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}[+\\-][0-9]{2}:[0-9]{2}$") then
|
||||||
# Keep only 2 elements, but scrub *those* elements
|
# iso date time without millis with timezone
|
||||||
if length > 1 then
|
"2020-04-13T10:09:08+00:00"
|
||||||
[ (.[0] | scrub), (.[1] | scrub) ]
|
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?[+\\-][0-9]{2}:[0-9]{2}$") then
|
||||||
elif length > 0 then
|
# iso date time with millis with timezone
|
||||||
[ (.[0] | scrub) ]
|
"2020-04-13T10:09:08.000000+00:00"
|
||||||
else
|
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} UTC") then
|
||||||
[]
|
# Date format from snapchat export
|
||||||
end
|
"2020-04-13 10:09:08 UTC"
|
||||||
|
elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}") then
|
||||||
|
# Date format from snapchat export
|
||||||
|
"2020-04-13 10:09:08"
|
||||||
|
elif test("^[0-9]+$") then
|
||||||
|
# preserve length of the string
|
||||||
|
"1" * length
|
||||||
|
elif test("^[0-9a-fA-F]+$") then #hexadecimal string
|
||||||
|
# repeat the hex pattern and truncate to original length
|
||||||
|
("a1" * length)[:length]
|
||||||
|
elif . == "" then
|
||||||
|
# prevents empty string from just returning null instead of empty string
|
||||||
|
""
|
||||||
else
|
else
|
||||||
.
|
# Preserve string length for other strings
|
||||||
|
"x" * length
|
||||||
end
|
end
|
||||||
);
|
elif type == "number" then
|
||||||
|
if 946702800 <= . and . <= 1893474000 then
|
||||||
|
# Take modulo 1 year to get variance in the output, then add offset to bring to ~2024
|
||||||
|
((((. % 31557600) + 1704067200) / 5000 | floor) * 5000)
|
||||||
|
elif . == (. | floor) then
|
||||||
|
# Integer - preserve digit count
|
||||||
|
(tostring | length) as $len | ("1" * $len) | tonumber
|
||||||
|
else
|
||||||
|
8.08
|
||||||
|
end
|
||||||
|
elif type == "boolean" then
|
||||||
|
# Replace all booleans with false, this can give sensative info away based
|
||||||
|
# on what the key was in the data
|
||||||
|
false
|
||||||
|
else
|
||||||
|
.
|
||||||
|
end;
|
||||||
|
|
||||||
|
def scrub:
|
||||||
|
if type == "object" then
|
||||||
|
# Apply scrubbing to both keys and values
|
||||||
|
with_entries(.key |= scrub_key | .value |= scrub)
|
||||||
|
elif type == "array" then
|
||||||
|
# Keep only 2 elements, but scrub *those* elements
|
||||||
|
.[:2] | map(scrub)
|
||||||
|
else
|
||||||
|
# Scrub a primitive value
|
||||||
|
scrub_primitive
|
||||||
|
end;
|
||||||
|
|
||||||
|
# Call scrub
|
||||||
scrub
|
scrub
|
||||||
|
|
@ -27,9 +27,6 @@ assert(targetDir, "Usage: ./scrub.ts <directory>");
|
||||||
|
|
||||||
const targetPath = path.resolve(targetDir);
|
const targetPath = path.resolve(targetDir);
|
||||||
|
|
||||||
// const stat = await fs.stat(targetPath);
|
|
||||||
// assert(stat.isDirectory(), "");
|
|
||||||
|
|
||||||
const [notADir] = await ptry($`test -d ${targetPath}`);
|
const [notADir] = await ptry($`test -d ${targetPath}`);
|
||||||
assert(!notADir, `Error: '${targetPath}' is not a directory`);
|
assert(!notADir, `Error: '${targetPath}' is not a directory`);
|
||||||
|
|
||||||
|
|
@ -49,12 +46,16 @@ console.log("filePaths", filePaths);
|
||||||
for (const file of filePaths) {
|
for (const file of filePaths) {
|
||||||
console.log(`Processing: ${file}`);
|
console.log(`Processing: ${file}`);
|
||||||
const tmpFile = `${file}.tmp`;
|
const tmpFile = `${file}.tmp`;
|
||||||
|
const piiFile = `${file}.DELETE-THIS-HAS-PII`;
|
||||||
|
|
||||||
const [jqErr] = await ptry($`jq -f ${scrubJq} ${file} > ${tmpFile}`);
|
const [jqErr] = await ptry($`jq -f ${scrubJq} ${file} > ${tmpFile}`);
|
||||||
assert(!jqErr, `Error processing ${file}: ${jqErr}`);
|
assert(!jqErr, `Error processing ${file}: ${jqErr}`);
|
||||||
|
|
||||||
const [mvErr] = await ptry($`mv ${tmpFile} ${file}`);
|
const [mvErr] = await ptry($`mv ${file} ${piiFile}`);
|
||||||
assert(!mvErr, `Error moving ${tmpFile} to ${file}: ${mvErr}`);
|
assert(!mvErr, `Error moving ${file} to ${piiFile}: ${mvErr}`);
|
||||||
|
|
||||||
|
const [mv2Err] = await ptry($`mv ${tmpFile} ${file}`);
|
||||||
|
assert(!mv2Err, `Error moving ${tmpFile} to ${file}: ${mv2Err}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
console.log();
|
console.log();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue