diff --git a/data-export/facebook.ts b/data-export/facebook.ts index 9b77423..cb0d656 100644 --- a/data-export/facebook.ts +++ b/data-export/facebook.ts @@ -168,7 +168,7 @@ function facebook_comments_generic(this: TaskTargetPipelineHelper, prop: string) return this.cmd(["jq", "-r", ` ["timestamp","data", "title"], ( - .comments[]? + .${prop}[]? | [(.timestamp | todateiso8601), "TODO", .title] ) | @csv @@ -269,7 +269,7 @@ function facebook_admin_records_generic(this: TaskTargetPipelineHelper, prop: st return this.cmd(["jq", "-r", ` ["event","created_timestamp","ip_address","user_agent","datr_cookie"], ( - .admin_records[] + .${prop}[] | [.event, (.session.created_timestamp | todateiso8601), .ip_address, .user_agent, .datr_cookie] ) | @csv @@ -301,10 +301,10 @@ function facebook_authorized_logins_v2(this: TaskTargetPipelineHelper) { } function facebook_contact_verification_generic(this: TaskTargetPipelineHelper, prop: string) { return this.cmd(["jq", "-r", ` - ["action", "timestamp", "site", "ip_address"], + ["timestamp", "email", "contact_type"], ( .${prop}[] - | [.action, (.timestamp | todateiso8601), .site, .ip_address] + | [(.verification_time | todateiso8601), .contact, .contact_type] ) | @csv `]) @@ -399,7 +399,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) { // No correlary for your_off-facebook_activity.json p.collect(col).cd(`apps_and_websites_off_of_facebook/connected_apps_and_websites.json`).read().facebook_installed_apps_v2(); p.collect(col).cd(`your_facebook_activity/comments_and_reactions/comments.json`).read().facebook_comments_v2(); - p.collect(col).glob(`your_facebook_activity/messages/**/*.json`) // Files are message_1.json, etc + p.collect(col).glob(`your_facebook_activity/messages/*/**/*.json`) // Messages files are in the FOLDERS inside messages (archived_threads, e2ee_cutover, etc...) .setId(t=>`Facebookv2 - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name .read() .facebook_messages_generic() @@ -438,7 +438,8 @@ function facebook_v2(this: TaskTargetPipelineHelper) { p.collect(col).cd(`your_facebook_activity/facebook_marketplace/items_sold.json`).read().facebook_marketplace_items_sold_v2() - return Array.from(col); + const final = Array.from(col).flat(); + return TaskTargetPipelineHelper.pipeline(final); } function facebook(this: TaskTargetPipelineHelper){ @@ -606,7 +607,7 @@ function facebook(this: TaskTargetPipelineHelper){ p.collect(col).cd(`marketplace/items_sold.json`).read().facebook_marketplace_items_sold_v1() - p.collect(col).cd(`messages/**/*.json`) // Files are message_1.json, etc + p.collect(col).glob(`messages/**/*.json`) // Files are message_1.json, etc .setId(t=>`Facebook - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name .read() .facebook_messages_generic() @@ -780,6 +781,7 @@ function facebook(this: TaskTargetPipelineHelper){ // `${facebookRoot}/your_places` - no data in my export // `${facebookRoot}/your_topics` - no data in my export - return Array.from(col); + const final = Array.from(col).flat(); + return TaskTargetPipelineHelper.pipeline(final); }; diff --git a/data-export/parallel.ts b/data-export/parallel.ts new file mode 100644 index 0000000..27add5c --- /dev/null +++ b/data-export/parallel.ts @@ -0,0 +1,86 @@ +import { $, type ProcessOutput } from 'zx'; +import os from 'os'; +import { type TaskTarget, run } from "./task.ts"; + +$.verbose = false; + +type ResultMap = Map; + +export async function parallel( + targets: TaskTarget[], + quiet: boolean = false, + maxConcurrency: number = os.cpus().length +): Promise { + const results = new Map(); + + const total = targets.length; + let completed = 0; + let running = 0; + const completionTimes: number[] = []; + const startTime = Date.now(); + + const inFlight = new Set>(); + + function formatEta(): string { + const left = total - completed; + const avgSeconds = completionTimes.length > 0 + ? completionTimes.reduce((a, b) => a + b, 0) / completionTimes.length / 1000 + : 0; + const etaSeconds = Math.round(left * avgSeconds); + const pct = total > 0 ? Math.round((completed / total) * 100) : 100; + const lastDuration = completionTimes.length > 0 + ? (completionTimes[completionTimes.length - 1] / 1000).toFixed(1) + : '0.0'; + + return `ETA: ${etaSeconds}s Left: ${left} AVG: ${avgSeconds.toFixed(2)}s local:${running}/${completed}/${pct}%/${lastDuration}s`; + } + + function printStatus(): void { + if (quiet) { + return; + } + process.stderr.write(`\r${formatEta()}`.padEnd(80)); + } + + async function runJob(t: TaskTarget): Promise { + running++; + printStatus(); + + const result = await run(t); + completionTimes.push(result.duration); + + results.set(t.id, result); + + running--; + completed++; + printStatus(); + } + + const queue = targets.slice(); + // Process queue with concurrency limit + while (queue.length > 0 || inFlight.size > 0) { + // Fill up to max concurrency + while (queue.length > 0 && inFlight.size < maxConcurrency) { + const target = queue.shift()!; + const promise = runJob(target).then(() => { + inFlight.delete(promise); + }); + inFlight.add(promise); + } + + // Wait for at least one to complete if at capacity + if (inFlight.size > 0) { + await Promise.race(inFlight); + } + } + + // Final status line + process.stderr.write('\n'); + const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1); + const failed = Array.from(results.values().filter(p => !p.ok)); + process.stderr.write( + `\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n` + ); + + return results; +} diff --git a/data-export/task-before-functional.ts.old b/data-export/task-before-functional.ts.old deleted file mode 100644 index 225cac9..0000000 --- a/data-export/task-before-functional.ts.old +++ /dev/null @@ -1,352 +0,0 @@ -import nodePath from 'node:path'; -import fs from 'node:fs'; -import { strict as assert } from "node:assert"; -import { execFile as _execFile } from "node:child_process"; -import { promisify } from "node:util"; -import { ZipFS } from "./zipFs.ts"; -import { globSync } from "glob"; - -const execFile = promisify(_execFile); - -type FSImpl = { - isZip?: boolean; - zipPath?: string; - init?(): Promise; - ready?: boolean; - - statSync: typeof fs["statSync"]; - existsSync: typeof fs["existsSync"]; - - // Required by glob - lstatSync: typeof fs["lstatSync"]; - // Needs to include withFileTypes DirEnt variant - readdir: typeof fs["readdir"]; - readdirSync: typeof fs["readdirSync"]; - readlinkSync: typeof fs["readlinkSync"]; - realpathSync: typeof fs["realpathSync"]; - promises: { - lstat: typeof fs.promises["lstat"]; - // Needs to include withFileTypes DirEnt - readdir: typeof fs.promises["readdir"]; - readlink: typeof fs.promises["readlink"]; - realpath: typeof fs.promises["realpath"]; - } -}; -const defaultFSImpl = fs; - -function safe(s: string) { - return s.replace(/[^a-zA-Z0-9_]/g, '_'); -} - - -//TODO: DANGER: I doubt this is safe... -function shEscape(s: string) { - assert(!s.includes("\n"), "shEscape given new line, caller needs to handle these"); - if (!s.match(/[ \$\"\'\!]/)) { - return s; - } - // We need to quote this string - // Single quoted strings require you to close the single quoted string, then - // use the escaped single quote, and then reopen the string... obscene - s = s.replace(/'/g, "'\\''"); - s = `'${s}'`; - return s; -} - -abstract class TaskTargetBase { - target: TaskTarget; - constructor(target: TaskTarget) { - this.target = target; - } - abstract get type(): "read" | "mid"; - abstract toShell(): string; -} -class TaskTargetRead extends TaskTargetBase { - get type(){ return "read" as const; } - toShell() { - if (this.target.fsImpl.isZip) { - assert(this.target.fsImpl.zipPath, "Should have a zipPath"); - // We need to be able to do this - return `7z x ${shEscape(this.target.fsImpl.zipPath)} -so ${shEscape(this.target.path)}`; - } - - // TODO : Implement when reading from a zip file - return `cat ${shEscape(this.target.path)}`; - } -} -class TaskTargetCmd extends TaskTargetBase { - get type(){ return "mid" as const; } - /**What nodejs spawn() and execFile() take - * [cmd, ...args]: string[] - */ - cmd: string[]; - static parse(target: TaskTarget, v: string | string[] | ((t: TaskTarget)=>string) | ((t: TaskTarget)=>string[])): string[] { - if (typeof v === "function") { - v = v(target); - } - if (typeof v === "string") { - v = v.split(/\s+/); - } - return v; - } - constructor(target: TaskTarget, cmd: string | string[] | ((t: TaskTarget)=>string) | ((t: TaskTarget)=>string[])) { - super(target); - this.cmd = TaskTargetCmd.parse(target, cmd); - } - toShell() { - const out = this.cmd - .map(c => { - let sh = c.replace(/\n/g, "") - return shEscape(sh); - }); - - return out.join(" "); - } -} - - - -class TaskTarget { - path: string; - fsImpl: FSImpl = defaultFSImpl; - pipeline: TaskTargetBase[]; - idValue: string | ((t: TaskTarget)=>string) | undefined; - postFns: ((t: TaskTarget)=>Promise)[]; - - constructor(path: string){ - this.path = path; - this.pipeline = []; - this.postFns = []; - } - - exists() { - return this.fsImpl.existsSync(this.path); - } - - _joinPath(path: string) { - let finalPath = path; - if (!path.startsWith('/')) { - finalPath = nodePath.join(this.path, path) - } - return finalPath; - } - - get basename() { - return safe(nodePath.basename(this.path)); - } - basenameN(n: number) { - return this.path - .split("/") - .map(s => safe(s)) - .slice(-n) - .join("___"); - } - - get id() { - assert(this.idValue, `TaskTarget for path "${this.path}" must have an id`); - if (typeof this.idValue === "function") { - return safe(this.idValue(this)); - } - return safe(this.idValue); - } - - /**Changes the current directory of the target*/ - cd(path: string) { - this.path = this._joinPath(path); - } - - /**Get a glob off of the target*/ - glob(globPath: string) { - globPath = this._joinPath(globPath); - return globSync(globPath, { - cwd: '/DUMMYCWD', - fs: this.fsImpl - }); - } - - clone() { - const t = new TaskTarget(this.path); - t.fsImpl = this.fsImpl; - t.idValue = typeof this.idValue === "function" ? this.idValue : undefined; - t.postFns = t.postFns.slice(); - //TODO: clone pipeline - return t; - } - - pushToPipeline(v: TaskTargetBase) { - if (v.type === "read") { - assert(this.pipeline.length === 0, "A read can only be the first item in a pipeline"); - } - - this.pipeline.push(v); - } - - pushPostFn(fn: ((t: TaskTarget)=>Promise)) { - this.postFns.push(fn); - } -} - -/**A very composable object*/ -export class Task { - /**A serial pipeline of Streams*/ - targets: TaskTarget[]; - - /**SHARED list of all tasks for this given tree*/ - tasks: Task[]; - - constructor() { - this.tasks = []; - this.targets = [new TaskTarget(process.cwd())]; - } - - cd(path: string) { - for (const t of this.targets) { - // TODO: opts - t.cd(path); - } - return this; - } - - /**Globs for all the paths that match under all targets*/ - glob(globPath: string) { - // For every target, concat glob onto it, glob, and then - // replace the original set of targets with all the new ones - const newTargets: TaskTarget[] = []; - for (const t of this.targets) { - const matches = t.glob(globPath); - for (const m of matches) { - const newT = t.clone(); - newT.path = m; - newTargets.push(newT); - } - } - this.targets = newTargets; - return this; - } - - /**Opens all targets as zip archives*/ - async zip() { - for (const t of this.targets) { - const zfs = new ZipFS(t.path); - await zfs.init(); - t.path = ""; // Each target is now rooted at the base of its respective zip - t.fsImpl = zfs.getImpl() as any; - } - return this; - } - - /**Returns a copy of ourself*/ - clone() { - const t = new Task(); - t.targets = this.targets.map(t => t.clone()); - t.tasks = this.tasks; //SHARED object reference - return t; - } - - /**Returns a copy of ourself, but adds us to this tree's shared - * task list as well*/ - fork() { - const c = this.clone(); - this.tasks.push(c); - return c; - } - - cmd(cmd: string | string[] | ((target: TaskTarget)=>string) | ((target: TaskTarget)=>string[])) { - for (const t of this.targets) { - t.pushToPipeline(new TaskTargetCmd(t, cmd)); - } - return this; - } - read() { - for (const t of this.targets) { - t.pushToPipeline(new TaskTargetRead(t)); - } - return this; - } - setId(idValue: string | ((t: TaskTarget)=>string)) { - for (const t of this.targets) { - t.idValue = idValue; - } - return this; - } - post(fn: any) { - for (const t of this.targets) { - t.pushPostFn(fn); - } - } - types( - types: string[] - ) { - // TODO: - return this; - } - csvSink( - summarization?: [string, string][] - ) { - // Ingest this csv into the database at the given id - // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]); - // Add a post processing function for these targets that prints out the summarization - // stats - this.post(async (t: TaskTarget)=>{ - // We only do the first one so far for the summarization - let queryLine: string; - let formatFn: (r: any)=>string; - const [columnName, type] = summarization?.[0] ?? [undefined, undefined]; - if (type === "numeric") { - queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`; - formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`; - } - else { - queryLine = `count(*) as n`; - formatFn = (r: any)=>`${r.n} rows for ${t.id}`; - } - - const cmd = "sqlite-utils"; - const args = ["query", "your.db", `select ${queryLine} from ${t.id}`] - const { stdout, stderr } = await execFile(cmd, args); - const results = JSON.parse(stdout); - const result = results[0]; // should only be one result in the array for this type of query - const logLine = formatFn(result); - (t as any).log = logLine; - }); - - return this; - } - - /**Collect all the TaskTargets, make sure everything is init'd and exists - * and output the targets for processing*/ - async getFinalTargets() { - const targets: TaskTarget[] = []; - for (const task of this.tasks) { - for (const t of task.targets) { - // Make sure fsImpl is ready - if ("ready" in t.fsImpl && !t.fsImpl.ready && t.fsImpl.init) { - await t.fsImpl.init(); - } - if (t.pipeline.length <= 0) { - continue; // Tasks with empty pipelines are no-ops, remove - } - if (!t.exists()) { - console.warn(`Missing target ${t.path}`); - continue; - } - - targets.push(t); - } - } - return targets; - } - - async getTaskTSVShell() { - const targets = await this.getFinalTargets(); - let out: string[] = []; - for (const t of targets) { - const shell = t.pipeline - .map(p => p.toShell()) - .join(" | ") - out.push(`${t.id}\t${shell}`); - } - - return out.join("\n"); - } -} diff --git a/data-export/task.ts b/data-export/task.ts index 797b477..df884b8 100644 --- a/data-export/task.ts +++ b/data-export/task.ts @@ -3,7 +3,7 @@ import fs from 'node:fs'; import { strict as assert } from "node:assert"; import { ZipFS } from "./zipFs.ts"; import { globSync } from "glob"; -import { $ } from "zx"; +import { $, ProcessPromise, quote } from "zx"; type FSImpl = { isZip?: boolean; @@ -36,20 +36,6 @@ function safe(s: string) { } -//TODO: DANGER: I doubt this is safe... -function shEscape(s: string) { - assert(!s.includes("\n"), "shEscape given new line, caller needs to handle these"); - if (!s.match(/[ \$\"\'\!]/)) { - return s; - } - // We need to quote this string - // Single quoted strings require you to close the single quoted string, then - // use the escaped single quote, and then reopen the string... obscene - s = s.replace(/'/g, "'\\''"); - s = `'${s}'`; - return s; -} - interface TaskTargetOp { type: "read" | "mid"; toShell(target: TaskTarget): string; @@ -61,11 +47,11 @@ class TaskTargetRead implements TaskTargetOp { if (target.fsImpl.isZip) { assert(target.fsImpl.zipPath, "Should have a zipPath"); // We need to be able to do this - return `7z x ${shEscape(target.fsImpl.zipPath)} -so ${shEscape(target.path)}`; + return `7z x ${quote(target.fsImpl.zipPath)} -so ${quote(target.path)}`; } // TODO : Implement when reading from a zip file - return `cat ${shEscape(target.path)}`; + return `cat ${quote(target.path)}`; } clone() { return new TaskTargetRead(); @@ -96,7 +82,7 @@ class TaskTargetCmd implements TaskTargetOp { const out = parsedCmd .map(c => { let sh = c.replace(/\n/g, "") - return shEscape(sh); + return quote(sh); }); return out.join(" "); @@ -332,6 +318,16 @@ export function getTSVManifest(targets: TaskTarget[]): string { return out.join("\n"); } +export function getTaskManifest(targets: TaskTarget[]): [string, string][] { + let out: [string, string][] = []; + for (const t of targets) { + const shell = t.toShell(); + out.push([t.id, shell] as const); + } + + return out; +} + function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) { if (!a.__collection) { return; @@ -408,25 +404,7 @@ export class TaskTargetPipelineHelper extends Array { } } -export async function parallel(targets: TaskTarget[]) { - const finalTargets = await verify(targets); - const manifestTSV = getTSVManifest(finalTargets); - - try { - await $({ input: manifestTSV })`/usr/bin/parallel \ - --colsep ${'\t'} \ - --jobs 0 \ - --linebuffer \ - --tagstring {1} \ - --eta \ - --joblog out.manifest \ - ${'bash -c {2} > OUTTEST/{1}.csv'} \ - ::::- `; // stdin is in manifestTSV - } - catch(err: any) { - // I'm pretty sure status is the amount that failed? - if (err?.status >= 30) { - throw err; - } - } +export async function run(target: TaskTarget): Promise { + const command = target.toShell(); + return await $({ nothrow: true })`bash -c ${command}`; } \ No newline at end of file diff --git a/package.json b/package.json index b16957c..e0ed113 100644 --- a/package.json +++ b/package.json @@ -5,7 +5,9 @@ "main": "index.js", "type": "module", "scripts": { - "test": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/util.ts", + "test": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/task.ts", + "test2": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/facebook.ts", + "test-update-snapshots": "node --enable-source-maps --test --experimental-transform-types --no-warnings --test-update-snapshots ./test/facebook.ts", "dev": "vite --port 2223", "server": "node --experimental-transform-types server/server.ts", "prototype": "node --import ./util/tsx-loader.js --import ./util/ignore-css-loader.js --experimental-transform-types server/prototype.ts" diff --git a/test/facebook.ts b/test/facebook.ts index 44c2ba2..9835c91 100644 --- a/test/facebook.ts +++ b/test/facebook.ts @@ -1,115 +1,73 @@ import test from "node:test"; -import fs from "node:fs"; -import assert from "node:assert"; +import nodePath from "node:path"; +import { strict as assert } from "node:assert"; import { finished } from "node:stream/promises"; import { Readable, Writable } from "node:stream"; -import { TaskTargetPipelineHelper } from "../data-export/task.ts"; +import { TaskTargetPipelineHelper, TaskTarget, verify, getTSVManifest, getTaskManifest, run } from "../data-export/task.ts"; +import { parallel } from "../data-export/parallel.ts"; +import "../data-export/facebook.ts"; -test("facebook: Can load the 2021 export", async () => { - // TODO: - // const t = new Task(); - // (await t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2(); - // const taskText = await t.getTaskTSVShell(); - // await fs.writeFile('test.manifest', taskText); - // // Run everything with parallel - // try { - // execFileSync('/usr/bin/parallel', ['--colsep', '\t', '--jobs', '0', '--linebuffer', '--tagstring', '{1}', '--eta', '--joblog', 'out.manifest', 'bash -c {2} > OUTTEST/{1}.csv', '::::', 'test.manifest'], { - // stdio: 'inherit' - // }); - // } - // catch(err: any) { - // // I'm pretty sure status is the amount that failed? - // if (err?.status >= 30) { - // throw err; - // } - // } +const THIS_FILE = import.meta.dirname; +const FACEBOOK_V1_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01'); +const FACEBOOK_V1_ZIPPED = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01.zip'); +const FACEBOOK_V2_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2025-11-29'); - // // Now take the output and load it all into a single SQLITE file - // const entries = await fs.readdir('OUTTEST', { withFileTypes: true }); - // const csvFiles = entries - // .filter(e => e.isFile() && e.name.endsWith(".csv")) - // .map(e => nodePath.join('OUTTEST', e.name)); +test("facebook: Can load the 2021 export", async (t) => { + const targets = TaskTargetPipelineHelper.pipeline([ + new TaskTarget(FACEBOOK_V1_DIR) + ]) + .facebook(); + + const finalTargets = await verify(targets); + const result = await parallel(finalTargets, true); + for (const [id, r] of result.entries()) { + assert.ok(!r.stderr, `Task ${id} should have no stderr output`); + assert.ok(r.ok, `Task ${id} should be okay`); + } + + const allCSV = Array.from(result.entries()) + .sort() // Keep stable ordering for snapshots + .map(([id, r]) => r.stdout); + + t.assert.snapshot(allCSV); }); +test("facebook: Can load the 2021 export zipped", async (t) => { + const targets = await TaskTargetPipelineHelper.pipeline([ + new TaskTarget(FACEBOOK_V1_ZIPPED) + ]) + .unzip(); + const targets2 = targets + .facebook(); -// import fs from 'node:fs/promises'; -// import { type SpawnOptions, execFile as _execFile, execFileSync } from "node:child_process"; -// import nodePath from "node:path"; -// import { DatabaseSync } from "node:sqlite"; -// import { promisify } from "node:util"; -// import "../data-export/facebook.ts"; -// import { google } from "../data-export/google.ts"; -// const execFile = promisify(_execFile); + const finalTargets = await verify(targets2); + const result = await parallel(finalTargets, true); + for (const [id, r] of result.entries()) { + assert.ok(!r.stderr, `Task ${id} should have no stderr output`); + assert.ok(r.ok, `Task ${id} should be okay`); + } -// declare module "../data-export/task.ts" { -// interface Task { -// google: typeof google; -// } -// } + const allCSV = Array.from(result.entries()) + .sort() // Keep stable ordering for snapshots + .map(([id, r]) => r.stdout); -// Object.assign(Task.prototype, { -// google -// }); + t.assert.snapshot(allCSV); +}); +test("facebook: Can load the 2025 export", async (t) => { + const targets = TaskTargetPipelineHelper.pipeline([ + new TaskTarget(FACEBOOK_V2_DIR) + ]) + .facebook_v2(); -// function loadIntoSqlite( -// paths: string[], -// sqlitePath: string -// ) { -// // Open an in-memory db for speed -// const db = new DatabaseSync(":memory:", { allowExtension: true }); -// db.loadExtension("/home/cobertos/sqlite-files/csv.so") -// db.enableLoadExtension(false); -// for (const path of paths) { -// const table = nodePath.basename(path, ".csv"); -// console.log(`Loading ${path} → table ${table}`); + const finalTargets = await verify(targets); + const result = await parallel(finalTargets, true); + for (const [id, r] of result.entries()) { + assert.ok(!r.stderr, `Task ${id} should have no stderr output`); + assert.ok(r.ok, `Task ${id} should be okay`); + } -// // const headers = lines[0].split(","); -// // const columnsSql = headers.map(h => `"${h}" TEXT`).join(", "); -// db.exec(`CREATE VIRTUAL TABLE temp.intermediate USING csv(filename='${path}');`); -// db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`); -// db.exec(`DROP TABLE IF EXISTS intermediate;`); -// } + const allCSV = Array.from(result.entries()) + .sort() // Keep stable ordering for snapshots + .map(([id, r]) => r.stdout); -// // Dump it all to the path specified -// db.exec(`VACUUM main INTO '${sqlitePath}'`); -// db.close(); -// } - -// async function main() { -// const t = new Task(); -// // t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json") -// // .facebook() - -// (await t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2(); - -// // t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001") -// // .google() - - -// // let zipTask = t.fork().zip("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-DEADNAME-May2021-json.zip"); -// // await (zipTask.fsImpl as any).init(); - -// // zipTask.facebook(); -// const taskText = await t.getTaskTSVShell(); -// await fs.writeFile('test.manifest', taskText); -// // Run everything with parallel -// try { -// execFileSync('/usr/bin/parallel', ['--colsep', '\t', '--jobs', '0', '--linebuffer', '--tagstring', '{1}', '--eta', '--joblog', 'out.manifest', 'bash -c {2} > OUTTEST/{1}.csv', '::::', 'test.manifest'], { -// stdio: 'inherit' -// }); -// } -// catch(err: any) { -// // I'm pretty sure status is the amount that failed? -// if (err?.status >= 30) { -// throw err; -// } -// } -// // Now take the output and load it all into a single SQLITE file -// const entries = await fs.readdir('OUTTEST', { withFileTypes: true }); -// const csvFiles = entries -// .filter(e => e.isFile() && e.name.endsWith(".csv")) -// .map(e => nodePath.join('OUTTEST', e.name)); -// await fs.unlink('your.db'); -// loadIntoSqlite(csvFiles, 'your.db'); -// } - -// main(); \ No newline at end of file + t.assert.snapshot(allCSV); +}); diff --git a/test/facebook.ts.snapshot b/test/facebook.ts.snapshot new file mode 100644 index 0000000..fb5c5e6 --- /dev/null +++ b/test/facebook.ts.snapshot @@ -0,0 +1,116 @@ +exports[`facebook: Can load the 2021 export 1`] = ` +[ + "\\"album\\",\\"uri\\",\\"creation_timestamp\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-03-07T15:23:20Z\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-07-01T07:46:40Z\\"\\n", + "[\\n \\"from\\",\\n \\"to\\",\\n \\"timestamp\\",\\n \\"body\\"\\n]\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", + "\\"status\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\"\\n", + "\\"service_name\\",\\"native_app_id\\",\\"username\\",\\"email\\",\\"phone_number\\",\\"name\\"\\n\\"xxx\\",69,\\"xxx\\",\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\"\\n\\"xxx\\",1707005000,\\"xxx\\",\\"not_a_real_email@example.com\\",,\\"xxx\\"\\n", + "\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",,,\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\",,,\\n", + "\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-12-29T08:13:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n", + "\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-22T01:26:40Z\\",\\"2024-05-11T15:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"\\",\\"\\",\\"\\",\\"xxx\\"\\n", + "\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-08T19:20:00Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-01-17T14:00:00Z\\",\\"TODO\\",\\"xxx\\"\\n", + "\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-10-18T07:03:20Z\\",\\"not_a_real_email@example.com\\",69\\n\\"2024-01-21T22:10:00Z\\",\\"not_a_real_email@example.com\\",69\\n", + "\\"name\\"\\n\\"xxx\\"\\n\\"xxx\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-13T13:13:20Z\\"\\n\\"xxx\\",\\"2024-10-31T00:36:40Z\\"\\n", + "\\"game\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-11-03T16:06:40Z\\"\\n", + "\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n", + "\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-04-23T17:56:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n", + "\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", + "\\"from\\",\\"to\\",\\"amount\\",\\"currency\\",\\"type\\",\\"status\\",\\"payment_method\\",\\"created_timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-05T21:36:40Z\\"\\n", + "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-15T12:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-12T06:13:20Z\\"\\n", + "\\"from\\",\\"to\\",\\"rank\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",69,\\"2024-07-22T19:03:20Z\\"\\n", + "\\"title\\",\\"timestamp\\",\\"reaction\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n", + "\\"title\\",\\"timestamp\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-08T16:33:20Z\\"\\n\\"xxx\\",\\"2024-09-24T19:10:00Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-27T15:13:20Z\\"\\n\\"xxx\\",\\"2024-08-24T00:40:00Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-06-23T05:20:00Z\\"\\n\\"xxx\\",\\"2024-05-25T08:16:40Z\\"\\n", + "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-04-28T20:10:00Z\\"\\n", + "\\"from\\",\\"to\\",\\"subject\\",\\"message\\",\\"timestamp\\"\\n\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-16T06:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"url://somewhere\\",\\"2024-10-16T06:26:40Z\\"\\n", + "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-12-17T08:43:20Z\\"\\n", + "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n", + "\\"name\\",\\"id\\",\\"type\\",\\"timestamp\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-11T12:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T19:56:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T11:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-07T21:06:40Z\\"\\n", + "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-02-27T05:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-05-16T03:26:40Z\\"\\n", + "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-10-31T06:10:00Z\\"\\n", + "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n", + "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n" +] +`; + +exports[`facebook: Can load the 2021 export zipped 1`] = ` +[ + "\\"album\\",\\"uri\\",\\"creation_timestamp\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-03-07T15:23:20Z\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-07-01T07:46:40Z\\"\\n", + "[\\n \\"from\\",\\n \\"to\\",\\n \\"timestamp\\",\\n \\"body\\"\\n]\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", + "\\"status\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\"\\n", + "\\"service_name\\",\\"native_app_id\\",\\"username\\",\\"email\\",\\"phone_number\\",\\"name\\"\\n\\"xxx\\",69,\\"xxx\\",\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\"\\n\\"xxx\\",1707005000,\\"xxx\\",\\"not_a_real_email@example.com\\",,\\"xxx\\"\\n", + "\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",,,\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\",,,\\n", + "\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-12-29T08:13:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n", + "\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-22T01:26:40Z\\",\\"2024-05-11T15:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"\\",\\"\\",\\"\\",\\"xxx\\"\\n", + "\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-08T19:20:00Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-01-17T14:00:00Z\\",\\"TODO\\",\\"xxx\\"\\n", + "\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-10-18T07:03:20Z\\",\\"not_a_real_email@example.com\\",69\\n\\"2024-01-21T22:10:00Z\\",\\"not_a_real_email@example.com\\",69\\n", + "\\"name\\"\\n\\"xxx\\"\\n\\"xxx\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-13T13:13:20Z\\"\\n\\"xxx\\",\\"2024-10-31T00:36:40Z\\"\\n", + "\\"game\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-11-03T16:06:40Z\\"\\n", + "\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n", + "\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-04-23T17:56:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n", + "\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", + "\\"from\\",\\"to\\",\\"amount\\",\\"currency\\",\\"type\\",\\"status\\",\\"payment_method\\",\\"created_timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-05T21:36:40Z\\"\\n", + "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-15T12:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-12T06:13:20Z\\"\\n", + "\\"from\\",\\"to\\",\\"rank\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",69,\\"2024-07-22T19:03:20Z\\"\\n", + "\\"title\\",\\"timestamp\\",\\"reaction\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n", + "\\"title\\",\\"timestamp\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-08T16:33:20Z\\"\\n\\"xxx\\",\\"2024-09-24T19:10:00Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-27T15:13:20Z\\"\\n\\"xxx\\",\\"2024-08-24T00:40:00Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-06-23T05:20:00Z\\"\\n\\"xxx\\",\\"2024-05-25T08:16:40Z\\"\\n", + "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-04-28T20:10:00Z\\"\\n", + "\\"from\\",\\"to\\",\\"subject\\",\\"message\\",\\"timestamp\\"\\n\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-16T06:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"url://somewhere\\",\\"2024-10-16T06:26:40Z\\"\\n", + "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-12-17T08:43:20Z\\"\\n", + "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n", + "\\"name\\",\\"id\\",\\"type\\",\\"timestamp\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-11T12:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T19:56:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T11:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-07T21:06:40Z\\"\\n", + "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-02-27T05:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-05-16T03:26:40Z\\"\\n", + "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-10-31T06:10:00Z\\"\\n", + "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n", + "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n" +] +`; + +exports[`facebook: Can load the 2025 export 1`] = ` +[ + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"some/path\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\n", + "\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-22T10:06:40Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-21T23:00:00Z\\"\\n", + "\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-13T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-07-12T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n", + "\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-01-12T00:40:00Z\\"\\n\\"xxx\\",\\"2024-06-21T17:13:20Z\\"\\n", + "\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-02-07T19:43:20Z\\",\\"not_a_real_email@example.com\\",69\\n", + "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-10-06T06:10:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-01-22T16:13:20Z\\"\\n", + "\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-02T23:00:00Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-09-27T01:20:00Z\\",69,69,\\"xxx\\"\\n", + "\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n", + "\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-11-20T12:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-11-15T00:20:00Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n", + "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-21T03:10:00Z\\"\\n", + "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-09-11T20:03:20Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-20T12:50:00Z\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-10T10:43:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n", + "\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-11T01:33:20Z\\",,,\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",,,\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-01T14:13:20Z\\"\\n\\"xxx\\",\\"2024-08-12T08:06:40Z\\"\\n", + "\\"start\\",\\"end\\"\\n", + "\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n,\\"2024-04-04T19:46:40Z\\",\\"2024-11-23T02:46:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\"\\n,\\"2024-04-05T06:53:20Z\\",\\"2024-11-22T10:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\"\\n", + "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-04-01T16:46:40Z\\"\\n\\"xxx\\",\\"2024-09-07T16:03:20Z\\"\\n", + "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-12T17:46:40Z\\"\\n\\"xxx\\",\\"2024-02-12T17:46:40Z\\"\\n", + "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-12-08T09:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-12-28T00:16:40Z\\"\\n" +] +`; diff --git a/test/fixtures/facebook-json-2021-05-01.zip b/test/fixtures/facebook-json-2021-05-01.zip new file mode 100644 index 0000000..1c3aff9 Binary files /dev/null and b/test/fixtures/facebook-json-2021-05-01.zip differ diff --git a/test/task.ts b/test/task.ts index cd2c697..a87d240 100644 --- a/test/task.ts +++ b/test/task.ts @@ -1,9 +1,8 @@ import test from "node:test"; -import assert from "node:assert/strict"; +import nodePath from "node:path"; +import { strict as assert } from "node:assert/strict"; import { TaskTarget, - each, - map, cd, glob as taskGlob, read, @@ -14,10 +13,11 @@ import { TaskTargetPipelineHelper, } from "../data-export/task.ts"; -const FIXTURE_DIR = "/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01"; -const FIXTURE_FILE = `${FIXTURE_DIR}/friends/friends.json`; +const THIS_FILE = import.meta.dirname; +const FIXTURE_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01'); +const FIXTURE_FILE = nodePath.join(FIXTURE_DIR, 'friends/friends.json'); -// ── TaskTarget ─────────────────────────────────────────────────────────────── +// -- TaskTarget --------------------------------------------------------------- test("TaskTarget: constructor initializes path, pipeline, postFns", () => { const t = new TaskTarget("/foo/bar"); @@ -110,7 +110,7 @@ test("TaskTarget: glob returns matching TaskTargets from disk", () => { assert.ok(results.every(r => r.path.endsWith(".json"))); }); -// ── toShell / shEscape ─────────────────────────────────────────────────────── +// -- toShell / shEscape ------------------------------------------------------- test("toShell: a single read produces a cat command", () => { const t = new TaskTarget("/foo/bar.txt"); @@ -125,22 +125,17 @@ test("toShell: read piped into cmd", () => { assert.equal(t.toShell(), "cat /foo/bar.txt | jq ."); }); -test("toShell: single-quotes paths that contain spaces", () => { - const t = new TaskTarget("/foo/bar baz.txt"); +for (const c of " $!&".split("")) { + test(`toShell: quotes paths that contain ${JSON.stringify(c)}`, () => { + const t = new TaskTarget(`/foo/bar${c}baz.txt`); + t.read(); + assert.equal(t.toShell(), `cat $'/foo/bar${c}baz.txt'`); + }); +} +test(`toShell: quotes and escapes paths that contain '`, () => { + const t = new TaskTarget(`/foo/bar'baz.txt`); t.read(); - assert.equal(t.toShell(), "cat '/foo/bar baz.txt'"); -}); - -test("toShell: single-quotes paths that contain dollar signs", () => { - const t = new TaskTarget("/foo/$bar.txt"); - t.read(); - assert.equal(t.toShell(), "cat '/foo/$bar.txt'"); -}); - -test("toShell: escapes literal single-quotes inside a path", () => { - const t = new TaskTarget("/foo/it's.txt"); - t.read(); - assert.equal(t.toShell(), "cat '/foo/it'\\''s.txt'"); + assert.equal(t.toShell(), `cat $'/foo/bar\\'baz.txt'`); }); test("toShell: cmd with array splits tokens", () => { @@ -155,22 +150,7 @@ test("toShell: cmd with function resolves at shell-generation time", () => { assert.equal(t.toShell(), "jq -r .name /foo/bar.json"); }); -// ── module-level functions ─────────────────────────────────────────────────── - -test("each: calls fn for every target", () => { - const targets = [new TaskTarget("/a"), new TaskTarget("/b")]; - const paths: string[] = []; - each(targets, t => paths.push(t.path)); - assert.deepEqual(paths, ["/a", "/b"]); -}); - -test("map: transforms each target", () => { - const targets = [new TaskTarget("/a"), new TaskTarget("/b")]; - const result = map(targets, t => { const c = t.clone(); c.path += "/x"; return c; }); - assert.equal(result[0].path, "/a/x"); - assert.equal(result[1].path, "/b/x"); - assert.equal(targets[0].path, "/a"); // originals unchanged -}); +// -- module-level functions --------------------------------------------------- test("cd: clones and changes directory of each target", () => { const targets = [new TaskTarget("/a"), new TaskTarget("/b")]; @@ -211,7 +191,7 @@ test("taskGlob: returns matching targets across all input targets", () => { assert.ok(result.every(r => r.path.endsWith(".json"))); }); -// ── verify ─────────────────────────────────────────────────────────────────── +// -- verify ------------------------------------------------------------------- test("verify: removes targets with an empty pipeline", async () => { const t = new TaskTarget(FIXTURE_FILE); @@ -243,7 +223,7 @@ test("verify: filters a mixed list to only valid targets", async () => { assert.equal(result[0], good); }); -// ── getTSVManifest ─────────────────────────────────────────────────────────── +// -- getTSVManifest ----------------------------------------------------------- test("getTSVManifest: produces idshell for a single target", () => { const t = new TaskTarget("/foo/bar.txt"); @@ -258,7 +238,7 @@ test("getTSVManifest: joins multiple targets with newlines", () => { assert.equal(getTSVManifest([t1, t2]), "a\tcat /a.txt\nb\tcat /b.txt"); }); -// ── TaskTargetPipelineHelper ───────────────────────────────────────────────── +// -- TaskTargetPipelineHelper ------------------------------------------------- test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => { const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a")]); @@ -293,7 +273,7 @@ test("TaskTargetPipelineHelper: cmd returns a new helper with cmd ops added", () assert.equal(p2[0].toShell(), "cat /a.txt | jq ."); }); -// ── collect ────────────────────────────────────────────────────────────────── +// -- collect ------------------------------------------------------------------ test("collect: the final end of a chain is added to the collection set", () => { const collection = new Set();