diff --git a/data-export/facebook.ts b/data-export/facebook.ts index cb0d656..9b77423 100644 --- a/data-export/facebook.ts +++ b/data-export/facebook.ts @@ -168,7 +168,7 @@ function facebook_comments_generic(this: TaskTargetPipelineHelper, prop: string) return this.cmd(["jq", "-r", ` ["timestamp","data", "title"], ( - .${prop}[]? + .comments[]? | [(.timestamp | todateiso8601), "TODO", .title] ) | @csv @@ -269,7 +269,7 @@ function facebook_admin_records_generic(this: TaskTargetPipelineHelper, prop: st return this.cmd(["jq", "-r", ` ["event","created_timestamp","ip_address","user_agent","datr_cookie"], ( - .${prop}[] + .admin_records[] | [.event, (.session.created_timestamp | todateiso8601), .ip_address, .user_agent, .datr_cookie] ) | @csv @@ -301,10 +301,10 @@ function facebook_authorized_logins_v2(this: TaskTargetPipelineHelper) { } function facebook_contact_verification_generic(this: TaskTargetPipelineHelper, prop: string) { return this.cmd(["jq", "-r", ` - ["timestamp", "email", "contact_type"], + ["action", "timestamp", "site", "ip_address"], ( .${prop}[] - | [(.verification_time | todateiso8601), .contact, .contact_type] + | [.action, (.timestamp | todateiso8601), .site, .ip_address] ) | @csv `]) @@ -399,7 +399,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) { // No correlary for your_off-facebook_activity.json p.collect(col).cd(`apps_and_websites_off_of_facebook/connected_apps_and_websites.json`).read().facebook_installed_apps_v2(); p.collect(col).cd(`your_facebook_activity/comments_and_reactions/comments.json`).read().facebook_comments_v2(); - p.collect(col).glob(`your_facebook_activity/messages/*/**/*.json`) // Messages files are in the FOLDERS inside messages (archived_threads, e2ee_cutover, etc...) + p.collect(col).glob(`your_facebook_activity/messages/**/*.json`) // Files are message_1.json, etc .setId(t=>`Facebookv2 - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name .read() .facebook_messages_generic() @@ -438,8 +438,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) { p.collect(col).cd(`your_facebook_activity/facebook_marketplace/items_sold.json`).read().facebook_marketplace_items_sold_v2() - const final = Array.from(col).flat(); - return TaskTargetPipelineHelper.pipeline(final); + return Array.from(col); } function facebook(this: TaskTargetPipelineHelper){ @@ -607,7 +606,7 @@ function facebook(this: TaskTargetPipelineHelper){ p.collect(col).cd(`marketplace/items_sold.json`).read().facebook_marketplace_items_sold_v1() - p.collect(col).glob(`messages/**/*.json`) // Files are message_1.json, etc + p.collect(col).cd(`messages/**/*.json`) // Files are message_1.json, etc .setId(t=>`Facebook - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name .read() .facebook_messages_generic() @@ -781,7 +780,6 @@ function facebook(this: TaskTargetPipelineHelper){ // `${facebookRoot}/your_places` - no data in my export // `${facebookRoot}/your_topics` - no data in my export - const final = Array.from(col).flat(); - return TaskTargetPipelineHelper.pipeline(final); + return Array.from(col); }; diff --git a/data-export/parallel.ts b/data-export/parallel.ts deleted file mode 100644 index 27add5c..0000000 --- a/data-export/parallel.ts +++ /dev/null @@ -1,86 +0,0 @@ -import { $, type ProcessOutput } from 'zx'; -import os from 'os'; -import { type TaskTarget, run } from "./task.ts"; - -$.verbose = false; - -type ResultMap = Map; - -export async function parallel( - targets: TaskTarget[], - quiet: boolean = false, - maxConcurrency: number = os.cpus().length -): Promise { - const results = new Map(); - - const total = targets.length; - let completed = 0; - let running = 0; - const completionTimes: number[] = []; - const startTime = Date.now(); - - const inFlight = new Set>(); - - function formatEta(): string { - const left = total - completed; - const avgSeconds = completionTimes.length > 0 - ? completionTimes.reduce((a, b) => a + b, 0) / completionTimes.length / 1000 - : 0; - const etaSeconds = Math.round(left * avgSeconds); - const pct = total > 0 ? Math.round((completed / total) * 100) : 100; - const lastDuration = completionTimes.length > 0 - ? (completionTimes[completionTimes.length - 1] / 1000).toFixed(1) - : '0.0'; - - return `ETA: ${etaSeconds}s Left: ${left} AVG: ${avgSeconds.toFixed(2)}s local:${running}/${completed}/${pct}%/${lastDuration}s`; - } - - function printStatus(): void { - if (quiet) { - return; - } - process.stderr.write(`\r${formatEta()}`.padEnd(80)); - } - - async function runJob(t: TaskTarget): Promise { - running++; - printStatus(); - - const result = await run(t); - completionTimes.push(result.duration); - - results.set(t.id, result); - - running--; - completed++; - printStatus(); - } - - const queue = targets.slice(); - // Process queue with concurrency limit - while (queue.length > 0 || inFlight.size > 0) { - // Fill up to max concurrency - while (queue.length > 0 && inFlight.size < maxConcurrency) { - const target = queue.shift()!; - const promise = runJob(target).then(() => { - inFlight.delete(promise); - }); - inFlight.add(promise); - } - - // Wait for at least one to complete if at capacity - if (inFlight.size > 0) { - await Promise.race(inFlight); - } - } - - // Final status line - process.stderr.write('\n'); - const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1); - const failed = Array.from(results.values().filter(p => !p.ok)); - process.stderr.write( - `\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n` - ); - - return results; -} diff --git a/data-export/task-before-functional.ts.old b/data-export/task-before-functional.ts.old new file mode 100644 index 0000000..225cac9 --- /dev/null +++ b/data-export/task-before-functional.ts.old @@ -0,0 +1,352 @@ +import nodePath from 'node:path'; +import fs from 'node:fs'; +import { strict as assert } from "node:assert"; +import { execFile as _execFile } from "node:child_process"; +import { promisify } from "node:util"; +import { ZipFS } from "./zipFs.ts"; +import { globSync } from "glob"; + +const execFile = promisify(_execFile); + +type FSImpl = { + isZip?: boolean; + zipPath?: string; + init?(): Promise; + ready?: boolean; + + statSync: typeof fs["statSync"]; + existsSync: typeof fs["existsSync"]; + + // Required by glob + lstatSync: typeof fs["lstatSync"]; + // Needs to include withFileTypes DirEnt variant + readdir: typeof fs["readdir"]; + readdirSync: typeof fs["readdirSync"]; + readlinkSync: typeof fs["readlinkSync"]; + realpathSync: typeof fs["realpathSync"]; + promises: { + lstat: typeof fs.promises["lstat"]; + // Needs to include withFileTypes DirEnt + readdir: typeof fs.promises["readdir"]; + readlink: typeof fs.promises["readlink"]; + realpath: typeof fs.promises["realpath"]; + } +}; +const defaultFSImpl = fs; + +function safe(s: string) { + return s.replace(/[^a-zA-Z0-9_]/g, '_'); +} + + +//TODO: DANGER: I doubt this is safe... +function shEscape(s: string) { + assert(!s.includes("\n"), "shEscape given new line, caller needs to handle these"); + if (!s.match(/[ \$\"\'\!]/)) { + return s; + } + // We need to quote this string + // Single quoted strings require you to close the single quoted string, then + // use the escaped single quote, and then reopen the string... obscene + s = s.replace(/'/g, "'\\''"); + s = `'${s}'`; + return s; +} + +abstract class TaskTargetBase { + target: TaskTarget; + constructor(target: TaskTarget) { + this.target = target; + } + abstract get type(): "read" | "mid"; + abstract toShell(): string; +} +class TaskTargetRead extends TaskTargetBase { + get type(){ return "read" as const; } + toShell() { + if (this.target.fsImpl.isZip) { + assert(this.target.fsImpl.zipPath, "Should have a zipPath"); + // We need to be able to do this + return `7z x ${shEscape(this.target.fsImpl.zipPath)} -so ${shEscape(this.target.path)}`; + } + + // TODO : Implement when reading from a zip file + return `cat ${shEscape(this.target.path)}`; + } +} +class TaskTargetCmd extends TaskTargetBase { + get type(){ return "mid" as const; } + /**What nodejs spawn() and execFile() take + * [cmd, ...args]: string[] + */ + cmd: string[]; + static parse(target: TaskTarget, v: string | string[] | ((t: TaskTarget)=>string) | ((t: TaskTarget)=>string[])): string[] { + if (typeof v === "function") { + v = v(target); + } + if (typeof v === "string") { + v = v.split(/\s+/); + } + return v; + } + constructor(target: TaskTarget, cmd: string | string[] | ((t: TaskTarget)=>string) | ((t: TaskTarget)=>string[])) { + super(target); + this.cmd = TaskTargetCmd.parse(target, cmd); + } + toShell() { + const out = this.cmd + .map(c => { + let sh = c.replace(/\n/g, "") + return shEscape(sh); + }); + + return out.join(" "); + } +} + + + +class TaskTarget { + path: string; + fsImpl: FSImpl = defaultFSImpl; + pipeline: TaskTargetBase[]; + idValue: string | ((t: TaskTarget)=>string) | undefined; + postFns: ((t: TaskTarget)=>Promise)[]; + + constructor(path: string){ + this.path = path; + this.pipeline = []; + this.postFns = []; + } + + exists() { + return this.fsImpl.existsSync(this.path); + } + + _joinPath(path: string) { + let finalPath = path; + if (!path.startsWith('/')) { + finalPath = nodePath.join(this.path, path) + } + return finalPath; + } + + get basename() { + return safe(nodePath.basename(this.path)); + } + basenameN(n: number) { + return this.path + .split("/") + .map(s => safe(s)) + .slice(-n) + .join("___"); + } + + get id() { + assert(this.idValue, `TaskTarget for path "${this.path}" must have an id`); + if (typeof this.idValue === "function") { + return safe(this.idValue(this)); + } + return safe(this.idValue); + } + + /**Changes the current directory of the target*/ + cd(path: string) { + this.path = this._joinPath(path); + } + + /**Get a glob off of the target*/ + glob(globPath: string) { + globPath = this._joinPath(globPath); + return globSync(globPath, { + cwd: '/DUMMYCWD', + fs: this.fsImpl + }); + } + + clone() { + const t = new TaskTarget(this.path); + t.fsImpl = this.fsImpl; + t.idValue = typeof this.idValue === "function" ? this.idValue : undefined; + t.postFns = t.postFns.slice(); + //TODO: clone pipeline + return t; + } + + pushToPipeline(v: TaskTargetBase) { + if (v.type === "read") { + assert(this.pipeline.length === 0, "A read can only be the first item in a pipeline"); + } + + this.pipeline.push(v); + } + + pushPostFn(fn: ((t: TaskTarget)=>Promise)) { + this.postFns.push(fn); + } +} + +/**A very composable object*/ +export class Task { + /**A serial pipeline of Streams*/ + targets: TaskTarget[]; + + /**SHARED list of all tasks for this given tree*/ + tasks: Task[]; + + constructor() { + this.tasks = []; + this.targets = [new TaskTarget(process.cwd())]; + } + + cd(path: string) { + for (const t of this.targets) { + // TODO: opts + t.cd(path); + } + return this; + } + + /**Globs for all the paths that match under all targets*/ + glob(globPath: string) { + // For every target, concat glob onto it, glob, and then + // replace the original set of targets with all the new ones + const newTargets: TaskTarget[] = []; + for (const t of this.targets) { + const matches = t.glob(globPath); + for (const m of matches) { + const newT = t.clone(); + newT.path = m; + newTargets.push(newT); + } + } + this.targets = newTargets; + return this; + } + + /**Opens all targets as zip archives*/ + async zip() { + for (const t of this.targets) { + const zfs = new ZipFS(t.path); + await zfs.init(); + t.path = ""; // Each target is now rooted at the base of its respective zip + t.fsImpl = zfs.getImpl() as any; + } + return this; + } + + /**Returns a copy of ourself*/ + clone() { + const t = new Task(); + t.targets = this.targets.map(t => t.clone()); + t.tasks = this.tasks; //SHARED object reference + return t; + } + + /**Returns a copy of ourself, but adds us to this tree's shared + * task list as well*/ + fork() { + const c = this.clone(); + this.tasks.push(c); + return c; + } + + cmd(cmd: string | string[] | ((target: TaskTarget)=>string) | ((target: TaskTarget)=>string[])) { + for (const t of this.targets) { + t.pushToPipeline(new TaskTargetCmd(t, cmd)); + } + return this; + } + read() { + for (const t of this.targets) { + t.pushToPipeline(new TaskTargetRead(t)); + } + return this; + } + setId(idValue: string | ((t: TaskTarget)=>string)) { + for (const t of this.targets) { + t.idValue = idValue; + } + return this; + } + post(fn: any) { + for (const t of this.targets) { + t.pushPostFn(fn); + } + } + types( + types: string[] + ) { + // TODO: + return this; + } + csvSink( + summarization?: [string, string][] + ) { + // Ingest this csv into the database at the given id + // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]); + // Add a post processing function for these targets that prints out the summarization + // stats + this.post(async (t: TaskTarget)=>{ + // We only do the first one so far for the summarization + let queryLine: string; + let formatFn: (r: any)=>string; + const [columnName, type] = summarization?.[0] ?? [undefined, undefined]; + if (type === "numeric") { + queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`; + formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`; + } + else { + queryLine = `count(*) as n`; + formatFn = (r: any)=>`${r.n} rows for ${t.id}`; + } + + const cmd = "sqlite-utils"; + const args = ["query", "your.db", `select ${queryLine} from ${t.id}`] + const { stdout, stderr } = await execFile(cmd, args); + const results = JSON.parse(stdout); + const result = results[0]; // should only be one result in the array for this type of query + const logLine = formatFn(result); + (t as any).log = logLine; + }); + + return this; + } + + /**Collect all the TaskTargets, make sure everything is init'd and exists + * and output the targets for processing*/ + async getFinalTargets() { + const targets: TaskTarget[] = []; + for (const task of this.tasks) { + for (const t of task.targets) { + // Make sure fsImpl is ready + if ("ready" in t.fsImpl && !t.fsImpl.ready && t.fsImpl.init) { + await t.fsImpl.init(); + } + if (t.pipeline.length <= 0) { + continue; // Tasks with empty pipelines are no-ops, remove + } + if (!t.exists()) { + console.warn(`Missing target ${t.path}`); + continue; + } + + targets.push(t); + } + } + return targets; + } + + async getTaskTSVShell() { + const targets = await this.getFinalTargets(); + let out: string[] = []; + for (const t of targets) { + const shell = t.pipeline + .map(p => p.toShell()) + .join(" | ") + out.push(`${t.id}\t${shell}`); + } + + return out.join("\n"); + } +} diff --git a/data-export/task.ts b/data-export/task.ts index df884b8..797b477 100644 --- a/data-export/task.ts +++ b/data-export/task.ts @@ -3,7 +3,7 @@ import fs from 'node:fs'; import { strict as assert } from "node:assert"; import { ZipFS } from "./zipFs.ts"; import { globSync } from "glob"; -import { $, ProcessPromise, quote } from "zx"; +import { $ } from "zx"; type FSImpl = { isZip?: boolean; @@ -36,6 +36,20 @@ function safe(s: string) { } +//TODO: DANGER: I doubt this is safe... +function shEscape(s: string) { + assert(!s.includes("\n"), "shEscape given new line, caller needs to handle these"); + if (!s.match(/[ \$\"\'\!]/)) { + return s; + } + // We need to quote this string + // Single quoted strings require you to close the single quoted string, then + // use the escaped single quote, and then reopen the string... obscene + s = s.replace(/'/g, "'\\''"); + s = `'${s}'`; + return s; +} + interface TaskTargetOp { type: "read" | "mid"; toShell(target: TaskTarget): string; @@ -47,11 +61,11 @@ class TaskTargetRead implements TaskTargetOp { if (target.fsImpl.isZip) { assert(target.fsImpl.zipPath, "Should have a zipPath"); // We need to be able to do this - return `7z x ${quote(target.fsImpl.zipPath)} -so ${quote(target.path)}`; + return `7z x ${shEscape(target.fsImpl.zipPath)} -so ${shEscape(target.path)}`; } // TODO : Implement when reading from a zip file - return `cat ${quote(target.path)}`; + return `cat ${shEscape(target.path)}`; } clone() { return new TaskTargetRead(); @@ -82,7 +96,7 @@ class TaskTargetCmd implements TaskTargetOp { const out = parsedCmd .map(c => { let sh = c.replace(/\n/g, "") - return quote(sh); + return shEscape(sh); }); return out.join(" "); @@ -318,16 +332,6 @@ export function getTSVManifest(targets: TaskTarget[]): string { return out.join("\n"); } -export function getTaskManifest(targets: TaskTarget[]): [string, string][] { - let out: [string, string][] = []; - for (const t of targets) { - const shell = t.toShell(); - out.push([t.id, shell] as const); - } - - return out; -} - function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) { if (!a.__collection) { return; @@ -404,7 +408,25 @@ export class TaskTargetPipelineHelper extends Array { } } -export async function run(target: TaskTarget): Promise { - const command = target.toShell(); - return await $({ nothrow: true })`bash -c ${command}`; +export async function parallel(targets: TaskTarget[]) { + const finalTargets = await verify(targets); + const manifestTSV = getTSVManifest(finalTargets); + + try { + await $({ input: manifestTSV })`/usr/bin/parallel \ + --colsep ${'\t'} \ + --jobs 0 \ + --linebuffer \ + --tagstring {1} \ + --eta \ + --joblog out.manifest \ + ${'bash -c {2} > OUTTEST/{1}.csv'} \ + ::::- `; // stdin is in manifestTSV + } + catch(err: any) { + // I'm pretty sure status is the amount that failed? + if (err?.status >= 30) { + throw err; + } + } } \ No newline at end of file diff --git a/package.json b/package.json index e0ed113..b16957c 100644 --- a/package.json +++ b/package.json @@ -5,9 +5,7 @@ "main": "index.js", "type": "module", "scripts": { - "test": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/task.ts", - "test2": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/facebook.ts", - "test-update-snapshots": "node --enable-source-maps --test --experimental-transform-types --no-warnings --test-update-snapshots ./test/facebook.ts", + "test": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/util.ts", "dev": "vite --port 2223", "server": "node --experimental-transform-types server/server.ts", "prototype": "node --import ./util/tsx-loader.js --import ./util/ignore-css-loader.js --experimental-transform-types server/prototype.ts" diff --git a/test/facebook.ts b/test/facebook.ts index 9835c91..44c2ba2 100644 --- a/test/facebook.ts +++ b/test/facebook.ts @@ -1,73 +1,115 @@ import test from "node:test"; -import nodePath from "node:path"; -import { strict as assert } from "node:assert"; +import fs from "node:fs"; +import assert from "node:assert"; import { finished } from "node:stream/promises"; import { Readable, Writable } from "node:stream"; -import { TaskTargetPipelineHelper, TaskTarget, verify, getTSVManifest, getTaskManifest, run } from "../data-export/task.ts"; -import { parallel } from "../data-export/parallel.ts"; -import "../data-export/facebook.ts"; +import { TaskTargetPipelineHelper } from "../data-export/task.ts"; -const THIS_FILE = import.meta.dirname; -const FACEBOOK_V1_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01'); -const FACEBOOK_V1_ZIPPED = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01.zip'); -const FACEBOOK_V2_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2025-11-29'); +test("facebook: Can load the 2021 export", async () => { + // TODO: + // const t = new Task(); + // (await t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2(); + // const taskText = await t.getTaskTSVShell(); + // await fs.writeFile('test.manifest', taskText); + // // Run everything with parallel + // try { + // execFileSync('/usr/bin/parallel', ['--colsep', '\t', '--jobs', '0', '--linebuffer', '--tagstring', '{1}', '--eta', '--joblog', 'out.manifest', 'bash -c {2} > OUTTEST/{1}.csv', '::::', 'test.manifest'], { + // stdio: 'inherit' + // }); + // } + // catch(err: any) { + // // I'm pretty sure status is the amount that failed? + // if (err?.status >= 30) { + // throw err; + // } + // } -test("facebook: Can load the 2021 export", async (t) => { - const targets = TaskTargetPipelineHelper.pipeline([ - new TaskTarget(FACEBOOK_V1_DIR) - ]) - .facebook(); - - const finalTargets = await verify(targets); - const result = await parallel(finalTargets, true); - for (const [id, r] of result.entries()) { - assert.ok(!r.stderr, `Task ${id} should have no stderr output`); - assert.ok(r.ok, `Task ${id} should be okay`); - } - - const allCSV = Array.from(result.entries()) - .sort() // Keep stable ordering for snapshots - .map(([id, r]) => r.stdout); - - t.assert.snapshot(allCSV); + // // Now take the output and load it all into a single SQLITE file + // const entries = await fs.readdir('OUTTEST', { withFileTypes: true }); + // const csvFiles = entries + // .filter(e => e.isFile() && e.name.endsWith(".csv")) + // .map(e => nodePath.join('OUTTEST', e.name)); }); -test("facebook: Can load the 2021 export zipped", async (t) => { - const targets = await TaskTargetPipelineHelper.pipeline([ - new TaskTarget(FACEBOOK_V1_ZIPPED) - ]) - .unzip(); - const targets2 = targets - .facebook(); - const finalTargets = await verify(targets2); - const result = await parallel(finalTargets, true); - for (const [id, r] of result.entries()) { - assert.ok(!r.stderr, `Task ${id} should have no stderr output`); - assert.ok(r.ok, `Task ${id} should be okay`); - } +// import fs from 'node:fs/promises'; +// import { type SpawnOptions, execFile as _execFile, execFileSync } from "node:child_process"; +// import nodePath from "node:path"; +// import { DatabaseSync } from "node:sqlite"; +// import { promisify } from "node:util"; +// import "../data-export/facebook.ts"; +// import { google } from "../data-export/google.ts"; +// const execFile = promisify(_execFile); - const allCSV = Array.from(result.entries()) - .sort() // Keep stable ordering for snapshots - .map(([id, r]) => r.stdout); +// declare module "../data-export/task.ts" { +// interface Task { +// google: typeof google; +// } +// } - t.assert.snapshot(allCSV); -}); -test("facebook: Can load the 2025 export", async (t) => { - const targets = TaskTargetPipelineHelper.pipeline([ - new TaskTarget(FACEBOOK_V2_DIR) - ]) - .facebook_v2(); +// Object.assign(Task.prototype, { +// google +// }); - const finalTargets = await verify(targets); - const result = await parallel(finalTargets, true); - for (const [id, r] of result.entries()) { - assert.ok(!r.stderr, `Task ${id} should have no stderr output`); - assert.ok(r.ok, `Task ${id} should be okay`); - } +// function loadIntoSqlite( +// paths: string[], +// sqlitePath: string +// ) { +// // Open an in-memory db for speed +// const db = new DatabaseSync(":memory:", { allowExtension: true }); +// db.loadExtension("/home/cobertos/sqlite-files/csv.so") +// db.enableLoadExtension(false); +// for (const path of paths) { +// const table = nodePath.basename(path, ".csv"); +// console.log(`Loading ${path} → table ${table}`); - const allCSV = Array.from(result.entries()) - .sort() // Keep stable ordering for snapshots - .map(([id, r]) => r.stdout); +// // const headers = lines[0].split(","); +// // const columnsSql = headers.map(h => `"${h}" TEXT`).join(", "); +// db.exec(`CREATE VIRTUAL TABLE temp.intermediate USING csv(filename='${path}');`); +// db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`); +// db.exec(`DROP TABLE IF EXISTS intermediate;`); +// } - t.assert.snapshot(allCSV); -}); +// // Dump it all to the path specified +// db.exec(`VACUUM main INTO '${sqlitePath}'`); +// db.close(); +// } + +// async function main() { +// const t = new Task(); +// // t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json") +// // .facebook() + +// (await t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2(); + +// // t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001") +// // .google() + + +// // let zipTask = t.fork().zip("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-DEADNAME-May2021-json.zip"); +// // await (zipTask.fsImpl as any).init(); + +// // zipTask.facebook(); +// const taskText = await t.getTaskTSVShell(); +// await fs.writeFile('test.manifest', taskText); +// // Run everything with parallel +// try { +// execFileSync('/usr/bin/parallel', ['--colsep', '\t', '--jobs', '0', '--linebuffer', '--tagstring', '{1}', '--eta', '--joblog', 'out.manifest', 'bash -c {2} > OUTTEST/{1}.csv', '::::', 'test.manifest'], { +// stdio: 'inherit' +// }); +// } +// catch(err: any) { +// // I'm pretty sure status is the amount that failed? +// if (err?.status >= 30) { +// throw err; +// } +// } +// // Now take the output and load it all into a single SQLITE file +// const entries = await fs.readdir('OUTTEST', { withFileTypes: true }); +// const csvFiles = entries +// .filter(e => e.isFile() && e.name.endsWith(".csv")) +// .map(e => nodePath.join('OUTTEST', e.name)); +// await fs.unlink('your.db'); +// loadIntoSqlite(csvFiles, 'your.db'); +// } + +// main(); \ No newline at end of file diff --git a/test/facebook.ts.snapshot b/test/facebook.ts.snapshot deleted file mode 100644 index fb5c5e6..0000000 --- a/test/facebook.ts.snapshot +++ /dev/null @@ -1,116 +0,0 @@ -exports[`facebook: Can load the 2021 export 1`] = ` -[ - "\\"album\\",\\"uri\\",\\"creation_timestamp\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-03-07T15:23:20Z\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-07-01T07:46:40Z\\"\\n", - "[\\n \\"from\\",\\n \\"to\\",\\n \\"timestamp\\",\\n \\"body\\"\\n]\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", - "\\"status\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\"\\n", - "\\"service_name\\",\\"native_app_id\\",\\"username\\",\\"email\\",\\"phone_number\\",\\"name\\"\\n\\"xxx\\",69,\\"xxx\\",\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\"\\n\\"xxx\\",1707005000,\\"xxx\\",\\"not_a_real_email@example.com\\",,\\"xxx\\"\\n", - "\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",,,\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\",,,\\n", - "\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-12-29T08:13:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n", - "\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-22T01:26:40Z\\",\\"2024-05-11T15:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"\\",\\"\\",\\"\\",\\"xxx\\"\\n", - "\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-08T19:20:00Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-01-17T14:00:00Z\\",\\"TODO\\",\\"xxx\\"\\n", - "\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-10-18T07:03:20Z\\",\\"not_a_real_email@example.com\\",69\\n\\"2024-01-21T22:10:00Z\\",\\"not_a_real_email@example.com\\",69\\n", - "\\"name\\"\\n\\"xxx\\"\\n\\"xxx\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-13T13:13:20Z\\"\\n\\"xxx\\",\\"2024-10-31T00:36:40Z\\"\\n", - "\\"game\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-11-03T16:06:40Z\\"\\n", - "\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n", - "\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-04-23T17:56:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n", - "\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", - "\\"from\\",\\"to\\",\\"amount\\",\\"currency\\",\\"type\\",\\"status\\",\\"payment_method\\",\\"created_timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-05T21:36:40Z\\"\\n", - "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-15T12:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-12T06:13:20Z\\"\\n", - "\\"from\\",\\"to\\",\\"rank\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",69,\\"2024-07-22T19:03:20Z\\"\\n", - "\\"title\\",\\"timestamp\\",\\"reaction\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n", - "\\"title\\",\\"timestamp\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-08T16:33:20Z\\"\\n\\"xxx\\",\\"2024-09-24T19:10:00Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-27T15:13:20Z\\"\\n\\"xxx\\",\\"2024-08-24T00:40:00Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-06-23T05:20:00Z\\"\\n\\"xxx\\",\\"2024-05-25T08:16:40Z\\"\\n", - "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-04-28T20:10:00Z\\"\\n", - "\\"from\\",\\"to\\",\\"subject\\",\\"message\\",\\"timestamp\\"\\n\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-16T06:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"url://somewhere\\",\\"2024-10-16T06:26:40Z\\"\\n", - "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-12-17T08:43:20Z\\"\\n", - "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n", - "\\"name\\",\\"id\\",\\"type\\",\\"timestamp\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-11T12:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T19:56:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T11:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-07T21:06:40Z\\"\\n", - "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-02-27T05:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-05-16T03:26:40Z\\"\\n", - "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-10-31T06:10:00Z\\"\\n", - "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n", - "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n" -] -`; - -exports[`facebook: Can load the 2021 export zipped 1`] = ` -[ - "\\"album\\",\\"uri\\",\\"creation_timestamp\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-03-07T15:23:20Z\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-07-01T07:46:40Z\\"\\n", - "[\\n \\"from\\",\\n \\"to\\",\\n \\"timestamp\\",\\n \\"body\\"\\n]\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", - "\\"status\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\"\\n", - "\\"service_name\\",\\"native_app_id\\",\\"username\\",\\"email\\",\\"phone_number\\",\\"name\\"\\n\\"xxx\\",69,\\"xxx\\",\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\"\\n\\"xxx\\",1707005000,\\"xxx\\",\\"not_a_real_email@example.com\\",,\\"xxx\\"\\n", - "\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",,,\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\",,,\\n", - "\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-12-29T08:13:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n", - "\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-22T01:26:40Z\\",\\"2024-05-11T15:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"\\",\\"\\",\\"\\",\\"xxx\\"\\n", - "\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-08T19:20:00Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-01-17T14:00:00Z\\",\\"TODO\\",\\"xxx\\"\\n", - "\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-10-18T07:03:20Z\\",\\"not_a_real_email@example.com\\",69\\n\\"2024-01-21T22:10:00Z\\",\\"not_a_real_email@example.com\\",69\\n", - "\\"name\\"\\n\\"xxx\\"\\n\\"xxx\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-13T13:13:20Z\\"\\n\\"xxx\\",\\"2024-10-31T00:36:40Z\\"\\n", - "\\"game\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-11-03T16:06:40Z\\"\\n", - "\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n", - "\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-04-23T17:56:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n", - "\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n", - "\\"from\\",\\"to\\",\\"amount\\",\\"currency\\",\\"type\\",\\"status\\",\\"payment_method\\",\\"created_timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-05T21:36:40Z\\"\\n", - "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-15T12:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-12T06:13:20Z\\"\\n", - "\\"from\\",\\"to\\",\\"rank\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",69,\\"2024-07-22T19:03:20Z\\"\\n", - "\\"title\\",\\"timestamp\\",\\"reaction\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n", - "\\"title\\",\\"timestamp\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-08T16:33:20Z\\"\\n\\"xxx\\",\\"2024-09-24T19:10:00Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-27T15:13:20Z\\"\\n\\"xxx\\",\\"2024-08-24T00:40:00Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-06-23T05:20:00Z\\"\\n\\"xxx\\",\\"2024-05-25T08:16:40Z\\"\\n", - "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-04-28T20:10:00Z\\"\\n", - "\\"from\\",\\"to\\",\\"subject\\",\\"message\\",\\"timestamp\\"\\n\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-16T06:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"url://somewhere\\",\\"2024-10-16T06:26:40Z\\"\\n", - "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-12-17T08:43:20Z\\"\\n", - "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n", - "\\"name\\",\\"id\\",\\"type\\",\\"timestamp\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-11T12:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T19:56:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T11:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-07T21:06:40Z\\"\\n", - "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-02-27T05:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-05-16T03:26:40Z\\"\\n", - "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-10-31T06:10:00Z\\"\\n", - "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n", - "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n" -] -`; - -exports[`facebook: Can load the 2025 export 1`] = ` -[ - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"some/path\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\n", - "\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-22T10:06:40Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-21T23:00:00Z\\"\\n", - "\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-13T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-07-12T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n", - "\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-01-12T00:40:00Z\\"\\n\\"xxx\\",\\"2024-06-21T17:13:20Z\\"\\n", - "\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-02-07T19:43:20Z\\",\\"not_a_real_email@example.com\\",69\\n", - "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-10-06T06:10:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-01-22T16:13:20Z\\"\\n", - "\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-02T23:00:00Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-09-27T01:20:00Z\\",69,69,\\"xxx\\"\\n", - "\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n", - "\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-11-20T12:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-11-15T00:20:00Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n", - "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-21T03:10:00Z\\"\\n", - "\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-09-11T20:03:20Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-20T12:50:00Z\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-10T10:43:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n", - "\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-11T01:33:20Z\\",,,\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",,,\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-01T14:13:20Z\\"\\n\\"xxx\\",\\"2024-08-12T08:06:40Z\\"\\n", - "\\"start\\",\\"end\\"\\n", - "\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n,\\"2024-04-04T19:46:40Z\\",\\"2024-11-23T02:46:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\"\\n,\\"2024-04-05T06:53:20Z\\",\\"2024-11-22T10:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\"\\n", - "\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-04-01T16:46:40Z\\"\\n\\"xxx\\",\\"2024-09-07T16:03:20Z\\"\\n", - "\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-12T17:46:40Z\\"\\n\\"xxx\\",\\"2024-02-12T17:46:40Z\\"\\n", - "\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-12-08T09:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-12-28T00:16:40Z\\"\\n" -] -`; diff --git a/test/fixtures/facebook-json-2021-05-01.zip b/test/fixtures/facebook-json-2021-05-01.zip deleted file mode 100644 index 1c3aff9..0000000 Binary files a/test/fixtures/facebook-json-2021-05-01.zip and /dev/null differ diff --git a/test/task.ts b/test/task.ts index a87d240..cd2c697 100644 --- a/test/task.ts +++ b/test/task.ts @@ -1,8 +1,9 @@ import test from "node:test"; -import nodePath from "node:path"; -import { strict as assert } from "node:assert/strict"; +import assert from "node:assert/strict"; import { TaskTarget, + each, + map, cd, glob as taskGlob, read, @@ -13,11 +14,10 @@ import { TaskTargetPipelineHelper, } from "../data-export/task.ts"; -const THIS_FILE = import.meta.dirname; -const FIXTURE_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01'); -const FIXTURE_FILE = nodePath.join(FIXTURE_DIR, 'friends/friends.json'); +const FIXTURE_DIR = "/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01"; +const FIXTURE_FILE = `${FIXTURE_DIR}/friends/friends.json`; -// -- TaskTarget --------------------------------------------------------------- +// ── TaskTarget ─────────────────────────────────────────────────────────────── test("TaskTarget: constructor initializes path, pipeline, postFns", () => { const t = new TaskTarget("/foo/bar"); @@ -110,7 +110,7 @@ test("TaskTarget: glob returns matching TaskTargets from disk", () => { assert.ok(results.every(r => r.path.endsWith(".json"))); }); -// -- toShell / shEscape ------------------------------------------------------- +// ── toShell / shEscape ─────────────────────────────────────────────────────── test("toShell: a single read produces a cat command", () => { const t = new TaskTarget("/foo/bar.txt"); @@ -125,17 +125,22 @@ test("toShell: read piped into cmd", () => { assert.equal(t.toShell(), "cat /foo/bar.txt | jq ."); }); -for (const c of " $!&".split("")) { - test(`toShell: quotes paths that contain ${JSON.stringify(c)}`, () => { - const t = new TaskTarget(`/foo/bar${c}baz.txt`); - t.read(); - assert.equal(t.toShell(), `cat $'/foo/bar${c}baz.txt'`); - }); -} -test(`toShell: quotes and escapes paths that contain '`, () => { - const t = new TaskTarget(`/foo/bar'baz.txt`); +test("toShell: single-quotes paths that contain spaces", () => { + const t = new TaskTarget("/foo/bar baz.txt"); t.read(); - assert.equal(t.toShell(), `cat $'/foo/bar\\'baz.txt'`); + assert.equal(t.toShell(), "cat '/foo/bar baz.txt'"); +}); + +test("toShell: single-quotes paths that contain dollar signs", () => { + const t = new TaskTarget("/foo/$bar.txt"); + t.read(); + assert.equal(t.toShell(), "cat '/foo/$bar.txt'"); +}); + +test("toShell: escapes literal single-quotes inside a path", () => { + const t = new TaskTarget("/foo/it's.txt"); + t.read(); + assert.equal(t.toShell(), "cat '/foo/it'\\''s.txt'"); }); test("toShell: cmd with array splits tokens", () => { @@ -150,7 +155,22 @@ test("toShell: cmd with function resolves at shell-generation time", () => { assert.equal(t.toShell(), "jq -r .name /foo/bar.json"); }); -// -- module-level functions --------------------------------------------------- +// ── module-level functions ─────────────────────────────────────────────────── + +test("each: calls fn for every target", () => { + const targets = [new TaskTarget("/a"), new TaskTarget("/b")]; + const paths: string[] = []; + each(targets, t => paths.push(t.path)); + assert.deepEqual(paths, ["/a", "/b"]); +}); + +test("map: transforms each target", () => { + const targets = [new TaskTarget("/a"), new TaskTarget("/b")]; + const result = map(targets, t => { const c = t.clone(); c.path += "/x"; return c; }); + assert.equal(result[0].path, "/a/x"); + assert.equal(result[1].path, "/b/x"); + assert.equal(targets[0].path, "/a"); // originals unchanged +}); test("cd: clones and changes directory of each target", () => { const targets = [new TaskTarget("/a"), new TaskTarget("/b")]; @@ -191,7 +211,7 @@ test("taskGlob: returns matching targets across all input targets", () => { assert.ok(result.every(r => r.path.endsWith(".json"))); }); -// -- verify ------------------------------------------------------------------- +// ── verify ─────────────────────────────────────────────────────────────────── test("verify: removes targets with an empty pipeline", async () => { const t = new TaskTarget(FIXTURE_FILE); @@ -223,7 +243,7 @@ test("verify: filters a mixed list to only valid targets", async () => { assert.equal(result[0], good); }); -// -- getTSVManifest ----------------------------------------------------------- +// ── getTSVManifest ─────────────────────────────────────────────────────────── test("getTSVManifest: produces idshell for a single target", () => { const t = new TaskTarget("/foo/bar.txt"); @@ -238,7 +258,7 @@ test("getTSVManifest: joins multiple targets with newlines", () => { assert.equal(getTSVManifest([t1, t2]), "a\tcat /a.txt\nb\tcat /b.txt"); }); -// -- TaskTargetPipelineHelper ------------------------------------------------- +// ── TaskTargetPipelineHelper ───────────────────────────────────────────────── test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => { const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a")]); @@ -273,7 +293,7 @@ test("TaskTargetPipelineHelper: cmd returns a new helper with cmd ops added", () assert.equal(p2[0].toShell(), "cat /a.txt | jq ."); }); -// -- collect ------------------------------------------------------------------ +// ── collect ────────────────────────────────────────────────────────────────── test("collect: the final end of a chain is added to the collection set", () => { const collection = new Set();