Compare commits

...

3 commits

9 changed files with 315 additions and 545 deletions

View file

@ -168,7 +168,7 @@ function facebook_comments_generic(this: TaskTargetPipelineHelper, prop: string)
return this.cmd(["jq", "-r", `
["timestamp","data", "title"],
(
.comments[]?
.${prop}[]?
| [(.timestamp | todateiso8601), "TODO", .title]
)
| @csv
@ -269,7 +269,7 @@ function facebook_admin_records_generic(this: TaskTargetPipelineHelper, prop: st
return this.cmd(["jq", "-r", `
["event","created_timestamp","ip_address","user_agent","datr_cookie"],
(
.admin_records[]
.${prop}[]
| [.event, (.session.created_timestamp | todateiso8601), .ip_address, .user_agent, .datr_cookie]
)
| @csv
@ -301,10 +301,10 @@ function facebook_authorized_logins_v2(this: TaskTargetPipelineHelper) {
}
function facebook_contact_verification_generic(this: TaskTargetPipelineHelper, prop: string) {
return this.cmd(["jq", "-r", `
["action", "timestamp", "site", "ip_address"],
["timestamp", "email", "contact_type"],
(
.${prop}[]
| [.action, (.timestamp | todateiso8601), .site, .ip_address]
| [(.verification_time | todateiso8601), .contact, .contact_type]
)
| @csv
`])
@ -399,7 +399,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) {
// No corollary for your_off-facebook_activity.json
p.collect(col).cd(`apps_and_websites_off_of_facebook/connected_apps_and_websites.json`).read().facebook_installed_apps_v2();
p.collect(col).cd(`your_facebook_activity/comments_and_reactions/comments.json`).read().facebook_comments_v2();
p.collect(col).glob(`your_facebook_activity/messages/**/*.json`) // Files are message_1.json, etc
p.collect(col).glob(`your_facebook_activity/messages/*/**/*.json`) // Messages files are in the FOLDERS inside messages (archived_threads, e2ee_cutover, etc...)
.setId(t=>`Facebookv2 - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name
.read()
.facebook_messages_generic()
@ -438,7 +438,8 @@ function facebook_v2(this: TaskTargetPipelineHelper) {
p.collect(col).cd(`your_facebook_activity/facebook_marketplace/items_sold.json`).read().facebook_marketplace_items_sold_v2()
return Array.from(col);
const final = Array.from(col).flat();
return TaskTargetPipelineHelper.pipeline(final);
}
function facebook(this: TaskTargetPipelineHelper){
@ -606,7 +607,7 @@ function facebook(this: TaskTargetPipelineHelper){
p.collect(col).cd(`marketplace/items_sold.json`).read().facebook_marketplace_items_sold_v1()
p.collect(col).cd(`messages/**/*.json`) // Files are message_1.json, etc
p.collect(col).glob(`messages/**/*.json`) // Files are message_1.json, etc
.setId(t=>`Facebook - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name
.read()
.facebook_messages_generic()
@ -780,6 +781,7 @@ function facebook(this: TaskTargetPipelineHelper){
// `${facebookRoot}/your_places` - no data in my export
// `${facebookRoot}/your_topics` - no data in my export
return Array.from(col);
const final = Array.from(col).flat();
return TaskTargetPipelineHelper.pipeline(final);
};

86
data-export/parallel.ts Normal file
View file

@ -0,0 +1,86 @@
import { $, type ProcessOutput } from 'zx';
import os from 'os';
import { type TaskTarget, run } from "./task.ts";
$.verbose = false;
type ResultMap = Map<string, ProcessOutput>;
/**
 * Runs every target through `run()` with a bounded number of jobs in flight
 * and collects the results keyed by `target.id`.
 *
 * A single self-overwriting progress line is written to stderr while jobs run
 * unless `quiet` is set. The closing summary line is always written.
 *
 * @param targets Targets to execute; each is handed to `run()` once.
 * @param quiet When true, the live progress line is suppressed.
 * @param maxConcurrency Upper bound on simultaneously running jobs. Values
 *   below 1 and NaN are treated as 1 so the scheduler always makes progress.
 * @returns Map from `target.id` to whatever `run()` resolved with. Targets
 *   sharing an id overwrite each other (last one to finish wins).
 */
export async function parallel(
  targets: TaskTarget[],
  quiet: boolean = false,
  maxConcurrency: number = os.cpus().length
): Promise<ResultMap> {
  // `os.cpus()` may report an empty list, and callers may pass 0/NaN. With a
  // limit below 1 nothing would ever be scheduled and the loop below would
  // spin forever without yielding, so clamp to at least one job.
  const limit = Number.isNaN(maxConcurrency)
    ? 1
    : Math.max(1, Math.floor(maxConcurrency));

  const results: ResultMap = new Map<string, ProcessOutput>();
  const total = targets.length;
  let completed = 0;
  let running = 0;
  const completionTimes: number[] = [];
  const startTime = Date.now();
  const inFlight = new Set<Promise<void>>();

  /** Builds the one-line progress text from the current counters. */
  function formatEta(): string {
    const left = total - completed;
    const avgSeconds = completionTimes.length > 0
      ? completionTimes.reduce((a, b) => a + b, 0) / completionTimes.length / 1000
      : 0;
    const etaSeconds = Math.round(left * avgSeconds);
    const pct = total > 0 ? Math.round((completed / total) * 100) : 100;
    const lastDuration = completionTimes.length > 0
      ? (completionTimes[completionTimes.length - 1] / 1000).toFixed(1)
      : '0.0';
    return `ETA: ${etaSeconds}s Left: ${left} AVG: ${avgSeconds.toFixed(2)}s local:${running}/${completed}/${pct}%/${lastDuration}s`;
  }

  /** Redraws the progress line in place (leading `\r`), padded to 80 columns. */
  function printStatus(): void {
    if (quiet) {
      return;
    }
    process.stderr.write(`\r${formatEta()}`.padEnd(80));
  }

  /** Runs one target and records its result and duration. */
  async function runJob(t: TaskTarget): Promise<void> {
    running++;
    printStatus();
    const result = await run(t);
    completionTimes.push(result.duration);
    results.set(t.id, result);
    running--;
    completed++;
    printStatus();
  }

  const queue = targets.slice();
  // Process queue with concurrency limit
  while (queue.length > 0 || inFlight.size > 0) {
    // Fill up to max concurrency
    while (queue.length > 0 && inFlight.size < limit) {
      const target = queue.shift()!;
      const promise = runJob(target).then(() => {
        inFlight.delete(promise);
      });
      inFlight.add(promise);
    }
    // Wait for at least one job to finish before trying to refill
    if (inFlight.size > 0) {
      await Promise.race(inFlight);
    }
  }

  // Final status line
  process.stderr.write('\n');
  const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1);
  // Counted with a plain loop: calling `.filter` directly on the iterator
  // returned by `Map.prototype.values()` needs Iterator helpers, which older
  // Node releases do not provide.
  let failedCount = 0;
  for (const output of results.values()) {
    if (!output.ok) {
      failedCount++;
    }
  }
  process.stderr.write(
    `\nCompleted ${total} jobs in ${totalSeconds}s (${failedCount} failed)\n`
  );
  return results;
}

View file

@ -1,352 +0,0 @@
import nodePath from 'node:path';
import fs from 'node:fs';
import { strict as assert } from "node:assert";
import { execFile as _execFile } from "node:child_process";
import { promisify } from "node:util";
import { ZipFS } from "./zipFs.ts";
import { globSync } from "glob";
const execFile = promisify(_execFile);
type FSImpl = {
isZip?: boolean;
zipPath?: string;
init?(): Promise<void>;
ready?: boolean;
statSync: typeof fs["statSync"];
existsSync: typeof fs["existsSync"];
// Required by glob
lstatSync: typeof fs["lstatSync"];
// Needs to include withFileTypes DirEnt variant
readdir: typeof fs["readdir"];
readdirSync: typeof fs["readdirSync"];
readlinkSync: typeof fs["readlinkSync"];
realpathSync: typeof fs["realpathSync"];
promises: {
lstat: typeof fs.promises["lstat"];
// Needs to include withFileTypes DirEnt
readdir: typeof fs.promises["readdir"];
readlink: typeof fs.promises["readlink"];
realpath: typeof fs.promises["realpath"];
}
};
const defaultFSImpl = fs;
/**
 * Makes a string usable as a plain identifier by turning every character
 * other than ASCII letters, digits and `_` into an underscore.
 */
function safe(s: string) {
  // Without the `u`/`i` flags, `\W` is exactly the class [^A-Za-z0-9_]
  const sanitized = s.replace(/\W/g, '_');
  return sanitized;
}
/**
 * Quotes `s` so that a POSIX shell reads it back as exactly one literal word.
 *
 * Strings made only of characters that have no special meaning to the shell
 * are returned untouched. Everything else, including the empty string, is
 * wrapped in single quotes, with embedded single quotes written as `'\''`.
 *
 * @param s The word to quote. Must not contain a newline; callers are
 *   expected to deal with those before calling.
 * @returns Text that can be pasted into a shell command line as one argument.
 * @throws AssertionError when `s` contains a newline.
 */
function shEscape(s: string) {
  assert(!s.includes("\n"), "shEscape given new line, caller needs to handle these");
  // Allow-list rather than deny-list: anything not known to be inert
  // (`;`, `|`, `&`, `<`, `>`, `*`, `?`, `(`, backtick, `\`, `~`, tab, ...)
  // gets quoted. The `+` also routes the empty string to the quoting branch
  // so it survives as an argument instead of vanishing.
  if (/^[a-zA-Z0-9_\/:=@%+,.-]+$/.test(s)) {
    return s;
  }
  // Inside single quotes nothing is special except the single quote itself,
  // which has to be written by closing the quote, emitting an escaped quote,
  // and reopening: '\''
  return `'${s.replace(/'/g, "'\\''")}'`;
}
/**
 * Base class for one step in a target's pipeline. Each step keeps a reference
 * to the target it belongs to and knows how to render itself as shell text.
 */
abstract class TaskTargetBase {
  /**The target this step was created for*/
  target: TaskTarget;

  /**Renders this step as a shell command fragment*/
  abstract toShell(): string;

  /**Step kind: "read" for a source step, "mid" for any other step*/
  abstract get type(): "read" | "mid";

  constructor(target: TaskTarget) {
    this.target = target;
  }
}
/**
 * Source step: emits the shell command that reads the target's file, either
 * straight from disk or out of the zip archive backing its fsImpl.
 */
class TaskTargetRead extends TaskTargetBase {
  get type() {
    return "read" as const;
  }

  toShell() {
    const { fsImpl, path } = this.target;

    // Plain filesystem: just cat the file
    if (!fsImpl.isZip) {
      return "cat " + shEscape(path);
    }

    // Zip-backed: have 7z extract the single member using its `-so` switch
    assert(fsImpl.zipPath, "Should have a zipPath");
    const archive = shEscape(fsImpl.zipPath);
    const member = shEscape(path);
    return ["7z", "x", archive, "-so", member].join(" ");
  }
}
/**
 * Pipeline step that runs an external command. The command is stored as an
 * argv-style array and rendered to shell text on demand.
 */
class TaskTargetCmd extends TaskTargetBase {
  get type(){ return "mid" as const; }

  /**What nodejs spawn() and execFile() take
   * [cmd, ...args]: string[]
   */
  cmd: string[];

  /**
   * Normalizes any accepted command form into an argv array.
   *
   * @param target Passed to `v` when `v` is a function.
   * @param v An argv array, a whitespace-separated command string, or a
   *   function of the target returning either of those.
   * @returns The argv array. A string is split on runs of whitespace; an
   *   array is returned as given.
   */
  static parse(target: TaskTarget, v: string | string[] | ((t: TaskTarget)=>string) | ((t: TaskTarget)=>string[])): string[] {
    const resolved = typeof v === "function" ? v(target) : v;
    if (typeof resolved === "string") {
      // Splitting " jq . " on whitespace yields empty first/last tokens;
      // drop them so they never show up as bogus (empty) arguments.
      return resolved.split(/\s+/).filter(token => token.length > 0);
    }
    return resolved;
  }

  constructor(target: TaskTarget, cmd: string | string[] | ((t: TaskTarget)=>string) | ((t: TaskTarget)=>string[])) {
    super(target);
    this.cmd = TaskTargetCmd.parse(target, cmd);
  }

  /**Renders the argv as one shell line, each argument individually escaped*/
  toShell() {
    const out = this.cmd
      .map(c => {
        // shEscape refuses newlines, so they are removed (not replaced) first
        let sh = c.replace(/\n/g, "")
        return shEscape(sh);
      });

    return out.join(" ");
  }
}
/**
 * One concrete path (on disk or inside a zip-backed fsImpl) together with the
 * pipeline of steps to run against it, an optional id, and post-run hooks.
 */
class TaskTarget {
  /**Path this target currently points at*/
  path: string;
  /**Filesystem implementation used for existence checks and globbing*/
  fsImpl: FSImpl = defaultFSImpl;
  /**Ordered steps; a "read" step is only allowed in first position*/
  pipeline: TaskTargetBase[];
  /**Fixed id, or a function computing the id from the target*/
  idValue: string | ((t: TaskTarget)=>string) | undefined;
  /**Hooks registered through pushPostFn*/
  postFns: ((t: TaskTarget)=>Promise<void>)[];

  constructor(path: string){
    this.path = path;
    this.pipeline = [];
    this.postFns = [];
  }

  /**Whether the current path exists according to fsImpl*/
  exists() {
    return this.fsImpl.existsSync(this.path);
  }

  /**Resolves `path` against the current path unless it starts with "/"*/
  _joinPath(path: string) {
    let finalPath = path;
    if (!path.startsWith('/')) {
      finalPath = nodePath.join(this.path, path)
    }
    return finalPath;
  }

  /**Last path segment, sanitized with safe()*/
  get basename() {
    return safe(nodePath.basename(this.path));
  }

  /**Last `n` path segments, each sanitized with safe(), joined by "___"*/
  basenameN(n: number) {
    return this.path
      .split("/")
      .map(s => safe(s))
      .slice(-n)
      .join("___");
  }

  /**
   * Sanitized id of this target.
   * @throws AssertionError when no id has been set.
   */
  get id() {
    assert(this.idValue, `TaskTarget for path "${this.path}" must have an id`);
    if (typeof this.idValue === "function") {
      return safe(this.idValue(this));
    }
    return safe(this.idValue);
  }

  /**Changes the current directory of the target*/
  cd(path: string) {
    this.path = this._joinPath(path);
  }

  /**Get a glob off of the target*/
  glob(globPath: string) {
    globPath = this._joinPath(globPath);
    return globSync(globPath, {
      // NOTE(review): placeholder cwd; presumably patterns are expected to be
      // absolute or resolved by fsImpl - confirm
      cwd: '/DUMMYCWD',
      fs: this.fsImpl
    });
  }

  /**
   * Copies path, fsImpl, a function-valued id and the post hooks into a new
   * target. The pipeline is not copied, and a string id is not carried over.
   */
  clone() {
    const t = new TaskTarget(this.path);
    t.fsImpl = this.fsImpl;
    // NOTE(review): string ids are dropped here, presumably so clones made by
    // glob() do not all share one fixed id - confirm
    t.idValue = typeof this.idValue === "function" ? this.idValue : undefined;
    // Copy the hooks of the target being cloned, not the clone's own (empty) list
    t.postFns = this.postFns.slice();
    //TODO: clone pipeline
    return t;
  }

  /**
   * Appends a step to the pipeline.
   * @throws AssertionError when a "read" step is added to a non-empty pipeline.
   */
  pushToPipeline(v: TaskTargetBase) {
    if (v.type === "read") {
      assert(this.pipeline.length === 0, "A read can only be the first item in a pipeline");
    }
    this.pipeline.push(v);
  }

  /**Registers a post hook for this target*/
  pushPostFn(fn: ((t: TaskTarget)=>Promise<void>)) {
    this.postFns.push(fn);
  }
}
/**
 * A very composable object: a chainable builder over a set of TaskTargets.
 * Most methods apply an operation to every current target and return `this`
 * so calls can be chained.
 */
export class Task {
  /**The targets this task currently operates on*/
  targets: TaskTarget[];
  /**SHARED list of all tasks for this given tree*/
  tasks: Task[];

  /**Starts with a single target rooted at the process working directory*/
  constructor() {
    this.tasks = [];
    this.targets = [new TaskTarget(process.cwd())];
  }

  /**Changes directory on every target*/
  cd(path: string) {
    for (const t of this.targets) {
      // TODO: opts
      t.cd(path);
    }
    return this;
  }

  /**Globs for all the paths that match under all targets*/
  glob(globPath: string) {
    // For every target, concat glob onto it, glob, and then
    // replace the original set of targets with all the new ones
    const newTargets: TaskTarget[] = [];
    for (const t of this.targets) {
      const matches = t.glob(globPath);
      for (const m of matches) {
        const newT = t.clone();
        newT.path = m;
        newTargets.push(newT);
      }
    }
    this.targets = newTargets;
    return this;
  }

  /**Opens all targets as zip archives*/
  async zip() {
    for (const t of this.targets) {
      const zfs = new ZipFS(t.path);
      await zfs.init();
      t.path = ""; // Each target is now rooted at the base of its respective zip
      t.fsImpl = zfs.getImpl() as any;
    }
    return this;
  }

  /**Returns a copy of ourself*/
  clone() {
    const t = new Task();
    t.targets = this.targets.map(t => t.clone());
    t.tasks = this.tasks; //SHARED object reference
    return t;
  }

  /**Returns a copy of ourself, but adds us to this tree's shared
   * task list as well*/
  fork() {
    const c = this.clone();
    this.tasks.push(c);
    return c;
  }

  /**Appends a command step to every target's pipeline*/
  cmd(cmd: string | string[] | ((target: TaskTarget)=>string) | ((target: TaskTarget)=>string[])) {
    for (const t of this.targets) {
      t.pushToPipeline(new TaskTargetCmd(t, cmd));
    }
    return this;
  }

  /**Appends a read step to every target's pipeline*/
  read() {
    for (const t of this.targets) {
      t.pushToPipeline(new TaskTargetRead(t));
    }
    return this;
  }

  /**Sets the id (or id function) on every target*/
  setId(idValue: string | ((t: TaskTarget)=>string)) {
    for (const t of this.targets) {
      t.idValue = idValue;
    }
    return this;
  }

  /**Registers a post hook on every target; chainable like the other builders*/
  post(fn: (t: TaskTarget)=>Promise<void>) {
    for (const t of this.targets) {
      t.pushPostFn(fn);
    }
    return this;
  }

  types(
    types: string[]
  ) {
    // TODO:
    return this;
  }

  /**
   * Registers a post hook that queries `your.db` through `sqlite-utils` for a
   * row count (plus min/max of the first summarization column when its type
   * is "numeric") and stores the formatted line on the target as `log`.
   *
   * @param summarization Pairs of [columnName, type]; only the first pair is
   *   used.
   */
  csvSink(
    summarization?: [string, string][]
  ) {
    // Ingest this csv into the database at the given id
    // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);

    // Add a post processing function for these targets that prints out the summarization
    // stats
    this.post(async (t: TaskTarget)=>{
      // We only do the first one so far for the summarization
      let queryLine: string;
      let formatFn: (r: any)=>string;
      const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
      if (type === "numeric") {
        // NOTE(review): columnName is interpolated into SQL as-is; it must
        // only ever come from trusted code
        queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
        formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
      }
      else {
        queryLine = `count(*) as n`;
        formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
      }

      const cmd = "sqlite-utils";
      const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
      const { stdout } = await execFile(cmd, args);
      const results = JSON.parse(stdout);
      const result = results[0]; // should only be one result in the array for this type of query
      const logLine = formatFn(result);
      (t as any).log = logLine;
    });

    return this;
  }

  /**Collect all the TaskTargets, make sure everything is init'd and exists
   * and output the targets for processing*/
  async getFinalTargets() {
    const targets: TaskTarget[] = [];
    for (const task of this.tasks) {
      for (const t of task.targets) {
        // Make sure fsImpl is ready
        if ("ready" in t.fsImpl && !t.fsImpl.ready && t.fsImpl.init) {
          await t.fsImpl.init();
        }
        if (t.pipeline.length <= 0) {
          continue; // Tasks with empty pipelines are no-ops, remove
        }
        if (!t.exists()) {
          console.warn(`Missing target ${t.path}`);
          continue;
        }

        targets.push(t);
      }
    }
    return targets;
  }

  /**Renders the final targets as lines of `id<TAB>shell pipeline`, one per target*/
  async getTaskTSVShell() {
    const targets = await this.getFinalTargets();
    let out: string[] = [];
    for (const t of targets) {
      const shell = t.pipeline
        .map(p => p.toShell())
        .join(" | ")
      out.push(`${t.id}\t${shell}`);
    }

    return out.join("\n");
  }
}

View file

@ -3,7 +3,7 @@ import fs from 'node:fs';
import { strict as assert } from "node:assert";
import { ZipFS } from "./zipFs.ts";
import { globSync } from "glob";
import { $ } from "zx";
import { $, ProcessPromise, quote } from "zx";
type FSImpl = {
isZip?: boolean;
@ -36,20 +36,6 @@ function safe(s: string) {
}
//TODO: DANGER: I doubt this is safe...
function shEscape(s: string) {
assert(!s.includes("\n"), "shEscape given new line, caller needs to handle these");
if (!s.match(/[ \$\"\'\!]/)) {
return s;
}
// We need to quote this string
// Single quoted strings require you to close the single quoted string, then
// use the escaped single quote, and then reopen the string... obscene
s = s.replace(/'/g, "'\\''");
s = `'${s}'`;
return s;
}
interface TaskTargetOp {
type: "read" | "mid";
toShell(target: TaskTarget): string;
@ -61,11 +47,11 @@ class TaskTargetRead implements TaskTargetOp {
if (target.fsImpl.isZip) {
assert(target.fsImpl.zipPath, "Should have a zipPath");
// We need to be able to do this
return `7z x ${shEscape(target.fsImpl.zipPath)} -so ${shEscape(target.path)}`;
return `7z x ${quote(target.fsImpl.zipPath)} -so ${quote(target.path)}`;
}
// TODO : Implement when reading from a zip file
return `cat ${shEscape(target.path)}`;
return `cat ${quote(target.path)}`;
}
clone() {
return new TaskTargetRead();
@ -96,7 +82,7 @@ class TaskTargetCmd implements TaskTargetOp {
const out = parsedCmd
.map(c => {
let sh = c.replace(/\n/g, "")
return shEscape(sh);
return quote(sh);
});
return out.join(" ");
@ -332,6 +318,16 @@ export function getTSVManifest(targets: TaskTarget[]): string {
return out.join("\n");
}
/**
 * Builds the manifest as `[id, shell command]` pairs, one per target, in the
 * same order as `targets`.
 *
 * @param targets Targets to describe; `toShell()` and `id` are read from each.
 * @returns One mutable `[id, shell]` tuple per target.
 */
export function getTaskManifest(targets: TaskTarget[]): [string, string][] {
  // The tuple type is annotated on the callback instead of using `as const`,
  // which would produce a readonly tuple that is not assignable to the
  // declared mutable `[string, string]` element type.
  return targets.map((t): [string, string] => {
    const shell = t.toShell();
    return [t.id, shell];
  });
}
function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) {
if (!a.__collection) {
return;
@ -408,25 +404,7 @@ export class TaskTargetPipelineHelper extends Array<TaskTarget> {
}
}
export async function parallel(targets: TaskTarget[]) {
const finalTargets = await verify(targets);
const manifestTSV = getTSVManifest(finalTargets);
try {
await $({ input: manifestTSV })`/usr/bin/parallel \
--colsep ${'\t'} \
--jobs 0 \
--linebuffer \
--tagstring {1} \
--eta \
--joblog out.manifest \
${'bash -c {2} > OUTTEST/{1}.csv'} \
::::- `; // stdin is in manifestTSV
}
catch(err: any) {
// I'm pretty sure status is the amount that failed?
if (err?.status >= 30) {
throw err;
}
}
/**
 * Renders the target's pipeline with `toShell()` and executes it through
 * `bash -c`, resolving with what the zx call yields. The call is made with
 * `nothrow: true`.
 */
export async function run(target: TaskTarget): Promise<ProcessPromise> {
  const shellLine = target.toShell();
  const outcome = await $({ nothrow: true })`bash -c ${shellLine}`;
  return outcome;
}

View file

@ -5,7 +5,9 @@
"main": "index.js",
"type": "module",
"scripts": {
"test": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/util.ts",
"test": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/task.ts",
"test2": "node --enable-source-maps --test --experimental-transform-types --no-warnings ./test/facebook.ts",
"test-update-snapshots": "node --enable-source-maps --test --experimental-transform-types --no-warnings --test-update-snapshots ./test/facebook.ts",
"dev": "vite --port 2223",
"server": "node --experimental-transform-types server/server.ts",
"prototype": "node --import ./util/tsx-loader.js --import ./util/ignore-css-loader.js --experimental-transform-types server/prototype.ts"

View file

@ -1,115 +1,73 @@
import test from "node:test";
import fs from "node:fs";
import assert from "node:assert";
import nodePath from "node:path";
import { strict as assert } from "node:assert";
import { finished } from "node:stream/promises";
import { Readable, Writable } from "node:stream";
import { TaskTargetPipelineHelper } from "../data-export/task.ts";
import { TaskTargetPipelineHelper, TaskTarget, verify, getTSVManifest, getTaskManifest, run } from "../data-export/task.ts";
import { parallel } from "../data-export/parallel.ts";
import "../data-export/facebook.ts";
test("facebook: Can load the 2021 export", async () => {
// TODO:
// const t = new Task();
// (await t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2();
// const taskText = await t.getTaskTSVShell();
// await fs.writeFile('test.manifest', taskText);
// // Run everything with parallel
// try {
// execFileSync('/usr/bin/parallel', ['--colsep', '\t', '--jobs', '0', '--linebuffer', '--tagstring', '{1}', '--eta', '--joblog', 'out.manifest', 'bash -c {2} > OUTTEST/{1}.csv', '::::', 'test.manifest'], {
// stdio: 'inherit'
// });
// }
// catch(err: any) {
// // I'm pretty sure status is the amount that failed?
// if (err?.status >= 30) {
// throw err;
// }
// }
const THIS_FILE = import.meta.dirname;
const FACEBOOK_V1_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01');
const FACEBOOK_V1_ZIPPED = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01.zip');
const FACEBOOK_V2_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2025-11-29');
// // Now take the output and load it all into a single SQLITE file
// const entries = await fs.readdir('OUTTEST', { withFileTypes: true });
// const csvFiles = entries
// .filter(e => e.isFile() && e.name.endsWith(".csv"))
// .map(e => nodePath.join('OUTTEST', e.name));
test("facebook: Can load the 2021 export", async (t) => {
const targets = TaskTargetPipelineHelper.pipeline([
new TaskTarget(FACEBOOK_V1_DIR)
])
.facebook();
const finalTargets = await verify(targets);
const result = await parallel(finalTargets, true);
for (const [id, r] of result.entries()) {
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
assert.ok(r.ok, `Task ${id} should be okay`);
}
const allCSV = Array.from(result.entries())
.sort() // Keep stable ordering for snapshots
.map(([id, r]) => r.stdout);
t.assert.snapshot(allCSV);
});
test("facebook: Can load the 2021 export zipped", async (t) => {
const targets = await TaskTargetPipelineHelper.pipeline([
new TaskTarget(FACEBOOK_V1_ZIPPED)
])
.unzip();
const targets2 = targets
.facebook();
// import fs from 'node:fs/promises';
// import { type SpawnOptions, execFile as _execFile, execFileSync } from "node:child_process";
// import nodePath from "node:path";
// import { DatabaseSync } from "node:sqlite";
// import { promisify } from "node:util";
// import "../data-export/facebook.ts";
// import { google } from "../data-export/google.ts";
// const execFile = promisify(_execFile);
const finalTargets = await verify(targets2);
const result = await parallel(finalTargets, true);
for (const [id, r] of result.entries()) {
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
assert.ok(r.ok, `Task ${id} should be okay`);
}
// declare module "../data-export/task.ts" {
// interface Task {
// google: typeof google;
// }
// }
const allCSV = Array.from(result.entries())
.sort() // Keep stable ordering for snapshots
.map(([id, r]) => r.stdout);
// Object.assign(Task.prototype, {
// google
// });
t.assert.snapshot(allCSV);
});
test("facebook: Can load the 2025 export", async (t) => {
const targets = TaskTargetPipelineHelper.pipeline([
new TaskTarget(FACEBOOK_V2_DIR)
])
.facebook_v2();
// function loadIntoSqlite(
// paths: string[],
// sqlitePath: string
// ) {
// // Open an in-memory db for speed
// const db = new DatabaseSync(":memory:", { allowExtension: true });
// db.loadExtension("/home/cobertos/sqlite-files/csv.so")
// db.enableLoadExtension(false);
// for (const path of paths) {
// const table = nodePath.basename(path, ".csv");
// console.log(`Loading ${path} → table ${table}`);
const finalTargets = await verify(targets);
const result = await parallel(finalTargets, true);
for (const [id, r] of result.entries()) {
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
assert.ok(r.ok, `Task ${id} should be okay`);
}
// // const headers = lines[0].split(",");
// // const columnsSql = headers.map(h => `"${h}" TEXT`).join(", ");
// db.exec(`CREATE VIRTUAL TABLE temp.intermediate USING csv(filename='${path}');`);
// db.exec(`CREATE TABLE "${table}" AS SELECT * FROM intermediate;`);
// db.exec(`DROP TABLE IF EXISTS intermediate;`);
// }
const allCSV = Array.from(result.entries())
.sort() // Keep stable ordering for snapshots
.map(([id, r]) => r.stdout);
// // Dump it all to the path specified
// db.exec(`VACUUM main INTO '${sqlitePath}'`);
// db.close();
// }
// async function main() {
// const t = new Task();
// // t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json")
// // .facebook()
// (await t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip").zip()).facebook_v2();
// // t.fork().cd("/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001")
// // .google()
// // let zipTask = t.fork().zip("/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-DEADNAME-May2021-json.zip");
// // await (zipTask.fsImpl as any).init();
// // zipTask.facebook();
// const taskText = await t.getTaskTSVShell();
// await fs.writeFile('test.manifest', taskText);
// // Run everything with parallel
// try {
// execFileSync('/usr/bin/parallel', ['--colsep', '\t', '--jobs', '0', '--linebuffer', '--tagstring', '{1}', '--eta', '--joblog', 'out.manifest', 'bash -c {2} > OUTTEST/{1}.csv', '::::', 'test.manifest'], {
// stdio: 'inherit'
// });
// }
// catch(err: any) {
// // I'm pretty sure status is the amount that failed?
// if (err?.status >= 30) {
// throw err;
// }
// }
// // Now take the output and load it all into a single SQLITE file
// const entries = await fs.readdir('OUTTEST', { withFileTypes: true });
// const csvFiles = entries
// .filter(e => e.isFile() && e.name.endsWith(".csv"))
// .map(e => nodePath.join('OUTTEST', e.name));
// await fs.unlink('your.db');
// loadIntoSqlite(csvFiles, 'your.db');
// }
// main();
t.assert.snapshot(allCSV);
});

116
test/facebook.ts.snapshot Normal file
View file

@ -0,0 +1,116 @@
exports[`facebook: Can load the 2021 export 1`] = `
[
"\\"album\\",\\"uri\\",\\"creation_timestamp\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-03-07T15:23:20Z\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-07-01T07:46:40Z\\"\\n",
"[\\n \\"from\\",\\n \\"to\\",\\n \\"timestamp\\",\\n \\"body\\"\\n]\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n",
"\\"status\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\"\\n",
"\\"service_name\\",\\"native_app_id\\",\\"username\\",\\"email\\",\\"phone_number\\",\\"name\\"\\n\\"xxx\\",69,\\"xxx\\",\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\"\\n\\"xxx\\",1707005000,\\"xxx\\",\\"not_a_real_email@example.com\\",,\\"xxx\\"\\n",
"\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",,,\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\",,,\\n",
"\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-12-29T08:13:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n",
"\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-22T01:26:40Z\\",\\"2024-05-11T15:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"\\",\\"\\",\\"\\",\\"xxx\\"\\n",
"\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-08T19:20:00Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-01-17T14:00:00Z\\",\\"TODO\\",\\"xxx\\"\\n",
"\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-10-18T07:03:20Z\\",\\"not_a_real_email@example.com\\",69\\n\\"2024-01-21T22:10:00Z\\",\\"not_a_real_email@example.com\\",69\\n",
"\\"name\\"\\n\\"xxx\\"\\n\\"xxx\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-13T13:13:20Z\\"\\n\\"xxx\\",\\"2024-10-31T00:36:40Z\\"\\n",
"\\"game\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-11-03T16:06:40Z\\"\\n",
"\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n",
"\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-04-23T17:56:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n",
"\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n",
"\\"from\\",\\"to\\",\\"amount\\",\\"currency\\",\\"type\\",\\"status\\",\\"payment_method\\",\\"created_timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-05T21:36:40Z\\"\\n",
"\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-15T12:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-12T06:13:20Z\\"\\n",
"\\"from\\",\\"to\\",\\"rank\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",69,\\"2024-07-22T19:03:20Z\\"\\n",
"\\"title\\",\\"timestamp\\",\\"reaction\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n",
"\\"title\\",\\"timestamp\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-08T16:33:20Z\\"\\n\\"xxx\\",\\"2024-09-24T19:10:00Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-27T15:13:20Z\\"\\n\\"xxx\\",\\"2024-08-24T00:40:00Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-06-23T05:20:00Z\\"\\n\\"xxx\\",\\"2024-05-25T08:16:40Z\\"\\n",
"\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-04-28T20:10:00Z\\"\\n",
"\\"from\\",\\"to\\",\\"subject\\",\\"message\\",\\"timestamp\\"\\n\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-16T06:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"url://somewhere\\",\\"2024-10-16T06:26:40Z\\"\\n",
"\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-12-17T08:43:20Z\\"\\n",
"\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n",
"\\"name\\",\\"id\\",\\"type\\",\\"timestamp\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-11T12:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T19:56:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T11:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-07T21:06:40Z\\"\\n",
"\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-02-27T05:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-05-16T03:26:40Z\\"\\n",
"\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-10-31T06:10:00Z\\"\\n",
"\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n",
"\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n"
]
`;
exports[`facebook: Can load the 2021 export zipped 1`] = `
[
"\\"album\\",\\"uri\\",\\"creation_timestamp\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-03-07T15:23:20Z\\"\\n\\"xxx\\",\\"photos_and_videos/CoverPhotos_yyyyyy/200x200png.png\\",\\"2024-07-01T07:46:40Z\\"\\n",
"[\\n \\"from\\",\\n \\"to\\",\\n \\"timestamp\\",\\n \\"body\\"\\n]\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n\\"Me\\",\\"xxx\\",\\"2024-01-13T07:13:20Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n",
"\\"status\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\"\\n",
"\\"service_name\\",\\"native_app_id\\",\\"username\\",\\"email\\",\\"phone_number\\",\\"name\\"\\n\\"xxx\\",69,\\"xxx\\",\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\"\\n\\"xxx\\",1707005000,\\"xxx\\",\\"not_a_real_email@example.com\\",,\\"xxx\\"\\n",
"\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",,,\\n\\"xxx\\",\\"2024-02-13T14:36:40Z\\",,,\\n",
"\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-12-29T08:13:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n",
"\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-22T01:26:40Z\\",\\"2024-05-11T15:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"\\",\\"\\",\\"\\",\\"xxx\\"\\n",
"\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-08T19:20:00Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-01-17T14:00:00Z\\",\\"TODO\\",\\"xxx\\"\\n",
"\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-10-18T07:03:20Z\\",\\"not_a_real_email@example.com\\",69\\n\\"2024-01-21T22:10:00Z\\",\\"not_a_real_email@example.com\\",69\\n",
"\\"name\\"\\n\\"xxx\\"\\n\\"xxx\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-13T13:13:20Z\\"\\n\\"xxx\\",\\"2024-10-31T00:36:40Z\\"\\n",
"\\"game\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-11-03T16:06:40Z\\"\\n",
"\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-12-18T05:33:20Z\\",69,69,\\"xxx\\"\\n",
"\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-04-23T17:56:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n",
"\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-04-30T08:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"2024-05-01T07:53:20Z\\"\\n",
"\\"from\\",\\"to\\",\\"amount\\",\\"currency\\",\\"type\\",\\"status\\",\\"payment_method\\",\\"created_timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-05-05T21:36:40Z\\"\\n",
"\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-15T12:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-12T06:13:20Z\\"\\n",
"\\"from\\",\\"to\\",\\"rank\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",69,\\"2024-07-22T19:03:20Z\\"\\n",
"\\"title\\",\\"timestamp\\",\\"reaction\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n,\\"2024-01-14T06:50:00Z\\",\\"xxx\\"\\n",
"\\"title\\",\\"timestamp\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n,\\"2024-10-06T08:56:40Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-08T16:33:20Z\\"\\n\\"xxx\\",\\"2024-09-24T19:10:00Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-27T15:13:20Z\\"\\n\\"xxx\\",\\"2024-08-24T00:40:00Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-06-23T05:20:00Z\\"\\n\\"xxx\\",\\"2024-05-25T08:16:40Z\\"\\n",
"\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-04-28T20:10:00Z\\"\\n",
"\\"from\\",\\"to\\",\\"subject\\",\\"message\\",\\"timestamp\\"\\n\\"not_a_real_email@example.com\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-16T06:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"url://somewhere\\",\\"2024-10-16T06:26:40Z\\"\\n",
"\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-12-17T08:43:20Z\\"\\n",
"\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n\\"xxx\\",\\"2024-01-14T06:50:00Z\\"\\n",
"\\"name\\",\\"id\\",\\"type\\",\\"timestamp\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-11T12:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T19:56:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-10T11:36:40Z\\"\\n\\"xxx\\",69,\\"xxx\\",\\"2024-02-07T21:06:40Z\\"\\n",
"\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-02-27T05:00:00Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-05-16T03:26:40Z\\"\\n",
"\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-05-01T07:53:20Z\\"\\n\\"xxx\\",\\"TODO: data\\",\\"2024-10-31T06:10:00Z\\"\\n",
"\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-02-08T19:20:00Z\\"\\n",
"\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-11-17T06:30:00Z\\"\\n"
]
`;
exports[`facebook: Can load the 2025 export 1`] = `
[
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"some/path\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\n",
"\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-22T10:06:40Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-21T23:00:00Z\\"\\n",
"\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-13T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-07-12T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n",
"\\"name\\",\\"added_timestamp\\"\\n\\"xxx\\",\\"2024-01-12T00:40:00Z\\"\\n\\"xxx\\",\\"2024-06-21T17:13:20Z\\"\\n",
"\\"timestamp\\",\\"email\\",\\"contact_type\\"\\n\\"2024-02-07T19:43:20Z\\",\\"not_a_real_email@example.com\\",69\\n",
"\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-10-06T06:10:00Z\\"\\n\\"xxx\\",\\"TODO\\",\\"2024-01-22T16:13:20Z\\"\\n",
"\\"title\\",\\"price\\",\\"seller\\",\\"created_timestamp\\",\\"latitude\\",\\"longitude\\",\\"description\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-10-02T23:00:00Z\\",69,69,\\"xxx\\"\\n\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-09-27T01:20:00Z\\",69,69,\\"xxx\\"\\n",
"\\"action\\",\\"timestamp\\",\\"site\\",\\"ip_address\\"\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",\\"xxx\\",\\"1.1.1.1\\"\\n",
"\\"timestamp\\",\\"unread\\",\\"href\\",\\"text\\"\\n\\"2024-11-20T12:16:40Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n\\"2024-11-15T00:20:00Z\\",true,\\"url://somewhere\\",\\"xxx\\"\\n",
"\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-21T03:10:00Z\\"\\n",
"\\"name\\",\\"uri\\",\\"timestamp\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-09-11T20:03:20Z\\"\\n\\"xxx\\",\\"url://somewhere\\",\\"2024-01-20T12:50:00Z\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-10T10:43:20Z\\"\\n\\"xxx\\",\\"2024-09-02T12:26:40Z\\"\\n",
"\\"event\\",\\"created_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"datr_cookie\\"\\n\\"xxx\\",\\"2024-08-11T01:33:20Z\\",,,\\n\\"xxx\\",\\"2024-08-10T14:26:40Z\\",,,\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-09-01T14:13:20Z\\"\\n\\"xxx\\",\\"2024-08-12T08:06:40Z\\"\\n",
"\\"start\\",\\"end\\"\\n",
"\\"name\\",\\"created_timestamp\\",\\"updated_timestamp\\",\\"ip_address\\",\\"user_agent\\",\\"location\\",\\"app\\",\\"session_type\\",\\"datr_cookie\\"\\n,\\"2024-04-04T19:46:40Z\\",\\"2024-11-23T02:46:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\"\\n,\\"2024-04-05T06:53:20Z\\",\\"2024-11-22T10:06:40Z\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\"\\n",
"\\"name\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-04-01T16:46:40Z\\"\\n\\"xxx\\",\\"2024-09-07T16:03:20Z\\"\\n",
"\\"title\\",\\"timestamp\\"\\n\\"xxx\\",\\"2024-02-12T17:46:40Z\\"\\n\\"xxx\\",\\"2024-02-12T17:46:40Z\\"\\n",
"\\"title\\",\\"data\\",\\"timestamp\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-12-08T09:26:40Z\\"\\n\\"xxx\\",\\"xxx\\",\\"2024-12-28T00:16:40Z\\"\\n"
]
`;

Binary file not shown.

View file

@ -1,9 +1,8 @@
import test from "node:test";
import assert from "node:assert/strict";
import nodePath from "node:path";
import { strict as assert } from "node:assert/strict";
import {
TaskTarget,
each,
map,
cd,
glob as taskGlob,
read,
@ -14,10 +13,11 @@ import {
TaskTargetPipelineHelper,
} from "../data-export/task.ts";
const FIXTURE_DIR = "/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01";
const FIXTURE_FILE = `${FIXTURE_DIR}/friends/friends.json`;
const THIS_FILE = import.meta.dirname;
const FIXTURE_DIR = nodePath.join(THIS_FILE, 'fixtures/facebook-json-2021-05-01');
const FIXTURE_FILE = nodePath.join(FIXTURE_DIR, 'friends/friends.json');
// ── TaskTarget ───────────────────────────────────────────────────────────────
// -- TaskTarget ---------------------------------------------------------------
test("TaskTarget: constructor initializes path, pipeline, postFns", () => {
const t = new TaskTarget("/foo/bar");
@ -110,7 +110,7 @@ test("TaskTarget: glob returns matching TaskTargets from disk", () => {
assert.ok(results.every(r => r.path.endsWith(".json")));
});
// ── toShell / shEscape ───────────────────────────────────────────────────────
// -- toShell / shEscape -------------------------------------------------------
test("toShell: a single read produces a cat command", () => {
const t = new TaskTarget("/foo/bar.txt");
@ -125,22 +125,17 @@ test("toShell: read piped into cmd", () => {
assert.equal(t.toShell(), "cat /foo/bar.txt | jq .");
});
test("toShell: single-quotes paths that contain spaces", () => {
const t = new TaskTarget("/foo/bar baz.txt");
for (const c of " $!&".split("")) {
test(`toShell: quotes paths that contain ${JSON.stringify(c)}`, () => {
const t = new TaskTarget(`/foo/bar${c}baz.txt`);
t.read();
assert.equal(t.toShell(), `cat $'/foo/bar${c}baz.txt'`);
});
}
test(`toShell: quotes and escapes paths that contain '`, () => {
const t = new TaskTarget(`/foo/bar'baz.txt`);
t.read();
assert.equal(t.toShell(), "cat '/foo/bar baz.txt'");
});
test("toShell: single-quotes paths that contain dollar signs", () => {
const t = new TaskTarget("/foo/$bar.txt");
t.read();
assert.equal(t.toShell(), "cat '/foo/$bar.txt'");
});
test("toShell: escapes literal single-quotes inside a path", () => {
const t = new TaskTarget("/foo/it's.txt");
t.read();
assert.equal(t.toShell(), "cat '/foo/it'\\''s.txt'");
assert.equal(t.toShell(), `cat $'/foo/bar\\'baz.txt'`);
});
test("toShell: cmd with array splits tokens", () => {
@ -155,22 +150,7 @@ test("toShell: cmd with function resolves at shell-generation time", () => {
assert.equal(t.toShell(), "jq -r .name /foo/bar.json");
});
// ── module-level functions ───────────────────────────────────────────────────
// `each` must hand every target to the callback; the recorded paths are
// compared against the inputs in their original order.
test("each: calls fn for every target", () => {
  const seenPaths: string[] = [];
  const inputs = [new TaskTarget("/a"), new TaskTarget("/b")];

  each(inputs, target => seenPaths.push(target.path));

  assert.deepEqual(seenPaths, ["/a", "/b"]);
});
// `map` should return the callback's results; because the callback edits a
// clone, the first input target is expected to keep its original path.
test("map: transforms each target", () => {
  const originals = [new TaskTarget("/a"), new TaskTarget("/b")];

  const mapped = map(originals, target => {
    const copy = target.clone();
    copy.path += "/x";
    return copy;
  });

  assert.equal(mapped[0].path, "/a/x");
  assert.equal(mapped[1].path, "/b/x");
  // The input itself must not have been modified.
  assert.equal(originals[0].path, "/a");
});
// -- module-level functions ---------------------------------------------------
test("cd: clones and changes directory of each target", () => {
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
@ -211,7 +191,7 @@ test("taskGlob: returns matching targets across all input targets", () => {
assert.ok(result.every(r => r.path.endsWith(".json")));
});
// ── verify ───────────────────────────────────────────────────────────────────
// -- verify -------------------------------------------------------------------
test("verify: removes targets with an empty pipeline", async () => {
const t = new TaskTarget(FIXTURE_FILE);
@ -243,7 +223,7 @@ test("verify: filters a mixed list to only valid targets", async () => {
assert.equal(result[0], good);
});
// ── getTSVManifest ───────────────────────────────────────────────────────────
// -- getTSVManifest -----------------------------------------------------------
test("getTSVManifest: produces id<TAB>shell for a single target", () => {
const t = new TaskTarget("/foo/bar.txt");
@ -258,7 +238,7 @@ test("getTSVManifest: joins multiple targets with newlines", () => {
assert.equal(getTSVManifest([t1, t2]), "a\tcat /a.txt\nb\tcat /b.txt");
});
// ── TaskTargetPipelineHelper ─────────────────────────────────────────────────
// -- TaskTargetPipelineHelper -------------------------------------------------
test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => {
const p = TaskTargetPipelineHelper.pipeline([new TaskTarget("/a")]);
@ -293,7 +273,7 @@ test("TaskTargetPipelineHelper: cmd returns a new helper with cmd ops added", ()
assert.equal(p2[0].toShell(), "cat /a.txt | jq .");
});
// ── collect ──────────────────────────────────────────────────────────────────
// -- collect ------------------------------------------------------------------
test("collect: the final end of a chain is added to the collection set", () => {
const collection = new Set<TaskTargetPipelineHelper>();