diff --git a/data-export/facebook.ts b/data-export/facebook.ts index 7850138..cb0d656 100644 --- a/data-export/facebook.ts +++ b/data-export/facebook.ts @@ -107,7 +107,7 @@ function facebook_notifications_generic(this: TaskTargetPipelineHelper, prop: st | [(.timestamp | todateiso8601), .unread, .href, .text] ) | @csv`]) - .assignMeta({ columnMeta: ["isodatetime", "any", "url", "text"] }); + .types(["time", "text", "text", "text"]); } function facebook_notifications_v1(this: TaskTargetPipelineHelper) { return this.facebook_notifications_generic("notifications"); @@ -126,7 +126,7 @@ function facebook_installed_apps_generic(this: TaskTargetPipelineHelper, prop: s ) | @csv `]) - .assignMeta({ columnMeta: ["text", "isodatetime"] }); + .types(["text", "time"]); } function facebook_installed_apps_v1(this: TaskTargetPipelineHelper) { return this.facebook_installed_apps_generic("installed_apps"); @@ -173,7 +173,7 @@ function facebook_comments_generic(this: TaskTargetPipelineHelper, prop: string) ) | @csv `]) - .assignMeta({ columnMeta: ["isodatetime", "TODO", "text"] }) + .types(["time", "text", "text"]) } function facebook_comments_v1(this: TaskTargetPipelineHelper) { return this.facebook_comments_generic("comments"); @@ -392,7 +392,7 @@ function facebook_group_posts_v2(this: TaskTargetPipelineHelper) { } function facebook_v2(this: TaskTargetPipelineHelper) { - const p = this.assignMeta({ idValue: t=>`Facebookv2 - ${t.basename}` }); // Generic ID for everything in here + const p = this.setId(t=>`Facebookv2 - ${t.basename}`); // Generic ID for everything in here const col: Set = new Set(); // No correlary to accounts_and_profiles.json @@ -400,7 +400,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) { p.collect(col).cd(`apps_and_websites_off_of_facebook/connected_apps_and_websites.json`).read().facebook_installed_apps_v2(); p.collect(col).cd(`your_facebook_activity/comments_and_reactions/comments.json`).read().facebook_comments_v2(); p.collect(col).glob(`your_facebook_activity/messages/*/**/*.json`) // Messages files are in the FOLDERS inside messages (archived_threads, e2ee_cutover, etc...) - .assignMeta({ idValue: t=>`Facebookv2 - Messages ${t.basenameN(2)}` }) // 1, 2, etc is not specific enough, include the convo name + .setId(t=>`Facebookv2 - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name .read() .facebook_messages_generic() @@ -443,7 +443,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) { } function facebook(this: TaskTargetPipelineHelper){ - const p = this.assignMeta({ idValue: t=>`Facebook - ${t.basename}` }); // Generic ID for everything in here + const p = this.setId(t=>`Facebook - ${t.basename}`); // Generic ID for everything in here const col: Set = new Set(); p.collect(col).cd(`about_you/notifications.json`).read().facebook_notifications_v1() @@ -461,6 +461,7 @@ function facebook(this: TaskTargetPipelineHelper){ | [.service_name, .native_app_id, .username, .email, .phone_number, .name] ) | @csv`]) + .csvSink() p.collect(col).cd(`ads_and_businesses/your_off-facebook_activity.json`).read() @@ -474,6 +475,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) //TODO: .fork().todo('advertisers_who_uploaded_a_contact_list_with_your_information.json') p.collect(col).cd(`apps_and_websites/apps_and_websites.json`).read().facebook_installed_apps_v1() @@ -484,7 +486,7 @@ function facebook(this: TaskTargetPipelineHelper){ p.collect(col).cd(`comments/comments.json`).read().facebook_comments_v1() p.collect(col).glob(`dating/messages/*.json`) // Files are 0.json, 1.json, etc - .assignMeta({ idValue: t=>`Facebook - Dating Messages ${t.basename}` }) // Slightly more specific message + .setId(t=>`Facebook - Dating Messages ${t.basename}`) // Slightly more specific message .read() .cmd(["jq", "-r", ` ["from","to","timestamp","body"], @@ -495,6 +497,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink();//[["timestamp", "numeric"]]) //todo: your_dating_activity.json, but it only has a few lines and not super useful //todo: the other dating files are also just, small @@ -531,6 +534,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["added_timestamp", "numeric"]]) p.collect(col).cd(`following_and_followers/unfollowed_pages.json`).read().facebook_pages_unfollowed_v1() p.collect(col).cd(`following_and_followers/following.json`) @@ -543,6 +547,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) p.collect(col).cd(`following_and_followers/followers.json`) .read() .cmd(["jq", "-r", ` @@ -553,6 +558,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink() p.collect(col).cd(`friends/sent_friend_requests.json`).read().facebook_friends_generic("sent_requests") p.collect(col).cd(`friends/removed_friends.json`).read().facebook_friends_generic("deleted_friends") @@ -579,6 +585,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) p.collect(col).cd(`likes_and_reactions/posts_and_comments.json`) .read() .cmd(["jq", "-r", ` @@ -589,6 +596,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) // TODO: // rcd(`location`); @@ -600,7 +608,7 @@ function facebook(this: TaskTargetPipelineHelper){ p.collect(col).glob(`messages/**/*.json`) // Files are message_1.json, etc - .assignMeta({ idValue: t=>`Facebook - Messages ${t.basenameN(2)}` }) // 1, 2, etc is not specific enough, include the convo name + .setId(t=>`Facebook - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name .read() .facebook_messages_generic() @@ -620,6 +628,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]); p.collect(col).cd(`other_activity/support_correspondences.json`) .read() // TODO: I'm seeing blanks in .from and .to when the replier was Facebook @@ -633,6 +642,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) // `${facebookRoot}/pages` - no data @@ -647,12 +657,13 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["created_timestamp", "numeric"]]); // TODO: There's also photos_and_videos/your_videos.json // TODO: There's a media_metadata in each of the images too to convert as well as external files p.collect(col).glob(`photos_and_videos/album/*.json`) // Could use a better name, currently 0.json, 1.json, etc... - .assignMeta({ idValue: t=>`Facebook - Album ${t.basename}` }) //slightly more speciifc name, it woudl be better if we could use the album name + .setId(t=>`Facebook - Album ${t.basename}`) //slightly more speciifc name, it woudl be better if we could use the album name .read() .cmd(["jq", "-r", ` ["album","uri","creation_timestamp"], @@ -662,6 +673,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["creation_timestamp", "numeric"]]) p.collect(col).cd(`posts/your_pinned_posts.json`) .read() @@ -673,6 +685,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) // TODO: Glob? I never posted a lot on FB p.collect(col).cd(`posts/your_posts_1.json`) .read() @@ -686,6 +699,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) // `${facebookRoot}/privacy_checkup` - no data @@ -704,6 +718,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) // `${facebookRoot}/rewards` - no data // `${facebookRoot}/saved_items_and_collections` - no data @@ -720,6 +735,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) p.collect(col).cd(`security_and_login_information/account_activity.json`).read().facebook_account_activity_v1() p.collect(col).cd(`security_and_login_information/administrative_records.json`).read().facebook_admin_records_v1() p.collect(col).cd(`security_and_login_information/authorized_logins.json`).read().facebook_authorized_logins_v1() @@ -756,6 +772,7 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) + .csvSink([["timestamp", "numeric"]]) // `${facebookRoot}/trash` - no data in my export // `${facebookRoot}/voice_recording_and_transcription` - no data in my export diff --git a/data-export/google.ts b/data-export/google.ts index 2cb4701..bffe82b 100644 --- a/data-export/google.ts +++ b/data-export/google.ts @@ -2,7 +2,7 @@ import { TaskTargetPipelineHelper } from "./task.ts"; import { htmlSelectorChunkedDuplex } from "./html.ts"; export function google(this: TaskTargetPipelineHelper){ - const p = this.assignMeta({ idValue: t=>`Google - ${t.basename}` }); // Generic ID for everything in here + const p = this.setId(t=>`Google - ${t.basename}`); // Generic ID for everything in here const col: Set = new Set(); // TODO: There is a root takeout folder @@ -40,6 +40,7 @@ export function google(this: TaskTargetPipelineHelper){ // a = t.fork().cd(`Google Pay`) p.collect(col).cd(`Google Pay/Google transactions`).glob(`transactions_*.csv`) .read() + .csvSink() // .fork("a").cd(`Money sends and requests`) // .fork().cd(`Money sends and requests.csv`) // .read() @@ -61,6 +62,7 @@ export function google(this: TaskTargetPipelineHelper){ | [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy] ) | @csv`]) + .csvSink() // There's also the semantic history but that's an entire nother can of worms // it seems like diff --git a/data-export/parallel.ts b/data-export/parallel.ts index 20f223a..27add5c 100644 --- a/data-export/parallel.ts +++ b/data-export/parallel.ts @@ -4,12 +4,14 @@ import { type TaskTarget, run } from "./task.ts"; $.verbose = false; +type ResultMap = Map; + export async function parallel( targets: TaskTarget[], quiet: boolean = false, maxConcurrency: number = os.cpus().length -): Promise { - const resultMap = new Map(); +): Promise { + const results = new Map(); const total = targets.length; let completed = 0; @@ -47,7 +49,7 @@ export async function parallel( const result = await run(t); completionTimes.push(result.duration); - resultMap.set(t.id, result); + results.set(t.id, result); running--; completed++; @@ -75,16 +77,10 @@ export async function parallel( // Final status line process.stderr.write('\n'); const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1); - const failed = Array.from(resultMap.values().filter(p => !p.ok)); + const failed = Array.from(results.values().filter(p => !p.ok)); process.stderr.write( `\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n` ); - const output = targets - .map(t => { - const r = resultMap.get(t.id)!; - return r; - }); - - return output; + return results; } diff --git a/data-export/task.ts b/data-export/task.ts index 54e6396..df884b8 100644 --- a/data-export/task.ts +++ b/data-export/task.ts @@ -93,60 +93,17 @@ class TaskTargetCmd implements TaskTargetOp { } type ValidId = string | ((t: TaskTarget)=>string); -export const COLUMN_TYPES = { - /**A numeric value*/ - "numeric": {}, - /**ISO Datetime*/ - "isodatetime": {}, - /**Urls*/ - "url": {}, - /**Freetext*/ - "text": {}, - /**For anything untyped*/ - "any": {}, - "TODO": {} -}; - -// // if (type === "numeric") { -// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`; -// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`; -// // } -// // else { -// // queryLine = `count(*) as n`; -// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`; -// // } - -/**Column metadata. Just a string into the TYPES*/ -type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined); -// Make non-optional version of just the metadata values of TaskTarget -type TaskTargetMeta = Required>; - export class TaskTarget { - /**The current path pointed to by this TaskTarget*/ path: string; - /**The fsImpl used to access the .path*/ fsImpl: FSImpl = defaultFSImpl; - /**The pipeline of things to do to the above path to get an stdout of the output*/ pipeline: TaskTargetOp[]; - - // == Metadata, user configurable, no good defaults == - /**Id of the TaskTarget - * string - Static id - * fn returning string - Function can derive the id from a task target even after a glob() and cd() operation - **/ - idValue?: ValidId; - /**For every output CSV, this defines a description of that CSV per-row - * Use the items {0}, {1} to template - * Example: For a CSV with a row format like ["time", "sender", "sendee", "message"] - * you might do something like '"{3}" sent from {2} to {1}' - * */ - perRowDescription?: string; - /**Metadata about the columns*/ - columnMeta?: ColumnMeta[]; + idValue: ValidId | undefined; + postFns: ((t: TaskTarget)=>Promise)[]; constructor(path: string){ this.path = path; this.pipeline = []; + this.postFns = []; } exists() { @@ -211,12 +168,10 @@ export class TaskTarget { clone(): TaskTarget { const t = new TaskTarget(this.path); t.fsImpl = this.fsImpl; // holds no state, just needs same impl + t.idValue = this.idValue; + t.postFns = this.postFns.slice(); t.pipeline = this.pipeline.slice() .map(p => p.clone()); - // metadata - t.idValue = this.idValue; - t.perRowDescription = this.perRowDescription; - t.columnMeta = this.columnMeta?.slice(); return t; } @@ -235,6 +190,10 @@ export class TaskTarget { return shell; } + pushPostFn(fn: ((t: TaskTarget)=>Promise)) { + this.postFns.push(fn); + } + cmd(cmd: ValidCmd) { this.pushToPipeline(new TaskTargetCmd(cmd)); return this; @@ -243,14 +202,54 @@ export class TaskTarget { this.pushToPipeline(new TaskTargetRead()); return this; } - assignMeta(meta: Partial) { - Object.assign(this, { - ...meta, - // Clone this deeply so no shared object references - columnMeta: meta.columnMeta?.slice() - }); + setId(idValue: ValidId) { + this.idValue = idValue; return this; } + post(fn: any) { + this.pushPostFn(fn); + } + types( + types: string[] + ) { + // TODO: + return this; + } + csvSink( + summarization?: [string, string][] + ) { + // TODO: + return this; + + // Ingest this csv into the database at the given id + // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]); + // Add a post processing function for these targets that prints out the summarization + // stats + // this.post(async (t: TaskTarget)=>{ + // // We only do the first one so far for the summarization + // let queryLine: string; + // let formatFn: (r: any)=>string; + // const [columnName, type] = summarization?.[0] ?? [undefined, undefined]; + // if (type === "numeric") { + // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`; + // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`; + // } + // else { + // queryLine = `count(*) as n`; + // formatFn = (r: any)=>`${r.n} rows for ${t.id}`; + // } + + // const cmd = "sqlite-utils"; + // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`] + // const { stdout, stderr } = await execFile(cmd, args); + // const results = JSON.parse(stdout); + // const result = results[0]; // should only be one result in the array for this type of query + // const logLine = formatFn(result); + // (t as any).log = logLine; + // }); + + // return this; + } } export function each(targets: TaskTarget[], fn: (t: TaskTarget)=>void) { @@ -280,8 +279,8 @@ export function read(targets: TaskTarget[]): TaskTarget[] { export function cmd(targets: TaskTarget[], cmd: ValidCmd): TaskTarget[] { return targets.map(t => t.clone().cmd(cmd)) } -export function assignMeta(targets: TaskTarget[], meta: Partial): TaskTarget[] { - return targets.map(t => t.clone().assignMeta(meta)) +export function setId(targets: TaskTarget[], id: ValidId): TaskTarget[] { + return targets.map(t => t.clone().setId(id)) } /**Verify, anything that fails is skipped and throws an error*/ @@ -306,6 +305,29 @@ export async function verify(targets: TaskTarget[]) { return outTargets; } +/**Writes a manifest for parallel, a TSV where each record is an id + the shell to run + * @todo Enforce doing a verify before we output? + */ +export function getTSVManifest(targets: TaskTarget[]): string { + let out: string[] = []; + for (const t of targets) { + const shell = t.toShell(); + out.push(`${t.id}\t${shell}`); + } + + return out.join("\n"); +} + +export function getTaskManifest(targets: TaskTarget[]): [string, string][] { + let out: [string, string][] = []; + for (const t of targets) { + const shell = t.toShell(); + out.push([t.id, shell] as const); + } + + return out; +} + function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) { if (!a.__collection) { return; @@ -360,10 +382,18 @@ export class TaskTargetPipelineHelper extends Array { cmd(_cmd: ValidCmd): TaskTargetPipelineHelper { return this._fn(t => cmd(t, _cmd)); } - assignMeta(meta: Partial): TaskTargetPipelineHelper { - return this._fn(t => assignMeta(t, meta)); + setId(id: ValidId): TaskTargetPipelineHelper { + return this._fn(t => setId(t, id)); } + types(...args: any[]) { + // TODO: no-op + return this; + } + csvSink(...args: any[]) { + // TODO: no-op + return this; + } /** * @todo Nested versions of this don't currently work, but they could if we * turn __collection into an array of collections diff --git a/main.ts b/main.ts index de912fb..a8e234a 100644 --- a/main.ts +++ b/main.ts @@ -64,41 +64,4 @@ async function main() { // loadIntoSqlite(csvFiles, 'your.db'); } -main(); - -// TODO: Move this into here - // csvSink( - // summarization?: [string, string][] - // ) { - // // TODO: - // return this; - - // // Ingest this csv into the database at the given id - // // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]); - // // Add a post processing function for these targets that prints out the summarization - // // stats - // // this.post(async (t: TaskTarget)=>{ - // // // We only do the first one so far for the summarization - // // let queryLine: string; - // // let formatFn: (r: any)=>string; - // // const [columnName, type] = summarization?.[0] ?? [undefined, undefined]; - // // if (type === "numeric") { - // // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`; - // // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`; - // // } - // // else { - // // queryLine = `count(*) as n`; - // // formatFn = (r: any)=>`${r.n} rows for ${t.id}`; - // // } - - // // const cmd = "sqlite-utils"; - // // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`] - // // const { stdout, stderr } = await execFile(cmd, args); - // // const results = JSON.parse(stdout); - // // const result = results[0]; // should only be one result in the array for this type of query - // // const logLine = formatFn(result); - // // (t as any).log = logLine; - // // }); - - // // return this; - // } \ No newline at end of file +main(); \ No newline at end of file diff --git a/package.json b/package.json index 586e3ff..e0ed113 100644 --- a/package.json +++ b/package.json @@ -20,6 +20,7 @@ "@types/duplexify": "^3.6.5", "@types/yauzl": "^2.10.3", "duplexify": "^4.1.3", + "fp-ts": "^2.16.11", "glob": "^13.0.0", "htmlparser2": "^10.0.0", "yauzl": "^3.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2538877..1d5812c 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,6 +17,9 @@ importers: duplexify: specifier: ^4.1.3 version: 4.1.3 + fp-ts: + specifier: ^2.16.11 + version: 2.16.11 glob: specifier: ^13.0.0 version: 13.0.0 @@ -86,6 +89,9 @@ packages: resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==} engines: {node: '>=0.12'} + fp-ts@2.16.11: + resolution: {integrity: sha512-LaI+KaX2NFkfn1ZGHoKCmcfv7yrZsC3b8NtWsTVQeHkq4F27vI5igUuO53sxqDEa2gNQMHFPmpojDw/1zmUK7w==} + glob@13.0.0: resolution: {integrity: sha512-tvZgpqk6fz4BaNZ66ZsRaZnbHvP/jG3uKJvAZOwEVUL4RTA5nJeeLYfyN9/VA8NX/V3IBG+hkeuGpKjvELkVhA==} engines: {node: 20 || >=22} @@ -209,6 +215,8 @@ snapshots: entities@6.0.1: {} + fp-ts@2.16.11: {} + glob@13.0.0: dependencies: minimatch: 10.1.1 diff --git a/test/facebook.ts b/test/facebook.ts index 14eb503..9835c91 100644 --- a/test/facebook.ts +++ b/test/facebook.ts @@ -1,7 +1,9 @@ import test from "node:test"; import nodePath from "node:path"; import { strict as assert } from "node:assert"; -import { TaskTargetPipelineHelper, TaskTarget, verify, run } from "../data-export/task.ts"; +import { finished } from "node:stream/promises"; +import { Readable, Writable } from "node:stream"; +import { TaskTargetPipelineHelper, TaskTarget, verify, getTSVManifest, getTaskManifest, run } from "../data-export/task.ts"; import { parallel } from "../data-export/parallel.ts"; import "../data-export/facebook.ts"; @@ -18,18 +20,16 @@ test("facebook: Can load the 2021 export", async (t) => { const finalTargets = await verify(targets); const result = await parallel(finalTargets, true); - const idAndCSVs: [string, string][] = []; - for (const [idx, r] of result.entries()) { - const target = finalTargets[idx]; - assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`); - assert.ok(r.ok, `Task ${target.id} should be okay`); - idAndCSVs.push([target.id, r.stdout]); + for (const [id, r] of result.entries()) { + assert.ok(!r.stderr, `Task ${id} should have no stderr output`); + assert.ok(r.ok, `Task ${id} should be okay`); } - const csvs = idAndCSVs - .sort() // Keep stable ordering for snapshots - .map(v => v[1]) - t.assert.snapshot(csvs); + const allCSV = Array.from(result.entries()) + .sort() // Keep stable ordering for snapshots + .map(([id, r]) => r.stdout); + + t.assert.snapshot(allCSV); }); test("facebook: Can load the 2021 export zipped", async (t) => { const targets = await TaskTargetPipelineHelper.pipeline([ @@ -41,18 +41,16 @@ test("facebook: Can load the 2021 export zipped", async (t) => { const finalTargets = await verify(targets2); const result = await parallel(finalTargets, true); - const idAndCSVs: [string, string][] = []; - for (const [idx, r] of result.entries()) { - const target = finalTargets[idx]; - assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`); - assert.ok(r.ok, `Task ${target.id} should be okay`); - idAndCSVs.push([target.id, r.stdout]); + for (const [id, r] of result.entries()) { + assert.ok(!r.stderr, `Task ${id} should have no stderr output`); + assert.ok(r.ok, `Task ${id} should be okay`); } - const csvs = idAndCSVs - .sort() // Keep stable ordering for snapshots - .map(v => v[1]) - t.assert.snapshot(csvs); + const allCSV = Array.from(result.entries()) + .sort() // Keep stable ordering for snapshots + .map(([id, r]) => r.stdout); + + t.assert.snapshot(allCSV); }); test("facebook: Can load the 2025 export", async (t) => { const targets = TaskTargetPipelineHelper.pipeline([ @@ -62,16 +60,14 @@ test("facebook: Can load the 2025 export", async (t) => { const finalTargets = await verify(targets); const result = await parallel(finalTargets, true); - const idAndCSVs: [string, string][] = []; - for (const [idx, r] of result.entries()) { - const target = finalTargets[idx]; - assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`); - assert.ok(r.ok, `Task ${target.id} should be okay`); - idAndCSVs.push([target.id, r.stdout]); + for (const [id, r] of result.entries()) { + assert.ok(!r.stderr, `Task ${id} should have no stderr output`); + assert.ok(r.ok, `Task ${id} should be okay`); } - const csvs = idAndCSVs - .sort() // Keep stable ordering for snapshots - .map(v => v[1]) - t.assert.snapshot(csvs); + const allCSV = Array.from(result.entries()) + .sort() // Keep stable ordering for snapshots + .map(([id, r]) => r.stdout); + + t.assert.snapshot(allCSV); }); diff --git a/test/facebook.ts.snapshot b/test/facebook.ts.snapshot index 8b6e62a..fb5c5e6 100644 --- a/test/facebook.ts.snapshot +++ b/test/facebook.ts.snapshot @@ -93,7 +93,6 @@ exports[`facebook: Can load the 2025 export 1`] = ` "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"some/path\\"\\n", "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", - "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\n", "\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-22T10:06:40Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-21T23:00:00Z\\"\\n", "\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-13T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-07-12T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n", diff --git a/test/task.ts b/test/task.ts index d333362..a87d240 100644 --- a/test/task.ts +++ b/test/task.ts @@ -7,8 +7,9 @@ import { glob as taskGlob, read, cmd, - assignMeta, + setId, verify, + getTSVManifest, TaskTargetPipelineHelper, } from "../data-export/task.ts"; @@ -18,10 +19,11 @@ const FIXTURE_FILE = nodePath.join(FIXTURE_DIR, 'friends/friends.json'); // -- TaskTarget --------------------------------------------------------------- -test("TaskTarget: constructor initializes path, pipeline", () => { +test("TaskTarget: constructor initializes path, pipeline, postFns", () => { const t = new TaskTarget("/foo/bar"); assert.equal(t.path, "/foo/bar"); assert.deepEqual(t.pipeline, []); + assert.deepEqual(t.postFns, []); }); test("TaskTarget: exists() returns true for a real file", () => { @@ -48,12 +50,12 @@ test("TaskTarget: id throws when no idValue is set", () => { }); test("TaskTarget: id with a string value is safe-ified", () => { - const t = new TaskTarget("/foo").assignMeta({ idValue: "my-id" }); + const t = new TaskTarget("/foo").setId("my-id"); assert.equal(t.id, "my_id"); }); test("TaskTarget: id with a function value is resolved against the target", () => { - const t = new TaskTarget("/foo/bar").assignMeta({ idValue: tgt => tgt.basename }); + const t = new TaskTarget("/foo/bar").setId(tgt => tgt.basename); assert.equal(t.id, "bar"); }); @@ -90,17 +92,12 @@ test("TaskTarget: pushToPipeline throws if read is not the first op", () => { }); test("TaskTarget: clone produces an independent copy", () => { - const t = new TaskTarget("/foo").assignMeta({ - idValue: "orig", - columnMeta: ["yeag"] - }); + const t = new TaskTarget("/foo").setId("orig"); t.read(); const c = t.clone(); assert.equal(c.path, "/foo"); assert.equal(c.id, "orig"); - assert(c.pipeline !== t.pipeline); // Different object references assert.equal(c.pipeline.length, 1); - assert(c.columnMeta !== t.columnMeta); // Different object references c.path = "/other"; assert.equal(t.path, "/foo"); // original unchanged }); @@ -179,9 +176,9 @@ test("cmd: clones and appends a cmd op to each target", () => { assert.equal(targets[0].pipeline.length, 1); // original unchanged }); -test("assignMeta: clones and sets meta on each target", () => { +test("setId: clones and sets id on each target", () => { const targets = [new TaskTarget("/a"), new TaskTarget("/b")]; - const result = assignMeta(targets, { idValue: "myid" }); + const result = setId(targets, "myid"); assert.equal(result[0].id, "myid"); assert.equal(result[1].id, "myid"); assert.throws(() => targets[0].id); // originals have no id @@ -226,6 +223,21 @@ test("verify: filters a mixed list to only valid targets", async () => { assert.equal(result[0], good); }); +// -- getTSVManifest ----------------------------------------------------------- + +test("getTSVManifest: produces idshell for a single target", () => { + const t = new TaskTarget("/foo/bar.txt"); + t.setId("myid"); + t.read(); + assert.equal(getTSVManifest([t]), "myid\tcat /foo/bar.txt"); +}); + +test("getTSVManifest: joins multiple targets with newlines", () => { + const t1 = new TaskTarget("/a.txt"); t1.setId("a"); t1.read(); + const t2 = new TaskTarget("/b.txt"); t2.setId("b"); t2.read(); + assert.equal(getTSVManifest([t1, t2]), "a\tcat /a.txt\nb\tcat /b.txt"); +}); + // -- TaskTargetPipelineHelper ------------------------------------------------- test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => {