diff --git a/data-export/facebook.ts b/data-export/facebook.ts index cb0d656..7850138 100644 --- a/data-export/facebook.ts +++ b/data-export/facebook.ts @@ -107,7 +107,7 @@ function facebook_notifications_generic(this: TaskTargetPipelineHelper, prop: st | [(.timestamp | todateiso8601), .unread, .href, .text] ) | @csv`]) - .types(["time", "text", "text", "text"]); + .assignMeta({ columnMeta: ["isodatetime", "any", "url", "text"] }); } function facebook_notifications_v1(this: TaskTargetPipelineHelper) { return this.facebook_notifications_generic("notifications"); @@ -126,7 +126,7 @@ function facebook_installed_apps_generic(this: TaskTargetPipelineHelper, prop: s ) | @csv `]) - .types(["text", "time"]); + .assignMeta({ columnMeta: ["text", "isodatetime"] }); } function facebook_installed_apps_v1(this: TaskTargetPipelineHelper) { return this.facebook_installed_apps_generic("installed_apps"); @@ -173,7 +173,7 @@ function facebook_comments_generic(this: TaskTargetPipelineHelper, prop: string) ) | @csv `]) - .types(["time", "text", "text"]) + .assignMeta({ columnMeta: ["isodatetime", "TODO", "text"] }) } function facebook_comments_v1(this: TaskTargetPipelineHelper) { return this.facebook_comments_generic("comments"); @@ -392,7 +392,7 @@ function facebook_group_posts_v2(this: TaskTargetPipelineHelper) { } function facebook_v2(this: TaskTargetPipelineHelper) { - const p = this.setId(t=>`Facebookv2 - ${t.basename}`); // Generic ID for everything in here + const p = this.assignMeta({ idValue: t=>`Facebookv2 - ${t.basename}` }); // Generic ID for everything in here const col: Set = new Set(); // No correlary to accounts_and_profiles.json @@ -400,7 +400,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) { p.collect(col).cd(`apps_and_websites_off_of_facebook/connected_apps_and_websites.json`).read().facebook_installed_apps_v2(); p.collect(col).cd(`your_facebook_activity/comments_and_reactions/comments.json`).read().facebook_comments_v2(); p.collect(col).glob(`your_facebook_activity/messages/*/**/*.json`) // Messages files are in the FOLDERS inside messages (archived_threads, e2ee_cutover, etc...) - .setId(t=>`Facebookv2 - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name + .assignMeta({ idValue: t=>`Facebookv2 - Messages ${t.basenameN(2)}` }) // 1, 2, etc is not specific enough, include the convo name .read() .facebook_messages_generic() @@ -443,7 +443,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) { } function facebook(this: TaskTargetPipelineHelper){ - const p = this.setId(t=>`Facebook - ${t.basename}`); // Generic ID for everything in here + const p = this.assignMeta({ idValue: t=>`Facebook - ${t.basename}` }); // Generic ID for everything in here const col: Set = new Set(); p.collect(col).cd(`about_you/notifications.json`).read().facebook_notifications_v1() @@ -461,7 +461,6 @@ function facebook(this: TaskTargetPipelineHelper){ | [.service_name, .native_app_id, .username, .email, .phone_number, .name] ) | @csv`]) - .csvSink() p.collect(col).cd(`ads_and_businesses/your_off-facebook_activity.json`).read() @@ -475,7 +474,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) //TODO: .fork().todo('advertisers_who_uploaded_a_contact_list_with_your_information.json') p.collect(col).cd(`apps_and_websites/apps_and_websites.json`).read().facebook_installed_apps_v1() @@ -486,7 +484,7 @@ function facebook(this: TaskTargetPipelineHelper){ p.collect(col).cd(`comments/comments.json`).read().facebook_comments_v1() p.collect(col).glob(`dating/messages/*.json`) // Files are 0.json, 1.json, etc - .setId(t=>`Facebook - Dating Messages ${t.basename}`) // Slightly more specific message + .assignMeta({ idValue: t=>`Facebook - Dating Messages ${t.basename}` }) // Slightly more specific message .read() .cmd(["jq", "-r", ` ["from","to","timestamp","body"], @@ -497,7 +495,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink();//[["timestamp", "numeric"]]) //todo: your_dating_activity.json, but it only has a few lines and not super useful //todo: the other dating files are also just, small @@ -534,7 +531,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["added_timestamp", "numeric"]]) p.collect(col).cd(`following_and_followers/unfollowed_pages.json`).read().facebook_pages_unfollowed_v1() p.collect(col).cd(`following_and_followers/following.json`) @@ -547,7 +543,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) p.collect(col).cd(`following_and_followers/followers.json`) .read() .cmd(["jq", "-r", ` @@ -558,7 +553,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink() p.collect(col).cd(`friends/sent_friend_requests.json`).read().facebook_friends_generic("sent_requests") p.collect(col).cd(`friends/removed_friends.json`).read().facebook_friends_generic("deleted_friends") @@ -585,7 +579,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) p.collect(col).cd(`likes_and_reactions/posts_and_comments.json`) .read() .cmd(["jq", "-r", ` @@ -596,7 +589,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) // TODO: // rcd(`location`); @@ -608,7 +600,7 @@ function facebook(this: TaskTargetPipelineHelper){ p.collect(col).glob(`messages/**/*.json`) // Files are message_1.json, etc - .setId(t=>`Facebook - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name + .assignMeta({ idValue: t=>`Facebook - Messages ${t.basenameN(2)}` }) // 1, 2, etc is not specific enough, include the convo name .read() .facebook_messages_generic() @@ -628,7 +620,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]); p.collect(col).cd(`other_activity/support_correspondences.json`) .read() // TODO: I'm seeing blanks in .from and .to when the replier was Facebook @@ -642,7 +633,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) // `${facebookRoot}/pages` - no data @@ -657,13 +647,12 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["created_timestamp", "numeric"]]); // TODO: There's also photos_and_videos/your_videos.json // TODO: There's a media_metadata in each of the images too to convert as well as external files p.collect(col).glob(`photos_and_videos/album/*.json`) // Could use a better name, currently 0.json, 1.json, etc... - .setId(t=>`Facebook - Album ${t.basename}`) //slightly more speciifc name, it woudl be better if we could use the album name + .assignMeta({ idValue: t=>`Facebook - Album ${t.basename}` }) //slightly more speciifc name, it woudl be better if we could use the album name .read() .cmd(["jq", "-r", ` ["album","uri","creation_timestamp"], @@ -673,7 +662,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["creation_timestamp", "numeric"]]) p.collect(col).cd(`posts/your_pinned_posts.json`) .read() @@ -685,7 +673,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) // TODO: Glob? I never posted a lot on FB p.collect(col).cd(`posts/your_posts_1.json`) .read() @@ -699,7 +686,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) // `${facebookRoot}/privacy_checkup` - no data @@ -718,7 +704,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) // `${facebookRoot}/rewards` - no data // `${facebookRoot}/saved_items_and_collections` - no data @@ -735,7 +720,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) p.collect(col).cd(`security_and_login_information/account_activity.json`).read().facebook_account_activity_v1() p.collect(col).cd(`security_and_login_information/administrative_records.json`).read().facebook_admin_records_v1() p.collect(col).cd(`security_and_login_information/authorized_logins.json`).read().facebook_authorized_logins_v1() @@ -772,7 +756,6 @@ function facebook(this: TaskTargetPipelineHelper){ ) | @csv `]) - .csvSink([["timestamp", "numeric"]]) // `${facebookRoot}/trash` - no data in my export // `${facebookRoot}/voice_recording_and_transcription` - no data in my export diff --git a/data-export/google.ts b/data-export/google.ts index bffe82b..2cb4701 100644 --- a/data-export/google.ts +++ b/data-export/google.ts @@ -2,7 +2,7 @@ import { TaskTargetPipelineHelper } from "./task.ts"; import { htmlSelectorChunkedDuplex } from "./html.ts"; export function google(this: TaskTargetPipelineHelper){ - const p = this.setId(t=>`Google - ${t.basename}`); // Generic ID for everything in here + const p = this.assignMeta({ idValue: t=>`Google - ${t.basename}` }); // Generic ID for everything in here const col: Set = new Set(); // TODO: There is a root takeout folder @@ -40,7 +40,6 @@ export function google(this: TaskTargetPipelineHelper){ // a = t.fork().cd(`Google Pay`) p.collect(col).cd(`Google Pay/Google transactions`).glob(`transactions_*.csv`) .read() - .csvSink() // .fork("a").cd(`Money sends and requests`) // .fork().cd(`Money sends and requests.csv`) // .read() @@ -62,7 +61,6 @@ export function google(this: TaskTargetPipelineHelper){ | [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy] ) | @csv`]) - .csvSink() // There's also the semantic history but that's an entire nother can of worms // it seems like diff --git a/data-export/parallel.ts b/data-export/parallel.ts index 27add5c..20f223a 100644 --- a/data-export/parallel.ts +++ b/data-export/parallel.ts @@ -4,14 +4,12 @@ import { type TaskTarget, run } from "./task.ts"; $.verbose = false; -type ResultMap = Map; - export async function parallel( targets: TaskTarget[], quiet: boolean = false, maxConcurrency: number = os.cpus().length -): Promise { - const results = new Map(); +): Promise { + const resultMap = new Map(); const total = targets.length; let completed = 0; @@ -49,7 +47,7 @@ export async function parallel( const result = await run(t); completionTimes.push(result.duration); - results.set(t.id, result); + resultMap.set(t.id, result); running--; completed++; @@ -77,10 +75,16 @@ export async function parallel( // Final status line process.stderr.write('\n'); const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1); - const failed = Array.from(results.values().filter(p => !p.ok)); + const failed = Array.from(resultMap.values().filter(p => !p.ok)); process.stderr.write( `\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n` ); - return results; + const output = targets + .map(t => { + const r = resultMap.get(t.id)!; + return r; + }); + + return output; } diff --git a/data-export/task.ts b/data-export/task.ts index df884b8..54e6396 100644 --- a/data-export/task.ts +++ b/data-export/task.ts @@ -93,17 +93,60 @@ class TaskTargetCmd implements TaskTargetOp { } type ValidId = string | ((t: TaskTarget)=>string); +export const COLUMN_TYPES = { + /**A numeric value*/ + "numeric": {}, + /**ISO Datetime*/ + "isodatetime": {}, + /**Urls*/ + "url": {}, + /**Freetext*/ + "text": {}, + /**For anything untyped*/ + "any": {}, + "TODO": {} +}; + +// // if (type === "numeric") { +// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`; +// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`; +// // } +// // else { +// // queryLine = `count(*) as n`; +// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`; +// // } + +/**Column metadata. Just a string into the TYPES*/ +type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined); +// Make non-optional version of just the metadata values of TaskTarget +type TaskTargetMeta = Required>; + export class TaskTarget { + /**The current path pointed to by this TaskTarget*/ path: string; + /**The fsImpl used to access the .path*/ fsImpl: FSImpl = defaultFSImpl; + /**The pipeline of things to do to the above path to get an stdout of the output*/ pipeline: TaskTargetOp[]; - idValue: ValidId | undefined; - postFns: ((t: TaskTarget)=>Promise)[]; + + // == Metadata, user configurable, no good defaults == + /**Id of the TaskTarget + * string - Static id + * fn returning string - Function can derive the id from a task target even after a glob() and cd() operation + **/ + idValue?: ValidId; + /**For every output CSV, this defines a description of that CSV per-row + * Use the items {0}, {1} to template + * Example: For a CSV with a row format like ["time", "sender", "sendee", "message"] + * you might do something like '"{3}" sent from {2} to {1}' + * */ + perRowDescription?: string; + /**Metadata about the columns*/ + columnMeta?: ColumnMeta[]; constructor(path: string){ this.path = path; this.pipeline = []; - this.postFns = []; } exists() { @@ -168,10 +211,12 @@ export class TaskTarget { clone(): TaskTarget { const t = new TaskTarget(this.path); t.fsImpl = this.fsImpl; // holds no state, just needs same impl - t.idValue = this.idValue; - t.postFns = this.postFns.slice(); t.pipeline = this.pipeline.slice() .map(p => p.clone()); + // metadata + t.idValue = this.idValue; + t.perRowDescription = this.perRowDescription; + t.columnMeta = this.columnMeta?.slice(); return t; } @@ -190,10 +235,6 @@ export class TaskTarget { return shell; } - pushPostFn(fn: ((t: TaskTarget)=>Promise)) { - this.postFns.push(fn); - } - cmd(cmd: ValidCmd) { this.pushToPipeline(new TaskTargetCmd(cmd)); return this; @@ -202,54 +243,14 @@ export class TaskTarget { this.pushToPipeline(new TaskTargetRead()); return this; } - setId(idValue: ValidId) { - this.idValue = idValue; + assignMeta(meta: Partial) { + Object.assign(this, { + ...meta, + // Clone this deeply so no shared object references + columnMeta: meta.columnMeta?.slice() + }); return this; } - post(fn: any) { - this.pushPostFn(fn); - } - types( - types: string[] - ) { - // TODO: - return this; - } - csvSink( - summarization?: [string, string][] - ) { - // TODO: - return this; - - // Ingest this csv into the database at the given id - // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]); - // Add a post processing function for these targets that prints out the summarization - // stats - // this.post(async (t: TaskTarget)=>{ - // // We only do the first one so far for the summarization - // let queryLine: string; - // let formatFn: (r: any)=>string; - // const [columnName, type] = summarization?.[0] ?? [undefined, undefined]; - // if (type === "numeric") { - // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`; - // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`; - // } - // else { - // queryLine = `count(*) as n`; - // formatFn = (r: any)=>`${r.n} rows for ${t.id}`; - // } - - // const cmd = "sqlite-utils"; - // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`] - // const { stdout, stderr } = await execFile(cmd, args); - // const results = JSON.parse(stdout); - // const result = results[0]; // should only be one result in the array for this type of query - // const logLine = formatFn(result); - // (t as any).log = logLine; - // }); - - // return this; - } } export function each(targets: TaskTarget[], fn: (t: TaskTarget)=>void) { @@ -279,8 +280,8 @@ export function read(targets: TaskTarget[]): TaskTarget[] { export function cmd(targets: TaskTarget[], cmd: ValidCmd): TaskTarget[] { return targets.map(t => t.clone().cmd(cmd)) } -export function setId(targets: TaskTarget[], id: ValidId): TaskTarget[] { - return targets.map(t => t.clone().setId(id)) +export function assignMeta(targets: TaskTarget[], meta: Partial): TaskTarget[] { + return targets.map(t => t.clone().assignMeta(meta)) } /**Verify, anything that fails is skipped and throws an error*/ @@ -305,29 +306,6 @@ export async function verify(targets: TaskTarget[]) { return outTargets; } -/**Writes a manifest for parallel, a TSV where each record is an id + the shell to run - * @todo Enforce doing a verify before we output? - */ -export function getTSVManifest(targets: TaskTarget[]): string { - let out: string[] = []; - for (const t of targets) { - const shell = t.toShell(); - out.push(`${t.id}\t${shell}`); - } - - return out.join("\n"); -} - -export function getTaskManifest(targets: TaskTarget[]): [string, string][] { - let out: [string, string][] = []; - for (const t of targets) { - const shell = t.toShell(); - out.push([t.id, shell] as const); - } - - return out; -} - function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) { if (!a.__collection) { return; @@ -382,18 +360,10 @@ export class TaskTargetPipelineHelper extends Array { cmd(_cmd: ValidCmd): TaskTargetPipelineHelper { return this._fn(t => cmd(t, _cmd)); } - setId(id: ValidId): TaskTargetPipelineHelper { - return this._fn(t => setId(t, id)); + assignMeta(meta: Partial): TaskTargetPipelineHelper { + return this._fn(t => assignMeta(t, meta)); } - types(...args: any[]) { - // TODO: no-op - return this; - } - csvSink(...args: any[]) { - // TODO: no-op - return this; - } /** * @todo Nested versions of this don't currently work, but they could if we * turn __collection into an array of collections diff --git a/main.ts b/main.ts index a8e234a..de912fb 100644 --- a/main.ts +++ b/main.ts @@ -64,4 +64,41 @@ async function main() { // loadIntoSqlite(csvFiles, 'your.db'); } -main(); \ No newline at end of file +main(); + +// TODO: Move this into here + // csvSink( + // summarization?: [string, string][] + // ) { + // // TODO: + // return this; + + // // Ingest this csv into the database at the given id + // // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]); + // // Add a post processing function for these targets that prints out the summarization + // // stats + // // this.post(async (t: TaskTarget)=>{ + // // // We only do the first one so far for the summarization + // // let queryLine: string; + // // let formatFn: (r: any)=>string; + // // const [columnName, type] = summarization?.[0] ?? [undefined, undefined]; + // // if (type === "numeric") { + // // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`; + // // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`; + // // } + // // else { + // // queryLine = `count(*) as n`; + // // formatFn = (r: any)=>`${r.n} rows for ${t.id}`; + // // } + + // // const cmd = "sqlite-utils"; + // // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`] + // // const { stdout, stderr } = await execFile(cmd, args); + // // const results = JSON.parse(stdout); + // // const result = results[0]; // should only be one result in the array for this type of query + // // const logLine = formatFn(result); + // // (t as any).log = logLine; + // // }); + + // // return this; + // } \ No newline at end of file diff --git a/package.json b/package.json index e0ed113..586e3ff 100644 --- a/package.json +++ b/package.json @@ -20,7 +20,6 @@ "@types/duplexify": "^3.6.5", "@types/yauzl": "^2.10.3", "duplexify": "^4.1.3", - "fp-ts": "^2.16.11", "glob": "^13.0.0", "htmlparser2": "^10.0.0", "yauzl": "^3.2.0", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 1d5812c..2538877 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -17,9 +17,6 @@ importers: duplexify: specifier: ^4.1.3 version: 4.1.3 - fp-ts: - specifier: ^2.16.11 - version: 2.16.11 glob: specifier: ^13.0.0 version: 13.0.0 @@ -89,9 +86,6 @@ packages: resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==} engines: {node: '>=0.12'} - fp-ts@2.16.11: - resolution: {integrity: sha512-LaI+KaX2NFkfn1ZGHoKCmcfv7yrZsC3b8NtWsTVQeHkq4F27vI5igUuO53sxqDEa2gNQMHFPmpojDw/1zmUK7w==} - glob@13.0.0: resolution: {integrity: sha512-tvZgpqk6fz4BaNZ66ZsRaZnbHvP/jG3uKJvAZOwEVUL4RTA5nJeeLYfyN9/VA8NX/V3IBG+hkeuGpKjvELkVhA==} engines: {node: 20 || >=22} @@ -215,8 +209,6 @@ snapshots: entities@6.0.1: {} - fp-ts@2.16.11: {} - glob@13.0.0: dependencies: minimatch: 10.1.1 diff --git a/test/facebook.ts b/test/facebook.ts index 9835c91..14eb503 100644 --- a/test/facebook.ts +++ b/test/facebook.ts @@ -1,9 +1,7 @@ import test from "node:test"; import nodePath from "node:path"; import { strict as assert } from "node:assert"; -import { finished } from "node:stream/promises"; -import { Readable, Writable } from "node:stream"; -import { TaskTargetPipelineHelper, TaskTarget, verify, getTSVManifest, getTaskManifest, run } from "../data-export/task.ts"; +import { TaskTargetPipelineHelper, TaskTarget, verify, run } from "../data-export/task.ts"; import { parallel } from "../data-export/parallel.ts"; import "../data-export/facebook.ts"; @@ -20,16 +18,18 @@ test("facebook: Can load the 2021 export", async (t) => { const finalTargets = await verify(targets); const result = await parallel(finalTargets, true); - for (const [id, r] of result.entries()) { - assert.ok(!r.stderr, `Task ${id} should have no stderr output`); - assert.ok(r.ok, `Task ${id} should be okay`); + const idAndCSVs: [string, string][] = []; + for (const [idx, r] of result.entries()) { + const target = finalTargets[idx]; + assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`); + assert.ok(r.ok, `Task ${target.id} should be okay`); + idAndCSVs.push([target.id, r.stdout]); } - - const allCSV = Array.from(result.entries()) + const csvs = idAndCSVs .sort() // Keep stable ordering for snapshots - .map(([id, r]) => r.stdout); + .map(v => v[1]) - t.assert.snapshot(allCSV); + t.assert.snapshot(csvs); }); test("facebook: Can load the 2021 export zipped", async (t) => { const targets = await TaskTargetPipelineHelper.pipeline([ @@ -41,16 +41,18 @@ test("facebook: Can load the 2021 export zipped", async (t) => { const finalTargets = await verify(targets2); const result = await parallel(finalTargets, true); - for (const [id, r] of result.entries()) { - assert.ok(!r.stderr, `Task ${id} should have no stderr output`); - assert.ok(r.ok, `Task ${id} should be okay`); + const idAndCSVs: [string, string][] = []; + for (const [idx, r] of result.entries()) { + const target = finalTargets[idx]; + assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`); + assert.ok(r.ok, `Task ${target.id} should be okay`); + idAndCSVs.push([target.id, r.stdout]); } - - const allCSV = Array.from(result.entries()) + const csvs = idAndCSVs .sort() // Keep stable ordering for snapshots - .map(([id, r]) => r.stdout); + .map(v => v[1]) - t.assert.snapshot(allCSV); + t.assert.snapshot(csvs); }); test("facebook: Can load the 2025 export", async (t) => { const targets = TaskTargetPipelineHelper.pipeline([ @@ -60,14 +62,16 @@ test("facebook: Can load the 2025 export", async (t) => { const finalTargets = await verify(targets); const result = await parallel(finalTargets, true); - for (const [id, r] of result.entries()) { - assert.ok(!r.stderr, `Task ${id} should have no stderr output`); - assert.ok(r.ok, `Task ${id} should be okay`); + const idAndCSVs: [string, string][] = []; + for (const [idx, r] of result.entries()) { + const target = finalTargets[idx]; + assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`); + assert.ok(r.ok, `Task ${target.id} should be okay`); + idAndCSVs.push([target.id, r.stdout]); } - - const allCSV = Array.from(result.entries()) + const csvs = idAndCSVs .sort() // Keep stable ordering for snapshots - .map(([id, r]) => r.stdout); + .map(v => v[1]) - t.assert.snapshot(allCSV); + t.assert.snapshot(csvs); }); diff --git a/test/facebook.ts.snapshot b/test/facebook.ts.snapshot index fb5c5e6..8b6e62a 100644 --- a/test/facebook.ts.snapshot +++ b/test/facebook.ts.snapshot @@ -93,6 +93,7 @@ exports[`facebook: Can load the 2025 export 1`] = ` "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"some/path\\"\\n", "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", + "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n", "\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"\\",\\"1970-01-01T00:00:00Z\\",\\n", "\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-22T10:06:40Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-21T23:00:00Z\\"\\n", "\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-13T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-07-12T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n", diff --git a/test/task.ts b/test/task.ts index a87d240..d333362 100644 --- a/test/task.ts +++ b/test/task.ts @@ -7,9 +7,8 @@ import { glob as taskGlob, read, cmd, - setId, + assignMeta, verify, - getTSVManifest, TaskTargetPipelineHelper, } from "../data-export/task.ts"; @@ -19,11 +18,10 @@ const FIXTURE_FILE = nodePath.join(FIXTURE_DIR, 'friends/friends.json'); // -- TaskTarget --------------------------------------------------------------- -test("TaskTarget: constructor initializes path, pipeline, postFns", () => { +test("TaskTarget: constructor initializes path, pipeline", () => { const t = new TaskTarget("/foo/bar"); assert.equal(t.path, "/foo/bar"); assert.deepEqual(t.pipeline, []); - assert.deepEqual(t.postFns, []); }); test("TaskTarget: exists() returns true for a real file", () => { @@ -50,12 +48,12 @@ test("TaskTarget: id throws when no idValue is set", () => { }); test("TaskTarget: id with a string value is safe-ified", () => { - const t = new TaskTarget("/foo").setId("my-id"); + const t = new TaskTarget("/foo").assignMeta({ idValue: "my-id" }); assert.equal(t.id, "my_id"); }); test("TaskTarget: id with a function value is resolved against the target", () => { - const t = new TaskTarget("/foo/bar").setId(tgt => tgt.basename); + const t = new TaskTarget("/foo/bar").assignMeta({ idValue: tgt => tgt.basename }); assert.equal(t.id, "bar"); }); @@ -92,12 +90,17 @@ test("TaskTarget: pushToPipeline throws if read is not the first op", () => { }); test("TaskTarget: clone produces an independent copy", () => { - const t = new TaskTarget("/foo").setId("orig"); + const t = new TaskTarget("/foo").assignMeta({ + idValue: "orig", + columnMeta: ["yeag"] + }); t.read(); const c = t.clone(); assert.equal(c.path, "/foo"); assert.equal(c.id, "orig"); + assert(c.pipeline !== t.pipeline); // Different object references assert.equal(c.pipeline.length, 1); + assert(c.columnMeta !== t.columnMeta); // Different object references c.path = "/other"; assert.equal(t.path, "/foo"); // original unchanged }); @@ -176,9 +179,9 @@ test("cmd: clones and appends a cmd op to each target", () => { assert.equal(targets[0].pipeline.length, 1); // original unchanged }); -test("setId: clones and sets id on each target", () => { +test("assignMeta: clones and sets meta on each target", () => { const targets = [new TaskTarget("/a"), new TaskTarget("/b")]; - const result = setId(targets, "myid"); + const result = assignMeta(targets, { idValue: "myid" }); assert.equal(result[0].id, "myid"); assert.equal(result[1].id, "myid"); assert.throws(() => targets[0].id); // originals have no id @@ -223,21 +226,6 @@ test("verify: filters a mixed list to only valid targets", async () => { assert.equal(result[0], good); }); -// -- getTSVManifest ----------------------------------------------------------- - -test("getTSVManifest: produces idshell for a single target", () => { - const t = new TaskTarget("/foo/bar.txt"); - t.setId("myid"); - t.read(); - assert.equal(getTSVManifest([t]), "myid\tcat /foo/bar.txt"); -}); - -test("getTSVManifest: joins multiple targets with newlines", () => { - const t1 = new TaskTarget("/a.txt"); t1.setId("a"); t1.read(); - const t2 = new TaskTarget("/b.txt"); t2.setId("b"); t2.read(); - assert.equal(getTSVManifest([t1, t2]), "a\tcat /a.txt\nb\tcat /b.txt"); -}); - // -- TaskTargetPipelineHelper ------------------------------------------------- test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => {