Compare commits

..

No commits in common. "3e64969e05dc4b2a60eb999b19a8314f7f69f349" and "fa070985495ca570cf0a3c65b3ca663825578dd2" have entirely different histories.

10 changed files with 187 additions and 163 deletions

View file

@ -107,7 +107,7 @@ function facebook_notifications_generic(this: TaskTargetPipelineHelper, prop: st
| [(.timestamp | todateiso8601), .unread, .href, .text]
)
| @csv`])
.assignMeta({ columnMeta: ["isodatetime", "any", "url", "text"] });
.types(["time", "text", "text", "text"]);
}
function facebook_notifications_v1(this: TaskTargetPipelineHelper) {
return this.facebook_notifications_generic("notifications");
@ -126,7 +126,7 @@ function facebook_installed_apps_generic(this: TaskTargetPipelineHelper, prop: s
)
| @csv
`])
.assignMeta({ columnMeta: ["text", "isodatetime"] });
.types(["text", "time"]);
}
function facebook_installed_apps_v1(this: TaskTargetPipelineHelper) {
return this.facebook_installed_apps_generic("installed_apps");
@ -173,7 +173,7 @@ function facebook_comments_generic(this: TaskTargetPipelineHelper, prop: string)
)
| @csv
`])
.assignMeta({ columnMeta: ["isodatetime", "TODO", "text"] })
.types(["time", "text", "text"])
}
function facebook_comments_v1(this: TaskTargetPipelineHelper) {
return this.facebook_comments_generic("comments");
@ -392,7 +392,7 @@ function facebook_group_posts_v2(this: TaskTargetPipelineHelper) {
}
function facebook_v2(this: TaskTargetPipelineHelper) {
const p = this.assignMeta({ idValue: t=>`Facebookv2 - ${t.basename}` }); // Generic ID for everything in here
const p = this.setId(t=>`Facebookv2 - ${t.basename}`); // Generic ID for everything in here
const col: Set<TaskTargetPipelineHelper> = new Set();
// No corollary to accounts_and_profiles.json
@ -400,7 +400,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) {
p.collect(col).cd(`apps_and_websites_off_of_facebook/connected_apps_and_websites.json`).read().facebook_installed_apps_v2();
p.collect(col).cd(`your_facebook_activity/comments_and_reactions/comments.json`).read().facebook_comments_v2();
p.collect(col).glob(`your_facebook_activity/messages/*/**/*.json`) // Messages files are in the FOLDERS inside messages (archived_threads, e2ee_cutover, etc...)
.assignMeta({ idValue: t=>`Facebookv2 - Messages ${t.basenameN(2)}` }) // 1, 2, etc is not specific enough, include the convo name
.setId(t=>`Facebookv2 - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name
.read()
.facebook_messages_generic()
@ -443,7 +443,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) {
}
function facebook(this: TaskTargetPipelineHelper){
const p = this.assignMeta({ idValue: t=>`Facebook - ${t.basename}` }); // Generic ID for everything in here
const p = this.setId(t=>`Facebook - ${t.basename}`); // Generic ID for everything in here
const col: Set<TaskTargetPipelineHelper> = new Set();
p.collect(col).cd(`about_you/notifications.json`).read().facebook_notifications_v1()
@ -461,6 +461,7 @@ function facebook(this: TaskTargetPipelineHelper){
| [.service_name, .native_app_id, .username, .email, .phone_number, .name]
)
| @csv`])
.csvSink()
p.collect(col).cd(`ads_and_businesses/your_off-facebook_activity.json`).read()
@ -474,6 +475,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
//TODO: .fork().todo('advertisers_who_uploaded_a_contact_list_with_your_information.json')
p.collect(col).cd(`apps_and_websites/apps_and_websites.json`).read().facebook_installed_apps_v1()
@ -484,7 +486,7 @@ function facebook(this: TaskTargetPipelineHelper){
p.collect(col).cd(`comments/comments.json`).read().facebook_comments_v1()
p.collect(col).glob(`dating/messages/*.json`) // Files are 0.json, 1.json, etc
.assignMeta({ idValue: t=>`Facebook - Dating Messages ${t.basename}` }) // Slightly more specific message
.setId(t=>`Facebook - Dating Messages ${t.basename}`) // Slightly more specific message
.read()
.cmd(["jq", "-r", `
["from","to","timestamp","body"],
@ -495,6 +497,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink();//[["timestamp", "numeric"]])
//todo: your_dating_activity.json, but it only has a few lines and not super useful
//todo: the other dating files are also just, small
@ -531,6 +534,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["added_timestamp", "numeric"]])
p.collect(col).cd(`following_and_followers/unfollowed_pages.json`).read().facebook_pages_unfollowed_v1()
p.collect(col).cd(`following_and_followers/following.json`)
@ -543,6 +547,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
p.collect(col).cd(`following_and_followers/followers.json`)
.read()
.cmd(["jq", "-r", `
@ -553,6 +558,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink()
p.collect(col).cd(`friends/sent_friend_requests.json`).read().facebook_friends_generic("sent_requests")
p.collect(col).cd(`friends/removed_friends.json`).read().facebook_friends_generic("deleted_friends")
@ -579,6 +585,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
p.collect(col).cd(`likes_and_reactions/posts_and_comments.json`)
.read()
.cmd(["jq", "-r", `
@ -589,6 +596,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
// TODO:
// rcd(`location`);
@ -600,7 +608,7 @@ function facebook(this: TaskTargetPipelineHelper){
p.collect(col).glob(`messages/**/*.json`) // Files are message_1.json, etc
.assignMeta({ idValue: t=>`Facebook - Messages ${t.basenameN(2)}` }) // 1, 2, etc is not specific enough, include the convo name
.setId(t=>`Facebook - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name
.read()
.facebook_messages_generic()
@ -620,6 +628,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]]);
p.collect(col).cd(`other_activity/support_correspondences.json`)
.read()
// TODO: I'm seeing blanks in .from and .to when the replier was Facebook
@ -633,6 +642,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
// `${facebookRoot}/pages` - no data
@ -647,12 +657,13 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["created_timestamp", "numeric"]]);
// TODO: There's also photos_and_videos/your_videos.json
// TODO: There's a media_metadata in each of the images too to convert as well as external files
p.collect(col).glob(`photos_and_videos/album/*.json`)
// Could use a better name, currently 0.json, 1.json, etc...
.assignMeta({ idValue: t=>`Facebook - Album ${t.basename}` }) // slightly more specific name; it would be better if we could use the album name
.setId(t=>`Facebook - Album ${t.basename}`) // slightly more specific name; it would be better if we could use the album name
.read()
.cmd(["jq", "-r", `
["album","uri","creation_timestamp"],
@ -662,6 +673,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["creation_timestamp", "numeric"]])
p.collect(col).cd(`posts/your_pinned_posts.json`)
.read()
@ -673,6 +685,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
// TODO: Glob? I never posted a lot on FB
p.collect(col).cd(`posts/your_posts_1.json`)
.read()
@ -686,6 +699,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
// `${facebookRoot}/privacy_checkup` - no data
@ -704,6 +718,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
// `${facebookRoot}/rewards` - no data
// `${facebookRoot}/saved_items_and_collections` - no data
@ -720,6 +735,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
p.collect(col).cd(`security_and_login_information/account_activity.json`).read().facebook_account_activity_v1()
p.collect(col).cd(`security_and_login_information/administrative_records.json`).read().facebook_admin_records_v1()
p.collect(col).cd(`security_and_login_information/authorized_logins.json`).read().facebook_authorized_logins_v1()
@ -756,6 +772,7 @@ function facebook(this: TaskTargetPipelineHelper){
)
| @csv
`])
.csvSink([["timestamp", "numeric"]])
// `${facebookRoot}/trash` - no data in my export
// `${facebookRoot}/voice_recording_and_transcription` - no data in my export

View file

@ -2,7 +2,7 @@ import { TaskTargetPipelineHelper } from "./task.ts";
import { htmlSelectorChunkedDuplex } from "./html.ts";
export function google(this: TaskTargetPipelineHelper){
const p = this.assignMeta({ idValue: t=>`Google - ${t.basename}` }); // Generic ID for everything in here
const p = this.setId(t=>`Google - ${t.basename}`); // Generic ID for everything in here
const col: Set<TaskTargetPipelineHelper> = new Set();
// TODO: There is a root takeout folder
@ -40,6 +40,7 @@ export function google(this: TaskTargetPipelineHelper){
// a = t.fork().cd(`Google Pay`)
p.collect(col).cd(`Google Pay/Google transactions`).glob(`transactions_*.csv`)
.read()
.csvSink()
// .fork("a").cd(`Money sends and requests`)
// .fork().cd(`Money sends and requests.csv`)
// .read()
@ -61,6 +62,7 @@ export function google(this: TaskTargetPipelineHelper){
| [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy]
)
| @csv`])
.csvSink()
// There's also the semantic history, but that seems like an entirely
// different can of worms

View file

@ -4,12 +4,14 @@ import { type TaskTarget, run } from "./task.ts";
$.verbose = false;
type ResultMap = Map<string, ProcessOutput>;
export async function parallel(
targets: TaskTarget[],
quiet: boolean = false,
maxConcurrency: number = os.cpus().length
): Promise<ProcessOutput[]> {
const resultMap = new Map<string, ProcessOutput>();
): Promise<ResultMap> {
const results = new Map<string, ProcessOutput>();
const total = targets.length;
let completed = 0;
@ -47,7 +49,7 @@ export async function parallel(
const result = await run(t);
completionTimes.push(result.duration);
resultMap.set(t.id, result);
results.set(t.id, result);
running--;
completed++;
@ -75,16 +77,10 @@ export async function parallel(
// Final status line
process.stderr.write('\n');
const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1);
const failed = Array.from(resultMap.values().filter(p => !p.ok));
const failed = Array.from(results.values().filter(p => !p.ok));
process.stderr.write(
`\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n`
);
const output = targets
.map(t => {
const r = resultMap.get(t.id)!;
return r;
});
return output;
return results;
}

View file

@ -93,60 +93,17 @@ class TaskTargetCmd implements TaskTargetOp {
}
type ValidId = string | ((t: TaskTarget)=>string);
export const COLUMN_TYPES = {
/**A numeric value*/
"numeric": {},
/**ISO Datetime*/
"isodatetime": {},
/**Urls*/
"url": {},
/**Freetext*/
"text": {},
/**For anything untyped*/
"any": {},
"TODO": {}
};
// // if (type === "numeric") {
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
// // }
// // else {
// // queryLine = `count(*) as n`;
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
// // }
/**Column metadata. Just a string into the TYPES*/
type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined);
// Make non-optional version of just the metadata values of TaskTarget
type TaskTargetMeta = Required<Pick<TaskTarget, "idValue" | "perRowDescription" | "columnMeta">>;
export class TaskTarget {
/**The current path pointed to by this TaskTarget*/
path: string;
/**The fsImpl used to access the .path*/
fsImpl: FSImpl = defaultFSImpl;
/**The pipeline of things to do to the above path to get an stdout of the output*/
pipeline: TaskTargetOp[];
// == Metadata, user configurable, no good defaults ==
/**Id of the TaskTarget
* string - Static id
* fn returning string - Function can derive the id from a task target even after a glob() and cd() operation
**/
idValue?: ValidId;
/**For every output CSV, this defines a description of that CSV per-row
* Use the items {0}, {1} to template
* Example: For a CSV with a row format like ["time", "sender", "sendee", "message"]
* you might do something like '"{3}" sent from {2} to {1}'
* */
perRowDescription?: string;
/**Metadata about the columns*/
columnMeta?: ColumnMeta[];
idValue: ValidId | undefined;
postFns: ((t: TaskTarget)=>Promise<void>)[];
constructor(path: string){
this.path = path;
this.pipeline = [];
this.postFns = [];
}
exists() {
@ -211,12 +168,10 @@ export class TaskTarget {
clone(): TaskTarget {
const t = new TaskTarget(this.path);
t.fsImpl = this.fsImpl; // holds no state, just needs same impl
t.idValue = this.idValue;
t.postFns = this.postFns.slice();
t.pipeline = this.pipeline.slice()
.map(p => p.clone());
// metadata
t.idValue = this.idValue;
t.perRowDescription = this.perRowDescription;
t.columnMeta = this.columnMeta?.slice();
return t;
}
@ -235,6 +190,10 @@ export class TaskTarget {
return shell;
}
/** Queue a post-processing hook to run for this target after its pipeline output is produced. */
pushPostFn(postFn: (t: TaskTarget)=>Promise<void>) {
  this.postFns.push(postFn);
}
cmd(cmd: ValidCmd) {
this.pushToPipeline(new TaskTargetCmd(cmd));
return this;
@ -243,14 +202,54 @@ export class TaskTarget {
this.pushToPipeline(new TaskTargetRead());
return this;
}
assignMeta(meta: Partial<TaskTargetMeta>) {
Object.assign(this, {
...meta,
// Clone this deeply so no shared object references
columnMeta: meta.columnMeta?.slice()
});
/** Set this target's id — either a static string or a fn deriving the id
 * from the target (useful after glob()/cd()). Chainable. */
setId(idValue: ValidId) {
  return Object.assign(this, { idValue });
}
/** Register a post-processing hook for this target (delegates to pushPostFn).
 * The hook receives the target after its pipeline runs.
 * Typed precisely (was `any`) to match pushPostFn's signature; returns `this`
 * so it chains like the other fluent methods (backward-compatible addition).
 */
post(fn: (t: TaskTarget)=>Promise<void>) {
  this.pushPostFn(fn);
  return this;
}
/** Record per-column type names for the produced CSV.
 * Currently a chainable no-op (TODO: persist and use the metadata). */
types(columnTypes: string[]) {
  // TODO: store columnTypes once column metadata handling is implemented
  return this;
}
/** Declare that this pipeline's CSV output should be ingested into the
 * database under this target's id (stub — currently a chainable no-op;
 * the commented-out sketch below is the planned implementation).
 * @param summarization presumably [columnName, type] pairs driving the
 *        post-ingest summary stats — TODO confirm intended shape.
 */
csvSink(
  summarization?: [string, string][]
) {
  // TODO:
  return this;
  // Ingest this csv into the database at the given id
  // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
  // Add a post processing function for these targets that prints out the summarization
  // stats
  // this.post(async (t: TaskTarget)=>{
  //   // We only do the first one so far for the summarization
  //   let queryLine: string;
  //   let formatFn: (r: any)=>string;
  //   const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
  //   if (type === "numeric") {
  //     queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
  //     formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
  //   }
  //   else {
  //     queryLine = `count(*) as n`;
  //     formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
  //   }
  //   const cmd = "sqlite-utils";
  //   const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
  //   const { stdout, stderr } = await execFile(cmd, args);
  //   const results = JSON.parse(stdout);
  //   const result = results[0]; // should only be one result in the array for this type of query
  //   const logLine = formatFn(result);
  //   (t as any).log = logLine;
  // });
  // return this;
}
}
export function each(targets: TaskTarget[], fn: (t: TaskTarget)=>void) {
@ -280,8 +279,8 @@ export function read(targets: TaskTarget[]): TaskTarget[] {
export function cmd(targets: TaskTarget[], cmd: ValidCmd): TaskTarget[] {
return targets.map(t => t.clone().cmd(cmd))
}
export function assignMeta(targets: TaskTarget[], meta: Partial<TaskTargetMeta>): TaskTarget[] {
return targets.map(t => t.clone().assignMeta(meta))
/** Clone each target and assign the given id to the clone; originals are untouched. */
export function setId(targets: TaskTarget[], id: ValidId): TaskTarget[] {
  const updated: TaskTarget[] = [];
  for (const target of targets) {
    updated.push(target.clone().setId(id));
  }
  return updated;
}
/**Verify, anything that fails is skipped and throws an error*/
@ -306,6 +305,29 @@ export async function verify(targets: TaskTarget[]) {
return outTargets;
}
/**Writes a manifest for parallel, a TSV where each record is an id + the shell to run
* @todo Enforce doing a verify before we output?
*/
export function getTSVManifest(targets: TaskTarget[]): string {
  // One "<id>\t<shell>" record per target, newline-joined.
  const records = targets.map(t => `${t.id}\t${t.toShell()}`);
  return records.join("\n");
}
/** Builds a task manifest as [id, shell] pairs, one per target.
 * Structured counterpart of getTSVManifest.
 * Fix: the original pushed `[t.id, shell] as const`, which yields a
 * `readonly [string, string]` that is not assignable to the declared mutable
 * `[string, string][]` under strict TS — build plain tuples instead.
 */
export function getTaskManifest(targets: TaskTarget[]): [string, string][] {
  return targets.map((t): [string, string] => [t.id, t.toShell()]);
}
function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) {
if (!a.__collection) {
return;
@ -360,10 +382,18 @@ export class TaskTargetPipelineHelper extends Array<TaskTarget> {
/** Clone every target in this helper with an extra shell-command step appended. */
cmd(_cmd: ValidCmd): TaskTargetPipelineHelper {
  return this._fn(targets => cmd(targets, _cmd));
}
assignMeta(meta: Partial<TaskTargetMeta>): TaskTargetPipelineHelper {
return this._fn(t => assignMeta(t, meta));
/** Clone every target in this helper with the given id (string or derivation fn) assigned. */
setId(id: ValidId): TaskTargetPipelineHelper {
  return this._fn(targets => setId(targets, id));
}
/** Mirror of TaskTarget.types: per-column type metadata for the CSV output.
 * Currently a chainable no-op. `unknown[]` (was `any[]`) keeps the rest
 * parameter type-safe without changing what callers may pass.
 */
types(...args: unknown[]) {
  // TODO: no-op — forward to each TaskTarget once implemented
  return this;
}
/** Mirror of TaskTarget.csvSink: CSV ingestion sink declaration.
 * Currently a chainable no-op. `unknown[]` (was `any[]`) keeps the rest
 * parameter type-safe without changing what callers may pass.
 */
csvSink(...args: unknown[]) {
  // TODO: no-op — forward to each TaskTarget once implemented
  return this;
}
/**
* @todo Nested versions of this don't currently work, but they could if we
* turn __collection into an array of collections

39
main.ts
View file

@ -64,41 +64,4 @@ async function main() {
// loadIntoSqlite(csvFiles, 'your.db');
}
main();
// TODO: Move this into here
// csvSink(
// summarization?: [string, string][]
// ) {
// // TODO:
// return this;
// // Ingest this csv into the database at the given id
// // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
// // Add a post processing function for these targets that prints out the summarization
// // stats
// // this.post(async (t: TaskTarget)=>{
// // // We only do the first one so far for the summarization
// // let queryLine: string;
// // let formatFn: (r: any)=>string;
// // const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
// // if (type === "numeric") {
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
// // }
// // else {
// // queryLine = `count(*) as n`;
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
// // }
// // const cmd = "sqlite-utils";
// // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
// // const { stdout, stderr } = await execFile(cmd, args);
// // const results = JSON.parse(stdout);
// // const result = results[0]; // should only be one result in the array for this type of query
// // const logLine = formatFn(result);
// // (t as any).log = logLine;
// // });
// // return this;
// }
main();

View file

@ -20,6 +20,7 @@
"@types/duplexify": "^3.6.5",
"@types/yauzl": "^2.10.3",
"duplexify": "^4.1.3",
"fp-ts": "^2.16.11",
"glob": "^13.0.0",
"htmlparser2": "^10.0.0",
"yauzl": "^3.2.0",

8
pnpm-lock.yaml generated
View file

@ -17,6 +17,9 @@ importers:
duplexify:
specifier: ^4.1.3
version: 4.1.3
fp-ts:
specifier: ^2.16.11
version: 2.16.11
glob:
specifier: ^13.0.0
version: 13.0.0
@ -86,6 +89,9 @@ packages:
resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==}
engines: {node: '>=0.12'}
fp-ts@2.16.11:
resolution: {integrity: sha512-LaI+KaX2NFkfn1ZGHoKCmcfv7yrZsC3b8NtWsTVQeHkq4F27vI5igUuO53sxqDEa2gNQMHFPmpojDw/1zmUK7w==}
glob@13.0.0:
resolution: {integrity: sha512-tvZgpqk6fz4BaNZ66ZsRaZnbHvP/jG3uKJvAZOwEVUL4RTA5nJeeLYfyN9/VA8NX/V3IBG+hkeuGpKjvELkVhA==}
engines: {node: 20 || >=22}
@ -209,6 +215,8 @@ snapshots:
entities@6.0.1: {}
fp-ts@2.16.11: {}
glob@13.0.0:
dependencies:
minimatch: 10.1.1

View file

@ -1,7 +1,9 @@
import test from "node:test";
import nodePath from "node:path";
import { strict as assert } from "node:assert";
import { TaskTargetPipelineHelper, TaskTarget, verify, run } from "../data-export/task.ts";
import { finished } from "node:stream/promises";
import { Readable, Writable } from "node:stream";
import { TaskTargetPipelineHelper, TaskTarget, verify, getTSVManifest, getTaskManifest, run } from "../data-export/task.ts";
import { parallel } from "../data-export/parallel.ts";
import "../data-export/facebook.ts";
@ -18,18 +20,16 @@ test("facebook: Can load the 2021 export", async (t) => {
const finalTargets = await verify(targets);
const result = await parallel(finalTargets, true);
const idAndCSVs: [string, string][] = [];
for (const [idx, r] of result.entries()) {
const target = finalTargets[idx];
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(r.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, r.stdout]);
for (const [id, r] of result.entries()) {
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
assert.ok(r.ok, `Task ${id} should be okay`);
}
const csvs = idAndCSVs
.sort() // Keep stable ordering for snapshots
.map(v => v[1])
t.assert.snapshot(csvs);
const allCSV = Array.from(result.entries())
.sort() // Keep stable ordering for snapshots
.map(([id, r]) => r.stdout);
t.assert.snapshot(allCSV);
});
test("facebook: Can load the 2021 export zipped", async (t) => {
const targets = await TaskTargetPipelineHelper.pipeline([
@ -41,18 +41,16 @@ test("facebook: Can load the 2021 export zipped", async (t) => {
const finalTargets = await verify(targets2);
const result = await parallel(finalTargets, true);
const idAndCSVs: [string, string][] = [];
for (const [idx, r] of result.entries()) {
const target = finalTargets[idx];
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(r.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, r.stdout]);
for (const [id, r] of result.entries()) {
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
assert.ok(r.ok, `Task ${id} should be okay`);
}
const csvs = idAndCSVs
.sort() // Keep stable ordering for snapshots
.map(v => v[1])
t.assert.snapshot(csvs);
const allCSV = Array.from(result.entries())
.sort() // Keep stable ordering for snapshots
.map(([id, r]) => r.stdout);
t.assert.snapshot(allCSV);
});
test("facebook: Can load the 2025 export", async (t) => {
const targets = TaskTargetPipelineHelper.pipeline([
@ -62,16 +60,14 @@ test("facebook: Can load the 2025 export", async (t) => {
const finalTargets = await verify(targets);
const result = await parallel(finalTargets, true);
const idAndCSVs: [string, string][] = [];
for (const [idx, r] of result.entries()) {
const target = finalTargets[idx];
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
assert.ok(r.ok, `Task ${target.id} should be okay`);
idAndCSVs.push([target.id, r.stdout]);
for (const [id, r] of result.entries()) {
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
assert.ok(r.ok, `Task ${id} should be okay`);
}
const csvs = idAndCSVs
.sort() // Keep stable ordering for snapshots
.map(v => v[1])
t.assert.snapshot(csvs);
const allCSV = Array.from(result.entries())
.sort() // Keep stable ordering for snapshots
.map(([id, r]) => r.stdout);
t.assert.snapshot(allCSV);
});

View file

@ -93,7 +93,6 @@ exports[`facebook: Can load the 2025 export 1`] = `
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"some/path\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\n",
"\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-22T10:06:40Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-21T23:00:00Z\\"\\n",
"\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-13T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-07-12T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n",

View file

@ -7,8 +7,9 @@ import {
glob as taskGlob,
read,
cmd,
assignMeta,
setId,
verify,
getTSVManifest,
TaskTargetPipelineHelper,
} from "../data-export/task.ts";
@ -18,10 +19,11 @@ const FIXTURE_FILE = nodePath.join(FIXTURE_DIR, 'friends/friends.json');
// -- TaskTarget ---------------------------------------------------------------
test("TaskTarget: constructor initializes path, pipeline", () => {
// A fresh TaskTarget carries its path and starts with empty pipeline/postFns.
test("TaskTarget: constructor initializes path, pipeline, postFns", () => {
  const target = new TaskTarget("/foo/bar");
  assert.equal(target.path, "/foo/bar");
  assert.deepEqual(target.pipeline, []);
  assert.deepEqual(target.postFns, []);
});
test("TaskTarget: exists() returns true for a real file", () => {
@ -48,12 +50,12 @@ test("TaskTarget: id throws when no idValue is set", () => {
});
test("TaskTarget: id with a string value is safe-ified", () => {
const t = new TaskTarget("/foo").assignMeta({ idValue: "my-id" });
const t = new TaskTarget("/foo").setId("my-id");
assert.equal(t.id, "my_id");
});
test("TaskTarget: id with a function value is resolved against the target", () => {
const t = new TaskTarget("/foo/bar").assignMeta({ idValue: tgt => tgt.basename });
const t = new TaskTarget("/foo/bar").setId(tgt => tgt.basename);
assert.equal(t.id, "bar");
});
@ -90,17 +92,12 @@ test("TaskTarget: pushToPipeline throws if read is not the first op", () => {
});
test("TaskTarget: clone produces an independent copy", () => {
const t = new TaskTarget("/foo").assignMeta({
idValue: "orig",
columnMeta: ["yeag"]
});
const t = new TaskTarget("/foo").setId("orig");
t.read();
const c = t.clone();
assert.equal(c.path, "/foo");
assert.equal(c.id, "orig");
assert(c.pipeline !== t.pipeline); // Different object references
assert.equal(c.pipeline.length, 1);
assert(c.columnMeta !== t.columnMeta); // Different object references
c.path = "/other";
assert.equal(t.path, "/foo"); // original unchanged
});
@ -179,9 +176,9 @@ test("cmd: clones and appends a cmd op to each target", () => {
assert.equal(targets[0].pipeline.length, 1); // original unchanged
});
test("assignMeta: clones and sets meta on each target", () => {
test("setId: clones and sets id on each target", () => {
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
const result = assignMeta(targets, { idValue: "myid" });
const result = setId(targets, "myid");
assert.equal(result[0].id, "myid");
assert.equal(result[1].id, "myid");
assert.throws(() => targets[0].id); // originals have no id
@ -226,6 +223,21 @@ test("verify: filters a mixed list to only valid targets", async () => {
assert.equal(result[0], good);
});
// -- getTSVManifest -----------------------------------------------------------
// Single target: manifest is exactly one "<id>\t<shell>" record.
test("getTSVManifest: produces id<TAB>shell for a single target", () => {
  const target = new TaskTarget("/foo/bar.txt").setId("myid").read();
  assert.equal(getTSVManifest([target]), "myid\tcat /foo/bar.txt");
});
// Multiple targets: records are newline-joined in input order.
test("getTSVManifest: joins multiple targets with newlines", () => {
  const first = new TaskTarget("/a.txt").setId("a").read();
  const second = new TaskTarget("/b.txt").setId("b").read();
  assert.equal(getTSVManifest([first, second]), "a\tcat /a.txt\nb\tcat /b.txt");
});
// -- TaskTargetPipelineHelper -------------------------------------------------
test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => {