Compare commits
2 commits
fa07098549
...
3e64969e05
| Author | SHA1 | Date | |
|---|---|---|---|
| 3e64969e05 | |||
| f6b0f02de7 |
10 changed files with 160 additions and 184 deletions
|
|
@ -107,7 +107,7 @@ function facebook_notifications_generic(this: TaskTargetPipelineHelper, prop: st
|
|||
| [(.timestamp | todateiso8601), .unread, .href, .text]
|
||||
)
|
||||
| @csv`])
|
||||
.types(["time", "text", "text", "text"]);
|
||||
.assignMeta({ columnMeta: ["isodatetime", "any", "url", "text"] });
|
||||
}
|
||||
function facebook_notifications_v1(this: TaskTargetPipelineHelper) {
|
||||
return this.facebook_notifications_generic("notifications");
|
||||
|
|
@ -126,7 +126,7 @@ function facebook_installed_apps_generic(this: TaskTargetPipelineHelper, prop: s
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.types(["text", "time"]);
|
||||
.assignMeta({ columnMeta: ["text", "isodatetime"] });
|
||||
}
|
||||
function facebook_installed_apps_v1(this: TaskTargetPipelineHelper) {
|
||||
return this.facebook_installed_apps_generic("installed_apps");
|
||||
|
|
@ -173,7 +173,7 @@ function facebook_comments_generic(this: TaskTargetPipelineHelper, prop: string)
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.types(["time", "text", "text"])
|
||||
.assignMeta({ columnMeta: ["isodatetime", "TODO", "text"] })
|
||||
}
|
||||
function facebook_comments_v1(this: TaskTargetPipelineHelper) {
|
||||
return this.facebook_comments_generic("comments");
|
||||
|
|
@ -392,7 +392,7 @@ function facebook_group_posts_v2(this: TaskTargetPipelineHelper) {
|
|||
}
|
||||
|
||||
function facebook_v2(this: TaskTargetPipelineHelper) {
|
||||
const p = this.setId(t=>`Facebookv2 - ${t.basename}`); // Generic ID for everything in here
|
||||
const p = this.assignMeta({ idValue: t=>`Facebookv2 - ${t.basename}` }); // Generic ID for everything in here
|
||||
const col: Set<TaskTargetPipelineHelper> = new Set();
|
||||
|
||||
// No correlary to accounts_and_profiles.json
|
||||
|
|
@ -400,7 +400,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) {
|
|||
p.collect(col).cd(`apps_and_websites_off_of_facebook/connected_apps_and_websites.json`).read().facebook_installed_apps_v2();
|
||||
p.collect(col).cd(`your_facebook_activity/comments_and_reactions/comments.json`).read().facebook_comments_v2();
|
||||
p.collect(col).glob(`your_facebook_activity/messages/*/**/*.json`) // Messages files are in the FOLDERS inside messages (archived_threads, e2ee_cutover, etc...)
|
||||
.setId(t=>`Facebookv2 - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name
|
||||
.assignMeta({ idValue: t=>`Facebookv2 - Messages ${t.basenameN(2)}` }) // 1, 2, etc is not specific enough, include the convo name
|
||||
.read()
|
||||
.facebook_messages_generic()
|
||||
|
||||
|
|
@ -443,7 +443,7 @@ function facebook_v2(this: TaskTargetPipelineHelper) {
|
|||
}
|
||||
|
||||
function facebook(this: TaskTargetPipelineHelper){
|
||||
const p = this.setId(t=>`Facebook - ${t.basename}`); // Generic ID for everything in here
|
||||
const p = this.assignMeta({ idValue: t=>`Facebook - ${t.basename}` }); // Generic ID for everything in here
|
||||
const col: Set<TaskTargetPipelineHelper> = new Set();
|
||||
|
||||
p.collect(col).cd(`about_you/notifications.json`).read().facebook_notifications_v1()
|
||||
|
|
@ -461,7 +461,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
| [.service_name, .native_app_id, .username, .email, .phone_number, .name]
|
||||
)
|
||||
| @csv`])
|
||||
.csvSink()
|
||||
|
||||
|
||||
p.collect(col).cd(`ads_and_businesses/your_off-facebook_activity.json`).read()
|
||||
|
|
@ -475,7 +474,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
//TODO: .fork().todo('advertisers_who_uploaded_a_contact_list_with_your_information.json')
|
||||
|
||||
p.collect(col).cd(`apps_and_websites/apps_and_websites.json`).read().facebook_installed_apps_v1()
|
||||
|
|
@ -486,7 +484,7 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
p.collect(col).cd(`comments/comments.json`).read().facebook_comments_v1()
|
||||
|
||||
p.collect(col).glob(`dating/messages/*.json`) // Files are 0.json, 1.json, etc
|
||||
.setId(t=>`Facebook - Dating Messages ${t.basename}`) // Slightly more specific message
|
||||
.assignMeta({ idValue: t=>`Facebook - Dating Messages ${t.basename}` }) // Slightly more specific message
|
||||
.read()
|
||||
.cmd(["jq", "-r", `
|
||||
["from","to","timestamp","body"],
|
||||
|
|
@ -497,7 +495,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink();//[["timestamp", "numeric"]])
|
||||
//todo: your_dating_activity.json, but it only has a few lines and not super useful
|
||||
//todo: the other dating files are also just, small
|
||||
|
||||
|
|
@ -534,7 +531,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["added_timestamp", "numeric"]])
|
||||
|
||||
p.collect(col).cd(`following_and_followers/unfollowed_pages.json`).read().facebook_pages_unfollowed_v1()
|
||||
p.collect(col).cd(`following_and_followers/following.json`)
|
||||
|
|
@ -547,7 +543,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
p.collect(col).cd(`following_and_followers/followers.json`)
|
||||
.read()
|
||||
.cmd(["jq", "-r", `
|
||||
|
|
@ -558,7 +553,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink()
|
||||
|
||||
p.collect(col).cd(`friends/sent_friend_requests.json`).read().facebook_friends_generic("sent_requests")
|
||||
p.collect(col).cd(`friends/removed_friends.json`).read().facebook_friends_generic("deleted_friends")
|
||||
|
|
@ -585,7 +579,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
p.collect(col).cd(`likes_and_reactions/posts_and_comments.json`)
|
||||
.read()
|
||||
.cmd(["jq", "-r", `
|
||||
|
|
@ -596,7 +589,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
|
||||
// TODO:
|
||||
// rcd(`location`);
|
||||
|
|
@ -608,7 +600,7 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
|
||||
|
||||
p.collect(col).glob(`messages/**/*.json`) // Files are message_1.json, etc
|
||||
.setId(t=>`Facebook - Messages ${t.basenameN(2)}`) // 1, 2, etc is not specific enough, include the convo name
|
||||
.assignMeta({ idValue: t=>`Facebook - Messages ${t.basenameN(2)}` }) // 1, 2, etc is not specific enough, include the convo name
|
||||
.read()
|
||||
.facebook_messages_generic()
|
||||
|
||||
|
|
@ -628,7 +620,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]]);
|
||||
p.collect(col).cd(`other_activity/support_correspondences.json`)
|
||||
.read()
|
||||
// TODO: I'm seeing blanks in .from and .to when the replier was Facebook
|
||||
|
|
@ -642,7 +633,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
|
||||
|
||||
// `${facebookRoot}/pages` - no data
|
||||
|
|
@ -657,13 +647,12 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["created_timestamp", "numeric"]]);
|
||||
|
||||
// TODO: There's also photos_and_videos/your_videos.json
|
||||
// TODO: There's a media_metadata in each of the images too to convert as well as external files
|
||||
p.collect(col).glob(`photos_and_videos/album/*.json`)
|
||||
// Could use a better name, currently 0.json, 1.json, etc...
|
||||
.setId(t=>`Facebook - Album ${t.basename}`) //slightly more speciifc name, it woudl be better if we could use the album name
|
||||
.assignMeta({ idValue: t=>`Facebook - Album ${t.basename}` }) //slightly more speciifc name, it woudl be better if we could use the album name
|
||||
.read()
|
||||
.cmd(["jq", "-r", `
|
||||
["album","uri","creation_timestamp"],
|
||||
|
|
@ -673,7 +662,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["creation_timestamp", "numeric"]])
|
||||
|
||||
p.collect(col).cd(`posts/your_pinned_posts.json`)
|
||||
.read()
|
||||
|
|
@ -685,7 +673,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
// TODO: Glob? I never posted a lot on FB
|
||||
p.collect(col).cd(`posts/your_posts_1.json`)
|
||||
.read()
|
||||
|
|
@ -699,7 +686,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
|
||||
// `${facebookRoot}/privacy_checkup` - no data
|
||||
|
||||
|
|
@ -718,7 +704,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
|
||||
// `${facebookRoot}/rewards` - no data
|
||||
// `${facebookRoot}/saved_items_and_collections` - no data
|
||||
|
|
@ -735,7 +720,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
p.collect(col).cd(`security_and_login_information/account_activity.json`).read().facebook_account_activity_v1()
|
||||
p.collect(col).cd(`security_and_login_information/administrative_records.json`).read().facebook_admin_records_v1()
|
||||
p.collect(col).cd(`security_and_login_information/authorized_logins.json`).read().facebook_authorized_logins_v1()
|
||||
|
|
@ -772,7 +756,6 @@ function facebook(this: TaskTargetPipelineHelper){
|
|||
)
|
||||
| @csv
|
||||
`])
|
||||
.csvSink([["timestamp", "numeric"]])
|
||||
|
||||
// `${facebookRoot}/trash` - no data in my export
|
||||
// `${facebookRoot}/voice_recording_and_transcription` - no data in my export
|
||||
|
|
|
|||
|
|
@ -2,7 +2,7 @@ import { TaskTargetPipelineHelper } from "./task.ts";
|
|||
import { htmlSelectorChunkedDuplex } from "./html.ts";
|
||||
|
||||
export function google(this: TaskTargetPipelineHelper){
|
||||
const p = this.setId(t=>`Google - ${t.basename}`); // Generic ID for everything in here
|
||||
const p = this.assignMeta({ idValue: t=>`Google - ${t.basename}` }); // Generic ID for everything in here
|
||||
const col: Set<TaskTargetPipelineHelper> = new Set();
|
||||
|
||||
// TODO: There is a root takeout folder
|
||||
|
|
@ -40,7 +40,6 @@ export function google(this: TaskTargetPipelineHelper){
|
|||
// a = t.fork().cd(`Google Pay`)
|
||||
p.collect(col).cd(`Google Pay/Google transactions`).glob(`transactions_*.csv`)
|
||||
.read()
|
||||
.csvSink()
|
||||
// .fork("a").cd(`Money sends and requests`)
|
||||
// .fork().cd(`Money sends and requests.csv`)
|
||||
// .read()
|
||||
|
|
@ -62,7 +61,6 @@ export function google(this: TaskTargetPipelineHelper){
|
|||
| [.timestampMs | todateiso8601, .latitudeE7, .longitudeE7, .accuracy]
|
||||
)
|
||||
| @csv`])
|
||||
.csvSink()
|
||||
// There's also the semantic history but that's an entire nother can of worms
|
||||
// it seems like
|
||||
|
||||
|
|
|
|||
|
|
@ -4,14 +4,12 @@ import { type TaskTarget, run } from "./task.ts";
|
|||
|
||||
$.verbose = false;
|
||||
|
||||
type ResultMap = Map<string, ProcessOutput>;
|
||||
|
||||
export async function parallel(
|
||||
targets: TaskTarget[],
|
||||
quiet: boolean = false,
|
||||
maxConcurrency: number = os.cpus().length
|
||||
): Promise<ResultMap> {
|
||||
const results = new Map<string, ProcessOutput>();
|
||||
): Promise<ProcessOutput[]> {
|
||||
const resultMap = new Map<string, ProcessOutput>();
|
||||
|
||||
const total = targets.length;
|
||||
let completed = 0;
|
||||
|
|
@ -49,7 +47,7 @@ export async function parallel(
|
|||
const result = await run(t);
|
||||
completionTimes.push(result.duration);
|
||||
|
||||
results.set(t.id, result);
|
||||
resultMap.set(t.id, result);
|
||||
|
||||
running--;
|
||||
completed++;
|
||||
|
|
@ -77,10 +75,16 @@ export async function parallel(
|
|||
// Final status line
|
||||
process.stderr.write('\n');
|
||||
const totalSeconds = ((Date.now() - startTime) / 1000).toFixed(1);
|
||||
const failed = Array.from(results.values().filter(p => !p.ok));
|
||||
const failed = Array.from(resultMap.values().filter(p => !p.ok));
|
||||
process.stderr.write(
|
||||
`\nCompleted ${total} jobs in ${totalSeconds}s (${failed.length} failed)\n`
|
||||
);
|
||||
|
||||
return results;
|
||||
const output = targets
|
||||
.map(t => {
|
||||
const r = resultMap.get(t.id)!;
|
||||
return r;
|
||||
});
|
||||
|
||||
return output;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -93,17 +93,60 @@ class TaskTargetCmd implements TaskTargetOp {
|
|||
}
|
||||
|
||||
type ValidId = string | ((t: TaskTarget)=>string);
|
||||
export const COLUMN_TYPES = {
|
||||
/**A numeric value*/
|
||||
"numeric": {},
|
||||
/**ISO Datetime*/
|
||||
"isodatetime": {},
|
||||
/**Urls*/
|
||||
"url": {},
|
||||
/**Freetext*/
|
||||
"text": {},
|
||||
/**For anything untyped*/
|
||||
"any": {},
|
||||
"TODO": {}
|
||||
};
|
||||
|
||||
// // if (type === "numeric") {
|
||||
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
||||
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
||||
// // }
|
||||
// // else {
|
||||
// // queryLine = `count(*) as n`;
|
||||
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
||||
// // }
|
||||
|
||||
/**Column metadata. Just a string into the TYPES*/
|
||||
type ColumnMeta = (keyof typeof COLUMN_TYPES | undefined);
|
||||
// Make non-optional version of just the metadata values of TaskTarget
|
||||
type TaskTargetMeta = Required<Pick<TaskTarget, "idValue" | "perRowDescription" | "columnMeta">>;
|
||||
|
||||
export class TaskTarget {
|
||||
/**The current path pointed to by this TaskTarget*/
|
||||
path: string;
|
||||
/**The fsImpl used to access the .path*/
|
||||
fsImpl: FSImpl = defaultFSImpl;
|
||||
/**The pipeline of things to do to the above path to get an stdout of the output*/
|
||||
pipeline: TaskTargetOp[];
|
||||
idValue: ValidId | undefined;
|
||||
postFns: ((t: TaskTarget)=>Promise<void>)[];
|
||||
|
||||
// == Metadata, user configurable, no good defaults ==
|
||||
/**Id of the TaskTarget
|
||||
* string - Static id
|
||||
* fn returning string - Function can derive the id from a task target even after a glob() and cd() operation
|
||||
**/
|
||||
idValue?: ValidId;
|
||||
/**For every output CSV, this defines a description of that CSV per-row
|
||||
* Use the items {0}, {1} to template
|
||||
* Example: For a CSV with a row format like ["time", "sender", "sendee", "message"]
|
||||
* you might do something like '"{3}" sent from {2} to {1}'
|
||||
* */
|
||||
perRowDescription?: string;
|
||||
/**Metadata about the columns*/
|
||||
columnMeta?: ColumnMeta[];
|
||||
|
||||
constructor(path: string){
|
||||
this.path = path;
|
||||
this.pipeline = [];
|
||||
this.postFns = [];
|
||||
}
|
||||
|
||||
exists() {
|
||||
|
|
@ -168,10 +211,12 @@ export class TaskTarget {
|
|||
clone(): TaskTarget {
|
||||
const t = new TaskTarget(this.path);
|
||||
t.fsImpl = this.fsImpl; // holds no state, just needs same impl
|
||||
t.idValue = this.idValue;
|
||||
t.postFns = this.postFns.slice();
|
||||
t.pipeline = this.pipeline.slice()
|
||||
.map(p => p.clone());
|
||||
// metadata
|
||||
t.idValue = this.idValue;
|
||||
t.perRowDescription = this.perRowDescription;
|
||||
t.columnMeta = this.columnMeta?.slice();
|
||||
return t;
|
||||
}
|
||||
|
||||
|
|
@ -190,10 +235,6 @@ export class TaskTarget {
|
|||
return shell;
|
||||
}
|
||||
|
||||
pushPostFn(fn: ((t: TaskTarget)=>Promise<void>)) {
|
||||
this.postFns.push(fn);
|
||||
}
|
||||
|
||||
cmd(cmd: ValidCmd) {
|
||||
this.pushToPipeline(new TaskTargetCmd(cmd));
|
||||
return this;
|
||||
|
|
@ -202,54 +243,14 @@ export class TaskTarget {
|
|||
this.pushToPipeline(new TaskTargetRead());
|
||||
return this;
|
||||
}
|
||||
setId(idValue: ValidId) {
|
||||
this.idValue = idValue;
|
||||
assignMeta(meta: Partial<TaskTargetMeta>) {
|
||||
Object.assign(this, {
|
||||
...meta,
|
||||
// Clone this deeply so no shared object references
|
||||
columnMeta: meta.columnMeta?.slice()
|
||||
});
|
||||
return this;
|
||||
}
|
||||
post(fn: any) {
|
||||
this.pushPostFn(fn);
|
||||
}
|
||||
types(
|
||||
types: string[]
|
||||
) {
|
||||
// TODO:
|
||||
return this;
|
||||
}
|
||||
csvSink(
|
||||
summarization?: [string, string][]
|
||||
) {
|
||||
// TODO:
|
||||
return this;
|
||||
|
||||
// Ingest this csv into the database at the given id
|
||||
// this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
|
||||
// Add a post processing function for these targets that prints out the summarization
|
||||
// stats
|
||||
// this.post(async (t: TaskTarget)=>{
|
||||
// // We only do the first one so far for the summarization
|
||||
// let queryLine: string;
|
||||
// let formatFn: (r: any)=>string;
|
||||
// const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
|
||||
// if (type === "numeric") {
|
||||
// queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
||||
// formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
||||
// }
|
||||
// else {
|
||||
// queryLine = `count(*) as n`;
|
||||
// formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
||||
// }
|
||||
|
||||
// const cmd = "sqlite-utils";
|
||||
// const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
|
||||
// const { stdout, stderr } = await execFile(cmd, args);
|
||||
// const results = JSON.parse(stdout);
|
||||
// const result = results[0]; // should only be one result in the array for this type of query
|
||||
// const logLine = formatFn(result);
|
||||
// (t as any).log = logLine;
|
||||
// });
|
||||
|
||||
// return this;
|
||||
}
|
||||
}
|
||||
|
||||
export function each(targets: TaskTarget[], fn: (t: TaskTarget)=>void) {
|
||||
|
|
@ -279,8 +280,8 @@ export function read(targets: TaskTarget[]): TaskTarget[] {
|
|||
export function cmd(targets: TaskTarget[], cmd: ValidCmd): TaskTarget[] {
|
||||
return targets.map(t => t.clone().cmd(cmd))
|
||||
}
|
||||
export function setId(targets: TaskTarget[], id: ValidId): TaskTarget[] {
|
||||
return targets.map(t => t.clone().setId(id))
|
||||
export function assignMeta(targets: TaskTarget[], meta: Partial<TaskTargetMeta>): TaskTarget[] {
|
||||
return targets.map(t => t.clone().assignMeta(meta))
|
||||
}
|
||||
|
||||
/**Verify, anything that fails is skipped and throws an error*/
|
||||
|
|
@ -305,29 +306,6 @@ export async function verify(targets: TaskTarget[]) {
|
|||
return outTargets;
|
||||
}
|
||||
|
||||
/**Writes a manifest for parallel, a TSV where each record is an id + the shell to run
|
||||
* @todo Enforce doing a verify before we output?
|
||||
*/
|
||||
export function getTSVManifest(targets: TaskTarget[]): string {
|
||||
let out: string[] = [];
|
||||
for (const t of targets) {
|
||||
const shell = t.toShell();
|
||||
out.push(`${t.id}\t${shell}`);
|
||||
}
|
||||
|
||||
return out.join("\n");
|
||||
}
|
||||
|
||||
export function getTaskManifest(targets: TaskTarget[]): [string, string][] {
|
||||
let out: [string, string][] = [];
|
||||
for (const t of targets) {
|
||||
const shell = t.toShell();
|
||||
out.push([t.id, shell] as const);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
function collectionSwap(a: TaskTargetPipelineHelper, b: TaskTargetPipelineHelper) {
|
||||
if (!a.__collection) {
|
||||
return;
|
||||
|
|
@ -382,18 +360,10 @@ export class TaskTargetPipelineHelper extends Array<TaskTarget> {
|
|||
cmd(_cmd: ValidCmd): TaskTargetPipelineHelper {
|
||||
return this._fn(t => cmd(t, _cmd));
|
||||
}
|
||||
setId(id: ValidId): TaskTargetPipelineHelper {
|
||||
return this._fn(t => setId(t, id));
|
||||
assignMeta(meta: Partial<TaskTargetMeta>): TaskTargetPipelineHelper {
|
||||
return this._fn(t => assignMeta(t, meta));
|
||||
}
|
||||
|
||||
types(...args: any[]) {
|
||||
// TODO: no-op
|
||||
return this;
|
||||
}
|
||||
csvSink(...args: any[]) {
|
||||
// TODO: no-op
|
||||
return this;
|
||||
}
|
||||
/**
|
||||
* @todo Nested versions of this don't currently work, but they could if we
|
||||
* turn __collection into an array of collections
|
||||
|
|
|
|||
39
main.ts
39
main.ts
|
|
@ -64,4 +64,41 @@ async function main() {
|
|||
// loadIntoSqlite(csvFiles, 'your.db');
|
||||
}
|
||||
|
||||
main();
|
||||
main();
|
||||
|
||||
// TODO: Move this into here
|
||||
// csvSink(
|
||||
// summarization?: [string, string][]
|
||||
// ) {
|
||||
// // TODO:
|
||||
// return this;
|
||||
|
||||
// // Ingest this csv into the database at the given id
|
||||
// // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
|
||||
// // Add a post processing function for these targets that prints out the summarization
|
||||
// // stats
|
||||
// // this.post(async (t: TaskTarget)=>{
|
||||
// // // We only do the first one so far for the summarization
|
||||
// // let queryLine: string;
|
||||
// // let formatFn: (r: any)=>string;
|
||||
// // const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
|
||||
// // if (type === "numeric") {
|
||||
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
|
||||
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
|
||||
// // }
|
||||
// // else {
|
||||
// // queryLine = `count(*) as n`;
|
||||
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
|
||||
// // }
|
||||
|
||||
// // const cmd = "sqlite-utils";
|
||||
// // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
|
||||
// // const { stdout, stderr } = await execFile(cmd, args);
|
||||
// // const results = JSON.parse(stdout);
|
||||
// // const result = results[0]; // should only be one result in the array for this type of query
|
||||
// // const logLine = formatFn(result);
|
||||
// // (t as any).log = logLine;
|
||||
// // });
|
||||
|
||||
// // return this;
|
||||
// }
|
||||
|
|
@ -20,7 +20,6 @@
|
|||
"@types/duplexify": "^3.6.5",
|
||||
"@types/yauzl": "^2.10.3",
|
||||
"duplexify": "^4.1.3",
|
||||
"fp-ts": "^2.16.11",
|
||||
"glob": "^13.0.0",
|
||||
"htmlparser2": "^10.0.0",
|
||||
"yauzl": "^3.2.0",
|
||||
|
|
|
|||
8
pnpm-lock.yaml
generated
8
pnpm-lock.yaml
generated
|
|
@ -17,9 +17,6 @@ importers:
|
|||
duplexify:
|
||||
specifier: ^4.1.3
|
||||
version: 4.1.3
|
||||
fp-ts:
|
||||
specifier: ^2.16.11
|
||||
version: 2.16.11
|
||||
glob:
|
||||
specifier: ^13.0.0
|
||||
version: 13.0.0
|
||||
|
|
@ -89,9 +86,6 @@ packages:
|
|||
resolution: {integrity: sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==}
|
||||
engines: {node: '>=0.12'}
|
||||
|
||||
fp-ts@2.16.11:
|
||||
resolution: {integrity: sha512-LaI+KaX2NFkfn1ZGHoKCmcfv7yrZsC3b8NtWsTVQeHkq4F27vI5igUuO53sxqDEa2gNQMHFPmpojDw/1zmUK7w==}
|
||||
|
||||
glob@13.0.0:
|
||||
resolution: {integrity: sha512-tvZgpqk6fz4BaNZ66ZsRaZnbHvP/jG3uKJvAZOwEVUL4RTA5nJeeLYfyN9/VA8NX/V3IBG+hkeuGpKjvELkVhA==}
|
||||
engines: {node: 20 || >=22}
|
||||
|
|
@ -215,8 +209,6 @@ snapshots:
|
|||
|
||||
entities@6.0.1: {}
|
||||
|
||||
fp-ts@2.16.11: {}
|
||||
|
||||
glob@13.0.0:
|
||||
dependencies:
|
||||
minimatch: 10.1.1
|
||||
|
|
|
|||
|
|
@ -1,9 +1,7 @@
|
|||
import test from "node:test";
|
||||
import nodePath from "node:path";
|
||||
import { strict as assert } from "node:assert";
|
||||
import { finished } from "node:stream/promises";
|
||||
import { Readable, Writable } from "node:stream";
|
||||
import { TaskTargetPipelineHelper, TaskTarget, verify, getTSVManifest, getTaskManifest, run } from "../data-export/task.ts";
|
||||
import { TaskTargetPipelineHelper, TaskTarget, verify, run } from "../data-export/task.ts";
|
||||
import { parallel } from "../data-export/parallel.ts";
|
||||
import "../data-export/facebook.ts";
|
||||
|
||||
|
|
@ -20,16 +18,18 @@ test("facebook: Can load the 2021 export", async (t) => {
|
|||
|
||||
const finalTargets = await verify(targets);
|
||||
const result = await parallel(finalTargets, true);
|
||||
for (const [id, r] of result.entries()) {
|
||||
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
|
||||
assert.ok(r.ok, `Task ${id} should be okay`);
|
||||
const idAndCSVs: [string, string][] = [];
|
||||
for (const [idx, r] of result.entries()) {
|
||||
const target = finalTargets[idx];
|
||||
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
||||
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
||||
idAndCSVs.push([target.id, r.stdout]);
|
||||
}
|
||||
|
||||
const allCSV = Array.from(result.entries())
|
||||
const csvs = idAndCSVs
|
||||
.sort() // Keep stable ordering for snapshots
|
||||
.map(([id, r]) => r.stdout);
|
||||
.map(v => v[1])
|
||||
|
||||
t.assert.snapshot(allCSV);
|
||||
t.assert.snapshot(csvs);
|
||||
});
|
||||
test("facebook: Can load the 2021 export zipped", async (t) => {
|
||||
const targets = await TaskTargetPipelineHelper.pipeline([
|
||||
|
|
@ -41,16 +41,18 @@ test("facebook: Can load the 2021 export zipped", async (t) => {
|
|||
|
||||
const finalTargets = await verify(targets2);
|
||||
const result = await parallel(finalTargets, true);
|
||||
for (const [id, r] of result.entries()) {
|
||||
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
|
||||
assert.ok(r.ok, `Task ${id} should be okay`);
|
||||
const idAndCSVs: [string, string][] = [];
|
||||
for (const [idx, r] of result.entries()) {
|
||||
const target = finalTargets[idx];
|
||||
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
||||
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
||||
idAndCSVs.push([target.id, r.stdout]);
|
||||
}
|
||||
|
||||
const allCSV = Array.from(result.entries())
|
||||
const csvs = idAndCSVs
|
||||
.sort() // Keep stable ordering for snapshots
|
||||
.map(([id, r]) => r.stdout);
|
||||
.map(v => v[1])
|
||||
|
||||
t.assert.snapshot(allCSV);
|
||||
t.assert.snapshot(csvs);
|
||||
});
|
||||
test("facebook: Can load the 2025 export", async (t) => {
|
||||
const targets = TaskTargetPipelineHelper.pipeline([
|
||||
|
|
@ -60,14 +62,16 @@ test("facebook: Can load the 2025 export", async (t) => {
|
|||
|
||||
const finalTargets = await verify(targets);
|
||||
const result = await parallel(finalTargets, true);
|
||||
for (const [id, r] of result.entries()) {
|
||||
assert.ok(!r.stderr, `Task ${id} should have no stderr output`);
|
||||
assert.ok(r.ok, `Task ${id} should be okay`);
|
||||
const idAndCSVs: [string, string][] = [];
|
||||
for (const [idx, r] of result.entries()) {
|
||||
const target = finalTargets[idx];
|
||||
assert.ok(!r.stderr, `Task ${target.id} should have no stderr output`);
|
||||
assert.ok(r.ok, `Task ${target.id} should be okay`);
|
||||
idAndCSVs.push([target.id, r.stdout]);
|
||||
}
|
||||
|
||||
const allCSV = Array.from(result.entries())
|
||||
const csvs = idAndCSVs
|
||||
.sort() // Keep stable ordering for snapshots
|
||||
.map(([id, r]) => r.stdout);
|
||||
.map(v => v[1])
|
||||
|
||||
t.assert.snapshot(allCSV);
|
||||
t.assert.snapshot(csvs);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -93,6 +93,7 @@ exports[`facebook: Can load the 2025 export 1`] = `
|
|||
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"some/path\\"\\n",
|
||||
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
|
||||
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
|
||||
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\"xxx\\"\\n",
|
||||
"\\"from\\",\\"to\\",\\"timestamp\\",\\"content\\"\\n\\"xxx\\",\\"<other>\\",\\"1970-01-01T00:00:00Z\\",\\n",
|
||||
"\\"action\\",\\"ip\\",\\"user_agent\\",\\"datr_cookie\\",\\"city\\",\\"region\\",\\"country\\",\\"site_name\\",\\"timestamp\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-22T10:06:40Z\\"\\n\\"xxx\\",\\"1.1.1.1\\",\\"some/path\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"xxx\\",\\"2024-11-21T23:00:00Z\\"\\n",
|
||||
"\\"timestamp\\",\\"data\\",\\"title\\"\\n\\"2024-02-13T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n\\"2024-07-12T02:06:40Z\\",\\"TODO\\",\\"xxx\\"\\n",
|
||||
|
|
|
|||
36
test/task.ts
36
test/task.ts
|
|
@ -7,9 +7,8 @@ import {
|
|||
glob as taskGlob,
|
||||
read,
|
||||
cmd,
|
||||
setId,
|
||||
assignMeta,
|
||||
verify,
|
||||
getTSVManifest,
|
||||
TaskTargetPipelineHelper,
|
||||
} from "../data-export/task.ts";
|
||||
|
||||
|
|
@ -19,11 +18,10 @@ const FIXTURE_FILE = nodePath.join(FIXTURE_DIR, 'friends/friends.json');
|
|||
|
||||
// -- TaskTarget ---------------------------------------------------------------
|
||||
|
||||
test("TaskTarget: constructor initializes path, pipeline, postFns", () => {
|
||||
test("TaskTarget: constructor initializes path, pipeline", () => {
|
||||
const t = new TaskTarget("/foo/bar");
|
||||
assert.equal(t.path, "/foo/bar");
|
||||
assert.deepEqual(t.pipeline, []);
|
||||
assert.deepEqual(t.postFns, []);
|
||||
});
|
||||
|
||||
test("TaskTarget: exists() returns true for a real file", () => {
|
||||
|
|
@ -50,12 +48,12 @@ test("TaskTarget: id throws when no idValue is set", () => {
|
|||
});
|
||||
|
||||
test("TaskTarget: id with a string value is safe-ified", () => {
|
||||
const t = new TaskTarget("/foo").setId("my-id");
|
||||
const t = new TaskTarget("/foo").assignMeta({ idValue: "my-id" });
|
||||
assert.equal(t.id, "my_id");
|
||||
});
|
||||
|
||||
test("TaskTarget: id with a function value is resolved against the target", () => {
|
||||
const t = new TaskTarget("/foo/bar").setId(tgt => tgt.basename);
|
||||
const t = new TaskTarget("/foo/bar").assignMeta({ idValue: tgt => tgt.basename });
|
||||
assert.equal(t.id, "bar");
|
||||
});
|
||||
|
||||
|
|
@ -92,12 +90,17 @@ test("TaskTarget: pushToPipeline throws if read is not the first op", () => {
|
|||
});
|
||||
|
||||
test("TaskTarget: clone produces an independent copy", () => {
|
||||
const t = new TaskTarget("/foo").setId("orig");
|
||||
const t = new TaskTarget("/foo").assignMeta({
|
||||
idValue: "orig",
|
||||
columnMeta: ["yeag"]
|
||||
});
|
||||
t.read();
|
||||
const c = t.clone();
|
||||
assert.equal(c.path, "/foo");
|
||||
assert.equal(c.id, "orig");
|
||||
assert(c.pipeline !== t.pipeline); // Different object references
|
||||
assert.equal(c.pipeline.length, 1);
|
||||
assert(c.columnMeta !== t.columnMeta); // Different object references
|
||||
c.path = "/other";
|
||||
assert.equal(t.path, "/foo"); // original unchanged
|
||||
});
|
||||
|
|
@ -176,9 +179,9 @@ test("cmd: clones and appends a cmd op to each target", () => {
|
|||
assert.equal(targets[0].pipeline.length, 1); // original unchanged
|
||||
});
|
||||
|
||||
test("setId: clones and sets id on each target", () => {
|
||||
test("assignMeta: clones and sets meta on each target", () => {
|
||||
const targets = [new TaskTarget("/a"), new TaskTarget("/b")];
|
||||
const result = setId(targets, "myid");
|
||||
const result = assignMeta(targets, { idValue: "myid" });
|
||||
assert.equal(result[0].id, "myid");
|
||||
assert.equal(result[1].id, "myid");
|
||||
assert.throws(() => targets[0].id); // originals have no id
|
||||
|
|
@ -223,21 +226,6 @@ test("verify: filters a mixed list to only valid targets", async () => {
|
|||
assert.equal(result[0], good);
|
||||
});
|
||||
|
||||
// -- getTSVManifest -----------------------------------------------------------
|
||||
|
||||
test("getTSVManifest: produces id<TAB>shell for a single target", () => {
|
||||
const t = new TaskTarget("/foo/bar.txt");
|
||||
t.setId("myid");
|
||||
t.read();
|
||||
assert.equal(getTSVManifest([t]), "myid\tcat /foo/bar.txt");
|
||||
});
|
||||
|
||||
test("getTSVManifest: joins multiple targets with newlines", () => {
|
||||
const t1 = new TaskTarget("/a.txt"); t1.setId("a"); t1.read();
|
||||
const t2 = new TaskTarget("/b.txt"); t2.setId("b"); t2.read();
|
||||
assert.equal(getTSVManifest([t1, t2]), "a\tcat /a.txt\nb\tcat /b.txt");
|
||||
});
|
||||
|
||||
// -- TaskTargetPipelineHelper -------------------------------------------------
|
||||
|
||||
test("TaskTargetPipelineHelper: pipeline() promotes a plain array", () => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue