// base-data-manager/main.ts
//
// 90 lines
// No EOL
// 3.6 KiB
// TypeScript

import { type DatabaseSync } from "node:sqlite";
import { fileURLToPath } from "node:url";
import { google } from "./data-export/google.ts";
import { facebook, facebook_v2 } from "./data-export/facebook.ts";
import { type TaskTarget, execPaths } from "./data-export/task.ts";
import * as DataIO from "./data-export/io.ts";
// Filesystem path of this module, converted from its `import.meta.url`; compared
// against `process.argv[1]` further down to detect direct execution.
const __filename = fileURLToPath(import.meta.url);
/** Epoch timestamp (milliseconds) captured when this module was evaluated. */
export const startTime = Date.now();

/**
 * Formats the time passed since `startTime` as seconds with two decimal
 * places followed by an "s" suffix, e.g. `"1.25s"`.
 */
export const elapsed = () => {
  const secondsSinceStart = (Date.now() - startTime) / 1000;
  return secondsSinceStart.toFixed(2) + "s";
};
/**
 * Runs the pipeline for the given targets and loads its output into a
 * database obtained from `DataIO.getDefaultDB()`.
 *
 * Each step is logged to the console, prefixed with the elapsed time.
 *
 * @param targets - Task targets handed to `DataIO.runPipeline`.
 * @returns The database after `DataIO.loadIntoDb` has been awaited on it.
 */
export async function loadTaskInNewDb(targets: TaskTarget[]): Promise<DatabaseSync> {
  // Timestamped progress logger shared by every step below.
  const report = (message: string): void => {
    console.log(`${elapsed()} - ${message}`);
  };

  report("Run all targets");
  const exported = await DataIO.runPipeline(targets);
  report(`Final targets exported to CSV. Got ${exported.length} targets`);

  // TODO: Add an option to output everything plainly as CSV in a single directory
  report("Building combined database table in :memory:");
  const database = DataIO.getDefaultDB();
  await DataIO.loadIntoDb(database, exported);

  // The row count of base_data_manager_metadata is what gets reported as the
  // number of tables.
  const countQuery = database.prepare(`SELECT COUNT(*) as count FROM base_data_manager_metadata`);
  const countRow = countQuery.get()!;
  report(`Single database built with ${countRow.count} tables`);

  return database;
}
/**
 * Script entry point: builds the targets with `execPaths`, loads them into a
 * database through `loadTaskInNewDb`, and writes that database to disk with
 * `DataIO.dumpDBToDisk`. Progress is logged with elapsed-time prefixes.
 */
async function main() {
  // Configurable stuff
  const outputDbPath = 'your.db';

  console.log(`${elapsed()} - Building targets`);
  const builtTargets = await execPaths([
    {path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
    // {path: "/home/cobertos/Seafile/projects/base-data-manager/test/fixtures/facebook-json-2021-05-01", op: facebook()}
    // {path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/facebook-x-2025-11-29-x.zip", op: pipe(unzip(), facebook_v2())}
    // {path: "/home/cobertos/Seafile/archive/ExportedServiceData/google/2023-NAMEwork-001", op: facebook_v2()}
  ]);

  // Only targets whose `aggregate` flag is falsy are counted in the log line.
  let candidateCount = 0;
  for (const target of builtTargets) {
    if (!target.aggregate) {
      candidateCount += 1;
    }
  }
  console.log(`${elapsed()} - Found ${candidateCount} possible targets`);

  const database = await loadTaskInNewDb(builtTargets);

  console.log(`${elapsed()} - Writing database to disk at "${outputDbPath}"`);
  DataIO.dumpDBToDisk(database, outputDbPath);
  console.log(`${elapsed()} - Database written to disk`);
}
// Run the pipeline only when this file is the script passed to node, i.e. when
// process.argv[1] matches this module's own path; importing it runs nothing.
if (process.argv[1] === __filename) {
  // main() returns a promise; handle its rejection explicitly rather than
  // leaving it floating, so a failure is reported and yields a non-zero exit.
  main().catch((err: unknown) => {
    console.error(err);
    process.exitCode = 1;
  });
}
// TODO: Move this into here
// csvSink(
// summarization?: [string, string][]
// ) {
// // TODO:
// return this;
// // Ingest this csv into the database at the given id
// // this.cmd(t=>["sqlite-utils", "insert", "your.db", t.id, "-", "--csv", "--detect-types"]);
// // Add a post processing function for these targets that prints out the summarization
// // stats
// // this.post(async (t: TaskTarget)=>{
// // // We only do the first one so far for the summarization
// // let queryLine: string;
// // let formatFn: (r: any)=>string;
// // const [columnName, type] = summarization?.[0] ?? [undefined, undefined];
// // if (type === "numeric") {
// // queryLine = `min(${columnName}) as lo, max(${columnName}) as hi, count(*) as n`;
// // formatFn = (r: any)=>`${r.n} rows from ${r.lo} to ${r.hi} for ${t.id}`;
// // }
// // else {
// // queryLine = `count(*) as n`;
// // formatFn = (r: any)=>`${r.n} rows for ${t.id}`;
// // }
// // const cmd = "sqlite-utils";
// // const args = ["query", "your.db", `select ${queryLine} from ${t.id}`]
// // const { stdout, stderr } = await execFile(cmd, args);
// // const results = JSON.parse(stdout);
// // const result = results[0]; // should only be one result in the array for this type of query
// // const logLine = formatFn(result);
// // (t as any).log = logLine;
// // });
// // return this;
// }