base-data-manager/summary.ts

216 lines
8.6 KiB
TypeScript
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import { type DatabaseSync, type SQLOutputValue } from "node:sqlite";
import { fileURLToPath } from "node:url";
import { stringify } from "csv-stringify/sync";
import { facebook } from "./data-export/facebook.ts";
import { execPaths, COLUMN_TYPES } from "./data-export/task.ts";
import { elapsed, loadTaskInNewDb } from "./main.ts";
const __filename = fileURLToPath(import.meta.url);
type ColumnMetaType = keyof typeof COLUMN_TYPES;
interface MetadataRow {
id: string;
perRowTags?: string;
columnMeta: ColumnMetaType[];
columnNames: string[];
}
// ── query helpers ─────────────────────────────────────────────────────────────
function q(name: string) {
return `"${name}"`;
}
/** Format a number compactly: integer if whole, otherwise 4 sig figs */
function fmt(v: number): string {
if (!isFinite(v)) return String(v);
if (Number.isInteger(v)) return String(v);
return v.toPrecision(4).replace(/\.?0+$/, '');
}
/** Non-null filter: treats SQL NULL, empty string, and the literal "null" as missing */
function notNull(col: string) {
return `${q(col)} IS NOT NULL AND ${q(col)} != '' AND ${q(col)} != 'null'`;
}
function rowCount(db: DatabaseSync, table: string): number {
return (db.prepare(`SELECT count(*) as n FROM ${q(table)}`).get() as { n: number }).n;
}
function datetimeRange(db: DatabaseSync, table: string, col: string): string {
const r = db.prepare(
`SELECT MIN(${q(col)}) as lo, MAX(${q(col)}) as hi FROM ${q(table)} WHERE ${notNull(col)}`
).get() as { lo: string | null; hi: string | null };
if (!r.lo) return '(no dates)';
// Trim to date portion if it looks like a full ISO datetime — keeps the line shorter
const trim = (s: string) => s.length > 10 && s[10] === 'T' ? s.slice(0, 10) : s;
return `${trim(r.lo)}..${trim(r.hi!)}`;
}
function numericRange(db: DatabaseSync, table: string, col: string): { lo: number; hi: number } | null {
const r = db.prepare(
`SELECT MIN(CAST(${q(col)} AS REAL)) as lo, MAX(CAST(${q(col)} AS REAL)) as hi
FROM ${q(table)} WHERE ${notNull(col)}`
).get() as { lo: number | null; hi: number | null };
return r.lo !== null ? { lo: r.lo, hi: r.hi! } : null;
}
function topValues(db: DatabaseSync, table: string, col: string, n: number): { distinct: number; top: { v: string; c: number }[] } {
const distinct = (db.prepare(
`SELECT count(distinct ${q(col)}) as d FROM ${q(table)} WHERE ${notNull(col)}`
).get() as { d: number }).d;
const top = db.prepare(
`SELECT ${q(col)} as v, count(*) as c FROM ${q(table)} WHERE ${notNull(col)}
GROUP BY ${q(col)} ORDER BY c DESC LIMIT ${n}`
).all() as { v: string; c: number }[];
return { distinct, top };
}
// ── metadata parsing (mirrors timelinize.ts) ──────────────────────────────────
function getColumnNames(db: DatabaseSync, tableName: string): string[] {
return db.prepare(`PRAGMA table_info(${q(tableName)})`).all().map(c => (c as any).name) as string[];
}
function parseMetadataRow(db: DatabaseSync, row: Record<string, SQLOutputValue>): MetadataRow | undefined {
const { id, perRowTags, columnMeta: columnMetaCSV } = row;
if (!id || typeof id !== 'string') return undefined;
const columnNames = getColumnNames(db, id);
// columnMeta may be absent for tables without type annotations — still useful to show
let columnMeta: ColumnMetaType[] = [];
if (columnMetaCSV && typeof columnMetaCSV === 'string') {
const parsed = columnMetaCSV.split(',') as ColumnMetaType[];
if (parsed.length === columnNames.length) {
columnMeta = parsed;
}
}
return {
id,
perRowTags: typeof perRowTags === 'string' ? perRowTags : undefined,
columnMeta,
columnNames,
};
}
/** Maps semantic type names → the actual column name in this table (first match wins) */
function metaToNames(meta: MetadataRow): Partial<Record<ColumnMetaType, string>> {
const out: Partial<Record<ColumnMetaType, string>> = {};
for (const [idx, colName] of meta.columnNames.entries()) {
const type = meta.columnMeta[idx];
if (!type || out[type]) continue; // skip untyped or already-seen types
out[type] = colName;
}
return out;
}
// ── table row builder ─────────────────────────────────────────────────────────
const TOP_SENDERS = 3;
const MAX_SENDER_LEN = 20;
interface SummaryRow {
type: string;
id: string;
n: string;
dates: string;
senders: string;
geo: string;
tags: string;
}
function buildSummaryRow(db: DatabaseSync, meta: MetadataRow): SummaryRow {
const { id, perRowTags } = meta;
const typeMap = metaToNames(meta);
const n = rowCount(db, id);
// ── shape label ─────────────────────────────────────────────────────────────
let type: string;
if (typeMap.sender && typeMap.isodatetime) type = 'chat';
else if (typeMap.isodatetime) type = 'time';
else type = 'static';
if (typeMap.lat && typeMap.lng) type += '+geo';
// ── datetime range ──────────────────────────────────────────────────────────
const dates = typeMap.isodatetime ? datetimeRange(db, id, typeMap.isodatetime) : '';
// ── sender info ─────────────────────────────────────────────────────────────
let senders = '';
if (typeMap.sender) {
const { distinct, top } = topValues(db, id, typeMap.sender, TOP_SENDERS);
const topStr = top
.map(r => {
const name = r.v.length > MAX_SENDER_LEN ? r.v.slice(0, MAX_SENDER_LEN - 1) + '…' : r.v;
return `${name}×${r.c}`;
})
.join(', ');
senders = `${distinct} [${topStr}]`;
}
// ── geo ranges ──────────────────────────────────────────────────────────────
let geo = '';
const latR = typeMap.lat ? numericRange(db, id, typeMap.lat) : null;
const lngR = typeMap.lng ? numericRange(db, id, typeMap.lng) : null;
if (latR && lngR) {
geo = `[${fmt(latR.lo)}..${fmt(latR.hi)}] [${fmt(lngR.lo)}..${fmt(lngR.hi)}]`;
} else if (latR) {
geo = `lat=[${fmt(latR.lo)}..${fmt(latR.hi)}]`;
}
return { type, id, n: String(n), dates, senders, geo, tags: perRowTags ?? '' };
}
// ── CSV output ────────────────────────────────────────────────────────────────
const COLUMNS: { key: keyof SummaryRow; header: string }[] = [
{ key: 'type', header: 'type' },
{ key: 'id', header: 'id' },
{ key: 'n', header: 'n' },
{ key: 'dates', header: 'dates' },
{ key: 'senders', header: 'senders' },
{ key: 'geo', header: 'lat / lng' },
{ key: 'tags', header: 'tags' },
];
function printCSV(summaryRows: SummaryRow[]) {
const records = [
COLUMNS.map(c => c.header),
...summaryRows.map(row => COLUMNS.map(c => row[c.key])),
];
process.stdout.write(stringify(records));
}
// ── main ─────────────────────────────────────────────────────────────────────
async function main() {
process.stderr.write(`${elapsed()} - Building targets\n`);
const targets = await execPaths([
{path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
// {path: "/home/cobertos/Seafile/archive/ExportedServiceData/fitbit/FullHumanName", op: fitbit()}
]);
process.stderr.write(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets\n`);
const db = await loadTaskInNewDb(targets);
const rows = db.prepare(
`SELECT id, perRowTags, columnMeta FROM base_data_manager_metadata ORDER BY id`
).all() as Record<string, SQLOutputValue>[];
const summaryRows: SummaryRow[] = [];
for (const row of rows) {
const meta = parseMetadataRow(db, row);
if (!meta) continue;
summaryRows.push(buildSummaryRow(db, meta));
}
printCSV(summaryRows);
db.close();
}
if (process.argv[1] === __filename) {
main();
}