/**
 * summary.ts — prints a compact, per-column summary of every table in the
 * SQLite database returned by `loadTaskInNewDb`.
 *
 * Provenance: reconstructed from patch 257598a17c2a45203b1de342199347f6699f1ba5
 * ("Claude: add summary.ts", cobertos, Wed, 4 Mar 2026 11:00:35 +0000).
 */
import { type DatabaseSync } from "node:sqlite";
import { fileURLToPath } from "node:url";
import { facebook } from "./data-export/facebook.ts";
import { execPaths } from "./data-export/task.ts";
import { elapsed, loadTaskInNewDb } from "./main.ts";

const __filename = fileURLToPath(import.meta.url);

// ── constants ────────────────────────────────────────────────────────────────

const N_BINS = 5;        // histogram bin count for numeric columns
const TOP_N = 5;         // most-frequent values shown for text columns
const SAMPLE_N = 500;    // rows sampled for column type detection
const MAX_VAL_LEN = 20;  // max chars shown per value in top-N list

/** Export directory summarized when no path is given on the command line. */
const DEFAULT_EXPORT_PATH =
  "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json";

// ── helpers ──────────────────────────────────────────────────────────────────

type ColType = 'numeric' | 'bool' | 'text';

/**
 * Quote an SQL identifier. Embedded double quotes are doubled so that table or
 * column names containing `"` cannot break out of the quoted identifier.
 */
function q(name: string): string {
  return `"${name.replace(/"/g, '""')}"`;
}

/**
 * SQL predicate matching values treated as "missing" regardless of SQLite
 * NULL: NULL itself, the empty string, and the literal string 'null'.
 */
function missingFilter(col: string): string {
  const c = q(col);
  return `${c} IS NULL OR ${c} = '' OR ${c} = 'null'`;
}

/** SQL predicate matching every value that `missingFilter` does not. */
function presentFilter(col: string): string {
  const c = q(col);
  return `${c} IS NOT NULL AND ${c} != '' AND ${c} != 'null'`;
}

/** Count the rows of `table`, optionally restricted by a WHERE predicate. */
function countWhere(db: DatabaseSync, table: string, where?: string): number {
  const sql = `SELECT count(*) as n FROM ${q(table)}${where ? ` WHERE ${where}` : ''}`;
  const row = db.prepare(sql).get() as { n: number } | undefined;
  return row?.n ?? 0;
}

/**
 * Detect column type by sampling up to SAMPLE_N non-missing values and
 * testing them in JS.
 *
 * @returns 'bool' when every sampled value is 'true' or 'false', 'numeric'
 *          when every sampled value parses to a finite number, otherwise
 *          'text' (including when the column has no non-missing values).
 */
function detectType(db: DatabaseSync, table: string, col: string): ColType {
  const rows = db.prepare(
    `SELECT ${q(col)} as v FROM ${q(table)}
     WHERE ${presentFilter(col)}
     LIMIT ${SAMPLE_N}`
  ).all() as { v: unknown }[];

  if (rows.length === 0) return 'text';

  const vals = rows.map(r => String(r.v));

  if (vals.every(v => v === 'true' || v === 'false')) return 'bool';
  // Number('   ') is 0, so whitespace-only strings must be rejected explicitly.
  if (vals.every(v => v.trim() !== '' && Number.isFinite(Number(v)))) return 'numeric';
  return 'text';
}

/** Total number of rows in `table`. */
function rowCount(db: DatabaseSync, table: string): number {
  return countWhere(db, table);
}

/** Number of rows in `table` whose `col` value counts as missing. */
function nullCount(db: DatabaseSync, table: string, col: string): number {
  return countWhere(db, table, missingFilter(col));
}

// ── per-type summarizers ─────────────────────────────────────────────────────

/**
 * Format a number for display: integers and non-finite values verbatim,
 * everything else with 4 significant figures and no trailing zeros.
 */
function fmt(v: number): string {
  if (!Number.isFinite(v) || Number.isInteger(v)) return String(v);

  // toPrecision may return exponential notation ("1.500e-10"). Trailing zeros
  // are stripped from the mantissa only; stripping them from the whole string
  // would eat digits of the exponent and change the value.
  const repr = v.toPrecision(4);
  const ePos = repr.indexOf('e');
  const mantissa = ePos === -1 ? repr : repr.slice(0, ePos);
  const exponent = ePos === -1 ? '' : repr.slice(ePos);
  const trimmed = mantissa.includes('.') ? mantissa.replace(/\.?0+$/, '') : mantissa;
  return trimmed + exponent;
}

/**
 * Shorten `s` to at most MAX_VAL_LEN characters, ending in '…' when cut.
 * Counts code points rather than UTF-16 units so a surrogate pair (e.g. an
 * emoji) is never split in half.
 */
function truncate(s: string): string {
  if (s.length <= MAX_VAL_LEN) return s;
  const chars = Array.from(s);
  if (chars.length <= MAX_VAL_LEN) return s;
  return chars.slice(0, MAX_VAL_LEN - 1).join('') + '…';
}

/**
 * Summarize a numeric column: row count, missing count, value range and an
 * N_BINS-bucket equal-width histogram.
 *
 * @param n total row count of the table (as returned by `rowCount`)
 */
function summarizeNumeric(db: DatabaseSync, table: string, col: string, n: number): string {
  const nulls = nullCount(db, table, col);

  // One scan: pull every non-missing value, then derive range and bins in JS.
  const vals = (db.prepare(
    `SELECT CAST(${q(col)} AS REAL) as v FROM ${q(table)}
     WHERE ${presentFilter(col)}`
  ).all() as { v: number }[]).map(r => r.v);

  if (vals.length === 0) return `n=${n}, nulls=${nulls}`;

  let lo = Infinity;
  let hi = -Infinity;
  for (const v of vals) {
    if (v < lo) lo = v;
    if (v > hi) hi = v;
  }

  const spread = hi - lo;
  // A constant column has zero spread; use width 1 so bin labels stay finite.
  const binWidth = spread === 0 ? 1 : spread / N_BINS;

  const bins = new Array<number>(N_BINS).fill(0);
  for (const v of vals) {
    // The maximum value would land in bin N_BINS; clamp it into the last bin.
    const idx = spread === 0 ? 0 : Math.min(Math.floor((v - lo) / binWidth), N_BINS - 1);
    bins[idx] = (bins[idx] ?? 0) + 1;
  }

  const binStr = bins
    .map((cnt, i) => `${fmt(lo + i * binWidth)}..${fmt(lo + (i + 1) * binWidth)}:${cnt}`)
    .join('|');

  return `n=${n}, nulls=${nulls}, range=[${fmt(lo)}..${fmt(hi)}], bins=[${binStr}]`;
}

/**
 * Summarize a boolean column stored as the strings 'true' / 'false'.
 *
 * @param n total row count of the table
 */
function summarizeBool(db: DatabaseSync, table: string, col: string, n: number): string {
  const nulls = nullCount(db, table, col);
  const t = countWhere(db, table, `${q(col)} = 'true'`);
  const f = countWhere(db, table, `${q(col)} = 'false'`);
  return `n=${n}, nulls=${nulls}, true=${t}, false=${f}`;
}

/**
 * Summarize a text column. High-cardinality columns (e.g. timestamps, IDs)
 * show their min/max; low-cardinality columns show the TOP_N most frequent
 * values with their counts.
 *
 * @param n total row count of the table
 */
function summarizeText(db: DatabaseSync, table: string, col: string, n: number): string {
  const nulls = nullCount(db, table, col);
  const present = presentFilter(col);

  const distinct = (db.prepare(
    `SELECT count(distinct ${q(col)}) as d FROM ${q(table)}
     WHERE ${present}`
  ).get() as { d: number }).d;

  // High-cardinality: more than half of the non-missing values are distinct.
  const nonNull = n - nulls;
  if (distinct > nonNull / 2 && distinct > TOP_N) {
    const bounds = db.prepare(
      `SELECT MIN(${q(col)}) as lo, MAX(${q(col)}) as hi FROM ${q(table)}
       WHERE ${present}`
    ).get() as { lo: string; hi: string };
    return `n=${n}, nulls=${nulls}, distinct=${distinct}, range=["${bounds.lo}".."${bounds.hi}"]`;
  }

  // Low-cardinality: show top-N most frequent values
  const topRows = db.prepare(
    `SELECT ${q(col)} as v, count(*) as c FROM ${q(table)}
     WHERE ${present}
     GROUP BY ${q(col)} ORDER BY c DESC LIMIT ${TOP_N}`
  ).all() as { v: string; c: number }[];

  const topStr = topRows
    .map(r => `"${truncate(String(r.v))}"×${r.c}`)
    .join(', ');

  return `n=${n}, nulls=${nulls}, distinct=${distinct}, top=[${topStr}]`;
}

// ── table summarizer ─────────────────────────────────────────────────────────

/** Print the table name followed by one summary line per column. */
function summarizeTable(db: DatabaseSync, tableName: string): void {
  const cols = db.prepare(`PRAGMA table_info(${q(tableName)})`).all() as { name: string }[];
  const n = rowCount(db, tableName);

  console.log(`${tableName}:`);
  for (const { name } of cols) {
    let summary: string;
    switch (detectType(db, tableName, name)) {
      case 'numeric':
        summary = summarizeNumeric(db, tableName, name, n);
        break;
      case 'bool':
        summary = summarizeBool(db, tableName, name, n);
        break;
      default:
        summary = summarizeText(db, tableName, name, n);
        break;
    }
    console.log(`  ${name}: ${summary}`);
  }
}

// ── main ─────────────────────────────────────────────────────────────────────

/**
 * Build the export targets, load them into a database and print a summary of
 * every table. The export directory defaults to DEFAULT_EXPORT_PATH and can be
 * overridden by the first command-line argument.
 */
async function main(): Promise<void> {
  const exportPath = process.argv[2] ?? DEFAULT_EXPORT_PATH;

  console.log(`${elapsed()} - Building targets`);
  const targets = await execPaths([
    {path: exportPath, op: facebook()}
    // {path: "/home/cobertos/Seafile/archive/ExportedServiceData/fitbit/FullHumanName", op: fitbit()}
  ]);
  console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);

  const db = await loadTaskInNewDb(targets);
  try {
    const tables = db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' ORDER BY name`
    ).all() as { name: string }[];

    console.log(`\n${'─'.repeat(60)}`);
    for (const { name } of tables) {
      summarizeTable(db, name);
    }
  } finally {
    // Close the database even when a summarizer throws.
    db.close();
  }
}

// Run only when this file is executed directly, not when it is imported.
if (process.argv[1] === __filename) {
  main().catch((err: unknown) => {
    console.error(err);
    process.exitCode = 1;
  });
}