Compare commits

...

1 commit

Author SHA1 Message Date
257598a17c Claude: add summary.ts 2026-03-04 11:00:35 +00:00

194
summary.ts Normal file
View file

@ -0,0 +1,194 @@
import { type DatabaseSync } from "node:sqlite";
import { fileURLToPath } from "node:url";
import { facebook } from "./data-export/facebook.ts";
import { execPaths } from "./data-export/task.ts";
import { elapsed, loadTaskInNewDb } from "./main.ts";
// Absolute filesystem path of this module; the entry-point guard at the bottom
// of the file compares it against process.argv[1].
const __filename = fileURLToPath(import.meta.url);
// ── constants ────────────────────────────────────────────────────────────────
const N_BINS = 5; // histogram bin count for numeric columns
const TOP_N = 5; // most-frequent values shown for text columns
const SAMPLE_N = 500; // rows sampled for column type detection
const MAX_VAL_LEN = 20; // max chars shown per value in top-N list
// ── helpers ──────────────────────────────────────────────────────────────────
/** Column classification returned by detectType(); summarizeTable() uses it to pick a summarizer. */
type ColType = 'numeric' | 'bool' | 'text';
/**
 * Values to treat as "missing" regardless of SQLite NULL: the predicate matches
 * SQL NULL, the empty string and the literal string 'null' on an identifier
 * named "v".
 *
 * NOTE(review): nothing in the visible part of this file references this
 * constant — the queries below inline an equivalent per-column predicate.
 * Confirm whether it is still needed.
 */
const NULL_FILTER = `"v" IS NULL OR "v" = '' OR "v" = 'null'`;
/**
 * Quote `name` so it can be interpolated into SQL as an identifier
 * (table or column name).
 *
 * The name is wrapped in double quotes and any embedded double quote is
 * doubled, so a name containing `"` cannot terminate the identifier early
 * and change the meaning of the surrounding statement.
 *
 * @param name Raw table or column name.
 * @returns The double-quoted, escaped identifier.
 */
function q(name: string): string {
  return `"${name.replace(/"/g, '""')}"`;
}
/**
 * Classify a column as 'bool', 'numeric' or 'text'.
 *
 * Reads at most SAMPLE_N values that are not NULL, not the empty string and
 * not the literal string 'null', converts each to a string, and inspects
 * them in JS:
 *   - no usable values at all      → 'text'
 *   - every value is true / false  → 'bool'
 *   - every value parses as finite → 'numeric'
 *   - anything else                → 'text'
 *
 * Only the sampled rows are inspected, so rows beyond the sample do not
 * influence the result.
 */
function detectType(db: DatabaseSync, table: string, col: string): ColType {
  const column = q(col);
  const sql = [
    `SELECT ${column} as v FROM ${q(table)}`,
    `WHERE ${column} IS NOT NULL AND ${column} != '' AND ${column} != 'null'`,
    `LIMIT ${SAMPLE_N}`,
  ].join('\n');
  const sample = (db.prepare(sql).all() as { v: unknown }[]).map(row => String(row.v));

  if (sample.length === 0) {
    return 'text';
  }

  const isBoolLiteral = (s: string): boolean => s === 'true' || s === 'false';
  // The empty-string guard matters because Number('') is 0, which is finite.
  const isNumberLiteral = (s: string): boolean => s !== '' && isFinite(Number(s));

  if (sample.every(s => isBoolLiteral(s))) {
    return 'bool';
  }
  if (sample.every(s => isNumberLiteral(s))) {
    return 'numeric';
  }
  return 'text';
}
/** Return the total number of rows in `table` (via `count(*)`). */
function rowCount(db: DatabaseSync, table: string): number {
  const statement = db.prepare(`SELECT count(*) as n FROM ${q(table)}`);
  const row = statement.get() as { n: number };
  return row.n;
}
/**
 * Count the rows of `table` whose `col` value is "missing": SQL NULL, the
 * empty string, or the literal string 'null'.
 */
function nullCount(db: DatabaseSync, table: string, col: string): number {
  const column = q(col);
  const sql = [
    `SELECT count(*) as n FROM ${q(table)}`,
    `WHERE ${column} IS NULL OR ${column} = '' OR ${column} = 'null'`,
  ].join('\n');
  const { n } = db.prepare(sql).get() as { n: number };
  return n;
}
// ── per-type summarizers ─────────────────────────────────────────────────────
/**
 * Format a number compactly for display.
 *
 * - Non-finite values (NaN, ±Infinity) and integers use their plain
 *   `String()` form.
 * - Everything else is rounded to 4 significant figures with trailing zeros
 *   removed from the fractional part.
 *
 * `toPrecision` may return exponential notation (e.g. "1.500e-10"). The
 * zero-stripping is applied to the mantissa only, so an exponent that
 * happens to end in 0 is left intact.
 *
 * @param v Number to format.
 * @returns Compact string representation of `v`.
 */
function fmt(v: number): string {
  if (!isFinite(v) || Number.isInteger(v)) return String(v);

  const s = v.toPrecision(4);
  // Split off the exponent (if any) so that only mantissa zeros are trimmed.
  const expAt = s.indexOf('e');
  const mantissa = expAt === -1 ? s : s.slice(0, expAt);
  const exponent = expAt === -1 ? '' : s.slice(expAt);

  // Drop trailing zeros after the decimal point, and the point itself if
  // nothing remains behind it.
  const trimmed = mantissa.includes('.') ? mantissa.replace(/\.?0+$/, '') : mantissa;
  return trimmed + exponent;
}
/**
 * Summarize a numeric column as a one-line string containing the row count,
 * missing-value count, value range and an N_BINS-bucket equal-width
 * histogram.
 *
 * Values that are NULL, empty or the literal string 'null' are excluded; the
 * rest are cast to REAL by SQLite. All remaining values are fetched in a
 * single query, and both the range and the histogram are computed from them
 * in JS, so the table is scanned once for the values rather than once for
 * MIN/MAX and again for binning.
 *
 * @param db    Open database handle.
 * @param table Table name (unquoted).
 * @param col   Column name (unquoted).
 * @param n     Total row count of the table, supplied by the caller.
 * @returns `n=…, nulls=…` when there are no usable values, otherwise
 *          `n=…, nulls=…, range=[lo..hi], bins=[a..b:count|…]`.
 */
function summarizeNumeric(db: DatabaseSync, table: string, col: string, n: number): string {
  const nulls = nullCount(db, table, col);
  const column = q(col);

  const vals = (db.prepare(
    [
      `SELECT CAST(${column} AS REAL) as v FROM ${q(table)}`,
      `WHERE ${column} IS NOT NULL AND ${column} != '' AND ${column} != 'null'`,
    ].join('\n')
  ).all() as { v: number }[]).map(r => r.v);

  if (vals.length === 0) return `n=${n}, nulls=${nulls}`;

  // Plain loop rather than Math.min(...vals): spreading a very large array
  // into an argument list can exceed the engine's argument limit.
  let lo = Infinity;
  let hi = -Infinity;
  for (const v of vals) {
    if (v < lo) lo = v;
    if (v > hi) hi = v;
  }

  const spread = hi - lo;
  // When every value is identical, use a nominal width of 1 so the bin labels
  // are still well-formed; all values then land in the first bin.
  const binWidth = spread === 0 ? 1 : spread / N_BINS;

  const bins = new Array<number>(N_BINS).fill(0);
  for (const v of vals) {
    // Clamp so that v === hi falls into the last bin instead of one past it.
    const idx = spread === 0 ? 0 : Math.min(Math.floor((v - lo) / binWidth), N_BINS - 1);
    bins[idx]++;
  }

  const binStr = bins
    .map((cnt, i) => `${fmt(lo + i * binWidth)}..${fmt(lo + (i + 1) * binWidth)}:${cnt}`)
    .join('|');
  return `n=${n}, nulls=${nulls}, range=[${fmt(lo)}..${fmt(hi)}], bins=[${binStr}]`;
}
/**
 * Summarize a boolean-like column: total rows, missing values, and how many
 * rows hold the string 'true' and the string 'false'.
 *
 * `n` is the table's total row count, supplied by the caller.
 */
function summarizeBool(db: DatabaseSync, table: string, col: string, n: number): string {
  const tableRef = q(table);
  const column = q(col);

  // Count the rows whose value equals the given literal.
  const countEqual = (literal: 'true' | 'false'): number => {
    const row = db
      .prepare(`SELECT count(*) as c FROM ${tableRef} WHERE ${column} = '${literal}'`)
      .get() as { c: number };
    return row.c;
  };

  const nulls = nullCount(db, table, col);
  const trueCount = countEqual('true');
  const falseCount = countEqual('false');
  return `n=${n}, nulls=${nulls}, true=${trueCount}, false=${falseCount}`;
}
/**
 * Shorten `s` for display to at most `maxLen` Unicode code points; when the
 * string is cut, the last kept position is replaced by '…'.
 *
 * Iterates by code point (not UTF-16 code unit) so that characters outside
 * the BMP, such as emoji, are never split in half. Stops as soon as the
 * limit is exceeded, so very long values are not fully traversed.
 */
function truncateValue(s: string, maxLen: number = MAX_VAL_LEN): string {
  // Code-unit length is an upper bound on code-point count: cheap fast path.
  if (s.length <= maxLen) return s;

  let kept = '';
  let count = 0;
  for (const ch of s) {
    count++;
    if (count > maxLen) return kept + '…'; // kept holds the first maxLen - 1 code points
    if (count < maxLen) kept += ch;
  }
  return s; // exactly maxLen code points or fewer: nothing to cut
}

/**
 * Summarize a text column as a one-line string.
 *
 * Values that are NULL, empty or the literal string 'null' count as missing.
 * - High-cardinality columns (more than half of the non-missing rows are
 *   distinct, and more than TOP_N distinct values) report the lexicographic
 *   min/max as a range.
 * - Other columns report the TOP_N most frequent values with their counts;
 *   ties on count are ordered by value so the output is deterministic.
 *
 * Every displayed value is truncated to MAX_VAL_LEN code points, in both the
 * range and the top-N forms, so one long value cannot blow up the summary
 * line.
 *
 * @param db    Open database handle.
 * @param table Table name (unquoted).
 * @param col   Column name (unquoted).
 * @param n     Total row count of the table, supplied by the caller.
 * @returns `n=…, nulls=…, distinct=…, range=["lo".."hi"]` or
 *          `n=…, nulls=…, distinct=…, top=["value"×count, …]`.
 */
function summarizeText(db: DatabaseSync, table: string, col: string, n: number): string {
  const nulls = nullCount(db, table, col);
  const column = q(col);
  const from = q(table);
  // Shared predicate selecting the non-missing values of the column.
  const present = `${column} IS NOT NULL AND ${column} != '' AND ${column} != 'null'`;

  const distinct = (db.prepare(
    `SELECT count(distinct ${column}) as d FROM ${from} WHERE ${present}`
  ).get() as { d: number }).d;

  // High-cardinality columns (e.g. timestamps, IDs): show min/max instead of top-N
  const nonNull = n - nulls;
  if (distinct > nonNull / 2 && distinct > TOP_N) {
    const bounds = db.prepare(
      `SELECT MIN(${column}) as lo, MAX(${column}) as hi FROM ${from} WHERE ${present}`
    ).get() as { lo: unknown; hi: unknown };
    const lo = truncateValue(String(bounds.lo));
    const hi = truncateValue(String(bounds.hi));
    return `n=${n}, nulls=${nulls}, distinct=${distinct}, range=["${lo}".."${hi}"]`;
  }

  // Low-cardinality: show top-N most frequent values. The secondary sort on
  // the value alias keeps the order stable when several values share a count.
  const topRows = db.prepare(
    `SELECT ${column} as v, count(*) as c FROM ${from} WHERE ${present} ` +
    `GROUP BY ${column} ORDER BY c DESC, v ASC LIMIT ${TOP_N}`
  ).all() as { v: unknown; c: number }[];
  const topStr = topRows
    .map(r => `"${truncateValue(String(r.v))}"×${r.c}`)
    .join(', ');
  return `n=${n}, nulls=${nulls}, distinct=${distinct}, top=[${topStr}]`;
}
// ── table summarizer ─────────────────────────────────────────────────────────
/**
 * Print a summary of one table to stdout: a `<table>:` header line followed
 * by one line per column.
 *
 * Columns are listed via `PRAGMA table_info`; each column's type is detected
 * with detectType() and the matching per-type summarizer produces its line.
 */
function summarizeTable(db: DatabaseSync, tableName: string): void {
  const columns = db.prepare(`PRAGMA table_info(${q(tableName)})`).all() as { name: string }[];
  const total = rowCount(db, tableName);

  // Dispatch table: detected column type → summarizer for that type.
  const summarizerFor: Record<
    ColType,
    (db: DatabaseSync, table: string, col: string, n: number) => string
  > = {
    numeric: summarizeNumeric,
    bool: summarizeBool,
    text: summarizeText,
  };

  console.log(`${tableName}:`);
  columns.forEach(column => {
    const summarize = summarizerFor[detectType(db, tableName, column.name)];
    const summary = summarize(db, tableName, column.name, total);
    console.log(` ${column.name}: ${summary}`);
  });
}
// ── main ─────────────────────────────────────────────────────────────────────
/**
 * Script entry point: build the export targets, load them into a database
 * and print a per-column summary of every table.
 *
 * Steps:
 *   1. Resolve targets for the hard-coded Facebook export path via execPaths().
 *   2. Load them with loadTaskInNewDb() (presumably a fresh SQLite database —
 *      its behaviour is defined in ./main.ts).
 *   3. List all tables from sqlite_master, ordered by name, and summarize each.
 *
 * The database handle is closed in a `finally` block so it is released even
 * if summarizing a table throws.
 */
async function main(): Promise<void> {
  console.log(`${elapsed()} - Building targets`);
  const targets = await execPaths([
    {path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
    // {path: "/home/cobertos/Seafile/archive/ExportedServiceData/fitbit/FullHumanName", op: fitbit()}
  ]);
  // Only targets whose `aggregate` flag is falsy are counted here.
  console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);

  const db = await loadTaskInNewDb(targets);
  try {
    const tables = db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' ORDER BY name`
    ).all() as { name: string }[];
    console.log(`\n${'─'.repeat(60)}`);
    for (const { name } of tables) {
      summarizeTable(db, name);
    }
  } finally {
    db.close();
  }
}
// Run main() only when this file is the script being executed, i.e. when
// process.argv[1] is this module's own path.
if (process.argv[1] === __filename) {
  // Handle rejection explicitly instead of leaving a floating promise:
  // report the error and signal failure through the exit code.
  main().catch((err: unknown) => {
    console.error(err);
    process.exitCode = 1;
  });
}