Compare commits
1 commit
28d00763c6
...
257598a17c
| Author | SHA1 | Date | |
|---|---|---|---|
| 257598a17c |
1 changed files with 194 additions and 0 deletions
194
summary.ts
Normal file
194
summary.ts
Normal file
|
|
@ -0,0 +1,194 @@
|
||||||
|
import { type DatabaseSync } from "node:sqlite";
|
||||||
|
import { fileURLToPath } from "node:url";
|
||||||
|
import { facebook } from "./data-export/facebook.ts";
|
||||||
|
import { execPaths } from "./data-export/task.ts";
|
||||||
|
import { elapsed, loadTaskInNewDb } from "./main.ts";
|
||||||
|
|
||||||
|
// Filesystem path of this module, converted from its file URL. It is compared
// against process.argv[1] at the bottom of the file to decide whether to run main().
const __filename = fileURLToPath(import.meta.url);
|
||||||
|
|
||||||
|
// ── constants ────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Number of histogram bins produced for numeric columns. */
const N_BINS = 5;

/** How many of the most frequent values are listed for text columns. */
const TOP_N = 5;

/** Upper bound on the rows sampled when detecting a column's type. */
const SAMPLE_N = 500;

/** Maximum number of characters shown per value in the top-N list. */
const MAX_VAL_LEN = 20;
|
||||||
|
|
||||||
|
// ── helpers ──────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/** Column classification returned by detectType; selects which summarizer a column gets. */
type ColType = 'numeric' | 'bool' | 'text';
|
||||||
|
|
||||||
|
/**
 * SQL predicate matching values that count as "missing": a real SQLite NULL,
 * the empty string, or the literal text 'null'. It is written against a
 * column named/aliased "v". Not referenced elsewhere in this file.
 */
const NULL_FILTER = [
  `"v" IS NULL`,
  `"v" = ''`,
  `"v" = 'null'`,
].join(' OR ');
|
||||||
|
|
||||||
|
/**
 * Quote a table or column name for direct interpolation into SQL text.
 *
 * The name is wrapped in double quotes and any double quote inside it is
 * doubled, so a name containing `"` cannot terminate the identifier early.
 *
 * @param name Raw identifier as reported by the database.
 * @returns The identifier wrapped in double quotes, safe to splice into SQL.
 */
function q(name: string): string {
  return `"${name.replace(/"/g, '""')}"`;
}
|
||||||
|
|
||||||
|
/** Matches a plain decimal literal: optional sign, digits and/or fraction, optional exponent. */
const DECIMAL_LITERAL = /^[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?$/;

/**
 * True when `text` is a finite decimal number.
 *
 * `Number()` alone is too permissive here: it turns whitespace-only strings
 * into 0 and accepts 0x/0o/0b prefixed literals, so the text is first checked
 * against a strict decimal pattern (surrounding whitespace is tolerated).
 */
function isDecimalNumber(text: string): boolean {
  return DECIMAL_LITERAL.test(text.trim()) && Number.isFinite(Number(text));
}

/**
 * Detect a column's type from the first SAMPLE_N values that are not NULL,
 * not the empty string and not the literal text 'null'. The values are
 * stringified and tested in JS.
 *
 * @param db    Open database handle.
 * @param table Table to read from.
 * @param col   Column to classify.
 * @returns 'bool' when every sampled value is 'true' or 'false', 'numeric'
 *          when every sampled value is a finite decimal number, otherwise
 *          'text' (also when there is nothing to sample).
 */
function detectType(db: DatabaseSync, table: string, col: string): ColType {
  const column = q(col);
  const rows = db.prepare(
    `SELECT ${column} as v FROM ${q(table)}
     WHERE ${column} IS NOT NULL AND ${column} != '' AND ${column} != 'null'
     LIMIT ${SAMPLE_N}`
  ).all() as { v: unknown }[];

  // Nothing to inspect: fall back to the most general summary.
  if (rows.length === 0) return 'text';

  const vals = rows.map(r => String(r.v));

  if (vals.every(v => v === 'true' || v === 'false')) return 'bool';
  if (vals.every(isDecimalNumber)) return 'numeric';
  return 'text';
}
|
||||||
|
|
||||||
|
/**
 * Count every row in a table.
 *
 * @param db    Open database handle.
 * @param table Table to count.
 * @returns Total number of rows in `table`.
 */
function rowCount(db: DatabaseSync, table: string): number {
  const sql = `SELECT count(*) as n FROM ${q(table)}`;
  const row = db.prepare(sql).get() as { n: number };
  return row.n;
}
|
||||||
|
|
||||||
|
/**
 * Count the rows whose value in `col` is considered missing: SQL NULL, the
 * empty string, or the literal text 'null'.
 *
 * @param db    Open database handle.
 * @param table Table to scan.
 * @param col   Column to inspect.
 * @returns Number of rows with a missing value in `col`.
 */
function nullCount(db: DatabaseSync, table: string, col: string): number {
  const column = q(col);
  const sql = [
    `SELECT count(*) as n FROM ${q(table)}`,
    `WHERE ${column} IS NULL OR ${column} = '' OR ${column} = 'null'`,
  ].join('\n');
  const row = db.prepare(sql).get() as { n: number };
  return row.n;
}
|
||||||
|
|
||||||
|
// ── per-type summarizers ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Format a number compactly for display.
 *
 * Non-finite values and integers are printed as-is; everything else is
 * rounded to 4 significant figures with trailing fractional zeros removed.
 *
 * @param v Number to format.
 * @returns Display string for `v`.
 */
function fmt(v: number): string {
  if (!Number.isFinite(v)) return String(v);
  // Use integer form if it is an integer, otherwise 4 significant figures.
  if (Number.isInteger(v)) return String(v);

  const s = v.toPrecision(4);

  // toPrecision switches to exponential notation for very small or large
  // magnitudes (e.g. "1.000e-10"). Only the mantissa may lose trailing zeros;
  // stripping zeros from the whole string would eat digits of the exponent.
  const expAt = s.indexOf('e');
  const mantissa = expAt === -1 ? s : s.slice(0, expAt);
  const exponent = expAt === -1 ? '' : s.slice(expAt);

  // Drop trailing zeros after the decimal point (and the point itself if bare).
  const trimmed = mantissa.includes('.') ? mantissa.replace(/\.?0+$/, '') : mantissa;
  return trimmed + exponent;
}
|
||||||
|
|
||||||
|
/**
 * Summarize a numeric column: row count, missing count, value range and an
 * equal-width histogram with N_BINS bins.
 *
 * Values are read once with CAST(... AS REAL); the range and the bins are then
 * computed in JS from that single result set.
 *
 * @param db    Open database handle.
 * @param table Table containing the column.
 * @param col   Column to summarize.
 * @param n     Total row count of `table`, echoed in the summary.
 * @returns One-line summary. Only `n` and `nulls` are reported when the
 *          column has no usable values.
 */
function summarizeNumeric(db: DatabaseSync, table: string, col: string, n: number): string {
  const nulls = nullCount(db, table, col);
  const column = q(col);

  const vals = (db.prepare(
    `SELECT CAST(${column} AS REAL) as v FROM ${q(table)}
     WHERE ${column} IS NOT NULL AND ${column} != '' AND ${column} != 'null'`
  ).all() as { v: number }[]).map(r => r.v);

  if (vals.length === 0) return `n=${n}, nulls=${nulls}`;

  // A plain loop rather than Math.min(...vals): spreading a large array into
  // call arguments can exceed the engine's argument limit.
  let lo = Infinity;
  let hi = -Infinity;
  for (const v of vals) {
    if (v < lo) lo = v;
    if (v > hi) hi = v;
  }

  const header = `n=${n}, nulls=${nulls}, range=[${fmt(lo)}..${fmt(hi)}]`;
  const spread = hi - lo;

  // All values equal (or the spread is not a finite number): equal-width bins
  // are meaningless, so report a single bin covering the observed range
  // instead of inventing bin edges outside the data.
  if (spread === 0 || !Number.isFinite(spread)) {
    return `${header}, bins=[${fmt(lo)}..${fmt(hi)}:${vals.length}]`;
  }

  const binWidth = spread / N_BINS;
  const bins = new Array<number>(N_BINS).fill(0);
  for (const v of vals) {
    // The maximum value lands exactly on the upper edge; clamp it into the last bin.
    const idx = Math.min(Math.floor((v - lo) / binWidth), N_BINS - 1);
    bins[idx]++;
  }

  const binStr = bins
    .map((cnt, i) => `${fmt(lo + i * binWidth)}..${fmt(lo + (i + 1) * binWidth)}:${cnt}`)
    .join('|');

  return `${header}, bins=[${binStr}]`;
}
|
||||||
|
|
||||||
|
/**
 * Summarize a boolean-like column whose values are the strings 'true' and
 * 'false': row count, missing count, and how often each value occurs.
 *
 * @param db    Open database handle.
 * @param table Table containing the column.
 * @param col   Column to summarize.
 * @param n     Total row count of `table`, echoed in the summary.
 * @returns One-line summary string.
 */
function summarizeBool(db: DatabaseSync, table: string, col: string, n: number): string {
  // Counts the rows whose value equals the given literal text.
  const countEqual = (literal: 'true' | 'false'): number => {
    const sql = `SELECT count(*) as c FROM ${q(table)} WHERE ${q(col)} = '${literal}'`;
    const row = db.prepare(sql).get() as { c: number };
    return row.c;
  };

  const nulls = nullCount(db, table, col);
  const trueCount = countEqual('true');
  const falseCount = countEqual('false');

  return `n=${n}, nulls=${nulls}, true=${trueCount}, false=${falseCount}`;
}
|
||||||
|
|
||||||
|
/**
 * Summarize a text column: row count, missing count, distinct count, and
 * either the min/max value (high-cardinality columns) or the TOP_N most
 * frequent values (low-cardinality columns).
 *
 * @param db    Open database handle.
 * @param table Table containing the column.
 * @param col   Column to summarize.
 * @param n     Total row count of `table`, echoed in the summary.
 * @returns One-line summary string.
 */
function summarizeText(db: DatabaseSync, table: string, col: string, n: number): string {
  const nulls = nullCount(db, table, col);
  const column = q(col);
  const source = q(table);
  // Rows that hold a real value: not NULL, not '', not the text 'null'.
  const present = `${column} IS NOT NULL AND ${column} != '' AND ${column} != 'null'`;

  const distinct = (db.prepare(
    `SELECT count(distinct ${column}) as d FROM ${source}
     WHERE ${present}`
  ).get() as { d: number }).d;

  // High-cardinality columns (e.g. timestamps, IDs): show min/max instead of top-N
  const nonNull = n - nulls;
  if (distinct > nonNull / 2 && distinct > TOP_N) {
    const bounds = db.prepare(
      `SELECT MIN(${column}) as lo, MAX(${column}) as hi FROM ${source}
       WHERE ${present}`
    ).get() as { lo: string; hi: string };
    return `n=${n}, nulls=${nulls}, distinct=${distinct}, range=["${bounds.lo}".."${bounds.hi}"]`;
  }

  // Low-cardinality: show top-N most frequent values. The secondary sort key
  // makes the order (and which values make the cut) deterministic on ties.
  const topRows = db.prepare(
    `SELECT ${column} as v, count(*) as c FROM ${source}
     WHERE ${present}
     GROUP BY ${column} ORDER BY c DESC, ${column} ASC LIMIT ${TOP_N}`
  ).all() as { v: string; c: number }[];

  // Truncate by code point rather than UTF-16 unit so a cut never lands in
  // the middle of a surrogate pair (e.g. an emoji).
  const clip = (text: string): string => {
    const chars = Array.from(text);
    return chars.length > MAX_VAL_LEN
      ? chars.slice(0, MAX_VAL_LEN - 1).join('') + '…'
      : text;
  };

  const topStr = topRows
    .map(r => `"${clip(String(r.v))}"×${r.c}`)
    .join(', ');

  return `n=${n}, nulls=${nulls}, distinct=${distinct}, top=[${topStr}]`;
}
|
||||||
|
|
||||||
|
// ── table summarizer ─────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Print a summary of one table to stdout: the table name followed by one
 * line per column, using the summarizer that matches the column's detected
 * type.
 *
 * @param db        Open database handle.
 * @param tableName Table to summarize.
 */
function summarizeTable(db: DatabaseSync, tableName: string) {
  // One summarizer per detected column type.
  const summarizers: Record<ColType, (db: DatabaseSync, table: string, col: string, n: number) => string> = {
    numeric: summarizeNumeric,
    bool: summarizeBool,
    text: summarizeText,
  };

  const columns = db.prepare(`PRAGMA table_info(${q(tableName)})`).all() as { name: string }[];
  const total = rowCount(db, tableName);

  console.log(`${tableName}:`);
  columns.forEach(({ name }) => {
    const summarize = summarizers[detectType(db, tableName, name)];
    const summary = summarize(db, tableName, name, total);
    console.log(` ${name}: ${summary}`);
  });
}
|
||||||
|
|
||||||
|
// ── main ─────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
/**
 * Script entry point: resolve the export targets, load them into a new
 * database via loadTaskInNewDb, then print a summary of every table listed
 * in sqlite_master, ordered by name.
 *
 * The database handle is closed even if summarizing throws.
 */
async function main(): Promise<void> {
  console.log(`${elapsed()} - Building targets`);
  const targets = await execPaths([
    {path: "/home/cobertos/Seafile/archive/ExportedServiceData/facebook/formapcast_facebook-DEADNAME-May2021-json", op: facebook()}
    // {path: "/home/cobertos/Seafile/archive/ExportedServiceData/fitbit/FullHumanName", op: fitbit()}
  ]);
  console.log(`${elapsed()} - Found ${targets.filter(t => !t.aggregate).length} possible targets`);

  const db = await loadTaskInNewDb(targets);
  try {
    const tables = db.prepare(
      `SELECT name FROM sqlite_master WHERE type='table' ORDER BY name`
    ).all() as { name: string }[];

    console.log(`\n${'─'.repeat(60)}`);
    for (const { name } of tables) {
      summarizeTable(db, name);
    }
  } finally {
    // Release the handle on both the success and the failure path.
    db.close();
  }
}
|
||||||
|
|
||||||
|
// Run main() only when this file is the script Node was started with, not
// when it is imported by another module.
if (process.argv[1] === __filename) {
  // Handle rejection explicitly instead of leaving a floating promise:
  // report the error and signal failure through the exit code.
  main().catch((err: unknown) => {
    console.error(err);
    process.exitCode = 1;
  });
}
|
||||||
Loading…
Add table
Add a link
Reference in a new issue