154 lines
5.9 KiB
TypeScript
154 lines
5.9 KiB
TypeScript
import { strict as assert } from "node:assert";
|
|
import { type TestContextAssert } from "node:test";
|
|
import { parse } from "csv-parse/sync";
|
|
|
|
function formatCSVForSnapshot(id: string, csv: string) {
|
|
return `# === ${id} ===\n${csv}`;
|
|
}
|
|
|
|
/**Custom serializer options for id + csv tuples. The default node:test snapshot serialization
|
|
* results in CLI output that looks like the following
|
|
* ```
|
|
* '\n[\n "\\\\"album\\\\",\\\\"uri\\\\"...'
|
|
* ```
|
|
* which is nearly useless to try to find what went wrong in
|
|
* So instead we output the plain csvs + id in a flatter, plainer serialized format
|
|
* to compare against*/
|
|
export function idAndCSVsSnapshotOpts(idAndCSVs: [string, string][]): Parameters<TestContextAssert["snapshot"]> {
|
|
function idAndCSVsSnapshotSerializer(idAndCSVs: [string, string][]) {
|
|
return idAndCSVs.map((idAndCSV)=>formatCSVForSnapshot(...idAndCSV));
|
|
}
|
|
// Keep stable ordering for snapshots
|
|
idAndCSVs.sort();
|
|
return [idAndCSVs, {
|
|
serializers: [idAndCSVsSnapshotSerializer]
|
|
}];
|
|
}
|
|
|
|
/**Scores CSV rows on whether or not we can determine if it has headers
|
|
* In this case
|
|
* ```
|
|
* score < 0 - First row does follow observed patterns in rows [1,rowsToSample), most likely does not have headers
|
|
* score === 0 - Header
|
|
* score > 0 - First row does NOT follow patterns observed in rows [1,rowsToSample), most likely has headers
|
|
* ```
|
|
* Compare the output like `> 0` or `>= 0` depending on your needs
|
|
*
|
|
* The theory here comes from Python's implementation of has_headers which
|
|
* does a similar thing
|
|
* https://github.com/python/cpython/blob/main/Lib/csv.py#L453
|
|
*
|
|
* Scan over dataRows (every row after mightBeHeader) and collect the pattern of
|
|
* the length of the values in the column as well as the type of the values
|
|
* in that column.
|
|
* If the mightBeHeader has the same type as the dataRows and is not a string
|
|
* there's a good chance it's a header. Same if all the dataRows have the
|
|
* same string length but the header has a different string length
|
|
*/
|
|
function getHasHeaderScore(rows: string[][], rowsToSample = 20): number {
|
|
const mightBeHeader = rows[0];
|
|
const dataRows = rows
|
|
.slice(1) // Remove header
|
|
.slice(0, rowsToSample); // Select only the first rowsToSample rows
|
|
|
|
function typeFromValue(v: string) {
|
|
const maybeNum = Number(v);
|
|
if (!isNaN(maybeNum)) {
|
|
return "number" as const;
|
|
}
|
|
return "string" as const;
|
|
}
|
|
|
|
interface ColumnInfo {
|
|
type?: "number" | "string";
|
|
length?: number;
|
|
}
|
|
function deriveColumnInfoFromValue(v: string) {
|
|
return {
|
|
type: typeFromValue(v),
|
|
length: v.length
|
|
};
|
|
}
|
|
function combineColumnInfo(a: ColumnInfo | undefined, b: ColumnInfo) {
|
|
if (!a) {
|
|
// Don't have a previous value yet
|
|
return b;
|
|
}
|
|
|
|
// Combine each piece of info, if it differs
|
|
return {
|
|
type: a.type === b.type ? a.type : undefined,
|
|
length: a.length === b.length ? a.length : undefined
|
|
};
|
|
}
|
|
function scoreColumnInfo(mightBeHeader: ColumnInfo, dataRow?: ColumnInfo) {
|
|
let typeScore = 0;
|
|
if (dataRow?.type !== undefined && dataRow.type !== "string") {
|
|
typeScore = dataRow.type !== mightBeHeader.type ? 1 : -1;
|
|
}
|
|
let lengthScore = 0;
|
|
if (dataRow?.length !== undefined) {
|
|
lengthScore = dataRow.length !== mightBeHeader.length ? 1 : -1;
|
|
}
|
|
return typeScore + lengthScore;
|
|
}
|
|
|
|
// Maps column index to the ColumnInfo derived for that row
|
|
const colInfos: (ColumnInfo | undefined)[] = [];
|
|
// For every sampled row, collect the pattern info across the columns
|
|
for (const row of dataRows) {
|
|
for (const [colIdx, value] of row.entries()) {
|
|
const maybeColInfo = colInfos[colIdx];
|
|
const newColInfo = deriveColumnInfoFromValue(value);
|
|
colInfos[colIdx] = combineColumnInfo(maybeColInfo, newColInfo);
|
|
}
|
|
}
|
|
|
|
// Score headers for differences from the above observed patterns
|
|
let score = 0;
|
|
for (const [idx, headerValue] of mightBeHeader.entries()) {
|
|
const headerColInfo = deriveColumnInfoFromValue(headerValue);
|
|
const maybeDataRowColInfo = colInfos[idx];
|
|
score += scoreColumnInfo(headerColInfo, maybeDataRowColInfo);
|
|
}
|
|
|
|
return score;
|
|
}
|
|
function hasHeader(rows: string[][], rowsToSample = 20): boolean {
|
|
return getHasHeaderScore(rows, rowsToSample) > 0;
|
|
}
|
|
|
|
// Inline test
|
|
assert(hasHeader([["name", "place", "count"], ["who", "where", "2"], ["some", "one", "5"]]) === true, "Inline hasHeader unit-test 1");
|
|
assert(hasHeader([["bingus_column", "nothing", "nothing"], ["bingus", "ggg", "hhhhh"], ["bingus", "ffff", "aaaaaaa"]]) === true, "Inline hasHeader unit-test 2");
|
|
assert(hasHeader([["not", "a", "header"], ["marco", "polo", "afafaf"], ["g", "f", "a"]]) === false, "Inline hasHeader unit-test 3");
|
|
|
|
/**Makes sure the csv passed follows a set of guidelines*/
|
|
export function assertCSVWellFormed(csv: string, msg?: string) {
|
|
// ends in a newline
|
|
// TODO: Fix these, fitbit export apparently fails both of these :(
|
|
//assert(csv[csv.length - 1] === "\n", `${msg} CSV must end in a new line`);
|
|
//assert(!csv.includes("\r"), `${msg} CSV included carriage returns, but we dont want those in our output`);
|
|
|
|
// This throws if:
|
|
// * it finds mismatching lengths of rows
|
|
// * ... others, see below
|
|
// Also see https://csv.js.org/parse/errors/#runtime-errors
|
|
const rows = parse(csv, {
|
|
record_delimiter: '\n', // Default is autodiscovery, but we only want '\n'
|
|
|
|
// Explicitly define these even though they're the default. This is what we
|
|
// want to cause a throw if the csv comes in poorly
|
|
relax_column_count: false,
|
|
relax_quotes: false,
|
|
skip_records_with_error: false,
|
|
skip_empty_lines: false,
|
|
skip_records_with_empty_values: false
|
|
});
|
|
|
|
assert(rows.length > 0, `${msg} CSV had no rows`);
|
|
// Use >= 0 here so if it's ambiguous we just let it pass, some of the tables
|
|
// we output done have any "observable" patterns w.r.t how getHasHeaderScore()
|
|
// works
|
|
assert(getHasHeaderScore(rows) >= 0, `${msg} CSVs should have headers`);
|
|
}
|