// base-data-manager/test/utils/csvUtils.ts
import { strict as assert } from "node:assert";
import { type TestContextAssert } from "node:test";
import { parse } from "csv-parse/sync";
/** Renders one `[id, csv]` pair as a labelled section for snapshot output. */
function formatCSVForSnapshot(id: string, csv: string) {
  const banner = `# === ${id} ===`;
  return [banner, csv].join("\n");
}
/**Custom serializer options for id + csv tuples. The default node:test snapshot serialization
 * results in CLI output that looks like the following
 * ```
 * '\n[\n "\\\\"album\\\\",\\\\"uri\\\\"...'
 * ```
 * which is nearly useless when trying to find what went wrong.
 * So instead we output the plain csvs + id in a flatter, plainer serialized format
 * to compare against
 *
 * @param idAndCSVs - `[id, csv]` tuples to snapshot; the array is NOT mutated
 * @returns the argument tuple to spread into `t.assert.snapshot(...)`
 */
export function idAndCSVsSnapshotOpts(idAndCSVs: [string, string][]): Parameters<TestContextAssert["snapshot"]> {
  // Renamed param so it doesn't shadow the outer `idAndCSVs`
  function idAndCSVsSnapshotSerializer(tuples: [string, string][]) {
    return tuples.map(([id, csv]) => formatCSVForSnapshot(id, csv));
  }
  // Keep stable ordering for snapshots. Sort a COPY so the caller's array is
  // not mutated as a side effect (Array.prototype.sort is in-place), and
  // compare id then csv explicitly instead of relying on the default
  // comparator's lossy String() coercion of each tuple ("id,csv").
  const sorted = [...idAndCSVs].sort((a, b) => {
    if (a[0] !== b[0]) {
      return a[0] < b[0] ? -1 : 1;
    }
    return a[1] < b[1] ? -1 : a[1] > b[1] ? 1 : 0;
  });
  return [sorted, {
    serializers: [idAndCSVsSnapshotSerializer]
  }];
}
/**Scores CSV rows on whether or not we can determine if it has headers
 * In this case
 * ```
 * score < 0 - First row does follow observed patterns in rows [1,rowsToSample), most likely does not have headers
 * score === 0 - Ambiguous (no observable pattern, or no rows at all)
 * score > 0 - First row does NOT follow patterns observed in rows [1,rowsToSample), most likely has headers
 * ```
 * Compare the output like `> 0` or `>= 0` depending on your needs
 *
 * The theory here comes from Python's implementation of has_headers which
 * does a similar thing
 * https://github.com/python/cpython/blob/main/Lib/csv.py#L453
 *
 * Scan over dataRows (every row after mightBeHeader) and collect the pattern of
 * the length of the values in the column as well as the type of the values
 * in that column.
 * If the mightBeHeader has the same type as the dataRows and is not a string
 * there's a good chance it's a header. Same if all the dataRows have the
 * same string length but the header has a different string length
 *
 * @param rows - parsed CSV rows; `rows[0]` is the header candidate
 * @param rowsToSample - how many rows after the first to sample for patterns
 */
function getHasHeaderScore(rows: string[][], rowsToSample = 20): number {
  // Guard: with no rows there is nothing to compare against. Return 0
  // (ambiguous) instead of crashing on `rows[0].entries()` below.
  if (rows.length === 0) {
    return 0;
  }
  const mightBeHeader = rows[0];
  const dataRows = rows
    .slice(1) // Remove header
    .slice(0, rowsToSample); // Select only the first rowsToSample rows
  // Classify a cell value. Anything Number() can parse counts as a number;
  // everything else is a string.
  function typeFromValue(v: string) {
    const maybeNum = Number(v);
    if (!isNaN(maybeNum)) {
      return "number" as const;
    }
    return "string" as const;
  }
  // Per-column pattern: `undefined` fields mean "no consistent pattern seen"
  interface ColumnInfo {
    type?: "number" | "string";
    length?: number;
  }
  function deriveColumnInfoFromValue(v: string) {
    return {
      type: typeFromValue(v),
      length: v.length
    };
  }
  // Fold a new observation into the accumulated pattern for a column;
  // any disagreement collapses that field to `undefined`.
  function combineColumnInfo(a: ColumnInfo | undefined, b: ColumnInfo) {
    if (!a) {
      // Don't have a previous value yet
      return b;
    }
    // Combine each piece of info, if it differs
    return {
      type: a.type === b.type ? a.type : undefined,
      length: a.length === b.length ? a.length : undefined
    };
  }
  // +1 per signal that the header candidate breaks the column's pattern,
  // -1 per signal that it matches (i.e. probably just another data row)
  function scoreColumnInfo(mightBeHeader: ColumnInfo, dataRow?: ColumnInfo) {
    let typeScore = 0;
    // Only non-string column types are informative: headers are usually
    // strings, so a string-typed column can't distinguish header from data
    if (dataRow?.type !== undefined && dataRow.type !== "string") {
      typeScore = dataRow.type !== mightBeHeader.type ? 1 : -1;
    }
    let lengthScore = 0;
    if (dataRow?.length !== undefined) {
      lengthScore = dataRow.length !== mightBeHeader.length ? 1 : -1;
    }
    return typeScore + lengthScore;
  }
  // Maps column index to the ColumnInfo derived for that row
  const colInfos: (ColumnInfo | undefined)[] = [];
  // For every sampled row, collect the pattern info across the columns
  for (const row of dataRows) {
    for (const [colIdx, value] of row.entries()) {
      const maybeColInfo = colInfos[colIdx];
      const newColInfo = deriveColumnInfoFromValue(value);
      colInfos[colIdx] = combineColumnInfo(maybeColInfo, newColInfo);
    }
  }
  // Score headers for differences from the above observed patterns
  let score = 0;
  for (const [idx, headerValue] of mightBeHeader.entries()) {
    const headerColInfo = deriveColumnInfoFromValue(headerValue);
    const maybeDataRowColInfo = colInfos[idx];
    score += scoreColumnInfo(headerColInfo, maybeDataRowColInfo);
  }
  return score;
}
/** True when getHasHeaderScore() judges the first row to be a header (strictly positive score). */
function hasHeader(rows: string[][], rowsToSample = 20): boolean {
  const score = getHasHeaderScore(rows, rowsToSample);
  return score > 0;
}
// Inline self-tests: run once at module load so a regression in hasHeader()
// fails fast for every test file that imports this module
const hasHeaderCases: [string[][], boolean, string][] = [
  [[["name", "place", "count"], ["who", "where", "2"], ["some", "one", "5"]], true, "Inline hasHeader unit-test 1"],
  [[["bingus_column", "nothing", "nothing"], ["bingus", "ggg", "hhhhh"], ["bingus", "ffff", "aaaaaaa"]], true, "Inline hasHeader unit-test 2"],
  [[["not", "a", "header"], ["marco", "polo", "afafaf"], ["g", "f", "a"]], false, "Inline hasHeader unit-test 3"],
];
for (const [rows, expected, message] of hasHeaderCases) {
  assert(hasHeader(rows) === expected, message);
}
/**Makes sure the csv passed follows a set of guidelines
 *
 * @param csv - the raw CSV text to validate
 * @param msg - optional context prefix for assertion failure messages
 * @throws AssertionError when the CSV has no rows or lacks a detectable header
 * @throws CsvError (from csv-parse) when rows have mismatched column counts etc.
 */
export function assertCSVWellFormed(csv: string, msg?: string) {
  // Fix: without this guard an omitted `msg` produced messages like
  // "undefined CSV had no rows"
  const prefix = msg ? `${msg} ` : "";
  // ends in a newline
  // TODO: Fix these, fitbit export apparently fails both of these :(
  //assert(csv[csv.length - 1] === "\n", `${prefix}CSV must end in a new line`);
  //assert(!csv.includes("\r"), `${prefix}CSV included carriage returns, but we dont want those in our output`);
  // This throws if:
  // * it finds mismatching lengths of rows
  // * ... others, see below
  // Also see https://csv.js.org/parse/errors/#runtime-errors
  const rows: string[][] = parse(csv, {
    record_delimiter: '\n', // Default is autodiscovery, but we only want '\n'
    // Explicitly define these even though they're the default. This is what we
    // want to cause a throw if the csv comes in poorly
    relax_column_count: false,
    relax_quotes: false,
    skip_records_with_error: false,
    skip_empty_lines: false,
    skip_records_with_empty_values: false
  });
  assert(rows.length > 0, `${prefix}CSV had no rows`);
  // Use >= 0 here so if it's ambiguous we just let it pass, some of the tables
  // we output don't have any "observable" patterns w.r.t how getHasHeaderScore()
  // works
  assert(getHasHeaderScore(rows) >= 0, `${prefix}CSVs should have headers`);
}