import { strict as assert } from "node:assert";
import { type TestContextAssert } from "node:test";
import { parse } from "csv-parse/sync";

/**
 * Renders a single CSV for a snapshot file: a `# === id ===` banner line
 * followed by the raw CSV text.
 *
 * @param id - Identifier printed in the banner line.
 * @param csv - Raw CSV text, emitted unchanged after the banner.
 * @returns The banner and the CSV joined by a newline.
 */
function formatCSVForSnapshot(id: string, csv: string): string {
  return `# === ${id} ===\n${csv}`;
}

/**
 * Custom serializer options for id + csv tuples. The default node:test
 * snapshot serialization results in CLI output that looks like the following
 * ```
 * '\n[\n "\\\\"album\\\\",\\\\"uri\\\\"...'
 * ```
 * which is nearly useless when trying to find what went wrong.
 * So instead we output the plain csvs + id in a flatter, plainer serialized
 * format to compare against.
 *
 * @param idAndCSVs - `[id, csv]` tuples to snapshot. Not mutated.
 * @returns The argument tuple (value, options) to spread into a snapshot
 * assertion: the sorted tuples and an options object carrying the serializer.
 */
export function idAndCSVsSnapshotOpts(
  idAndCSVs: [string, string][],
): Parameters<TestContextAssert["snapshot"]> {
  function idAndCSVsSnapshotSerializer(entries: [string, string][]) {
    return entries.map(([id, csv]) => formatCSVForSnapshot(id, csv));
  }

  // Keep stable ordering for snapshots. Sort a copy so the caller's array is
  // left in the order it was passed in.
  const sorted = [...idAndCSVs].sort();

  return [sorted, { serializers: [idAndCSVsSnapshotSerializer] }];
}

/**
 * Scores CSV rows on whether or not we can determine if it has headers.
 * In this case
 * ```
 * score < 0   - First row does follow observed patterns in rows [1,rowsToSample), most likely does not have headers
 * score === 0 - Ambiguous, no observable pattern either way
 * score > 0   - First row does NOT follow patterns observed in rows [1,rowsToSample), most likely has headers
 * ```
 * Compare the output like `> 0` or `>= 0` depending on your needs.
 *
 * The theory here comes from Python's implementation of has_headers which
 * does a similar thing
 * https://github.com/python/cpython/blob/main/Lib/csv.py#L453
 *
 * Scan over dataRows (every row after mightBeHeader) and collect the pattern of
 * the length of the values in the column as well as the type of the values
 * in that column.
 * If the dataRows agree on a non-string type for a column and mightBeHeader
 * has a different type there, there's a good chance it's a header.
 * Same if all the dataRows have the same string length but the header has a
 * different string length.
 *
 * @param rows - Parsed CSV rows; `rows[0]` is the header candidate.
 * @param rowsToSample - Maximum number of rows after the first to inspect.
 * @returns The summed per-column score; 0 when `rows` is empty.
 */
function getHasHeaderScore(rows: string[][], rowsToSample = 20): number {
  const mightBeHeader = rows[0];
  if (mightBeHeader === undefined) {
    // No rows at all: nothing to compare, so report "ambiguous" rather than
    // throwing on the missing first row.
    return 0;
  }

  const dataRows = rows
    .slice(1) // Remove header
    .slice(0, rowsToSample); // Select only the first rowsToSample rows

  function typeFromValue(v: string): "number" | "string" {
    // Note: Number("") is 0, so empty cells are classified as "number"
    return Number.isNaN(Number(v)) ? "string" : "number";
  }

  interface ColumnInfo {
    type?: "number" | "string";
    length?: number;
  }

  function deriveColumnInfoFromValue(v: string): ColumnInfo {
    return { type: typeFromValue(v), length: v.length };
  }

  function combineColumnInfo(a: ColumnInfo | undefined, b: ColumnInfo): ColumnInfo {
    if (!a) {
      // Don't have a previous value yet
      return b;
    }

    // Keep each piece of info only while every row agrees on it; once it is
    // undefined it can never match again, so it stays undefined
    return {
      type: a.type === b.type ? a.type : undefined,
      length: a.length === b.length ? a.length : undefined,
    };
  }

  function scoreColumnInfo(mightBeHeader: ColumnInfo, dataRow?: ColumnInfo): number {
    // A consistent "string" type tells us nothing, only score non-string types
    let typeScore = 0;
    if (dataRow?.type !== undefined && dataRow.type !== "string") {
      typeScore = dataRow.type !== mightBeHeader.type ? 1 : -1;
    }

    // Only score length when every sampled row had the same length
    let lengthScore = 0;
    if (dataRow?.length !== undefined) {
      lengthScore = dataRow.length !== mightBeHeader.length ? 1 : -1;
    }

    return typeScore + lengthScore;
  }

  // Maps column index to the ColumnInfo combined across the sampled rows
  const colInfos: (ColumnInfo | undefined)[] = [];

  // For every sampled row, collect the pattern info across the columns
  for (const row of dataRows) {
    for (const [colIdx, value] of row.entries()) {
      colInfos[colIdx] = combineColumnInfo(colInfos[colIdx], deriveColumnInfoFromValue(value));
    }
  }

  // Score headers for differences from the above observed patterns
  let score = 0;
  for (const [idx, headerValue] of mightBeHeader.entries()) {
    score += scoreColumnInfo(deriveColumnInfoFromValue(headerValue), colInfos[idx]);
  }

  return score;
}

/**
 * Whether the first row of `rows` most likely is a header row, i.e. whether
 * {@link getHasHeaderScore} is strictly positive.
 *
 * @param rows - Parsed CSV rows; `rows[0]` is the header candidate.
 * @param rowsToSample - Maximum number of rows after the first to inspect.
 */
function hasHeader(rows: string[][], rowsToSample = 20): boolean {
  return getHasHeaderScore(rows, rowsToSample) > 0;
}

// Inline tests, run once when this module is loaded
assert(
  hasHeader([
    ["name", "place", "count"],
    ["who", "where", "2"],
    ["some", "one", "5"],
  ]) === true,
  "Inline hasHeader unit-test 1",
);
assert(
  hasHeader([
    ["bingus_column", "nothing", "nothing"],
    ["bingus", "ggg", "hhhhh"],
    ["bingus", "ffff", "aaaaaaa"],
  ]) === true,
  "Inline hasHeader unit-test 2",
);
assert(
  hasHeader([
    ["not", "a", "header"],
    ["marco", "polo", "afafaf"],
    ["g", "f", "a"],
  ]) === false,
  "Inline hasHeader unit-test 3",
);

/**
 * Makes sure the csv passed follows a set of guidelines: it must parse
 * strictly with `\n` record delimiters, contain at least one row, and not
 * look like it is missing a header row.
 *
 * @param csv - Raw CSV text to validate.
 * @param msg - Optional prefix for assertion failure messages.
 * @throws AssertionError when the CSV has no rows or scores as header-less;
 * the parser throws on malformed input.
 */
export function assertCSVWellFormed(csv: string, msg?: string): void {
  // Only prepend the caller's message when one was given, so failures don't
  // read "undefined CSV ..."
  const prefix = msg === undefined ? "" : `${msg} `;

  // ends in a newline
  // TODO: Fix these, fitbit export apparently fails both of these :(
  //assert(csv[csv.length - 1] === "\n", `${prefix}CSV must end in a new line`);
  //assert(!csv.includes("\r"), `${prefix}CSV included carriage returns, but we dont want those in our output`);

  // This throws if:
  // * it finds mismatching lengths of rows
  // * ... others, see below
  // Also see https://csv.js.org/parse/errors/#runtime-errors
  const rows: string[][] = parse(csv, {
    record_delimiter: "\n", // Default is autodiscovery, but we only want '\n'
    // Explicitly define these even though they're the default. This is what we
    // want to cause a throw if the csv comes in poorly
    relax_column_count: false,
    relax_quotes: false,
    skip_records_with_error: false,
    skip_empty_lines: false,
    skip_records_with_empty_values: false,
  });

  assert(rows.length > 0, `${prefix}CSV had no rows`);

  // Use >= 0 here so if it's ambiguous we just let it pass, some of the tables
  // we output don't have any "observable" patterns w.r.t how getHasHeaderScore()
  // works
  assert(getHasHeaderScore(rows) >= 0, `${prefix}CSVs should have headers`);
}