import { strict as assert } from "node:assert";
import { type TestContextAssert } from "node:test";
import { parse } from "csv-parse/sync";

/**Renders one id + csv pair as a single snapshot entry: a `# === id ===`
 * banner line followed by the csv text exactly as given*/
function formatCSVForSnapshot(id: string, csv: string) {
  const banner = "# === " + id + " ===";
  return [banner, csv].join("\n");
}

/**Builds the argument list for `t.assert.snapshot(...)` for a list of
 * `[id, csv]` tuples, using a custom serializer.
 *
 * The default node:test snapshot serialization results in CLI output that
 * looks like the following
 * ```
 * '\n[\n  "\\\\"album\\\\",\\\\"uri\\\\"...'
 * ```
 * which makes it very hard to find what actually went wrong.
 * So instead we output the plain csvs + id in a flatter, plainer serialized
 * format to compare against.
 *
 * @param idAndCSVs `[id, csv]` tuples to snapshot. This array is NOT modified,
 * a sorted copy is what ends up in the returned arguments.
 * @returns `[value, options]`, meant to be spread into `snapshot(...)`
 */
export function idAndCSVsSnapshotOpts(idAndCSVs: [string, string][]): Parameters<TestContextAssert["snapshot"]> {
  // NOTE(review): this returns an array of strings rather than a single
  // string, so the final snapshot text depends on how node:test stringifies
  // the serializer result - confirm that is intended
  function idAndCSVsSnapshotSerializer(entries: [string, string][]): string[] {
    return entries.map(([id, csv]) => formatCSVForSnapshot(id, csv));
  }
  // Keep stable ordering for snapshots. Sort a copy so the caller's array is
  // left untouched; the default comparator is kept on purpose so the ordering
  // of already-recorded snapshots does not change
  const sortedIdAndCSVs = [...idAndCSVs].sort();
  return [sortedIdAndCSVs, {
    serializers: [idAndCSVsSnapshotSerializer]
  }];
}

/**Scores how likely it is that the first row of the given CSV rows is a header
 * ```
 * score < 0   - First row follows the patterns observed in the sampled data rows, most likely does NOT have headers
 * score === 0 - Ambiguous, no pattern was observed either way (also returned when rows is empty)
 * score > 0   - First row does NOT follow the patterns observed in the sampled data rows, most likely has headers
 * ```
 * Compare the output like `> 0` or `>= 0` depending on your needs
 *
 * The theory here comes from Python's implementation of has_headers which
 * does a similar thing
 * https://github.com/python/cpython/blob/main/Lib/csv.py#L453
 *
 * Scan over dataRows (the first rowsToSample rows after mightBeHeader) and
 * collect, per column, the type of the values and the length of the values,
 * keeping each only while every sampled value agrees on it.
 * Then for every column of mightBeHeader:
 * * If the dataRows are consistently numbers: +1 when the header value is not
 *   a number (good chance it's a header), -1 when it is also a number.
 *   Columns of strings or of mixed types say nothing about the type
 * * If the dataRows all have the same string length: +1 when the header value
 *   has a different length, -1 when it has the same length. Columns of mixed
 *   lengths say nothing about the length
 *
 * @param rows Parsed CSV rows, the first row being the header candidate
 * @param rowsToSample Max number of rows after the first one to derive patterns from
 * @returns The sum of the per column scores described above
 */
function getHasHeaderScore(rows: string[][], rowsToSample = 20): number {
  const mightBeHeader = rows[0];
  if (mightBeHeader === undefined) {
    // No rows at all: there is nothing to compare against, so report
    // "ambiguous" instead of throwing on the missing first row
    return 0;
  }
  const dataRows = rows
    .slice(1) // Remove header
    .slice(0, rowsToSample); // Select only the first rowsToSample rows

  type ValueType = "number" | "string";

  /**Pattern shared by the sampled values of one column. A field is undefined
   * once the sampled values disagree on it*/
  interface ColumnInfo {
    type?: ValueType;
    length?: number;
  }

  function typeFromValue(v: string): ValueType {
    // Anything Number() can convert counts as a number. Note that Number("")
    // is 0, so empty values are classified as "number" too
    return Number.isNaN(Number(v)) ? "string" : "number";
  }
  function deriveColumnInfoFromValue(v: string): ColumnInfo {
    return {
      type: typeFromValue(v),
      length: v.length
    };
  }
  function combineColumnInfo(a: ColumnInfo | undefined, b: ColumnInfo): ColumnInfo {
    if (!a) {
      // Don't have a previous value yet
      return b;
    }

    // Keep each piece of info only while both sides agree on it. Once a piece
    // is undefined it can never match again, so it stays undefined
    return {
      type: a.type === b.type ? a.type : undefined,
      length: a.length === b.length ? a.length : undefined
    };
  }
  function scoreColumnInfo(headerInfo: ColumnInfo, dataInfo?: ColumnInfo): number {
    let typeScore = 0;
    // Strings are the fallback type so they carry no signal, only a
    // consistently non-string column is scored
    if (dataInfo?.type !== undefined && dataInfo.type !== "string") {
      typeScore = dataInfo.type !== headerInfo.type ? 1 : -1;
    }
    let lengthScore = 0;
    if (dataInfo?.length !== undefined) {
      lengthScore = dataInfo.length !== headerInfo.length ? 1 : -1;
    }
    return typeScore + lengthScore;
  }

  // Maps column index to the ColumnInfo derived for that column
  const colInfos: (ColumnInfo | undefined)[] = [];
  // For every sampled row, collect the pattern info across the columns
  for (const row of dataRows) {
    for (const [colIdx, value] of row.entries()) {
      colInfos[colIdx] = combineColumnInfo(colInfos[colIdx], deriveColumnInfoFromValue(value));
    }
  }

  // Score headers for differences from the above observed patterns. Header
  // columns with no sampled data have no ColumnInfo and so score 0
  let score = 0;
  for (const [idx, headerValue] of mightBeHeader.entries()) {
    score += scoreColumnInfo(deriveColumnInfoFromValue(headerValue), colInfos[idx]);
  }

  return score;
}
/**Whether the first row of rows most likely is a header, i.e. whether
 * getHasHeaderScore() comes out strictly positive. An ambiguous score of 0
 * counts as "no header"
 *
 * @param rows Parsed CSV rows, the first row being the header candidate
 * @param rowsToSample Passed straight through to getHasHeaderScore()
 */
function hasHeader(rows: string[][], rowsToSample = 20): boolean {
  const headerScore = getHasHeaderScore(rows, rowsToSample);
  return headerScore > 0;
}

// Inline tests, these run once when the module is loaded and throw on the
// first case where hasHeader() disagrees with the expected value
{
  const inlineHasHeaderCases: [rows: string[][], expected: boolean, message: string][] = [
    [
      [["name", "place", "count"], ["who", "where", "2"], ["some", "one", "5"]],
      true,
      "Inline hasHeader unit-test 1"
    ],
    [
      [["bingus_column", "nothing", "nothing"], ["bingus", "ggg", "hhhhh"], ["bingus", "ffff", "aaaaaaa"]],
      true,
      "Inline hasHeader unit-test 2"
    ],
    [
      [["not", "a", "header"], ["marco", "polo", "afafaf"], ["g", "f", "a"]],
      false,
      "Inline hasHeader unit-test 3"
    ]
  ];
  for (const [rows, expected, message] of inlineHasHeaderCases) {
    assert(hasHeader(rows) === expected, message);
  }
}

/**Makes sure the csv passed follows a set of guidelines:
 * * it parses under strict csv-parse settings with `\n` as the record delimiter
 * * it has at least one row
 * * its first row does not look like just another data row, see getHasHeaderScore()
 *
 * @param csv Raw CSV text to check
 * @param msg Optional context that is prepended to the assertion messages
 * @throws AssertionError when the CSV has no rows or most likely has no
 * headers. Errors thrown by csv-parse for malformed input are not caught here
 */
export function assertCSVWellFormed(csv: string, msg?: string) {
  // Only prefix the messages when context was actually given, otherwise the
  // template string would render the literal text "undefined"
  const msgPrefix = msg === undefined ? "" : `${msg} `;

  // ends in a newline
  // TODO: Fix these, fitbit export apparently fails both of these :(
  //assert(csv[csv.length - 1] === "\n", `${msg} CSV must end in a new line`);
  //assert(!csv.includes("\r"), `${msg} CSV included carriage returns, but we dont want those in our output`);

  // This throws if:
  // * it finds mismatching lengths of rows
  // * ... others, see below
  // Also see https://csv.js.org/parse/errors/#runtime-errors
  const rows: string[][] = parse(csv, {
    record_delimiter: '\n', // Default is autodiscovery, but we only want '\n'

    // Explicitly define these even though they're the default. This is what we
    // want to cause a throw if the csv comes in poorly
    relax_column_count: false,
    relax_quotes: false,
    skip_records_with_error: false,
    skip_empty_lines: false,
    skip_records_with_empty_values: false
  });

  assert(rows.length > 0, `${msgPrefix}CSV had no rows`);
  // Use >= 0 here so if it's ambiguous we just let it pass, some of the tables
  // we output don't have any "observable" patterns w.r.t how getHasHeaderScore()
  // works
  assert(getHasHeaderScore(rows) >= 0, `${msgPrefix}CSVs should have headers`);
}