base-data-manager/util/scrub.ts

108 lines
3.6 KiB
JavaScript
Executable file

#!/usr/bin/env -S node
import { strict as assert } from "node:assert";
import fs from "node:fs/promises";
import { fileURLToPath } from "node:url";
import { parse } from "csv-parse/sync";
import { stringify } from "csv-stringify/sync";
import { $, path, minimist } from "zx";
import { scrubPrimitive } from "./scrub_primitive.ts";
// Turn on zx's verbose mode for every `$` shell invocation in this script
// (per the zx docs this echoes the commands being run and their output).
$.verbose = true;
/**
 * Awaits `p` and reports the outcome as a tuple instead of throwing.
 *
 * @param p Promise to settle.
 * @returns `[undefined, value]` when `p` fulfils, or `[error, undefined]`
 *          when it rejects. The rejection reason is passed through as-is and
 *          merely typed as `TError`; no runtime check is performed on it.
 */
async function ptry<TRet, TError = Error>(
  p: Promise<TRet>
): Promise<[TError, undefined] | [undefined, TRet]> {
  type Outcome = [TError, undefined] | [undefined, TRet];

  const onFulfilled = (value: TRet): Outcome => [undefined, value];
  const onRejected = (reason: unknown): Outcome => [reason as TError, undefined];

  return Promise.resolve(p).then(onFulfilled, onRejected);
}
/**
 * Error signalling that no scrubbing method exists for a given file.
 *
 * `name` is set explicitly so that stringified errors and stack traces read
 * `UnsupportedScrubError: ...` rather than the generic `Error: ...`.
 */
class UnsupportedScrubError extends Error {
  override name = "UnsupportedScrubError";
}
/** Optional settings accepted when scrubbing a file. */
interface ScrubOptions {
  /**
   * Explicit file type supplied by the caller (e.g. from `--override-type`),
   * for files whose extension does not identify them.
   */
  overrideType?: "csv" | "json";
  /**
   * CSV only: when true, the first row is treated as a header and copied to
   * the output without being scrubbed. Treated as `false` when omitted.
   */
  hasHeaders?: boolean;
}
/**
 * Scrubs a single CSV or JSON file, writing the scrubbed copy to `outFile`.
 *
 * The file type is `options.overrideType` when one is given; otherwise it is
 * derived from the `.csv` / `.json` extension of `inFile`.
 *
 * - CSV: every cell is passed through `scrubPrimitive` and at most 20 data
 *   rows are kept. With `options.hasHeaders` the first row is copied through
 *   unscrubbed and does not count toward that cap.
 * - JSON: the file is piped through `jq`, running the `scrub` filter from the
 *   `scrub` module that jq looks up in `scriptDir`.
 *
 * @param inFile  Path of the file to read.
 * @param outFile Path the scrubbed result is written to.
 * @param options Optional type override and CSV header flag.
 * @returns A tuple whose first element is `undefined` on success or the error
 *          that occurred (an `UnsupportedScrubError` when no scrubbing method
 *          applies). Read, parse, stringify, write and jq failures are
 *          returned in the tuple rather than thrown.
 */
async function scrubFile(inFile: string, outFile: string, options?: ScrubOptions): Promise<[(Error | undefined), undefined]> {
  // `||` rather than `??` on purpose: a bare `--override-type` flag reaches us
  // as "", which has to mean "no override" and fall back to the extension.
  const fileType = options?.overrideType
    || (inFile.endsWith(".csv") ? "csv" : inFile.endsWith(".json") ? "json" : undefined);

  if (fileType === "csv") {
    const [readErr, inCSV] = await ptry(fs.readFile(inFile, { encoding: "utf8" }));
    if (readErr) {
      return [readErr, undefined];
    }
    const hasHeaders = options?.hasHeaders ?? false;
    const MAX_ROWS = 20;
    let outCSV: string;
    try {
      // parse/stringify are synchronous and throw on malformed input; catch
      // here so callers get the same error tuple as for I/O failures.
      const rows = parse(inCSV);
      let scrubbedRows;
      if (hasHeaders && rows.length > 0) {
        // Keep the header verbatim; only the data rows below it are scrubbed.
        const header = rows[0];
        const scrubbedRest = rows.slice(1, MAX_ROWS + 1).map(row => row.map(cell => scrubPrimitive(cell)));
        scrubbedRows = [header, ...scrubbedRest];
      } else {
        // Also covers an empty file with hasHeaders set: there is no header
        // row to copy, so the output is simply empty.
        scrubbedRows = rows.slice(0, MAX_ROWS).map(row => row.map(cell => scrubPrimitive(cell)));
      }
      outCSV = stringify(scrubbedRows);
    } catch (err) {
      return [err instanceof Error ? err : new Error(String(err)), undefined];
    }
    const [writeErr] = await ptry(fs.writeFile(outFile, outCSV, { encoding: "utf8" }));
    return [writeErr, undefined];
  }

  if (fileType === "json") {
    const [jqErr] = await ptry($`cat ${inFile} | jq -L ${scriptDir} 'include "scrub"; scrub' > ${outFile}`);
    return [jqErr, undefined];
  }

  return [new UnsupportedScrubError(`No method for scrubbing file '${inFile}'`), undefined];
}
// Directory containing this script; handed to `jq -L` so jq can locate the
// "scrub" module. fileURLToPath (unlike URL#pathname) decodes percent-escapes,
// so a checkout path containing spaces or non-ASCII characters still resolves.
const scriptDir = path.dirname(fileURLToPath(import.meta.url));

// --- Command-line parsing -------------------------------------------------
const argv = minimist(process.argv.slice(2), {
  boolean: ["has-headers"],
  string: ["override-type"],
});
const fileOrGlob = argv._[0];
const hasHeaders = argv["has-headers"]; // already a boolean, no need for !!
const overrideType = argv["override-type"];
assert(fileOrGlob, "Usage: ./scrub.ts [--has-headers] [--override-type=csv] <file_or_glob>");
assert(overrideType === undefined || overrideType === "" || overrideType === "csv" || overrideType === "json", "Override type must be either 'json' or 'csv'");
if (argv._.length > 1) {
  // Only the first positional is used. Extra ones usually mean the shell
  // expanded an unquoted glob, so say so instead of silently dropping files.
  console.warn(`Ignoring extra arguments (only the first file_or_glob is used; quote globs so the shell does not expand them): ${argv._.slice(1).join(" ")}`);
}

// --- Resolve the glob to absolute paths ------------------------------------
console.log(`Matching files against passed file_or_glob: '${fileOrGlob}'`);
const filePaths: string[] = [];
for await (const file of fs.glob(fileOrGlob)) {
  const resolved = path.resolve(file);
  filePaths.push(resolved);
}
console.log("filePaths", filePaths);
assert(filePaths.length > 0, `No files found matching: ${fileOrGlob}`);

// --- Scrub each file in place ----------------------------------------------
// The scrubbed output is written to `<file>.tmp` first. Only after that
// succeeds is the original moved aside to `<file>.DELETE-THIS-HAS-PII` and the
// scrubbed copy moved into the original's place.
for (const file of filePaths) {
  console.log(`Processing: ${file}`);
  const tmpFile = `${file}.tmp`;
  const piiFile = `${file}.DELETE-THIS-HAS-PII`;
  const [scrubError] = await scrubFile(file, tmpFile, { hasHeaders, overrideType });
  if (scrubError instanceof UnsupportedScrubError) {
    // Unsupported file types are skipped with a warning, not treated as fatal.
    console.warn(scrubError.message);
    continue;
  }
  assert(!scrubError, `Error processing ${file}: ${scrubError}`);
  const [mvErr] = await ptry($`mv ${file} ${piiFile}`);
  assert(!mvErr, `Error moving ${file} to ${piiFile}: ${mvErr}`);
  const [mv2Err] = await ptry($`mv ${tmpFile} ${file}`);
  assert(!mv2Err, `Error moving ${tmpFile} to ${file}: ${mv2Err}`);
}
console.log();
console.log("Done!");