# Use this to process json files before loading them into unit tests.
# Something like:
#   fd -t f .json -0 | xargs -I % -0 -- jq -f scrub.jq "%" > "%"
# (Though you should remove the trailing `> "%"` first to get just the output
# without persisting it, to be sure it's what you want first.)
# NOTE(review): as written, the `> "%"` redirection is interpreted by the
# calling shell for the whole pipeline rather than per file by xargs - verify
# the command before relying on it to rewrite files in place.

# Scrub an object key: keys made up only of digits are replaced by a run of
# "1" of the same length; every other key is returned unchanged.
# NOTE(review): two different all-digit keys of equal length map to the same
# scrubbed key, so `with_entries` in `scrub` will keep only one of them.
def scrub_key:
  if test("^[0-9]+$") then
    ("1" * length)
  else
    .
  end;

# Scrub a single non-container value (string, number, boolean, null).
#   strings  -> a fixed placeholder for recognised formats (IPs, emails, dates,
#               urls, paths, ...) or a same-length filler otherwise
#   numbers  -> epoch-like values are moved to a coarse recent timestamp, other
#               numbers have every digit replaced by 1 (sign is kept)
#   booleans -> always false
#   anything else (null) is returned unchanged
def scrub_primitive:
  # String scrubbing. Branch order matters: the first matching pattern wins.
  def scrub_string:
    if test("^(([0-9]{1,3}\\.){3}[0-9]{1,3})$") then
      # IPv4
      "1.1.1.1"
    elif test("^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$") then
      # IPv6
      # NOTE(review): this pattern also matches plain "HH:MM:SS" time strings.
      "2000:0000:0000:0000:0000:0000:0000:0000"
    elif test("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$") then
      # Email-like
      "not_a_real_email@example.com"
    elif test("\\.(jpg|jpeg|png|gif|bmp|webp|svg|ico|tiff|mp3|wav|flac|aac|ogg|wma|m4a|mp4|avi|mkv|mov|wmv|flv|webm)$"; "i") then
      # Media file names are not scrubbed automatically. The value is replaced
      # by a marker string that still contains the original value, and the
      # same marker is written to stderr. You will have to manually go through
      # these later and replace them with placeholders.
      # TODO: jq 1.7 adds debug(), use this instead when I can upgrade jq,
      # otherwise you need to manually grep for MANUAL REPAIR NEEDED for now
      "MANUAL REPAIR NEEDED: \(.)" | stderr
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}[+\\-][0-9]{2}:[0-9]{2}$") then
      # iso date time without millis with timezone
      "2020-04-13T10:09:08+00:00"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?[+\\-][0-9]{2}:[0-9]{2}$") then
      # iso date time with millis with timezone
      "2020-04-13T10:09:08.000000+00:00"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?$") then
      # iso date time with millis no timezone
      "2020-04-13T10:09:08.000"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$") then
      # iso date time with Z at the end (from fitbit export)
      "2020-04-13T10:09:08Z"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{1,6}Z$") then
      # iso date time with Z at the end and millis (from fitbit export)
      "2020-04-13T10:09:08.000000Z"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}$") then
      # iso date time but no seconds (from fitbit export)
      "2020-04-13T10:09"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} - [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$") then
      # iso date time range no T (from fitbit export)
      "2020-04-13 10:09:08 - 2020-04-13 10:09:08"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?[+\\-][0-9]{4}$") then
      # iso date time with optional millis, timezone without colon (fitbit export)
      "2020-04-13 10:09:08+0000"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}$") then
      # Just date
      "2020-04-13"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} UTC$") then
      # Date format from snapchat export
      "2020-04-13 10:09:08 UTC"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$") then
      # Date format from snapchat export
      "2020-04-13 10:09:08"
    elif test("^[0-9]{2}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$") then
      # Date format from fitbit export (wtf ugh)
      "04/13/20 10:09:08"
    elif test("^\\w{3} \\w{3} [0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} UTC [0-9]{4}$") then
      # Date format from fitbit export (uughhh)
      "Mon Apr 13 10:09:08 UTC 2020"
    elif test("^\\+[0-9]{2}:[0-9]{2}$") then
      # UTC offset (from fitbit export); minutes may be any two digits (e.g. +05:30)
      "+00:00"
    elif test("^-[0-9]{2}:[0-9]{2}$") then
      # negative UTC offset (from fitbit export)
      "-00:00"
    elif test("^[0-9]+$") then
      # all-digit string: preserve length of the string
      "1" * length
    elif test("^-?[0-9]+(\\.[0-9]+)?$") then
      # decimal number in a string: keep sign and dot, replace each digit
      gsub("[0-9]"; "1")
    elif test("^[0-9a-fA-F]+$") then
      # hexadecimal string:
      # repeat the hex pattern and truncate to original length
      ("a1" * length)[:length]
    elif . == "" then
      # prevents empty string from just returning null instead of empty string
      ""
    elif . == "true" or . == "false" then
      "false"
    elif test("://") then
      "url://somewhere"
    elif test("/") then
      "some/path"
    elif . == "null" then
      # From fitbit export in csv
      "null"
    else
      # Preserve string length for other strings
      "x" * length
    end;

  # Number scrubbing.
  def scrub_number:
    if 946702800 <= . and . <= 1893474000 then
      # Value falls in the range treated as an epoch-seconds timestamp.
      # Take modulo 1 year to get variance in the output, then add offset to
      # bring to ~2024, and round down to a multiple of 5000.
      ((((. % 31557600) + 1704067200) / 5000 | floor) * 5000)
    elif . == floor then
      # Integer - preserve digit count. A leading "-" is kept as the sign
      # rather than being counted as a digit.
      tostring
      | if startswith("-") then
          "-" + ("1" * (length - 1))
        else
          "1" * length
        end
      | tonumber
    else
      # Decimal - preserve digit count, sign and a leading zero
      tostring
      | split(".")
      | (.[0]
         | if . == "0" or . == "-0" then
             .
           elif startswith("-") then
             "-" + ("1" * (length - 1))
           else
             "1" * length
           end) as $int_part
      | (.[1] | ("1" * length)) as $frac_part
      | ($int_part + "." + $frac_part)
      | tonumber
    end;

  if type == "string" then
    scrub_string
  elif type == "number" then
    scrub_number
  elif type == "boolean" then
    # Replace all booleans with false, this can give sensitive info away based
    # on what the key was in the data
    false
  else
    .
  end;

# Recursively scrub a whole JSON document.
#   objects -> every key goes through scrub_key, every value through scrub
#   arrays  -> truncated to the first 2 elements, each of which is scrubbed
#   other   -> scrub_primitive
def scrub:
  if type == "object" then
    # Apply scrubbing to both keys and values
    with_entries(.key |= scrub_key | .value |= scrub)
  elif type == "array" then
    # Keep only 2 elements, but scrub *those* elements
    .[:2] | map(scrub)
  else
    # Scrub a primitive value
    scrub_primitive
  end;

# Call scrub (currently commented out, so this file only provides the
# definitions above)
#scrub