base-data-manager/util/scrub.jq

136 lines
5.4 KiB
Text

# Use this to scrub JSON files before loading them into unit tests.
# Something like:
# fd -t f .json -0 | xargs -I % -0 -- jq -f scrub.jq "%" > "%"
# (On a first run, drop the trailing `> "%"` so the result is only printed and
# not persisted; that lets you confirm the output is what you want.)
def scrub_key:
  # Mask an object key: a key made up only of digits becomes a run of "1"
  # characters of the same length; any other key is returned unchanged.
  test("^[0-9]+$") as $digits_only
  | length as $width
  | if $digits_only then "1" * $width else . end;
def scrub_primitive:
  # Replace a scalar with a placeholder that keeps its general shape.
  #   strings  - matched against the patterns below in order (first match wins)
  #              and replaced by a fixed sample of the same format; anything
  #              unmatched becomes "x" repeated to the original length
  #   numbers  - replaced by a timestamp bucket or a run of 1-digits
  #   booleans - always false
  #   anything else (e.g. null) - returned unchanged
  if type == "string" then
    if test("^(([0-9]{1,3}\\.){3}[0-9]{1,3})$") then
      # IPv4
      "1.1.1.1"
    elif test("^([0-9a-fA-F]{0,4}:){2,7}[0-9a-fA-F]{0,4}$") then
      # IPv6
      "2000:0000:0000:0000:0000:0000:0000:0000"
    elif test("^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$") then
      # Email-like
      "not_a_real_email@example.com"
    elif test("\\.(jpg|jpeg|png|gif|bmp|webp|svg|ico|tiff|mp3|wav|flac|aac|ogg|wma|m4a|mp4|avi|mkv|mov|wmv|flv|webm)$"; "i") then
      # Media file names are not scrubbed automatically. The value is replaced
      # by itself prefixed with "MANUAL REPAIR NEEDED: " and the same text is
      # written to stderr (`stderr` passes its input through), so these can be
      # found later and swapped for placeholders by hand.
      # TODO: jq 1.7 adds debug(), use this instead when I can upgrade jq, otherwise
      # you need to manually grep for MANUAL REPAIR NEEDED for now
      ("MANUAL REPAIR NEEDED: \(.)" | stderr) | .
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}[+\\-][0-9]{2}:[0-9]{2}$") then
      # ISO date time without millis, with timezone
      "2020-04-13T10:09:08+00:00"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?[+\\-][0-9]{2}:[0-9]{2}$") then
      # ISO date time with millis, with timezone
      "2020-04-13T10:09:08.000000+00:00"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?$") then
      # ISO date time with optional millis, no timezone
      "2020-04-13T10:09:08.000"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$") then
      # ISO date time with Z at the end (from fitbit export)
      "2020-04-13T10:09:08Z"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\\.[0-9]{1,6}Z$") then
      # ISO date time with Z at the end and millis (from fitbit export)
      "2020-04-13T10:09:08.000000Z"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}$") then
      # ISO date time but no seconds (from fitbit export)
      "2020-04-13T10:09"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} - [0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$") then
      # ISO date time range, no T (from fitbit export)
      "2020-04-13 10:09:08 - 2020-04-13 10:09:08"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}(\\.[0-9]{1,6})?[+\\-][0-9]{4}$") then
      # ISO date time with optional millis, timezone without colon (fitbit export)
      "2020-04-13 10:09:08+0000"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2}$") then
      # Just date
      "2020-04-13"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} UTC$") then
      # Date format from snapchat export
      "2020-04-13 10:09:08 UTC"
    elif test("^[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$") then
      # Date format from snapchat export
      "2020-04-13 10:09:08"
    elif test("^[0-9]{2}/[0-9]{2}/[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}$") then
      # Date format from fitbit export (wtf ugh)
      "04/13/20 10:09:08"
    elif test("^\\w{3} \\w{3} [0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2} UTC [0-9]{4}$") then
      # Date format from fitbit export (uughhh)
      "Mon Apr 13 10:09:08 UTC 2020"
    elif test("^\\+[0-9]{2}:[0-9]{2}$") then
      # UTC offset (from fitbit export). Minutes accept any two digits so that
      # offsets such as +05:30 or +05:45 are recognised too.
      "+00:00"
    elif test("^-[0-9]{2}:[0-9]{2}$") then
      # Negative UTC offset (from fitbit export); same minute handling as above
      "-00:00"
    elif test("^[0-9]+$") then
      # Digits only: preserve the length of the string
      "1" * length
    elif test("^-?[0-9]+(\\.[0-9]+)?$") then
      # Decimal number in a string: keep sign and dot, mask every digit
      gsub("[0-9]"; "1")
    elif test("^[0-9a-fA-F]+$") then
      # Hexadecimal string: repeat the "a1" pattern and truncate to the
      # original length
      ("a1" * length)[:length]
    elif . == "" then
      # Handled explicitly so the empty string stays "" instead of turning
      # into null in the fallback branch below
      ""
    elif . == "true" or . == "false" then
      "false"
    elif test("://") then
      "url://somewhere"
    elif test("/") then
      "some/path"
    elif . == "null" then
      # From fitbit export in csv
      "null"
    else
      # Preserve string length for other strings
      "x" * length
    end
  elif type == "number" then
    if 946702800 <= . and . <= 1893474000 then
      # Presumably a Unix timestamp in seconds (range is roughly 2000-2030).
      # Take modulo 1 year to get variance in the output, add an offset to
      # bring it to ~2024, then round down to a multiple of 5000.
      ((((. % 31557600) + 1704067200) / 5000 | floor) * 5000)
    elif . == (. | floor) then
      # Integer - preserve sign and digit count (the "-" is not a digit)
      tostring
      | (if startswith("-") then "-" else "" end) as $sign
      | ltrimstr("-")
      | ($sign + ("1" * length))
      | tonumber
    else
      # Decimal - preserve sign, digit count and a leading zero
      tostring
      | (if startswith("-") then "-" else "" end) as $sign
      | ltrimstr("-")
      | split(".")
      | (.[0] | if . == "0" then . else ("1" * length) end) as $int_part
      | (.[1] | ("1" * length)) as $frac_part
      | ($sign + $int_part + "." + $frac_part)
      | tonumber
    end
  elif type == "boolean" then
    # Replace all booleans with false; the real value can give sensitive info
    # away based on what the key was in the data
    false
  else
    .
  end;
def scrub:
  # Recursively scrub a JSON value. Arrays are cut down to their first two
  # elements, objects have both keys and values scrubbed, and every other
  # value is handed to scrub_primitive.
  type as $kind
  | if $kind == "array" then
      # Keep only 2 elements, scrubbing each of them
      [.[:2][] | scrub]
    elif $kind == "object" then
      # Rebuild the object from entries whose key and value are both scrubbed
      to_entries
      | map({key: (.key | scrub_key), value: (.value | scrub)})
      | from_entries
    else
      scrub_primitive
    end;
# Call scrub
#scrub