base-data-manager/data-export/google.ts
cobertos a4fbe1618d Fixed FB dating messages, added metadata as output table, added aggregate message thread metadata from FB
* aggregateId is now metadata and it's just aggregate: boolean and uses .id instead
* Use csv-parse for tests
* Update test snapshots
2026-02-26 11:21:36 -05:00

115 lines
4.3 KiB
TypeScript

import { pipe, branch, cmd, assignMeta, cd, glob, read, branchGen, type PipelineOp } from "./task.ts";
import { htmlSelectorChunkedDuplex } from "./html.ts";
/**
 * Builds the pipeline that extracts tabular data from a Google Takeout export.
 *
 * Every task in the pipeline is first tagged with the id `Google - <basename>`.
 * The pipeline then branches into one sub-pipeline per supported file:
 *
 * - Access log activity CSV and devices CSV: read as-is.
 * - `Chrome/History.json`: converted to CSV with `jq`.
 * - `Google Pay/Google transactions/transactions_*.csv`: read as-is.
 * - `Location History/Location History.json`: converted to CSV with `jq`.
 *
 * The `jq` programs emit a header row followed by one row per record, and
 * render timestamps as ISO 8601 strings.
 *
 * @returns The composed pipeline operation produced by `pipe(...)`.
 */
export function google(){
  return pipe(
    // Generic ID for everything in here
    assignMeta({ idValue: t=>`Google - ${t.basename}` }),
    branchGen(function*() {
      // TODO: There is a root takeout folder
      yield pipe(cd('Access Log Activity/Activities - A list of Google services accessed by.csv'), read())
      yield pipe(cd('Devices - A list of devices (i.e. Nest, Pixel, iPh.csv'), read())

      // Assignments - data was empty
      // Business messages - GMB messages, there's some but so far outside of what I want
      // TODO: Calendar, exports an .ics file

      // a = t.fork().cd(`Chrome`)
      // TODO: "Addresses and more.json" (file name to be confirmed against an export)
      // TODO: Bookmarks.csv
      // TODO: Device Information.json
      // TODO: Dictionary.csv
      // TODO: ...
      yield pipe(
        cd('Chrome/History.json'),
        read(),
        // TODO: "Typed Url", no data
        // TODO: "session", complex data
        // Omitted .ptoken and .client_id for now. I think ptoken is maybe for the history API? client_id is base64 something...
        // time_usec is in microseconds, but jq's todateiso8601 expects whole
        // seconds since the Unix epoch, so scale it down before formatting.
        // The column keeps its original "time_usec" header for compatibility.
        cmd(["jq", "-r", `["favicon_url","page_transition","title","url","time_usec"],
          (
            ."Browser History"[]
            | [.favicon_url, .page_transition, .title, .url, (.time_usec / 1000000 | floor | todateiso8601)]
          )
          | @csv
        `])
      );

      // TODO: Contacts, exports a .vcf file
      // TODO: ...

      // a = t.fork().cd(`Google Pay`)
      yield pipe(
        cd(`Google Pay/Google transactions`),
        glob(`transactions_*.csv`),
        read(),
        // .fork("a").cd(`Money sends and requests`)
        // .fork().cd(`Money sends and requests.csv`)
        // .read()
        // .cmd(t=>["sqlite-utils", "insert", "your.db", t.basename, "-", "--csv", "--detect-types"])
        // TODO: One more folder, and it only has a pdf
      );

      // TODO: Google Play Movies _ TV - no data
      // TODO: ...

      yield pipe(
        cd("Location History/Location History.json"),
        read(),
        // TODO: This is missing
        //   "altitude" : 158,
        //   "verticalAccuracy" : 68
        // and the activity models. I had no idea google tries to determine if I'm "tilting"
        //
        // The timestamp expression must be parenthesized: in jq `,` binds tighter
        // than `|`, so without the parentheses the remaining fields would be
        // looked up on the timestamp value instead of on the location object.
        // timestampMs is in milliseconds (tonumber also accepts it when it is
        // serialized as a string), while todateiso8601 expects seconds.
        cmd(["jq", "-r", `["timestamp","latitudeE7","longitudeE7","accuracy"],
          (
            .locations[]
            | [(.timestampMs | tonumber / 1000 | floor | todateiso8601), .latitudeE7, .longitudeE7, .accuracy]
          )
          | @csv
        `])
      );
      // There's also the semantic history, but that seems to be a whole other
      // can of worms

      // TODO: Needs no-headers!
      // a = t.fork().cd(`My Activity`)
      // a.fork().glob(`**/MyActivity.html`)
      //   .setId(t=>`Google - ${t.basenameN(2)}`)
      //   .read()
      //   .pipe(()=>{
      //     // Parses the MyActivity format, chunking it into pieces of HTML text
      //     // and then parsing out the text
      //     const dup = htmlSelectorChunkedDuplex(
      //       (tag, attrs)=>{
      //         // TODO: We also probably want to get and parse each
      //         // ".content-cell.mdl-typography--caption" as well (it
      //         // has location for websearches and sometimes a details field)
      //         // but then we have to get ".mdl-grid" and parse it
      //         return attrs.class?.includes("content-cell")
      //           && attrs.class?.includes("mdl-typography--body-1")
      //           && !attrs.class?.includes("mdl-typography--text-right")
      //       },
      //       (chunk)=>{
      //         const text = chunk.innerText;
      //         const split = text.split("\n");
      //         const timestamp = split.pop(); // TODO: need to parse this
      //         const rest = split.join("\n");
      //         // TODO: Escape instead of replace
      //         const restSafe = rest.replace(/"/g, "'").replace(/\n/g,"\\n"); // escape newlines and quotes
      //         // Return a CSV
      //         return `"${restSafe}","${timestamp}"\n`;
      //       }
      //     );
      //     return dup;
      //   })

      // TODO: News
      // TODO: Profile
      // TODO: Tasks - No data
    })
  );
}