package googlevoice import ( "context" "errors" "fmt" "io" "io/fs" "path" "strings" "time" "github.com/PuerkitoBio/goquery" "github.com/signal-golang/go-vcard" "github.com/timelinize/timelinize/timeline" "go.uber.org/zap" "golang.org/x/net/html" ) func init() { err := timeline.RegisterDataSource(timeline.DataSource{ Name: "google_voice", Title: "Google Voice", Icon: "google_voice.svg", NewFileImporter: func() timeline.FileImporter { return new(FileImporter) }, }) if err != nil { timeline.Log.Fatal("registering data source", zap.Error(err)) } } // FileImporter can import the data from a file. type FileImporter struct { owner timeline.Entity } // Recognize returns whether this input is supported. func (FileImporter) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) { // not a match if the file is not a directory if !dirEntry.IsDir() { return timeline.Recognition{}, nil } var rec timeline.Recognition if dirEntry.FileExists("Phones.vcf") { rec.Confidence += .4 } if dirEntry.FileExists("Bills.html") { rec.Confidence += .15 } if dirEntry.FileExists("Calls") { rec.Confidence += .4 } if dirEntry.Name() == "Voice" { rec.Confidence += .05 } return rec, nil } // FileImport imports data from the input file. func (fimp *FileImporter) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error { var err error fimp.owner, err = fimp.getOwner(dirEntry) if err != nil { params.Log.Error("failed getting owner information", zap.Error(err)) } err = fs.WalkDir(dirEntry, "Calls", func(fpath string, d fs.DirEntry, err error) error { if err := ctx.Err(); err != nil { return err } if err != nil { return err } if d.IsDir() { return nil } err = fimp.processConversationFile(ctx, dirEntry, params, fpath) if err != nil { params.Log.Error("processing conversation file", zap.String("path", fpath), zap.Error(err)) } return nil }) if err != nil { return fmt.Errorf("walking call files: %w", err) } return nil } func (*FileImporter) getOwner(dirEntry timeline.DirEntry) (timeline.Entity, error) { var owner timeline.Entity f, err := dirEntry.Open("Phones.vcf") if err != nil { return owner, err } defer f.Close() dec := vcard.NewDecoder(f) for { card, err := dec.Decode() if errors.Is(err, io.EOF) { break } if err != nil { return owner, err } // usually empty from what I've seen, but meh, try anyway owner.Name = card.PreferredValue(vcard.FieldFormattedName) for _, field := range card["TEL"] { if field.Group != "" { // this is a GVOICE number var label string if labels, ok := card[field.Group+".X-ABLabel"]; ok && len(labels) > 0 { label = labels[0].Value } owner.Attributes = append(owner.Attributes, timeline.Attribute{ Name: timeline.AttributePhoneNumber, Value: field.Value, Identity: true, Metadata: timeline.Metadata{ gvoiceGroup: field.Group, "Label": label, }, }) } else if len(field.Params.Types()) > 0 { // this is their regular phone number owner.Attributes = append(owner.Attributes, timeline.Attribute{ Name: timeline.AttributePhoneNumber, Value: field.Value, Identifying: true, }) } } } return owner, nil } func (fimp *FileImporter) ownerGVoiceNumber() string { for _, attr := range fimp.owner.Attributes { if attr.Name == timeline.AttributePhoneNumber && attr.Metadata[gvoiceGroup] != "" { return attr.Value.(string) } } return "" } func (fimp *FileImporter) processConversationFile(_ context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams, conversationFilePath string) error { f, err := dirEntry.Open(conversationFilePath) if err != nil { return err } defer f.Close() doc, err := goquery.NewDocumentFromReader(f) if err != nil { return err } participants := []timeline.Entity{fimp.owner} // addUniqueParticipant adds an entity to the participants list only if it's not already there // (specifically, if an identifying attribute is the same as one for an entity already in the list) addUniqueParticipant := func(entityToAdd timeline.Entity) { if len(entityToAdd.Attributes) == 0 { return } // participant data SHOULD all be the same within the file, so just do a simple check if we've already got this person var isDuplicate bool outer: for _, attr := range entityToAdd.Attributes { for _, existing := range participants { for _, existingAttr := range existing.Attributes { if (attr.Identity || attr.Identifying) && existingAttr.Name == attr.Name && existingAttr.Value == attr.Value { isDuplicate = true break outer } } } } if !isDuplicate { participants = append(participants, entityToAdd) } } // group conversation files list the participants explicitly doc.Find(".participants .sender").Each(func(_ int, s *goquery.Selection) { participant := entityFromSenderCiteTag(s) addUniqueParticipant(participant) }) // when the participants aren't listed, we can still figure it out from scanning the log, // but only if the other participant(s) sent a message if len(participants) == 1 { doc.Find(".hChatLog .message .sender").Each(func(_ int, s *goquery.Selection) { participant := entityFromSenderCiteTag(s) addUniqueParticipant(participant) }) } // last resort, we can get something from the file name if it's a one-on-one conversation file if len(participants) == 1 { // get other entity info from file title; it could be name or phone number const expectedParts = 2 var other timeline.Entity parts := strings.SplitN(path.Base(conversationFilePath), " - Text - ", expectedParts) if len(parts) == expectedParts { var name, phone string nameOrPhone := parts[0] if strings.HasPrefix(nameOrPhone, "+") || isShortPhoneNum(nameOrPhone) { phone = nameOrPhone } else { name = nameOrPhone } other = timeline.Entity{ Name: name, Attributes: []timeline.Attribute{ { Name: timeline.AttributePhoneNumber, Value: phone, Identity: true, }, { // regrettably, this is required, since sometimes we only have their name Name: googleVoiceNameAttr, Value: name, Identifying: true, }, }, } addUniqueParticipant(other) } } // now iterate each message and create the item graphs doc.Find(".hChatLog .message").Each(func(_ int, s *goquery.Selection) { when := s.Find(".dt").AttrOr("title", "") from := s.Find(".sender .tel").AttrOr("href", "") msg := extractTextWithNewlines(s.Find("q")) ts, err := time.Parse(time.RFC3339, when) if err != nil { // TODO: could try parsing it from the text content with HTML entities, but the time zone isn't standard... example input: "Jul 17, 2021, 12:16:16 PM Mountain Time" params.Log.Error("message has invalid or missing RFC 3339 timestamp", zap.String("input", when)) } sender := entityFromSenderCiteTag(s.Find(".sender")) senderPhoneNum := strings.TrimPrefix(from, telPrefix) // owner entity may have more info associated with it, so use that instead of just the phone number given here if senderPhoneNum == fimp.ownerGVoiceNumber() { sender = fimp.owner } item := &timeline.Item{ Classification: timeline.ClassMessage, Timestamp: ts, Owner: sender, } textData := strings.TrimSpace(msg) if textData != "MMS Received" { item.Content = timeline.ItemData{ MediaType: "text/plain", Data: timeline.StringData(textData), } } g := &timeline.Graph{Item: item} // connect all participants who are not the sender as recipients of the message nextParticipant: for _, participant := range participants { for _, partAttr := range participant.Attributes { if partAttr.Name == timeline.AttributePhoneNumber && partAttr.Value == senderPhoneNum { continue nextParticipant } } g.ToEntity(timeline.RelSent, &participant) } // media attachments s.Find("img").Each(func(_ int, s *goquery.Selection) { imgSrc := s.AttrOr("src", "") if imgSrc == "" { return } // believe it or not, the is BROKEN since it doesn't contain the file extension -- we have to try a few common ones! var filename string for _, tryExt := range []string{ ".jpg", ".gif", } { tryFilename := path.Join(path.Dir(conversationFilePath), imgSrc+tryExt) if dirEntry.FileExists(tryFilename) { filename = tryFilename break } } if filename == "" { params.Log.Warn("unable to determine actual filename of attachment", zap.String("filename_stub", imgSrc)) return } attachmentData := func(_ context.Context) (io.ReadCloser, error) { return dirEntry.Open(filename) } // if the message has no text content, make the first attachment the content if item.Content.Data == nil { item.IntermediateLocation = filename item.Content.MediaType = "" // just let the processor sniff it out item.Content.Filename = path.Base(filename) item.Content.Data = attachmentData } else { attachment := &timeline.Item{ Classification: timeline.ClassMessage, Timestamp: ts, Owner: sender, IntermediateLocation: filename, Content: timeline.ItemData{ Filename: path.Base(filename), Data: attachmentData, }, } g.ToItem(timeline.RelAttachment, attachment) } }) params.Pipeline <- g }) return nil } // entityFromSenderCiteTag returns the entity described by the given selection, which should be // a tag. func entityFromSenderCiteTag(s *goquery.Selection) timeline.Entity { phoneNum := strings.TrimPrefix(s.Find(".tel").AttrOr("href", ""), telPrefix) formattedName := s.Find(".fn").Text() // this will actually be the phone number if name is not known / not in contacts if formattedName == phoneNum { formattedName = "" } entity := timeline.Entity{ Name: formattedName, } if phoneNum != "" { entity.Attributes = append(entity.Attributes, timeline.Attribute{ Name: timeline.AttributePhoneNumber, Value: phoneNum, Identity: true, }) } if formattedName != "" { // this is, regrettably, required to be an attribute and an identifying one at that, since we don't always have the person's // phone number when given their name, since in one-on-one conversation files, the data source only gives us either their name // or their phone number in the filename, and we only get both if they sent a message in the conversation // (without this, duplicate entities end up getting created, since we can't link them just by the name; it has to be an identifying attribute) entity.Attributes = append(entity.Attributes, timeline.Attribute{ Name: googleVoiceNameAttr, Value: formattedName, Identifying: true, }) } return entity } func isShortPhoneNum(s string) bool { if s == "" { return false } const maxLen = 6 if len(s) > maxLen { return false } for _, ch := range s { if ch < '0' || ch > '9' { return false } } return true } // extractTextWithNewlines is like calling .Text(), but it // inserts '\n' where
are, or "\n\n" where

tags are. func extractTextWithNewlines(sel *goquery.Selection) string { var sb strings.Builder sel.Contents().Each(func(_ int, s *goquery.Selection) { node := s.Get(0) switch node.Type { case html.TextNode: sb.WriteString(strings.TrimSpace(node.Data)) case html.ElementNode: tag := node.Data switch tag { case "br": sb.WriteString("\n") case "p": // Recurse for children sb.WriteString(extractTextWithNewlines(s)) sb.WriteString("\n\n") // paragraph break default: sb.WriteString(extractTextWithNewlines(s)) } } }) return sb.String() } // Phone numbers are prefixed with this in the data const telPrefix = "tel:" // This metadata key appears on Google Voice number entity attributes const gvoiceGroup = "Google Voice Group" // The name of the attribute that we use to specify the person's name // as it appears in Google Voice (typically from their Google Contact list, // though it could also be a public business or something) -- this is // unfortunately needed since sometimes all we have about a participant // is their name, if they didn't send any messages in the conversation const googleVoiceNameAttr = "gvoice_name"