/* Timelinize Copyright (c) 2013 Matthew Holt This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ // Package contactlist implements a data source for contact lists. package contactlist import ( "context" "encoding/base64" "encoding/csv" "errors" "fmt" "io" "path" "strings" "github.com/timelinize/timelinize/datasources/vcard" "github.com/timelinize/timelinize/timeline" "go.uber.org/zap" ) func init() { err := timeline.RegisterDataSource(timeline.DataSource{ Name: "contact_list", Title: "Contact List", Icon: "contact_list.svg", NewFileImporter: func() timeline.FileImporter { return new(FileImporter) }, }) if err != nil { timeline.Log.Fatal("registering data source", zap.Error(err)) } } // FileImporter can import the data from a file. type FileImporter struct{} // FileImport imports data from a file. func (fimp *FileImporter) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error { // start by assuming the dirEntry points directly to a CSV file pathInFS := "." // but if a directory was recognized, alter the path to refer to the CSV file within it if dirEntry.IsDir() { pathInFS = path.Base(dirEntry.Name()) + ".csv" } bestColumnMapping, bestDelim, err := bestColumnMappingAndDelim(ctx, dirEntry, ".") if err != nil { return err } // at least 2 fields should be required in order to be useful, right? // like an email by itself (or a name by itself) has no value I think... // especially since no items are attached from a contact list if len(bestColumnMapping) < recognizeAtLeastFields { return errors.New("insufficient header row") } file, err := dirEntry.Open(pathInFS) if err != nil { return fmt.Errorf("opening file: %w", err) } defer file.Close() r := csv.NewReader(file) r.ReuseRecord = true // with this enabled, DO NOT MODIFY THE SLICE RETURNED FROM Read() r.Comma = bestDelim var headerRow []string for { if err := ctx.Err(); err != nil { return err } row, err := r.Read() if err == io.EOF { break } if err != nil { return fmt.Errorf("error reading next record: %w", err) } // header row if len(headerRow) == 0 { headerRow = make([]string, len(row)) copy(headerRow, row) continue } // first, extract mappedValues from recognized columns in the row mappedValues := make(map[string][]string) // map of canonical field name -> associated value(s) from row for canonicalField, colIndices := range bestColumnMapping { for _, colIdx := range colIndices { mappedValues[canonicalField] = append(mappedValues[canonicalField], row[colIdx]) } } // then, convert each field+values pair to something about the person p := new(timeline.Entity) var firstName, midName, lastName string for field, values := range mappedValues { for _, value := range values { value = strings.TrimSpace(value) if value == "" { // ignore empty values; especially if there are multiple matched columns // for a field (like Name, for some reason), don't overwrite a non-empty // first column with an empty second column continue } switch field { case "full_name": p.Name = value case "first_name": firstName = value case "middle_name": midName = value case "last_name": lastName = value case "birthdate": birthDate := vcard.ParseBirthday(value) if birthDate != nil { p.Attributes = append(p.Attributes, timeline.Attribute{ Name: "birth_date", Value: value, }) } case "picture": if strings.HasPrefix(value, "http") { p.NewPicture = timeline.DownloadData(value) } else { picBytes, err := base64.RawStdEncoding.DecodeString(value) if err == nil { p.NewPicture = timeline.ByteData(picBytes) } } case timeline.AttributeGender: p.Attributes = append(p.Attributes, timeline.Attribute{ Name: field, Value: value, }) case timeline.AttributeEmail, timeline.AttributePhoneNumber: p.Attributes = append(p.Attributes, timeline.Attribute{ Name: field, Value: value, Identifying: true, }) } } } // assemble name, if given in different fields if p.Name == "" { p.Name = firstName if midName != "" { if p.Name != "" { p.Name += " " } p.Name += midName } if lastName != "" { if p.Name != "" { p.Name += " " } p.Name += lastName } } // contact lists from Google Takeout have profile pictures as sidecar files, // named as a concatenation of their names, or their email address; we can read // those directly for much faster and more reliable imports, if this import is // acting on a directory rather than on a regular file if dirEntry.IsDir() { pfpPathByName := path.Join(dirEntry.Filename, p.Name) + ".jpg" if timeline.FileExistsFS(dirEntry.FS, pfpPathByName) { p.NewPicture = func(_ context.Context) (io.ReadCloser, error) { return dirEntry.FS.Open(pfpPathByName) } } else if attr, ok := p.Attribute(timeline.AttributeEmail); ok && attr.Value != nil { pfpPathByEmail := path.Join(dirEntry.Filename, attr.Value.(string)) + ".jpg" if timeline.FileExistsFS(dirEntry.FS, pfpPathByEmail) { p.NewPicture = func(_ context.Context) (io.ReadCloser, error) { return dirEntry.FS.Open(pfpPathByEmail) } } } } // I think it's pointless to process a person if there aren't at // least 2 data points about them because we can get single // data points from nearly any data source; the value of adding // a contact list is to get more information about a person to // infer more relationships automatically. if (p.NewPicture == nil && len(p.Metadata) == 0 && len(p.Attributes) == 0) || // only a name is kinda useless (p.Name == "" && p.NewPicture != nil && len(p.Attributes)+len(p.Metadata) == 0) { continue } // finally, send person for processing params.Pipeline <- &timeline.Graph{Entity: p} } return nil }