377 lines
11 KiB
Go
377 lines
11 KiB
Go
/*
|
|
Timelinize
|
|
Copyright (c) 2013 Matthew Holt
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU Affero General Public License as published
|
|
by the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU Affero General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Affero General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
// Package vcard implements a data source for vCard files.
|
|
package vcard
|
|
|
|
import (
|
|
"bytes"
|
|
"context"
|
|
"encoding/base64"
|
|
"errors"
|
|
"io"
|
|
"io/fs"
|
|
"path"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/signal-golang/go-vcard"
|
|
"github.com/timelinize/timelinize/timeline"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
func init() {
|
|
err := timeline.RegisterDataSource(timeline.DataSource{
|
|
Name: "vcard",
|
|
Title: "vCard",
|
|
Icon: "vcard.svg",
|
|
NewFileImporter: func() timeline.FileImporter { return new(FileImporter) },
|
|
})
|
|
if err != nil {
|
|
timeline.Log.Fatal("registering data source", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// FileImporter can import the data from a file.
|
|
type FileImporter struct{}
|
|
|
|
// Recognize returns whether the input is supported.
|
|
func (FileImporter) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
|
|
rec := timeline.Recognition{DirThreshold: .9}
|
|
|
|
pathInFS := "." // start by assuming we'll inspect the current file
|
|
if dirEntry.IsDir() {
|
|
// if a directory, specifically, a folder for a contact list in a Google Takeout archive,
|
|
// recognize the whole folder so we can import those profile pictures as well
|
|
parts := strings.Split(dirEntry.FullPath(), "/")
|
|
if len(parts) >= 3 && parts[len(parts)-3] == "Takeout" && parts[len(parts)-2] == "Contacts" {
|
|
if vcfFilename := parts[len(parts)-1] + ".vcf"; timeline.FileExistsFS(dirEntry.FS, path.Join(dirEntry.Filename, vcfFilename)) {
|
|
pathInFS = vcfFilename
|
|
} else {
|
|
return rec, nil
|
|
}
|
|
} else {
|
|
return rec, nil
|
|
}
|
|
} else {
|
|
// only inspect file if it has a relevant extension
|
|
switch strings.ToLower(path.Ext(dirEntry.Name())) {
|
|
case ".vcf", ".vcard":
|
|
default:
|
|
return rec, nil
|
|
}
|
|
}
|
|
|
|
// we can import directories, but let the import planner figure that out; only recognize files
|
|
if dirEntry.IsDir() && pathInFS == "." {
|
|
return rec, nil
|
|
}
|
|
|
|
file, err := dirEntry.Open(pathInFS)
|
|
if err != nil {
|
|
return rec, err
|
|
}
|
|
defer file.Close()
|
|
|
|
buf := bufPool.Get().([]byte)
|
|
//nolint:gocritic,staticcheck
|
|
defer bufPool.Put(buf[:]) // ensure that even if buf is resized (it's not), we don't put back a larger buffer (good practice) -- WOW the linters hate this one
|
|
|
|
// read the first few bytes to see if it looks like a legit vcard; ignore empty or short files
|
|
_, err = io.ReadFull(file, buf)
|
|
if err != nil && !errors.Is(err, io.ErrUnexpectedEOF) && !errors.Is(err, io.EOF) {
|
|
return rec, err
|
|
}
|
|
if string(buf) == beginVCard {
|
|
rec.Confidence = 1
|
|
}
|
|
|
|
return rec, nil
|
|
}
|
|
|
|
// FileImport imports data from the given file/folder.
|
|
func (imp *FileImporter) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
|
|
err := fs.WalkDir(dirEntry.FS, dirEntry.Filename, func(filePath string, d fs.DirEntry, err error) error {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
if strings.HasPrefix(d.Name(), ".") {
|
|
// skip hidden files & folders
|
|
if d.IsDir() {
|
|
return fs.SkipDir
|
|
}
|
|
return nil
|
|
}
|
|
if d.IsDir() {
|
|
return nil // traverse into subdirectories
|
|
}
|
|
switch strings.ToLower(path.Ext(d.Name())) {
|
|
case ".vcf", ".vcard":
|
|
default:
|
|
return nil
|
|
}
|
|
|
|
file, err := dirEntry.FS.Open(filePath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer file.Close()
|
|
|
|
dec := vcard.NewDecoder(file)
|
|
for {
|
|
card, err := dec.Decode()
|
|
if errors.Is(err, io.EOF) {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
p := &timeline.Entity{
|
|
Name: strings.Trim(card.PreferredValue(vcard.FieldFormattedName), nameCutset),
|
|
Metadata: make(timeline.Metadata),
|
|
}
|
|
if p.Name == "" {
|
|
if name := card.Name(); name != nil {
|
|
formattedName := join(" ", []string{
|
|
name.GivenName,
|
|
name.AdditionalName,
|
|
name.FamilyName,
|
|
})
|
|
p.Name = strings.Trim(formattedName, nameCutset)
|
|
p.Metadata["Honorific prefix"] = name.HonorificPrefix
|
|
p.Metadata["Honorific suffix"] = name.HonorificSuffix
|
|
}
|
|
}
|
|
|
|
if rawBday := card.PreferredValue(vcard.FieldBirthday); rawBday != "" {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "birth_date",
|
|
Value: ParseBirthday(rawBday),
|
|
})
|
|
}
|
|
|
|
for _, phone := range card.Values(vcard.FieldTelephone) {
|
|
// TODO: Don't let home phone numbers be identifying? See if card.Get(vcard.FieldTelephone) can help
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: timeline.AttributePhoneNumber,
|
|
Value: phone,
|
|
Identifying: true,
|
|
})
|
|
}
|
|
|
|
for _, email := range card.Values(vcard.FieldEmail) {
|
|
if email == p.Name {
|
|
p.Name = "" // sometimes the email or phone number is also in the Name field for some reason (old Google Contacts)
|
|
}
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: timeline.AttributeEmail,
|
|
Value: email,
|
|
Identifying: true,
|
|
})
|
|
}
|
|
|
|
if gender := card.PreferredValue(vcard.FieldGender); gender != "" {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: timeline.AttributeGender,
|
|
Value: gender,
|
|
})
|
|
}
|
|
|
|
// for a picture, see if there's a sidecar file that matches the name or email address
|
|
// (this is the case for Google Takeout exports) -- this is faster
|
|
if p.Name != "" {
|
|
if sidecarFilename := path.Join(path.Dir(filePath), p.Name+".jpg"); timeline.FileExistsFS(dirEntry.FS, sidecarFilename) {
|
|
p.NewPicture = func(_ context.Context) (io.ReadCloser, error) {
|
|
return dirEntry.FS.Open(sidecarFilename)
|
|
}
|
|
} else if attr, ok := p.Attribute(timeline.AttributeEmail); ok && attr.Value != nil {
|
|
sidecarFilename = path.Join(dirEntry.Filename, attr.Value.(string)) + ".jpg"
|
|
if timeline.FileExistsFS(dirEntry.FS, sidecarFilename) {
|
|
p.NewPicture = func(_ context.Context) (io.ReadCloser, error) {
|
|
return dirEntry.FS.Open(sidecarFilename)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
// otherwise, try downloading the picture from the vCard; I have seen URLs and base64,
|
|
// but URLs are often dead, and the vcard package loses some base64 values, for some reason
|
|
if p.NewPicture == nil {
|
|
photo := card.PreferredValue(vcard.FieldPhoto)
|
|
if photo == "" {
|
|
photo = card.PreferredValue(vcard.FieldLogo)
|
|
}
|
|
if strings.HasPrefix(strings.ToLower(photo), "http") {
|
|
p.NewPicture = timeline.DownloadData(photo)
|
|
} else if photo != "" {
|
|
// assume base64 encoding I guess; the parser kind of loses the "tail" information... but we have seen base64 photos
|
|
p.NewPicture = func(_ context.Context) (io.ReadCloser, error) {
|
|
photoBytes, err := base64.StdEncoding.DecodeString(photo)
|
|
return io.NopCloser(bytes.NewReader(photoBytes)), err
|
|
}
|
|
}
|
|
}
|
|
|
|
// the following fields are less common or useful, but still good to have if specified
|
|
|
|
if nickname := card.PreferredValue(vcard.FieldNickname); nickname != "" {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "nickname",
|
|
Value: nickname,
|
|
})
|
|
}
|
|
|
|
for _, addr := range card.Addresses() {
|
|
// TODO: store components in metadata? or maybe use address parsing service later if needed
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "address",
|
|
Value: join(" ", []string{
|
|
addr.PostOfficeBox,
|
|
addr.StreetAddress,
|
|
addr.ExtendedAddress,
|
|
addr.Locality,
|
|
addr.Region,
|
|
addr.PostalCode,
|
|
addr.Country,
|
|
}),
|
|
})
|
|
}
|
|
|
|
for _, url := range card.Values(vcard.FieldURL) {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "url",
|
|
Value: url,
|
|
})
|
|
}
|
|
|
|
if anniversary := card.PreferredValue(vcard.FieldAnniversary); anniversary != "" {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "anniversary",
|
|
Value: anniversary,
|
|
})
|
|
}
|
|
|
|
for _, title := range card.Values(vcard.FieldTitle) {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "title",
|
|
Value: title,
|
|
})
|
|
}
|
|
|
|
for _, role := range card.Values(vcard.FieldRole) {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "role",
|
|
Value: role,
|
|
})
|
|
}
|
|
|
|
for _, note := range card.Values(vcard.FieldNote) {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "note",
|
|
Value: note,
|
|
})
|
|
}
|
|
|
|
// vCard extension: https://www.rfc-editor.org/rfc/rfc6474.html#section-2.1
|
|
if birthPlace := card.PreferredValue("BIRTHPLACE"); birthPlace != "" {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "birth_place",
|
|
Value: birthPlace,
|
|
})
|
|
}
|
|
if rawDeathDate := card.PreferredValue("DEATHDATE"); rawDeathDate != "" {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "death_date",
|
|
Value: ParseBirthday(rawDeathDate),
|
|
})
|
|
}
|
|
if deathPlace := card.PreferredValue("DEATHPLACE"); deathPlace != "" {
|
|
p.Attributes = append(p.Attributes, timeline.Attribute{
|
|
Name: "death_place",
|
|
Value: deathPlace,
|
|
})
|
|
}
|
|
|
|
// if we have at least some useful data for the entity, process it
|
|
if p.Name != "" || len(p.Attributes) > 0 {
|
|
params.Pipeline <- &timeline.Graph{Entity: p}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// ParseBirthday parses bday in one of these formats:
|
|
//
|
|
// - "--MMDD" (year is omitted, and as such, the Unix timestamp of the date will compute to be over 2000 years ago)
|
|
// - "YYYYMMDD"
|
|
// - "YYYY-MM-DD" (issue #153)
|
|
//
|
|
// If the date fails to parse, a nil time is returned.
|
|
func ParseBirthday(bday string) *time.Time {
|
|
var date time.Time
|
|
var err error
|
|
switch len(bday) {
|
|
case 6:
|
|
date, err = time.ParseInLocation("--0102", bday, time.Local)
|
|
case 8:
|
|
date, err = time.ParseInLocation("20060102", bday, time.Local)
|
|
case 10:
|
|
date, err = time.ParseInLocation("2006-01-02", bday, time.Local)
|
|
}
|
|
if err == nil {
|
|
return &date
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// join is like strings.Join, but only joins non-empty elements,
|
|
// and trims spaces around individual elements.
|
|
func join(sep string, elems []string) string {
|
|
var sb strings.Builder
|
|
for _, elem := range elems {
|
|
if strings.TrimSpace(elem) == "" {
|
|
continue
|
|
}
|
|
if sb.Len() > 0 {
|
|
sb.WriteString(sep)
|
|
}
|
|
sb.WriteString(elem)
|
|
}
|
|
return sb.String()
|
|
}
|
|
|
|
const nameCutset = "<\"“”'>"
|
|
|
|
const beginVCard = "BEGIN:VCARD"
|
|
|
|
var bufPool = sync.Pool{
|
|
New: func() any {
|
|
return make([]byte, len(beginVCard))
|
|
},
|
|
}
|