/* Timelinize Copyright (c) 2013 Matthew Holt This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package contactlist import ( "context" "encoding/csv" "fmt" "path" "strings" "github.com/timelinize/timelinize/timeline" ) // Recognize returns true if the file is recognized as a contact list, // or a Google Takeout contact list folder that also contains pictures. func (fimp *FileImporter) Recognize(ctx context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) { pathInFS := "." // start by assuming we'll inspect the current file if dirEntry.IsDir() { // if a directory, specifically, a folder for a contact list in a Google Takeout archive, // recognize the whole folder so we can import those profile pictures as well parts := strings.Split(dirEntry.FullPath(), "/") if len(parts) >= 3 && parts[len(parts)-3] == "Takeout" && parts[len(parts)-2] == "Contacts" { if csvFilename := parts[len(parts)-1] + ".csv"; timeline.FileExistsFS(dirEntry.FS, path.Join(dirEntry.Filename, csvFilename)) { pathInFS = csvFilename } else { return timeline.Recognition{}, nil } } else { return timeline.Recognition{}, nil } } else { // only inspect file if it has a relevant extension switch strings.ToLower(path.Ext(dirEntry.Name())) { case ".csv", ".tsv": default: return timeline.Recognition{}, nil } } bestColMapping, bestDelim, err := bestColumnMappingAndDelim(ctx, dirEntry, pathInFS) if err != nil { return timeline.Recognition{}, err } if len(bestColMapping) > recognizeAtLeastFields && bestDelim != 0 { return timeline.Recognition{Confidence: 1.0}, nil } return timeline.Recognition{}, nil } // bestColumnMappingAndDelim determines the best mapping of columns, which is canonical field name -> matched column indices, // along with the associated detected delimiter of the file. func bestColumnMappingAndDelim(ctx context.Context, dirEntry timeline.DirEntry, pathInFS string) (map[string][]int, rune, error) { var bestDelim rune var bestMapping map[string][]int // best column mapping across all delimiters for _, delim := range []rune{',', '\t', ';'} { if err := ctx.Err(); err != nil { return nil, 0, err } colMapping, err := determineColumnMappingForDelimiter(dirEntry, pathInFS, delim) if err != nil { return nil, 0, fmt.Errorf("trying to determine column mapping with delimiter %s: %w", string(delim), err) } if len(colMapping) > len(bestMapping) { bestMapping = colMapping bestDelim = delim } } return bestMapping, bestDelim, nil } func determineColumnMappingForDelimiter(dirEntry timeline.DirEntry, pathInFS string, delim rune) (map[string][]int, error) { file, err := dirEntry.Open(pathInFS) if err != nil { return nil, err } defer file.Close() r := csv.NewReader(file) r.Comma = delim var fieldCount int var columnMapping map[string][]int for { row, err := r.Read() if err != nil { return nil, nil // most likely a syntax error, nbd } if fieldCount == 0 { // this must be the header row // should have at least two fields to be useful fieldCount = len(row) if fieldCount < minFields { return nil, nil } // if we don't recognize the minimum number of distinct fields, it's not a recognized contact list columnMapping = bestColumnMappingForFields(row) if len(columnMapping) < recognizeAtLeastFields { return nil, nil } } else { // we've already seen the header row; // field count should be equal to header row if len(row) != fieldCount { return nil, nil } break } } return columnMapping, nil } // bestColumnMappingForFields returns the best mapping of canonical field name // to column indices we could find. func bestColumnMappingForFields(headerRow []string) map[string][]int { var bestMapping map[string][]int for _, f := range formats { result := f.match(headerRow) // TODO: if a tiebreaker is needed, we might want to also consider the number of total columns matched (i.e. multiple email columns is better than one) if len(result) > len(bestMapping) { bestMapping = result } } return bestMapping } const ( minFields = 2 // file must have at least this many fields recognizeAtLeastFields = 2 // we must recognize at least this many field names )