/* Timelinize Copyright (c) 2013 Matthew Holt This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ // Package genericcsv implements a data source that imports simple CSV files. // Files must be named SHOULD_IMPORT_GENERIC.csv to be recognized. // // If the first row's first cell cannot be parsed as a timestamp it is treated // as a header row and columns are mapped by name (case-insensitive): // // timestamp / time / date – item timestamp (required) // description / desc – item text content (required) // type – item classification (message, email, note, // social, location, media, event, document, // bookmark; defaults to note) // sender – entity credited as the item's owner/sender // receiver / recipient – entity the item was sent to // latitude / lat – decimal latitude // longitude / lon / lng – decimal longitude // tags – comma-separated tag strings // // Without a header row, columns are positional: timestamp, description, // latitude, longitude. package genericcsv import ( "context" "encoding/csv" "errors" "fmt" "io" "strconv" "strings" "time" "github.com/timelinize/timelinize/timeline" "go.uber.org/zap" ) func init() { err := timeline.RegisterDataSource(timeline.DataSource{ Name: "generic_csv", Title: "Generic CSV", Description: "A CSV file with timestamp, description, and optional type, sender, receiver, latitude, longitude, and tags columns. File must be named SHOULD_IMPORT_GENERIC.csv.", NewFileImporter: func() timeline.FileImporter { return new(Importer) }, }) if err != nil { timeline.Log.Fatal("registering data source", zap.Error(err)) } } // Importer imports data from a generic CSV file. type Importer struct{} const targetFilename = "SHOULD_IMPORT_GENERIC.csv" // Recognize returns whether the input is a recognized generic CSV file. func (Importer) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) { if strings.EqualFold(dirEntry.Name(), targetFilename) { return timeline.Recognition{Confidence: 1}, nil } return timeline.Recognition{}, nil } // noCol is the sentinel value for a column that is absent from the file. const noCol = -1 // columnMap holds the index of each known column, or noCol if absent. type columnMap struct { timestamp int description int itemType int sender int receiver int latitude int longitude int tags int } // buildColumnMap maps header names to column indices. func buildColumnMap(headers []string) columnMap { m := columnMap{ timestamp: noCol, description: noCol, itemType: noCol, sender: noCol, receiver: noCol, latitude: noCol, longitude: noCol, tags: noCol, } for i, h := range headers { switch strings.ToLower(strings.TrimSpace(h)) { case "timestamp", "time", "date": m.timestamp = i case "description", "desc": m.description = i case "type": m.itemType = i case "sender": m.sender = i case "receiver", "recipient": m.receiver = i case "latitude", "lat": m.latitude = i case "longitude", "lon", "lng": m.longitude = i case "tags": m.tags = i } } return m } // positionalColumnMap returns a column map for files without a header row, // preserving the original column ordering. func positionalColumnMap() columnMap { return columnMap{ timestamp: 0, description: 1, latitude: 2, longitude: 3, itemType: noCol, sender: noCol, receiver: noCol, tags: noCol, } } // col safely returns rec[idx] trimmed, or "" if idx is noCol or out of bounds. func col(rec []string, idx int) string { if idx == noCol || idx >= len(rec) { return "" } return strings.TrimSpace(rec[idx]) } // FileImport imports items from the CSV file, one item per row. func (imp *Importer) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error { f, err := dirEntry.FS.Open(dirEntry.Filename) if err != nil { return fmt.Errorf("opening file: %w", err) } defer f.Close() r := csv.NewReader(f) r.FieldsPerRecord = -1 // Read the first row to decide whether it is a header or a data row. firstRow, err := r.Read() if errors.Is(err, io.EOF) { return nil } if err != nil { return fmt.Errorf("reading first row: %w", err) } var cols columnMap firstRowIsData := false if len(firstRow) > 0 { if _, tsErr := parseTimestamp(strings.TrimSpace(firstRow[0])); tsErr != nil { // First cell is not a timestamp — treat the row as a header. cols = buildColumnMap(firstRow) } else { // First cell is a valid timestamp — no header row. cols = positionalColumnMap() firstRowIsData = true } } else { cols = positionalColumnMap() } if cols.timestamp == noCol { return fmt.Errorf("no timestamp column found in header") } if cols.description == noCol { return fmt.Errorf("no description column found in header") } lineNum := 1 // If the first row was data, process it before entering the main loop. if firstRowIsData { if g := buildGraph(firstRow, cols, lineNum, params.Log); g != nil { params.Pipeline <- g } } for { if err := ctx.Err(); err != nil { return err } lineNum++ rec, err := r.Read() if errors.Is(err, io.EOF) { break } if err != nil { return fmt.Errorf("reading CSV at line %d: %w", lineNum, err) } if g := buildGraph(rec, cols, lineNum, params.Log); g != nil { params.Pipeline <- g } } return nil } // buildGraph converts a CSV row into a timeline Graph, or returns nil if the // row should be skipped (e.g. unparseable timestamp). func buildGraph(rec []string, cols columnMap, lineNum int, log *zap.Logger) *timeline.Graph { tsStr := col(rec, cols.timestamp) ts, err := parseTimestamp(tsStr) if err != nil { log.Warn("skipping row with unparseable timestamp", zap.Int("line", lineNum), zap.String("value", tsStr), zap.Error(err), ) return nil } item := &timeline.Item{ Classification: classificationFromString(col(rec, cols.itemType)), Timestamp: ts, Content: timeline.ItemData{ Data: timeline.StringData(col(rec, cols.description)), MediaType: "text/plain", }, } // Sender → Item.Owner if senderVal := col(rec, cols.sender); senderVal != "" { item.Owner = entityFromName(senderVal) } // Latitude + Longitude latStr := col(rec, cols.latitude) lonStr := col(rec, cols.longitude) if latStr != "" && lonStr != "" { lat, latErr := strconv.ParseFloat(latStr, 64) lon, lonErr := strconv.ParseFloat(lonStr, 64) if latErr == nil && lonErr == nil { item.Location = timeline.Location{ Latitude: &lat, Longitude: &lon, } } else if latErr != nil { log.Warn("ignoring unparseable latitude", zap.Int("line", lineNum), zap.String("value", latStr), zap.Error(latErr), ) } else { log.Warn("ignoring unparseable longitude", zap.Int("line", lineNum), zap.String("value", lonStr), zap.Error(lonErr), ) } } // Tags → "Tags" metadata key (comma-separated string as-is from the column) if tagsVal := col(rec, cols.tags); tagsVal != "" { item.Metadata = timeline.Metadata{"Tags": tagsVal} } g := &timeline.Graph{Item: item} // Receiver → RelSent relationship if receiverVal := col(rec, cols.receiver); receiverVal != "" { receiver := entityFromName(receiverVal) g.ToEntity(timeline.RelSent, &receiver) } return g } // classificationFromString maps a type string to the corresponding // Classification. Defaults to ClassNote for unknown or empty values. func classificationFromString(s string) timeline.Classification { switch strings.ToLower(s) { case "message": return timeline.ClassMessage case "email": return timeline.ClassEmail case "social": return timeline.ClassSocial case "location": return timeline.ClassLocation case "media": return timeline.ClassMedia case "event": return timeline.ClassEvent case "document": return timeline.ClassDocument case "bookmark": return timeline.ClassBookmark default: return timeline.ClassNote } } // entityFromName creates an Entity identified by name, suitable for use as a // sender or receiver in a generic CSV import. func entityFromName(name string) timeline.Entity { return timeline.Entity{ Name: name, Attributes: []timeline.Attribute{ { Name: "generic_csv_name", Value: name, Identity: true, }, }, } } // timestampFormats lists the timestamp formats tried in order of specificity. var timestampFormats = []string{ time.RFC3339Nano, time.RFC3339, "2006-01-02T15:04:05", "2006-01-02 15:04:05", "2006-01-02T15:04", "2006-01-02 15:04", "2006-01-02", "01/02/2006 15:04:05", "01/02/2006", } func parseTimestamp(s string) (time.Time, error) { for _, format := range timestampFormats { if t, err := time.Parse(format, s); err == nil { return t, nil } } return time.Time{}, fmt.Errorf("unrecognized timestamp format: %q", s) }