344 lines
9.5 KiB
Go
344 lines
9.5 KiB
Go
/*
|
||
Timelinize
|
||
Copyright (c) 2013 Matthew Holt
|
||
|
||
This program is free software: you can redistribute it and/or modify
|
||
it under the terms of the GNU Affero General Public License as published
|
||
by the Free Software Foundation, either version 3 of the License, or
|
||
(at your option) any later version.
|
||
|
||
This program is distributed in the hope that it will be useful,
|
||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
GNU Affero General Public License for more details.
|
||
|
||
You should have received a copy of the GNU Affero General Public License
|
||
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
||
*/
|
||
|
||
// Package genericcsv implements a data source that imports simple CSV files.
// Files must be named SHOULD_IMPORT_GENERIC.csv (matched case-insensitively)
// to be recognized.
//
// If the first row's first cell cannot be parsed as a timestamp, the row is
// treated as a header row and columns are mapped by name (case-insensitive):
//
//	timestamp / time / date  – item timestamp (required)
//	description / desc       – item text content (required)
//	type                     – item classification (message, email, note,
//	                           social, location, media, event, document,
//	                           bookmark; defaults to note)
//	sender                   – entity credited as the item's owner/sender
//	receiver / recipient     – entity the item was sent to
//	latitude / lat           – decimal latitude
//	longitude / lon / lng    – decimal longitude
//	tags                     – comma-separated tag strings
//
// Without a header row, columns are positional: timestamp, description,
// latitude, longitude.
|
||
package genericcsv
|
||
|
||
import (
|
||
"context"
|
||
"encoding/csv"
|
||
"errors"
|
||
"fmt"
|
||
"io"
|
||
"strconv"
|
||
"strings"
|
||
"time"
|
||
|
||
"github.com/timelinize/timelinize/timeline"
|
||
"go.uber.org/zap"
|
||
)
|
||
|
||
func init() {
|
||
err := timeline.RegisterDataSource(timeline.DataSource{
|
||
Name: "generic_csv",
|
||
Title: "Generic CSV",
|
||
Description: "A CSV file with timestamp, description, and optional type, sender, receiver, latitude, longitude, and tags columns. File must be named SHOULD_IMPORT_GENERIC.csv.",
|
||
NewFileImporter: func() timeline.FileImporter { return new(Importer) },
|
||
})
|
||
if err != nil {
|
||
timeline.Log.Fatal("registering data source", zap.Error(err))
|
||
}
|
||
}
|
||
|
||
// Importer imports data from a generic CSV file.
|
||
type Importer struct{}
|
||
|
||
const targetFilename = "SHOULD_IMPORT_GENERIC.csv"
|
||
|
||
// Recognize returns whether the input is a recognized generic CSV file.
|
||
func (Importer) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
|
||
if strings.EqualFold(dirEntry.Name(), targetFilename) {
|
||
return timeline.Recognition{Confidence: 1}, nil
|
||
}
|
||
return timeline.Recognition{}, nil
|
||
}
|
||
|
||
// noCol is the sentinel value for a column that is absent from the file.
const noCol = -1

// columnMap holds the zero-based index of each known column within a CSV
// record, or noCol if the column is absent.
type columnMap struct {
	timestamp   int
	description int
	itemType    int
	sender      int
	receiver    int
	latitude    int
	longitude   int
	tags        int
}

// buildColumnMap maps header names to column indices.
//
// Header names are matched after trimming surrounding whitespace and
// lower-casing. Unknown headers are ignored, and columns that never appear
// stay at noCol. If several headers name the same column (for example both
// "time" and "date"), the right-most one wins.
func buildColumnMap(headers []string) columnMap {
	m := columnMap{
		timestamp: noCol, description: noCol, itemType: noCol,
		sender: noCol, receiver: noCol, latitude: noCol, longitude: noCol, tags: noCol,
	}
	for i, h := range headers {
		if i == 0 {
			// A UTF-8 byte order mark at the start of the file ends up glued
			// to the first header cell. TrimSpace does not remove it (U+FEFF
			// is not whitespace), so strip it explicitly or the first column
			// would never be recognized.
			h = strings.TrimPrefix(h, "\ufeff")
		}
		switch strings.ToLower(strings.TrimSpace(h)) {
		case "timestamp", "time", "date":
			m.timestamp = i
		case "description", "desc":
			m.description = i
		case "type":
			m.itemType = i
		case "sender":
			m.sender = i
		case "receiver", "recipient":
			m.receiver = i
		case "latitude", "lat":
			m.latitude = i
		case "longitude", "lon", "lng":
			m.longitude = i
		case "tags":
			m.tags = i
		}
	}
	return m
}

// positionalColumnMap returns the column map used for files without a header
// row: timestamp, description, latitude and longitude in columns 0 through 3,
// and every other column absent.
func positionalColumnMap() columnMap {
	return columnMap{
		timestamp: 0, description: 1, latitude: 2, longitude: 3,
		itemType: noCol, sender: noCol, receiver: noCol, tags: noCol,
	}
}
|
||
|
||
// col safely returns rec[idx] with surrounding whitespace trimmed, or "" if
// idx does not address a field of rec. That covers the noCol sentinel, any
// other negative index, and indices at or beyond len(rec), so short rows and
// absent columns never cause a panic.
func col(rec []string, idx int) string {
	if idx < 0 || idx >= len(rec) {
		return ""
	}
	return strings.TrimSpace(rec[idx])
}
|
||
|
||
// FileImport imports items from the CSV file, one item per row.
|
||
func (imp *Importer) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
|
||
f, err := dirEntry.FS.Open(dirEntry.Filename)
|
||
if err != nil {
|
||
return fmt.Errorf("opening file: %w", err)
|
||
}
|
||
defer f.Close()
|
||
|
||
r := csv.NewReader(f)
|
||
r.FieldsPerRecord = -1
|
||
|
||
// Read the first row to decide whether it is a header or a data row.
|
||
firstRow, err := r.Read()
|
||
if errors.Is(err, io.EOF) {
|
||
return nil
|
||
}
|
||
if err != nil {
|
||
return fmt.Errorf("reading first row: %w", err)
|
||
}
|
||
|
||
var cols columnMap
|
||
firstRowIsData := false
|
||
|
||
if len(firstRow) > 0 {
|
||
if _, tsErr := parseTimestamp(strings.TrimSpace(firstRow[0])); tsErr != nil {
|
||
// First cell is not a timestamp — treat the row as a header.
|
||
cols = buildColumnMap(firstRow)
|
||
} else {
|
||
// First cell is a valid timestamp — no header row.
|
||
cols = positionalColumnMap()
|
||
firstRowIsData = true
|
||
}
|
||
} else {
|
||
cols = positionalColumnMap()
|
||
}
|
||
|
||
if cols.timestamp == noCol {
|
||
return fmt.Errorf("no timestamp column found in header")
|
||
}
|
||
if cols.description == noCol {
|
||
return fmt.Errorf("no description column found in header")
|
||
}
|
||
|
||
lineNum := 1
|
||
|
||
// If the first row was data, process it before entering the main loop.
|
||
if firstRowIsData {
|
||
if g := buildGraph(firstRow, cols, lineNum, params.Log); g != nil {
|
||
params.Pipeline <- g
|
||
}
|
||
}
|
||
|
||
for {
|
||
if err := ctx.Err(); err != nil {
|
||
return err
|
||
}
|
||
|
||
lineNum++
|
||
rec, err := r.Read()
|
||
if errors.Is(err, io.EOF) {
|
||
break
|
||
}
|
||
if err != nil {
|
||
return fmt.Errorf("reading CSV at line %d: %w", lineNum, err)
|
||
}
|
||
|
||
if g := buildGraph(rec, cols, lineNum, params.Log); g != nil {
|
||
params.Pipeline <- g
|
||
}
|
||
}
|
||
|
||
return nil
|
||
}
|
||
|
||
// buildGraph converts a CSV row into a timeline Graph, or returns nil if the
|
||
// row should be skipped (e.g. unparseable timestamp).
|
||
func buildGraph(rec []string, cols columnMap, lineNum int, log *zap.Logger) *timeline.Graph {
|
||
tsStr := col(rec, cols.timestamp)
|
||
ts, err := parseTimestamp(tsStr)
|
||
if err != nil {
|
||
log.Warn("skipping row with unparseable timestamp",
|
||
zap.Int("line", lineNum),
|
||
zap.String("value", tsStr),
|
||
zap.Error(err),
|
||
)
|
||
return nil
|
||
}
|
||
|
||
item := &timeline.Item{
|
||
Classification: classificationFromString(col(rec, cols.itemType)),
|
||
Timestamp: ts,
|
||
Content: timeline.ItemData{
|
||
Data: timeline.StringData(col(rec, cols.description)),
|
||
MediaType: "text/plain",
|
||
},
|
||
}
|
||
|
||
// Sender → Item.Owner
|
||
if senderVal := col(rec, cols.sender); senderVal != "" {
|
||
item.Owner = entityFromName(senderVal)
|
||
}
|
||
|
||
// Latitude + Longitude
|
||
latStr := col(rec, cols.latitude)
|
||
lonStr := col(rec, cols.longitude)
|
||
if latStr != "" && lonStr != "" {
|
||
lat, latErr := strconv.ParseFloat(latStr, 64)
|
||
lon, lonErr := strconv.ParseFloat(lonStr, 64)
|
||
if latErr == nil && lonErr == nil {
|
||
item.Location = timeline.Location{
|
||
Latitude: &lat,
|
||
Longitude: &lon,
|
||
}
|
||
} else if latErr != nil {
|
||
log.Warn("ignoring unparseable latitude",
|
||
zap.Int("line", lineNum),
|
||
zap.String("value", latStr),
|
||
zap.Error(latErr),
|
||
)
|
||
} else {
|
||
log.Warn("ignoring unparseable longitude",
|
||
zap.Int("line", lineNum),
|
||
zap.String("value", lonStr),
|
||
zap.Error(lonErr),
|
||
)
|
||
}
|
||
}
|
||
|
||
// Tags → "Tags" metadata key (comma-separated string as-is from the column)
|
||
if tagsVal := col(rec, cols.tags); tagsVal != "" {
|
||
item.Metadata = timeline.Metadata{"Tags": tagsVal}
|
||
}
|
||
|
||
g := &timeline.Graph{Item: item}
|
||
|
||
// Receiver → RelSent relationship
|
||
if receiverVal := col(rec, cols.receiver); receiverVal != "" {
|
||
receiver := entityFromName(receiverVal)
|
||
g.ToEntity(timeline.RelSent, &receiver)
|
||
}
|
||
|
||
return g
|
||
}
|
||
|
||
// classificationFromString maps a type string to the corresponding
|
||
// Classification. Defaults to ClassNote for unknown or empty values.
|
||
func classificationFromString(s string) timeline.Classification {
|
||
switch strings.ToLower(s) {
|
||
case "message":
|
||
return timeline.ClassMessage
|
||
case "email":
|
||
return timeline.ClassEmail
|
||
case "social":
|
||
return timeline.ClassSocial
|
||
case "location":
|
||
return timeline.ClassLocation
|
||
case "media":
|
||
return timeline.ClassMedia
|
||
case "event":
|
||
return timeline.ClassEvent
|
||
case "document":
|
||
return timeline.ClassDocument
|
||
case "bookmark":
|
||
return timeline.ClassBookmark
|
||
default:
|
||
return timeline.ClassNote
|
||
}
|
||
}
|
||
|
||
// entityFromName creates an Entity identified by name, suitable for use as a
|
||
// sender or receiver in a generic CSV import.
|
||
func entityFromName(name string) timeline.Entity {
|
||
return timeline.Entity{
|
||
Name: name,
|
||
Attributes: []timeline.Attribute{
|
||
{
|
||
Name: "generic_csv_name",
|
||
Value: name,
|
||
Identity: true,
|
||
},
|
||
},
|
||
}
|
||
}
|
||
|
||
// timestampFormats holds the layouts parseTimestamp attempts, in the order
// they are tried: zone-qualified RFC 3339 forms first, then ISO-style
// date-times with decreasing precision, then month/day/year forms.
var timestampFormats = []string{
	time.RFC3339Nano,
	time.RFC3339,
	"2006-01-02T15:04:05",
	"2006-01-02 15:04:05",
	"2006-01-02T15:04",
	"2006-01-02 15:04",
	"2006-01-02",
	"01/02/2006 15:04:05",
	"01/02/2006",
}

// parseTimestamp returns the time produced by the first layout in
// timestampFormats that accepts s. If no layout matches, it returns the zero
// time and an error quoting the input. The input is not trimmed here.
func parseTimestamp(s string) (time.Time, error) {
	for i := range timestampFormats {
		parsed, err := time.Parse(timestampFormats[i], s)
		if err != nil {
			continue
		}
		return parsed, nil
	}
	return time.Time{}, fmt.Errorf("unrecognized timestamp format: %q", s)
}
|