1
0
Fork 0
timelinize/datasources/genericcsv/genericcsv.go
2026-02-24 03:48:13 +00:00

344 lines
9.5 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Package genericcsv implements a data source that imports simple CSV files.
// Files must be named SHOULD_IMPORT_GENERIC.csv (compared case-insensitively)
// to be recognized.
//
// If the first cell of the first row cannot be parsed as a timestamp, that row
// is treated as a header row and columns are mapped by name (case-insensitive):
//
//	timestamp / time / date    item timestamp (required)
//	description / desc         item text content (required)
//	type                       item classification (message, email, note,
//	                           social, location, media, event, document,
//	                           bookmark; defaults to note)
//	sender                     entity credited as the item's owner/sender
//	receiver / recipient       entity the item was sent to
//	latitude / lat             decimal latitude
//	longitude / lon / lng      decimal longitude
//	tags                       comma-separated tag strings
//
// Without a header row, columns are positional: timestamp, description,
// latitude, longitude.
package genericcsv
import (
"context"
"encoding/csv"
"errors"
"fmt"
"io"
"strconv"
"strings"
"time"
"github.com/timelinize/timelinize/timeline"
"go.uber.org/zap"
)
// init registers the generic CSV data source with the timeline package. A
// registration error is reported through timeline.Log at fatal level.
func init() {
	source := timeline.DataSource{
		Name:            "generic_csv",
		Title:           "Generic CSV",
		Description:     "A CSV file with timestamp, description, and optional type, sender, receiver, latitude, longitude, and tags columns. File must be named SHOULD_IMPORT_GENERIC.csv.",
		NewFileImporter: func() timeline.FileImporter { return new(Importer) },
	}
	if err := timeline.RegisterDataSource(source); err != nil {
		timeline.Log.Fatal("registering data source", zap.Error(err))
	}
}
// Importer imports data from a generic CSV file.
type Importer struct{}

// targetFilename is the file name Recognize looks for.
const targetFilename = "SHOULD_IMPORT_GENERIC.csv"

// Recognize reports a confidence of 1 when the entry's name equals
// targetFilename under case-insensitive comparison, and a zero-value
// Recognition otherwise. It never returns an error.
func (Importer) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
	var result timeline.Recognition
	if strings.EqualFold(dirEntry.Name(), targetFilename) {
		result.Confidence = 1
	}
	return result, nil
}
// noCol marks a column that does not exist in the file.
const noCol = -1

// columnMap records, for every known column, its zero-based position within a
// row, or noCol when the file does not provide that column.
type columnMap struct {
	timestamp   int
	description int
	itemType    int
	sender      int
	receiver    int
	latitude    int
	longitude   int
	tags        int
}

// absentColumnMap returns a columnMap in which every column is noCol.
func absentColumnMap() columnMap {
	return columnMap{
		timestamp:   noCol,
		description: noCol,
		itemType:    noCol,
		sender:      noCol,
		receiver:    noCol,
		latitude:    noCol,
		longitude:   noCol,
		tags:        noCol,
	}
}

// buildColumnMap derives a columnMap from a header row. Header cells are
// trimmed and lower-cased before being compared against the known names and
// aliases; unrecognized headers are ignored. When several headers resolve to
// the same column, the right-most one wins.
func buildColumnMap(headers []string) columnMap {
	result := absentColumnMap()

	// Every accepted header spelling points at the field it populates.
	fieldByHeader := map[string]*int{
		"timestamp":   &result.timestamp,
		"time":        &result.timestamp,
		"date":        &result.timestamp,
		"description": &result.description,
		"desc":        &result.description,
		"type":        &result.itemType,
		"sender":      &result.sender,
		"receiver":    &result.receiver,
		"recipient":   &result.receiver,
		"latitude":    &result.latitude,
		"lat":         &result.latitude,
		"longitude":   &result.longitude,
		"lon":         &result.longitude,
		"lng":         &result.longitude,
		"tags":        &result.tags,
	}

	for pos, header := range headers {
		key := strings.ToLower(strings.TrimSpace(header))
		if field, known := fieldByHeader[key]; known {
			*field = pos
		}
	}
	return result
}

// positionalColumnMap returns the layout assumed for files that have no header
// row: timestamp, description, latitude, longitude in columns 0 through 3, and
// every other column absent.
func positionalColumnMap() columnMap {
	layout := absentColumnMap()
	layout.timestamp = 0
	layout.description = 1
	layout.latitude = 2
	layout.longitude = 3
	return layout
}
// col returns the whitespace-trimmed field at position idx of rec. It returns
// "" when idx is negative (which covers the "column absent" sentinel -1) or
// when the row has fewer than idx+1 fields, so short rows and missing columns
// never cause an out-of-range panic.
func col(rec []string, idx int) string {
	if idx < 0 || idx >= len(rec) {
		return ""
	}
	return strings.TrimSpace(rec[idx])
}
// FileImport imports items from the CSV file, one item per row, sending a
// graph for each usable row on params.Pipeline.
//
// A leading UTF-8 byte order mark is skipped before parsing; otherwise it
// would become part of the first cell and defeat both the timestamp check and
// the header-name lookup. The first row then decides the layout: if its first
// cell parses as a timestamp the file has no header and the positional layout
// is used (and that row is imported as data); otherwise the row is read as a
// header and columns are mapped by name.
//
// It returns nil for an empty file, an error if the file cannot be opened or
// read, if a header lacks a timestamp or description column, if the CSV is
// malformed, or if ctx is done. Rows that buildGraph rejects are skipped.
func (imp *Importer) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
	f, err := dirEntry.FS.Open(dirEntry.Filename)
	if err != nil {
		return fmt.Errorf("opening file: %w", err)
	}
	defer f.Close()

	src, err := skipUTF8BOM(f)
	if err != nil {
		return fmt.Errorf("reading file: %w", err)
	}

	r := csv.NewReader(src)
	r.FieldsPerRecord = -1 // rows may carry differing numbers of fields

	// Read the first row to decide whether it is a header or a data row.
	firstRow, err := r.Read()
	if errors.Is(err, io.EOF) {
		return nil
	}
	if err != nil {
		return fmt.Errorf("reading first row: %w", err)
	}

	cols := positionalColumnMap()
	firstRowIsData := false
	if len(firstRow) > 0 {
		if _, tsErr := parseTimestamp(strings.TrimSpace(firstRow[0])); tsErr != nil {
			// First cell is not a timestamp, so treat the row as a header.
			cols = buildColumnMap(firstRow)
		} else {
			// First cell is a valid timestamp, so there is no header row.
			firstRowIsData = true
		}
	}

	if cols.timestamp == noCol {
		return errors.New("no timestamp column found in header")
	}
	if cols.description == noCol {
		return errors.New("no description column found in header")
	}

	// lineNum counts records, starting at 1 for the first row of the file.
	lineNum := 1

	// If the first row was data, process it before entering the main loop.
	if firstRowIsData {
		if g := buildGraph(firstRow, cols, lineNum, params.Log); g != nil {
			params.Pipeline <- g
		}
	}

	for {
		if err := ctx.Err(); err != nil {
			return err
		}

		lineNum++
		rec, err := r.Read()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return fmt.Errorf("reading CSV at line %d: %w", lineNum, err)
		}

		if g := buildGraph(rec, cols, lineNum, params.Log); g != nil {
			params.Pipeline <- g
		}
	}

	return nil
}

// skipUTF8BOM returns a reader that yields the contents of r without a leading
// UTF-8 byte order mark. If r does not start with one, the bytes consumed
// while checking are placed back in front of the stream.
func skipUTF8BOM(r io.Reader) (io.Reader, error) {
	const bom = "\xef\xbb\xbf"

	var head [len(bom)]byte
	n, err := io.ReadFull(r, head[:])
	// EOF and ErrUnexpectedEOF only mean the input is shorter than a BOM.
	if err != nil && !errors.Is(err, io.EOF) && !errors.Is(err, io.ErrUnexpectedEOF) {
		return nil, err
	}
	if string(head[:n]) == bom {
		return r, nil
	}
	return io.MultiReader(strings.NewReader(string(head[:n])), r), nil
}
// buildGraph converts a CSV row into a timeline Graph, or returns nil if the
// row should be skipped because its timestamp cannot be parsed.
//
// rec is the row, cols says where each known column lives, lineNum is used
// only in log messages, and log receives a warning for every value that had
// to be ignored.
//
// A location is attached only when both latitude and longitude are present
// and valid; if either is invalid, every invalid coordinate is logged and the
// item is kept without a location.
func buildGraph(rec []string, cols columnMap, lineNum int, log *zap.Logger) *timeline.Graph {
	tsStr := col(rec, cols.timestamp)
	ts, err := parseTimestamp(tsStr)
	if err != nil {
		log.Warn("skipping row with unparseable timestamp",
			zap.Int("line", lineNum),
			zap.String("value", tsStr),
			zap.Error(err),
		)
		return nil
	}

	item := &timeline.Item{
		Classification: classificationFromString(col(rec, cols.itemType)),
		Timestamp:      ts,
		Content: timeline.ItemData{
			Data:      timeline.StringData(col(rec, cols.description)),
			MediaType: "text/plain",
		},
	}

	// Sender -> Item.Owner
	if senderVal := col(rec, cols.sender); senderVal != "" {
		item.Owner = entityFromName(senderVal)
	}

	// Latitude + Longitude
	latStr := col(rec, cols.latitude)
	lonStr := col(rec, cols.longitude)
	if latStr != "" && lonStr != "" {
		lat, latErr := parseCoordinate(latStr, 90)
		lon, lonErr := parseCoordinate(lonStr, 180)
		if latErr == nil && lonErr == nil {
			item.Location = timeline.Location{
				Latitude:  &lat,
				Longitude: &lon,
			}
		}
		// Report each bad coordinate so that neither problem is hidden
		// behind the other.
		if latErr != nil {
			log.Warn("ignoring invalid latitude",
				zap.Int("line", lineNum),
				zap.String("value", latStr),
				zap.Error(latErr),
			)
		}
		if lonErr != nil {
			log.Warn("ignoring invalid longitude",
				zap.Int("line", lineNum),
				zap.String("value", lonStr),
				zap.Error(lonErr),
			)
		}
	}

	// Tags -> "Tags" metadata key (comma-separated string as-is from the column)
	if tagsVal := col(rec, cols.tags); tagsVal != "" {
		item.Metadata = timeline.Metadata{"Tags": tagsVal}
	}

	g := &timeline.Graph{Item: item}

	// Receiver -> RelSent relationship
	if receiverVal := col(rec, cols.receiver); receiverVal != "" {
		receiver := entityFromName(receiverVal)
		g.ToEntity(timeline.RelSent, &receiver)
	}

	return g
}

// parseCoordinate parses s as a decimal number of degrees and checks that it
// lies within [-limit, limit].
func parseCoordinate(s string, limit float64) (float64, error) {
	v, err := strconv.ParseFloat(s, 64)
	if err != nil {
		return 0, err
	}
	// strconv.ParseFloat accepts "NaN" and "Inf". Written as a negated
	// in-range test, this check rejects NaN too, because NaN compares false
	// against every bound.
	if !(v >= -limit && v <= limit) {
		return 0, fmt.Errorf("value %v is outside the range [%v, %v]", v, -limit, limit)
	}
	return v, nil
}
// classificationFromString translates a type name into its Classification.
// Matching ignores letter case. Any value without a dedicated case, including
// the empty string and "note" itself, yields ClassNote.
func classificationFromString(s string) timeline.Classification {
	var class timeline.Classification

	switch kind := strings.ToLower(s); kind {
	case "message":
		class = timeline.ClassMessage
	case "email":
		class = timeline.ClassEmail
	case "social":
		class = timeline.ClassSocial
	case "location":
		class = timeline.ClassLocation
	case "media":
		class = timeline.ClassMedia
	case "event":
		class = timeline.ClassEvent
	case "document":
		class = timeline.ClassDocument
	case "bookmark":
		class = timeline.ClassBookmark
	default:
		class = timeline.ClassNote
	}

	return class
}
// entityFromName builds the Entity used for a sender or receiver named in a
// generic CSV file. The entity carries the given name plus a single identity
// attribute, "generic_csv_name", whose value is that same name.
func entityFromName(name string) timeline.Entity {
	identity := timeline.Attribute{
		Name:     "generic_csv_name",
		Value:    name,
		Identity: true,
	}

	var entity timeline.Entity
	entity.Name = name
	entity.Attributes = []timeline.Attribute{identity}
	return entity
}
// timestampFormats holds the layouts parseTimestamp attempts, from the most
// specific to the least specific.
var timestampFormats = []string{
	// RFC 3339, with and without fractional seconds.
	time.RFC3339Nano,
	time.RFC3339,
	// ISO-like date and time without a zone, "T" or space separated.
	"2006-01-02T15:04:05",
	"2006-01-02 15:04:05",
	"2006-01-02T15:04",
	"2006-01-02 15:04",
	// Date only.
	"2006-01-02",
	// Month/day/year, with and without a time of day.
	"01/02/2006 15:04:05",
	"01/02/2006",
}

// parseTimestamp parses s using the first layout in timestampFormats that
// accepts it. Layouts that carry no zone information produce a time in UTC,
// as time.Parse documents. If no layout matches, it returns the zero time and
// an error that quotes the rejected input.
func parseTimestamp(s string) (time.Time, error) {
	for i := range timestampFormats {
		parsed, err := time.Parse(timestampFormats[i], s)
		if err != nil {
			continue
		}
		return parsed, nil
	}
	return time.Time{}, fmt.Errorf("unrecognized timestamp format: %q", s)
}