1
0
Fork 0
timelinize/datasources/geojson/geojson.go
Matthew Holt e44daa85df
Refactor location processing options
Add clustering strength parameter
2025-09-20 22:18:40 -06:00

512 lines
14 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Package geojson implements a data source for GeoJSON data (RFC 7946): https://geojson.org/
package geojson
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"io/fs"
"math"
"path"
"strings"
"time"
"github.com/timelinize/timelinize/datasources/googlelocation"
"github.com/timelinize/timelinize/timeline"
"go.uber.org/zap"
)
// NOTE: This is very similar to the kmlgx importer, except it's JSON.
// Almost all other code is nearly identical.
// init registers the GeoJSON data source with the timeline engine.
// Registration failure is fatal because the program cannot meaningfully
// continue with a misconfigured data source.
func init() {
	if err := timeline.RegisterDataSource(timeline.DataSource{
		Name:            "geojson",
		Title:           "GeoJSON",
		Icon:            "geojson.svg",
		Description:     "GeoJSON files containing a collection of points",
		NewOptions:      func() any { return new(Options) },
		NewFileImporter: func() timeline.FileImporter { return new(FileImporter) },
	}); err != nil {
		timeline.Log.Fatal("registering data source", zap.Error(err))
	}
}
// Options configures the data source.
type Options struct {
	// The ID of the owner entity. REQUIRED for linking entity in DB.
	// Every imported item is attributed to this entity (see FileImport).
	// TODO: maybe an attribute ID instead, in case the data represents multiple people
	OwnerEntityID uint64 `json:"owner_entity_id"`

	// If true, coordinate arrays beyond 2 elements will attempt
	// to be decoded in non-spec-compliant ways, which is useful
	// if the source data is non-compliant. If the optional 3rd
	// element is too big for altitude, it will be tried as
	// Unix timestamp (seconds); and if a fourth element exists,
	// both third and fourth elements will be tried as timestamp
	// or altitude, depending on their magnitude. See #23.
	Lenient bool `json:"lenient,omitempty"`

	// Options specific to the location processor
	// (embedded; e.g. clustering/simplification settings).
	googlelocation.LocationProcessingOptions
}
// FileImporter implements the timeline.FileImporter interface.
// It is stateless; all per-import state lives in FileImport's locals.
type FileImporter struct{}
// Recognize returns whether the file is supported by this data source.
// Recognition is purely by the ".geojson" file extension (case-insensitive).
func (FileImporter) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
	rec := timeline.Recognition{DirThreshold: .9}

	// directories can be imported, but the import planner decides that;
	// we only recognize individual files here
	if dirEntry.IsDir() {
		return rec, nil
	}

	// match on file extension, ignoring case
	if strings.EqualFold(path.Ext(dirEntry.Name()), ".geojson") {
		rec.Confidence = 1
	}

	return rec, nil
}
// FileImport conducts an import of the data from a file (or a directory,
// which is walked recursively for .geojson files). Each file is streamed
// through a location processor to clean up noisy raw points, and each
// resulting location is sent down the pipeline as a timeline item
// attributed to the configured owner entity.
func (fi *FileImporter) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
	// NOTE(review): this asserts options are always *Options; presumably the
	// framework guarantees non-nil options of the registered type — confirm
	dsOpt := params.DataSourceOptions.(*Options)

	return fs.WalkDir(dirEntry.FS, dirEntry.Filename, func(fpath string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		// honor context cancellation between entries
		if err := ctx.Err(); err != nil {
			return err
		}
		if strings.HasPrefix(d.Name(), ".") {
			// skip hidden files & folders
			if d.IsDir() {
				return fs.SkipDir
			}
			return nil
		}
		if d.IsDir() {
			return nil // traverse into subdirectories
		}
		// skip unsupported file types
		if ext := strings.ToLower(path.Ext(d.Name())); ext != ".geojson" {
			return nil
		}

		file, err := dirEntry.FS.Open(fpath)
		if err != nil {
			return err
		}
		// defer fires when this walk callback returns, i.e. per-file,
		// so each file is closed before the next one is opened
		defer file.Close()

		// create JSON decoder (wrapped to track some state as it decodes)
		jsonDec := &decoder{Decoder: json.NewDecoder(file), lenient: dsOpt.Lenient}

		// create location processor to clean up any noisy raw data
		locProc, err := googlelocation.NewLocationProcessor(jsonDec, dsOpt.LocationProcessingOptions)
		if err != nil {
			return err
		}

		// iterate each resulting location point and process it as an item
		for {
			l, err := locProc.NextLocation(ctx)
			if err != nil {
				return err
			}
			if l == nil {
				break // processor is drained; done with this file
			}

			feature := l.Original.(feature)

			// use generic properties as metadata
			meta := timeline.Metadata(feature.Properties)
			meta = meta.HumanizeKeys()

			// fill in special-case, standardized metadata using
			// the same keys as with Google Location History
			// (note: set unconditionally, even if the values are zero)
			meta["Velocity"] = feature.velocity
			meta["Heading"] = feature.heading

			// include any metadata added by location processor
			meta.Merge(l.Metadata, timeline.MetaMergeReplace)

			item := &timeline.Item{
				Classification: timeline.ClassLocation,
				Timestamp:      l.Timestamp,
				Timespan:       l.Timespan,
				Location:       l.Location(),
				Owner: timeline.Entity{
					ID: dsOpt.OwnerEntityID,
				},
				Metadata: meta,
			}

			// only emit items within the requested timeframe
			if params.Timeframe.ContainsItem(item, false) {
				params.Pipeline <- &timeline.Graph{Item: item}
			}
		}

		return nil
	})
}
// decoder wraps the JSON decoder to get the next location from the document.
// It tracks nesting state so we can be sure we're in the right part of the structure.
type decoder struct {
	*json.Decoder

	// whether we have reached the opening '[' of the top-level "features" array
	foundFeatures bool

	// whether to attempt non-spec-compliant coordinate decoding (from Options.Lenient)
	lenient bool

	// state to persist as we potentially decode locations in batches, depending on structure
	current   feature    // most recently decoded feature (source of metadata for its positions)
	positions []position // queued positions from a multi-position geometry, drained one per call
}
// NextLocation returns the next available point from the JSON document.
// It returns (nil, nil) once the document is exhausted. (This is the
// iterator method pulled by the location processor; see the call to
// googlelocation.NewLocationProcessor in FileImport.)
func (dec *decoder) NextLocation(ctx context.Context) (*googlelocation.Location, error) {
	for {
		if err := ctx.Err(); err != nil {
			return nil, err
		}

		// if we've already decoded a batch of positions, return the next one until the batch is emptied
		if len(dec.positions) > 0 {
			var pos position
			pos, dec.positions = dec.positions[0], dec.positions[1:]
			return pos.location(dec.current, dec.lenient)
		}

		// if we haven't gotten to the 'features' part of the structure yet, keep going
		if !dec.foundFeatures {
			t, err := dec.Token()
			if errors.Is(err, io.EOF) {
				break // document ended without (more) features
			}
			if err != nil {
				return nil, fmt.Errorf("decoding next JSON token: %w", err)
			}
			// look for the "features" key, then confirm the next token
			// is the '[' that opens its array
			if val, ok := t.(string); ok && val == "features" {
				tkn, err := dec.Token()
				if err != nil {
					return nil, fmt.Errorf("decoding token after features token: %w", err)
				}
				if delim, ok := tkn.(json.Delim); ok && delim == '[' {
					dec.foundFeatures = true
				}
			}
			continue
		}

		// inside the features array: stop when we hit its closing ']'
		if !dec.More() {
			break
		}

		// decode the next feature!
		dec.current = feature{} // reset it since I'm not sure if Decode writes the fields in-place or if it swaps out everything
		if err := dec.Decode(&dec.current); err != nil {
			return nil, fmt.Errorf("invalid GeoJSON feature: %w", err)
		}

		// feature properties are basically arbitrary key-value pairs, but a few common
		// ones exist, such as time, altitude, etc; we extract what we can
		if err := dec.current.extractKnownProperties(); err != nil {
			return nil, fmt.Errorf("reading well-known properties of geojson feature: %w", err)
		}

		switch dec.current.Geometry.Type {
		case "Point":
			// a single position: return it immediately
			var coord position
			if err := json.Unmarshal(dec.current.Geometry.Coordinates, &coord); err != nil {
				return nil, fmt.Errorf("invalid Point coordinates: %w", err)
			}
			return coord.location(dec.current, dec.lenient)
		case "LineString", "MultiPoint":
			// a batch of positions: queue them; the top of the loop
			// drains the queue on this and subsequent calls
			if err := json.Unmarshal(dec.current.Geometry.Coordinates, &dec.positions); err != nil {
				return nil, fmt.Errorf("invalid %s coordinates: %w", dec.current.Geometry.Type, err)
			}
		case "MultiLineString":
			// a batch of batches: flatten into one queue of positions
			var manyPositions [][]position
			if err := json.Unmarshal(dec.current.Geometry.Coordinates, &manyPositions); err != nil {
				return nil, fmt.Errorf("invalid MultiLineString coordinates: %w", err)
			}
			dec.positions = []position{}
			for _, positions := range manyPositions {
				dec.positions = append(dec.positions, positions...)
			}
		default:
			// skip unsupported geometry types (e.g. Polygon, GeometryCollection)
			continue
		}
	}
	return nil, nil
}
// feature is a GeoJSON Feature object (only the members we use).
// See https://datatracker.ietf.org/doc/html/rfc7946#section-3.1
type feature struct {
	Type       string         `json:"type"`
	Properties map[string]any `json:"properties,omitempty"`
	Geometry   struct {
		Type string `json:"type"`
		// Coordinates stays raw because its shape depends on Type
		// (one position for Point, a list for LineString/MultiPoint, etc.);
		// NextLocation unmarshals it per geometry type.
		Coordinates json.RawMessage `json:"coordinates"`
	} `json:"geometry"`

	// certain values extracted (and removed) from "well-known" (obvious or common) keys in Properties
	time     time.Time
	altitude float64 // meters
	accuracy float64 // meters (higher values are less accurate; should probably be called "error" instead)
	velocity float64 // meters per second
	heading  float64 // degrees
}
// extractKnownProperties pulls values from "well-known" (obvious or common)
// keys of the feature's Properties map into the feature's typed fields
// (time, altitude, accuracy, heading, velocity). Recognized properties are
// deleted from the map so they aren't duplicated in generic item metadata.
// It returns an error only if a time property has an unexpected type.
func (f *feature) extractKnownProperties() error {
	if err := f.extractTime(); err != nil {
		return err
	}
	f.extractNumber(&f.altitude, "altitude", "elevation", "height")
	f.extractNumber(&f.accuracy, "accuracy")
	f.extractNumber(&f.heading, "heading", "bearing", "direction")
	f.extractNumber(&f.velocity, "velocity", "speed")
	return nil
}

// extractTime scans well-known time property names in order and sets f.time
// from the first property that can be interpreted as a timestamp, deleting
// that property from the map. String values are tried against several common
// layouts; numeric values are treated as Unix timestamps (see setUnixTime).
// A non-nil value of any other type is an error.
func (f *feature) extractTime() error {
	// common textual timestamp layouts, roughly most- to least-likely
	formats := []string{
		time.RFC3339,
		time.RFC3339Nano,
		time.RFC850,
		time.RFC822,
		time.RFC822Z,
		time.RFC1123,
		time.RFC1123Z,
	}

	for _, propName := range []string{
		"time",
		"timestamp",
		"time_long",
		"datetime",
		"date_time",
	} {
		// stop trying once we've got a time value
		if !f.time.IsZero() {
			break
		}
		switch val := f.Properties[propName].(type) {
		case string:
			for _, format := range formats {
				if t, err := time.Parse(format, val); err == nil {
					f.time = t
					delete(f.Properties, propName)
					break
				}
			}
		case int:
			// NOTE(review): encoding/json decodes numbers in map[string]any
			// as float64, so the int/int64 cases likely only fire if
			// Properties was populated some other way — kept for safety
			f.setUnixTime(float64(val))
			delete(f.Properties, propName)
		case int64:
			f.setUnixTime(float64(val))
			delete(f.Properties, propName)
		case float64:
			f.setUnixTime(val)
			delete(f.Properties, propName)
		case nil:
			// having a time property isn't mandatory, ignore
		default:
			return fmt.Errorf("unexpected type for time property %s: %T", propName, val)
		}
	}
	return nil
}

// setUnixTime interprets v as a Unix timestamp and stores it in f.time:
// seconds (preserving fractional-second precision) if the value is small
// enough, otherwise milliseconds.
func (f *feature) setUnixTime(v float64) {
	// we use this to try to guess whether Unix timestamp may be in seconds or milliseconds
	const year2286ApproxUnixSec = 10000000000

	sec, frac := math.Modf(v)
	if sec < year2286ApproxUnixSec {
		f.time = time.Unix(int64(sec), int64(frac*(1e9)))
	} else {
		f.time = time.UnixMilli(int64(sec)) // we don't store more precise than milliseconds
	}
}

// extractNumber scans propNames in order and assigns the first numeric
// property value found to *dst, deleting that property from the map.
// Scanning stops once *dst holds a nonzero value ("first nonzero wins");
// a property whose value is 0 is still consumed and deleted.
func (f *feature) extractNumber(dst *float64, propNames ...string) {
	for _, propName := range propNames {
		// stop trying once we've got a value
		if *dst != 0 {
			return
		}
		switch val := f.Properties[propName].(type) {
		case int:
			*dst = float64(val)
			delete(f.Properties, propName)
		case int64:
			*dst = float64(val)
			delete(f.Properties, propName)
		case float64:
			*dst = val
			delete(f.Properties, propName)
		}
	}
}
// position is a GeoJSON coordinate array: [longitude, latitude], with an
// optional third element for altitude in meters (though non-compliant data
// sometimes packs other values in; see position.location).
// https://datatracker.ietf.org/doc/html/rfc7946#section-3.1.1
type position []float64
// location converts the position into a *googlelocation.Location, combining
// it with metadata already extracted onto the feature (time, altitude,
// accuracy). In lenient mode, elements beyond [lon, lat] are heuristically
// interpreted as timestamp or altitude; otherwise the optional third
// element is taken as altitude per the GeoJSON spec.
func (p position) location(feat feature, lenient bool) (*googlelocation.Location, error) {
	const minDimensions = 2 // [longitude, latitude] at minimum

	if count := len(p); count < minDimensions {
		return nil, fmt.Errorf("expected at least two values for coordinate, got %d: %+v", count, p)
	}

	// GeoJSON orders coordinates as [lon, lat]
	latE7, err := googlelocation.FloatToIntE7(p[1])
	if err != nil {
		return nil, err
	}
	lonE7, err := googlelocation.FloatToIntE7(p[0])
	if err != nil {
		return nil, err
	}

	altitude, ts := feat.altitude, feat.time

	// the GeoJSON spec advises against supporting more than the optional 3rd
	// element (altitude), but we've seen messy data
	// (https://github.com/timelinize/timelinize/issues/23) where the third
	// element is timestamp and even a fourth element is altitude sometimes
	// (!!)... that blatantly violates the spec, but we can maybe do some
	// basic sanity checks to see if we can assume those values
	switch {
	case len(p) > minDimensions && lenient:
		// non-spec-compliant: guess what the extra elements mean
		const minAltitude, maxAltitude = -100.0, 20000.0 // plausible altitude bounds, meters

		var guessedAltitude float64
		var guessedTime time.Time
		for i := minDimensions; i < len(p); i++ {
			switch {
			case p[i] > maxAltitude:
				// too large for an altitude; treat as a Unix timestamp
				// in seconds (time can occur in any position)
				guessedTime = time.Unix(int64(p[i]), 0)
			case i == minDimensions && p[i] > minAltitude:
				// altitude must be in first position after the coordinates
				guessedAltitude = p[i]
			}
			// other trailing elements can be speed, signal quality, number
			// of satellites, etc., which are not easily detectable
		}

		if altitude == 0 && guessedAltitude != 0 {
			altitude = guessedAltitude
		}
		if ts.IsZero() && !guessedTime.IsZero() {
			ts = guessedTime
		}
	case len(p) > minDimensions && altitude == 0:
		// spec compliant; third element is optional but must be altitude in meters if present
		altitude = p[2]
	}

	return &googlelocation.Location{
		Original:    feat,
		LatitudeE7:  latE7,
		LongitudeE7: lonE7,
		Altitude:    altitude,
		Uncertainty: feat.accuracy,
		Timestamp:   ts,
	}, nil
}