1
0
Fork 0
timelinize/timeline/entities.go
Matthew Holt e7650c784a
Some minor changes
- New config parameter "resume_jobs" which can disable auto-resuming jobs at timeline open. (closes #159)

- Renamed "a" to "app" in one method using "Rename symbol" (not "Change all occurrences"), which surprisingly updated the identifier in ALL methods. That must be new. Anyway, that's the huge diff.

- Minor fix to metadata merge that does a more proper nil check to avoid a panic.

- Changed some omitempty to omitzero
2025-10-22 15:13:32 -06:00

1192 lines
42 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package timeline
import (
"bufio"
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"io"
"mime"
"net/http"
"os"
"os/exec"
"path"
"path/filepath"
"regexp"
"runtime"
"strconv"
"strings"
"sync/atomic"
"time"
"github.com/ttacon/libphonenumber"
"go.uber.org/zap"
"go.uber.org/zap/zapcore"
)
// Entity represents a person. All these fields go into the persons table,
// except for UserID which goes into person_identities and which maps that
// user ID to the person being described by name and birth information.
// Birth information is almost always added manually by users, since
// data sources don't usually provide this, and is used to distinguish
// different persons of the same name, and is totally optional.
// TODO: update godoc
type Entity struct {
// database row IDs
ID uint64 `json:"id,omitempty"`
id *uint64 // temporary holding place for DB value
typeID uint64
Type string `json:"type,omitempty"`
name *string // temporary holding place for DB value
Name string `json:"name,omitempty"`
// To set a profile picture for this entity, set this field.
NewPicture DataFunc `json:"-"`
// Very optional extra information about this entity.
Metadata Metadata `json:"metadata,omitempty"`
Attributes []Attribute `json:"attributes,omitempty"`
// THE FOLLOWING FIELDS ARE NOT TO BE SET BY DATA SOURCES.
// Contains the path to the previously-set profile picture file
// relative to the repo root.
Picture *string `json:"picture,omitempty"`
// Fields below are only for use with search or JSON serialization
JobID *int64 `json:"job_id,omitempty"`
Stored time.Time `json:"stored,omitzero"`
Modified *time.Time `json:"modified,omitempty"`
}
func (e Entity) String() string {
return fmt.Sprintf("[id=%d type=%s name=%s picture=%v metadata=%v attributes=%v]",
e.ID, e.Type, e.Name, e.Picture, e.Metadata, e.Attributes)
}
// IsEmpty returns true if the entity has no information.
// It is best to check this AFTER normalizing, which may
// delete data that looks non-empty but is actually empty.
func (e Entity) IsEmpty() bool {
return e.ID == 0 &&
e.Name == "" &&
e.NewPicture == nil &&
e.Picture == nil &&
len(e.Metadata) == 0 &&
len(e.Attributes) == 0
}
func (e Entity) dbName() *string {
if strings.TrimSpace(e.Name) == "" {
return nil
}
return &e.Name
}
// TODO: I'm not sure we're representing the new date attributes properly in the DB
// func (p Entity) birthDateUnix() *int64 {
// if p.BirthDate == nil {
// return nil
// }
// bdUnix := p.BirthDate.Unix()
// return &bdUnix
// }
// func (p Entity) deathDateUnix() *int64 {
// if p.DeathDate == nil {
// return nil
// }
// ddUnix := p.DeathDate.Unix()
// return &ddUnix
// }
// Attribute gets the (first-)named attribute from the entity.
func (e Entity) Attribute(name string) (Attribute, bool) {
for _, attr := range e.Attributes {
if attr.Name == name {
return attr, true
}
}
return Attribute{}, false
}
// AttributeValue gets the value of the (first-)named attribute from the entity.
func (e Entity) AttributeValue(name string) any {
for _, attr := range e.Attributes {
if attr.Name == name {
return attr.Value
}
}
return ""
}
func (tl *Timeline) normalizeEntity(e *Entity) error {
var identityAttr Attribute
for i := 0; i < len(e.Attributes); i++ {
// remove empty attributes
if e.Attributes[i].Empty() {
e.Attributes = append(e.Attributes[:i], e.Attributes[i+1:]...)
i--
continue
}
// ensure at most one identity attribute; multiple would be confusing/ambiguous and
// could lead to confusion or worse (maybe uniqueness constraint violations)
if e.Attributes[i].Identity {
if identityAttr.Name != "" {
return fmt.Errorf("multiple ambiguous identity attributes: at least %s=%s and %s=%s",
identityAttr.Name, identityAttr.Value, e.Attributes[i].Name, e.Attributes[i].Value)
}
identityAttr = e.Attributes[i]
}
// normalize known attributes
e.Attributes[i] = normalizeAttribute(tl.ctx, e.Attributes[i])
}
// clean up name too; use regex to remove repeated spaces, then trim spaces on edges
e.Name = strings.TrimSpace(multiSpaceRegex.ReplaceAllString(e.Name, " "))
// oh, and ensure type is set (DB row requires entity type ID, not name)
if e.Type == "" && e.typeID == 0 {
e.Type = EntityPerson // assume a person, I guess -- data sources don't often distinguish
}
if e.typeID == 0 {
typeID, err := tl.entityTypeNameToID(e.Type)
if err != nil {
return fmt.Errorf("getting entity type ID: %w (entity_type=%s)", err, e.Type)
}
e.typeID = typeID
}
e.Metadata.Clean()
return nil
}
func (e Entity) metadataString() (*string, error) {
if len(e.Metadata) == 0 {
return nil, nil
}
metaBytes, err := json.Marshal(e.Metadata)
if err != nil {
return nil, fmt.Errorf("encoding entity metadata as JSON: %w", err)
}
metaString := string(metaBytes)
return &metaString, nil
}
const (
EntityPerson = "person"
EntityCreature = "creature"
EntityPlace = "place"
)
// Attribute describes an entity.
type Attribute struct {
// The name of the attribute. For attributes that define
// a user's identity on a data source, this name *MUST*
// be unique to the data source if it's possible for a
// different person to have the same attribute+value, or
// else collisions/confusion could arise in the processing
// logic. (For example, "username" is not a good attribute
// name.) Specifically, given a data source, attribute name,
// and a specific value, we should be able to pinpoint
// *which* person or people with certainty even if the value
// alone is the same for different persons. If the value is
// guaranteed to be unique even if the attribute name isn't,
// then it is OK for the attribute name to be shared by data
// sources (e.g. phone number or email address -- these
// concepts span data sources, but they are globally unique
// at any given point in time). See the Identifying field.
//
// Known global attributes will be recognized and/or standardized:
//
// - phone_number: Can be prefixed with region (default: "US"), e.g.: "US:123-456-7890"
// - TODO: email address
// - TODO: physical address
// - TODO: gender
Name string `json:"name"`
// The value of the attribute.
Value any `json:"value"`
// Optional; an alternate value intended for display or as a description.
AltValue string `json:"alt_value,omitempty"`
// If true, this attribute+value combination can be used to
// identify a person.
Identifying bool `json:"identifying,omitempty"`
// If true, this attribute defines a person's identity
// on the data source associated with the current
// processing. There should only be 1 identity attribute
// per data source. When associated with an Item.Owner,
// the item will be credited to this attribute in the DB.
// Implies Identifying=true.
Identity bool `json:"identity"`
// If the attribute represents a place, put its location here.
// At least lat/lon are required (for earth coordinates).
Latitude *float64 `json:"latitude,omitempty"`
Longitude *float64 `json:"longitude,omitempty"`
Altitude *float64 `json:"altitude,omitempty"`
// Optional metadata associated with this attribute.
Metadata Metadata `json:"metadata,omitempty"`
//////////////////////////////////////////////////////////////////////////
// The fields below are NOT intended for use by data sources (importers).
// NOT FOR USE BY DATA SOURCES.
ID uint64 `json:"id,omitempty"`
// NOT FOR USE BY DATA SOURCES. For search results only.
ItemCount int64 `json:"item_count,omitempty"`
// NOT FOR USE BY DATA SOURCES. For search results only.
// This indicates which data source(s) the attribute identifies
// an entity on.
IdentityOn []string `json:"identity_on,omitempty"`
}
// Empty returns true if the name or value is empty or only whitespace.
func (a Attribute) Empty() bool {
if strings.TrimSpace(a.Name) == "" {
return true
}
return strings.TrimSpace(a.valueString()) == "" && a.Latitude == nil && a.Longitude == nil
}
func (a Attribute) valueForDB() any {
switch val := a.Value.(type) {
case *time.Time:
if val == nil {
return nil
}
return val.Unix()
case time.Time:
return val.Unix()
default:
return val
}
}
func (a Attribute) valueString() string {
switch val := a.Value.(type) {
case nil:
return ""
case string:
return val
case *time.Time:
// this case only exists because if the *time.Time is nil,
// we get a panic for calling String() on a nil pointer
// (because time.Time is a fmt.Stringer)
if val == nil {
return ""
}
return val.String()
case fmt.Stringer:
if val == nil {
return ""
}
return val.String()
case int:
return strconv.Itoa(val)
default:
return fmt.Sprintf("%v", val)
}
}
func normalizeAttribute(ctx context.Context, attr Attribute) Attribute {
attr.Name = strings.TrimSpace(attr.Name)
if val, ok := attr.Value.(string); ok {
// if this is a time value that was deserialized from JSON, convert it to a time type
// (values coming directly from data sources should already be time type); we do this
// because we conventionally store timestamps as unix timestamps in the DB, and the
// values need to be time types in order for us to do that
if parsedTime, err := time.Parse(time.RFC3339, val); err == nil {
attr.Value = parsedTime
} else {
attr.Value = strings.TrimSpace(val)
}
}
if attr.Name == AttributePhoneNumber {
// TODO: region could be stored in metadata now
region, num, ok := strings.Cut(attr.Value.(string), ":")
if !ok {
num = region
region = ""
}
stdPhoneNum, err := NormalizePhoneNumber(ctx, num, region)
if err == nil {
attr.Value = stdPhoneNum
}
}
attr.Metadata.Clean()
return attr
}
// NormalizePhoneNumber attempts to parse number and returns
// a standardized version in E164 format. If the number does
// not have an explicit region/country code, the country code
// for the region is used instead. If no country code is
// in the number, the country code from the system's locale
// region is assumed; otherwise the US region is assumed.
//
// We chose E164 because that's what Twilio uses.
func NormalizePhoneNumber(ctx context.Context, number, defaultRegion string) (string, error) {
if defaultRegion == "" {
if _, localeRegion, _, err := getSystemLocale(ctx); err == nil {
defaultRegion = localeRegion
} else {
defaultRegion = "US"
}
}
ph, err := libphonenumber.Parse(number, defaultRegion)
if err != nil {
return "", err
}
return libphonenumber.Format(ph, libphonenumber.E164), nil
}
// processEntityPicture downloads the entity's picture (if relevant), then stores it on
// disk. It does NOT update the database, but it does return the path to the picture file.
func (p *processor) processEntityPicture(ctx context.Context, e Entity) (string, error) {
if e.ID <= 0 {
return "", fmt.Errorf("missing or invalid row ID %d", e.ID)
}
if e.NewPicture == nil {
return "", nil
}
r, err := e.NewPicture(ctx)
if err != nil {
return "", fmt.Errorf("opening profile picture reader: %w", err)
}
if r == nil {
return "", nil // this could happen if the data source finds out there's no picture and no error
}
defer r.Close()
buffered := bufio.NewReader(r)
const bytesNeededToSniff = 512
peekedBytes, err := buffered.Peek(bytesNeededToSniff)
if err != nil {
return "", fmt.Errorf("could not peek profile picture to determine type: %w", err)
}
contentType := http.DetectContentType(peekedBytes)
// use "/" separators here; the fullpath() method will adjust for OS path separator
// (we use the "%09d" formatter so file systems sort more conveniently, but it
// also does not look like a date/time)
pictureFile := path.Join(AssetsFolderName, "profile_pictures", fmt.Sprintf("entity_%09d", e.ID))
disposition, _, _ := mime.ParseMediaType(contentType)
switch disposition {
case imagePNG:
pictureFile += ".png"
case ImageWebP:
pictureFile += ".webp"
case imageGif:
pictureFile += ".gif"
case ImageAVIF:
pictureFile += ".avif"
case ImageJPEG, "":
fallthrough
default:
pictureFile += extJpg
}
fullPath := filepath.Clean(p.tl.FullPath(pictureFile))
// ensure parent dir exists, then open file for writing
if err = os.MkdirAll(filepath.Dir(fullPath), 0700); err != nil {
return "", err
}
w, err := os.OpenFile(fullPath, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0600)
if err != nil {
return "", err
}
defer w.Close()
if _, err := io.Copy(w, buffered); err != nil {
return "", err
}
if err := w.Sync(); err != nil {
return "", err
}
return pictureFile, nil
}
// Exported consts may be used for thumbnails.
const (
ImageWebP, extWebp = "image/webp", ".webp"
ImageJPEG, extJpg, extJpeg = "image/jpeg", ".jpg", ".jpeg"
ImageAVIF, extAvif = "image/avif", ".avif"
imagePNG, extPng = "image/png", ".png"
imageGif, extGif = "image/gif", ".gif"
VideoWebM = "video/webm"
)
// TODO: update this godoc related to latentID; and note that an empty latentID and nil error is a possible and valid return value here.
// processEntity ingests the incoming person into the timeline. The person's attributes are
// normalized, the person is looked up in the DB according to their identifying attributes
// first, then their ID if given, or their name (if enabled as ID); and if one match is found,
// the person's information and attributes are merged with the one in the database if this one
// has anything new to add; otherwise, it creates a new entity in the DB with the information
// provided. If multiple matches are found (a single identity attribute can technically map to
// multiple entities, however, I believe this has to be done manually for now), then the
// information for the person(s) is not updated (as that would make them look alike) but the
// attributes are added and linked at least to this one person. The row ID of the ID attribute
// that represents the input entity is returned. Note that a row ID of 0 can be returned if
// there is no identity attribute!
func (p *processor) processEntity(ctx context.Context, tx *sql.Tx, in Entity) (latentID, error) {
// remove any empty attributes and standardize the entity
if err := p.tl.normalizeEntity(&in); err != nil {
return latentID{}, err
}
// don't save an empty entity (make sure we do this AFTER normalizing); it's totally
// allowed for items to not have any particular owner specified
if in.IsEmpty() {
return latentID{}, nil
}
// try to find matching entities with given information; first try attributes because
// they are more descriptive, then if no matches, use ID directly if provided, or if
// not, then use name if only it is provided and set to act as an ID (not ideal though,
// as names tend to change...) -- first try attributes by clearing name and ID from a
// copy of the incoming entity
inCopy := in
inCopy.ID = 0
entities, err := p.loadEntities(ctx, tx, &inCopy)
if err != nil {
return latentID{}, fmt.Errorf("searching entities by attributes: %w", err)
}
// then only if that didn't yield any results, try ID next
if len(entities) == 0 && in.ID > 0 {
entities, err = p.loadEntities(ctx, tx, &in)
if err != nil {
return latentID{}, fmt.Errorf("searching entities by ID: %w", err)
}
}
// if our searches yielded 0 results, we can insert the entity; if 1, we can update it
switch len(entities) {
case 0:
// don't create a new entity if *only* an entity ID was provided (it must have been
// erroneous?) with no other attributes, as that implies the entity should already
// exist but it does not... creating a new one is wrong
if in.ID > 0 && len(in.Attributes) == 0 {
return latentID{}, fmt.Errorf("input entity had non-zero row ID (%d), but that entity was not found; and without attributes we do not create a new entity", in.ID)
}
// marshal metadata and convert it to a string pointer... awkward, but seems more reasonable than a blob...
metadata, err := in.metadataString()
if err != nil {
return latentID{}, err
}
// if we could not find a person by any of those identities, add new person to DB
err = tx.QueryRowContext(ctx, `INSERT INTO entities
(type_id, job_id, name, metadata) VALUES (?, ?, ?, ?)
RETURNING id`,
in.typeID, p.ij.job.id, in.dbName(), metadata).Scan(&in.ID)
if err != nil {
return latentID{}, fmt.Errorf("adding new person %+v: %w", in, err)
}
// now that we have the row ID, save profile picture
pictureFile, err := p.processEntityPicture(ctx, in)
if err != nil {
p.log.Error("saving new entity's profile picture",
zap.Uint64("entity_id", in.ID),
zap.Error(err))
} else if pictureFile != "" {
_, err = tx.ExecContext(ctx, `UPDATE entities SET picture_file=? WHERE id=?`, pictureFile, in.ID) // TODO: LIMIT 1, if ever implemented
if err != nil {
p.log.Error("updating new entity's profile picture in DB",
zap.Uint64("entity_id", in.ID),
zap.String("picture_file", pictureFile),
zap.Error(err))
}
}
entities = append(entities, in)
atomic.AddInt64(p.ij.newEntityCount, 1)
case 1:
// if the identity attribute(s) matched exactly 1 entity, update its info in the DB
// (if it matched more than 1 entity, we don't update multiple of them, because we only
// get 1 entity at a time from data sources and we'd end up making the two look alike...
// I can't think of a single data source that explicitly maps one of their accounts to
// multiple entities and gives us that information... so we lose that granularity or
// specificity from data sources, and we have to rely on the user to maintain that
// information because only they'd know)
entity := entities[0]
// update entity with the latest info from the incoming entity
var setClause string
var args []any
// TODO: I used to have "|| (len(in.Name) > len(entity.Name))" as a condition, hoping that it would add missing information (such as a last name to a first name) but when testing with Josh it replaced his name with "Synology NAS Diskstation..." gibberish... anyway...
if entity.Name == "" && in.Name != "" {
if setClause != "" {
setClause += ", "
}
setClause += "name=?"
args = append(args, in.Name)
}
if entity.Picture == nil && in.NewPicture != nil {
// we don't update existing profile pictures at this time... but we can set the picture if one doesn't already exist
in.ID = entity.ID
pictureFile, err := p.processEntityPicture(ctx, in)
if err != nil {
p.log.Error("saving existing entity's new profile picture",
zap.Uint64("entity_id", entity.ID),
zap.Error(err))
} else if pictureFile != "" {
if setClause != "" {
setClause += ", "
}
setClause += "picture_file=?"
args = append(args, pictureFile)
// notify the UI that a new profile picture can be displayed
if entity.ID == ownerEntityID {
p.log.Info("new owner picture", zap.String("picture_file", pictureFile))
}
}
}
setClause = strings.TrimSuffix(setClause, ", ")
if len(args) > 0 {
args = append(args, entity.ID)
_, err := tx.ExecContext(ctx, `UPDATE entities SET `+setClause+` WHERE id=?`, args...) //nolint:gosec
if err != nil {
return latentID{}, fmt.Errorf("updating person %d (%+v): %w", entity.ID, in, err)
}
}
default:
// There's multiple entities with this attribute -- could either be
// because there was not enough information before to combine them
// together automatically, or they are genuinely separate entities
// that share an attribute.
//
// Example of the former: two messaging datasets are imported, one
// uses an old phone number for someone and the other uses the
// person's newer phone number, but only the old one has the person's
// name. Two entities are created since we have no way of knowing it
// is the same person. Then later, a contact list is imported which
// has both phone numbers in a single entity. In that case, the
// incoming entity will match both entities in the database because
// it looks up with both phone number attributes. Their names are
// compatible since one is non-empty, and the other is empty; it
// should be no harm to merge the two.
//
// Example of the latter: a husband and wife share an email address,
// like JoeAndJane@example.com. Obviously, these entities should
// not be combined, as one is named Jane and the other is named
// Joe! But if one of the entities is essentially empty (unnamed),
// there's no harm in combining since that existing empty entity
// could be either of them.
//
// Anyway, it's generally not safe to always merge duplicate entities,
// but it should be okay as long as their names are "compatible."
// (If it WAS always safe to do so, entities could never share
// identifying attributes, and we would have a different, less
// powerful schema design.
entityName, compatible := entityNamesCompatible(entities)
if compatible {
// not really sure which one to keep; but it doesn't really matter,
// any information missing on the one to keep that exists on the
// others (name, profile picture, etc.) will cascade over to it
entityToKeep := entities[0]
mergeEntities := entities[1:]
if err := p.tl.mergeEntities(ctx, tx, entityToKeep, mergeEntities); err != nil {
return latentID{}, fmt.Errorf("failed merging entities: %w", err)
}
if checkedLog := p.log.Check(zapcore.InfoLevel, "merged entities based on new information"); checkedLog != nil {
entityIDs := make([]uint64, 0, len(mergeEntities))
for _, ent := range mergeEntities {
entityIDs = append(entityIDs, ent.ID)
}
checkedLog.Write(
zap.String("unified_entity_name", entityName),
zap.Uint64("kept_entity_id", entityToKeep.ID),
zap.Uint64s("merged_entity_ids", entityIDs),
)
}
}
}
// now store each attribute+value/coords combo if we don't have it yet,
// and link it to the entity
var identityAttributeID uint64
for _, attr := range in.Attributes {
attrID, err := storeAttribute(ctx, tx, attr)
if err != nil {
return latentID{}, fmt.Errorf("%w (entity_name=%s)", err, in.Name)
}
// if this is the identity attribute, then its row ID is what we need to return so item can be associated with it;
// and the data source (DS) ID will be added to its entity_attributes row to indicate their identity on this DS
var linkedDataSourceID *uint64
if attr.Identity {
identityAttributeID = attrID
linkedDataSourceID = &p.dsRowID
}
for _, entity := range entities {
var eaID uint64
var existingDataSourceID *uint64
q := `SELECT id, data_source_id FROM entity_attributes WHERE entity_id=? AND attribute_id=?`
args := []any{entity.ID, attrID}
// If this is an identity attribute, we need to select the row that portrays this identity
// if there is one, or select the existing row that doesn't yet portray / know about this
// identity... this is because an entity can have the same attribute be their identity on
// multiple data sources (like phone number, or email address, are often used by multiple
// data sources), and we want all of them to be represented. This can lead to duplicate
// results in search queries, though, unless they use proper GROUP BY clauses to deduplicate
// on the row ID of what is being searched. For example, search items should GROUP BY item
// row ID and relationship row ID to avoid duplicate "sent to" relationships and duplicate
// items in the search results.
if attr.Identity {
q += " AND (data_source_id=? OR data_source_id IS NULL)"
args = append(args, linkedDataSourceID)
}
q += " LIMIT 1"
err = tx.QueryRowContext(ctx, q, args...).Scan(&eaID, &existingDataSourceID)
noRows := errors.Is(err, sql.ErrNoRows)
if err != nil && !noRows {
return latentID{}, fmt.Errorf("checking for existing link of entity to attribute: %w (entity_id=%d attribute_id=%d)", err, entity.ID, attrID)
}
if noRows {
// the entity and attribute are not yet related in the DB; insert
_, err = tx.ExecContext(ctx,
`INSERT INTO entity_attributes
(entity_id, attribute_id, data_source_id, job_id)
VALUES (?, ?, ?, ?)`,
entity.ID, attrID, linkedDataSourceID, p.ij.job.id)
if err != nil {
return latentID{}, fmt.Errorf("linking entity %d to attribute %d: %w (data_source_id=%#v job_id=%d)",
entity.ID, attrID, err, linkedDataSourceID, p.ij.job.id)
}
} else if eaID > 0 && existingDataSourceID == nil {
// the entity and attribute are already related in the DB but not as an ID on any data source; update
_, err = tx.ExecContext(ctx,
`UPDATE entity_attributes SET data_source_id=?, job_id=? WHERE id=?`, // TODO: LIMIT 1 would be nice...
linkedDataSourceID, p.ij.job.id, eaID)
if err != nil {
return latentID{}, fmt.Errorf("updating entity %d link to attribute %d: %w", entity.ID, attrID, err)
}
}
}
}
return latentID{
entityID: entities[0].ID,
attributeID: identityAttributeID,
}, nil
}
func (p *processor) loadEntities(ctx context.Context, tx *sql.Tx, in *Entity) ([]Entity, error) {
var sb strings.Builder
var args []any
// we use DISTINCT here to avoid duplicate rows in the output,
// since multiple attributes could match the same entity, for
// example -- note that UNION which we perform when there are
// multiple entities also de-duplicates, and we don't need
// DISTINCT in that case; but to avoid complexity with building
// the query, I tested query plans that use both DISTINCT and
// UNION, and found that they were identical (the DB isn't
// doing extra work when both DISTINCT and UNION are used).
const selectClause = `
SELECT DISTINCT
entities.id,
entities.name,
entities.picture_file,
entities.metadata
FROM entities
`
sb.WriteString(selectClause)
if in.ID > 0 {
// in some cases, the entity ID may be specified manually; load that entity directly by ID instead of trying to find it
sb.WriteString("WHERE entities.id=? AND deleted IS NULL")
args = append(args, in.ID)
} else {
if in.Type == EntityPlace {
// place entities can be retrieved by name
sb.WriteString("JOIN entity_types ON entity_types.id = entities.type_id\n")
sb.WriteString("WHERE entity_types.name=? AND entities.name=?")
args = append(args, EntityPlace, in.Name)
}
for _, attr := range in.Attributes {
// skip non-identifying attributes as those are not likely to be unique enough
if !attr.Identifying && !attr.Identity {
continue
}
if len(args) > 0 {
sb.WriteString("\n\nUNION\n\n-- match by attribute ")
sb.WriteString(attr.Name)
sb.WriteString(selectClause)
}
sb.WriteString(`JOIN entity_attributes AS ea ON ea.entity_id = entities.id
JOIN attributes ON attributes.id = ea.attribute_id
WHERE deleted IS NULL AND attributes.name=? AND (`)
args = append(args, attr.Name)
const decPrecisionAt, decPrecisionNear = 4, 3
lowLonAt, highLonAt, lonDecAt := latLonBounds(attr.Longitude, decPrecisionAt)
lowLatAt, highLatAt, latDecAt := latLonBounds(attr.Latitude, decPrecisionAt)
lowLonNear, highLonNear, lonDecNear := latLonBounds(attr.Longitude, decPrecisionNear)
lowLatNear, highLatNear, latDecNear := latLonBounds(attr.Latitude, decPrecisionNear)
if attr.Value != nil {
sb.WriteString("attributes.value=?")
args = append(args, attr.valueForDB())
}
if attr.Latitude != nil && attr.Longitude != nil {
if attr.Value != nil {
sb.WriteString(" OR ")
}
sb.WriteString(`
(
-- entity name is different, but coordinate is really close to same spot as an existing attribute
entities.name!=?
AND round(attributes.longitude, ?) BETWEEN ? AND ?
AND round(attributes.latitude, ?) BETWEEN ? AND ?
)
OR (
-- entity name is the same, and coordinate is somewhat close to same spot as an existing attribute
entities.name=?
AND round(attributes.longitude, ?) BETWEEN ? AND ?
AND round(attributes.latitude, ?) BETWEEN ? AND ?
)`)
args = append(args,
in.Name,
lonDecAt, lowLonAt, highLonAt,
latDecAt, lowLatAt, highLatAt,
in.Name,
lonDecNear, lowLonNear, highLonNear,
latDecNear, lowLatNear, highLatNear)
}
sb.WriteString(")")
}
}
var entities []Entity
// only perform query if there are any qualifiers; otherwise this query returns
// all rows which is obviously wrong; if there's no ID or identifying attributes,
// there's no way for us to know who this entity is anyway
if len(args) > 0 {
q := sb.String()
// run the query to get the entity/entities (note there may be multiple; e.g. if multiple people
// share an account or the account has attributes spanning nultiple positively-ID'ed people)
rows, err := tx.QueryContext(ctx, q, args...)
if err != nil {
return nil, fmt.Errorf("querying entity attributes: %w", err)
}
defer rows.Close()
for rows.Next() {
var ent Entity
var name *string
var metadata *string
err := rows.Scan(&ent.ID, &name, &ent.Picture, &metadata)
if err != nil {
return nil, fmt.Errorf("scanning entity ID: %w", err)
}
if name != nil {
ent.Name = *name
}
if metadata != nil {
err := json.Unmarshal([]byte(*metadata), &ent.Metadata)
if err != nil {
return nil, fmt.Errorf("unmarshaling entity metadata: %w", err)
}
}
entities = append(entities, ent)
}
if err := rows.Err(); err != nil {
return nil, fmt.Errorf("iterating entity rows: %w", err)
}
}
return entities, nil
}
func storeAttribute(ctx context.Context, tx *sql.Tx, attr Attribute) (uint64, error) {
// prepare some values for DB insertion
var metadata *string
if len(attr.Metadata) > 0 {
metaBytes, err := json.Marshal(attr.Metadata)
if err != nil {
return 0, fmt.Errorf("encoding attribute metadata as JSON: %w", err)
}
metaString := string(metaBytes)
metadata = &metaString
}
var altValue *string
if strings.TrimSpace(attr.AltValue) != "" {
altValue = &attr.AltValue
}
const decPrecision = 3
lowLon, highLon, lonDec := latLonBounds(attr.Longitude, decPrecision)
lowLat, highLat, latDec := latLonBounds(attr.Latitude, decPrecision)
var attrID uint64
err := tx.QueryRowContext(ctx, `
SELECT id
FROM attributes
WHERE name=? AND (
value=?
OR (
round(longitude, ?) BETWEEN ? AND ?
AND round(latitude, ?) BETWEEN ? AND ?
)
)
LIMIT 1`, attr.Name, attr.valueForDB(),
lonDec, lowLon, highLon,
latDec, lowLat, highLat).Scan(&attrID)
if err != nil && !errors.Is(err, sql.ErrNoRows) {
return 0, fmt.Errorf("looking up attribute '%+v': %w", attr, err)
}
if attrID == 0 {
// store the attribute+value/coords since we don't have this combo yet
err := tx.QueryRowContext(ctx, `INSERT INTO attributes
(name, value, alt_value, longitude, latitude, altitude, metadata)
VALUES (?, ?, ?, ?, ?, ?, ?)
RETURNING id`,
attr.Name, attr.valueForDB(), altValue,
attr.Longitude, attr.Latitude, attr.Altitude, metadata).Scan(&attrID)
if err != nil {
return 0, fmt.Errorf("storing attribute '%+v': %w", attr, err)
}
}
return attrID, nil
}
// MergeEntities combines the entities to merge into the entity to keep.
func (tl *Timeline) MergeEntities(ctx context.Context, entityIDToKeep uint64, entityIDsToMerge []uint64) error {
// input verification / sanity checks, as well as loading entity information
if entityIDToKeep <= 0 {
return errors.New("entity to keep must have an ID greater than 0")
}
entitiesToMerge := make([]Entity, 0, len(entityIDsToMerge))
seen := make(map[uint64]struct{}) // deduplication
for i, id := range entityIDsToMerge {
if id <= 0 {
return fmt.Errorf("entities to merge must have IDs greater than 0 (%d)", id)
}
if entityIDToKeep == id {
return fmt.Errorf("cannot merge entity into itself (%d)", id)
}
if _, ok := seen[id]; ok {
return fmt.Errorf("entity to merge specified more than once (%d)", id)
}
seen[id] = struct{}{}
if id == ownerEntityID {
// TODO: always keep entity 1 for now, since that's the repo owner... until we figure out a better solution
entityIDToKeep, entityIDsToMerge[i], id = id, entityIDToKeep, entityIDToKeep
}
// load all the entities before we get a lock on the DB to do the changes
entMerge, err := tl.LoadEntity(id)
if err != nil {
return fmt.Errorf("loading entity to merge: %w", err)
}
entitiesToMerge = append(entitiesToMerge, entMerge)
}
entKeep, err := tl.LoadEntity(entityIDToKeep)
if err != nil {
return fmt.Errorf("loading entity to keep: %w", err)
}
tx, err := tl.db.WritePool.BeginTx(ctx, nil)
if err != nil {
return err
}
defer tx.Rollback()
if err = tl.mergeEntities(ctx, tx, entKeep, entitiesToMerge); err != nil {
return fmt.Errorf("merging entities: %w", err)
}
return tx.Commit()
}
func (tl *Timeline) mergeEntities(ctx context.Context, tx *sql.Tx, entKeep Entity, entitiesToMerge []Entity) error {
for _, entMerge := range entitiesToMerge {
// bring over any information on the entity to merge that's missing on the entity to keep
if entMerge.Name != "" && entKeep.Name == "" {
entKeep.Name = entMerge.Name
}
// combine metadata (add only missing fields)
entKeep.Metadata.Merge(entMerge.Metadata, MetaMergeSkip)
metadata, err := entKeep.metadataString()
if err != nil {
return err
}
// if they both have a profile picture, we'll need to delete the one from the entity being merged
var oldPictureFile *string
if entMerge.Picture != nil {
if entKeep.Picture == nil {
entKeep.Picture = entMerge.Picture
} else {
oldPictureFile = entMerge.Picture
}
}
// update the entity to keep with any info that was transferred over from the entity to merge
_, err = tx.ExecContext(ctx, `UPDATE entities SET name=?, picture_file=?, metadata=? WHERE id=?`,
entKeep.Name, entKeep.Picture, metadata, entKeep.ID)
if err != nil {
return fmt.Errorf("updating entity row: %w", err)
}
// replace entity IDs in the entity_attributes table, but if a row update will cause a
// duplicate on entity_id, attribute_id, and data_source_id fields, delete the row instead
if _, err := tx.ExecContext(ctx, `
DELETE FROM entity_attributes AS ea
WHERE entity_id=?
AND EXISTS (
SELECT 1
FROM entity_attributes AS ea_old
WHERE ea_old.entity_id=?
AND ea_old.attribute_id = ea.attribute_id
AND ea_old.data_source_id IS ea.data_source_id
)
`, entKeep.ID, entMerge.ID); err != nil {
return fmt.Errorf("deleting rows that would violate unique constraints after imminent update in entity_attributes: %w", err)
}
if _, err := tx.ExecContext(ctx, `UPDATE entity_attributes SET entity_id=? WHERE entity_id=?`, entKeep.ID, entMerge.ID); err != nil {
return fmt.Errorf("replacing entity ID in entity_attributes: %w", err)
}
// replace entity IDs elsewhere in the database (TODO: we might need to employ similar deletion logic above since we could cause duplicate rows, but these tables are manually populated so it's less likely to make duplicates here)
if _, err := tx.ExecContext(ctx, `UPDATE tagged SET entity_id=? WHERE entity_id=?`, entKeep.ID, entMerge.ID); err != nil {
return fmt.Errorf("replacing entity ID in tagged: %w", err)
}
if _, err := tx.ExecContext(ctx, `UPDATE story_elements SET entity_id=? WHERE entity_id=?`, entKeep.ID, entMerge.ID); err != nil {
return fmt.Errorf("replacing entity ID in story_elements: %w", err)
}
if _, err := tx.ExecContext(ctx, `UPDATE notes SET entity_id=? WHERE entity_id=?`, entKeep.ID, entMerge.ID); err != nil {
return fmt.Errorf("replacing entity ID in notes: %w", err)
}
// handle pass-through attribute for the entity being merged (start by seeing if there's one for the entity to keep)
var passThruAttrIDKeep int64
if err = tx.QueryRowContext(ctx, `SELECT id FROM attributes WHERE name=? AND value=? LIMIT 1`, passThruAttribute, entKeep.ID).Scan(&passThruAttrIDKeep); err != nil && !errors.Is(err, sql.ErrNoRows) {
return fmt.Errorf("selecting pass-thru attribute for entity to keep: %w", err)
}
if passThruAttrIDKeep == 0 {
// a pass-thru attribute doesn't exist for the entity to keep, so we can safely
// just update the pass-thru attribute (if any) for the one to merge to point to
// the one to keep
if _, err := tx.ExecContext(ctx, `UPDATE attributes SET value=? WHERE name=? AND value=?`, entKeep.ID, passThruAttribute, entMerge.ID); err != nil {
return fmt.Errorf("updating pass-thru attribute to point to entity to keep: %w", err)
}
} else {
// this is a little more work: we can't just update an attribute because it would
// violate uniqueness constraints; so update everything that points to the old
// attribute to point to the new one instead
var passThruAttrIDMerge int64
if err = tx.QueryRowContext(ctx, `SELECT id FROM attributes WHERE name=? AND value=? LIMIT 1`, passThruAttribute, entMerge.ID).Scan(&passThruAttrIDMerge); err != nil && !errors.Is(err, sql.ErrNoRows) {
return fmt.Errorf("selecting pass-thru attribute for entity to merge: %w", err)
}
if passThruAttrIDMerge > 0 {
if _, err := tx.ExecContext(ctx, `UPDATE items SET attribute_id=? WHERE attribute_id=?`, passThruAttrIDKeep, passThruAttrIDMerge); err != nil {
return fmt.Errorf("updating attribute ID in items table: %w", err)
}
if _, err := tx.ExecContext(ctx, `UPDATE relationships SET from_attribute_id=? WHERE from_attribute_id=?`, passThruAttrIDKeep, passThruAttrIDMerge); err != nil {
return fmt.Errorf("updating 'from' attribute ID in relationships table: %w", err)
}
if _, err := tx.ExecContext(ctx, `UPDATE relationships SET to_attribute_id=? WHERE to_attribute_id=?`, passThruAttrIDKeep, passThruAttrIDMerge); err != nil {
return fmt.Errorf("updating 'to' attribute ID in relationships table: %w", err)
}
if _, err := tx.ExecContext(ctx, `DELETE FROM entity_attributes WHERE attribute_id=?`, passThruAttrIDMerge); err != nil {
return fmt.Errorf("deleting row in entity_attributes table: %w (attribute_id=%d)", err, passThruAttrIDMerge)
}
if _, err := tx.ExecContext(ctx, `DELETE FROM attributes WHERE id=?`, passThruAttrIDMerge); err != nil {
return fmt.Errorf("deleting pass-thru attribute for entity to merge: %w (attribute_id=%d)", err, passThruAttrIDMerge)
}
}
}
// clean up the entity to merge
if _, err := tx.ExecContext(ctx, `DELETE FROM entities WHERE id=?`, entMerge.ID); err != nil {
return fmt.Errorf("deleting from entities table: %w", err)
}
if oldPictureFile != nil {
if err := tl.deleteRepoFile(*oldPictureFile); err != nil {
return err
}
}
}
return nil
}
// entityNamesCompatible returns the singular name of these entities and true
// if they are compatible, meaning that all the entities with a non-empty Name
// have the same name.
func entityNamesCompatible(entities []Entity) (string, bool) {
var entityName string
for _, e := range entities {
if e.Name == "" {
continue
}
if entityName == "" {
entityName = e.Name
} else if !strings.EqualFold(e.Name, entityName) {
return "", false
}
}
return entityName, true
}
// latentID is a type that holds either the ID of an item
// or an identity attribute for an entity. If an entity
// was processed without an explicit identity attribute,
// this struct will hold the row ID of the entity instead,
// and if the ID of the identity attribute is later needed,
// call identifyingAttributeID() to create a pass-thru
// entity and get a usable attribute ID linked to the
// entity. For items, the itemID field can be accessed directly.
type latentID struct {
itemID, entityID, attributeID uint64
}
// id returns either the item ID or the attribute ID, whichever is set.
func (l latentID) id() uint64 {
if l.itemID > 0 {
return l.itemID
}
if l.attributeID > 0 {
return l.attributeID
}
return 0
}
// identifyingAttributeID returns either the known row ID of the
// attribute identifying the entity, or it will create a pass-thru
// attribute for the entity and return the new row ID.
func (l *latentID) identifyingAttributeID(ctx context.Context, tx *sql.Tx) (uint64, error) {
if l.attributeID > 0 {
return l.attributeID, nil
}
if l.entityID == 0 {
return 0, errors.New("no attribute ID or entity ID set")
}
attrID, err := storeLinkBetweenEntityAndNonIDAttribute(ctx, tx, l.entityID, Attribute{
Name: passThruAttribute,
Value: strconv.FormatUint(l.entityID, 10),
Identity: true,
})
if err != nil {
return 0, err
}
l.attributeID = attrID
return l.attributeID, nil
}
// getSystemLocale returns the language, region, and encoding, if available.
func getSystemLocale(ctx context.Context) (language, region, encoding string, err error) {
// Example of LANG: en_US.UTF-8
if lang := os.Getenv("LANG"); lang != "" {
underscorePos := strings.Index(lang, "_")
if underscorePos < 0 {
err = fmt.Errorf("no underscore divider between language and region in LANG=%s", lang)
return
}
dotPos := strings.Index(lang, ".")
if dotPos < 0 {
err = fmt.Errorf("no dot divider between language and region in LANG=%s", lang)
}
language = lang[:underscorePos]
region = lang[underscorePos+1 : dotPos]
encoding = lang[dotPos+1:]
return
}
if runtime.GOOS == "windows" {
cmd := exec.CommandContext(ctx, "powershell", "Get-Culture | select -exp Name")
var output []byte
output, err = cmd.Output() // e.g. "en-US"
if err != nil {
return
}
parts := strings.Split(strings.TrimSpace(string(output)), "-")
if len(parts) == 2 { //nolint:mnd
language = parts[0]
region = parts[1]
}
return
}
err = errors.New("unable to determine locale because LANG var is unset and/or platform unrecognized")
return
}
var multiSpaceRegex = regexp.MustCompile(`\s{2,}`)
// passThruAttribute is the name of the special attribute that is used to identify
// a specific entity by its row ID and no other features (attributes) in particular.
const passThruAttribute = "_entity"
// Canonical attribute names.
const (
AttributeEmail = "email_address"
AttributePhoneNumber = "phone_number"
AttributeGender = "gender"
)