timelinize/timeline/entities.go

/*
	Timelinize
	Copyright (c) 2013 Matthew Holt

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published
	by the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

package timeline

import (
	"bufio"
	"context"
	"database/sql"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"mime"
	"net/http"
	"os"
	"os/exec"
	"path"
	"path/filepath"
	"regexp"
	"runtime"
	"strconv"
	"strings"
	"sync/atomic"
	"time"

	"github.com/ttacon/libphonenumber"
	"go.uber.org/zap"
	"go.uber.org/zap/zapcore"
)

// Entity represents a person. All these fields go into the persons table,
// except for UserID which goes into person_identities and which maps that
// user ID to the person being described by name and birth information.
// Birth information is almost always added manually by users, since
// data sources don't usually provide this, and is used to distinguish
// different persons of the same name, and is totally optional.
// TODO: update godoc
type Entity struct {
	// database row IDs
	ID     uint64  `json:"id,omitempty"`
	id     *uint64 // temporary holding place for DB value
	typeID uint64

	Type string  `json:"type,omitempty"`
	name *string // temporary holding place for DB value
	Name string  `json:"name,omitempty"`

	// To set a profile picture for this entity, set this field.
	NewPicture DataFunc `json:"-"`

	// Very optional extra information about this entity.
	Metadata Metadata `json:"metadata,omitempty"`

	Attributes []Attribute `json:"attributes,omitempty"`

	// THE FOLLOWING FIELDS ARE NOT TO BE SET BY DATA SOURCES.

	// Contains the path to the previously-set profile picture file
	// relative to the repo root.
	Picture *string `json:"picture,omitempty"`

	// Fields below are only for use with search or JSON serialization
	JobID    *int64     `json:"job_id,omitempty"`
	Stored   time.Time  `json:"stored,omitzero"`
	Modified *time.Time `json:"modified,omitempty"`
}

func (e Entity) String() string {
	return fmt.Sprintf("[id=%d type=%s name=%s picture=%v metadata=%v attributes=%v]",
		e.ID, e.Type, e.Name, e.Picture, e.Metadata, e.Attributes)
}

// IsEmpty returns true if the entity has no information.
// It is best to check this AFTER normalizing, which may
// delete data that looks non-empty but is actually empty.
func (e Entity) IsEmpty() bool {
	return e.ID == 0 &&
		e.Name == "" &&
		e.NewPicture == nil &&
		e.Picture == nil &&
		len(e.Metadata) == 0 &&
		len(e.Attributes) == 0
}

func (e Entity) dbName() *string {
	if strings.TrimSpace(e.Name) == "" {
		return nil
	}
	return &e.Name
}

// TODO: I'm not sure we're representing the new date attributes properly in the DB
// func (p Entity) birthDateUnix() *int64 {
// 	if p.BirthDate == nil {
// 		return nil
// 	}
// 	bdUnix := p.BirthDate.Unix()
// 	return &bdUnix
// }

// func (p Entity) deathDateUnix() *int64 {
// 	if p.DeathDate == nil {
// 		return nil
// 	}
// 	ddUnix := p.DeathDate.Unix()
// 	return &ddUnix
// }

// Attribute gets the (first-)named attribute from the entity.
func (e Entity) Attribute(name string) (Attribute, bool) {
	for _, attr := range e.Attributes {
		if attr.Name == name {
			return attr, true
		}
	}
	return Attribute{}, false
}

// AttributeValue gets the value of the (first-)named attribute from the entity.
func (e Entity) AttributeValue(name string) any {
	for _, attr := range e.Attributes {
		if attr.Name == name {
			return attr.Value
		}
	}
	return ""
}

func (tl *Timeline) normalizeEntity(e *Entity) error {
	var identityAttr Attribute
	for i := 0; i < len(e.Attributes); i++ {
		// remove empty attributes
		if e.Attributes[i].Empty() {
			e.Attributes = append(e.Attributes[:i], e.Attributes[i+1:]...)
			i--
			continue
		}
		// ensure at most one identity attribute; multiple would be confusing/ambiguous and
		// could lead to confusion or worse (maybe uniqueness constraint violations)
		if e.Attributes[i].Identity {
			if identityAttr.Name != "" {
				return fmt.Errorf("multiple ambiguous identity attributes: at least %s=%s and %s=%s",
					identityAttr.Name, identityAttr.Value, e.Attributes[i].Name, e.Attributes[i].Value)
			}
			identityAttr = e.Attributes[i]
		}
		// normalize known attributes
		e.Attributes[i] = normalizeAttribute(tl.ctx, e.Attributes[i])
	}

	// clean up name too; use regex to remove repeated spaces, then trim spaces on edges
	e.Name = strings.TrimSpace(multiSpaceRegex.ReplaceAllString(e.Name, " "))

	// oh, and ensure type is set (DB row requires entity type ID, not name)
	if e.Type == "" && e.typeID == 0 {
		e.Type = EntityPerson // assume a person, I guess -- data sources don't often distinguish
	}
	if e.typeID == 0 {
		typeID, err := tl.entityTypeNameToID(e.Type)
		if err != nil {
			return fmt.Errorf("getting entity type ID: %w (entity_type=%s)", err, e.Type)
		}
		e.typeID = typeID
	}

	e.Metadata.Clean()

	return nil
}

func (e Entity) metadataString() (*string, error) {
	if len(e.Metadata) == 0 {
		return nil, nil
	}
	metaBytes, err := json.Marshal(e.Metadata)
	if err != nil {
		return nil, fmt.Errorf("encoding entity metadata as JSON: %w", err)
	}
	metaString := string(metaBytes)
	return &metaString, nil
}

const (
	EntityPerson   = "person"
	EntityCreature = "creature"
	EntityPlace    = "place"
)

// Attribute describes an entity.
type Attribute struct {
	// The name of the attribute. For attributes that define
	// a user's identity on a data source, this name *MUST*
	// be unique to the data source if it's possible for a
	// different person to have the same attribute+value, or
	// else collisions/confusion could arise in the processing
	// logic. (For example, "username" is not a good attribute
	// name.) Specifically, given a data source, attribute name,
	// and a specific value, we should be able to pinpoint
	// *which* person or people with certainty even if the value
	// alone is the same for different persons. If the value is
	// guaranteed to be unique even if the attribute name isn't,
	// then it is OK for the attribute name to be shared by data
	// sources (e.g. phone number or email address -- these
	// concepts span data sources, but they are globally unique
	// at any given point in time). See the Identifying field.
	//
	// Known global attributes will be recognized and/or standardized:
	//
	// - phone_number: Can be prefixed with region (default: "US"), e.g.: "US:123-456-7890"
	// - TODO: email address
	// - TODO: physical address
	// - TODO: gender
	Name string `json:"name"`

	// The value of the attribute.
	Value any `json:"value"`

	// Optional; an alternate value intended for display or as a description.
	AltValue string `json:"alt_value,omitempty"`

	// If true, this attribute+value combination can be used to
	// identify a person.
	Identifying bool `json:"identifying,omitempty"`

	// If true, this attribute defines a person's identity
	// on the data source associated with the current
	// processing. There should only be 1 identity attribute
	// per data source. When associated with an Item.Owner,
	// the item will be credited to this attribute in the DB.
	// Implies Identifying=true.
	Identity bool `json:"identity"`

	// If the attribute represents a place, put its location here.
	// At least lat/lon are required (for earth coordinates).
	Latitude  *float64 `json:"latitude,omitempty"`
	Longitude *float64 `json:"longitude,omitempty"`
	Altitude  *float64 `json:"altitude,omitempty"`

	// Optional metadata associated with this attribute.
	Metadata Metadata `json:"metadata,omitempty"`

	//////////////////////////////////////////////////////////////////////////
	// The fields below are NOT intended for use by data sources (importers).

	// NOT FOR USE BY DATA SOURCES.
	ID uint64 `json:"id,omitempty"`

	// NOT FOR USE BY DATA SOURCES. For search results only.
	ItemCount int64 `json:"item_count,omitempty"`

	// NOT FOR USE BY DATA SOURCES. For search results only.
	// This indicates which data source(s) the attribute identifies
	// an entity on.
	IdentityOn []string `json:"identity_on,omitempty"`
}

// Empty returns true if the name or value is empty or only whitespace.
func (a Attribute) Empty() bool {
	if strings.TrimSpace(a.Name) == "" {
		return true
	}
	return strings.TrimSpace(a.valueString()) == "" && a.Latitude == nil && a.Longitude == nil
}

func (a Attribute) valueForDB() any {
	switch val := a.Value.(type) {
	case *time.Time:
		if val == nil {
			return nil
		}
		return val.Unix()
	case time.Time:
		return val.Unix()
	default:
		return val
	}
}

func (a Attribute) valueString() string {
	switch val := a.Value.(type) {
	case nil:
		return ""
	case string:
		return val
	case *time.Time:
		// this case only exists because if the *time.Time is nil,
		// we get a panic for calling String() on a nil pointer
		// (because time.Time is a fmt.Stringer)
		if val == nil {
			return ""
		}
		return val.String()
	case fmt.Stringer:
		if val == nil {
			return ""
		}
		return val.String()
	case int:
		return strconv.Itoa(val)
	default:
		return fmt.Sprintf("%v", val)
	}
}

func normalizeAttribute(ctx context.Context, attr Attribute) Attribute {
	attr.Name = strings.TrimSpace(attr.Name)

	if val, ok := attr.Value.(string); ok {
		// if this is a time value that was deserialized from JSON, convert it to a time type
		// (values coming directly from data sources should already be time type); we do this
		// because we conventionally store timestamps as unix timestamps in the DB, and the
		// values need to be time types in order for us to do that
		if parsedTime, err := time.Parse(time.RFC3339, val); err == nil {
			attr.Value = parsedTime
		} else {
			attr.Value = strings.TrimSpace(val)
		}
	}

	if attr.Name == AttributePhoneNumber {
		// TODO: region could be stored in metadata now
		region, num, ok := strings.Cut(attr.Value.(string), ":")
		if !ok {
			num = region
			region = ""
		}
		stdPhoneNum, err := NormalizePhoneNumber(ctx, num, region)
		if err == nil {
			attr.Value = stdPhoneNum
		}
	}

	attr.Metadata.Clean()

	return attr
}

// NormalizePhoneNumber attempts to parse number and returns
// a standardized version in E164 format. If the number does
// not have an explicit region/country code, the country code
// for the region is used instead. If no country code is
// in the number, the country code from the system's locale
// region is assumed; otherwise the US region is assumed.
//
// We chose E164 because that's what Twilio uses.
func NormalizePhoneNumber(ctx context.Context, number, defaultRegion string) (string, error) {
	if defaultRegion == "" {
		if _, localeRegion, _, err := getSystemLocale(ctx); err == nil {
			defaultRegion = localeRegion
		} else {
			defaultRegion = "US"
		}
	}
	ph, err := libphonenumber.Parse(number, defaultRegion)
	if err != nil {
		return "", err
	}
	return libphonenumber.Format(ph, libphonenumber.E164), nil
}

// processEntityPicture downloads the entity's picture (if relevant), then stores it on
// disk. It does NOT update the database, but it does return the path to the picture file.
func (p *processor) processEntityPicture(ctx context.Context, e Entity) (string, error) {
	if e.ID <= 0 {
		return "", fmt.Errorf("missing or invalid row ID %d", e.ID)
	}
	if e.NewPicture == nil {
		return "", nil
	}

	r, err := e.NewPicture(ctx)
	if err != nil {
		return "", fmt.Errorf("opening profile picture reader: %w", err)
	}
	if r == nil {
		return "", nil // this could happen if the data source finds out there's no picture and no error
	}
	defer r.Close()

	buffered := bufio.NewReader(r)

	const bytesNeededToSniff = 512
	peekedBytes, err := buffered.Peek(bytesNeededToSniff)
	if err != nil {
		return "", fmt.Errorf("could not peek profile picture to determine type: %w", err)
	}

	contentType := http.DetectContentType(peekedBytes)

	// use "/" separators here; the fullpath() method will adjust for OS path separator
	// (we use the "%09d" formatter so file systems sort more conveniently, but it
	// also does not look like a date/time)
	pictureFile := path.Join(AssetsFolderName, "profile_pictures", fmt.Sprintf("entity_%09d", e.ID))
	disposition, _, _ := mime.ParseMediaType(contentType)
	switch disposition {
	case imagePNG:
		pictureFile += ".png"
	case ImageWebP:
		pictureFile += ".webp"
	case imageGif:
		pictureFile += ".gif"
	case ImageAVIF:
		pictureFile += ".avif"
	case ImageJPEG, "":
		fallthrough
	default:
		pictureFile += extJpg
	}
	fullPath := filepath.Clean(p.tl.FullPath(pictureFile))

	// ensure parent dir exists, then open file for writing
	if err = os.MkdirAll(filepath.Dir(fullPath), 0700); err != nil {
		return "", err
	}

	w, err := os.OpenFile(fullPath, os.O_CREATE|os.O_RDWR|os.O_TRUNC, 0600)
	if err != nil {
		return "", err
	}
	defer w.Close()

	if _, err := io.Copy(w, buffered); err != nil {
		return "", err
	}

	if err := w.Sync(); err != nil {
		return "", err
	}

	return pictureFile, nil
}

// Exported consts may be used for thumbnails.
const (
	ImageWebP, extWebp         = "image/webp", ".webp"
	ImageJPEG, extJpg, extJpeg = "image/jpeg", ".jpg", ".jpeg"
	ImageAVIF, extAvif         = "image/avif", ".avif"
	imagePNG, extPng           = "image/png", ".png"
	imageGif, extGif           = "image/gif", ".gif"

	VideoWebM = "video/webm"
)

// TODO: update this godoc related to latentID; and note that an empty latentID and nil error is a possible and valid return value here.
// processEntity ingests the incoming person into the timeline. The person's attributes are
// normalized, the person is looked up in the DB according to their identifying attributes
// first, then their ID if given, or their name (if enabled as ID); and if one match is found,
// the person's information and attributes are merged with the one in the database if this one
// has anything new to add; otherwise, it creates a new entity in the DB with the information
// provided. If multiple matches are found (a single identity attribute can technically map to
// multiple entities, however, I believe this has to be done manually for now), then the
// information for the person(s) is not updated (as that would make them look alike) but the
// attributes are added and linked at least to this one person. The row ID of the ID attribute
// that represents the input entity is returned. Note that a row ID of 0 can be returned if
// there is no identity attribute!
func (p *processor) processEntity(ctx context.Context, tx *sql.Tx, in Entity) (latentID, error) {
	// remove any empty attributes and standardize the entity
	if err := p.tl.normalizeEntity(&in); err != nil {
		return latentID{}, err
	}

	// don't save an empty entity (make sure we do this AFTER normalizing); it's totally
	// allowed for items to not have any particular owner specified
	if in.IsEmpty() {
		return latentID{}, nil
	}

	// try to find matching entities with given information; first try attributes because
	// they are more descriptive, then if no matches, use ID directly if provided, or if
	// not, then use name if only it is provided and set to act as an ID (not ideal though,
	// as names tend to change...) -- first try attributes by clearing name and ID from a
	// copy of the incoming entity
	inCopy := in
	inCopy.ID = 0
	entities, err := p.loadEntities(ctx, tx, &inCopy)
	if err != nil {
		return latentID{}, fmt.Errorf("searching entities by attributes: %w", err)
	}

	// then only if that didn't yield any results, try ID next
	if len(entities) == 0 && in.ID > 0 {
		entities, err = p.loadEntities(ctx, tx, &in)
		if err != nil {
			return latentID{}, fmt.Errorf("searching entities by ID: %w", err)
		}
	}

	// if our searches yielded 0 results, we can insert the entity; if 1, we can update it
	switch len(entities) {
	case 0:
		// don't create a new entity if *only* an entity ID was provided (it must have been
		// erroneous?) with no other attributes, as that implies the entity should already
		// exist but it does not... creating a new one is wrong
		if in.ID > 0 && len(in.Attributes) == 0 {
			return latentID{}, fmt.Errorf("input entity had non-zero row ID (%d), but that entity was not found; and without attributes we do not create a new entity", in.ID)
		}

		// marshal metadata and convert it to a string pointer... awkward, but seems more reasonable than a blob...
		metadata, err := in.metadataString()
		if err != nil {
			return latentID{}, err
		}

		// if we could not find a person by any of those identities, add new person to DB
		err = tx.QueryRowContext(ctx, `INSERT INTO entities
				(type_id, job_id, name, metadata) VALUES (?, ?, ?, ?)
				RETURNING id`,
			in.typeID, p.ij.job.id, in.dbName(), metadata).Scan(&in.ID)
		if err != nil {
			return latentID{}, fmt.Errorf("adding new person %+v: %w", in, err)
		}

		// now that we have the row ID, save profile picture
		pictureFile, err := p.processEntityPicture(ctx, in)
		if err != nil {
			p.log.Error("saving new entity's profile picture",
				zap.Uint64("entity_id", in.ID),
				zap.Error(err))
		} else if pictureFile != "" {
			_, err = tx.ExecContext(ctx, `UPDATE entities SET picture_file=? WHERE id=?`, pictureFile, in.ID) // TODO: LIMIT 1, if ever implemented
			if err != nil {
				p.log.Error("updating new entity's profile picture in DB",
					zap.Uint64("entity_id", in.ID),
					zap.String("picture_file", pictureFile),
					zap.Error(err))
			}
		}

		entities = append(entities, in)

		atomic.AddInt64(p.ij.newEntityCount, 1)
	case 1:
		// if the identity attribute(s) matched exactly 1 entity, update its info in the DB
		// (if it matched more than 1 entity, we don't update multiple of them, because we only
		// get 1 entity at a time from data sources and we'd end up making the two look alike...
		// I can't think of a single data source that explicitly maps one of their accounts to
		// multiple entities and gives us that information... so we lose that granularity or
		// specificity from data sources, and we have to rely on the user to maintain that
		// information because only they'd know)

		entity := entities[0]

		// update entity with the latest info from the incoming entity
		var setClause string
		var args []any
		// TODO: I used to have "|| (len(in.Name) > len(entity.Name))" as a condition, hoping that it would add missing information (such as a last name to a first name) but when testing with Josh it replaced his name with "Synology NAS Diskstation..." gibberish... anyway...
		if entity.Name == "" && in.Name != "" {
			if setClause != "" {
				setClause += ", "
			}
			setClause += "name=?"
			args = append(args, in.Name)
		}
		if entity.Picture == nil && in.NewPicture != nil {
			// we don't update existing profile pictures at this time... but we can set the picture if one doesn't already exist
			in.ID = entity.ID
			pictureFile, err := p.processEntityPicture(ctx, in)
			if err != nil {
				p.log.Error("saving existing entity's new profile picture",
					zap.Uint64("entity_id", entity.ID),
					zap.Error(err))
			} else if pictureFile != "" {
				if setClause != "" {
					setClause += ", "
				}
				setClause += "picture_file=?"
				args = append(args, pictureFile)
				// notify the UI that a new profile picture can be displayed
				if entity.ID == ownerEntityID {
					p.log.Info("new owner picture", zap.String("picture_file", pictureFile))
				}
			}
		}
		setClause = strings.TrimSuffix(setClause, ", ")

		if len(args) > 0 {
			args = append(args, entity.ID)
			_, err := tx.ExecContext(ctx, `UPDATE entities SET `+setClause+` WHERE id=?`, args...) //nolint:gosec
			if err != nil {
				return latentID{}, fmt.Errorf("updating person %d (%+v): %w", entity.ID, in, err)
			}
		}

	default:
		// There's multiple entities with this attribute -- could either be
		// because there was not enough information before to combine them
		// together automatically, or they are genuinely separate entities
		// that share an attribute.
		//
		// Example of the former: two messaging datasets are imported, one
		// uses an old phone number for someone and the other uses the
		// person's newer phone number, but only the old one has the person's
		// name. Two entities are created since we have no way of knowing it
		// is the same person. Then later, a contact list is imported which
		// has both phone numbers in a single entity. In that case, the
		// incoming entity will match both entities in the database because
		// it looks up with both phone number attributes. Their names are
		// compatible since one is non-empty, and the other is empty; it
		// should be no harm to merge the two.
		//
		// Example of the latter: a husband and wife share an email address,
		// like JoeAndJane@example.com. Obviously, these entities should
		// not be combined, as one is named Jane and the other is named
		// Joe! But if one of the entities is essentially empty (unnamed),
		// there's no harm in combining since that existing empty entity
		// could be either of them.
		//
		// Anyway, it's generally not safe to always merge duplicate entities,
		// but it should be okay as long as their names are "compatible."
		// (If it WAS always safe to do so, entities could never share
		// identifying attributes, and we would have a different, less
		// powerful schema design.
		entityName, compatible := entityNamesCompatible(entities)
		if compatible {
			// not really sure which one to keep; but it doesn't really matter,
			// any information missing on the one to keep that exists on the
			// others (name, profile picture, etc.) will cascade over to it
			entityToKeep := entities[0]
			mergeEntities := entities[1:]

			if err := p.tl.mergeEntities(ctx, tx, entityToKeep, mergeEntities); err != nil {
				return latentID{}, fmt.Errorf("failed merging entities: %w", err)
			}
			if checkedLog := p.log.Check(zapcore.InfoLevel, "merged entities based on new information"); checkedLog != nil {
				entityIDs := make([]uint64, 0, len(mergeEntities))
				for _, ent := range mergeEntities {
					entityIDs = append(entityIDs, ent.ID)
				}
				checkedLog.Write(
					zap.String("unified_entity_name", entityName),
					zap.Uint64("kept_entity_id", entityToKeep.ID),
					zap.Uint64s("merged_entity_ids", entityIDs),
				)
			}
		}
	}

	// now store each attribute+value/coords combo if we don't have it yet,
	// and link it to the entity
	var identityAttributeID uint64
	for _, attr := range in.Attributes {
		attrID, err := storeAttribute(ctx, tx, attr)
		if err != nil {
			return latentID{}, fmt.Errorf("%w (entity_name=%s)", err, in.Name)
		}

		// if this is the identity attribute, then its row ID is what we need to return so item can be associated with it;
		// and the data source (DS) ID will be added to its entity_attributes row to indicate their identity on this DS
		var linkedDataSourceID *uint64
		if attr.Identity {
			identityAttributeID = attrID
			linkedDataSourceID = &p.dsRowID
		}

		for _, entity := range entities {
			var eaID uint64
			var existingDataSourceID *uint64

			q := `SELECT id, data_source_id FROM entity_attributes WHERE entity_id=? AND attribute_id=?`
			args := []any{entity.ID, attrID}

			// If this is an identity attribute, we need to select the row that portrays this identity
			// if there is one, or select the existing row that doesn't yet portray / know about this
			// identity... this is because an entity can have the same attribute be their identity on
			// multiple data sources (like phone number, or email address, are often used by multiple
			// data sources), and we want all of them to be represented. This can lead to duplicate
			// results in search queries, though, unless they use proper GROUP BY clauses to deduplicate
			// on the row ID of what is being searched. For example, search items should GROUP BY item
			// row ID and relationship row ID to avoid duplicate "sent to" relationships and duplicate
			// items in the search results.
			if attr.Identity {
				q += " AND (data_source_id=? OR data_source_id IS NULL)"
				args = append(args, linkedDataSourceID)
			}
			q += " LIMIT 1"

			err = tx.QueryRowContext(ctx, q, args...).Scan(&eaID, &existingDataSourceID)
			noRows := errors.Is(err, sql.ErrNoRows)
			if err != nil && !noRows {
				return latentID{}, fmt.Errorf("checking for existing link of entity to attribute: %w (entity_id=%d attribute_id=%d)", err, entity.ID, attrID)
			}

			if noRows {
				// the entity and attribute are not yet related in the DB; insert

				_, err = tx.ExecContext(ctx,
					`INSERT INTO entity_attributes
						(entity_id, attribute_id, data_source_id, job_id)
					VALUES (?, ?, ?, ?)`,
					entity.ID, attrID, linkedDataSourceID, p.ij.job.id)
				if err != nil {
					return latentID{}, fmt.Errorf("linking entity %d to attribute %d: %w (data_source_id=%#v job_id=%d)",
						entity.ID, attrID, err, linkedDataSourceID, p.ij.job.id)
				}
			} else if eaID > 0 && existingDataSourceID == nil {
				// the entity and attribute are already related in the DB but not as an ID on any data source; update

				_, err = tx.ExecContext(ctx,
					`UPDATE entity_attributes SET data_source_id=?, job_id=? WHERE id=?`, // TODO: LIMIT 1 would be nice...
					linkedDataSourceID, p.ij.job.id, eaID)
				if err != nil {
					return latentID{}, fmt.Errorf("updating entity %d link to attribute %d: %w", entity.ID, attrID, err)
				}
			}
		}
	}

	return latentID{
		entityID:    entities[0].ID,
		attributeID: identityAttributeID,
	}, nil
}

func (p *processor) loadEntities(ctx context.Context, tx *sql.Tx, in *Entity) ([]Entity, error) {
	var sb strings.Builder
	var args []any

	// we use DISTINCT here to avoid duplicate rows in the output,
	// since multiple attributes could match the same entity, for
	// example -- note that UNION which we perform when there are
	// multiple entities also de-duplicates, and we don't need
	// DISTINCT in that case; but to avoid complexity with building
	// the query, I tested query plans that use both DISTINCT and
	// UNION, and found that they were identical (the DB isn't
	// doing extra work when both DISTINCT and UNION are used).
	const selectClause = `
SELECT DISTINCT
	entities.id,
	entities.name,
	entities.picture_file,
	entities.metadata
FROM entities
`

	sb.WriteString(selectClause)

	if in.ID > 0 {
		// in some cases, the entity ID may be specified manually; load that entity directly by ID instead of trying to find it
		sb.WriteString("WHERE entities.id=? AND deleted IS NULL")
		args = append(args, in.ID)
	} else {
		if in.Type == EntityPlace {
			// place entities can be retrieved by name
			sb.WriteString("JOIN entity_types ON entity_types.id = entities.type_id\n")
			sb.WriteString("WHERE entity_types.name=? AND entities.name=?")
			args = append(args, EntityPlace, in.Name)
		}

		for _, attr := range in.Attributes {
			// skip non-identifying attributes as those are not likely to be unique enough
			if !attr.Identifying && !attr.Identity {
				continue
			}

			if len(args) > 0 {
				sb.WriteString("\n\nUNION\n\n-- match by attribute ")
				sb.WriteString(attr.Name)
				sb.WriteString(selectClause)
			}
			sb.WriteString(`JOIN entity_attributes AS ea ON ea.entity_id = entities.id
JOIN attributes ON attributes.id = ea.attribute_id
WHERE deleted IS NULL AND attributes.name=? AND (`)
			args = append(args, attr.Name)

			const decPrecisionAt, decPrecisionNear = 4, 3
			lowLonAt, highLonAt, lonDecAt := latLonBounds(attr.Longitude, decPrecisionAt)
			lowLatAt, highLatAt, latDecAt := latLonBounds(attr.Latitude, decPrecisionAt)
			lowLonNear, highLonNear, lonDecNear := latLonBounds(attr.Longitude, decPrecisionNear)
			lowLatNear, highLatNear, latDecNear := latLonBounds(attr.Latitude, decPrecisionNear)

			if attr.Value != nil {
				sb.WriteString("attributes.value=?")
				args = append(args, attr.valueForDB())
			}
			if attr.Latitude != nil && attr.Longitude != nil {
				if attr.Value != nil {
					sb.WriteString(" OR ")
				}
				sb.WriteString(`
(
	-- entity name is different, but coordinate is really close to same spot as an existing attribute
	entities.name!=?
	AND round(attributes.longitude, ?) BETWEEN ? AND ?
	AND round(attributes.latitude, ?) BETWEEN ? AND ?
)
OR (
	-- entity name is the same, and coordinate is somewhat close to same spot as an existing attribute
	entities.name=?
	AND round(attributes.longitude, ?) BETWEEN ? AND ?
	AND round(attributes.latitude, ?) BETWEEN ? AND ?
)`)
				args = append(args,
					in.Name,
					lonDecAt, lowLonAt, highLonAt,
					latDecAt, lowLatAt, highLatAt,
					in.Name,
					lonDecNear, lowLonNear, highLonNear,
					latDecNear, lowLatNear, highLatNear)
			}
			sb.WriteString(")")
		}
	}

	var entities []Entity

	// only perform query if there are any qualifiers; otherwise this query returns
	// all rows which is obviously wrong; if there's no ID or identifying attributes,
	// there's no way for us to know who this entity is anyway
	if len(args) > 0 {
		q := sb.String()

		// run the query to get the entity/entities (note there may be multiple; e.g. if multiple people
		// share an account or the account has attributes spanning nultiple positively-ID'ed people)
		rows, err := tx.QueryContext(ctx, q, args...)
		if err != nil {
			return nil, fmt.Errorf("querying entity attributes: %w", err)
		}
		defer rows.Close()

		for rows.Next() {
			var ent Entity
			var name *string
			var metadata *string
			err := rows.Scan(&ent.ID, &name, &ent.Picture, &metadata)
			if err != nil {
				return nil, fmt.Errorf("scanning entity ID: %w", err)
			}
			if name != nil {
				ent.Name = *name
			}
			if metadata != nil {
				err := json.Unmarshal([]byte(*metadata), &ent.Metadata)
				if err != nil {
					return nil, fmt.Errorf("unmarshaling entity metadata: %w", err)
				}
			}
			entities = append(entities, ent)
		}
		if err := rows.Err(); err != nil {
			return nil, fmt.Errorf("iterating entity rows: %w", err)
		}
	}

	return entities, nil
}

func storeAttribute(ctx context.Context, tx *sql.Tx, attr Attribute) (uint64, error) {
	// prepare some values for DB insertion
	var metadata *string
	if len(attr.Metadata) > 0 {
		metaBytes, err := json.Marshal(attr.Metadata)
		if err != nil {
			return 0, fmt.Errorf("encoding attribute metadata as JSON: %w", err)
		}
		metaString := string(metaBytes)
		metadata = &metaString
	}
	var altValue *string
	if strings.TrimSpace(attr.AltValue) != "" {
		altValue = &attr.AltValue
	}

	const decPrecision = 3
	lowLon, highLon, lonDec := latLonBounds(attr.Longitude, decPrecision)
	lowLat, highLat, latDec := latLonBounds(attr.Latitude, decPrecision)

	var attrID uint64
	err := tx.QueryRowContext(ctx, `
		SELECT id
		FROM attributes
		WHERE name=? AND (
			value=?
			OR (
				round(longitude, ?) BETWEEN ? AND ?
				AND round(latitude, ?) BETWEEN ? AND ?
			)
		)
		LIMIT 1`, attr.Name, attr.valueForDB(),
		lonDec, lowLon, highLon,
		latDec, lowLat, highLat).Scan(&attrID)
	if err != nil && !errors.Is(err, sql.ErrNoRows) {
		return 0, fmt.Errorf("looking up attribute '%+v': %w", attr, err)
	}

	if attrID == 0 {
		// store the attribute+value/coords since we don't have this combo yet
		err := tx.QueryRowContext(ctx, `INSERT INTO attributes
			(name, value, alt_value, longitude, latitude, altitude, metadata)
			VALUES (?, ?, ?, ?, ?, ?, ?)
			RETURNING id`,
			attr.Name, attr.valueForDB(), altValue,
			attr.Longitude, attr.Latitude, attr.Altitude, metadata).Scan(&attrID)
		if err != nil {
			return 0, fmt.Errorf("storing attribute '%+v': %w", attr, err)
		}
	}

	return attrID, nil
}

// MergeEntities combines the entities to merge into the entity to keep.
func (tl *Timeline) MergeEntities(ctx context.Context, entityIDToKeep uint64, entityIDsToMerge []uint64) error {
	// input verification / sanity checks, as well as loading entity information
	if entityIDToKeep <= 0 {
		return errors.New("entity to keep must have an ID greater than 0")
	}
	entitiesToMerge := make([]Entity, 0, len(entityIDsToMerge))
	seen := make(map[uint64]struct{}) // deduplication
	for i, id := range entityIDsToMerge {
		if id <= 0 {
			return fmt.Errorf("entities to merge must have IDs greater than 0 (%d)", id)
		}
		if entityIDToKeep == id {
			return fmt.Errorf("cannot merge entity into itself (%d)", id)
		}
		if _, ok := seen[id]; ok {
			return fmt.Errorf("entity to merge specified more than once (%d)", id)
		}
		seen[id] = struct{}{}
		if id == ownerEntityID {
			// TODO: always keep entity 1 for now, since that's the repo owner... until we figure out a better solution
			entityIDToKeep, entityIDsToMerge[i], id = id, entityIDToKeep, entityIDToKeep
		}

		// load all the entities before we get a lock on the DB to do the changes
		entMerge, err := tl.LoadEntity(id)
		if err != nil {
			return fmt.Errorf("loading entity to merge: %w", err)
		}
		entitiesToMerge = append(entitiesToMerge, entMerge)
	}
	entKeep, err := tl.LoadEntity(entityIDToKeep)
	if err != nil {
		return fmt.Errorf("loading entity to keep: %w", err)
	}

	tx, err := tl.db.WritePool.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	defer tx.Rollback()

	if err = tl.mergeEntities(ctx, tx, entKeep, entitiesToMerge); err != nil {
		return fmt.Errorf("merging entities: %w", err)
	}

	return tx.Commit()
}

func (tl *Timeline) mergeEntities(ctx context.Context, tx *sql.Tx, entKeep Entity, entitiesToMerge []Entity) error {
	for _, entMerge := range entitiesToMerge {
		// bring over any information on the entity to merge that's missing on the entity to keep
		if entMerge.Name != "" && entKeep.Name == "" {
			entKeep.Name = entMerge.Name
		}

		// combine metadata (add only missing fields)
		entKeep.Metadata.Merge(entMerge.Metadata, MetaMergeSkip)
		metadata, err := entKeep.metadataString()
		if err != nil {
			return err
		}

		// if they both have a profile picture, we'll need to delete the one from the entity being merged
		var oldPictureFile *string
		if entMerge.Picture != nil {
			if entKeep.Picture == nil {
				entKeep.Picture = entMerge.Picture
			} else {
				oldPictureFile = entMerge.Picture
			}
		}

		// update the entity to keep with any info that was transferred over from the entity to merge
		_, err = tx.ExecContext(ctx, `UPDATE entities SET name=?, picture_file=?, metadata=? WHERE id=?`,
			entKeep.Name, entKeep.Picture, metadata, entKeep.ID)
		if err != nil {
			return fmt.Errorf("updating entity row: %w", err)
		}

		// replace entity IDs in the entity_attributes table, but if a row update will cause a
		// duplicate on entity_id, attribute_id, and data_source_id fields, delete the row instead
		if _, err := tx.ExecContext(ctx, `
			DELETE FROM entity_attributes AS ea
				WHERE entity_id=?
				AND EXISTS (
					SELECT 1
					FROM entity_attributes AS ea_old
					WHERE ea_old.entity_id=?
						AND ea_old.attribute_id = ea.attribute_id
						AND ea_old.data_source_id IS ea.data_source_id
				)
		`, entKeep.ID, entMerge.ID); err != nil {
			return fmt.Errorf("deleting rows that would violate unique constraints after imminent update in entity_attributes: %w", err)
		}
		if _, err := tx.ExecContext(ctx, `UPDATE entity_attributes SET entity_id=? WHERE entity_id=?`, entKeep.ID, entMerge.ID); err != nil {
			return fmt.Errorf("replacing entity ID in entity_attributes: %w", err)
		}

		// replace entity IDs elsewhere in the database (TODO: we might need to employ similar deletion logic above since we could cause duplicate rows, but these tables are manually populated so it's less likely to make duplicates here)
		if _, err := tx.ExecContext(ctx, `UPDATE tagged SET entity_id=? WHERE entity_id=?`, entKeep.ID, entMerge.ID); err != nil {
			return fmt.Errorf("replacing entity ID in tagged: %w", err)
		}
		if _, err := tx.ExecContext(ctx, `UPDATE story_elements SET entity_id=? WHERE entity_id=?`, entKeep.ID, entMerge.ID); err != nil {
			return fmt.Errorf("replacing entity ID in story_elements: %w", err)
		}
		if _, err := tx.ExecContext(ctx, `UPDATE notes SET entity_id=? WHERE entity_id=?`, entKeep.ID, entMerge.ID); err != nil {
			return fmt.Errorf("replacing entity ID in notes: %w", err)
		}

		// handle pass-through attribute for the entity being merged (start by seeing if there's one for the entity to keep)
		var passThruAttrIDKeep int64
		if err = tx.QueryRowContext(ctx, `SELECT id FROM attributes WHERE name=? AND value=? LIMIT 1`, passThruAttribute, entKeep.ID).Scan(&passThruAttrIDKeep); err != nil && !errors.Is(err, sql.ErrNoRows) {
			return fmt.Errorf("selecting pass-thru attribute for entity to keep: %w", err)
		}
		if passThruAttrIDKeep == 0 {
			// a pass-thru attribute doesn't exist for the entity to keep, so we can safely
			// just update the pass-thru attribute (if any) for the one to merge to point to
			// the one to keep
			if _, err := tx.ExecContext(ctx, `UPDATE attributes SET value=? WHERE name=? AND value=?`, entKeep.ID, passThruAttribute, entMerge.ID); err != nil {
				return fmt.Errorf("updating pass-thru attribute to point to entity to keep: %w", err)
			}
		} else {
			// this is a little more work: we can't just update an attribute because it would
			// violate uniqueness constraints; so update everything that points to the old
			// attribute to point to the new one instead
			var passThruAttrIDMerge int64
			if err = tx.QueryRowContext(ctx, `SELECT id FROM attributes WHERE name=? AND value=? LIMIT 1`, passThruAttribute, entMerge.ID).Scan(&passThruAttrIDMerge); err != nil && !errors.Is(err, sql.ErrNoRows) {
				return fmt.Errorf("selecting pass-thru attribute for entity to merge: %w", err)
			}
			if passThruAttrIDMerge > 0 {
				if _, err := tx.ExecContext(ctx, `UPDATE items SET attribute_id=? WHERE attribute_id=?`, passThruAttrIDKeep, passThruAttrIDMerge); err != nil {
					return fmt.Errorf("updating attribute ID in items table: %w", err)
				}
				if _, err := tx.ExecContext(ctx, `UPDATE relationships SET from_attribute_id=? WHERE from_attribute_id=?`, passThruAttrIDKeep, passThruAttrIDMerge); err != nil {
					return fmt.Errorf("updating 'from' attribute ID in relationships table: %w", err)
				}
				if _, err := tx.ExecContext(ctx, `UPDATE relationships SET to_attribute_id=? WHERE to_attribute_id=?`, passThruAttrIDKeep, passThruAttrIDMerge); err != nil {
					return fmt.Errorf("updating 'to' attribute ID in relationships table: %w", err)
				}
				if _, err := tx.ExecContext(ctx, `DELETE FROM entity_attributes WHERE attribute_id=?`, passThruAttrIDMerge); err != nil {
					return fmt.Errorf("deleting row in entity_attributes table: %w (attribute_id=%d)", err, passThruAttrIDMerge)
				}
				if _, err := tx.ExecContext(ctx, `DELETE FROM attributes WHERE id=?`, passThruAttrIDMerge); err != nil {
					return fmt.Errorf("deleting pass-thru attribute for entity to merge: %w (attribute_id=%d)", err, passThruAttrIDMerge)
				}
			}
		}

		// clean up the entity to merge
		if _, err := tx.ExecContext(ctx, `DELETE FROM entities WHERE id=?`, entMerge.ID); err != nil {
			return fmt.Errorf("deleting from entities table: %w", err)
		}
		if oldPictureFile != nil {
			if err := tl.deleteRepoFile(*oldPictureFile); err != nil {
				return err
			}
		}
	}

	return nil
}

// entityNamesCompatible returns the singular name of these entities and true
// if they are compatible, meaning that all the entities with a non-empty Name
// have the same name.
func entityNamesCompatible(entities []Entity) (string, bool) {
	var entityName string
	for _, e := range entities {
		if e.Name == "" {
			continue
		}
		if entityName == "" {
			entityName = e.Name
		} else if !strings.EqualFold(e.Name, entityName) {
			return "", false
		}
	}
	return entityName, true
}

// latentID is a type that holds either the ID of an item
// or an identity attribute for an entity. If an entity
// was processed without an explicit identity attribute,
// this struct will hold the row ID of the entity instead,
// and if the ID of the identity attribute is later needed,
// call identifyingAttributeID() to create a pass-thru
// entity and get a usable attribute ID linked to the
// entity. For items, the itemID field can be accessed directly.
type latentID struct {
	itemID, entityID, attributeID uint64
}

// id returns either the item ID or the attribute ID, whichever is set.
func (l latentID) id() uint64 {
	if l.itemID > 0 {
		return l.itemID
	}
	if l.attributeID > 0 {
		return l.attributeID
	}
	return 0
}

// identifyingAttributeID returns either the known row ID of the
// attribute identifying the entity, or it will create a pass-thru
// attribute for the entity and return the new row ID.
func (l *latentID) identifyingAttributeID(ctx context.Context, tx *sql.Tx) (uint64, error) {
	if l.attributeID > 0 {
		return l.attributeID, nil
	}
	if l.entityID == 0 {
		return 0, errors.New("no attribute ID or entity ID set")
	}

	attrID, err := storeLinkBetweenEntityAndNonIDAttribute(ctx, tx, l.entityID, Attribute{
		Name:     passThruAttribute,
		Value:    strconv.FormatUint(l.entityID, 10),
		Identity: true,
	})
	if err != nil {
		return 0, err
	}
	l.attributeID = attrID

	return l.attributeID, nil
}

// getSystemLocale returns the language, region, and encoding, if available.
func getSystemLocale(ctx context.Context) (language, region, encoding string, err error) {
	// Example of LANG: en_US.UTF-8
	if lang := os.Getenv("LANG"); lang != "" {
		underscorePos := strings.Index(lang, "_")
		if underscorePos < 0 {
			err = fmt.Errorf("no underscore divider between language and region in LANG=%s", lang)
			return
		}
		dotPos := strings.Index(lang, ".")
		if dotPos < 0 {
			err = fmt.Errorf("no dot divider between language and region in LANG=%s", lang)
		}
		language = lang[:underscorePos]
		region = lang[underscorePos+1 : dotPos]
		encoding = lang[dotPos+1:]
		return
	}
	if runtime.GOOS == "windows" {
		cmd := exec.CommandContext(ctx, "powershell", "Get-Culture | select -exp Name")
		var output []byte
		output, err = cmd.Output() // e.g. "en-US"
		if err != nil {
			return
		}
		parts := strings.Split(strings.TrimSpace(string(output)), "-")
		if len(parts) == 2 { //nolint:mnd
			language = parts[0]
			region = parts[1]
		}
		return
	}
	err = errors.New("unable to determine locale because LANG var is unset and/or platform unrecognized")
	return
}

var multiSpaceRegex = regexp.MustCompile(`\s{2,}`)

// passThruAttribute is the name of the special attribute that is used to identify
// a specific entity by its row ID and no other features (attributes) in particular.
const passThruAttribute = "_entity"

// Canonical attribute names.
const (
	AttributeEmail       = "email_address"
	AttributePhoneNumber = "phone_number"
	AttributeGender      = "gender"
)