1
0
Fork 0
timelinize/timeline/search.go
Ralf Ebert 8218ab0a39
Don't use time_offset when searching items (#172)
Fix gallery photo prev/next for photos with time zone

When jumping between photos in the gallery with the prev/next button, the buttons were incorrectly enabled/disabled/wouldn't work even if enabled for me. This was caused by the photos having a time zone UTC+1 (time_offset = 3600). The js frontend uses formatted timestamps including the time zone (peekFromItem.timestamp in galleryFilterParams). These are converted to a unix timestamp via UTC().UnixMilli(), so the time zone is already taken care of and adding time_offset is not necessary. For the filtering in the gallery grid itself, a unix timestamp is generated client-side (something like &start=1762124400&end=1762210800) taking the system time zone into account, so this is correct without any extra logic as well.
2025-11-13 10:50:46 -07:00

1058 lines
36 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package timeline
import (
"context"
"database/sql"
"encoding/json"
"errors"
"fmt"
"math"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"time"
)
// ItemSearchParams describes a search for items.
//
// Fields with a slice/array type typically mean "any of these"
// (their elements are OR'ed together).
type ItemSearchParams struct {
	// The UUID of the open timeline to search.
	Repo string `json:"repo,omitempty"`

	// ML searches -- currently, only one of these can be set at a time
	// TODO: add a QueryImage field for image search
	SemanticText string `json:"semantic_text,omitempty"` // similar to the centroid of the vector formed from this phrase
	SimilarTo    int64  `json:"similar_to,omitempty"`    // similar to the centroid of the vector of the specified item (TODO: should it just be embedding ID?)

	RowID []int64 `json:"row_id,omitempty"`
	// AccountID []int `json:"account_id,omitempty"` // TODO: restore this, if useful
	DataSourceName []string `json:"data_source,omitempty"`
	JobID          []int64  `json:"job_id,omitempty"`
	AttributeID    []int64  `json:"attribute_id,omitempty"`
	EntityID       []uint64 `json:"entity_id,omitempty"`
	Classification []string `json:"classification,omitempty"`
	// NOTE: tag was previously misspelled "jsson", which made
	// encoding/json fall back to the Go field name ("OriginalID").
	OriginalID []string `json:"original_id,omitempty"`
	DataType   []string `json:"data_type,omitempty"`
	DataText   []string `json:"data_text,omitempty"`
	DataFile   []string `json:"data_file,omitempty"`

	// TODO: how do we effectively search metadata? maybe virtual columns? https://antonz.org/json-virtual-columns/
	// TODO: Well, this query was fast: `SELECT * FROM items WHERE items.metadata->>'$.Make' = 'Google' LIMIT 10`
	// Metadata map[string][]any `json:"metadata,omitempty"`

	// TODO: a way to get items with EntityID, but also any relation FromEntityID or ToEntityID...
	ToAttributeID []int64 `json:"to_attribute_id,omitempty"`
	ToEntityID    []int64 `json:"to_entity_id,omitempty"`

	// filter by relationship to other items
	// TODO: We can probably wrap ToAttributeID, ToEntityID, and maybe Astructured into this?
	Relations []RelationParams `json:"relations,omitempty"`

	// bounding box searches
	StartTimestamp       *time.Time `json:"start_timestamp,omitempty"`
	EndTimestamp         *time.Time `json:"end_timestamp,omitempty"`
	MinLatitude          *float64   `json:"min_latitude,omitempty"`
	MaxLatitude          *float64   `json:"max_latitude,omitempty"`
	MinLongitude         *float64   `json:"min_longitude,omitempty"`
	MaxLongitude         *float64   `json:"max_longitude,omitempty"`
	Inclusive            bool       `json:"inclusive,omitempty"`              // if true, bounding box is <= and >= instead of < and >
	StrictStartTimestamp bool       `json:"strict_start_timestamp,omitempty"` // if true, item timestamp must be after start_timestamp (even if item timespan is after)
	StrictEndTimestamp   bool       `json:"strict_end_timestamp,omitempty"`   // if true, item timespan must be before end_timestamp (even if item timestamp is before)
	NoLocation           bool       `json:"no_location,omitempty"`            // if true, require location columns to be NULL regardless of max/min lat/lon

	// proximity searches (location and time are mutually exclusive)
	Timestamp *time.Time `json:"timestamp,omitempty"`
	Latitude  *float64   `json:"latitude,omitempty"`
	Longitude *float64   `json:"longitude,omitempty"`

	// If true, OR different fields instead of AND
	OrFields bool `json:"or_fields,omitempty"`

	// How to order results. Default: usually timestamp,
	// but can be "smart" depending on search parameters.
	// This can also be "stored" to order by date added
	// to DB.
	OrderBy string `json:"order_by,omitempty"`

	Sort   SortDir `json:"sort,omitempty"`   // ignored if doing proximity search (unless it's SortNone, which breaks proximity searches)
	Limit  int     `json:"limit,omitempty"`  // number of rows to include (-1 for no limit); default 1000
	Offset int     `json:"offset,omitempty"` // number of rows to skip (can be slow if very large)
	Sample int     `json:"sample,omitempty"` // retrieve every Nth row

	// TODO: Matt's note: Pagination can be done more efficiently than
	// Offset or WithTotal (below) by using little tricks. For example,
	// if the exact count isn't required, set a large limit N (but still
	// a sensible order of magnitude) and report simply "N+" items; or
	// instead of offset, use WHERE to exclude items we've already paged
	// through using an indexed column (for example, if ordering by
	// timestamp, use WHERE to exclude items with timestamps that
	// would have appeared on previous pages; i.e. keep a cursor of
	// last item's timestamp from prev. page)

	// query related items and people recursively to this many degrees
	// (e.g. 0 is only the item, 1 adds direct relationships, etc...)
	Related int `json:"related,omitempty"`

	// By default (when this is false), search results will prioritize
	// the root nodes of item graphs, i.e. items that are not on the
	// end of a directed relation/edge, since this makes more sense
	// when viewing the timeline. If this is true, the ends of
	// directed relations are ignored, and related items may appear
	// in the "top level" of search results as well as nested in
	// "related".
	// TODO: Rename? Like only_root_items (that would also invert the value...)
	Astructured bool `json:"flat,omitempty"`

	// TODO: Experimental, for autocomplete... not sure if useful.
	PreferPrefix bool `json:"prefer_prefix,omitempty"`

	// In addition to results, return a total count that ignores offset and limit.
	// This can be very slow for broad queries on large DBs.
	// TODO: This can be very slow for large DBs on broad queries... maybe cache results? Or find a way to paginate without this (see comment above)
	WithTotal bool `json:"with_total,omitempty"`

	// Only return a total count that ignores offset and limit.
	OnlyTotal bool `json:"only_total,omitempty"`

	// If true, the only fields that will be selected from the DB are
	// latitude and longitude, and the search results will not be filled
	// with items; instead, the results will contain a GeoJSON document
	// with the coordinates, which can then be used directly by mapping
	// software to render point or heatmap information. This approach is
	// an order of magnitude faster (10x in my testing) for large data
	// sets (1M+ rows) to render a heatmap, where only coordinates are
	// needed. It avoids scanning the entire item row and allocating
	// multiple structs and performing various other computations.
	// Instead it efficiently builds a string containing only the
	// coordinate data. Non-spatial data will be excluded.
	GeoJSON           bool                 `json:"geojson,omitempty"`
	ObfuscatedGeoJSON []ObfuscatedLocation `json:"-"` // TODO: this is kind of a hack since it's the only data we encode before returning (for efficiency)

	// Include the size of the item content with the results.
	// For data files, this involves calling stat() on the file.
	WithSize bool `json:"with_size,omitempty"`

	// If true, include deleted items (that haven't been erased yet).
	Deleted bool `json:"deleted,omitempty"`

	// stores the converted names to row IDs
	classificationIDs []uint64
}
// SearchResults is the return value of Search. Exactly one of the
// result forms is populated depending on the requested mode:
// Items for normal searches, GeoJSON for GeoJSON mode, and only
// Total for count-only (OnlyTotal) searches.
type SearchResults struct {
	// If enabled, the total count of items matching the search query
	// without regard for limit and offset.
	Total *int `json:"total,omitempty"`

	// The items of the search result.
	Items []*SearchResult `json:"items,omitempty"`

	// The search results in GeoJSON mode. A GeoJSON document
	// useful for rendering heatmaps or clusters.
	GeoJSON string `json:"geojson,omitempty"`
}
// Search executes a search of the timeline's items according to params.
// Depending on the parameters, it returns matching item rows (with
// associated entities, expanded relationships, and optionally content
// sizes), only a total count (OnlyTotal), or a GeoJSON document of
// coordinates (GeoJSON mode). Semantic-text searches additionally
// re-rank and cull results using a classifier.
func (tl *Timeline) Search(ctx context.Context, params ItemSearchParams) (SearchResults, error) {
	// setting a natural language input has big implications, so make sure it's not just whitespace by accident
	params.SemanticText = strings.TrimSpace(params.SemanticText)

	// get the DB query string and associated arguments
	q, args, err := tl.prepareSearchQuery(ctx, params)
	if err != nil {
		return SearchResults{}, err
	}

	// open DB transaction to hopefully make it more efficient; may involve many queries
	// (TODO: we don't currently commit this tx, because we didn't make changes - that's OK, right?)
	tx, err := tl.db.ReadPool.BeginTx(ctx, nil)
	if err != nil {
		return SearchResults{}, err
	}
	defer tx.Rollback()

	// in count-only mode, there's only a single row with a single field
	if params.OnlyTotal {
		var count int
		err := tx.QueryRowContext(ctx, q, args...).Scan(&count)
		if err != nil {
			return SearchResults{}, err
		}
		return SearchResults{Total: &count}, nil
	}

	// run query and scan results
	rows, err := tx.QueryContext(ctx, q, args...)
	if err != nil {
		return SearchResults{}, fmt.Errorf("querying db for items: %w", err)
	}

	// in GeoJSON mode, skip the expensive full-row scans etc; instead, efficiently
	// build a GeoJSON string directly -- it's very specific so we can do even better
	// than json.Marshal by using a strings builder.
	var sb strings.Builder
	if params.GeoJSON {
		// in testing, I found that Mapbox renders a single huge MultiPoint feature
		// way faster than many Point features
		sb.WriteString(`{"type":"Feature","geometry":{"type":"MultiPoint","coordinates":[`)
	}

	// for JSON serialization, always initialize so "0 results" is at least an empty list and not null
	results := make([]*SearchResult, 0)

	var count, totalCount int
	for rows.Next() { // fun fact, the first call to Next() is actually what runs the query
		// in GeoJSON mode, skip the usual full-row scan, and instead focus
		// on the coordinates which is much more efficient
		if params.GeoJSON {
			// read the coordinate data
			var rowID uint64
			var lat, lon *float64
			targets := []any{&rowID, &lat, &lon}
			if params.WithTotal {
				targets = append(targets, &totalCount)
			}
			err := rows.Scan(targets...)
			if err != nil {
				defer rows.Close()
				return SearchResults{}, err
			}

			// ensure it's valid, then obfuscate if enabled, then format as strings (waaaaay faster than reflection)
			if lat == nil || lon == nil {
				continue
			}
			for _, locob := range params.ObfuscatedGeoJSON {
				if locob.Contains(*lat, *lon) {
					newLat, newLon := locob.Obfuscate(*lat, *lon, rowID)
					lat = &newLat
					lon = &newLon
				}
			}
			latStr := strconv.FormatFloat(*lat, 'f', -1, 64)
			lonStr := strconv.FormatFloat(*lon, 'f', -1, 64)

			// write coordinate to the GeoJSON document
			// (comma only between coordinates, not before the first one)
			if count > 0 {
				sb.WriteRune(',')
			}
			sb.WriteRune('[')
			sb.WriteString(lonStr)
			sb.WriteRune(',')
			sb.WriteString(latStr)
			sb.WriteRune(']')

			// continue with next row; skip all the expensive stuff
			count++
			continue
		}

		var re relatedEntity // entity is left-joined, so could be null
		var embeddingDistance float64
		var embeddingID *uint64 // just to know whether there are any embeddings for the item

		// extra scan targets must match the column order built by prepareSearchQuery
		extraTargets := []any{&re.ID, &re.Name, &re.Picture, &re.Attribute.Name, &re.Attribute.Value, &re.Attribute.AltValue, &embeddingID}
		if params.WithTotal {
			extraTargets = append(extraTargets, &totalCount)
		}
		if params.SemanticText != "" || params.SimilarTo > 0 {
			extraTargets = append(extraTargets, &embeddingDistance)
		}

		itemRow, err := scanItemRow(rows, extraTargets)
		if err != nil {
			rows.Close()
			return SearchResults{}, err
		}

		sr := &SearchResult{RepoID: tl.id.String(), ItemRow: itemRow, HasEmbedding: embeddingID != nil, Distance: embeddingDistance}
		if re.ID != nil {
			sr.Entity = &re
		}

		results = append(results, sr)
	}
	rows.Close()
	if err = rows.Err(); err != nil {
		return SearchResults{}, fmt.Errorf("iterating item rows: %w", err)
	}

	// in GeoJSON mode, we can be done early by wrapping up our document
	if params.GeoJSON {
		sb.WriteString(`]}}`)
		return SearchResults{GeoJSON: sb.String(), Total: &totalCount}, nil
	}

	// TODO: this needs tuning
	if params.SemanticText != "" {
		// filter results for relevance by passing them through the classifier...
		// this is kind of a hack, but it's a well-known difficult problem apparently,
		// for any KNN search, to only show relevant results
		itemFiles := make(map[uint64]string)
		for _, result := range results {
			if result.DataFile == nil || result.DataType == nil {
				continue
			}
			// only image files are sent to the classifier
			if !strings.HasPrefix(*result.DataType, "image/") {
				continue
			}
			itemFiles[result.ID] = tl.FullPath(*result.DataFile)
		}

		scores, err := classify(ctx, itemFiles, []string{params.SemanticText})
		if err != nil {
			return SearchResults{}, fmt.Errorf("classifying results: %w", err)
		}

		for _, sr := range results {
			if score, ok := scores[sr.ID]; ok {
				sr.Score = score
			}
		}

		// sort by score descending, then prefer items that weren't scored over scored items with a low score
		sort.Slice(results, func(i, j int) bool {
			_, iWasScored := scores[results[i].ID]
			_, jWasScored := scores[results[j].ID]
			return results[i].Score > results[j].Score || (jWasScored && !iWasScored)
		})

		// chop off results that are irrelevant, starting with the first result that was scored and has a score near zero
		for i, sr := range results {
			_, wasScored := scores[sr.ID]
			// TODO: Be smarter: find the range the top/best results score, and only cull results below a significant threshold difference
			if sr.Score <= 0.002 && wasScored {
				results = results[:i]
				break
			}
		}
	}

	// traverse relationships
	for _, sr := range results {
		err = tl.expandRelationships(ctx, tx, params.Related, sr)
		if err != nil {
			return SearchResults{}, err
		}
	}

	// include size information, if requested
	if params.WithSize {
		for _, sr := range results {
			if sr.DataText != nil {
				sr.Size = int64(len(*sr.DataText))
			}
			if sr.DataFile != nil {
				// best-effort: stat errors simply leave Size unset
				info, err := os.Stat(filepath.Join(tl.Dir(), *sr.DataFile))
				if err == nil {
					sr.Size = info.Size()
				}
			}
		}
	}

	return SearchResults{Total: &totalCount, Items: results}, nil
}
// TODO: favorites? or maybe a more flexible albums/lists feature? what to call it... "scrapbooks" or "curations"?
// convertNamesToIDs resolves the classification names in params to
// their row IDs via the timeline's in-memory classification cache,
// appending them to params.classificationIDs. An empty name maps to
// the sentinel ID 0 (meaning "no classification").
func (tl *Timeline) convertNamesToIDs(params *ItemSearchParams) {
	tl.cachesMu.RLock()
	defer tl.cachesMu.RUnlock()
	for _, name := range params.Classification {
		var id uint64
		if name != "" {
			id = tl.classifications[name]
		}
		params.classificationIDs = append(params.classificationIDs, id)
	}
}
// prepareSearchQuery validates the search parameters and builds the
// full SQL query string together with its placeholder arguments.
// The WHERE clause is assembled as groups of OR'ed conditions that
// are AND'ed (or OR'ed, if params.OrFields) together. For vector
// (semantic/similarity) searches, the filtered query is wrapped in a
// CTE and results are ordered by embedding distance.
func (tl *Timeline) prepareSearchQuery(ctx context.Context, params ItemSearchParams) (string, []any, error) {
	if (params.Latitude == nil && params.Longitude != nil) || (params.Latitude != nil && params.Longitude == nil) {
		return "", nil, errors.New("location proximity search must include both lat and lon coordinates")
	}
	if params.Timestamp != nil && (params.Latitude != nil || params.Longitude != nil) {
		return "", nil, errors.New("time proximity and location proximity are mutually exclusive")
	}
	const maxDegreesOfSeparation = 2
	if params.Related > maxDegreesOfSeparation {
		// arbitrary, but I suspect it's a good idea to limit this for performance reasons
		return "", nil, errors.New("max degrees of separation for relationships is 2")
	}
	if params.WithTotal && params.OnlyTotal {
		return "", nil, errors.New("cannot query results with total and only total at the same time")
	}

	tl.convertNamesToIDs(&params)

	// When viewing a timeline, it can make more intuitive sense
	// to only show "root" nodes of item graphs at the "top" of
	// the search results, so that items which are only attachments,
	// for example, don't appear in the timeline multiple times
	// (once as an attachment, and once as "attached to" - it's even
	// more confusing if it's an attachment but looks like its own
	// item). So by default, we omit items from top level of search
	// results which appear at the end of a directed relation/edge.
	// Only if Astructured is true do we allow attachments and other
	// such items to appear as their own items in the top level.
	// Note: We also do astructured if specific row IDs are being queried
	// since we still want to select those specific rows even if
	// they are, say, attachments of a message. Querying specific
	// rows has priority over relationships.
	rootItemsOnly := !params.Astructured && len(params.RowID) == 0

	// if searching by embeddings, we first select the items that could possibly
	// be in the results by filtering with other search parameters, then we do
	// distance calculations over that subset of the data, which is theoretically
	// faster (see https://github.com/asg017/sqlite-vec/issues/196#issuecomment-2643543058)
	var q string
	vectorSearch := params.SemanticText != "" || params.SimilarTo > 0
	if vectorSearch {
		q = "WITH search_results AS (\n"
	}

	// honor inclusivity for bounding-box searches
	lt, gt := "<", ">"
	if params.Inclusive {
		lt, gt = "<=", ">="
	}

	// TODO: use strings.Builder (also in RecentConversations())
	q += fmt.Sprintf("\t\tSELECT %s, entities.id, entities.name, entities.picture_file, attributes.name, attributes.value, attributes.alt_value, embeddings.id", itemDBColumns)
	if params.OnlyTotal {
		// NOTE(review): this assignment discards q entirely, including the
		// "WITH search_results AS (" prefix when vectorSearch is set —
		// presumably OnlyTotal is never combined with a vector search;
		// confirm with callers.
		q = "\t\tSELECT count(DISTINCT items.id)"
	}
	if params.GeoJSON {
		// GeoJSON mode is intended to be more efficient; as such, only select coordinate data
		q = "\t\tSELECT items.id, items.latitude, items.longitude"
	}
	if params.WithTotal {
		q += ", count() over() AS total_count"
	}
	q += `
		FROM extended_items AS items
		LEFT JOIN attributes ON items.attribute_id = attributes.id
		LEFT JOIN entity_attributes ON attributes.id = entity_attributes.attribute_id
		LEFT JOIN entities ON entity_attributes.entity_id = entities.id
		LEFT JOIN embeddings ON embeddings.item_id = items.id`

	// TODO: It's possible that we could move all these (ToAttributeID, ToEntityID, rootItemsOnly) into RelationParams
	if len(params.ToAttributeID) > 0 || len(params.ToEntityID) > 0 {
		q += `
		JOIN relationships ON relationships.from_item_id = items.id`
	} else if rootItemsOnly || len(params.Relations) > 0 {
		q += `
		LEFT JOIN relationships ON relationships.to_item_id = items.id
		LEFT JOIN relations ON relations.id = relationships.relation_id`
	}

	// build the WHERE in terms of groups of OR's that are AND'ed together
	var args []any
	var clauseCount int

	// and starts a new group of OR'ed conditions; it writes WHERE before
	// the first group (detected via len(args)==0) and the joining operator
	// before subsequent groups, then strips the group again if ors()
	// produced no conditions.
	and := func(ors func()) {
		clauseCount = 0
		if len(args) == 0 {
			q += "\n\t\tWHERE"
		} else {
			if params.OrFields {
				q += " OR"
			} else {
				q += " AND"
			}
		}
		q += " ("
		ors()
		q += ")"
		// if the clause turned out to be empty,
		// this is a poor-man's way of undoing it
		q = strings.TrimSuffix(q, " OR ()")
		q = strings.TrimSuffix(q, " AND ()")
		q = strings.TrimSuffix(q, "\n\t\tWHERE ()")
	}
	// or appends one condition (and its argument) to the current group
	or := func(clause string, val any) {
		if clauseCount > 0 {
			q += " OR "
		}
		q += clause
		args = append(args, val)
		clauseCount++
	}

	// Note the pattern below: a non-nil but empty slice means
	// "match NULL in this column", whereas each element ORs an
	// equality condition.
	and(func() {
		if params.RowID != nil && len(params.RowID) == 0 {
			or("items.id IS ?", nil)
		}
		for _, v := range params.RowID {
			or("items.id=?", v)
		}
	})
	and(func() {
		if params.DataSourceName != nil && len(params.DataSourceName) == 0 {
			or("data_source_name IS ?", nil)
		}
		for _, v := range params.DataSourceName {
			or("data_source_name=?", v)
		}
	})
	and(func() {
		if params.JobID != nil && len(params.JobID) == 0 {
			or("items.job_id IS ?", nil)
		}
		for _, v := range params.JobID {
			or("items.job_id=?", v)
		}
	})
	and(func() {
		if params.AttributeID != nil && len(params.AttributeID) == 0 {
			or("items.attribute_id IS ?", nil)
		}
		for _, v := range params.AttributeID {
			or("items.attribute_id=?", v)
		}
	})
	and(func() {
		if params.EntityID != nil && len(params.EntityID) == 0 {
			or("entities.id IS ?", nil)
		}
		for _, v := range params.EntityID {
			or("entities.id=?", v)
		}
	})
	and(func() {
		for _, v := range params.classificationIDs {
			if v == 0 {
				or("items.classification_id IS ?", nil)
			} else {
				or("items.classification_id=?", v)
			}
		}
	})
	and(func() {
		if params.OriginalID != nil && len(params.OriginalID) == 0 {
			or("items.original_id IS ?", nil)
		}
		for _, v := range params.OriginalID {
			or("items.original_id=?", v)
		}
	})
	and(func() {
		if params.DataType != nil && len(params.DataType) == 0 {
			or("items.data_type IS ?", nil)
		}
		for _, v := range params.DataType {
			if strings.HasSuffix(v, "/*") {
				// useful for searching for all "video/*" files, for example
				or("items.data_type LIKE ? || '%'", v[:len(v)-1])
			} else {
				or("items.data_type=?", v)
			}
		}
	})
	and(func() {
		if params.DataText != nil && len(params.DataText) == 0 {
			or("items.data_text IS ?", nil)
		}
		for _, v := range params.DataText {
			or("items.data_text LIKE '%' || ? || '%'", v)
		}
	})
	and(func() {
		if params.DataFile != nil && len(params.DataFile) == 0 {
			or("items.data_file IS ?", nil)
		}
		for _, v := range params.DataFile {
			or("items.data_file=?", v)
		}
	})
	and(func() {
		// TODO: these can probably be in the same 'AND' group like this, right?
		for _, v := range params.ToAttributeID {
			or("relationships.to_attribute_id=?", v)
		}
		for _, v := range params.ToEntityID {
			or("relationships.to_entity_id=?", v)
		}
	})

	// TODO: Use BETWEEN maybe
	if params.StartTimestamp != nil {
		and(func() {
			or("items.timestamp"+gt+" ?", params.StartTimestamp.UTC().UnixMilli())
			if !params.StrictStartTimestamp {
				// if not strict, allow items to spill into the window even if the started before it
				or("items.timespan "+gt+" ?", params.StartTimestamp.UTC().UnixMilli())
			}
		})
	}
	if params.EndTimestamp != nil {
		and(func() {
			or("items.timestamp "+lt+" ?", params.EndTimestamp.UTC().UnixMilli())
		})
		if params.StrictEndTimestamp {
			// if strict, items' timespan must end before the EndTimestamp (item can't merely start before it)
			and(func() {
				or("items.timespan IS NULL OR items.timespan "+lt+" ?", params.EndTimestamp.UTC().UnixMilli())
			})
		}
	}

	if params.NoLocation {
		// require every location column to be NULL
		and(func() {
			or("items.latitude IS ?", nil)
		})
		and(func() {
			or("items.longitude IS ?", nil)
		})
		and(func() {
			or("items.altitude IS ?", nil)
		})
		and(func() {
			or("items.coordinate_system IS ?", nil)
		})
	} else {
		if params.MinLatitude != nil {
			and(func() {
				or("items.latitude "+gt+" ?", params.MinLatitude)
			})
		}
		if params.MaxLatitude != nil {
			and(func() {
				or("items.latitude "+lt+" ?", params.MaxLatitude)
			})
		}
		if params.MinLongitude != nil {
			and(func() {
				or("items.longitude "+gt+" ?", params.MinLongitude)
			})
		}
		if params.MaxLongitude != nil {
			and(func() {
				or("items.longitude "+lt+" ?", params.MaxLongitude)
			})
		}
	}

	// skip deleted items unless we are explicitly supposed to include them
	// (NOTE: we include them if the specific row IDs are requested)
	if !params.Deleted && len(params.RowID) == 0 {
		and(func() {
			or("items.deleted IS ?", nil)
		})
	}

	// always skip hidden items
	and(func() {
		or("items.hidden IS ?", nil)
	})

	// skip every so many items if sampling is enabled
	if params.Sample > 1 {
		and(func() {
			or("items.id % ? = 0", params.Sample)
		})
	}

	if rootItemsOnly {
		// select only items which are not dependent on other items, i.e., items
		// that are not at the end of a directed relation from another item
		and(func() {
			or("relations.directed != ?", 1)
			or("relations.subordinating = ?", 0)
			or("relationships.to_item_id IS ?", nil)
		})
	}

	// factor in relationships
	for _, relParam := range params.Relations {
		op := "IS"
		if relParam.Not {
			op = "IS NOT"
		}
		and(func() {
			if relParam.RelationLabel != "" {
				or("relations.label "+op+" ?", relParam.RelationLabel)
			}
		})
	}

	if !params.OnlyTotal {
		q += "\n\t\tGROUP BY items.id"
	}

	// don't put order in the temporary table, since we'll be ordering by vector distance
	if !vectorSearch {
		if params.Sort != SortNone {
			q += "\n\t\tORDER BY "

			// TODO: not sure if this is how autocomplete will work or be useful, but basically
			// this sorts by data text so that if it's a prefix, it's weighed higher in the
			// sort, and if it's a suffix then put it at the end; i.e. favor term at beginning
			// of words instead of end... I think that's what this does, at least
			if params.PreferPrefix && len(params.DataText) == 1 {
				q += `CASE
					WHEN items.data_text LIKE ? || '%' THEN 1
					WHEN items.data_text LIKE '%' || ? THEN 3
					ELSE 2
				END, `
				args = append(args, params.DataText[0], params.DataText[0])
			}

			// sort
			sortDir := strings.ToUpper(string(params.Sort))
			if sortDir == "" {
				// smart sort: sort ASC by default if only "minimum" side of bounding boxes are specified
				if (params.StartTimestamp != nil || params.MinLatitude != nil || params.MinLongitude != nil) &&
					params.EndTimestamp == nil && params.MaxLatitude == nil && params.MaxLongitude == nil {
					sortDir = string(SortAsc)
				} else {
					sortDir = string(SortDesc)
				}
			}
			if sortDir != string(SortAsc) && sortDir != string(SortDesc) {
				return "", nil, fmt.Errorf("invalid sort direction: %s", sortDir)
			}

			// location proximity search is only an approximation for simplicity
			// see https://stackoverflow.com/a/39298241/1048862
			switch {
			case params.Latitude != nil && params.Longitude != nil:
				// nearest to location; account for shortened distances at poles
				// math.Cos() takes radians, hence the conversion to radians inside the cosine
				cosLat2 := math.Pow(math.Cos(*params.Latitude*math.Pi/180.0), 2) //nolint:mnd
				sortDir = string(SortAsc)                                        // always sort ascending for nearest
				q += "((?-items.latitude) * (?-items.latitude)) + ((?-items.longitude) * (?-items.longitude) * ?), items.id " + sortDir
				args = append(args, params.Latitude, params.Latitude, params.Longitude, params.Longitude, cosLat2)
			case params.Timestamp != nil:
				// nearest to timestamp
				sortDir = string(SortAsc) // always sort ascending for nearest
				q += "abs(?-items.timestamp), items.id " + sortDir
				args = append(args, params.Timestamp.UnixMilli())
			case params.OrderBy == "stored":
				q += "items.stored " + sortDir
			default:
				// generic sort, which is timestamp and row ID
				q += fmt.Sprintf("items.timestamp %s, items.id %s", sortDir, sortDir)
			}
		}

		// limit
		if !params.OnlyTotal {
			if params.Limit == 0 {
				params.Limit = 1000
			}
			if params.Limit > 0 {
				q += "\n\t\tLIMIT ?"
				args = append(args, params.Limit)
			}
			if params.Offset > 0 {
				q += "\n\t\tOFFSET ?"
				args = append(args, params.Offset)
			}
		}
	}

	if vectorSearch {
		var targetVectorClause string
		if params.SimilarTo > 0 {
			targetVectorClause = "(SELECT embedding FROM embeddings JOIN items ON embeddings.item_id = items.id WHERE items.id=? LIMIT 1)"
			args = append(args, params.SimilarTo)
		} else if params.SemanticText != "" {
			// search relative to an arbitrary input (TODO: support image inputs too) - python server must be online
			if !pythonServerReady(ctx, false) {
				return "", nil, errors.New("python server not ready")
			}
			embedding, err := generateEmbedding(ctx, "text/plain", []byte(params.SemanticText), nil)
			if err != nil {
				return "", nil, err
			}
			targetVectorClause = "?"
			args = append(args, string(embedding))
		}

		// close the CTE and order the filtered subset by vector distance
		q += fmt.Sprintf(`
	)
	SELECT
		search_results.*,
		vec_distance_l2(%s, embeddings.embedding) AS distance
	FROM search_results
	JOIN embeddings ON embeddings.item_id = search_results.id
	ORDER BY distance`, targetVectorClause)

		if params.Limit == 0 {
			params.Limit = 100
		}
		if params.Limit > 0 {
			q += "\nLIMIT ?"
			args = append(args, params.Limit)
		}
		if params.Offset > 0 {
			q += "\nOFFSET ?"
			args = append(args, params.Offset)
		}
	}

	return q, args, nil
}
// expandRelationships fills sr.Related and then recursively expands
// the related items it found, up to the given degrees of separation
// (0 means expand nothing).
func (tl *Timeline) expandRelationships(ctx context.Context, tx *sql.Tx, degrees int, sr *SearchResult) error {
	if degrees <= 0 {
		return nil
	}

	// Expand this result fully before recursing: we must not recurse
	// while a rows iteration is open, or we'd step on other select
	// queries on the same connection.
	if err := tl.expandRelationshipSingle(ctx, tx, sr); err != nil {
		return err
	}

	// now walk one degree deeper on each side of every relationship
	for _, edge := range sr.Related {
		for _, child := range []*SearchResult{edge.FromItem, edge.ToItem} {
			if child == nil {
				continue
			}
			if err := tl.expandRelationships(ctx, tx, degrees-1, child); err != nil {
				return err
			}
		}
	}
	return nil
}
// expandRelationshipSingle loads the relationships in which sr's item
// participates (as either end), appending them to sr.Related. Related
// items other than sr itself are loaded as full SearchResults; the
// entities and attributes on both ends are left-joined and may be nil.
// It does not recurse; see expandRelationships for that.
func (tl *Timeline) expandRelationshipSingle(ctx context.Context, tx *sql.Tx, sr *SearchResult) error {
	// TODO: limit here is arbitrary I think... ho hum
	rows, err := tx.QueryContext(ctx, `
		SELECT
			relationships.id,
			relations.directed,
			relations.label,
			relationships.value,
			relationships.start,
			relationships.end,
			relationships.metadata,
			relationships.from_item_id,
			relationships.to_item_id,
			from_entity.id,
			from_entity.name,
			from_entity.picture_file,
			from_attr.name,
			from_attr.value,
			from_attr.alt_value,
			to_entity.id,
			to_entity.name,
			to_entity.picture_file,
			to_attr.name,
			to_attr.value,
			to_attr.alt_value
		FROM relationships
		JOIN items ON items.id=?
		JOIN relations ON relations.id=relationships.relation_id
		LEFT JOIN attributes AS from_attr ON from_attr.id=relationships.from_attribute_id
		LEFT JOIN attributes AS to_attr ON to_attr.id=relationships.to_attribute_id
		LEFT JOIN entity_attributes AS from_ea ON from_ea.attribute_id = from_attr.id
		LEFT JOIN entity_attributes AS to_ea ON to_ea.attribute_id = to_attr.id
		LEFT JOIN entities AS from_entity ON from_entity.id = from_ea.entity_id
		LEFT JOIN entities AS to_entity ON to_entity.id = to_ea.entity_id
		WHERE (relationships.from_item_id=? OR relationships.to_item_id=?) AND items.hidden IS NULL
		GROUP BY relationships.id
		LIMIT 10`, sr.ItemRow.ID, sr.ItemRow.ID, sr.ItemRow.ID)
	if err != nil {
		return fmt.Errorf("querying db for relationships: %w", err)
	}
	defer rows.Close()

	for rows.Next() {
		var rel Related
		var fromItemID, toItemID *uint64
		var relStart, relEnd *int64
		var fromEntity, toEntity relatedEntity
		var relMeta *string
		err := rows.Scan(&rel.RelationshipID, &rel.Directed, &rel.Label, &rel.Value, &relStart, &relEnd, &relMeta, &fromItemID, &toItemID,
			&fromEntity.ID, &fromEntity.Name, &fromEntity.Picture, &fromEntity.Attribute.Name, &fromEntity.Attribute.Value, &fromEntity.Attribute.AltValue,
			&toEntity.ID, &toEntity.Name, &toEntity.Picture, &toEntity.Attribute.Name, &toEntity.Attribute.Value, &toEntity.Attribute.AltValue)
		if err != nil {
			return err
		}
		// only attach entities that actually joined (non-NULL id)
		if fromEntity.ID != nil {
			rel.FromEntity = &fromEntity
		}
		if toEntity.ID != nil {
			rel.ToEntity = &toEntity
		}
		// relationship start/end are stored as unix seconds
		if relStart != nil {
			startTime := time.Unix(*relStart, 0)
			rel.Start = &startTime
		}
		if relEnd != nil {
			endTime := time.Unix(*relEnd, 0)
			rel.End = &endTime
		}
		if relMeta != nil {
			rel.Metadata = json.RawMessage(*relMeta)
		}

		// expand items, we have to do this until our code can support
		// loading an item from among other columns as well; expand
		// only if the item is distinct from the parent/starting item
		if fromItemID != nil && *fromItemID != sr.ID {
			fromRel, err := tl.loadRelatedItem(ctx, tx, *fromItemID)
			if err != nil {
				return fmt.Errorf("loading from_item: %w", err)
			}
			rel.FromItem = fromRel
		}
		if toItemID != nil && *toItemID != sr.ID {
			toRel, err := tl.loadRelatedItem(ctx, tx, *toItemID)
			if err != nil {
				return fmt.Errorf("loading to_item: %w", err)
			}
			rel.ToItem = toRel
		}

		sr.Related = append(sr.Related, rel)
	}
	if err = rows.Err(); err != nil {
		return fmt.Errorf("scanning related item rows: %w", err)
	}
	return nil
}
// loadRelatedItem loads the item with the given row ID as a
// SearchResult, including its owning entity (if its attribute is
// linked to one). It is used when expanding relationships.
func (tl *Timeline) loadRelatedItem(ctx context.Context, tx *sql.Tx, itemRowID uint64) (*SearchResult, error) {
	ir, err := tl.loadItemRow(ctx, tx, itemRowID, 0, nil, nil, nil, false)
	if err != nil {
		return nil, fmt.Errorf("loading related item row: %w", err)
	}

	// TODO: dunno if this safe (QueryRow during a rows.Scan, by the caller of this function)
	// TODO: if it would help increase performance, we could probably cache information about persons and person_identities...
	// TODO: maybe there's a view we could use for this instead?
	var p relatedEntity
	if ir.AttributeID != nil {
		err = tx.QueryRowContext(ctx, `
			SELECT entities.id, entities.name, entities.picture_file, attributes.name, attributes.value, attributes.alt_value, attributes.longitude, attributes.latitude, attributes.altitude
			FROM attributes, entities
			JOIN entity_attributes
				ON entity_attributes.entity_id = entities.id
				AND entity_attributes.attribute_id = attributes.id
			WHERE attributes.id=?`, ir.AttributeID).Scan(&p.ID, &p.Name, &p.Picture, &p.Attribute.Name, &p.Attribute.Value, &p.Attribute.AltValue, &p.Attribute.Longitude, &p.Attribute.Latitude, &p.Attribute.Altitude)
		if err != nil {
			return nil, fmt.Errorf("loading related entity: %w", err)
		}
	}

	sr := &SearchResult{RepoID: tl.id.String(), ItemRow: ir}
	// only attach the entity if one was actually found
	if p.ID != nil {
		sr.Entity = &p
	}
	return sr, nil
}
// SearchResult is a single item in search results: the raw item row
// combined with its repo, owning entity, related items, and optional
// size and ML scoring information.
type SearchResult struct {
	RepoID string `json:"repo_id,omitempty"` // ID of the timeline repo the item belongs to
	ItemRow
	HasEmbedding bool           `json:"has_embedding,omitempty"` // whether any embedding row exists for this item
	Entity       *relatedEntity `json:"entity,omitempty"`        // owning entity; nil if none joined
	Related      []Related      `json:"related,omitempty"`       // populated by expandRelationships, if requested
	Size         int64          `json:"size,omitempty"`          // content size; only set when WithSize was requested

	// from ML model
	Distance float64 `json:"distance,omitempty"` // embedding distance for semantic/similarity searches
	Score    float64 `json:"score,omitempty"`    // classifier relevance score for semantic text searches
}
// RelationParams describes a search using item relations.
type RelationParams struct {
	Not           bool   `json:"not,omitempty"`            // if true, only match items that do NOT have this relation
	RelationLabel string `json:"relation_label,omitempty"` // label of the relation to match (e.g. as stored in relations.label)
}
// relatedEntity is a subset of Entity (so we need to make sure
// the JSON field names are the same), but with pointer field
// types because it is left-joined in queries which means they
// can be null.
type relatedEntity struct {
	ID        *uint64           `json:"id"`
	Name      *string           `json:"name,omitempty"`
	Picture   *string           `json:"picture,omitempty"`   // picture_file column
	Attribute nullableAttribute `json:"attribute,omitempty"` // TODO: experimental
}
// nullableAttribute is like Attribute but with nullable fields
// so that it can be I/O for the database
// (all fields are pointers so NULL columns scan cleanly).
type nullableAttribute struct {
	ID       *uint64  `json:"id,omitempty"`
	Name     *string  `json:"name,omitempty"`
	Value    *string  `json:"value,omitempty"`
	AltValue *string  `json:"alt_value,omitempty"`
	Latitude *float64 `json:"latitude,omitempty"`
	Longitude *float64 `json:"longitude,omitempty"`
	Altitude  *float64 `json:"altitude,omitempty"`
}
// attribute converts the nullable DB representation into a plain
// Attribute: NULL scalar fields become their zero values, while the
// location pointer fields are carried over unchanged.
func (na nullableAttribute) attribute() Attribute {
	a := Attribute{
		Latitude:  na.Latitude,
		Longitude: na.Longitude,
		Altitude:  na.Altitude,
	}
	if na.ID != nil {
		a.ID = *na.ID
	}
	if na.Name != nil {
		a.Name = *na.Name
	}
	if na.Value != nil {
		a.Value = *na.Value
	}
	if na.AltValue != nil {
		a.AltValue = *na.AltValue
	}
	return a
}
// Related describes one relationship edge attached to a SearchResult:
// the relation itself plus the items and/or entities on either end.
// Ends that are NULL in the DB (left-joined) remain nil.
type Related struct {
	Relation
	RelationshipID int             `json:"relationship_id"`
	Value          *string         `json:"value,omitempty"`
	FromItem       *SearchResult   `json:"from_item,omitempty"`   // set only if distinct from the item being expanded
	ToItem         *SearchResult   `json:"to_item,omitempty"`     // set only if distinct from the item being expanded
	FromEntity     *relatedEntity  `json:"from_entity,omitempty"`
	ToEntity       *relatedEntity  `json:"to_entity,omitempty"`
	Start          *time.Time      `json:"start,omitempty"` // relationship start time (stored as unix seconds)
	End            *time.Time      `json:"end,omitempty"`   // relationship end time (stored as unix seconds)
	Metadata       json.RawMessage `json:"metadata,omitempty"`
}
// SortDir is the direction in which search results are ordered.
type SortDir string

const (
	SortNone SortDir = "none" // special value that means to not include a ORDER BY clause
	SortAsc  SortDir = "ASC"
	SortDesc SortDir = "DESC"
)