Fix gallery photo prev/next for photos with time zone When jumping between photos in the gallery with the prev/next button, the buttons were incorrectly enabled/disabled/wouldn't work even if enabled for me. This was caused by the photos having a time zone UTC+1 (time_offset = 3600). The js frontend uses formatted timestamps including the time zone (peekFromItem.timestamp in galleryFilterParams). These are converted to a unix timestamp via UTC().UnixMilli(), so the time zone is already taken care for and adding time_offset is not necessary. For the filtering in the gallery grid itself, a unix timestamp is generated client-side (something like &start=1762124400&end=1762210800) taking the system time zone into account, so this is correct without any extra logic as well.
1058 lines
36 KiB
Go
1058 lines
36 KiB
Go
/*
|
|
Timelinize
|
|
Copyright (c) 2013 Matthew Holt
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU Affero General Public License as published
|
|
by the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU Affero General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Affero General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package timeline
|
|
|
|
import (
|
|
"context"
|
|
"database/sql"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"math"
|
|
"os"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
)
|
|
|
|
// ItemSearchParams describes a search for items.
|
|
//
|
|
// Fields with a slice/array type typically mean "any of these"
|
|
// (their elements are OR'ed together).
|
|
type ItemSearchParams struct {
|
|
// The UUID of the open timeline to search.
|
|
Repo string `json:"repo,omitempty"`
|
|
|
|
// ML searches -- currently, only one of these can be set at a time
|
|
// TODO: add a QueryImage field for image search
|
|
SemanticText string `json:"semantic_text,omitempty"` // similar to the centroid of the vector formed from this phrase
|
|
SimilarTo int64 `json:"similar_to,omitempty"` // similar to the centroid of the vector of the specified item (TODO: should it just be embedding ID?)
|
|
|
|
RowID []int64 `json:"row_id,omitempty"`
|
|
// AccountID []int `json:"account_id,omitempty"` // TODO: restore this, if useful
|
|
DataSourceName []string `json:"data_source,omitempty"`
|
|
JobID []int64 `json:"job_id,omitempty"`
|
|
AttributeID []int64 `json:"attribute_id,omitempty"`
|
|
EntityID []uint64 `json:"entity_id,omitempty"`
|
|
Classification []string `json:"classification,omitempty"`
|
|
OriginalID []string `jsson:"original_id,omitempty"`
|
|
DataType []string `json:"data_type,omitempty"`
|
|
DataText []string `json:"data_text,omitempty"`
|
|
DataFile []string `json:"data_file,omitempty"`
|
|
|
|
// TODO: how do we effectively search metadata? maybe virtual columns? https://antonz.org/json-virtual-columns/
|
|
// TODO: Well, this query was fast: `SELECT * FROM items WHERE items.metadata->>'$.Make' = 'Google' LIMIT 10`
|
|
// Metadata map[string][]any `json:"metadata,omitempty"`
|
|
|
|
// TODO: a way to get items with EntityID, but also any relation FromEntityID or ToEntityID...
|
|
ToAttributeID []int64 `json:"to_attribute_id,omitempty"`
|
|
ToEntityID []int64 `json:"to_entity_id,omitempty"`
|
|
|
|
// filter by relationship to other items
|
|
// TODO: We can probably wrap ToAttributeID, ToEntityID, and maybe Astructured into this?
|
|
Relations []RelationParams `json:"relations,omitempty"`
|
|
|
|
// bounding box searches
|
|
StartTimestamp *time.Time `json:"start_timestamp,omitempty"`
|
|
EndTimestamp *time.Time `json:"end_timestamp,omitempty"`
|
|
MinLatitude *float64 `json:"min_latitude,omitempty"`
|
|
MaxLatitude *float64 `json:"max_latitude,omitempty"`
|
|
MinLongitude *float64 `json:"min_longitude,omitempty"`
|
|
MaxLongitude *float64 `json:"max_longitude,omitempty"`
|
|
Inclusive bool `json:"inclusive,omitempty"` // if true, bounding box is <= and >= instead of < and >
|
|
StrictStartTimestamp bool `json:"strict_start_timestamp,omitempty"` // if true, item timestamp must be after start_timestamp (even if item timespan is after)
|
|
StrictEndTimestamp bool `json:"strict_end_timestamp,omitempty"` // if true, item timespan must be before end_timestamp (even if item timestamp is before)
|
|
|
|
NoLocation bool `json:"no_location,omitempty"` // if true, require location columns to be NULL regardless of max/min lat/lon
|
|
|
|
// proximity searches (location and time are mutually exclusive)
|
|
Timestamp *time.Time `json:"timestamp,omitempty"`
|
|
Latitude *float64 `json:"latitude,omitempty"`
|
|
Longitude *float64 `json:"longitude,omitempty"`
|
|
|
|
// If true, OR different fields instead of AND
|
|
OrFields bool `json:"or_fields,omitempty"`
|
|
|
|
// How to order results. Default: usually timestamp,
|
|
// but can be "smart" depending on search parameters.
|
|
// This can also be "stored" to order by date added
|
|
// to DB.
|
|
OrderBy string `json:"order_by,omitempty"`
|
|
|
|
Sort SortDir `json:"sort,omitempty"` // ignored if doing proximity search (unless it's SortNone, which breaks proximity searches)
|
|
Limit int `json:"limit,omitempty"` // number of rows to include (-1 for no limit); default 1000
|
|
Offset int `json:"offset,omitempty"` // number of rows to skip (can be slow if very large)
|
|
Sample int `json:"sample,omitempty"` // retrieve every Nth row
|
|
|
|
// TODO: Matt's note: Pagination can be done more efficiently than
|
|
// Offset or WithTotal (below) by using little tricks. For example,
|
|
// if the exact count isn't required, set a large limit N (but still
|
|
// a sensible order of magnitude) and report simply "N+" items; or
|
|
// instead of offset, use WHERE to exclude items we've already paged
|
|
// through using an indexed column (for example, if ordering by
|
|
// timestamp, use WHERE to exclude items with timestamps that
|
|
// would have appeared on previous pages; i.e. keep a cursor of
|
|
// last item's timestamp from prev. page)
|
|
|
|
// query related items and people recursively to this many degrees
|
|
// (e.g. 0 is only the item, 1 adds direct relationships, etc...)
|
|
Related int `json:"related,omitempty"`
|
|
|
|
// By default (when this is false), search results will prioritize
|
|
// the root nodes of item graphs, i.e. items that are not on the
|
|
// end of a directed relation/edge, since this makes more sense
|
|
// when viewing the timeline. If this is true, the ends of
|
|
// directed relations are ignored, and related items may appear
|
|
// in the "top level" of search results as well as nested in
|
|
// "related".
|
|
// TODO: Rename? Like only_root_items (that would also invert the value...)
|
|
Astructured bool `json:"flat,omitempty"`
|
|
|
|
// TODO: Experimental, for autocomplete... not sure if useful.
|
|
PreferPrefix bool `json:"prefer_prefix,omitempty"`
|
|
|
|
// In addition to results, return a total count that ignores offset and limit.
|
|
// This can be very slow for broad queries on large DBs.
|
|
// TODO: This can be very slow for large DBs on broad queries... maybe cache results? Or find a way to paginate without this (see comment above)
|
|
WithTotal bool `json:"with_total,omitempty"`
|
|
|
|
// Only return a total count that ignores offset and limit.
|
|
OnlyTotal bool `json:"only_total,omitempty"`
|
|
|
|
// If true, the only fields that will be selected from the DB are
|
|
// latitude and longitude, and the search results will not be filled
|
|
// with items; instead, the results will contain a GeoJSON document
|
|
// with the coordinates, which can then be used directly by mapping
|
|
// software to render point or heatmap information. This approach is
|
|
// an order of magnitude faster (10x in my testing) for large data
|
|
// sets (1M+ rows) to render a heatmap, where only coordinates are
|
|
// needed. It avoids scanning the entire item row and allocating
|
|
// multiple structs and performing various other computations.
|
|
// Instead it efficiently builds a string containing only the
|
|
// coordinate data. Non-spatial data will be excluded.
|
|
GeoJSON bool `json:"geojson,omitempty"`
|
|
ObfuscatedGeoJSON []ObfuscatedLocation `json:"-"` // TODO: this is kind of a hack since it's the only data we encode before returning (for efficiency)
|
|
|
|
// Include the size of the item content with the results.
|
|
// For data files, this involves calling stat() on the file.
|
|
WithSize bool `json:"with_size,omitempty"`
|
|
|
|
// If true, include deleted items (that haven't been erased yet).
|
|
Deleted bool `json:"deleted,omitempty"`
|
|
|
|
// stores the converted names to row IDs
|
|
classificationIDs []uint64
|
|
}
|
|
|
|
type SearchResults struct {
|
|
// If enabled, the total count of items matching the search query
|
|
// without regard for limit and offset.
|
|
Total *int `json:"total,omitempty"`
|
|
|
|
// The items of the search result.
|
|
Items []*SearchResult `json:"items,omitempty"`
|
|
|
|
// The search results in GeoJSON mode. A GeoJSON document
|
|
// useful for rendering heatmaps or clusters.
|
|
GeoJSON string `json:"geojson,omitempty"`
|
|
}
|
|
|
|
func (tl *Timeline) Search(ctx context.Context, params ItemSearchParams) (SearchResults, error) {
|
|
// setting a natural language input has big implications, so make sure it's not just whitespace by accident
|
|
params.SemanticText = strings.TrimSpace(params.SemanticText)
|
|
|
|
// get the DB query string and associated arguments
|
|
q, args, err := tl.prepareSearchQuery(ctx, params)
|
|
if err != nil {
|
|
return SearchResults{}, err
|
|
}
|
|
|
|
// open DB transaction to hopefully make it more efficient; may involve many queries
|
|
// (TODO: we don't currently commit this tx, because we didn't make changes - that's OK, right?)
|
|
tx, err := tl.db.ReadPool.BeginTx(ctx, nil)
|
|
if err != nil {
|
|
return SearchResults{}, err
|
|
}
|
|
defer tx.Rollback()
|
|
|
|
// in count-only mode, there's only a single row with a single field
|
|
if params.OnlyTotal {
|
|
var count int
|
|
err := tx.QueryRowContext(ctx, q, args...).Scan(&count)
|
|
if err != nil {
|
|
return SearchResults{}, err
|
|
}
|
|
return SearchResults{Total: &count}, nil
|
|
}
|
|
|
|
// run query and scan results
|
|
rows, err := tx.QueryContext(ctx, q, args...)
|
|
if err != nil {
|
|
return SearchResults{}, fmt.Errorf("querying db for items: %w", err)
|
|
}
|
|
|
|
// in GeoJSON mode, skip the expensive full-row scans etc; instead, efficiently
|
|
// build a GeoJSON string directly -- it's very specific so we can do even better
|
|
// than json.Marshal by using a strings builder.
|
|
var sb strings.Builder
|
|
if params.GeoJSON {
|
|
// in testing, I found that Mapbox renders a single huge MultiPoint feature
|
|
// way faster than many Point features
|
|
sb.WriteString(`{"type":"Feature","geometry":{"type":"MultiPoint","coordinates":[`)
|
|
}
|
|
|
|
// for JSON serialization, always initialize so "0 results" is at least an empty list and not null
|
|
results := make([]*SearchResult, 0)
|
|
var count, totalCount int
|
|
for rows.Next() { // fun fact, the first call to Next() is actually what runs the query
|
|
// in GeoJSON mode, skip the usual full-row scan, and instead focus
|
|
// on the coordinates which is much more efficient
|
|
if params.GeoJSON {
|
|
// read the coordinate data
|
|
var rowID uint64
|
|
var lat, lon *float64
|
|
targets := []any{&rowID, &lat, &lon}
|
|
if params.WithTotal {
|
|
targets = append(targets, &totalCount)
|
|
}
|
|
err := rows.Scan(targets...)
|
|
if err != nil {
|
|
defer rows.Close()
|
|
return SearchResults{}, err
|
|
}
|
|
|
|
// ensure it's valid, then obfuscate if enabled, then format as strings (waaaaay faster than reflection)
|
|
if lat == nil || lon == nil {
|
|
continue
|
|
}
|
|
for _, locob := range params.ObfuscatedGeoJSON {
|
|
if locob.Contains(*lat, *lon) {
|
|
newLat, newLon := locob.Obfuscate(*lat, *lon, rowID)
|
|
lat = &newLat
|
|
lon = &newLon
|
|
}
|
|
}
|
|
latStr := strconv.FormatFloat(*lat, 'f', -1, 64)
|
|
lonStr := strconv.FormatFloat(*lon, 'f', -1, 64)
|
|
|
|
// write coordinate to the GeoJSON document
|
|
if count > 0 {
|
|
sb.WriteRune(',')
|
|
}
|
|
sb.WriteRune('[')
|
|
sb.WriteString(lonStr)
|
|
sb.WriteRune(',')
|
|
sb.WriteString(latStr)
|
|
sb.WriteRune(']')
|
|
|
|
// continue with next row; skip all the expensive stuff
|
|
count++
|
|
continue
|
|
}
|
|
|
|
var re relatedEntity // entity is left-joined, so could be null
|
|
var embeddingDistance float64
|
|
var embeddingID *uint64 // just to know whether there are any embeddings for the item
|
|
extraTargets := []any{&re.ID, &re.Name, &re.Picture, &re.Attribute.Name, &re.Attribute.Value, &re.Attribute.AltValue, &embeddingID}
|
|
if params.WithTotal {
|
|
extraTargets = append(extraTargets, &totalCount)
|
|
}
|
|
if params.SemanticText != "" || params.SimilarTo > 0 {
|
|
extraTargets = append(extraTargets, &embeddingDistance)
|
|
}
|
|
itemRow, err := scanItemRow(rows, extraTargets)
|
|
if err != nil {
|
|
rows.Close()
|
|
return SearchResults{}, err
|
|
}
|
|
sr := &SearchResult{RepoID: tl.id.String(), ItemRow: itemRow, HasEmbedding: embeddingID != nil, Distance: embeddingDistance}
|
|
if re.ID != nil {
|
|
sr.Entity = &re
|
|
}
|
|
results = append(results, sr)
|
|
}
|
|
rows.Close()
|
|
if err = rows.Err(); err != nil {
|
|
return SearchResults{}, fmt.Errorf("iterating item rows: %w", err)
|
|
}
|
|
|
|
// in GeoJSON mode, we can be done early by wrapping up our document
|
|
if params.GeoJSON {
|
|
sb.WriteString(`]}}`)
|
|
return SearchResults{GeoJSON: sb.String(), Total: &totalCount}, nil
|
|
}
|
|
|
|
// TODO: this needs tuning
|
|
if params.SemanticText != "" {
|
|
// filter results for relevance by passing them through the classifier...
|
|
// this is kind of a hack, but it's a well-known difficult problem apparently,
|
|
// for any KNN search, to only show relevant results
|
|
itemFiles := make(map[uint64]string)
|
|
for _, result := range results {
|
|
if result.DataFile == nil || result.DataType == nil {
|
|
continue
|
|
}
|
|
if !strings.HasPrefix(*result.DataType, "image/") {
|
|
continue
|
|
}
|
|
itemFiles[result.ID] = tl.FullPath(*result.DataFile)
|
|
}
|
|
|
|
scores, err := classify(ctx, itemFiles, []string{params.SemanticText})
|
|
if err != nil {
|
|
return SearchResults{}, fmt.Errorf("classifying results: %w", err)
|
|
}
|
|
for _, sr := range results {
|
|
if score, ok := scores[sr.ID]; ok {
|
|
sr.Score = score
|
|
}
|
|
}
|
|
// sort by score descending, then prefer items that weren't scored over scored items with a low score
|
|
sort.Slice(results, func(i, j int) bool {
|
|
_, iWasScored := scores[results[i].ID]
|
|
_, jWasScored := scores[results[j].ID]
|
|
return results[i].Score > results[j].Score || (jWasScored && !iWasScored)
|
|
})
|
|
// chop off results that are irrelevant, starting with the first result that was scored and has a score near zero
|
|
for i, sr := range results {
|
|
_, wasScored := scores[sr.ID]
|
|
// TODO: Be smarter: find the range the top/best results score, and only cull results below a significant threshold difference
|
|
if sr.Score <= 0.002 && wasScored {
|
|
results = results[:i]
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
// traverse relationships
|
|
for _, sr := range results {
|
|
err = tl.expandRelationships(ctx, tx, params.Related, sr)
|
|
if err != nil {
|
|
return SearchResults{}, err
|
|
}
|
|
}
|
|
|
|
// include size information, if requested
|
|
if params.WithSize {
|
|
for _, sr := range results {
|
|
if sr.DataText != nil {
|
|
sr.Size = int64(len(*sr.DataText))
|
|
}
|
|
if sr.DataFile != nil {
|
|
info, err := os.Stat(filepath.Join(tl.Dir(), *sr.DataFile))
|
|
if err == nil {
|
|
sr.Size = info.Size()
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
return SearchResults{Total: &totalCount, Items: results}, nil
|
|
}
|
|
|
|
// TODO: favorites? or maybe a more flexible albums/lists feature? what to call it... "scrapbooks" or "curations"?
|
|
|
|
func (tl *Timeline) convertNamesToIDs(params *ItemSearchParams) {
|
|
tl.cachesMu.RLock()
|
|
for _, className := range params.Classification {
|
|
if className == "" {
|
|
params.classificationIDs = append(params.classificationIDs, 0)
|
|
} else {
|
|
params.classificationIDs = append(params.classificationIDs, tl.classifications[className])
|
|
}
|
|
}
|
|
tl.cachesMu.RUnlock()
|
|
}
|
|
|
|
func (tl *Timeline) prepareSearchQuery(ctx context.Context, params ItemSearchParams) (string, []any, error) {
|
|
if (params.Latitude == nil && params.Longitude != nil) || (params.Latitude != nil && params.Longitude == nil) {
|
|
return "", nil, errors.New("location proximity search must include both lat and lon coordinates")
|
|
}
|
|
if params.Timestamp != nil && (params.Latitude != nil || params.Longitude != nil) {
|
|
return "", nil, errors.New("time proximity and location proximity are mutually exclusive")
|
|
}
|
|
const maxDegreesOfSeparation = 2
|
|
if params.Related > maxDegreesOfSeparation {
|
|
// arbitrary, but I suspect it's a good idea to limit this for performance reasons
|
|
return "", nil, errors.New("max degrees of separation for relationships is 2")
|
|
}
|
|
if params.WithTotal && params.OnlyTotal {
|
|
return "", nil, errors.New("cannot query results with total and only total at the same time")
|
|
}
|
|
|
|
tl.convertNamesToIDs(¶ms)
|
|
|
|
// When viewing a timeline, it can make more intuitive sense
|
|
// to only show "root" nodes of item graphs at the "top" of
|
|
// the search results, so that items which are only attachments,
|
|
// for example, don't appear in the timeline multiple times
|
|
// (once as an attachment, and once as "attached to" - it's even
|
|
// more confusing if it's an attachment but looks like its own
|
|
// item). So by default, we omit items from top level of search
|
|
// results which appear at the end of a directed relation/edge.
|
|
// Only if Astructured is true do we allow attachments and other
|
|
// such items to appear as their own items in the top level.
|
|
// Note: We also do astructured if specific row IDs are being queried
|
|
// since we still want to select those specific rows even if
|
|
// they are, say, attachments of a message. Querying specific
|
|
// rows has priority over relationships.
|
|
rootItemsOnly := !params.Astructured && len(params.RowID) == 0
|
|
|
|
// if searching by embeddings, we first select the items that could possibly
|
|
// be in the results by filtering with other search parameters, then we do
|
|
// distance calculations over that subset of the data, which is theoretically
|
|
// faster (see https://github.com/asg017/sqlite-vec/issues/196#issuecomment-2643543058)
|
|
var q string
|
|
vectorSearch := params.SemanticText != "" || params.SimilarTo > 0
|
|
if vectorSearch {
|
|
q = "WITH search_results AS (\n"
|
|
}
|
|
|
|
// honor inclusivity for bounding-box searches
|
|
lt, gt := "<", ">"
|
|
if params.Inclusive {
|
|
lt, gt = "<=", ">="
|
|
}
|
|
|
|
// TODO: use strings.Builder (also in RecentConversations())
|
|
|
|
q += fmt.Sprintf("\t\tSELECT %s, entities.id, entities.name, entities.picture_file, attributes.name, attributes.value, attributes.alt_value, embeddings.id", itemDBColumns)
|
|
if params.OnlyTotal {
|
|
q = "\t\tSELECT count(DISTINCT items.id)"
|
|
}
|
|
if params.GeoJSON {
|
|
// GeoJSON mode is intended to be more efficient; as such, only select coordinate data
|
|
q = "\t\tSELECT items.id, items.latitude, items.longitude"
|
|
}
|
|
if params.WithTotal {
|
|
q += ", count() over() AS total_count"
|
|
}
|
|
q += `
|
|
FROM extended_items AS items
|
|
LEFT JOIN attributes ON items.attribute_id = attributes.id
|
|
LEFT JOIN entity_attributes ON attributes.id = entity_attributes.attribute_id
|
|
LEFT JOIN entities ON entity_attributes.entity_id = entities.id
|
|
LEFT JOIN embeddings ON embeddings.item_id = items.id`
|
|
|
|
// TODO: It's possible that we could move all these (ToAttributeID, ToEntityID, rootItemsOnly) into RelationParams
|
|
if len(params.ToAttributeID) > 0 || len(params.ToEntityID) > 0 {
|
|
q += `
|
|
JOIN relationships ON relationships.from_item_id = items.id`
|
|
} else if rootItemsOnly || len(params.Relations) > 0 {
|
|
q += `
|
|
LEFT JOIN relationships ON relationships.to_item_id = items.id
|
|
LEFT JOIN relations ON relations.id = relationships.relation_id`
|
|
}
|
|
|
|
// build the WHERE in terms of groups of OR's that are AND'ed together
|
|
var args []any
|
|
var clauseCount int
|
|
and := func(ors func()) {
|
|
clauseCount = 0
|
|
if len(args) == 0 {
|
|
q += "\n\t\tWHERE"
|
|
} else {
|
|
if params.OrFields {
|
|
q += " OR"
|
|
} else {
|
|
q += " AND"
|
|
}
|
|
}
|
|
q += " ("
|
|
ors()
|
|
q += ")"
|
|
|
|
// if the clause turned out to be empty,
|
|
// this is a poor-man's way of undoing it
|
|
q = strings.TrimSuffix(q, " OR ()")
|
|
q = strings.TrimSuffix(q, " AND ()")
|
|
q = strings.TrimSuffix(q, "\n\t\tWHERE ()")
|
|
}
|
|
or := func(clause string, val any) {
|
|
if clauseCount > 0 {
|
|
q += " OR "
|
|
}
|
|
q += clause
|
|
args = append(args, val)
|
|
clauseCount++
|
|
}
|
|
|
|
and(func() {
|
|
if params.RowID != nil && len(params.RowID) == 0 {
|
|
or("items.id IS ?", nil)
|
|
}
|
|
for _, v := range params.RowID {
|
|
or("items.id=?", v)
|
|
}
|
|
})
|
|
and(func() {
|
|
if params.DataSourceName != nil && len(params.DataSourceName) == 0 {
|
|
or("data_source_name IS ?", nil)
|
|
}
|
|
for _, v := range params.DataSourceName {
|
|
or("data_source_name=?", v)
|
|
}
|
|
})
|
|
and(func() {
|
|
if params.JobID != nil && len(params.JobID) == 0 {
|
|
or("items.job_id IS ?", nil)
|
|
}
|
|
for _, v := range params.JobID {
|
|
or("items.job_id=?", v)
|
|
}
|
|
})
|
|
and(func() {
|
|
if params.AttributeID != nil && len(params.AttributeID) == 0 {
|
|
or("items.attribute_id IS ?", nil)
|
|
}
|
|
for _, v := range params.AttributeID {
|
|
or("items.attribute_id=?", v)
|
|
}
|
|
})
|
|
and(func() {
|
|
if params.EntityID != nil && len(params.EntityID) == 0 {
|
|
or("entities.id IS ?", nil)
|
|
}
|
|
for _, v := range params.EntityID {
|
|
or("entities.id=?", v)
|
|
}
|
|
})
|
|
and(func() {
|
|
for _, v := range params.classificationIDs {
|
|
if v == 0 {
|
|
or("items.classification_id IS ?", nil)
|
|
} else {
|
|
or("items.classification_id=?", v)
|
|
}
|
|
}
|
|
})
|
|
and(func() {
|
|
if params.OriginalID != nil && len(params.OriginalID) == 0 {
|
|
or("items.original_id IS ?", nil)
|
|
}
|
|
for _, v := range params.OriginalID {
|
|
or("items.original_id=?", v)
|
|
}
|
|
})
|
|
and(func() {
|
|
if params.DataType != nil && len(params.DataType) == 0 {
|
|
or("items.data_type IS ?", nil)
|
|
}
|
|
for _, v := range params.DataType {
|
|
if strings.HasSuffix(v, "/*") {
|
|
// useful for searching for all "video/*" files, for example
|
|
or("items.data_type LIKE ? || '%'", v[:len(v)-1])
|
|
} else {
|
|
or("items.data_type=?", v)
|
|
}
|
|
}
|
|
})
|
|
and(func() {
|
|
if params.DataText != nil && len(params.DataText) == 0 {
|
|
or("items.data_text IS ?", nil)
|
|
}
|
|
for _, v := range params.DataText {
|
|
or("items.data_text LIKE '%' || ? || '%'", v)
|
|
}
|
|
})
|
|
and(func() {
|
|
if params.DataFile != nil && len(params.DataFile) == 0 {
|
|
or("items.data_file IS ?", nil)
|
|
}
|
|
for _, v := range params.DataFile {
|
|
or("items.data_file=?", v)
|
|
}
|
|
})
|
|
|
|
and(func() {
|
|
// TODO: these can probably be in the same 'AND' group like this, right?
|
|
for _, v := range params.ToAttributeID {
|
|
or("relationships.to_attribute_id=?", v)
|
|
}
|
|
for _, v := range params.ToEntityID {
|
|
or("relationships.to_entity_id=?", v)
|
|
}
|
|
})
|
|
|
|
// TODO: Use BETWEEN maybe
|
|
|
|
if params.StartTimestamp != nil {
|
|
and(func() {
|
|
or("items.timestamp"+gt+" ?", params.StartTimestamp.UTC().UnixMilli())
|
|
if !params.StrictStartTimestamp {
|
|
// if not strict, allow items to spill into the window even if the started before it
|
|
or("items.timespan "+gt+" ?", params.StartTimestamp.UTC().UnixMilli())
|
|
}
|
|
})
|
|
}
|
|
if params.EndTimestamp != nil {
|
|
and(func() {
|
|
or("items.timestamp "+lt+" ?", params.EndTimestamp.UTC().UnixMilli())
|
|
})
|
|
if params.StrictEndTimestamp {
|
|
// if strict, items' timespan must end before the EndTimestamp (item can't merely start before it)
|
|
and(func() {
|
|
or("items.timespan IS NULL OR items.timespan "+lt+" ?", params.EndTimestamp.UTC().UnixMilli())
|
|
})
|
|
}
|
|
}
|
|
if params.NoLocation {
|
|
and(func() {
|
|
or("items.latitude IS ?", nil)
|
|
})
|
|
and(func() {
|
|
or("items.longitude IS ?", nil)
|
|
})
|
|
and(func() {
|
|
or("items.altitude IS ?", nil)
|
|
})
|
|
and(func() {
|
|
or("items.coordinate_system IS ?", nil)
|
|
})
|
|
} else {
|
|
if params.MinLatitude != nil {
|
|
and(func() {
|
|
or("items.latitude "+gt+" ?", params.MinLatitude)
|
|
})
|
|
}
|
|
if params.MaxLatitude != nil {
|
|
and(func() {
|
|
or("items.latitude "+lt+" ?", params.MaxLatitude)
|
|
})
|
|
}
|
|
if params.MinLongitude != nil {
|
|
and(func() {
|
|
or("items.longitude "+gt+" ?", params.MinLongitude)
|
|
})
|
|
}
|
|
if params.MaxLongitude != nil {
|
|
and(func() {
|
|
or("items.longitude "+lt+" ?", params.MaxLongitude)
|
|
})
|
|
}
|
|
}
|
|
|
|
// skip deleted items unless we are explicitly supposed to include them
|
|
// (NOTE: we include them if the specific row IDs are requested)
|
|
if !params.Deleted && len(params.RowID) == 0 {
|
|
and(func() {
|
|
or("items.deleted IS ?", nil)
|
|
})
|
|
}
|
|
|
|
// always skip hidden items
|
|
and(func() {
|
|
or("items.hidden IS ?", nil)
|
|
})
|
|
|
|
// skip every so many items if sampling is enabled
|
|
if params.Sample > 1 {
|
|
and(func() {
|
|
or("items.id % ? = 0", params.Sample)
|
|
})
|
|
}
|
|
|
|
if rootItemsOnly {
|
|
// select only items which are not dependent on other items, i.e., items
|
|
// that are not at the end of a directed relation from another item
|
|
and(func() {
|
|
or("relations.directed != ?", 1)
|
|
or("relations.subordinating = ?", 0)
|
|
or("relationships.to_item_id IS ?", nil)
|
|
})
|
|
}
|
|
|
|
// factor in relationships
|
|
for _, relParam := range params.Relations {
|
|
op := "IS"
|
|
if relParam.Not {
|
|
op = "IS NOT"
|
|
}
|
|
and(func() {
|
|
if relParam.RelationLabel != "" {
|
|
or("relations.label "+op+" ?", relParam.RelationLabel)
|
|
}
|
|
})
|
|
}
|
|
|
|
if !params.OnlyTotal {
|
|
q += "\n\t\tGROUP BY items.id"
|
|
}
|
|
|
|
// don't put order in the temporary table, since we'll be ordering by vector distance
|
|
if !vectorSearch {
|
|
if params.Sort != SortNone {
|
|
q += "\n\t\tORDER BY "
|
|
|
|
// TODO: not sure if this is how autocomplete will work or be useful, but basically
|
|
// this sorts by data text so that if it's a prefix, it's weighed higher in the
|
|
// sort, and if it's a suffix then put it at the end; i.e. favor term at beginning
|
|
// of words instead of end... I think that's what this does, at least
|
|
if params.PreferPrefix && len(params.DataText) == 1 {
|
|
q += `CASE
|
|
WHEN items.data_text LIKE ? || '%' THEN 1
|
|
WHEN items.data_text LIKE '%' || ? THEN 3
|
|
ELSE 2
|
|
END, `
|
|
args = append(args, params.DataText[0], params.DataText[0])
|
|
}
|
|
|
|
// sort
|
|
sortDir := strings.ToUpper(string(params.Sort))
|
|
if sortDir == "" {
|
|
// smart sort: sort ASC by default if only "minimum" side of bounding boxes are specified
|
|
if (params.StartTimestamp != nil || params.MinLatitude != nil || params.MinLongitude != nil) &&
|
|
params.EndTimestamp == nil && params.MaxLatitude == nil && params.MaxLongitude == nil {
|
|
sortDir = string(SortAsc)
|
|
} else {
|
|
sortDir = string(SortDesc)
|
|
}
|
|
}
|
|
if sortDir != string(SortAsc) && sortDir != string(SortDesc) {
|
|
return "", nil, fmt.Errorf("invalid sort direction: %s", sortDir)
|
|
}
|
|
// location proximity search is only an approximation for simplicity
|
|
// see https://stackoverflow.com/a/39298241/1048862
|
|
switch {
|
|
case params.Latitude != nil && params.Longitude != nil:
|
|
// nearest to location; account for shortened distances at poles
|
|
// math.Cos() takes radians, hence the conversion to radians inside the cosine
|
|
cosLat2 := math.Pow(math.Cos(*params.Latitude*math.Pi/180.0), 2) //nolint:mnd
|
|
sortDir = string(SortAsc) // always sort ascending for nearest
|
|
q += "((?-items.latitude) * (?-items.latitude)) + ((?-items.longitude) * (?-items.longitude) * ?), items.id " + sortDir
|
|
args = append(args, params.Latitude, params.Latitude, params.Longitude, params.Longitude, cosLat2)
|
|
|
|
case params.Timestamp != nil:
|
|
// nearest to timestamp
|
|
sortDir = string(SortAsc) // always sort ascending for nearest
|
|
q += "abs(?-items.timestamp), items.id " + sortDir
|
|
args = append(args, params.Timestamp.UnixMilli())
|
|
|
|
case params.OrderBy == "stored":
|
|
q += "items.stored " + sortDir
|
|
|
|
default:
|
|
// generic sort, which is timestamp and row ID
|
|
q += fmt.Sprintf("items.timestamp %s, items.id %s", sortDir, sortDir)
|
|
}
|
|
}
|
|
|
|
// limit
|
|
if !params.OnlyTotal {
|
|
if params.Limit == 0 {
|
|
params.Limit = 1000
|
|
}
|
|
if params.Limit > 0 {
|
|
q += "\n\t\tLIMIT ?"
|
|
args = append(args, params.Limit)
|
|
}
|
|
if params.Offset > 0 {
|
|
q += "\n\t\tOFFSET ?"
|
|
args = append(args, params.Offset)
|
|
}
|
|
}
|
|
}
|
|
|
|
if vectorSearch {
|
|
var targetVectorClause string
|
|
|
|
if params.SimilarTo > 0 {
|
|
targetVectorClause = "(SELECT embedding FROM embeddings JOIN items ON embeddings.item_id = items.id WHERE items.id=? LIMIT 1)"
|
|
args = append(args, params.SimilarTo)
|
|
} else if params.SemanticText != "" {
|
|
// search relative to an arbitrary input (TODO: support image inputs too) - python server must be online
|
|
if !pythonServerReady(ctx, false) {
|
|
return "", nil, errors.New("python server not ready")
|
|
}
|
|
embedding, err := generateEmbedding(ctx, "text/plain", []byte(params.SemanticText), nil)
|
|
if err != nil {
|
|
return "", nil, err
|
|
}
|
|
targetVectorClause = "?"
|
|
args = append(args, string(embedding))
|
|
}
|
|
|
|
q += fmt.Sprintf(`
|
|
)
|
|
SELECT
|
|
search_results.*,
|
|
vec_distance_l2(%s, embeddings.embedding) AS distance
|
|
FROM search_results
|
|
JOIN embeddings ON embeddings.item_id = search_results.id
|
|
ORDER BY distance`, targetVectorClause)
|
|
if params.Limit == 0 {
|
|
params.Limit = 100
|
|
}
|
|
if params.Limit > 0 {
|
|
q += "\nLIMIT ?"
|
|
args = append(args, params.Limit)
|
|
}
|
|
if params.Offset > 0 {
|
|
q += "\nOFFSET ?"
|
|
args = append(args, params.Offset)
|
|
}
|
|
}
|
|
|
|
return q, args, nil
|
|
}
|
|
|
|
func (tl *Timeline) expandRelationships(ctx context.Context, tx *sql.Tx, degrees int, sr *SearchResult) error {
|
|
if degrees <= 0 {
|
|
return nil
|
|
}
|
|
|
|
// notice how we're careful to avoid recursion while we have open rows
|
|
// scanning happening, so that we don't step on other select queries
|
|
err := tl.expandRelationshipSingle(ctx, tx, sr)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// now expand relationships recursively until degrees of separation
|
|
// from the original search results are reached
|
|
for _, rel := range sr.Related {
|
|
if rel.FromItem != nil {
|
|
err := tl.expandRelationships(ctx, tx, degrees-1, rel.FromItem)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if rel.ToItem != nil {
|
|
err := tl.expandRelationships(ctx, tx, degrees-1, rel.ToItem)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (tl *Timeline) expandRelationshipSingle(ctx context.Context, tx *sql.Tx, sr *SearchResult) error {
|
|
// TODO: limit here is arbitrary I think... ho hum
|
|
rows, err := tx.QueryContext(ctx, `
|
|
SELECT
|
|
relationships.id,
|
|
relations.directed,
|
|
relations.label,
|
|
relationships.value,
|
|
relationships.start,
|
|
relationships.end,
|
|
relationships.metadata,
|
|
relationships.from_item_id,
|
|
relationships.to_item_id,
|
|
from_entity.id,
|
|
from_entity.name,
|
|
from_entity.picture_file,
|
|
from_attr.name,
|
|
from_attr.value,
|
|
from_attr.alt_value,
|
|
to_entity.id,
|
|
to_entity.name,
|
|
to_entity.picture_file,
|
|
to_attr.name,
|
|
to_attr.value,
|
|
to_attr.alt_value
|
|
FROM relationships
|
|
JOIN items ON items.id=?
|
|
JOIN relations ON relations.id=relationships.relation_id
|
|
LEFT JOIN attributes AS from_attr ON from_attr.id=relationships.from_attribute_id
|
|
LEFT JOIN attributes AS to_attr ON to_attr.id=relationships.to_attribute_id
|
|
LEFT JOIN entity_attributes AS from_ea ON from_ea.attribute_id = from_attr.id
|
|
LEFT JOIN entity_attributes AS to_ea ON to_ea.attribute_id = to_attr.id
|
|
LEFT JOIN entities AS from_entity ON from_entity.id = from_ea.entity_id
|
|
LEFT JOIN entities AS to_entity ON to_entity.id = to_ea.entity_id
|
|
WHERE (relationships.from_item_id=? OR relationships.to_item_id=?) AND items.hidden IS NULL
|
|
GROUP BY relationships.id
|
|
LIMIT 10`, sr.ItemRow.ID, sr.ItemRow.ID, sr.ItemRow.ID)
|
|
if err != nil {
|
|
return fmt.Errorf("querying db for relationships: %w", err)
|
|
}
|
|
defer rows.Close()
|
|
|
|
for rows.Next() {
|
|
var rel Related
|
|
var fromItemID, toItemID *uint64
|
|
var relStart, relEnd *int64
|
|
var fromEntity, toEntity relatedEntity
|
|
var relMeta *string
|
|
err := rows.Scan(&rel.RelationshipID, &rel.Directed, &rel.Label, &rel.Value, &relStart, &relEnd, &relMeta, &fromItemID, &toItemID,
|
|
&fromEntity.ID, &fromEntity.Name, &fromEntity.Picture, &fromEntity.Attribute.Name, &fromEntity.Attribute.Value, &fromEntity.Attribute.AltValue,
|
|
&toEntity.ID, &toEntity.Name, &toEntity.Picture, &toEntity.Attribute.Name, &toEntity.Attribute.Value, &toEntity.Attribute.AltValue)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if fromEntity.ID != nil {
|
|
rel.FromEntity = &fromEntity
|
|
}
|
|
if toEntity.ID != nil {
|
|
rel.ToEntity = &toEntity
|
|
}
|
|
if relStart != nil {
|
|
startTime := time.Unix(*relStart, 0)
|
|
rel.Start = &startTime
|
|
}
|
|
if relEnd != nil {
|
|
endTime := time.Unix(*relEnd, 0)
|
|
rel.End = &endTime
|
|
}
|
|
if relMeta != nil {
|
|
rel.Metadata = json.RawMessage(*relMeta)
|
|
}
|
|
|
|
// expand items, we have to do this until our code can support
|
|
// loading an item from among other columns as well; expand
|
|
// only if the item is distinct from the parent/starting item
|
|
if fromItemID != nil && *fromItemID != sr.ID {
|
|
fromRel, err := tl.loadRelatedItem(ctx, tx, *fromItemID)
|
|
if err != nil {
|
|
return fmt.Errorf("loading from_item: %w", err)
|
|
}
|
|
rel.FromItem = fromRel
|
|
}
|
|
if toItemID != nil && *toItemID != sr.ID {
|
|
toRel, err := tl.loadRelatedItem(ctx, tx, *toItemID)
|
|
if err != nil {
|
|
return fmt.Errorf("loading to_item: %w", err)
|
|
}
|
|
rel.ToItem = toRel
|
|
}
|
|
|
|
sr.Related = append(sr.Related, rel)
|
|
}
|
|
if err = rows.Err(); err != nil {
|
|
return fmt.Errorf("scanning related item rows: %w", err)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (tl *Timeline) loadRelatedItem(ctx context.Context, tx *sql.Tx, itemRowID uint64) (*SearchResult, error) {
|
|
ir, err := tl.loadItemRow(ctx, tx, itemRowID, 0, nil, nil, nil, false)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("loading related item row: %w", err)
|
|
}
|
|
// TODO: dunno if this safe (QueryRow during a rows.Scan, by the caller of this function)
|
|
// TODO: if it would help increase performance, we could probably cache information about persons and person_identities...
|
|
// TODO: maybe there's a view we could use for this instead?
|
|
var p relatedEntity
|
|
if ir.AttributeID != nil {
|
|
err = tx.QueryRowContext(ctx, `
|
|
SELECT entities.id, entities.name, entities.picture_file, attributes.name, attributes.value, attributes.alt_value, attributes.longitude, attributes.latitude, attributes.altitude
|
|
FROM attributes, entities
|
|
JOIN entity_attributes
|
|
ON entity_attributes.entity_id = entities.id
|
|
AND entity_attributes.attribute_id = attributes.id
|
|
WHERE attributes.id=?`, ir.AttributeID).Scan(&p.ID, &p.Name, &p.Picture, &p.Attribute.Name, &p.Attribute.Value, &p.Attribute.AltValue, &p.Attribute.Longitude, &p.Attribute.Latitude, &p.Attribute.Altitude)
|
|
if err != nil {
|
|
return nil, fmt.Errorf("loading related entity: %w", err)
|
|
}
|
|
}
|
|
sr := &SearchResult{RepoID: tl.id.String(), ItemRow: ir}
|
|
if p.ID != nil {
|
|
sr.Entity = &p
|
|
}
|
|
return sr, nil
|
|
}
|
|
|
|
type SearchResult struct {
|
|
RepoID string `json:"repo_id,omitempty"`
|
|
ItemRow
|
|
HasEmbedding bool `json:"has_embedding,omitempty"`
|
|
Entity *relatedEntity `json:"entity,omitempty"`
|
|
Related []Related `json:"related,omitempty"`
|
|
Size int64 `json:"size,omitempty"`
|
|
|
|
// from ML model
|
|
Distance float64 `json:"distance,omitempty"`
|
|
Score float64 `json:"score,omitempty"`
|
|
}
|
|
|
|
// RelationParams describes a search using item relations.
|
|
type RelationParams struct {
|
|
Not bool `json:"not,omitempty"` // if true, only match items that do NOT have this relation
|
|
RelationLabel string `json:"relation_label,omitempty"`
|
|
}
|
|
|
|
// relatedEntity is a subset of Entity (so we need to make sure
|
|
// the JSON field names are the same), but with pointer field
|
|
// types because it is left-joined in queries which means they
|
|
// can be null.
|
|
type relatedEntity struct {
|
|
ID *uint64 `json:"id"`
|
|
Name *string `json:"name,omitempty"`
|
|
Picture *string `json:"picture,omitempty"`
|
|
Attribute nullableAttribute `json:"attribute,omitempty"` // TODO: experimental
|
|
}
|
|
|
|
// nullableAttribute is like Attribute but with nullable fields
|
|
// so that it can be I/O for the database
|
|
type nullableAttribute struct {
|
|
ID *uint64 `json:"id,omitempty"`
|
|
Name *string `json:"name,omitempty"`
|
|
Value *string `json:"value,omitempty"`
|
|
AltValue *string `json:"alt_value,omitempty"`
|
|
Latitude *float64 `json:"latitude,omitempty"`
|
|
Longitude *float64 `json:"longitude,omitempty"`
|
|
Altitude *float64 `json:"altitude,omitempty"`
|
|
}
|
|
|
|
func (na nullableAttribute) attribute() Attribute {
|
|
var a Attribute
|
|
if na.ID != nil {
|
|
a.ID = *na.ID
|
|
}
|
|
if na.Name != nil {
|
|
a.Name = *na.Name
|
|
}
|
|
if na.Value != nil {
|
|
a.Value = *na.Value
|
|
}
|
|
if na.AltValue != nil {
|
|
a.AltValue = *na.AltValue
|
|
}
|
|
a.Longitude = na.Longitude
|
|
a.Latitude = na.Latitude
|
|
a.Altitude = na.Altitude
|
|
return a
|
|
}
|
|
|
|
type Related struct {
|
|
Relation
|
|
RelationshipID int `json:"relationship_id"`
|
|
Value *string `json:"value,omitempty"`
|
|
FromItem *SearchResult `json:"from_item,omitempty"`
|
|
ToItem *SearchResult `json:"to_item,omitempty"`
|
|
FromEntity *relatedEntity `json:"from_entity,omitempty"`
|
|
ToEntity *relatedEntity `json:"to_entity,omitempty"`
|
|
Start *time.Time `json:"start,omitempty"`
|
|
End *time.Time `json:"end,omitempty"`
|
|
Metadata json.RawMessage `json:"metadata,omitempty"`
|
|
}
|
|
|
|
type SortDir string
|
|
|
|
const (
|
|
SortNone SortDir = "none" // special value that means to not include a ORDER BY clause
|
|
SortAsc SortDir = "ASC"
|
|
SortDesc SortDir = "DESC"
|
|
)
|