/* Timelinize Copyright (c) 2013 Matthew Holt This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package timeline import ( "context" "database/sql" "encoding/json" "errors" "fmt" "math" "os" "path/filepath" "sort" "strconv" "strings" "time" ) // ItemSearchParams describes a search for items. // // Fields with a slice/array type typically mean "any of these" // (their elements are OR'ed together). type ItemSearchParams struct { // The UUID of the open timeline to search. Repo string `json:"repo,omitempty"` // ML searches -- currently, only one of these can be set at a time // TODO: add a QueryImage field for image search SemanticText string `json:"semantic_text,omitempty"` // similar to the centroid of the vector formed from this phrase SimilarTo int64 `json:"similar_to,omitempty"` // similar to the centroid of the vector of the specified item (TODO: should it just be embedding ID?) RowID []int64 `json:"row_id,omitempty"` // AccountID []int `json:"account_id,omitempty"` // TODO: restore this, if useful DataSourceName []string `json:"data_source,omitempty"` JobID []int64 `json:"job_id,omitempty"` AttributeID []int64 `json:"attribute_id,omitempty"` EntityID []uint64 `json:"entity_id,omitempty"` Classification []string `json:"classification,omitempty"` OriginalID []string `jsson:"original_id,omitempty"` DataType []string `json:"data_type,omitempty"` DataText []string `json:"data_text,omitempty"` DataFile []string `json:"data_file,omitempty"` // TODO: how do we effectively search metadata? maybe virtual columns? https://antonz.org/json-virtual-columns/ // TODO: Well, this query was fast: `SELECT * FROM items WHERE items.metadata->>'$.Make' = 'Google' LIMIT 10` // Metadata map[string][]any `json:"metadata,omitempty"` // TODO: a way to get items with EntityID, but also any relation FromEntityID or ToEntityID... ToAttributeID []int64 `json:"to_attribute_id,omitempty"` ToEntityID []int64 `json:"to_entity_id,omitempty"` // filter by relationship to other items // TODO: We can probably wrap ToAttributeID, ToEntityID, and maybe Astructured into this? Relations []RelationParams `json:"relations,omitempty"` // bounding box searches StartTimestamp *time.Time `json:"start_timestamp,omitempty"` EndTimestamp *time.Time `json:"end_timestamp,omitempty"` MinLatitude *float64 `json:"min_latitude,omitempty"` MaxLatitude *float64 `json:"max_latitude,omitempty"` MinLongitude *float64 `json:"min_longitude,omitempty"` MaxLongitude *float64 `json:"max_longitude,omitempty"` Inclusive bool `json:"inclusive,omitempty"` // if true, bounding box is <= and >= instead of < and > StrictStartTimestamp bool `json:"strict_start_timestamp,omitempty"` // if true, item timestamp must be after start_timestamp (even if item timespan is after) StrictEndTimestamp bool `json:"strict_end_timestamp,omitempty"` // if true, item timespan must be before end_timestamp (even if item timestamp is before) NoLocation bool `json:"no_location,omitempty"` // if true, require location columns to be NULL regardless of max/min lat/lon // proximity searches (location and time are mutually exclusive) Timestamp *time.Time `json:"timestamp,omitempty"` Latitude *float64 `json:"latitude,omitempty"` Longitude *float64 `json:"longitude,omitempty"` // If true, OR different fields instead of AND OrFields bool `json:"or_fields,omitempty"` // How to order results. Default: usually timestamp, // but can be "smart" depending on search parameters. // This can also be "stored" to order by date added // to DB. OrderBy string `json:"order_by,omitempty"` Sort SortDir `json:"sort,omitempty"` // ignored if doing proximity search (unless it's SortNone, which breaks proximity searches) Limit int `json:"limit,omitempty"` // number of rows to include (-1 for no limit); default 1000 Offset int `json:"offset,omitempty"` // number of rows to skip (can be slow if very large) Sample int `json:"sample,omitempty"` // retrieve every Nth row // TODO: Matt's note: Pagination can be done more efficiently than // Offset or WithTotal (below) by using little tricks. For example, // if the exact count isn't required, set a large limit N (but still // a sensible order of magnitude) and report simply "N+" items; or // instead of offset, use WHERE to exclude items we've already paged // through using an indexed column (for example, if ordering by // timestamp, use WHERE to exclude items with timestamps that // would have appeared on previous pages; i.e. keep a cursor of // last item's timestamp from prev. page) // query related items and people recursively to this many degrees // (e.g. 0 is only the item, 1 adds direct relationships, etc...) Related int `json:"related,omitempty"` // By default (when this is false), search results will prioritize // the root nodes of item graphs, i.e. items that are not on the // end of a directed relation/edge, since this makes more sense // when viewing the timeline. If this is true, the ends of // directed relations are ignored, and related items may appear // in the "top level" of search results as well as nested in // "related". // TODO: Rename? Like only_root_items (that would also invert the value...) Astructured bool `json:"flat,omitempty"` // TODO: Experimental, for autocomplete... not sure if useful. PreferPrefix bool `json:"prefer_prefix,omitempty"` // In addition to results, return a total count that ignores offset and limit. // This can be very slow for broad queries on large DBs. // TODO: This can be very slow for large DBs on broad queries... maybe cache results? Or find a way to paginate without this (see comment above) WithTotal bool `json:"with_total,omitempty"` // Only return a total count that ignores offset and limit. OnlyTotal bool `json:"only_total,omitempty"` // If true, the only fields that will be selected from the DB are // latitude and longitude, and the search results will not be filled // with items; instead, the results will contain a GeoJSON document // with the coordinates, which can then be used directly by mapping // software to render point or heatmap information. This approach is // an order of magnitude faster (10x in my testing) for large data // sets (1M+ rows) to render a heatmap, where only coordinates are // needed. It avoids scanning the entire item row and allocating // multiple structs and performing various other computations. // Instead it efficiently builds a string containing only the // coordinate data. Non-spatial data will be excluded. GeoJSON bool `json:"geojson,omitempty"` ObfuscatedGeoJSON []ObfuscatedLocation `json:"-"` // TODO: this is kind of a hack since it's the only data we encode before returning (for efficiency) // Include the size of the item content with the results. // For data files, this involves calling stat() on the file. WithSize bool `json:"with_size,omitempty"` // If true, include deleted items (that haven't been erased yet). Deleted bool `json:"deleted,omitempty"` // stores the converted names to row IDs classificationIDs []uint64 } type SearchResults struct { // If enabled, the total count of items matching the search query // without regard for limit and offset. Total *int `json:"total,omitempty"` // The items of the search result. Items []*SearchResult `json:"items,omitempty"` // The search results in GeoJSON mode. A GeoJSON document // useful for rendering heatmaps or clusters. GeoJSON string `json:"geojson,omitempty"` } func (tl *Timeline) Search(ctx context.Context, params ItemSearchParams) (SearchResults, error) { // setting a natural language input has big implications, so make sure it's not just whitespace by accident params.SemanticText = strings.TrimSpace(params.SemanticText) // get the DB query string and associated arguments q, args, err := tl.prepareSearchQuery(ctx, params) if err != nil { return SearchResults{}, err } // open DB transaction to hopefully make it more efficient; may involve many queries // (TODO: we don't currently commit this tx, because we didn't make changes - that's OK, right?) tx, err := tl.db.ReadPool.BeginTx(ctx, nil) if err != nil { return SearchResults{}, err } defer tx.Rollback() // in count-only mode, there's only a single row with a single field if params.OnlyTotal { var count int err := tx.QueryRowContext(ctx, q, args...).Scan(&count) if err != nil { return SearchResults{}, err } return SearchResults{Total: &count}, nil } // run query and scan results rows, err := tx.QueryContext(ctx, q, args...) if err != nil { return SearchResults{}, fmt.Errorf("querying db for items: %w", err) } // in GeoJSON mode, skip the expensive full-row scans etc; instead, efficiently // build a GeoJSON string directly -- it's very specific so we can do even better // than json.Marshal by using a strings builder. var sb strings.Builder if params.GeoJSON { // in testing, I found that Mapbox renders a single huge MultiPoint feature // way faster than many Point features sb.WriteString(`{"type":"Feature","geometry":{"type":"MultiPoint","coordinates":[`) } // for JSON serialization, always initialize so "0 results" is at least an empty list and not null results := make([]*SearchResult, 0) var count, totalCount int for rows.Next() { // fun fact, the first call to Next() is actually what runs the query // in GeoJSON mode, skip the usual full-row scan, and instead focus // on the coordinates which is much more efficient if params.GeoJSON { // read the coordinate data var rowID uint64 var lat, lon *float64 targets := []any{&rowID, &lat, &lon} if params.WithTotal { targets = append(targets, &totalCount) } err := rows.Scan(targets...) if err != nil { defer rows.Close() return SearchResults{}, err } // ensure it's valid, then obfuscate if enabled, then format as strings (waaaaay faster than reflection) if lat == nil || lon == nil { continue } for _, locob := range params.ObfuscatedGeoJSON { if locob.Contains(*lat, *lon) { newLat, newLon := locob.Obfuscate(*lat, *lon, rowID) lat = &newLat lon = &newLon } } latStr := strconv.FormatFloat(*lat, 'f', -1, 64) lonStr := strconv.FormatFloat(*lon, 'f', -1, 64) // write coordinate to the GeoJSON document if count > 0 { sb.WriteRune(',') } sb.WriteRune('[') sb.WriteString(lonStr) sb.WriteRune(',') sb.WriteString(latStr) sb.WriteRune(']') // continue with next row; skip all the expensive stuff count++ continue } var re relatedEntity // entity is left-joined, so could be null var embeddingDistance float64 var embeddingID *uint64 // just to know whether there are any embeddings for the item extraTargets := []any{&re.ID, &re.Name, &re.Picture, &re.Attribute.Name, &re.Attribute.Value, &re.Attribute.AltValue, &embeddingID} if params.WithTotal { extraTargets = append(extraTargets, &totalCount) } if params.SemanticText != "" || params.SimilarTo > 0 { extraTargets = append(extraTargets, &embeddingDistance) } itemRow, err := scanItemRow(rows, extraTargets) if err != nil { rows.Close() return SearchResults{}, err } sr := &SearchResult{RepoID: tl.id.String(), ItemRow: itemRow, HasEmbedding: embeddingID != nil, Distance: embeddingDistance} if re.ID != nil { sr.Entity = &re } results = append(results, sr) } rows.Close() if err = rows.Err(); err != nil { return SearchResults{}, fmt.Errorf("iterating item rows: %w", err) } // in GeoJSON mode, we can be done early by wrapping up our document if params.GeoJSON { sb.WriteString(`]}}`) return SearchResults{GeoJSON: sb.String(), Total: &totalCount}, nil } // TODO: this needs tuning if params.SemanticText != "" { // filter results for relevance by passing them through the classifier... // this is kind of a hack, but it's a well-known difficult problem apparently, // for any KNN search, to only show relevant results itemFiles := make(map[uint64]string) for _, result := range results { if result.DataFile == nil || result.DataType == nil { continue } if !strings.HasPrefix(*result.DataType, "image/") { continue } itemFiles[result.ID] = tl.FullPath(*result.DataFile) } scores, err := classify(ctx, itemFiles, []string{params.SemanticText}) if err != nil { return SearchResults{}, fmt.Errorf("classifying results: %w", err) } for _, sr := range results { if score, ok := scores[sr.ID]; ok { sr.Score = score } } // sort by score descending, then prefer items that weren't scored over scored items with a low score sort.Slice(results, func(i, j int) bool { _, iWasScored := scores[results[i].ID] _, jWasScored := scores[results[j].ID] return results[i].Score > results[j].Score || (jWasScored && !iWasScored) }) // chop off results that are irrelevant, starting with the first result that was scored and has a score near zero for i, sr := range results { _, wasScored := scores[sr.ID] // TODO: Be smarter: find the range the top/best results score, and only cull results below a significant threshold difference if sr.Score <= 0.002 && wasScored { results = results[:i] break } } } // traverse relationships for _, sr := range results { err = tl.expandRelationships(ctx, tx, params.Related, sr) if err != nil { return SearchResults{}, err } } // include size information, if requested if params.WithSize { for _, sr := range results { if sr.DataText != nil { sr.Size = int64(len(*sr.DataText)) } if sr.DataFile != nil { info, err := os.Stat(filepath.Join(tl.Dir(), *sr.DataFile)) if err == nil { sr.Size = info.Size() } } } } return SearchResults{Total: &totalCount, Items: results}, nil } // TODO: favorites? or maybe a more flexible albums/lists feature? what to call it... "scrapbooks" or "curations"? func (tl *Timeline) convertNamesToIDs(params *ItemSearchParams) { tl.cachesMu.RLock() for _, className := range params.Classification { if className == "" { params.classificationIDs = append(params.classificationIDs, 0) } else { params.classificationIDs = append(params.classificationIDs, tl.classifications[className]) } } tl.cachesMu.RUnlock() } func (tl *Timeline) prepareSearchQuery(ctx context.Context, params ItemSearchParams) (string, []any, error) { if (params.Latitude == nil && params.Longitude != nil) || (params.Latitude != nil && params.Longitude == nil) { return "", nil, errors.New("location proximity search must include both lat and lon coordinates") } if params.Timestamp != nil && (params.Latitude != nil || params.Longitude != nil) { return "", nil, errors.New("time proximity and location proximity are mutually exclusive") } const maxDegreesOfSeparation = 2 if params.Related > maxDegreesOfSeparation { // arbitrary, but I suspect it's a good idea to limit this for performance reasons return "", nil, errors.New("max degrees of separation for relationships is 2") } if params.WithTotal && params.OnlyTotal { return "", nil, errors.New("cannot query results with total and only total at the same time") } tl.convertNamesToIDs(¶ms) // When viewing a timeline, it can make more intuitive sense // to only show "root" nodes of item graphs at the "top" of // the search results, so that items which are only attachments, // for example, don't appear in the timeline multiple times // (once as an attachment, and once as "attached to" - it's even // more confusing if it's an attachment but looks like its own // item). So by default, we omit items from top level of search // results which appear at the end of a directed relation/edge. // Only if Astructured is true do we allow attachments and other // such items to appear as their own items in the top level. // Note: We also do astructured if specific row IDs are being queried // since we still want to select those specific rows even if // they are, say, attachments of a message. Querying specific // rows has priority over relationships. rootItemsOnly := !params.Astructured && len(params.RowID) == 0 // if searching by embeddings, we first select the items that could possibly // be in the results by filtering with other search parameters, then we do // distance calculations over that subset of the data, which is theoretically // faster (see https://github.com/asg017/sqlite-vec/issues/196#issuecomment-2643543058) var q string vectorSearch := params.SemanticText != "" || params.SimilarTo > 0 if vectorSearch { q = "WITH search_results AS (\n" } // honor inclusivity for bounding-box searches lt, gt := "<", ">" if params.Inclusive { lt, gt = "<=", ">=" } // TODO: use strings.Builder (also in RecentConversations()) q += fmt.Sprintf("\t\tSELECT %s, entities.id, entities.name, entities.picture_file, attributes.name, attributes.value, attributes.alt_value, embeddings.id", itemDBColumns) if params.OnlyTotal { q = "\t\tSELECT count(DISTINCT items.id)" } if params.GeoJSON { // GeoJSON mode is intended to be more efficient; as such, only select coordinate data q = "\t\tSELECT items.id, items.latitude, items.longitude" } if params.WithTotal { q += ", count() over() AS total_count" } q += ` FROM extended_items AS items LEFT JOIN attributes ON items.attribute_id = attributes.id LEFT JOIN entity_attributes ON attributes.id = entity_attributes.attribute_id LEFT JOIN entities ON entity_attributes.entity_id = entities.id LEFT JOIN embeddings ON embeddings.item_id = items.id` // TODO: It's possible that we could move all these (ToAttributeID, ToEntityID, rootItemsOnly) into RelationParams if len(params.ToAttributeID) > 0 || len(params.ToEntityID) > 0 { q += ` JOIN relationships ON relationships.from_item_id = items.id` } else if rootItemsOnly || len(params.Relations) > 0 { q += ` LEFT JOIN relationships ON relationships.to_item_id = items.id LEFT JOIN relations ON relations.id = relationships.relation_id` } // build the WHERE in terms of groups of OR's that are AND'ed together var args []any var clauseCount int and := func(ors func()) { clauseCount = 0 if len(args) == 0 { q += "\n\t\tWHERE" } else { if params.OrFields { q += " OR" } else { q += " AND" } } q += " (" ors() q += ")" // if the clause turned out to be empty, // this is a poor-man's way of undoing it q = strings.TrimSuffix(q, " OR ()") q = strings.TrimSuffix(q, " AND ()") q = strings.TrimSuffix(q, "\n\t\tWHERE ()") } or := func(clause string, val any) { if clauseCount > 0 { q += " OR " } q += clause args = append(args, val) clauseCount++ } and(func() { if params.RowID != nil && len(params.RowID) == 0 { or("items.id IS ?", nil) } for _, v := range params.RowID { or("items.id=?", v) } }) and(func() { if params.DataSourceName != nil && len(params.DataSourceName) == 0 { or("data_source_name IS ?", nil) } for _, v := range params.DataSourceName { or("data_source_name=?", v) } }) and(func() { if params.JobID != nil && len(params.JobID) == 0 { or("items.job_id IS ?", nil) } for _, v := range params.JobID { or("items.job_id=?", v) } }) and(func() { if params.AttributeID != nil && len(params.AttributeID) == 0 { or("items.attribute_id IS ?", nil) } for _, v := range params.AttributeID { or("items.attribute_id=?", v) } }) and(func() { if params.EntityID != nil && len(params.EntityID) == 0 { or("entities.id IS ?", nil) } for _, v := range params.EntityID { or("entities.id=?", v) } }) and(func() { for _, v := range params.classificationIDs { if v == 0 { or("items.classification_id IS ?", nil) } else { or("items.classification_id=?", v) } } }) and(func() { if params.OriginalID != nil && len(params.OriginalID) == 0 { or("items.original_id IS ?", nil) } for _, v := range params.OriginalID { or("items.original_id=?", v) } }) and(func() { if params.DataType != nil && len(params.DataType) == 0 { or("items.data_type IS ?", nil) } for _, v := range params.DataType { if strings.HasSuffix(v, "/*") { // useful for searching for all "video/*" files, for example or("items.data_type LIKE ? || '%'", v[:len(v)-1]) } else { or("items.data_type=?", v) } } }) and(func() { if params.DataText != nil && len(params.DataText) == 0 { or("items.data_text IS ?", nil) } for _, v := range params.DataText { or("items.data_text LIKE '%' || ? || '%'", v) } }) and(func() { if params.DataFile != nil && len(params.DataFile) == 0 { or("items.data_file IS ?", nil) } for _, v := range params.DataFile { or("items.data_file=?", v) } }) and(func() { // TODO: these can probably be in the same 'AND' group like this, right? for _, v := range params.ToAttributeID { or("relationships.to_attribute_id=?", v) } for _, v := range params.ToEntityID { or("relationships.to_entity_id=?", v) } }) // TODO: Use BETWEEN maybe if params.StartTimestamp != nil { and(func() { or("items.timestamp"+gt+" ?", params.StartTimestamp.UTC().UnixMilli()) if !params.StrictStartTimestamp { // if not strict, allow items to spill into the window even if the started before it or("items.timespan "+gt+" ?", params.StartTimestamp.UTC().UnixMilli()) } }) } if params.EndTimestamp != nil { and(func() { or("items.timestamp "+lt+" ?", params.EndTimestamp.UTC().UnixMilli()) }) if params.StrictEndTimestamp { // if strict, items' timespan must end before the EndTimestamp (item can't merely start before it) and(func() { or("items.timespan IS NULL OR items.timespan "+lt+" ?", params.EndTimestamp.UTC().UnixMilli()) }) } } if params.NoLocation { and(func() { or("items.latitude IS ?", nil) }) and(func() { or("items.longitude IS ?", nil) }) and(func() { or("items.altitude IS ?", nil) }) and(func() { or("items.coordinate_system IS ?", nil) }) } else { if params.MinLatitude != nil { and(func() { or("items.latitude "+gt+" ?", params.MinLatitude) }) } if params.MaxLatitude != nil { and(func() { or("items.latitude "+lt+" ?", params.MaxLatitude) }) } if params.MinLongitude != nil { and(func() { or("items.longitude "+gt+" ?", params.MinLongitude) }) } if params.MaxLongitude != nil { and(func() { or("items.longitude "+lt+" ?", params.MaxLongitude) }) } } // skip deleted items unless we are explicitly supposed to include them // (NOTE: we include them if the specific row IDs are requested) if !params.Deleted && len(params.RowID) == 0 { and(func() { or("items.deleted IS ?", nil) }) } // always skip hidden items and(func() { or("items.hidden IS ?", nil) }) // skip every so many items if sampling is enabled if params.Sample > 1 { and(func() { or("items.id % ? = 0", params.Sample) }) } if rootItemsOnly { // select only items which are not dependent on other items, i.e., items // that are not at the end of a directed relation from another item and(func() { or("relations.directed != ?", 1) or("relations.subordinating = ?", 0) or("relationships.to_item_id IS ?", nil) }) } // factor in relationships for _, relParam := range params.Relations { op := "IS" if relParam.Not { op = "IS NOT" } and(func() { if relParam.RelationLabel != "" { or("relations.label "+op+" ?", relParam.RelationLabel) } }) } if !params.OnlyTotal { q += "\n\t\tGROUP BY items.id" } // don't put order in the temporary table, since we'll be ordering by vector distance if !vectorSearch { if params.Sort != SortNone { q += "\n\t\tORDER BY " // TODO: not sure if this is how autocomplete will work or be useful, but basically // this sorts by data text so that if it's a prefix, it's weighed higher in the // sort, and if it's a suffix then put it at the end; i.e. favor term at beginning // of words instead of end... I think that's what this does, at least if params.PreferPrefix && len(params.DataText) == 1 { q += `CASE WHEN items.data_text LIKE ? || '%' THEN 1 WHEN items.data_text LIKE '%' || ? THEN 3 ELSE 2 END, ` args = append(args, params.DataText[0], params.DataText[0]) } // sort sortDir := strings.ToUpper(string(params.Sort)) if sortDir == "" { // smart sort: sort ASC by default if only "minimum" side of bounding boxes are specified if (params.StartTimestamp != nil || params.MinLatitude != nil || params.MinLongitude != nil) && params.EndTimestamp == nil && params.MaxLatitude == nil && params.MaxLongitude == nil { sortDir = string(SortAsc) } else { sortDir = string(SortDesc) } } if sortDir != string(SortAsc) && sortDir != string(SortDesc) { return "", nil, fmt.Errorf("invalid sort direction: %s", sortDir) } // location proximity search is only an approximation for simplicity // see https://stackoverflow.com/a/39298241/1048862 switch { case params.Latitude != nil && params.Longitude != nil: // nearest to location; account for shortened distances at poles // math.Cos() takes radians, hence the conversion to radians inside the cosine cosLat2 := math.Pow(math.Cos(*params.Latitude*math.Pi/180.0), 2) //nolint:mnd sortDir = string(SortAsc) // always sort ascending for nearest q += "((?-items.latitude) * (?-items.latitude)) + ((?-items.longitude) * (?-items.longitude) * ?), items.id " + sortDir args = append(args, params.Latitude, params.Latitude, params.Longitude, params.Longitude, cosLat2) case params.Timestamp != nil: // nearest to timestamp sortDir = string(SortAsc) // always sort ascending for nearest q += "abs(?-items.timestamp), items.id " + sortDir args = append(args, params.Timestamp.UnixMilli()) case params.OrderBy == "stored": q += "items.stored " + sortDir default: // generic sort, which is timestamp and row ID q += fmt.Sprintf("items.timestamp %s, items.id %s", sortDir, sortDir) } } // limit if !params.OnlyTotal { if params.Limit == 0 { params.Limit = 1000 } if params.Limit > 0 { q += "\n\t\tLIMIT ?" args = append(args, params.Limit) } if params.Offset > 0 { q += "\n\t\tOFFSET ?" args = append(args, params.Offset) } } } if vectorSearch { var targetVectorClause string if params.SimilarTo > 0 { targetVectorClause = "(SELECT embedding FROM embeddings JOIN items ON embeddings.item_id = items.id WHERE items.id=? LIMIT 1)" args = append(args, params.SimilarTo) } else if params.SemanticText != "" { // search relative to an arbitrary input (TODO: support image inputs too) - python server must be online if !pythonServerReady(ctx, false) { return "", nil, errors.New("python server not ready") } embedding, err := generateEmbedding(ctx, "text/plain", []byte(params.SemanticText), nil) if err != nil { return "", nil, err } targetVectorClause = "?" args = append(args, string(embedding)) } q += fmt.Sprintf(` ) SELECT search_results.*, vec_distance_l2(%s, embeddings.embedding) AS distance FROM search_results JOIN embeddings ON embeddings.item_id = search_results.id ORDER BY distance`, targetVectorClause) if params.Limit == 0 { params.Limit = 100 } if params.Limit > 0 { q += "\nLIMIT ?" args = append(args, params.Limit) } if params.Offset > 0 { q += "\nOFFSET ?" args = append(args, params.Offset) } } return q, args, nil } func (tl *Timeline) expandRelationships(ctx context.Context, tx *sql.Tx, degrees int, sr *SearchResult) error { if degrees <= 0 { return nil } // notice how we're careful to avoid recursion while we have open rows // scanning happening, so that we don't step on other select queries err := tl.expandRelationshipSingle(ctx, tx, sr) if err != nil { return err } // now expand relationships recursively until degrees of separation // from the original search results are reached for _, rel := range sr.Related { if rel.FromItem != nil { err := tl.expandRelationships(ctx, tx, degrees-1, rel.FromItem) if err != nil { return err } } if rel.ToItem != nil { err := tl.expandRelationships(ctx, tx, degrees-1, rel.ToItem) if err != nil { return err } } } return nil } func (tl *Timeline) expandRelationshipSingle(ctx context.Context, tx *sql.Tx, sr *SearchResult) error { // TODO: limit here is arbitrary I think... ho hum rows, err := tx.QueryContext(ctx, ` SELECT relationships.id, relations.directed, relations.label, relationships.value, relationships.start, relationships.end, relationships.metadata, relationships.from_item_id, relationships.to_item_id, from_entity.id, from_entity.name, from_entity.picture_file, from_attr.name, from_attr.value, from_attr.alt_value, to_entity.id, to_entity.name, to_entity.picture_file, to_attr.name, to_attr.value, to_attr.alt_value FROM relationships JOIN items ON items.id=? JOIN relations ON relations.id=relationships.relation_id LEFT JOIN attributes AS from_attr ON from_attr.id=relationships.from_attribute_id LEFT JOIN attributes AS to_attr ON to_attr.id=relationships.to_attribute_id LEFT JOIN entity_attributes AS from_ea ON from_ea.attribute_id = from_attr.id LEFT JOIN entity_attributes AS to_ea ON to_ea.attribute_id = to_attr.id LEFT JOIN entities AS from_entity ON from_entity.id = from_ea.entity_id LEFT JOIN entities AS to_entity ON to_entity.id = to_ea.entity_id WHERE (relationships.from_item_id=? OR relationships.to_item_id=?) AND items.hidden IS NULL GROUP BY relationships.id LIMIT 10`, sr.ItemRow.ID, sr.ItemRow.ID, sr.ItemRow.ID) if err != nil { return fmt.Errorf("querying db for relationships: %w", err) } defer rows.Close() for rows.Next() { var rel Related var fromItemID, toItemID *uint64 var relStart, relEnd *int64 var fromEntity, toEntity relatedEntity var relMeta *string err := rows.Scan(&rel.RelationshipID, &rel.Directed, &rel.Label, &rel.Value, &relStart, &relEnd, &relMeta, &fromItemID, &toItemID, &fromEntity.ID, &fromEntity.Name, &fromEntity.Picture, &fromEntity.Attribute.Name, &fromEntity.Attribute.Value, &fromEntity.Attribute.AltValue, &toEntity.ID, &toEntity.Name, &toEntity.Picture, &toEntity.Attribute.Name, &toEntity.Attribute.Value, &toEntity.Attribute.AltValue) if err != nil { return err } if fromEntity.ID != nil { rel.FromEntity = &fromEntity } if toEntity.ID != nil { rel.ToEntity = &toEntity } if relStart != nil { startTime := time.Unix(*relStart, 0) rel.Start = &startTime } if relEnd != nil { endTime := time.Unix(*relEnd, 0) rel.End = &endTime } if relMeta != nil { rel.Metadata = json.RawMessage(*relMeta) } // expand items, we have to do this until our code can support // loading an item from among other columns as well; expand // only if the item is distinct from the parent/starting item if fromItemID != nil && *fromItemID != sr.ID { fromRel, err := tl.loadRelatedItem(ctx, tx, *fromItemID) if err != nil { return fmt.Errorf("loading from_item: %w", err) } rel.FromItem = fromRel } if toItemID != nil && *toItemID != sr.ID { toRel, err := tl.loadRelatedItem(ctx, tx, *toItemID) if err != nil { return fmt.Errorf("loading to_item: %w", err) } rel.ToItem = toRel } sr.Related = append(sr.Related, rel) } if err = rows.Err(); err != nil { return fmt.Errorf("scanning related item rows: %w", err) } return nil } func (tl *Timeline) loadRelatedItem(ctx context.Context, tx *sql.Tx, itemRowID uint64) (*SearchResult, error) { ir, err := tl.loadItemRow(ctx, tx, itemRowID, 0, nil, nil, nil, false) if err != nil { return nil, fmt.Errorf("loading related item row: %w", err) } // TODO: dunno if this safe (QueryRow during a rows.Scan, by the caller of this function) // TODO: if it would help increase performance, we could probably cache information about persons and person_identities... // TODO: maybe there's a view we could use for this instead? var p relatedEntity if ir.AttributeID != nil { err = tx.QueryRowContext(ctx, ` SELECT entities.id, entities.name, entities.picture_file, attributes.name, attributes.value, attributes.alt_value, attributes.longitude, attributes.latitude, attributes.altitude FROM attributes, entities JOIN entity_attributes ON entity_attributes.entity_id = entities.id AND entity_attributes.attribute_id = attributes.id WHERE attributes.id=?`, ir.AttributeID).Scan(&p.ID, &p.Name, &p.Picture, &p.Attribute.Name, &p.Attribute.Value, &p.Attribute.AltValue, &p.Attribute.Longitude, &p.Attribute.Latitude, &p.Attribute.Altitude) if err != nil { return nil, fmt.Errorf("loading related entity: %w", err) } } sr := &SearchResult{RepoID: tl.id.String(), ItemRow: ir} if p.ID != nil { sr.Entity = &p } return sr, nil } type SearchResult struct { RepoID string `json:"repo_id,omitempty"` ItemRow HasEmbedding bool `json:"has_embedding,omitempty"` Entity *relatedEntity `json:"entity,omitempty"` Related []Related `json:"related,omitempty"` Size int64 `json:"size,omitempty"` // from ML model Distance float64 `json:"distance,omitempty"` Score float64 `json:"score,omitempty"` } // RelationParams describes a search using item relations. type RelationParams struct { Not bool `json:"not,omitempty"` // if true, only match items that do NOT have this relation RelationLabel string `json:"relation_label,omitempty"` } // relatedEntity is a subset of Entity (so we need to make sure // the JSON field names are the same), but with pointer field // types because it is left-joined in queries which means they // can be null. type relatedEntity struct { ID *uint64 `json:"id"` Name *string `json:"name,omitempty"` Picture *string `json:"picture,omitempty"` Attribute nullableAttribute `json:"attribute,omitempty"` // TODO: experimental } // nullableAttribute is like Attribute but with nullable fields // so that it can be I/O for the database type nullableAttribute struct { ID *uint64 `json:"id,omitempty"` Name *string `json:"name,omitempty"` Value *string `json:"value,omitempty"` AltValue *string `json:"alt_value,omitempty"` Latitude *float64 `json:"latitude,omitempty"` Longitude *float64 `json:"longitude,omitempty"` Altitude *float64 `json:"altitude,omitempty"` } func (na nullableAttribute) attribute() Attribute { var a Attribute if na.ID != nil { a.ID = *na.ID } if na.Name != nil { a.Name = *na.Name } if na.Value != nil { a.Value = *na.Value } if na.AltValue != nil { a.AltValue = *na.AltValue } a.Longitude = na.Longitude a.Latitude = na.Latitude a.Altitude = na.Altitude return a } type Related struct { Relation RelationshipID int `json:"relationship_id"` Value *string `json:"value,omitempty"` FromItem *SearchResult `json:"from_item,omitempty"` ToItem *SearchResult `json:"to_item,omitempty"` FromEntity *relatedEntity `json:"from_entity,omitempty"` ToEntity *relatedEntity `json:"to_entity,omitempty"` Start *time.Time `json:"start,omitempty"` End *time.Time `json:"end,omitempty"` Metadata json.RawMessage `json:"metadata,omitempty"` } type SortDir string const ( SortNone SortDir = "none" // special value that means to not include a ORDER BY clause SortAsc SortDir = "ASC" SortDesc SortDir = "DESC" )