timelinize/timeline/schema.sql

/*
	Timelinize
	Copyright (c) 2013 Matthew Holt

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published
	by the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

-- Generic key/value store for the repo itself (version, ID, etc), and settings.
CREATE TABLE IF NOT EXISTS "repo" (
	"key" TEXT PRIMARY KEY,
	"value",
	"type" TEXT -- hint to the application since all values come back as strings - TODO: can we just infer this? That would be much nicer, but could also be problematic
) WITHOUT ROWID;

-- A data source is where data comes from, like a content provider, like a cloud photo service,
-- social media site, or exported archive format. This table primarily exists to provide data
-- consistency for rows that refer to a data source enforced by the DB itself, using more compact
-- IDs than the program's string names.
CREATE TABLE IF NOT EXISTS "data_sources" (
	"id" INTEGER PRIMARY KEY, -- row ID, used only for DB consistency, applies only to this DB
	"name" TEXT NOT NULL UNIQUE, -- programming ID, e.g. "google_photos" as used in the application code and across different timeline DBs
	"title" TEXT NOT NULL, -- human-readable name/label
	"description" TEXT,
	"media" BLOB, -- icon/image, square dimensions work best
	"media_type" TEXT, -- MIME type of the icon
	"standard" BOOLEAN NOT NULL DEFAULT false -- true for data sources that come hard-coded with the app
);

CREATE TABLE IF NOT EXISTS "jobs" (
	"id" INTEGER PRIMARY KEY,
	"type" TEXT NOT NULL, -- import, thumbnails, etc...
	"name" TEXT, -- an optional user-assigned name
	"configuration" TEXT, -- encoded as JSON
	"hash" BLOB, -- for preventing duplicate jobs; opaque to everything except the code creating the job
	"state" TEXT NOT NULL DEFAULT 'queued', -- queued, started, paused, aborted, succeeded, failed
	"hostname" TEXT, -- hostname of the machine the job was created and configured on
	"created" INTEGER NOT NULL DEFAULT (cast(round(unixepoch('subsec')*1000) AS INTEGER)), -- timestamp job was stored/enqueued in unix milliseconds UTC
	"updated" INTEGER, -- timestamp of last DB sync (in unix milliseconds UTC)
	"start" INTEGER, -- timestamp job was actually started in unix milliseconds UTC *could be future, so not called "started")
	"ended" INTEGER, -- timestamp in unix milliseconds UTC when job ended (not paused)
	"message" TEXT, -- brief message describing current status to be shown to the user, changes less frequently than log emissions
	"total" INTEGER, -- total number of units to complete
	"progress" INTEGER, -- number of units completed towards the total count
	"checkpoint" TEXT, -- required state for resuming an incomplete job, as a JSON serialization
	-- if job is scheduled to run automatically at a certain interval, the following fields track that state
	"repeat" INTEGER, -- when this job is started, next job should be scheduled (inserted for future start) this many seconds from start time (not to be started if previous still running)
	"parent_job_id" INTEGER, -- the job before this one that scheduled or created this one, forming a chain or linked list
	FOREIGN KEY ("parent_job_id") REFERENCES "jobs"("id") ON UPDATE CASCADE ON DELETE SET NULL
) STRICT;

-- Entity type names are hard-coded (but their IDs are not).
CREATE TABLE IF NOT EXISTS "entity_types" (
	"id" INTEGER PRIMARY KEY,
	"name" TEXT NOT NULL UNIQUE
) STRICT;

-- An entity represents a person, place, or thing (i.e. nouns). The vast majority will be
-- persons, but some may be pets/animals, companies and organizations, political
-- entities, or landmarks/places. It is not uncommon for duplicate entities to appear in
-- this table through various data imports, because each data source has a different way
-- of identifying or describing users/entities. Manual curation or post-processing may be
-- required to reconcile duplicates.
CREATE TABLE IF NOT EXISTS "entities" (
	"id" INTEGER PRIMARY KEY,
	"type_id" INTEGER NOT NULL,
	"job_id" INTEGER, -- import that originally created this entity, if via a data import
	"stored" INTEGER NOT NULL DEFAULT (unixepoch()),
	"modified" INTEGER, -- timestamp when entity was locally/manually modified (may include linked attributes)
	"name" TEXT COLLATE NOCASE,
	"picture_file" TEXT,
	"metadata" TEXT, -- optional extra information, encoded as JSON (should almost never be used! use attributes instead)
	"hidden" INTEGER, -- if owner would like to forget about this person, don't show in search results, etc.
	"deleted" INTEGER, -- timestamp when item was moved to trash and can be purged after some amount of time after that
	FOREIGN KEY ("type_id") REFERENCES "entity_types"("id") ON UPDATE CASCADE,
	FOREIGN KEY ("job_id") REFERENCES "jobs"("id") ON UPDATE CASCADE ON DELETE SET NULL
) STRICT;

-- Attributes describe entities. An attribute has a name and a value. The name for a particular
-- kind of attribute should be consistent throughout because some attributes need special handling
-- or can be normalized in the code (e.g. phone numbers). A single attribute may describe one or
-- more entities; attributes that don't link to any entities (via the entity_attributes table)
-- should not exist (see the relevant trigger for this). An attribute may identify an entity,
-- for example, a phone number or email address is kind of a globally-unique identifier; or an
-- attribute named "twitter_id" for example would identify an entity on Twitter. An attribute
-- that is an identity for one entity may not be for another, so that is encoded in the
-- entity_attributes table instead. The special attribute named "_entity" acts as a pass-thru
-- attribute directly to the entity by ID; this is useful if there's no particular attribute
-- relevant or known, and instead we want to specify an entity directly instead.
-- TODO: For ancestors, their identifying attribute could be a FamilySearch record ID.
CREATE TABLE IF NOT EXISTS "attributes" (
	"id" INTEGER PRIMARY KEY,
	"name" TEXT NOT NULL,
	"value" ANY COLLATE NOCASE,
	"alt_value" TEXT, -- optional alternate value intended for display or as a description
	-- the coordinate values below are useful if the attribute value is, in fact, a location
	"longitude" REAL,
	"latitude" REAL,
	"altitude" REAL,
	"metadata" TEXT,  -- optional extra info encoded as JSON
	UNIQUE ("name", "value")
) STRICT;

-- This table links entities and attributes. An entity may have multiple attributes, and an attribute
-- may apply to multiple entities. If the attribute represents an entity's identity on a particular
-- data source, the data_source_id field will be filled out.
CREATE TABLE IF NOT EXISTS "entity_attributes" (
	"id" INTEGER PRIMARY KEY,
	"entity_id" INTEGER NOT NULL,
	"attribute_id" INTEGER NOT NULL,
	"data_source_id" INTEGER, -- if set, the attribute defines the entity's identity on this data source (a row of a certain entity_id and attribute_id can be duplicated if they are identities on different data sources)
	"job_id" INTEGER, -- the ID of the import that originated this row
	"start" INTEGER, -- when the attribute started applying to the entity
	"end" INTEGER,   -- when the attribute stopped applying to the entity
	FOREIGN KEY ("entity_id") REFERENCES "entities"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("attribute_id") REFERENCES "attributes"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("data_source_id") REFERENCES "data_sources"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("job_id") REFERENCES "jobs"("id") ON UPDATE CASCADE ON DELETE SET NULL,
	UNIQUE("entity_id", "attribute_id", "data_source_id")
) STRICT;

-- These indexes make loading entities by attributes faster, especially during import jobs, when it's in the hot path.
-- (If the query plan still shows "SCAN entities" then it might be due to the analyzer
-- determining that's faster than using the index; can test the index by adding "INDEXED BY...")
CREATE INDEX IF NOT EXISTS "idx_entities_type_name" ON "entities"("type_id", "name");
CREATE INDEX IF NOT EXISTS "idx_attributes_name_value" ON "attributes"("name", "value");
CREATE INDEX IF NOT EXISTS "idx_ea_entity_attr" ON "entity_attributes"("attribute_id", "entity_id");

-- A classification describes what kind of thing an item is, for example a text message, tweet, email, or location.
CREATE TABLE IF NOT EXISTS "classifications" (
	"id" INTEGER PRIMARY KEY,
	"standard" INTEGER NOT NULL DEFAULT 0, -- 1: hard-coded classification, 0: user-created
	"name" TEXT NOT NULL UNIQUE,  -- programming name, e.g. "message"
	"labels" TEXT NOT NULL, -- comma-separated human-friendly names, e.g. "Text message,SMS,MMS,iMessage,Texting"
	"description" TEXT NOT NULL
) STRICT;

-- TODO: Not used currently (but is supported, and accessed in queries).
CREATE TABLE IF NOT EXISTS "item_data" (
	"id" INTEGER PRIMARY KEY,
	"content" BLOB NOT NULL UNIQUE
) STRICT;

-- An item is something imported from a specific data source. They represent things that are
-- originated, created, or recorded; as opposed to entities, which just exist and don't have
-- specific temporal significance. Items typically exist as a result of entities existing.
CREATE TABLE IF NOT EXISTS "items" (
	"id" INTEGER PRIMARY KEY,
	"data_source_id" INTEGER,
	"job_id" INTEGER, -- the import job that originally inserted this item
	"modified_job_id" INTEGER, -- the import job that most recently modified this existing item
	"attribute_id" INTEGER, -- owner, creator, or originator attributed to this item
	"classification_id" INTEGER,
	"original_id" TEXT, -- ID provided by the data source
	"original_location" TEXT,     -- path or location of the file/data on the original data source; should include filename if applicable
	"intermediate_location" TEXT, -- path or location of the file/data from the import dataset (e.g. after exporting from the data source); should include filename if application
	"filename" TEXT, -- name of the original file as named by the owner, if known
	"timestamp" INTEGER, -- unix epoch millisecond timestamp when item content was originally created (NOT when the database row was created)
	"timespan" INTEGER,  -- ending unix epoch ms timestamp if this item spans time (instead of being a single point in time); can be used in conjunction with timeframe to suggest duration
	"timeframe" INTEGER, -- ending unix epoch ms timestamp if this item takes place somewhere between timestamp and timeframe, but it's not certain exactly when
	"time_offset" INTEGER, -- offset of original timestamp in seconds east of UTC/GMT (time zone); null means "local time" or "wall time"
	"time_offset_origin" TEXT, -- if the offset (time zone) was explicitly part of the timestamp, this will be null; otherwise a byte indicating how we sourced/inferred the offset
	"time_uncertainty" INTEGER, -- if nonzero, time columns may be inaccurate on the order of this number of milliseconds, essentially sliding the times in a fuzzy interval
	"stored" INTEGER NOT NULL DEFAULT (unixepoch()), -- unix epoch second timestamp when row was created or last retrieved from source
	"modified" INTEGER, -- unix epoch second timestamp when item was manually modified (not via an import); if not null, then item is "not clean"
	"data_id" INTEGER, -- TODO: not used currently, but may be used to store binary data in the database directly
	"data_type" TEXT,  -- the MIME type (aka "media type") of the data
	"data_text" TEXT COLLATE NOCASE, -- item content, if text-encoded and (by default) not very long
	"data_file" TEXT COLLATE NOCASE, -- item filename, if non-text or not suitable for storage in DB (usually media), relative to repo root
	"data_hash" BLOB, -- BLAKE3 checksum of contents of the data file
	"metadata" TEXT,  -- optional extra information, encoded as JSON for flexibility
	"longitude" REAL, -- or equivalent X-coord for the coordinate system
	"latitude" REAL,  -- or equivalent Y-coord for the coordinate system
	"altitude" REAL,  -- or equivalent Z-coord for the coordinate system
	"coordinate_system" TEXT, -- reserved for future use to define which coordinate system is being used or on which world coordinates refer to
	"coordinate_uncertainty" REAL, -- if nonzero, lat/lon values may be inaccurate by this amount (same unit as coordinates)
	"note" TEXT,      -- optional user-added information
	"starred" INTEGER, -- like a bookmark; TODO: different numbers indicate different kinds of stars or something?
	"thumb_hash" BLOB, -- bytes of the ThumbHash (https://evanw.github.io/thumbhash/ and https://github.com/evanw/thumbhash) - prefixed with aspect ratio; stored here with item for fast, easy retrieval
	"original_id_hash" BLOB, -- a hash of the data source and original ID of the item, also used for duplicate detection, optionally stored when item is deleted
	"initial_content_hash" BLOB, -- a hash computed during initial import, used for duplicate detection (remains same even if item is modified by user)
	"retrieval_key" BLOB UNIQUE, -- an optional opaque value that indicates this item may not be fully populated in a single import; not an ID but still a unique identifier
	"hidden" INTEGER,  -- if owner would like to forget about this item, don't show it in search results, etc. TODO: keep?
	"deleted" INTEGER, -- 1 = if the columns will be erased, they have been erased; >1 = a unix epoch timestamp after which the columns can be erased
	FOREIGN KEY ("data_source_id") REFERENCES "data_sources"("id") ON UPDATE CASCADE,
	FOREIGN KEY ("job_id") REFERENCES "jobs"("id") ON UPDATE CASCADE, --TODO: maybe add ON DELETE CASCADE someday, which would rely on a regular sweeping of the files (garbage collection)! or we could do SET NULL
	FOREIGN KEY ("modified_job_id") REFERENCES "jobs"("id") ON UPDATE CASCADE ON DELETE SET NULL, -- deleting that import won't undo the changes, however
	FOREIGN KEY ("attribute_id") REFERENCES "attributes"("id") ON UPDATE CASCADE,
	FOREIGN KEY ("classification_id") REFERENCES "classifications"("id") ON UPDATE CASCADE,
	FOREIGN KEY ("data_id") REFERENCES "item_data"("id") ON UPDATE CASCADE ON DELETE SET NULL,
	UNIQUE ("data_source_id", "original_id")
) STRICT;

CREATE INDEX IF NOT EXISTS "idx_items_timestamp" ON "items"("timestamp"); -- the need for this is obvious
CREATE INDEX IF NOT EXISTS "idx_items_data_file" ON "items"("data_file"); -- used during imports especially for maintenance

-- These next two partial indexes greatly speed up processing when importing items, or any queries that
-- check for existing rows that may have been deleted or modified from their original content. Because
-- they are partial indexes, I have found that they do not greatly impact insert performance since most
-- new items are not deleted or modified going in.
CREATE INDEX IF NOT EXISTS "idx_items_deleted_original_id_hash" ON "items"("deleted", "original_id_hash") WHERE deleted IS NOT NULL OR original_id_hash IS NOT NULL;
CREATE INDEX IF NOT EXISTS "idx_items_initial_content_hash" ON "items"("initial_content_hash") WHERE modified IS NOT NULL OR deleted IS NOT NULL;

-- This partial index speeds up processing for items that have incoming data and we need to check for
-- existing item by its data (i.e. Collection items, very common with photo libraries; one per media item!)
CREATE INDEX IF NOT EXISTS "idx_items_data_text_hash" ON "items"("data_text" COLLATE NOCASE, "data_hash") WHERE data_text IS NOT NULL OR data_hash IS NOT NULL;

-- This partial index speeds up processing for items that have IDs assigned by their data source.
CREATE INDEX IF NOT EXISTS "idx_items_original_id" ON "items"("original_id") WHERE original_ID IS NOT NULL;

-- Relationships may exist between and across items and entities. A row
-- in this table is an actual connection between items and/or entities.
CREATE TABLE IF NOT EXISTS "relationships" (
	"id" INTEGER PRIMARY KEY,
 	"relation_id" INTEGER NOT NULL,
	"value" ANY, -- optional; some relationships make sense if they contain a value (like a reaction: what was the reaction? or a collection element, what is its position?)
	-- exactly one of the from_ fields and one of the to_ fields should be non-NULL
	"from_item_id" INTEGER,
	"from_attribute_id" INTEGER,
	"to_item_id" INTEGER,
	"to_attribute_id" INTEGER,
	-- the next two fields optionally describe when the relationship started and/or ended as unix epoch timestamp in seconds
	"start" INTEGER,
	"end" INTEGER,
	"metadata" TEXT, -- optional extra info encoded as JSON
	FOREIGN KEY ("relation_id") REFERENCES "relations"("id") ON UPDATE CASCADE,
	FOREIGN KEY ("from_item_id") REFERENCES "items"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("to_item_id") REFERENCES "items"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("from_attribute_id") REFERENCES "attributes"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("to_attribute_id") REFERENCES "attributes"("id") ON UPDATE CASCADE ON DELETE CASCADE
) STRICT;

-- This speeds up inserts because we check for duplicate relationships in the app (it's complex with potential timeframes),
-- and this way only 1 index is necessary, not multiple on the various fields we check for uniqueness
CREATE INDEX IF NOT EXISTS "idx_relationships_from_item_id" ON "relationships"("from_item_id");

-- Relations define the way relationships connect. They are described by natural
-- language phrases such as "in reply to", "picture of", or "attached to"; or could
-- be words like "siblings", "coworkers", or "related". Relations will either be
-- directed (one way; to/from matter) or bidirectional (both ways), which depends
-- on the phrase and whatever makes sense if you were to put it into a sentence.
CREATE TABLE IF NOT EXISTS "relations" (
	"id" INTEGER PRIMARY KEY,
	"label" TEXT NOT NULL UNIQUE, -- natural phrase
	"directed" BOOLEAN NOT NULL DEFAULT true, -- false = bidirectional
	"subordinating" BOOLEAN NOT NULL DEFAULT false -- if true, item on the end of directed relation does not make sense on its own, e.g. attachment or motion picture
); -- not STRICT due to boolean type

-- A story is a user-created document with rich text formatting (basically HTML;
-- including hrefs to other items/entities). It can also embed timeline data, which
-- embeddings are stored in the story_elements table.
CREATE TABLE IF NOT EXISTS "stories" (
	"id" INTEGER PRIMARY KEY,
	"name" TEXT,
	"description" TEXT,
	"content" TEXT COLLATE NOCASE,
	"content_type" TEXT NOT NULL, -- MIME type; "text/html", "text/markdown", etc -- whatever form the content is in
	"modified" INTEGER NOT NULL DEFAULT (unixepoch())
) STRICT;

-- Elements of stories. An element can be rich text (HTML; including hrefs to other items/entities);
-- an embedded item, entity, or collection; or a query that loads a dynamic set of items, entities, or
-- collections. Rich text content can be provided together with embedded data as an annotation.
CREATE TABLE IF NOT EXISTS "story_elements" (
	"id" INTEGER PRIMARY KEY,
	"story_id" INTEGER NOT NULL,
	-- only one of the following 5 fields should be populated
	"item_id" INTEGER,       -- embedded item
	"entity_id" INTEGER,     -- embedded entity
	"query" TEXT,            -- embedded dynamic set; omits SELECT, e.g. "FROM items|entities WHERE ... ORDER BY ... LIMIT ..."
	"external" TEXT,         -- embedded data from other timeline/repo DB, of this format: "<repo_id>:<query>|item|entity:rowid"
	"note" TEXT,             -- optional annotation of embedded content
	FOREIGN KEY ("story_id") REFERENCES "stories"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("item_id") REFERENCES "items"("id") ON UPDATE CASCADE ON DELETE SET NULL,
	FOREIGN KEY ("entity_id") REFERENCES "entities"("id") ON UPDATE CASCADE ON DELETE SET NULL
) STRICT;

-- A tag is a simple descriptor on an item, put there by the user for organizational purposes.
CREATE TABLE IF NOT EXISTS "tags" (
	"id" INTEGER PRIMARY KEY,
	"label" TEXT NOT NULL UNIQUE,
	"color" TEXT
) STRICT;

-- The tags associated with items, entities, stories, or relationships.
-- The same tag may not be applied multiple times to the same entity.
CREATE TABLE IF NOT EXISTS "tagged" (
	"id" INTEGER PRIMARY KEY,
	"tag_id" INTEGER NOT NULL,
	-- only one of the following ID fields should be populated per row
	"item_id" INTEGER,
	"entity_id" INTEGER,
	"attribute_id" INTEGER,
	"story_id" INTEGER,
	"relationship_id" INTEGER,
	"data" TEXT, -- could be used for rich relationships, e.g. the coordinates of a entity tagged in a photo, etc.
	-- TODO: a way to indicate if this tag came as part of an import from a data source?
	FOREIGN KEY ("tag_id") REFERENCES "tags"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("item_id") REFERENCES "items"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("entity_id") REFERENCES "entities"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("attribute_id") REFERENCES "attributes"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("story_id") REFERENCES "stories"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("relationship_id") REFERENCES "relationships"("id") ON UPDATE CASCADE ON DELETE CASCADE
) STRICT;

-- TODO: I might get rid of this. This table would become user-created commentary on items/entities... but those could be items themselves...
CREATE TABLE IF NOT EXISTS "notes" (
	"id" INTEGER PRIMARY KEY,
	-- only one of the following ID fields should be populated per row
	"item_id" INTEGER,
	"entity_id" INTEGER,
	"content" TEXT, -- a user-friendly description that doesn't fit the DB structure
	"metadata" TEXT, -- optional metadata structured as JSON
	FOREIGN KEY ("item_id") REFERENCES "items"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("entity_id") REFERENCES "entities"("id") ON UPDATE CASCADE ON DELETE CASCADE
) STRICT;

-- A log of changes, notices, warnings, or errors pertaining  to elements of the timeline.
CREATE TABLE IF NOT EXISTS "logs" (
	"id" INTEGER PRIMARY KEY,
	"level" INTEGER, -- 0=update/modification 1=notice 2=error
	"message" TEXT,
	"metadata" TEXT, -- optional metadata structured as JSON
	"timestamp" INTEGER, -- unix timestamp milliseconds
	-- only one of the following ID fields should be populated per row
	"job_id" INTEGER,
	"item_id" INTEGER,
	"entity_id" INTEGER,
	"attribute_id" INTEGER,
	"relationship_id" INTEGER,
	"entity_attribute_id" INTEGER,
	FOREIGN KEY ("job_id") REFERENCES "jobs"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("item_id") REFERENCES "items"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("entity_id") REFERENCES "entities"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("attribute_id") REFERENCES "attributes"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("relationship_id") REFERENCES "relationships"("id") ON UPDATE CASCADE ON DELETE CASCADE,
	FOREIGN KEY ("entity_attribute_id") REFERENCES "entity_attributes"("id") ON UPDATE CASCADE ON DELETE CASCADE
) STRICT;

-- Embeddings enable "intelligent" search using ML models to derive semantics and meaning.
-- By finding other embeddings that are close (dot product or euclidean distance),
-- similarity searches are possible. This requires the sqlite-vec module.
CREATE TABLE IF NOT EXISTS "embeddings" (
	"id" INTEGER PRIMARY KEY,
	"item_id" INTEGER,
	"generated" INTEGER NOT NULL DEFAULT (unixepoch()), -- when the embedding was generated (timestamp in unix seconds UTC)
	"embedding" BLOB, -- TODO: could define as float[768] (unless STRICT) and then use `check(typeof(contents_embedding) == 'blob' AND vec_length(contents_embedding) == 768)`
	FOREIGN KEY ("item_id") REFERENCES "items"("id") ON UPDATE CASCADE ON DELETE CASCADE
) STRICT;

CREATE INDEX IF NOT EXISTS "idx_embeddings_item_id" ON "embeddings"("item_id");

-- TODO: this is convenient -- will probably keep this, because the db-based enums like data sources and classifications
-- don't get translated earlier; maybe we could, but I still need to think on that... if we do keep this,
-- I wonder if it'd be useful to loop in the attribute name and value as well? for item de-duplication in loadItemRow()....
CREATE VIEW IF NOT EXISTS "extended_items" AS
	SELECT
		items.*,
		data_sources.name AS data_source_name,
		data_sources.title AS data_source_title,
		classifications.name AS classification_name,
		-- TODO: verify these are useful fields and make the dashboard queries more efficient
		cast(strftime('%j', date(round(timestamp/1000), 'unixepoch')) AS INTEGER) AS day_of_year,
		cast(strftime('%W', date(round(timestamp/1000), 'unixepoch')) AS INTEGER) AS week_of_year,
		cast(strftime('%m', date(round(timestamp/1000), 'unixepoch')) AS INTEGER) AS month_of_year
	FROM items
	LEFT JOIN data_sources ON data_sources.id = items.data_source_id
	LEFT JOIN classifications ON classifications.id = items.classification_id;

-- Don't allow stray attributes; if for any reason no entity_attributes rows
-- point to a given attribute, always delete the attribute to clean up. The
-- only reason we have them separated into two tables is to allow multiple
-- entities to share an attribute, but it's pointless for an attribute to not
-- be associated with any entities.
CREATE TRIGGER IF NOT EXISTS prevent_stray_attributes
	AFTER DELETE ON entity_attributes
	FOR EACH ROW
	WHEN
		(SELECT count(1) FROM
			(SELECT id FROM entity_attributes WHERE attribute_id=OLD.attribute_id LIMIT 1)) = 0
	BEGIN
		DELETE FROM attributes WHERE id=OLD.attribute_id;
	END;