1
0
Fork 0
timelinize/timeline/maintenance.go
Matthew Holt 03d126ed68
Use PRAGMA optimize instead of ANALYZE
This is supposedly a smarter way to do ANALYZE, as it only analyzes what sqlite thinks is needful. Should hopefully address some reports of too-frequent, long-running analyze queries.

There was one time I noticed that the pragma didn't improve query plans, until I ran analyze specifically which did improve it, but that was using the old DB connection model where I had a single pool of mixed readers/writers, so maybe it's possible that the new pooling style (separate r/w pools) also addresses that, I dunno.
2025-10-23 11:30:05 -06:00

290 lines
10 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package timeline
import (
"context"
"database/sql"
"fmt"
"strings"
"sync/atomic"
"time"
"go.uber.org/zap"
)
// maintenanceLoop runs various operations on the timeline while it is open.
func (tl *Timeline) maintenanceLoop() {
logger := Log.Named("maintenance")
err := tl.deleteExpiredItems(logger)
if err != nil {
logger.Error("problem deleting expired items at startup", zap.Error(err))
}
// Best practice, according to the SQLite docs:
//
// "Applications with long-lived database connections should run "PRAGMA
// optimize=0x10002" when the database connection first opens, then run
// "PRAGMA optimize" again at periodic intervals - perhaps once per day
// or even once per hour. All applications should run "PRAGMA optimize"
// after schema changes, especially CREATE INDEX."
// - https://www.sqlite.org/pragma.html#pragma_optimize
// - https://sqlite.org/lang_analyze.html#autoanalyze
//
// We used to just run ANALYZE, as it was fast enough for me, but
// some users reported extreme slowness (several minutes of blocking
// every few minutes), so we're back to following the instructions.
// (I'm a stubborn learner.)
//
// The docs do repeatedly say "per/each connection," and recommend for
// connection close, but we don't have control over that with Go's
// database/sql APIs, which pools connections. So we just hope the
// pooled connection, with its single writer, is sufficient, even
// though we have readers in another pool.
//
// https://x.com/mholt6/status/1865169910940471492
// --> https://x.com/carlsverre/status/1865185078067835167 (whole thread)
go tl.optimizeDB(logger)
// also optimize thumbnails DB while we're at it
go func() {
logger.Info("optimizing thumbnails database for performance")
start := time.Now()
_, err = tl.thumbs.WritePool.ExecContext(tl.ctx, "PRAGMA optimize")
if err != nil {
logger.Error("optimizing thumbnails database: %w", zap.Error(err))
}
logger.Info("finished optimizing thumbnails database", zap.Duration("duration", time.Since(start)))
}()
deletionTicker := time.NewTicker(time.Minute)
defer deletionTicker.Stop()
const analyzeInterval = 24 * time.Hour
analyzeTicker := time.NewTicker(analyzeInterval)
defer analyzeTicker.Stop()
for {
select {
case <-tl.ctx.Done():
return
case <-deletionTicker.C:
err := tl.deleteExpiredItems(logger)
if err != nil {
logger.Error("problem deleting expired items", zap.Error(err))
}
case <-analyzeTicker.C:
go tl.optimizeDB(logger)
}
}
}
// optimizeDB runs 'PRAGMA optimize' on the database.
// Recommended to run in a goroutine.
func (tl *Timeline) optimizeDB(logger *zap.Logger) {
// don't overlap if optimization is already running
if !atomic.CompareAndSwapInt64(tl.optimizing, 0, 1) {
return
}
defer atomic.CompareAndSwapInt64(tl.optimizing, 1, 0)
logger.Info("optimizing database for performance")
start := time.Now()
_, err := tl.db.WritePool.ExecContext(tl.ctx, "PRAGMA optimize")
if err != nil {
logger.Error("analyzing database: %w", zap.Error(err))
}
logger.Info("finished optimizing database", zap.Duration("duration", time.Since(start)))
}
// deleteExpiredItems finds items marked as deleted that have passed their retention period
// and actually erases them.
func (tl *Timeline) deleteExpiredItems(logger *zap.Logger) error {
tx, err := tl.db.WritePool.BeginTx(tl.ctx, nil)
if err != nil {
return fmt.Errorf("beginning transaction: %w", err)
}
defer tx.Rollback()
// first identify which items are ready to be erased; we need their row
// IDs and data files (we could do the erasure in a single UPDATE query,
// but we do need to get their data files first so we can delete those after)
rowIDsToEmpty, dataFilesToDelete, err := tl.findExpiredDeletedItems(tl.ctx, tx)
if err != nil {
return fmt.Errorf("finding expired deleted items: %w", err)
}
if len(rowIDsToEmpty) == 0 && len(dataFilesToDelete) == 0 {
return nil // nothing to do
}
// clear out their rows
err = tl.deleteDataInItemRows(tl.ctx, tx, rowIDsToEmpty, false)
if err != nil {
return fmt.Errorf("erasing deleted items (before deleting data files): %w", err)
}
// commit transaction so that the items in the DB are at least marked as
// deleted; if deleting any of the data files fails, we'll log it, but
// there's no good way to roll back the transaction for only the items
// of which the data file failed to delete (TODO: maybe we need a sweeper routine to clean up zombie/stray data files)
// this way the DB remains the source of truth
if err = tx.Commit(); err != nil {
return fmt.Errorf("committing transaction (no data files have been deleted yet): %w", err)
}
// now that the database shows the new truth, delete the data files to match
numFilesDeleted, err := tl.deleteRepoFiles(tl.ctx, logger, dataFilesToDelete)
if err != nil {
logger.Error("error when deleting data files of erased items (items have already been marked as deleted in DB)", zap.Error(err))
}
if len(rowIDsToEmpty) > 0 || numFilesDeleted > 0 {
logger.Info("erased deleted items",
zap.Int("count", len(rowIDsToEmpty)),
zap.Int("deleted_data_files", numFilesDeleted))
}
return nil
}
func (tl *Timeline) deleteThumbnails(ctx context.Context, itemRowIDs []uint64, dataFiles []string) error {
thumbsTx, err := tl.thumbs.WritePool.BeginTx(ctx, nil)
if err != nil {
return fmt.Errorf("beginning thumbnail transaction: %w", err)
}
defer thumbsTx.Rollback()
for _, itemID := range itemRowIDs {
_, err = thumbsTx.ExecContext(ctx, `DELETE FROM thumbnails WHERE item_id=?`, itemID)
if err != nil {
return fmt.Errorf("unable to delete thumbnail row for erased item %d: %w", itemID, err)
}
}
for _, dataFile := range dataFiles {
_, err = thumbsTx.ExecContext(ctx, `DELETE FROM thumbnails WHERE data_file=?`, dataFile)
if err != nil {
return fmt.Errorf("unable to delete thumbnail row for data file %s: %w", dataFile, err)
}
}
return thumbsTx.Commit()
}
func (tl *Timeline) findExpiredDeletedItems(ctx context.Context, tx *sql.Tx) (rowIDs []uint64, dataFilesToDelete []string, err error) {
now := time.Now().Unix()
// this query selects the rows that are pending deletion ("in the trash") and returns their
// row ID, the data file path, and the number of other items that reference the same data
// file. The ("scalar"?) subquery is very specific to select exact matches of the data file
// that aren't the current row and aren't being deleted at this time (because, of course, if
// the other items using it are also being deleted at this time, we can still delete the file).
rows, err := tx.QueryContext(ctx, `
SELECT
id, data_file,
(SELECT count() FROM items AS otherItems
WHERE otherItems.data_file = items.data_file
AND otherItems.id != items.id
AND (deleted IS NULL OR deleted > ?)
LIMIT 1)
AS other_items_using_data_file
FROM items
WHERE deleted > 1 AND deleted <= ?`, now, now)
if err != nil {
return nil, nil, fmt.Errorf("querying for deleted items: %w", err)
}
defer rows.Close()
// it's possible, though uncommon, for multiple items being deleted in this batch to
// share the same data file, so tidy up duplicates while we go
dataFilesMap := make(map[string]struct{})
for rows.Next() {
var id uint64
var dataFile *string
var otherItemsUsingFile int
if err := rows.Scan(&id, &dataFile, &otherItemsUsingFile); err != nil {
return nil, nil, fmt.Errorf("scanning item row: %w", err)
}
rowIDs = append(rowIDs, id)
if dataFile != nil && otherItemsUsingFile == 0 {
dataFilesMap[*dataFile] = struct{}{}
}
}
if err = rows.Err(); err != nil {
return nil, nil, fmt.Errorf("iterating item rows: %w", err)
}
// convert de-duplicated map entries into a slice
for dataFile := range dataFilesMap {
dataFilesToDelete = append(dataFilesToDelete, dataFile)
}
return
}
func (tl *Timeline) deleteDataInItemRows(ctx context.Context, tx *sql.Tx, rowIDs []uint64, preserveUserNotes bool) error {
if len(rowIDs) == 0 {
return nil
}
var sb strings.Builder
// Empty all the columns except for id, deleted, and row hashes (and possibly note).
// Set deleted to 1 to signify we have erased the row and completed the deletion.
// Keep id unchanged to preserve relationships. (TODO: This could be configurable in the future.)
// Keep the row hashes to remember the signature(s) of what was deleted.
sb.WriteString(`UPDATE items
SET data_source_id=NULL, job_id=NULL, modified_job_id=NULL, attribute_id=NULL,
classification_id=NULL, original_id=NULL, original_location=NULL, intermediate_location=NULL,
filename=NULL, timestamp=NULL, timespan=NULL, timeframe=NULL, time_offset=NULL, time_uncertainty=NULL,
stored=0, modified=NULL, data_type=NULL, data_text=NULL, data_file=NULL, data_hash=NULL,
metadata=NULL, longitude=NULL, latitude=NULL, altitude=NULL, coordinate_system=NULL,
coordinate_uncertainty=NULL, `)
if !preserveUserNotes {
sb.WriteString("note=NULL, ")
}
sb.WriteString("starred=NULL, hidden=NULL, deleted=1 WHERE id IN ")
// write the row IDs into the WHERE clause
array, args := sqlArray(rowIDs)
sb.WriteString(array)
_, err := tx.ExecContext(ctx, sb.String(), args...)
if err != nil {
return fmt.Errorf("erasing item rows: %w", err)
}
return nil
}
func (tl *Timeline) deleteRepoFiles(ctx context.Context, logger *zap.Logger, dataFilesToDelete []string) (int, error) {
for _, dataFile := range dataFilesToDelete {
if err := ctx.Err(); err != nil {
return 0, err
}
if err := tl.deleteRepoFile(dataFile); err != nil {
logger.Warn("deleting file within repo",
zap.String("repo_file", dataFile),
zap.Error(err))
}
}
return len(dataFilesToDelete), nil
}