timelinize/timeline/itemfiles.go

/*
	Timelinize
	Copyright (c) 2013 Matthew Holt

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published
	by the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

package timeline

import (
	"bytes"
	"context"
	"database/sql"
	"errors"
	"fmt"
	"io"
	"io/fs"
	weakrand "math/rand/v2"
	"mime"
	"os"
	"path"
	"path/filepath"
	"regexp"
	"sort"
	"strings"
	"time"

	"go.uber.org/zap"
)

// openUniqueCanonicalItemDataFile opens a file for saving the content of the given item. It
// ensures the filename is unique within its folder, but only according to the file system's
// case sensitivity. The transaction parameter is optional. If set, the DB will be consulted
// to ensure the chosen filename is case-insensitively unique, even if on a case-sensitive
// file system. That check relies on COLLATE NOCASE (i.e. it does a case-insensitive lookup in
// the DB, to ensure the timeline repo will transfer from case-sensitive file systems to
// case-insensitive ones). It returns the file handle as well as the path to the file
// relative to the repo root, which can be stored in the data_file column.
//
// The desired filename is tried first; if it is taken, up to nine more attempts are made
// with a short random suffix inserted before the extension. An error is returned if the
// data source ID is empty, if the directory or file cannot be created, if the DB lookup
// fails, or if no attempt yields an available name. On success the caller owns the
// returned file handle and is responsible for closing it.
//
// As a side effect, it.intendedDataFileName is set and the working-directory counter for
// the item's directory is incremented. This function never decrements that counter.
// NOTE(review): presumably the caller decrements it when done with the file -- confirm.
func (tl *Timeline) openUniqueCanonicalItemDataFile(ctx context.Context, logger *zap.Logger, tx *sql.Tx, it *Item, dataSourceID string) (*os.File, string, error) {
	if dataSourceID == "" {
		return nil, "", errors.New("missing data source ID")
	}

	dir := tl.canonicalItemDataFileDir(it, dataSourceID)

	// let the timeline know we're working in this directory (creating the dir, and creating the file in the dir,
	// are not an atomic operation, so we want to avoid deleting an empty folder during parallel downloads)
	tl.dataFileWorkingDirsMu.Lock()
	tl.dataFileWorkingDirs[dir]++
	tl.dataFileWorkingDirsMu.Unlock()

	err := os.MkdirAll(tl.FullPath(dir), 0700)
	if err != nil {
		return nil, "", fmt.Errorf("making directory for data file: %w", err)
	}

	// find a unique filename for this item - we start with the desired file name based
	// on the info we have, but we may have to adjust it based on availability
	it.intendedDataFileName = tl.canonicalItemDataFileName(it)
	canonicalFilenameExt := path.Ext(it.intendedDataFileName)
	canonicalFilenameWithoutExt := strings.TrimSuffix(it.intendedDataFileName, canonicalFilenameExt)

	const randSuffixLen = 4

	// discardPlaceholder releases a file we created exclusively but have decided not to use,
	// so that neither the open handle nor an empty file on disk is left behind
	discardPlaceholder := func(f *os.File, fullPath string) {
		// both errors are deliberately ignored: nothing has been written to the file, so a
		// failed close loses no data, and a failed remove only leaves behind an empty file
		_ = f.Close()
		_ = os.Remove(fullPath)
	}

	// TODO: If this isn't enough iterations (for some reason!?) we can either increase the iteration count, or try an outer loop that extends the randSuffixLen to 5, even 6 chars.
	for namingAttempt := range 10 {
		// build the filepath to try; only add randomness to the filename if the original name isn't available
		tryPath := path.Join(dir, canonicalFilenameWithoutExt)
		if namingAttempt > 0 {
			tryPath += "__" + safeRandomString(randSuffixLen, true, nil) // same case == true for portability to case-insensitive file systems
		}
		tryPath += canonicalFilenameExt
		fullPath := tl.FullPath(tryPath)

		// see if the filename is available; create it with EXCLUSIVE so that we don't trample any existing
		// file, and instead we should get a special error that the file already exists if it's taken...
		// if it is taken we can try another filename, but if it isn't, this syscall will immediately
		// claim it for us
		f, err := os.OpenFile(fullPath, os.O_CREATE|os.O_RDWR|os.O_EXCL, 0600)
		if errors.Is(err, fs.ErrExist) {
			continue // filename already taken; try another one
		}
		if err != nil {
			return nil, "", fmt.Errorf("creating data file: %w", err)
		}

		// also check with the database to see if filename is taken, case-insensitively (the column or index
		// should have COLLATE NOCASE; this is important to avoid file collisions when copying from a
		// case-sensitive FS to a case-insensitive FS!) -- if an item row has claim to it, then the file
		// is either still processing or was lost and needs to be reconstituted, but for now we should
		// not collide with it
		if tx != nil {
			var count int
			err = tx.QueryRowContext(ctx, `SELECT count() FROM items WHERE data_file=? LIMIT 1`, tryPath).Scan(&count)
			if err != nil {
				// we are not going to return this file, so don't leak it
				discardPlaceholder(f, fullPath)
				return nil, "", fmt.Errorf("checking DB for case-insensitive filename uniqueness: %w", err)
			}
			if count > 0 {
				// an existing item has claim to it, so let it keep that, it might still be processing;
				// give up the placeholder we just created before moving on to the next candidate
				discardPlaceholder(f, fullPath)
				logger.Warn("file did not exist on disk but is already claimed in database - will try to make filename unique", zap.String("filepath", tryPath))
				continue
			}
		}

		return f, tryPath, nil
	}

	return nil, "", fmt.Errorf("unable to find available filename for item: %s", it)
}

// canonicalItemDataFileName returns the filename to be used in the
// timeline repository for the item's data file. It pulls from elements
// of the item including its filename and timestamp, if present.
// It ensures the filename is a safe path component, and will not
// generate different names that end up conflicting on case-insensitive
// file systems.
//
// The name is chosen from the first of these that yields a non-empty result:
// the item's own (sanitized) filename, its original ID, its timestamp, or
// a random string. All but the first get an extension derived from the
// item's media type, if one is known. The result is shortened if it is too long.
func (tl *Timeline) canonicalItemDataFileName(it *Item) string {
	// this is stupid, but I really don't want to see .jfif at the end of my JPEGs ever again (or .moov on my MOVs)
	extensionByType := func(mediaType string) string {
		exts, err := mime.ExtensionsByType(mediaType)
		if err != nil || len(exts) == 0 {
			return ""
		}

		// rank the extensions we like ahead of everything else; a lower rank wins
		rank := func(ext string) int {
			switch ext {
			case ".jpg":
				return 0
			case ".jpeg":
				return 1
			case ".mov":
				return 2
			default:
				return 3
			}
		}

		// the comparison must be a consistent ordering of both elements, otherwise the
		// outcome depends on the sorting algorithm's internals; the stable sort leaves
		// the remaining extensions in the order they were returned
		sort.SliceStable(exts, func(a, b int) bool {
			return rank(exts[a]) < rank(exts[b])
		})

		return exts[0]
	}

	// ideally, the filename is simply the one provided with the item
	var filename string
	if it.Content.Filename != "" {
		filename = tl.safePathComponent(it.Content.Filename)
	}

	// otherwise, try a filename based on the item's original ID
	if filename == "" && it.ID != "" {
		filename = "item_id_" + tl.safePathComponent(it.ID) + extensionByType(it.Content.MediaType)
	}

	// otherwise, try a filename based on the item's timestamp
	if filename == "" && !it.Timestamp.IsZero() {
		filename = "item_ts_" + it.Timestamp.Format("2006_01_02_150405") + extensionByType(it.Content.MediaType)
	}

	// otherwise, out of options; revert to a random string
	// since no deterministic filename is available
	if filename == "" {
		const randomNameLen = 24
		filename = safeRandomString(randomNameLen, true, nil) // same case == true for portability to case-insensitive file systems
		filename += extensionByType(it.Content.MediaType)
	}

	// shorten the name if needed (thanks for nothing, Windows)
	filename = tl.ensureDataFileNameShortEnough(filename)

	return filename
}

// canonicalItemDataFileDir builds the directory in which the given item's data
// file belongs, relative to the timeline root. The result has the layout
// DataFolderName/YYYY/MM/<data source ID> and always uses forward slashes,
// which is the form used in the data_file column of the DB; the separator is
// adjusted for the OS only when the disk is actually accessed.
//
// An item without a timestamp is filed under the current time, and an empty
// data source ID is replaced with "unknown".
func (tl *Timeline) canonicalItemDataFileDir(it *Item, dataSourceID string) string {
	when := it.Timestamp
	if when.IsZero() {
		when = time.Now()
	}
	if dataSourceID == "" {
		dataSourceID = "unknown"
	}

	yearDir := fmt.Sprintf("%04d", when.Year())
	monthDir := fmt.Sprintf("%02d", when.Month())
	sourceDir := tl.safePathComponent(dataSourceID)

	return path.Join(DataFolderName, yearDir, monthDir, sourceDir)
}

// ensureDataFileNameShortEnough trims filename, if necessary, so that it fits
// within the filename length limit we allow for the sake of Windows. Names
// already within the limit are returned unchanged. Otherwise the extension is
// kept (capped at a maximum length of its own) and the part before it is cut
// down so that the whole name is exactly the limit. Lengths are in bytes.
func (tl *Timeline) ensureDataFileNameShortEnough(filename string) string {
	// Windows max filename length is 255, but it's unclear exactly what the limit is; choose slightly less...
	// see https://www.fileside.app/blog/2023-03-17_windows-file-paths/
	// "Traditionally, a path on Windows could not exceed a total of 260 characters. Even today, this is
	// still the case for some apps, unless they have taken care to implement a workaround."
	const (
		maxWindowsFilenameLen = 250
		maxExtLen             = 20 // arbitrary and unlikely, but just in case
	)

	if len(filename) <= maxWindowsFilenameLen {
		return filename
	}

	ext := path.Ext(filename)
	ext = ext[:min(len(ext), maxExtLen)]

	return filename[:maxWindowsFilenameLen-len(ext)] + ext
}

// replaceWithExisting checks to see if the checksum of a file already exists in the database for a
// file that is not the file with the given canonical path or row ID. It returns true if the file
// already exists and a replacement occurred; false otherwise (i.e. the file was unique).
//
// When a duplicate is found, the existing file on disk is re-hashed. If its hash still equals
// checksum, the file at *canonical is deleted in favor of the existing one. If it does not, the
// existing file is considered corrupted or modified and the file at *canonical is renamed over it.
// Either way, *canonical is then overwritten with the existing file's path.
//
// An error is returned if canonical or checksum is missing, if the DB query fails, if the
// matching row has no data file name, or if the existing file cannot be opened, hashed,
// replaced, or the duplicate deleted.
//
// TODO:/NOTE: If changing a file name, all items with same data_hash must also be updated to use same file name
//
// TODO: The newly refactored processor does not use this yet, since we have deemed the detection of
// duplicate rows to be different from finding an existing row that represents an item, but we can
// potentially use this in a function that removes duplicate items/rows.
//
//nolint:unused
func (p *processor) replaceWithExisting(ctx context.Context, tx *sql.Tx, canonical *string, checksum []byte, itemRowID uint64) (bool, error) {
	if canonical == nil || *canonical == "" || len(checksum) == 0 {
		return false, errors.New("missing data filename and/or hash of contents")
	}

	var existingDatafile *string
	err := tx.QueryRowContext(ctx, `SELECT data_file FROM items WHERE data_hash = ? AND id != ? AND data_file != ? LIMIT 1`,
		checksum, itemRowID, *canonical).Scan(&existingDatafile)
	if errors.Is(err, sql.ErrNoRows) {
		return false, nil // file is unique; carry on
	}
	if err != nil {
		return false, fmt.Errorf("querying DB: %w", err)
	}

	// file is a duplicate! by the time this function returns (if successful),
	// *canonical should not exist anymore and should have the value of
	// *existingDatafile instead.

	p.log.Info("data file is a duplicate",
		zap.Uint64("row_id", itemRowID),
		zap.Stringp("duplicate_data_file", canonical),
		zap.Stringp("existing_data_file", existingDatafile),
		zap.Binary("checksum", checksum))

	if existingDatafile == nil {
		// ... that's weird, how's this possible? it has a hash but no file name recorded
		return false, fmt.Errorf("item with matching hash is missing data file name; hash: %x", checksum)
	}

	// TODO: maybe this all should be limited to only when integrity checks are enabled? how do we know that this download has the right version/contents?
	p.log.Debug("verifying existing file is still the same",
		zap.Uint64("row_id", itemRowID),
		zap.Stringp("existing_data_file", existingDatafile),
		zap.Binary("checksum", checksum))

	// ensure the existing file is still the same; the hashing is done in its own scope so
	// that the file handle is closed as soon as we have the hash -- it must not still be
	// open when we rename over the file below, since on Windows a rename onto a file that
	// this process holds open fails (Go does not open files with FILE_SHARE_DELETE)
	existingFileHash, err := func() ([]byte, error) {
		f, err := os.Open(p.tl.FullPath(*existingDatafile))
		if err != nil {
			// TODO: This error is happening often when (re-?)importing SMS backup & restore MMS data files ("no such file or directory")
			return nil, fmt.Errorf("opening existing file: %w", err)
		}
		defer f.Close()

		h := newHash()
		if _, err := io.Copy(h, f); err != nil {
			return nil, fmt.Errorf("checking file integrity: %w", err)
		}
		return h.Sum(nil), nil
	}()
	if err != nil {
		return false, err
	}

	if !bytes.Equal(checksum, existingFileHash) {
		// the existing file was corrupted, so restore it with
		// what we just downloaded, which presumably succeeded
		// (by simply renaming the file on disk, we don't have
		// to update any entries in the DB)
		p.log.Warn("existing data file failed integrity check (checksum on disk changed; file corrupted or modified?) - replacing existing file with this one",
			zap.Uint64("row_id", itemRowID),
			zap.Stringp("data_file", existingDatafile),
			zap.Binary("expected_checksum", checksum),
			zap.Binary("actual_checksum", existingFileHash))
		if err := os.Rename(p.tl.FullPath(*canonical), p.tl.FullPath(*existingDatafile)); err != nil {
			return false, fmt.Errorf("replacing modified data file: %w", err)
		}
	} else {
		// everything checks out; delete the newly-downloaded file
		// and use the existing file instead of duplicating it
		p.log.Debug("existing file passed integrity check; using it instead of newly-downloaded duplicate",
			zap.Uint64("row_id", itemRowID),
			zap.Stringp("existing_data_file", existingDatafile),
			zap.Binary("checksum", checksum))
		if err := p.tl.deleteRepoFile(*canonical); err != nil {
			return false, fmt.Errorf("removing duplicate data file: %w", err)
		}
	}

	p.log.Info("merged duplicate data files based on integrity check",
		zap.Uint64("row_id", itemRowID),
		zap.Stringp("duplicate_data_file", canonical),
		zap.Stringp("existing_data_file", existingDatafile),
		zap.Binary("checksum", checksum))

	*canonical = *existingDatafile

	return true, nil
}

// randomString returns a string of n random characters.
// It is not even remotely secure or a proper distribution.
// But it's good enough for some things. It elides certain
// confusing characters like I, l, 1, 0, O, etc. If sameCase
// is true, then uppercase letters are excluded.
func randomString(n int, sameCase bool, r weakrand.Source) string {
	if n <= 0 {
		return ""
	}
	dict := []rune("ABCDEFGHJKLMNPQRTUVWXYabcdefghijkmnopqrstuvwxyz23456789")
	if sameCase {
		dict = dict[22:] // skip uppercase letters
	}
	b := make([]rune, n)
	for i := range b {
		var rnd uint64
		if r == nil {
			rnd = weakrand.Uint64() //nolint:gosec
		} else {
			rnd = r.Uint64()
		}
		b[i] = dict[rnd%uint64(len(dict))]
	}
	return string(b)
}

// FullPath resolves a data file's canonical name to its full path on the
// file system by joining it onto the repo directory. Forward slashes in the
// input are first converted to the file system's path separator.
func (tl *Timeline) FullPath(canonicalDatafileName string) string {
	osRelativePath := filepath.FromSlash(canonicalDatafileName)
	return filepath.Join(tl.repoDir, osRelativePath)
}

// safePathComponent sanitizes s for use as a single component of a path.
// It removes every character matched by safePathRE, then removes every
// occurrence of "..", and returns the empty string if all that is left
// is a lone ".".
func (*Timeline) safePathComponent(s string) string {
	cleaned := strings.ReplaceAll(safePathRE.ReplaceAllLiteralString(s, ""), "..", "")
	if cleaned == "." {
		return ""
	}
	return cleaned
}

func safeRandomString(n int, sameCase bool, r weakrand.Source) string {
	var s string
	for range 10 {
		s = randomString(n, sameCase, r)
		if !containsBlocklistedWord(s) {
			break
		}
	}
	return s
}

// containsBlocklistedWord reports whether s contains, anywhere within it
// and regardless of letter case, any of the word fragments that we would
// rather not see in a generated name.
func containsBlocklistedWord(s string) bool {
	blocklist := [...]string{
		"fuck",
		"shit",
		"poo",
		"butt",
		"cunt",
		"ass",
		"arse",
		"niga",
		"nigg",
		"hate",
		"kill",
		"die",
		"damn",
		"sex",
		"anal",
		"bitch",
		"cum",
		"peni",
		"vagin",
		"puss",
		"tit",
		"wtf",
		"wank",
		"ejac",
		"dick",
		"hor",
		"evil",
	}

	// the fragments are all lowercase, so compare against a lowercased copy
	lowered := strings.ToLower(s)
	for i := range blocklist {
		if strings.Contains(lowered, blocklist[i]) {
			return true
		}
	}
	return false
}

// safePathRE matches any undesirable characters in a filepath: everything
// other than word characters (\w, which in Go's regexp syntax is the ASCII
// set [0-9A-Za-z_]), dots, and hyphens. Path separators are matched too.
// Note that this allows dots, so you'll have to strip ".." manually.
var safePathRE = regexp.MustCompile(`[^\w.-]`)