timelinize/datasources/googlephotos/takeoutarchive.go

/*
	Timelinize
	Copyright (c) 2013 Matthew Holt

	This program is free software: you can redistribute it and/or modify
	it under the terms of the GNU Affero General Public License as published
	by the Free Software Foundation, either version 3 of the License, or
	(at your option) any later version.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU Affero General Public License for more details.

	You should have received a copy of the GNU Affero General Public License
	along with this program.  If not, see <https://www.gnu.org/licenses/>.
*/

package googlephotos

import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"path"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"time"

	"github.com/maruel/natural"
	"github.com/timelinize/timelinize/datasources/media"
	"github.com/timelinize/timelinize/timeline"
	"go.uber.org/zap"
)

const googlePhotosPath = "Takeout/Google Photos"

func (fimp *FileImporter) listFromTakeoutArchive(ctx context.Context, opt timeline.ImportParams, dirEntry timeline.DirEntry) error {
	fimp.truncatedNames = make(map[string]int)

	var checkpoint string
	if opt.Checkpoint != nil {
		err := json.Unmarshal(opt.Checkpoint, &checkpoint)
		if err != nil {
			return fmt.Errorf("decoding checkpoint: %w", err)
		}
	}

	albumFolders, err := fs.ReadDir(dirEntry.FS, dirEntry.Filename)
	if err != nil {
		return fmt.Errorf("getting album list from %s: %w", googlePhotosPath, err)
	}

	// We don't use Walk() because we need to control the order in which we read
	// the files. It's quite niche, but I ran into it with my very first import
	// test: filenames that are more than 47 characters, where the first 47 chars
	// are all the same, are ambiguous when it comes to pairing the media file and
	// the metadata sidecar file (.json), because Google truncates long filenames for
	// some reason without an obvious way to undo the truncation deterministically.
	// Before truncating, Google apparently sorts filenames in a folder by "natural
	// sort", but Walk uses lexical sort. So we read the dir listings ourselves and
	// sort album contents with a natural sort in order and remember truncated file
	// names we've seen in order to hopefully accurately link a JSON file to its
	// associated media file, and thus generate the same retrieval key for both
	// files. This is needed because we can't be guaranteed that the media file and
	// its sidecar will even be in the same archive/import; so the retrieval key
	// lets us import partial item data as we discover it, but it HAS to be the
	// same, and we use the filename for that, so we HAVE to reliably compute it.
	for _, albumFolder := range albumFolders {
		if err := ctx.Err(); err != nil {
			return err
		}
		if !albumFolder.IsDir() {
			continue
		}

		thisAlbumFolderPath := path.Join(dirEntry.Filename, albumFolder.Name())

		albumMeta, err := fimp.readAlbumMetadata(dirEntry, thisAlbumFolderPath)
		if err != nil {
			if errors.Is(err, fs.ErrNotExist) {
				opt.Log.Warn("album metadata not found; maybe it is in another archive or this folder is not an album",
					zap.String("folder_path", thisAlbumFolderPath),
					zap.Error(err))
			} else {
				opt.Log.Error("could not open album metadata",
					zap.String("folder_path", thisAlbumFolderPath),
					zap.Error(err))
			}
		}

		// read album folder contents, then sort in what I think is the same way
		// Google does before truncating long filenames -- this is crucial to
		// matching up filenames correctly (metadata + media files)
		albumItems, err := fs.ReadDir(dirEntry.FS, thisAlbumFolderPath)
		if err != nil {
			return fmt.Errorf("reading album directory: %w", err)
		}
		sort.Slice(albumItems, func(i, j int) bool {
			iName, jName := albumItems[i].Name(), albumItems[j].Name()
			iNameNoExt, jNameNoExt := strings.TrimSuffix(iName, path.Ext(iName)), strings.TrimSuffix(jName, path.Ext(jName))
			// first sort by length
			if len(iNameNoExt) != len(jNameNoExt) {
				return len(iNameNoExt) < len(jNameNoExt)
			}
			// then use natural sort; i.e. [a1, a20, a10] => [a1, a10, a20]
			return natural.Less(albumItems[i].Name(), albumItems[j].Name())
		})

		for _, d := range albumItems {
			// make pauses more responsive
			if err := opt.Continue(); err != nil {
				return err
			}

			fpath := path.Join(thisAlbumFolderPath, d.Name())
			if checkpoint != "" {
				if fpath != checkpoint {
					continue // keep going until we find the checkpoint position
				}
				checkpoint = "" // at the checkpoint; clear it so we process all further items
			}
			if err := fimp.processAlbumItem(ctx, albumMeta, thisAlbumFolderPath, d, opt, dirEntry); err != nil {
				return fmt.Errorf("processing album item '%s': %w", fpath, err)
			}
		}
	}

	return nil
}

func (fimp *FileImporter) processAlbumItem(ctx context.Context, albumMeta albumArchiveMetadata, folderPath string, d fs.DirEntry, opt timeline.ImportParams, dirEntry timeline.DirEntry) error {
	if err := ctx.Err(); err != nil {
		return err
	}
	// skip the album metadata (it is consumed separately)
	// TODO: Also skip/use print-subscriptions.json, shared_album_comments.json, user-generated-memory-titles.json,... I guess? I haven't seen those though
	if d.Name() == albumMetadataFilename {
		return nil
	}

	// skip directories (there shouldn't be any in the first place... since Google Photos doesn't support sub-albums)
	if d.IsDir() {
		return nil
	}

	fpath := path.Join(folderPath, d.Name())

	// skip sidecar movie files ("live photos") because we'll connect them when
	// we process the actual photograph (hopefully they're in the same archive!)
	if media.IsSidecarVideo(dirEntry.FS, fpath) {
		return nil
	}

	f, err := dirEntry.FS.Open(fpath)
	if err != nil {
		return err
	}
	defer f.Close()

	var itemMeta mediaArchiveMetadata

	// this could be either the media file itself, or a metadata sidecar file; we
	// need the path to the media file, so start by assuming that's what this is
	mediaFilePath := fpath

	// if this is a JSON sidecar file, get the metadata it contains
	if path.Ext(fpath) == ".json" {
		err = json.NewDecoder(f).Decode(&itemMeta)
		if err != nil {
			return fmt.Errorf("decoding item metadata file %s: %w", fpath, err)
		}

		// I've heard that some JSON files in albums (other than the album metadata)
		// might be something else, so as a quick sanity check make sure it contained
		// what I presume is required info
		if itemMeta.Title == "" || itemMeta.URL == "" {
			return nil
		}

		// we don't totally trust the timstamp in the metadata file, but we'll
		// take it in case the actual media file doesn't contain any
		itemMeta.parsedPhotoTakenTime, err = itemMeta.timestamp()
		if err != nil && !errors.Is(err, errNoTimestamp) {
			return fmt.Errorf("parsing timestamp from item %s: %w", fpath, err)
		}

		mediaFilePath = fimp.determineMediaFilenameInArchive(fpath, itemMeta)
		opt.Log.Debug("mapped sidecar to target media file",
			zap.String("sidecar_file", fpath),
			zap.String("target_file", mediaFilePath))
	} else {
		itemMeta.source = dirEntry
	}

	ig := fimp.makeItemGraph(mediaFilePath, itemMeta, albumMeta, opt)

	// ensure item is within configured timeframe before continuing
	if !opt.Timeframe.Contains(ig.Item.Timestamp) {
		opt.Log.Debug("item is outside timeframe", zap.String("filename", fpath))
		return nil
	}

	// Between the JSON file and the actual media file, we typically prefer the
	// filename in the JSON file and everything else that overlaps in the media
	// file, since Google's metadata is known to be wrong sometimes (!?). However,
	// in a rare singular case of corrupted input, I have found non-nil timestamp
	// data that was completely wrong in the mvhd box of an MP4 file, captured on
	// an Android phone, with several other videos even that same hour that were
	// correct / not corrupted. The corrupted timestamp was 4165689599 (confirmed
	// via ffprobe), which apparently equates to 2036-01-01, but should have been
	// 2016-11-27. (The time was also truncated.) I can't explain the corruption.
	// I think in general, photos and videos from Google Takeout aren't from the
	// future, and probably aren't RIGHT at midnight on New Years (okay to be fair,
	// that's not so unlikely) -- maybe we can prefer the metadata timestamp in
	// those cases; though I'm not sure if this heuristic is reliable.
	if path.Ext(fpath) == ".json" {
		// metadata file should have good filename and metadata, but we prefer
		// the embedded timestamp if possible
		ig.Item.Retrieval.FieldUpdatePolicies = map[string]timeline.FieldUpdatePolicy{
			"filename":         timeline.UpdatePolicyOverwriteExisting,
			"metadata":         timeline.UpdatePolicyPreferIncoming, // applied per-key, so keys unique to this file will be kept
			"timestamp":        timeline.UpdatePolicyPreferExisting,
			"timespan":         timeline.UpdatePolicyPreferExisting,
			"timeframe":        timeline.UpdatePolicyPreferExisting,
			"time_offset":      timeline.UpdatePolicyPreferExisting,
			"time_uncertainty": timeline.UpdatePolicyPreferExisting,
			"latlon":           timeline.UpdatePolicyPreferExisting,
			"altitude":         timeline.UpdatePolicyPreferExisting,
		}
	} else {
		// always use the embedded timestamp, unless it looks like it is bad (I've encountered
		// several corrupt or very wrong embedded timestamps that actually cause UI bugs b/c
		// they're so wrong they can't be serialized to JSON) -- the processor will also try to
		// clear them, but in our case there are timestamps that generically "look valid", yet
		// we can know are invalid, and in those cases we can likely lean on the timestamp in
		// the JSON file, so we just need to adjust the update policy for timestamps based on
		// what we can infer about the timestamp
		tsUpdatePolicy := timeline.UpdatePolicyOverwriteExisting
		if isBadTimestamp(ig.Item.Timestamp) {
			tsUpdatePolicy = timeline.UpdatePolicyKeepExisting
			ig.Item.Timestamp = time.Time{}
		}
		ig.Item.Retrieval.FieldUpdatePolicies = map[string]timeline.FieldUpdatePolicy{
			"data":                  timeline.UpdatePolicyOverwriteExisting,
			"original_location":     timeline.UpdatePolicyOverwriteExisting,
			"intermediate_location": timeline.UpdatePolicyOverwriteExisting,
			"filename":              timeline.UpdatePolicyPreferExisting,
			"metadata":              timeline.UpdatePolicyPreferIncoming,
			"timestamp":             tsUpdatePolicy,
			"timespan":              tsUpdatePolicy,
			"timeframe":             tsUpdatePolicy,
			"time_offset":           tsUpdatePolicy,
			"time_uncertainty":      tsUpdatePolicy,
			"latlon":                timeline.UpdatePolicyPreferIncoming,
			"altitude":              timeline.UpdatePolicyPreferIncoming,
		}

		media.ConnectMotionPhoto(opt.Log, dirEntry, mediaFilePath, ig)
	}

	// if item has an "-edited" variant, relate it
	ext := path.Ext(mediaFilePath)
	editedPath := strings.TrimSuffix(mediaFilePath, ext) + "-edited" + ext
	if dirEntry.FileExists(editedPath) {
		mediaFilePath = editedPath
		edited := fimp.makeItemGraph(mediaFilePath, itemMeta, albumMeta, opt)
		ig.ToItem(timeline.RelEdit, edited.Item)
	}

	ig.Checkpoint = fpath

	opt.Pipeline <- ig

	return nil
}

func (fimp *FileImporter) makeItemGraph(mediaFilePath string, itemMeta mediaArchiveMetadata, albumMeta albumArchiveMetadata, opt timeline.ImportParams) *timeline.Graph {
	item := &timeline.Item{
		Classification: timeline.ClassMedia,
		// timestamp is not set here (we prefer timestamp embedded in file itself first, below)
		Location:             itemMeta.location(),
		IntermediateLocation: mediaFilePath,
		Content: timeline.ItemData{
			Filename: itemMeta.Title,
		},
		Metadata: timeline.Metadata{
			"Description":  itemMeta.Description,
			"Local folder": itemMeta.GooglePhotosOrigin.MobileUpload.DeviceFolder.LocalFolderName,
			"Device type":  itemMeta.GooglePhotosOrigin.MobileUpload.DeviceType,
			"Views":        itemMeta.ImageViews,
			"URL":          itemMeta.URL,
		},
	}
	if itemMeta.source.FS != nil {
		// don't send filename since we can't trust the filename we have here;
		// Google Takeout likes to truncate them, and also remove/replace special
		// characters without any indication of the original filename

		item.Content.Data = func(_ context.Context) (io.ReadCloser, error) {
			return itemMeta.source.FS.Open(path.Join(itemMeta.source.Filename, mediaFilePath))
		}

		// add metadata contained in the image file itself; note that this overwrites any overlapping
		// metadata that has already been filled in -- except timestamp which is not in the Metadata
		// field; however, apparently (according to the PhotoStructure devs), the timestamp in the
		// actual photo file is often more accurate than any in a sidecar metadata file, so prefer
		// the embedded timestamp first, and if there isn't one, then use the sidecar data
		_, err := media.ExtractAllMetadata(opt.Log, itemMeta.source.FS, path.Join(itemMeta.source.Filename, mediaFilePath), item, timeline.MetaMergeReplaceEmpty)
		if err != nil {
			opt.Log.Warn("extracting metadata", zap.Error(err))
		}
	}

	// set a timestamp if we only have the metadata file
	if item.Timestamp.IsZero() {
		item.Timestamp = itemMeta.parsedPhotoTakenTime
	}

	// the retrieval key is crucial so that we can store what data we have from an item
	// as we get it, without getting the whole item, even across different imports; it
	// consists of the data source name to avoid conflicts with other DSes, the name of
	// the archive (with the index part removed, of course, since a metadata file in
	// -001.zip might have its media file in -002.zip, but they should have the same
	// retrieval key; this does rely on them not being renamed), and the expected path
	// of the media file within the archive (if we're on the media file, it's just that
	// path, but if we're on the sidecar JSON file, we have to construct it with heuristics
	// since Google's naming convention isn't documented)
	archiveName := fimp.exportIDFromArchiveFilename()
	retKey := fmt.Sprintf("%s::%s::%s", dataSourceName, archiveName, mediaFilePath)
	item.Retrieval.SetKey(retKey)

	// since we don't know the filename if we are on the picture file,
	// and we don't know the data if we are on the metadata file, tell
	// the processor that a nil value of these means that we don't know
	// what it is, rather than us asserting that it's intentionally nil
	// (this is crucial to allow us to process takeouts with duplicates
	// without having duplicates in the timeline)
	item.Retrieval.UniqueConstraints = map[string]bool{
		"filename": item.Content.Filename != "",
		"data":     item.Content.Data != nil,
	}

	ig := &timeline.Graph{Item: item}

	// add to album/collection
	if albumMeta.Title != "" || albumMeta.Description != "" {
		// prefer title, but use description if that's all we have for some reason
		albumTitle := albumMeta.Title
		if albumTitle == "" {
			albumTitle = albumMeta.Description
			albumMeta.Description = ""
		}
		ig.ToItem(timeline.RelInCollection, &timeline.Item{
			Classification: timeline.ClassCollection,
			Content: timeline.ItemData{
				Data: timeline.StringData(albumTitle),
			},
			Owner: item.Owner,
			Metadata: timeline.Metadata{
				"Description": albumMeta.Description,
			},
		})
	}

	for _, person := range itemMeta.People {
		ig.ToEntity(timeline.RelIncludes, &timeline.Entity{
			Name: person.Name,
			Attributes: []timeline.Attribute{
				{
					Name:     "google_photos_name",
					Value:    person.Name,
					Identity: true,
				},
			},
		})
	}

	return ig
}

// exportIDFromArchiveFilename returns the name of the archive without the positional
// index(es) and without the extension. It assumes a Takeout archive filename that has
// NOT been renamed.
//
// A couple examples: given an import filepath of
// "/foo/takeout-20240516T230250Z-003.zip/Takeout/Google Photos", this returns
// "takeout-20240516T230250Z", which seems to be a unique identifier for the particular
// export this archive is a part of. For newer/larger (~Q3 2025) takeouts, an import
// filepath of "/foo/takeout-20250921T1994402Z-3-009.zip/Takeout/Google Photos" (notice
// this has another component in the archive filename) returns "takeout-20250921T1994402Z",
// which is the export ID.
//
// The archive name is extracted from the import path, trimming the Google Photos subpath
// ("Takeout/Google Photos"). The archive filename is not strictly parsed; it quite naively
// just uses the name up to the second "-", as long as whatever is before the second "-"
// is the same for all archives in the group.)
func (fimp *FileImporter) exportIDFromArchiveFilename() string {
	// For "/foo/takeout-20240516T230250Z-003.zip/Takeout/Google Photos", strip the
	// "Takeout/Google Photos" suffix to terminate the path at the root of the archive
	base := filepath.Base(strings.TrimSuffix(fimp.filename, googlePhotosPath))
	firstDashPos := strings.Index(base, "-")
	if firstDashPos < 0 {
		return base
	}
	secondDashPosRelative := strings.Index(base[firstDashPos+1:], "-")
	if secondDashPosRelative <= 0 {
		return base
	}
	absoluteSecondDashPos := firstDashPos + 1 + secondDashPosRelative
	return base[:absoluteSecondDashPos]
}

func (fimp *FileImporter) readAlbumMetadata(d timeline.DirEntry, albumFolderPath string) (albumArchiveMetadata, error) {
	albumMetadataFilePath := path.Join(d.Filename, albumFolderPath, albumMetadataFilename)
	albumMetadataFile, err := d.FS.Open(albumMetadataFilePath)
	if err != nil {
		return albumArchiveMetadata{}, fmt.Errorf("opening metadata file %s: %w", albumMetadataFilename, err)
	}
	defer albumMetadataFile.Close()

	var albumMeta albumArchiveMetadata
	err = json.NewDecoder(albumMetadataFile).Decode(&albumMeta)
	if err != nil {
		return albumArchiveMetadata{}, fmt.Errorf("decoding album metadata file %s: %w", albumMetadataFilename, err)
	}

	return albumMeta, nil
}

const albumMetadataFilename = "metadata.json"

type albumArchiveMetadata struct {
	Title       string `json:"title"`
	Description string `json:"description"`
	Access      string `json:"access"`
	Date        struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"date"`
	GeoData struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoData"`
	SharedAlbumComments []struct {
		Text         string `json:"text,omitempty"`
		CreationTime struct {
			Timestamp string `json:"timestamp"`
			Formatted string `json:"formatted"`
		} `json:"creationTime"`
		ContentOwnerName string `json:"contentOwnerName"`
		Liked            bool   `json:"liked,omitempty"`
	} `json:"sharedAlbumComments"`
}

type mediaArchiveMetadata struct {
	Title        string `json:"title"`
	Description  string `json:"description"`
	ImageViews   string `json:"imageViews"`
	CreationTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"creationTime"`
	PhotoTakenTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"photoTakenTime"`
	GeoData struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoData"`
	GeoDataExif struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoDataExif"`
	People []struct {
		Name string `json:"name"`
	} `json:"people"`
	URL                string `json:"url"`
	GooglePhotosOrigin struct {
		MobileUpload struct {
			DeviceFolder struct {
				LocalFolderName string `json:"localFolderName"`
			} `json:"deviceFolder"`
			DeviceType string `json:"deviceType"`
		} `json:"mobileUpload"`
		Composition struct {
			Type string `json:"type"`
		} `json:"composition"`
	} `json:"googlePhotosOrigin"`
	PhotoLastModifiedTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"photoLastModifiedTime"`

	parsedPhotoTakenTime time.Time
	source               timeline.DirEntry // the parent DirEntry (not representing the actual file itself; the one we're starting the import from)
}

func (m mediaArchiveMetadata) location() timeline.Location {
	loc := timeline.Location{}
	if m.GeoData.Latitude != 0 {
		loc.Latitude = &m.GeoData.Latitude
	}
	if m.GeoData.Longitude != 0 {
		loc.Longitude = &m.GeoData.Longitude
	}
	if m.GeoData.Altitude != 0 {
		loc.Altitude = &m.GeoData.Altitude
	}
	if loc.Latitude == nil && m.GeoDataExif.Latitude != 0 {
		loc.Latitude = &m.GeoDataExif.Latitude
	}
	if loc.Longitude == nil && m.GeoDataExif.Longitude != 0 {
		loc.Longitude = &m.GeoDataExif.Longitude
	}
	if loc.Altitude == nil && m.GeoDataExif.Altitude != 0 {
		loc.Altitude = &m.GeoDataExif.Altitude
	}
	return loc
}

var errNoTimestamp = errors.New("no timestamp available")

// timestamp returns a timestamp derived from the metadata. It first
// prefers the PhotoTakenTime, then the CreationTime, then the
// PhotoLastModifiedTime. However, it has been reported by the
// PhotoStructure team that these timestamps can be wildly wrong,
// on the order of hours or days. Image metadata may be more reliable.
func (m mediaArchiveMetadata) timestamp() (time.Time, error) {
	ts := m.PhotoTakenTime.Timestamp
	if ts == "" {
		// if a photo is in multiple albums/folders, this can be different between the two
		ts = m.CreationTime.Timestamp
	}
	if ts == "" {
		ts = m.PhotoLastModifiedTime.Timestamp
	}
	if ts == "" {
		return time.Time{}, errNoTimestamp
	}
	parsed, err := strconv.ParseInt(ts, 10, 64)
	if err != nil {
		return time.Time{}, err
	}
	// timestamp represents UTC (no offset), so call UTC() since Unix() defaults to local offset
	return time.Unix(parsed, 0).UTC(), nil
}

// determineMediaFilenameInArchive returns the path to the media file in the archive
// that is associated with the given JSON sidecar metadata filepath.
//
// Google Photos export truncates long filenames. This function uses a lexical approach
// with the help of some count state to assemble the image filename that can be used to
// read it in the archive.
func (fimp *FileImporter) determineMediaFilenameInArchive(jsonFilePath string, itemMeta mediaArchiveMetadata) string {
	// target media file will be in the same directory
	dir := path.Dir(jsonFilePath)

	// the metadata contains the original filename; we use that to compute
	// what we hope is the filename in the archive based on... experience
	// (none of this is documented, but there's some writeups at TODO: link...)
	titleExt := path.Ext(itemMeta.Title)
	transformedTitle := strings.ReplaceAll(itemMeta.Title, "&", "_")
	transformedTitle = strings.ReplaceAll(transformedTitle, "?", "_")
	titleWithoutExt := strings.TrimSuffix(transformedTitle, titleExt)

	// Google truncates filenames longer than this (sans extension)
	const maxLength = 47

	// truncating filenames obviously introduces the chance of filename
	// collisions, if multiple files have the same long prefix; additionally,
	// they may also collide with a file whose entire name is the prefix (i.e.
	// collision with a file that is exactly the max length that does not get
	// truncated) -- for that reason, we need to count how many times we see
	// each filename up to the max length -- including path since each folder
	// has a distinct file list -- even if the name is not longer than the
	// max length.

	// if the filename is long enough, Google truncates it, so we need
	// to reconstruct it; this depends on the order we're reading the files,
	// because Google auto-increments a "uniqueness suffix" in the form of
	// "(N)" where N is how many times that truncated filename has already
	// appeared before this.
	truncateAt := min(maxLength, len(titleWithoutExt))
	truncatedTitle := titleWithoutExt[:truncateAt]
	truncatedTitleWithDir := path.Join(dir, truncatedTitle)
	fullTruncatedName := truncatedTitleWithDir + titleExt

	// then count this "hit" for the name
	fimp.truncatedNames[fullTruncatedName]++

	// now read the count; it will be at least 1
	seenCount := fimp.truncatedNames[fullTruncatedName]

	if len(titleWithoutExt) > maxLength {
		// a uniqueness suffix is only inserted (before the extension) if the
		// truncated filename has not already been seen in our walk, so if this
		// is the first (or only) occurrence, just return the truncated filename
		if seenCount == 1 {
			return fullTruncatedName
		}

		// otherwise, insert the uniqueness suffix between the truncated filename and
		// the extension; use seenCount-1 because the first instance doesn't have a
		// "uniqueness suffix (N)", the second one has "(1)", third has "(2)", etc;
		// it's how many times we've *already* seen this name before this
		return fmt.Sprintf("%s(%d)%s", truncatedTitleWithDir, seenCount-1, titleExt)
	}

	// short filenames are great... so simple (I think)
	return path.Join(dir, itemMeta.Title)
}

// isBadTimestamp tries to detect timestamps that are bad/corrupted, which would generally come from
// embedded metadata like EXIF or XMP, where either there is a parser bug or actual corruption. I have
// encountered both on my data sets, and I've encountered these specific situations.
// The processor will actually strip timestamps that are invalid (like, year is super out-of-range and
// can't be serialized by JSON), but in the case of a corrupt offset (TZ), it will only strip the offset;
// but in our case we can do better than that probably, since the sidecar json file usually has a valid
// and correct timestamp in the rare case the EXIF/XMP data is wrong. So we want to prefer the timestamp
// from the JSON when we detect a timestamp that the processor may still consider valid, but which we
// assume is probably wrong. For example: future year, exactly midnight on new years, or corrupted
// offset. In these cases, the timestamp from JSON should be preferred. In order to prefer the JSON
// timestamp, we need to clear any bad, embedded timestamp, since otherwise it will be preferred.
func isBadTimestamp(t time.Time) bool {
	futureYear := t.Year() > time.Now().Year()
	exactlyMidnightOnNewYears := t.Month() == time.January && t.Day() == 1 && t.Hour() == 0 && t.Minute() == 0 && t.Second() == 0

	const maxTimezoneOffsetSecFromUTC = 50400 // most distant time zone from UTC is apparently +-14 hours
	_, offsetSec := t.Zone()
	offsetCorrupted := offsetSec > maxTimezoneOffsetSecFromUTC || offsetSec < -maxTimezoneOffsetSecFromUTC

	return t.IsZero() || futureYear || exactlyMidnightOnNewYears || offsetCorrupted
}