1
0
Fork 0
timelinize/datasources/googlephotos/takeoutarchive.go
Matthew Holt ce601d5033
googlephotos: Ignore non-dir subfolders in Takeout albums
Also print data source in log messages when processing fails
2025-11-17 09:06:24 -07:00

649 lines
26 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package googlephotos
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"io/fs"
"path"
"path/filepath"
"sort"
"strconv"
"strings"
"time"
"github.com/maruel/natural"
"github.com/timelinize/timelinize/datasources/media"
"github.com/timelinize/timelinize/timeline"
"go.uber.org/zap"
)
// googlePhotosPath is the subpath within a Takeout archive where the Google Photos data lives.
const googlePhotosPath = "Takeout/Google Photos"
// listFromTakeoutArchive iterates the album folders of a Google Photos Takeout
// archive rooted at dirEntry and sends each discovered item down the import
// pipeline (via processAlbumItem). If opt carries a checkpoint (the
// archive-relative path of the last item processed), iteration fast-forwards to
// that position before resuming.
func (fimp *FileImporter) listFromTakeoutArchive(ctx context.Context, opt timeline.ImportParams, dirEntry timeline.DirEntry) error {
	// counts occurrences of (possibly truncated) filenames per folder; used by
	// determineMediaFilenameInArchive to reconstruct "(N)" uniqueness suffixes
	fimp.truncatedNames = make(map[string]int)

	var checkpoint string
	if opt.Checkpoint != nil {
		err := json.Unmarshal(opt.Checkpoint, &checkpoint)
		if err != nil {
			return fmt.Errorf("decoding checkpoint: %w", err)
		}
	}

	albumFolders, err := fs.ReadDir(dirEntry.FS, dirEntry.Filename)
	if err != nil {
		return fmt.Errorf("getting album list from %s: %w", googlePhotosPath, err)
	}

	// We don't use Walk() because we need to control the order in which we read
	// the files. It's quite niche, but I ran into it with my very first import
	// test: filenames that are more than 47 characters, where the first 47 chars
	// are all the same, are ambiguous when it comes to pairing the media file and
	// the metadata sidecar file (.json), because Google truncates long filenames for
	// some reason without an obvious way to undo the truncation deterministically.
	// Before truncating, Google apparently sorts filenames in a folder by "natural
	// sort", but Walk uses lexical sort. So we read the dir listings ourselves and
	// sort album contents with a natural sort in order and remember truncated file
	// names we've seen in order to hopefully accurately link a JSON file to its
	// associated media file, and thus generate the same retrieval key for both
	// files. This is needed because we can't be guaranteed that the media file and
	// its sidecar will even be in the same archive/import; so the retrieval key
	// lets us import partial item data as we discover it, but it HAS to be the
	// same, and we use the filename for that, so we HAVE to reliably compute it.
	for _, albumFolder := range albumFolders {
		if err := ctx.Err(); err != nil {
			return err
		}
		// non-directories at this level can't be albums; skip them
		if !albumFolder.IsDir() {
			continue
		}

		thisAlbumFolderPath := path.Join(dirEntry.Filename, albumFolder.Name())

		// album metadata is optional here: it may live in a different archive of
		// the same export, so a missing file is only a warning, not an error
		albumMeta, err := fimp.readAlbumMetadata(dirEntry, thisAlbumFolderPath)
		if err != nil {
			if errors.Is(err, fs.ErrNotExist) {
				opt.Log.Warn("album metadata not found; maybe it is in another archive or this folder is not an album",
					zap.String("folder_path", thisAlbumFolderPath),
					zap.Error(err))
			} else {
				opt.Log.Error("could not open album metadata",
					zap.String("folder_path", thisAlbumFolderPath),
					zap.Error(err))
			}
		}

		// read album folder contents, then sort in what I think is the same way
		// Google does before truncating long filenames -- this is crucial to
		// matching up filenames correctly (metadata + media files)
		albumItems, err := fs.ReadDir(dirEntry.FS, thisAlbumFolderPath)
		if err != nil {
			return fmt.Errorf("reading album directory: %w", err)
		}
		sort.Slice(albumItems, func(i, j int) bool {
			iName, jName := albumItems[i].Name(), albumItems[j].Name()
			iNameNoExt, jNameNoExt := strings.TrimSuffix(iName, path.Ext(iName)), strings.TrimSuffix(jName, path.Ext(jName))
			// first sort by length
			if len(iNameNoExt) != len(jNameNoExt) {
				return len(iNameNoExt) < len(jNameNoExt)
			}
			// then use natural sort; i.e. [a1, a20, a10] => [a1, a10, a20]
			return natural.Less(albumItems[i].Name(), albumItems[j].Name())
		})

		for _, d := range albumItems {
			// make pauses more responsive
			if err := opt.Continue(); err != nil {
				return err
			}

			fpath := path.Join(thisAlbumFolderPath, d.Name())

			if checkpoint != "" {
				if fpath != checkpoint {
					continue // keep going until we find the checkpoint position
				}
				checkpoint = "" // at the checkpoint; clear it so we process all further items
			}

			if err := fimp.processAlbumItem(ctx, albumMeta, thisAlbumFolderPath, d, opt, dirEntry); err != nil {
				return fmt.Errorf("processing album item '%s': %w", fpath, err)
			}
		}
	}

	return nil
}
// processAlbumItem handles a single entry within an album folder, which may be
// either a media file or its JSON metadata sidecar. It builds an item graph for
// the entry, configures field update policies so that partial data arriving from
// either file (possibly across separate archives/imports) merges correctly, and
// sends the graph down the pipeline. Entries that should be skipped (the album
// metadata file, directories, sidecar "live photo" videos, out-of-timeframe
// items) return nil without emitting anything.
func (fimp *FileImporter) processAlbumItem(ctx context.Context, albumMeta albumArchiveMetadata, folderPath string, d fs.DirEntry, opt timeline.ImportParams, dirEntry timeline.DirEntry) error {
	if err := ctx.Err(); err != nil {
		return err
	}

	// skip the album metadata (it is consumed separately)
	// TODO: Also skip/use print-subscriptions.json, shared_album_comments.json, user-generated-memory-titles.json,... I guess? I haven't seen those though
	if d.Name() == albumMetadataFilename {
		return nil
	}

	// skip directories (there shouldn't be any in the first place... since Google Photos doesn't support sub-albums)
	if d.IsDir() {
		return nil
	}

	fpath := path.Join(folderPath, d.Name())

	// skip sidecar movie files ("live photos") because we'll connect them when
	// we process the actual photograph (hopefully they're in the same archive!)
	if media.IsSidecarVideo(dirEntry.FS, fpath) {
		return nil
	}

	f, err := dirEntry.FS.Open(fpath)
	if err != nil {
		return err
	}
	defer f.Close()

	var itemMeta mediaArchiveMetadata

	// this could be either the media file itself, or a metadata sidecar file; we
	// need the path to the media file, so start by assuming that's what this is
	mediaFilePath := fpath

	// if this is a JSON sidecar file, get the metadata it contains
	if path.Ext(fpath) == ".json" {
		err = json.NewDecoder(f).Decode(&itemMeta)
		if err != nil {
			return fmt.Errorf("decoding item metadata file %s: %w", fpath, err)
		}

		// I've heard that some JSON files in albums (other than the album metadata)
		// might be something else, so as a quick sanity check make sure it contained
		// what I presume is required info
		if itemMeta.Title == "" || itemMeta.URL == "" {
			return nil
		}

		// we don't totally trust the timestamp in the metadata file, but we'll
		// take it in case the actual media file doesn't contain any
		itemMeta.parsedPhotoTakenTime, err = itemMeta.timestamp()
		if err != nil && !errors.Is(err, errNoTimestamp) {
			return fmt.Errorf("parsing timestamp from item %s: %w", fpath, err)
		}

		// reconstruct the (possibly truncated) media filename this sidecar refers to
		mediaFilePath = fimp.determineMediaFilenameInArchive(fpath, itemMeta)

		opt.Log.Debug("mapped sidecar to target media file",
			zap.String("sidecar_file", fpath),
			zap.String("target_file", mediaFilePath))
	} else {
		// we're on the media file itself; record where its bytes can be read from
		itemMeta.source = dirEntry
	}

	ig := fimp.makeItemGraph(mediaFilePath, itemMeta, albumMeta, opt)

	// ensure item is within configured timeframe before continuing
	if !opt.Timeframe.Contains(ig.Item.Timestamp) {
		opt.Log.Debug("item is outside timeframe", zap.String("filename", fpath))
		return nil
	}

	// Between the JSON file and the actual media file, we typically prefer the
	// filename in the JSON file and everything else that overlaps in the media
	// file, since Google's metadata is known to be wrong sometimes (!?). However,
	// in a rare singular case of corrupted input, I have found non-nil timestamp
	// data that was completely wrong in the mvhd box of an MP4 file, captured on
	// an Android phone, with several other videos even that same hour that were
	// correct / not corrupted. The corrupted timestamp was 4165689599 (confirmed
	// via ffprobe), which apparently equates to 2036-01-01, but should have been
	// 2016-11-27. (The time was also truncated.) I can't explain the corruption.
	// I think in general, photos and videos from Google Takeout aren't from the
	// future, and probably aren't RIGHT at midnight on New Years (okay to be fair,
	// that's not so unlikely) -- maybe we can prefer the metadata timestamp in
	// those cases; though I'm not sure if this heuristic is reliable.
	if path.Ext(fpath) == ".json" {
		// metadata file should have good filename and metadata, but we prefer
		// the embedded timestamp if possible
		ig.Item.Retrieval.FieldUpdatePolicies = map[string]timeline.FieldUpdatePolicy{
			"filename":         timeline.UpdatePolicyOverwriteExisting,
			"metadata":         timeline.UpdatePolicyPreferIncoming, // applied per-key, so keys unique to this file will be kept
			"timestamp":        timeline.UpdatePolicyPreferExisting,
			"timespan":         timeline.UpdatePolicyPreferExisting,
			"timeframe":        timeline.UpdatePolicyPreferExisting,
			"time_offset":      timeline.UpdatePolicyPreferExisting,
			"time_uncertainty": timeline.UpdatePolicyPreferExisting,
			"latlon":           timeline.UpdatePolicyPreferExisting,
			"altitude":         timeline.UpdatePolicyPreferExisting,
		}
	} else {
		// always use the embedded timestamp, unless it looks like it is bad (I've encountered
		// several corrupt or very wrong embedded timestamps that actually cause UI bugs b/c
		// they're so wrong they can't be serialized to JSON) -- the processor will also try to
		// clear them, but in our case there are timestamps that generically "look valid", yet
		// we can know are invalid, and in those cases we can likely lean on the timestamp in
		// the JSON file, so we just need to adjust the update policy for timestamps based on
		// what we can infer about the timestamp
		tsUpdatePolicy := timeline.UpdatePolicyOverwriteExisting
		if isBadTimestamp(ig.Item.Timestamp) {
			tsUpdatePolicy = timeline.UpdatePolicyKeepExisting
			ig.Item.Timestamp = time.Time{}
		}
		ig.Item.Retrieval.FieldUpdatePolicies = map[string]timeline.FieldUpdatePolicy{
			"data":                  timeline.UpdatePolicyOverwriteExisting,
			"original_location":     timeline.UpdatePolicyOverwriteExisting,
			"intermediate_location": timeline.UpdatePolicyOverwriteExisting,
			"filename":              timeline.UpdatePolicyPreferExisting,
			"metadata":              timeline.UpdatePolicyPreferIncoming,
			"timestamp":             tsUpdatePolicy,
			"timespan":              tsUpdatePolicy,
			"timeframe":             tsUpdatePolicy,
			"time_offset":           tsUpdatePolicy,
			"time_uncertainty":      tsUpdatePolicy,
			"latlon":                timeline.UpdatePolicyPreferIncoming,
			"altitude":              timeline.UpdatePolicyPreferIncoming,
		}
		// attach a motion-photo ("live photo") sidecar video, if one exists
		media.ConnectMotionPhoto(opt.Log, dirEntry, mediaFilePath, ig)
	}

	// if item has an "-edited" variant, relate it
	ext := path.Ext(mediaFilePath)
	editedPath := strings.TrimSuffix(mediaFilePath, ext) + "-edited" + ext
	if dirEntry.FileExists(editedPath) {
		mediaFilePath = editedPath
		edited := fimp.makeItemGraph(mediaFilePath, itemMeta, albumMeta, opt)
		ig.ToItem(timeline.RelEdit, edited.Item)
	}

	// checkpoint at this entry's path so a resumed import can skip ahead to here
	ig.Checkpoint = fpath
	opt.Pipeline <- ig

	return nil
}
// makeItemGraph constructs the item graph for a media file: the item itself
// (with metadata, data stream if the media file is available in this import,
// timestamp, and a cross-archive retrieval key), plus edges to its album
// (collection) and to any people tagged in the sidecar metadata.
func (fimp *FileImporter) makeItemGraph(mediaFilePath string, itemMeta mediaArchiveMetadata, albumMeta albumArchiveMetadata, opt timeline.ImportParams) *timeline.Graph {
	item := &timeline.Item{
		Classification: timeline.ClassMedia,
		// timestamp is not set here (we prefer timestamp embedded in file itself first, below)
		Location:             itemMeta.location(),
		IntermediateLocation: mediaFilePath,
		Content: timeline.ItemData{
			Filename: itemMeta.Title,
		},
		Metadata: timeline.Metadata{
			"Description":  itemMeta.Description,
			"Local folder": itemMeta.GooglePhotosOrigin.MobileUpload.DeviceFolder.LocalFolderName,
			"Device type":  itemMeta.GooglePhotosOrigin.MobileUpload.DeviceType,
			"Views":        itemMeta.ImageViews,
			"URL":          itemMeta.URL,
		},
	}

	// itemMeta.source is only set when we're processing the media file itself
	// (not the JSON sidecar), so this gates attaching the actual file data
	if itemMeta.source.FS != nil {
		// don't send filename since we can't trust the filename we have here;
		// Google Takeout likes to truncate them, and also remove/replace special
		// characters without any indication of the original filename
		item.Content.Data = func(_ context.Context) (io.ReadCloser, error) {
			return itemMeta.source.FS.Open(path.Join(itemMeta.source.Filename, mediaFilePath))
		}

		// add metadata contained in the image file itself; note that this overwrites any overlapping
		// metadata that has already been filled in -- except timestamp which is not in the Metadata
		// field; however, apparently (according to the PhotoStructure devs), the timestamp in the
		// actual photo file is often more accurate than any in a sidecar metadata file, so prefer
		// the embedded timestamp first, and if there isn't one, then use the sidecar data
		_, err := media.ExtractAllMetadata(opt.Log, itemMeta.source.FS, path.Join(itemMeta.source.Filename, mediaFilePath), item, timeline.MetaMergeReplaceEmpty)
		if err != nil {
			opt.Log.Warn("extracting metadata", zap.Error(err))
		}
	}

	// set a timestamp if we only have the metadata file
	if item.Timestamp.IsZero() {
		item.Timestamp = itemMeta.parsedPhotoTakenTime
	}

	// the retrieval key is crucial so that we can store what data we have from an item
	// as we get it, without getting the whole item, even across different imports; it
	// consists of the data source name to avoid conflicts with other DSes, the name of
	// the archive (with the index part removed, of course, since a metadata file in
	// -001.zip might have its media file in -002.zip, but they should have the same
	// retrieval key; this does rely on them not being renamed), and the expected path
	// of the media file within the archive (if we're on the media file, it's just that
	// path, but if we're on the sidecar JSON file, we have to construct it with heuristics
	// since Google's naming convention isn't documented)
	archiveName := fimp.exportIDFromArchiveFilename()
	retKey := fmt.Sprintf("%s::%s::%s", dataSourceName, archiveName, mediaFilePath)
	item.Retrieval.SetKey(retKey)

	// since we don't know the filename if we are on the picture file,
	// and we don't know the data if we are on the metadata file, tell
	// the processor that a nil value of these means that we don't know
	// what it is, rather than us asserting that it's intentionally nil
	// (this is crucial to allow us to process takeouts with duplicates
	// without having duplicates in the timeline)
	item.Retrieval.UniqueConstraints = map[string]bool{
		"filename": item.Content.Filename != "",
		"data":     item.Content.Data != nil,
	}

	ig := &timeline.Graph{Item: item}

	// add to album/collection
	if albumMeta.Title != "" || albumMeta.Description != "" {
		// prefer title, but use description if that's all we have for some reason
		albumTitle := albumMeta.Title
		if albumTitle == "" {
			albumTitle = albumMeta.Description
			albumMeta.Description = ""
		}
		ig.ToItem(timeline.RelInCollection, &timeline.Item{
			Classification: timeline.ClassCollection,
			Content: timeline.ItemData{
				Data: timeline.StringData(albumTitle),
			},
			Owner: item.Owner,
			Metadata: timeline.Metadata{
				"Description": albumMeta.Description,
			},
		})
	}

	// link each tagged person as an entity included in this item
	for _, person := range itemMeta.People {
		ig.ToEntity(timeline.RelIncludes, &timeline.Entity{
			Name: person.Name,
			Attributes: []timeline.Attribute{
				{
					Name:     "google_photos_name",
					Value:    person.Name,
					Identity: true,
				},
			},
		})
	}

	return ig
}
// exportIDFromArchiveFilename returns the name of the archive without the positional
// index(es) and without the extension. It assumes a Takeout archive filename that has
// NOT been renamed.
//
// A couple examples: given an import filepath of
// "/foo/takeout-20240516T230250Z-003.zip/Takeout/Google Photos", this returns
// "takeout-20240516T230250Z", which seems to be a unique identifier for the particular
// export this archive is a part of. For newer/larger (~Q3 2025) takeouts, an import
// filepath of "/foo/takeout-20250921T1994402Z-3-009.zip/Takeout/Google Photos" (notice
// this has another component in the archive filename) returns "takeout-20250921T1994402Z",
// which is the export ID.
//
// The archive name is extracted from the import path, trimming the Google Photos subpath
// ("Takeout/Google Photos"). The archive filename is not strictly parsed; it quite naively
// just uses the name up to the second "-", as long as whatever is before the second "-"
// is the same for all archives in the group.)
func (fimp *FileImporter) exportIDFromArchiveFilename() string {
	// Trim the "Takeout/Google Photos" suffix so the final path component is
	// the archive file itself, e.g. "takeout-20240516T230250Z-003.zip".
	archiveFilename := filepath.Base(strings.TrimSuffix(fimp.filename, googlePhotosPath))

	// The export ID is everything up to (but not including) the second dash.
	// If there aren't two dashes, or the segment between the first two dashes
	// is empty, fall back to the whole name.
	segments := strings.SplitN(archiveFilename, "-", 3)
	if len(segments) < 3 || segments[1] == "" {
		return archiveFilename
	}
	return segments[0] + "-" + segments[1]
}
// readAlbumMetadata opens and decodes the album's "metadata.json" file found in
// the album folder at albumFolderPath within d. It returns a zero-value
// albumArchiveMetadata along with an error if the file cannot be opened or parsed.
func (fimp *FileImporter) readAlbumMetadata(d timeline.DirEntry, albumFolderPath string) (albumArchiveMetadata, error) {
	metaPath := path.Join(d.Filename, albumFolderPath, albumMetadataFilename)

	file, err := d.FS.Open(metaPath)
	if err != nil {
		return albumArchiveMetadata{}, fmt.Errorf("opening metadata file %s: %w", albumMetadataFilename, err)
	}
	defer file.Close()

	var meta albumArchiveMetadata
	if err := json.NewDecoder(file).Decode(&meta); err != nil {
		return albumArchiveMetadata{}, fmt.Errorf("decoding album metadata file %s: %w", albumMetadataFilename, err)
	}

	return meta, nil
}
// albumMetadataFilename is the name of the JSON file within each album folder
// that describes the album itself (as opposed to per-item sidecar files).
const albumMetadataFilename = "metadata.json"
// albumArchiveMetadata is the structure of the album-level "metadata.json"
// file found in each album folder of a Google Photos Takeout archive.
type albumArchiveMetadata struct {
	Title       string `json:"title"`
	Description string `json:"description"`
	Access      string `json:"access"`
	Date        struct {
		// Timestamp is a Unix timestamp encoded as a string
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"date"`
	GeoData struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoData"`
	SharedAlbumComments []struct {
		Text         string `json:"text,omitempty"`
		CreationTime struct {
			Timestamp string `json:"timestamp"`
			Formatted string `json:"formatted"`
		} `json:"creationTime"`
		ContentOwnerName string `json:"contentOwnerName"`
		Liked            bool   `json:"liked,omitempty"`
	} `json:"sharedAlbumComments"`
}
// mediaArchiveMetadata is the structure of a per-item JSON sidecar file in a
// Google Photos Takeout archive, plus unexported fields filled in during
// processing (parsed timestamp and the source of the media file's bytes).
type mediaArchiveMetadata struct {
	// Title is the original filename of the media item (which may have been
	// truncated or transformed in the archive itself)
	Title       string `json:"title"`
	Description string `json:"description"`
	ImageViews  string `json:"imageViews"`
	// timestamps below are Unix timestamps encoded as strings
	CreationTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"creationTime"`
	PhotoTakenTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"photoTakenTime"`
	GeoData struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoData"`
	// GeoDataExif is used as a fallback when GeoData has zero coordinates (see location())
	GeoDataExif struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoDataExif"`
	People []struct {
		Name string `json:"name"`
	} `json:"people"`
	URL                string `json:"url"`
	GooglePhotosOrigin struct {
		MobileUpload struct {
			DeviceFolder struct {
				LocalFolderName string `json:"localFolderName"`
			} `json:"deviceFolder"`
			DeviceType string `json:"deviceType"`
		} `json:"mobileUpload"`
		Composition struct {
			Type string `json:"type"`
		} `json:"composition"`
	} `json:"googlePhotosOrigin"`
	PhotoLastModifiedTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"photoLastModifiedTime"`

	// parsedPhotoTakenTime is the result of timestamp(), parsed once during processing
	parsedPhotoTakenTime time.Time

	source timeline.DirEntry // the parent DirEntry (not representing the actual file itself; the one we're starting the import from)
}
// location assembles a timeline.Location from the metadata's coordinates,
// preferring GeoData and falling back to GeoDataExif per coordinate. A zero
// value is treated as "unset" and yields a nil pointer for that coordinate.
func (m mediaArchiveMetadata) location() timeline.Location {
	// pick returns the primary coordinate if set, otherwise the fallback if
	// set, otherwise nil (zero means "absent" in Takeout geo data)
	pick := func(primary, fallback *float64) *float64 {
		switch {
		case *primary != 0:
			return primary
		case *fallback != 0:
			return fallback
		default:
			return nil
		}
	}
	return timeline.Location{
		Latitude:  pick(&m.GeoData.Latitude, &m.GeoDataExif.Latitude),
		Longitude: pick(&m.GeoData.Longitude, &m.GeoDataExif.Longitude),
		Altitude:  pick(&m.GeoData.Altitude, &m.GeoDataExif.Altitude),
	}
}
// errNoTimestamp is returned by timestamp() when none of the metadata's time fields are populated.
var errNoTimestamp = errors.New("no timestamp available")
// timestamp returns a timestamp derived from the metadata. It first
// prefers the PhotoTakenTime, then the CreationTime, then the
// PhotoLastModifiedTime. However, it has been reported by the
// PhotoStructure team that these timestamps can be wildly wrong,
// on the order of hours or days. Image metadata may be more reliable.
func (m mediaArchiveMetadata) timestamp() (time.Time, error) {
	candidates := []string{
		m.PhotoTakenTime.Timestamp,
		// if a photo is in multiple albums/folders, this can differ between the two
		m.CreationTime.Timestamp,
		m.PhotoLastModifiedTime.Timestamp,
	}
	for _, raw := range candidates {
		if raw == "" {
			continue
		}
		unixSec, err := strconv.ParseInt(raw, 10, 64)
		if err != nil {
			return time.Time{}, err
		}
		// timestamp represents UTC (no offset), so call UTC() since Unix() defaults to local offset
		return time.Unix(unixSec, 0).UTC(), nil
	}
	return time.Time{}, errNoTimestamp
}
// determineMediaFilenameInArchive returns the path to the media file in the archive
// that is associated with the given JSON sidecar metadata filepath.
//
// Google Photos export truncates long filenames. This function uses a lexical approach
// with the help of some count state to assemble the image filename that can be used to
// read it in the archive.
func (fimp *FileImporter) determineMediaFilenameInArchive(jsonFilePath string, itemMeta mediaArchiveMetadata) string {
	// target media file will be in the same directory
	dir := path.Dir(jsonFilePath)

	// the metadata contains the original filename; we use that to compute
	// what we hope is the filename in the archive based on... experience
	// (none of this is documented, but there's some writeups at TODO: link...)
	titleExt := path.Ext(itemMeta.Title)
	// Takeout replaces certain special characters in filenames with "_"
	transformedTitle := strings.ReplaceAll(itemMeta.Title, "&", "_")
	transformedTitle = strings.ReplaceAll(transformedTitle, "?", "_")
	titleWithoutExt := strings.TrimSuffix(transformedTitle, titleExt)

	// Google truncates filenames longer than this (sans extension)
	const maxLength = 47

	// truncating filenames obviously introduces the chance of filename
	// collisions, if multiple files have the same long prefix; additionally,
	// they may also collide with a file whose entire name is the prefix (i.e.
	// collision with a file that is exactly the max length that does not get
	// truncated) -- for that reason, we need to count how many times we see
	// each filename up to the max length -- including path since each folder
	// has a distinct file list -- even if the name is not longer than the
	// max length.
	// if the filename is long enough, Google truncates it, so we need
	// to reconstruct it; this depends on the order we're reading the files,
	// because Google auto-increments a "uniqueness suffix" in the form of
	// "(N)" where N is how many times that truncated filename has already
	// appeared before this.
	truncateAt := min(maxLength, len(titleWithoutExt))
	truncatedTitle := titleWithoutExt[:truncateAt]
	truncatedTitleWithDir := path.Join(dir, truncatedTitle)
	fullTruncatedName := truncatedTitleWithDir + titleExt

	// then count this "hit" for the name
	fimp.truncatedNames[fullTruncatedName]++

	// now read the count; it will be at least 1
	seenCount := fimp.truncatedNames[fullTruncatedName]

	if len(titleWithoutExt) > maxLength {
		// a uniqueness suffix is only inserted (before the extension) if the
		// truncated filename has not already been seen in our walk, so if this
		// is the first (or only) occurrence, just return the truncated filename
		if seenCount == 1 {
			return fullTruncatedName
		}

		// otherwise, insert the uniqueness suffix between the truncated filename and
		// the extension; use seenCount-1 because the first instance doesn't have a
		// "uniqueness suffix (N)", the second one has "(1)", third has "(2)", etc;
		// it's how many times we've *already* seen this name before this
		return fmt.Sprintf("%s(%d)%s", truncatedTitleWithDir, seenCount-1, titleExt)
	}

	// short filenames are great... so simple (I think)
	// NOTE(review): this returns the untransformed Title ("&" and "?" intact),
	// whereas the truncated path above uses the transformed name -- confirm
	// this matches how Takeout names short files containing special characters
	return path.Join(dir, itemMeta.Title)
}
// isBadTimestamp tries to detect timestamps that are bad/corrupted, which would generally come from
// embedded metadata like EXIF or XMP, where either there is a parser bug or actual corruption. I have
// encountered both on my data sets, and I've encountered these specific situations.
// The processor will actually strip timestamps that are invalid (like, year is super out-of-range and
// can't be serialized by JSON), but in the case of a corrupt offset (TZ), it will only strip the offset;
// but in our case we can do better than that probably, since the sidecar json file usually has a valid
// and correct timestamp in the rare case the EXIF/XMP data is wrong. So we want to prefer the timestamp
// from the JSON when we detect a timestamp that the processor may still consider valid, but which we
// assume is probably wrong. For example: future year, exactly midnight on new years, or corrupted
// offset. In these cases, the timestamp from JSON should be preferred. In order to prefer the JSON
// timestamp, we need to clear any bad, embedded timestamp, since otherwise it will be preferred.
func isBadTimestamp(t time.Time) bool {
futureYear := t.Year() > time.Now().Year()
exactlyMidnightOnNewYears := t.Month() == time.January && t.Day() == 1 && t.Hour() == 0 && t.Minute() == 0 && t.Second() == 0
const maxTimezoneOffsetSecFromUTC = 50400 // most distant time zone from UTC is apparently +-14 hours
_, offsetSec := t.Zone()
offsetCorrupted := offsetSec > maxTimezoneOffsetSecFromUTC || offsetSec < -maxTimezoneOffsetSecFromUTC
return t.IsZero() || futureYear || exactlyMidnightOnNewYears || offsetCorrupted
}