/*
|
|
Timelinize
|
|
Copyright (c) 2013 Matthew Holt
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU Affero General Public License as published
|
|
by the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU Affero General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Affero General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package googlephotos
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"io/fs"
|
|
"path"
|
|
"path/filepath"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/maruel/natural"
|
|
"github.com/timelinize/timelinize/datasources/media"
|
|
"github.com/timelinize/timelinize/timeline"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// googlePhotosPath is the slash-separated subpath inside a Takeout archive
// under which all Google Photos content (album folders) lives.
const googlePhotosPath = "Takeout/Google Photos"
|
|
|
|
// listFromTakeoutArchive enumerates the album folders in a Google Photos
// Takeout export rooted at dirEntry, and feeds every album item through
// processAlbumItem. It resumes from a checkpoint (a file path within the
// archive) if one is present in opt, and resets the truncated-filename
// counters that determineMediaFilenameInArchive relies on.
func (fimp *FileImporter) listFromTakeoutArchive(ctx context.Context, opt timeline.ImportParams, dirEntry timeline.DirEntry) error {
	// fresh per-import counters for reconstructing Google's "(N)" uniqueness
	// suffixes on truncated filenames (see determineMediaFilenameInArchive)
	fimp.truncatedNames = make(map[string]int)

	// the checkpoint, if set, is the archive-relative path of the last item
	// that was sent down the pipeline before the previous run stopped
	var checkpoint string
	if opt.Checkpoint != nil {
		err := json.Unmarshal(opt.Checkpoint, &checkpoint)
		if err != nil {
			return fmt.Errorf("decoding checkpoint: %w", err)
		}
	}

	albumFolders, err := fs.ReadDir(dirEntry.FS, dirEntry.Filename)
	if err != nil {
		return fmt.Errorf("getting album list from %s: %w", googlePhotosPath, err)
	}

	// We don't use Walk() because we need to control the order in which we read
	// the files. It's quite niche, but I ran into it with my very first import
	// test: filenames that are more than 47 characters, where the first 47 chars
	// are all the same, are ambiguous when it comes to pairing the media file and
	// the metadata sidecar file (.json), because Google truncates long filenames for
	// some reason without an obvious way to undo the truncation deterministically.
	// Before truncating, Google apparently sorts filenames in a folder by "natural
	// sort", but Walk uses lexical sort. So we read the dir listings ourselves and
	// sort album contents with a natural sort in order and remember truncated file
	// names we've seen in order to hopefully accurately link a JSON file to its
	// associated media file, and thus generate the same retrieval key for both
	// files. This is needed because we can't be guaranteed that the media file and
	// its sidecar will even be in the same archive/import; so the retrieval key
	// lets us import partial item data as we discover it, but it HAS to be the
	// same, and we use the filename for that, so we HAVE to reliably compute it.
	for _, albumFolder := range albumFolders {
		if err := ctx.Err(); err != nil {
			return err
		}
		if !albumFolder.IsDir() {
			continue
		}

		thisAlbumFolderPath := path.Join(dirEntry.Filename, albumFolder.Name())

		// album metadata failures are deliberately non-fatal: we continue with
		// a zero-value albumMeta because the items themselves can still be
		// imported (the metadata may simply live in a different archive part)
		albumMeta, err := fimp.readAlbumMetadata(dirEntry, thisAlbumFolderPath)
		if err != nil {
			if errors.Is(err, fs.ErrNotExist) {
				opt.Log.Warn("album metadata not found; maybe it is in another archive or this folder is not an album",
					zap.String("folder_path", thisAlbumFolderPath),
					zap.Error(err))
			} else {
				opt.Log.Error("could not open album metadata",
					zap.String("folder_path", thisAlbumFolderPath),
					zap.Error(err))
			}
		}

		// read album folder contents, then sort in what I think is the same way
		// Google does before truncating long filenames -- this is crucial to
		// matching up filenames correctly (metadata + media files)
		albumItems, err := fs.ReadDir(dirEntry.FS, thisAlbumFolderPath)
		if err != nil {
			return fmt.Errorf("reading album directory: %w", err)
		}
		sort.Slice(albumItems, func(i, j int) bool {
			iName, jName := albumItems[i].Name(), albumItems[j].Name()
			iNameNoExt, jNameNoExt := strings.TrimSuffix(iName, path.Ext(iName)), strings.TrimSuffix(jName, path.Ext(jName))
			// first sort by length
			if len(iNameNoExt) != len(jNameNoExt) {
				return len(iNameNoExt) < len(jNameNoExt)
			}
			// then use natural sort; i.e. [a1, a20, a10] => [a1, a10, a20]
			return natural.Less(albumItems[i].Name(), albumItems[j].Name())
		})

		for _, d := range albumItems {
			// make pauses more responsive
			if err := opt.Continue(); err != nil {
				return err
			}

			fpath := path.Join(thisAlbumFolderPath, d.Name())
			if checkpoint != "" {
				// NOTE(review): items skipped here never reach
				// determineMediaFilenameInArchive, so fimp.truncatedNames is not
				// incremented for them; verify that "(N)" suffix reconstruction
				// is still correct when resuming mid-folder.
				if fpath != checkpoint {
					continue // keep going until we find the checkpoint position
				}
				checkpoint = "" // at the checkpoint; clear it so we process all further items
			}
			if err := fimp.processAlbumItem(ctx, albumMeta, thisAlbumFolderPath, d, opt, dirEntry); err != nil {
				return fmt.Errorf("processing album item '%s': %w", fpath, err)
			}
		}
	}

	return nil
}
|
|
|
|
// processAlbumItem handles a single directory entry within an album folder:
// it skips non-item files (album metadata, directories, live-photo sidecar
// videos), decodes JSON metadata sidecars, builds an item graph, assigns
// field-update policies appropriate to whether this is the sidecar or the
// media file itself, links "-edited" variants, and sends the graph down the
// processing pipeline with this file's path as the checkpoint.
func (fimp *FileImporter) processAlbumItem(ctx context.Context, albumMeta albumArchiveMetadata, folderPath string, d fs.DirEntry, opt timeline.ImportParams, dirEntry timeline.DirEntry) error {
	if err := ctx.Err(); err != nil {
		return err
	}
	// skip the album metadata (it is consumed separately)
	// TODO: Also skip/use print-subscriptions.json, shared_album_comments.json, user-generated-memory-titles.json,... I guess? I haven't seen those though
	if d.Name() == albumMetadataFilename {
		return nil
	}

	// skip directories (there shouldn't be any in the first place... since Google Photos doesn't support sub-albums)
	if d.IsDir() {
		return nil
	}

	fpath := path.Join(folderPath, d.Name())

	// skip sidecar movie files ("live photos") because we'll connect them when
	// we process the actual photograph (hopefully they're in the same archive!)
	if media.IsSidecarVideo(dirEntry.FS, fpath) {
		return nil
	}

	f, err := dirEntry.FS.Open(fpath)
	if err != nil {
		return err
	}
	defer f.Close()

	var itemMeta mediaArchiveMetadata

	// this could be either the media file itself, or a metadata sidecar file; we
	// need the path to the media file, so start by assuming that's what this is
	mediaFilePath := fpath

	// if this is a JSON sidecar file, get the metadata it contains
	if path.Ext(fpath) == ".json" {
		err = json.NewDecoder(f).Decode(&itemMeta)
		if err != nil {
			return fmt.Errorf("decoding item metadata file %s: %w", fpath, err)
		}

		// I've heard that some JSON files in albums (other than the album metadata)
		// might be something else, so as a quick sanity check make sure it contained
		// what I presume is required info
		if itemMeta.Title == "" || itemMeta.URL == "" {
			return nil
		}

		// we don't totally trust the timstamp in the metadata file, but we'll
		// take it in case the actual media file doesn't contain any
		// (errNoTimestamp is tolerated: it just means no sidecar timestamp)
		itemMeta.parsedPhotoTakenTime, err = itemMeta.timestamp()
		if err != nil && !errors.Is(err, errNoTimestamp) {
			return fmt.Errorf("parsing timestamp from item %s: %w", fpath, err)
		}

		// compute the archive path of the media file this sidecar describes,
		// so both files end up with the same retrieval key
		mediaFilePath = fimp.determineMediaFilenameInArchive(fpath, itemMeta)
		opt.Log.Debug("mapped sidecar to target media file",
			zap.String("sidecar_file", fpath),
			zap.String("target_file", mediaFilePath))
	} else {
		// an actual media file: record where its bytes can be read from
		itemMeta.source = dirEntry
	}

	ig := fimp.makeItemGraph(mediaFilePath, itemMeta, albumMeta, opt)

	// ensure item is within configured timeframe before continuing
	if !opt.Timeframe.Contains(ig.Item.Timestamp) {
		opt.Log.Debug("item is outside timeframe", zap.String("filename", fpath))
		return nil
	}

	// Between the JSON file and the actual media file, we typically prefer the
	// filename in the JSON file and everything else that overlaps in the media
	// file, since Google's metadata is known to be wrong sometimes (!?). However,
	// in a rare singular case of corrupted input, I have found non-nil timestamp
	// data that was completely wrong in the mvhd box of an MP4 file, captured on
	// an Android phone, with several other videos even that same hour that were
	// correct / not corrupted. The corrupted timestamp was 4165689599 (confirmed
	// via ffprobe), which apparently equates to 2036-01-01, but should have been
	// 2016-11-27. (The time was also truncated.) I can't explain the corruption.
	// I think in general, photos and videos from Google Takeout aren't from the
	// future, and probably aren't RIGHT at midnight on New Years (okay to be fair,
	// that's not so unlikely) -- maybe we can prefer the metadata timestamp in
	// those cases; though I'm not sure if this heuristic is reliable.
	if path.Ext(fpath) == ".json" {
		// metadata file should have good filename and metadata, but we prefer
		// the embedded timestamp if possible
		ig.Item.Retrieval.FieldUpdatePolicies = map[string]timeline.FieldUpdatePolicy{
			"filename":         timeline.UpdatePolicyOverwriteExisting,
			"metadata":         timeline.UpdatePolicyPreferIncoming, // applied per-key, so keys unique to this file will be kept
			"timestamp":        timeline.UpdatePolicyPreferExisting,
			"timespan":         timeline.UpdatePolicyPreferExisting,
			"timeframe":        timeline.UpdatePolicyPreferExisting,
			"time_offset":      timeline.UpdatePolicyPreferExisting,
			"time_uncertainty": timeline.UpdatePolicyPreferExisting,
			"latlon":           timeline.UpdatePolicyPreferExisting,
			"altitude":         timeline.UpdatePolicyPreferExisting,
		}
	} else {
		// always use the embedded timestamp, unless it looks like it is bad (I've encountered
		// several corrupt or very wrong embedded timestamps that actually cause UI bugs b/c
		// they're so wrong they can't be serialized to JSON) -- the processor will also try to
		// clear them, but in our case there are timestamps that generically "look valid", yet
		// we can know are invalid, and in those cases we can likely lean on the timestamp in
		// the JSON file, so we just need to adjust the update policy for timestamps based on
		// what we can infer about the timestamp
		tsUpdatePolicy := timeline.UpdatePolicyOverwriteExisting
		if isBadTimestamp(ig.Item.Timestamp) {
			tsUpdatePolicy = timeline.UpdatePolicyKeepExisting
			ig.Item.Timestamp = time.Time{}
		}
		ig.Item.Retrieval.FieldUpdatePolicies = map[string]timeline.FieldUpdatePolicy{
			"data":                  timeline.UpdatePolicyOverwriteExisting,
			"original_location":     timeline.UpdatePolicyOverwriteExisting,
			"intermediate_location": timeline.UpdatePolicyOverwriteExisting,
			"filename":              timeline.UpdatePolicyPreferExisting,
			"metadata":              timeline.UpdatePolicyPreferIncoming,
			"timestamp":             tsUpdatePolicy,
			"timespan":              tsUpdatePolicy,
			"timeframe":             tsUpdatePolicy,
			"time_offset":           tsUpdatePolicy,
			"time_uncertainty":      tsUpdatePolicy,
			"latlon":                timeline.UpdatePolicyPreferIncoming,
			"altitude":              timeline.UpdatePolicyPreferIncoming,
		}

		// pair a motion-photo ("live photo") sidecar video with this media file
		media.ConnectMotionPhoto(opt.Log, dirEntry, mediaFilePath, ig)
	}

	// if item has an "-edited" variant, relate it
	ext := path.Ext(mediaFilePath)
	editedPath := strings.TrimSuffix(mediaFilePath, ext) + "-edited" + ext
	if dirEntry.FileExists(editedPath) {
		mediaFilePath = editedPath
		edited := fimp.makeItemGraph(mediaFilePath, itemMeta, albumMeta, opt)
		ig.ToItem(timeline.RelEdit, edited.Item)
	}

	// remember this path so an interrupted import can resume here
	ig.Checkpoint = fpath

	opt.Pipeline <- ig

	return nil
}
|
|
|
|
// makeItemGraph builds the item graph for a media file at mediaFilePath,
// combining sidecar metadata (itemMeta) and album metadata (albumMeta).
// If itemMeta.source is set (i.e. we are on the actual media file, not the
// JSON sidecar), the item gets a data loader and metadata extracted from the
// file itself. The item's retrieval key is derived from the data source name,
// the export ID, and the media file path so that the sidecar and the media
// file — possibly imported from different archive parts — merge into one item.
func (fimp *FileImporter) makeItemGraph(mediaFilePath string, itemMeta mediaArchiveMetadata, albumMeta albumArchiveMetadata, opt timeline.ImportParams) *timeline.Graph {
	item := &timeline.Item{
		Classification: timeline.ClassMedia,
		// timestamp is not set here (we prefer timestamp embedded in file itself first, below)
		Location:             itemMeta.location(),
		IntermediateLocation: mediaFilePath,
		Content: timeline.ItemData{
			Filename: itemMeta.Title,
		},
		Metadata: timeline.Metadata{
			"Description":  itemMeta.Description,
			"Local folder": itemMeta.GooglePhotosOrigin.MobileUpload.DeviceFolder.LocalFolderName,
			"Device type":  itemMeta.GooglePhotosOrigin.MobileUpload.DeviceType,
			"Views":        itemMeta.ImageViews,
			"URL":          itemMeta.URL,
		},
	}
	if itemMeta.source.FS != nil {
		// don't send filename since we can't trust the filename we have here;
		// Google Takeout likes to truncate them, and also remove/replace special
		// characters without any indication of the original filename

		item.Content.Data = func(_ context.Context) (io.ReadCloser, error) {
			return itemMeta.source.FS.Open(path.Join(itemMeta.source.Filename, mediaFilePath))
		}

		// add metadata contained in the image file itself; note that this overwrites any overlapping
		// metadata that has already been filled in -- except timestamp which is not in the Metadata
		// field; however, apparently (according to the PhotoStructure devs), the timestamp in the
		// actual photo file is often more accurate than any in a sidecar metadata file, so prefer
		// the embedded timestamp first, and if there isn't one, then use the sidecar data
		_, err := media.ExtractAllMetadata(opt.Log, itemMeta.source.FS, path.Join(itemMeta.source.Filename, mediaFilePath), item, timeline.MetaMergeReplaceEmpty)
		if err != nil {
			opt.Log.Warn("extracting metadata", zap.Error(err))
		}
	}

	// set a timestamp if we only have the metadata file
	if item.Timestamp.IsZero() {
		item.Timestamp = itemMeta.parsedPhotoTakenTime
	}

	// the retrieval key is crucial so that we can store what data we have from an item
	// as we get it, without getting the whole item, even across different imports; it
	// consists of the data source name to avoid conflicts with other DSes, the name of
	// the archive (with the index part removed, of course, since a metadata file in
	// -001.zip might have its media file in -002.zip, but they should have the same
	// retrieval key; this does rely on them not being renamed), and the expected path
	// of the media file within the archive (if we're on the media file, it's just that
	// path, but if we're on the sidecar JSON file, we have to construct it with heuristics
	// since Google's naming convention isn't documented)
	archiveName := fimp.exportIDFromArchiveFilename()
	retKey := fmt.Sprintf("%s::%s::%s", dataSourceName, archiveName, mediaFilePath)
	item.Retrieval.SetKey(retKey)

	// since we don't know the filename if we are on the picture file,
	// and we don't know the data if we are on the metadata file, tell
	// the processor that a nil value of these means that we don't know
	// what it is, rather than us asserting that it's intentionally nil
	// (this is crucial to allow us to process takeouts with duplicates
	// without having duplicates in the timeline)
	item.Retrieval.UniqueConstraints = map[string]bool{
		"filename": item.Content.Filename != "",
		"data":     item.Content.Data != nil,
	}

	ig := &timeline.Graph{Item: item}

	// add to album/collection
	if albumMeta.Title != "" || albumMeta.Description != "" {
		// prefer title, but use description if that's all we have for some reason
		albumTitle := albumMeta.Title
		if albumTitle == "" {
			albumTitle = albumMeta.Description
			albumMeta.Description = ""
		}
		ig.ToItem(timeline.RelInCollection, &timeline.Item{
			Classification: timeline.ClassCollection,
			Content: timeline.ItemData{
				Data: timeline.StringData(albumTitle),
			},
			Owner: item.Owner,
			Metadata: timeline.Metadata{
				"Description": albumMeta.Description,
			},
		})
	}

	// relate each tagged person as an entity, identified by their Google Photos name
	for _, person := range itemMeta.People {
		ig.ToEntity(timeline.RelIncludes, &timeline.Entity{
			Name: person.Name,
			Attributes: []timeline.Attribute{
				{
					Name:     "google_photos_name",
					Value:    person.Name,
					Identity: true,
				},
			},
		})
	}

	return ig
}
|
|
|
|
// exportIDFromArchiveFilename returns the name of the archive without the positional
|
|
// index(es) and without the extension. It assumes a Takeout archive filename that has
|
|
// NOT been renamed.
|
|
//
|
|
// A couple examples: given an import filepath of
|
|
// "/foo/takeout-20240516T230250Z-003.zip/Takeout/Google Photos", this returns
|
|
// "takeout-20240516T230250Z", which seems to be a unique identifier for the particular
|
|
// export this archive is a part of. For newer/larger (~Q3 2025) takeouts, an import
|
|
// filepath of "/foo/takeout-20250921T1994402Z-3-009.zip/Takeout/Google Photos" (notice
|
|
// this has another component in the archive filename) returns "takeout-20250921T1994402Z",
|
|
// which is the export ID.
|
|
//
|
|
// The archive name is extracted from the import path, trimming the Google Photos subpath
|
|
// ("Takeout/Google Photos"). The archive filename is not strictly parsed; it quite naively
|
|
// just uses the name up to the second "-", as long as whatever is before the second "-"
|
|
// is the same for all archives in the group.)
|
|
func (fimp *FileImporter) exportIDFromArchiveFilename() string {
|
|
// For "/foo/takeout-20240516T230250Z-003.zip/Takeout/Google Photos", strip the
|
|
// "Takeout/Google Photos" suffix to terminate the path at the root of the archive
|
|
base := filepath.Base(strings.TrimSuffix(fimp.filename, googlePhotosPath))
|
|
firstDashPos := strings.Index(base, "-")
|
|
if firstDashPos < 0 {
|
|
return base
|
|
}
|
|
secondDashPosRelative := strings.Index(base[firstDashPos+1:], "-")
|
|
if secondDashPosRelative <= 0 {
|
|
return base
|
|
}
|
|
absoluteSecondDashPos := firstDashPos + 1 + secondDashPosRelative
|
|
return base[:absoluteSecondDashPos]
|
|
}
|
|
|
|
func (fimp *FileImporter) readAlbumMetadata(d timeline.DirEntry, albumFolderPath string) (albumArchiveMetadata, error) {
|
|
albumMetadataFilePath := path.Join(d.Filename, albumFolderPath, albumMetadataFilename)
|
|
albumMetadataFile, err := d.FS.Open(albumMetadataFilePath)
|
|
if err != nil {
|
|
return albumArchiveMetadata{}, fmt.Errorf("opening metadata file %s: %w", albumMetadataFilename, err)
|
|
}
|
|
defer albumMetadataFile.Close()
|
|
|
|
var albumMeta albumArchiveMetadata
|
|
err = json.NewDecoder(albumMetadataFile).Decode(&albumMeta)
|
|
if err != nil {
|
|
return albumArchiveMetadata{}, fmt.Errorf("decoding album metadata file %s: %w", albumMetadataFilename, err)
|
|
}
|
|
|
|
return albumMeta, nil
|
|
}
|
|
|
|
// albumMetadataFilename is the well-known name of the per-album metadata file
// inside each album folder of a Takeout archive.
const albumMetadataFilename = "metadata.json"
|
|
|
|
// albumArchiveMetadata is the decoded form of an album's metadata.json file
// from a Google Photos Takeout archive. Field names and JSON tags mirror the
// (undocumented) Takeout export format.
type albumArchiveMetadata struct {
	Title       string `json:"title"`
	Description string `json:"description"`
	Access      string `json:"access"`
	// Date holds the album date; Timestamp is a Unix-seconds string.
	Date struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"date"`
	// GeoData is the album-level geographic bounding info.
	GeoData struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoData"`
	// SharedAlbumComments lists comments left on a shared album, if any.
	SharedAlbumComments []struct {
		Text         string `json:"text,omitempty"`
		CreationTime struct {
			Timestamp string `json:"timestamp"`
			Formatted string `json:"formatted"`
		} `json:"creationTime"`
		ContentOwnerName string `json:"contentOwnerName"`
		Liked            bool   `json:"liked,omitempty"`
	} `json:"sharedAlbumComments"`
}
|
|
|
|
// mediaArchiveMetadata is the decoded form of a media item's JSON sidecar file
// from a Google Photos Takeout archive. Field names and JSON tags mirror the
// (undocumented) Takeout export format. The unexported fields at the bottom
// carry import-time state, not JSON data.
type mediaArchiveMetadata struct {
	// Title is the original (untruncated) filename of the media item.
	Title       string `json:"title"`
	Description string `json:"description"`
	ImageViews  string `json:"imageViews"`
	// CreationTime is when the item was added to Google Photos
	// (Timestamp is a Unix-seconds string).
	CreationTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"creationTime"`
	// PhotoTakenTime is the capture time, preferred over CreationTime.
	PhotoTakenTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"photoTakenTime"`
	// GeoData is Google's (possibly user-edited) location for the item.
	GeoData struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoDataExif"`
	// GeoDataExif is the location as read from the file's EXIF data;
	// used as a fallback when GeoData has zero coordinates.
	GeoDataExif struct {
		Latitude      float64 `json:"latitude"`
		Longitude     float64 `json:"longitude"`
		Altitude      float64 `json:"altitude"`
		LatitudeSpan  float64 `json:"latitudeSpan"`
		LongitudeSpan float64 `json:"longitudeSpan"`
	} `json:"geoDataExif"`
	// People lists the people Google recognized/tagged in the item.
	People []struct {
		Name string `json:"name"`
	} `json:"people"`
	URL string `json:"url"`
	// GooglePhotosOrigin describes how the item got into Google Photos.
	GooglePhotosOrigin struct {
		MobileUpload struct {
			DeviceFolder struct {
				LocalFolderName string `json:"localFolderName"`
			} `json:"deviceFolder"`
			DeviceType string `json:"deviceType"`
		} `json:"mobileUpload"`
		Composition struct {
			Type string `json:"type"`
		} `json:"composition"`
	} `json:"googlePhotosOrigin"`
	PhotoLastModifiedTime struct {
		Timestamp string `json:"timestamp"`
		Formatted string `json:"formatted"`
	} `json:"photoLastModifiedTime"`

	// parsedPhotoTakenTime is the timestamp() result, filled in at import time.
	parsedPhotoTakenTime time.Time
	source               timeline.DirEntry // the parent DirEntry (not representing the actual file itself; the one we're starting the import from)
}
|
|
|
|
func (m mediaArchiveMetadata) location() timeline.Location {
|
|
loc := timeline.Location{}
|
|
if m.GeoData.Latitude != 0 {
|
|
loc.Latitude = &m.GeoData.Latitude
|
|
}
|
|
if m.GeoData.Longitude != 0 {
|
|
loc.Longitude = &m.GeoData.Longitude
|
|
}
|
|
if m.GeoData.Altitude != 0 {
|
|
loc.Altitude = &m.GeoData.Altitude
|
|
}
|
|
if loc.Latitude == nil && m.GeoDataExif.Latitude != 0 {
|
|
loc.Latitude = &m.GeoDataExif.Latitude
|
|
}
|
|
if loc.Longitude == nil && m.GeoDataExif.Longitude != 0 {
|
|
loc.Longitude = &m.GeoDataExif.Longitude
|
|
}
|
|
if loc.Altitude == nil && m.GeoDataExif.Altitude != 0 {
|
|
loc.Altitude = &m.GeoDataExif.Altitude
|
|
}
|
|
return loc
|
|
}
|
|
|
|
// errNoTimestamp is returned by mediaArchiveMetadata.timestamp when the
// sidecar contains none of the three known timestamp fields; callers treat
// it as a non-fatal "no sidecar timestamp" condition.
var errNoTimestamp = errors.New("no timestamp available")
|
|
|
|
// timestamp returns a timestamp derived from the metadata. It first
|
|
// prefers the PhotoTakenTime, then the CreationTime, then the
|
|
// PhotoLastModifiedTime. However, it has been reported by the
|
|
// PhotoStructure team that these timestamps can be wildly wrong,
|
|
// on the order of hours or days. Image metadata may be more reliable.
|
|
func (m mediaArchiveMetadata) timestamp() (time.Time, error) {
|
|
ts := m.PhotoTakenTime.Timestamp
|
|
if ts == "" {
|
|
// if a photo is in multiple albums/folders, this can be different between the two
|
|
ts = m.CreationTime.Timestamp
|
|
}
|
|
if ts == "" {
|
|
ts = m.PhotoLastModifiedTime.Timestamp
|
|
}
|
|
if ts == "" {
|
|
return time.Time{}, errNoTimestamp
|
|
}
|
|
parsed, err := strconv.ParseInt(ts, 10, 64)
|
|
if err != nil {
|
|
return time.Time{}, err
|
|
}
|
|
// timestamp represents UTC (no offset), so call UTC() since Unix() defaults to local offset
|
|
return time.Unix(parsed, 0).UTC(), nil
|
|
}
|
|
|
|
// determineMediaFilenameInArchive returns the path to the media file in the archive
|
|
// that is associated with the given JSON sidecar metadata filepath.
|
|
//
|
|
// Google Photos export truncates long filenames. This function uses a lexical approach
|
|
// with the help of some count state to assemble the image filename that can be used to
|
|
// read it in the archive.
|
|
func (fimp *FileImporter) determineMediaFilenameInArchive(jsonFilePath string, itemMeta mediaArchiveMetadata) string {
|
|
// target media file will be in the same directory
|
|
dir := path.Dir(jsonFilePath)
|
|
|
|
// the metadata contains the original filename; we use that to compute
|
|
// what we hope is the filename in the archive based on... experience
|
|
// (none of this is documented, but there's some writeups at TODO: link...)
|
|
titleExt := path.Ext(itemMeta.Title)
|
|
transformedTitle := strings.ReplaceAll(itemMeta.Title, "&", "_")
|
|
transformedTitle = strings.ReplaceAll(transformedTitle, "?", "_")
|
|
titleWithoutExt := strings.TrimSuffix(transformedTitle, titleExt)
|
|
|
|
// Google truncates filenames longer than this (sans extension)
|
|
const maxLength = 47
|
|
|
|
// truncating filenames obviously introduces the chance of filename
|
|
// collisions, if multiple files have the same long prefix; additionally,
|
|
// they may also collide with a file whose entire name is the prefix (i.e.
|
|
// collision with a file that is exactly the max length that does not get
|
|
// truncated) -- for that reason, we need to count how many times we see
|
|
// each filename up to the max length -- including path since each folder
|
|
// has a distinct file list -- even if the name is not longer than the
|
|
// max length.
|
|
|
|
// if the filename is long enough, Google truncates it, so we need
|
|
// to reconstruct it; this depends on the order we're reading the files,
|
|
// because Google auto-increments a "uniqueness suffix" in the form of
|
|
// "(N)" where N is how many times that truncated filename has already
|
|
// appeared before this.
|
|
truncateAt := min(maxLength, len(titleWithoutExt))
|
|
truncatedTitle := titleWithoutExt[:truncateAt]
|
|
truncatedTitleWithDir := path.Join(dir, truncatedTitle)
|
|
fullTruncatedName := truncatedTitleWithDir + titleExt
|
|
|
|
// then count this "hit" for the name
|
|
fimp.truncatedNames[fullTruncatedName]++
|
|
|
|
// now read the count; it will be at least 1
|
|
seenCount := fimp.truncatedNames[fullTruncatedName]
|
|
|
|
if len(titleWithoutExt) > maxLength {
|
|
// a uniqueness suffix is only inserted (before the extension) if the
|
|
// truncated filename has not already been seen in our walk, so if this
|
|
// is the first (or only) occurrence, just return the truncated filename
|
|
if seenCount == 1 {
|
|
return fullTruncatedName
|
|
}
|
|
|
|
// otherwise, insert the uniqueness suffix between the truncated filename and
|
|
// the extension; use seenCount-1 because the first instance doesn't have a
|
|
// "uniqueness suffix (N)", the second one has "(1)", third has "(2)", etc;
|
|
// it's how many times we've *already* seen this name before this
|
|
return fmt.Sprintf("%s(%d)%s", truncatedTitleWithDir, seenCount-1, titleExt)
|
|
}
|
|
|
|
// short filenames are great... so simple (I think)
|
|
return path.Join(dir, itemMeta.Title)
|
|
}
|
|
|
|
// isBadTimestamp tries to detect timestamps that are bad/corrupted, which would generally come from
|
|
// embedded metadata like EXIF or XMP, where either there is a parser bug or actual corruption. I have
|
|
// encountered both on my data sets, and I've encountered these specific situations.
|
|
// The processor will actually strip timestamps that are invalid (like, year is super out-of-range and
|
|
// can't be serialized by JSON), but in the case of a corrupt offset (TZ), it will only strip the offset;
|
|
// but in our case we can do better than that probably, since the sidecar json file usually has a valid
|
|
// and correct timestamp in the rare case the EXIF/XMP data is wrong. So we want to prefer the timestamp
|
|
// from the JSON when we detect a timestamp that the processor may still consider valid, but which we
|
|
// assume is probably wrong. For example: future year, exactly midnight on new years, or corrupted
|
|
// offset. In these cases, the timestamp from JSON should be preferred. In order to prefer the JSON
|
|
// timestamp, we need to clear any bad, embedded timestamp, since otherwise it will be preferred.
|
|
func isBadTimestamp(t time.Time) bool {
|
|
futureYear := t.Year() > time.Now().Year()
|
|
exactlyMidnightOnNewYears := t.Month() == time.January && t.Day() == 1 && t.Hour() == 0 && t.Minute() == 0 && t.Second() == 0
|
|
|
|
const maxTimezoneOffsetSecFromUTC = 50400 // most distant time zone from UTC is apparently +-14 hours
|
|
_, offsetSec := t.Zone()
|
|
offsetCorrupted := offsetSec > maxTimezoneOffsetSecFromUTC || offsetSec < -maxTimezoneOffsetSecFromUTC
|
|
|
|
return t.IsZero() || futureYear || exactlyMidnightOnNewYears || offsetCorrupted
|
|
}
|