1
0
Fork 0
timelinize/datasources/media/media.go

446 lines
14 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Package media implements a data source for general media files (photos, videos, and audio)
// that may have come from a camera roll, photo library, etc.
package media
import (
"context"
"fmt"
"io"
"io/fs"
"path"
"strings"
"sync"
"github.com/timelinize/timelinize/datasources/generic"
"github.com/timelinize/timelinize/timeline"
"go.uber.org/zap"
)
// init registers the media data source with the timeline package when this
// package is loaded. A registration error is logged at fatal level.
func init() {
	source := timeline.DataSource{
		Name:            "media",
		Title:           "Media",
		Icon:            "media.png",
		NewOptions:      func() any { return new(Options) },
		NewFileImporter: func() timeline.FileImporter { return new(FileImporter) },
	}
	if err := timeline.RegisterDataSource(source); err != nil {
		timeline.Log.Fatal("registering data source", zap.Error(err))
	}
}
// Options configures the data source.
// It is the type produced by the data source's NewOptions function, and
// FileImport expects params.DataSourceOptions to hold a *Options.
type Options struct {
// We will attempt to extract timestamps for each media item in this order:
// embedded (EXIF/XMP), filepath (if enabled), and as a last resort, file
// modification time (if enabled). If more than one timestamp is available, a
// date range can be specified to help choose one. If specified, a timestamp
// within this range will be preferred over one not in the range. If both
// timestamps are either in or out of the range, then the range will be
// ignored in the choice of timestamp.
//
// Items with chosen timestamps that are outside this range will still be
// processed. To avoid importing items that don't have any timestamp in a
// certain range, use the Timeframe in ListingOptions instead.
DateRange timeline.Timeframe `json:"date_range,omitempty"`
// If true, a timestamp found in the file path may be used.
// The path timestamp replaces the current one only when the current one is
// zero, or when the current one is outside DateRange and the path one is inside.
UseFilePathTime bool `json:"use_filepath_time,omitempty"`
// If true, fall back to file modification time if no other timestamp is found.
// This can be accurate in some cases, like exports/downloads from Google Photos
// of media files like "creations" (gifs, etc), or Apple .MOV files that don't
// have a timestamp embedded within them.
// The same DateRange preference rule as for UseFilePathTime applies.
UseFileModTime bool `json:"use_file_mod_time,omitempty"`
// If true, a collection will be created for each folder with items in it.
FolderIsAlbum bool `json:"folder_is_album,omitempty"`
// The ID of the owner entity. REQUIRED if entity is to be related in DB.
// It is set as the Owner of every imported item.
OwnerEntityID uint64 `json:"owner_entity_id"`
}
// FileImporter can import the data from a file.
// It has no fields; the data source registration creates one with
// new(FileImporter) for each use.
type FileImporter struct{}
// Recognize reports whether the file or folder is recognized.
//
// Directories are never given a confidence here; the returned DirThreshold is
// left for the import planner to decide based on their contents. A regular file
// gets a confidence of 1 when its extension is one ItemClassByExtension knows.
// The returned error is always nil.
func (FileImporter) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
	// The directory threshold is a little lower than usual because photo
	// libraries commonly keep a metadata sidecar file (like .xmp) next to each
	// media file.
	// TODO: Should there be a way to tell the import planner to not count a non-recognized file against this threshold? kind of a "we can use this, just not by itself"?
	recognition := timeline.Recognition{DirThreshold: .45}

	// Only regular files are matched directly, and only by extension
	// (hopefully good enough for now).
	if !dirEntry.IsDir() {
		if _, known := ItemClassByExtension(dirEntry.Name()); known {
			recognition.Confidence = 1
		}
	}

	return recognition, nil
}
// FileImport imports data from the file or folder.
//
// It walks dirEntry.Filename within dirEntry.FS, skipping hidden files and
// folders (names starting with "."), files whose extension is not known to
// ItemClassByExtension, and sidecar videos. Every remaining file is processed
// in its own goroutine (at most 100 at a time): metadata is extracted, a
// timestamp is chosen according to the Options, embedded cover art and motion
// photos are related to the item, album/folder collections are attached, and
// the resulting graph is sent on params.Pipeline.
//
// params.DataSourceOptions must hold a *Options; any other type panics on the
// type assertion. The returned error is whatever stopped the walk, including
// context cancellation; a failure to extract metadata from a single file is
// only logged. FileImport does not return until every goroutine it started has
// finished.
func (imp *FileImporter) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
	dsOpt := params.DataSourceOptions.(*Options)

	owner := timeline.Entity{ID: dsOpt.OwnerEntityID}

	// collections created so far, keyed by album name or folder path;
	// shared by all worker goroutines, hence the mutex
	collections := make(map[string]*timeline.Item)
	var collectionsMu sync.Mutex

	// processing files in parallel can greatly speed up imports,
	// but use a throttle to avoid unbounded goroutines
	const maxGoroutines = 100
	throttle := make(chan struct{}, maxGoroutines)

	var wg sync.WaitGroup

	// prevent subtle bug: we spawn goroutines which send graphs down the pipeline;
	// if we return before they finish sending a value, they'll get deadlocked
	// since the workers are stopped once we return, meaning their send will never
	// get received; thus, we need to wait for the goroutines to finish before we
	// return
	defer wg.Wait()

	err := fs.WalkDir(dirEntry.FS, dirEntry.Filename, func(fpath string, d fs.DirEntry, err error) error {
		if err != nil {
			return err
		}
		if err := ctx.Err(); err != nil {
			return err
		}
		if strings.HasPrefix(d.Name(), ".") {
			// skip hidden files & folders
			if d.IsDir() {
				return fs.SkipDir
			}
			return nil
		}
		if d.IsDir() {
			return nil // traverse into subdirectories
		}

		// // get the extension and directory; but this is nuanced if it's a single file, which comes up as "." in the walk
		// fname, itemPath := fpath, fpath
		// if fpath == "." {
		// 	if _, ok := fsys.(archives.FileFS); ok {
		// 		fname = filename
		// 		itemPath = filepath.Base(filename)
		// 	}
		// }

		// TODO: a lot of this logic is duplicated by the iCloud importer, except:
		// the concurrency model is a little different; we don't discover items by
		// traversing a folder (they are listed in a CSV file); we don't assign
		// timestamps from folder paths or modtime; collection membership is different;
		// etc. Come to think of it, we might need to move a lot of this logic into
		// the Google Photos importer as well.

		class, supported := ItemClassByExtension(d.Name())
		if !supported {
			// skip unsupported files by filename extension (naive, but hopefully OK)
			params.Log.Debug("skipping unrecognized file",
				zap.String("filename", fpath),
				zap.String("dir_entry_name", d.Name()))
			return nil
		}

		// if this is a sidecar video for a photograph (a motion picture or live photo),
		// skip it, since we'll come back to this when we get to the photograph file itself
		if IsSidecarVideo(dirEntry.FS, fpath) {
			return nil
		}

		// blocks while maxGoroutines workers are already running
		throttle <- struct{}{}
		wg.Add(1)

		go func() {
			defer func() {
				<-throttle
				wg.Done()
			}()

			if ctx.Err() != nil {
				return
			}

			item := &timeline.Item{
				Classification:       class,
				Owner:                owner,
				IntermediateLocation: fpath,
				Content: timeline.ItemData{
					Filename: d.Name(),
					Data: func(_ context.Context) (io.ReadCloser, error) {
						return dirEntry.FS.Open(fpath)
					},
				},
			}

			// get as much metadata as possible; a failure is logged but the
			// item is still imported with whatever could be gathered
			pic, err := ExtractAllMetadata(params.Log, dirEntry.FS, fpath, item, timeline.MetaMergeAppend)
			if err != nil {
				params.Log.Warn("extracting metadata",
					zap.String("file", fpath),
					zap.Error(err))
			}

			// try to extract time from filepath (media files are often organized by date)
			if dsOpt.UseFilePathTime {
				fpathTimestamp, err := generic.TimestampFromFilePath(fpath)
				if err == nil {
					// use the filepath timestamp if the embedded timestamp is missing OR
					// the embedded timestamp is not within the specified range but the
					// filepath timestamp IS within range
					if item.Timestamp.IsZero() ||
						(!dsOpt.DateRange.Contains(item.Timestamp) && dsOpt.DateRange.Contains(fpathTimestamp)) {
						item.Timestamp = fpathTimestamp
					}
				}
			}

			// as a last resort, use file modification time as a timestamp; I've seen this
			// be accurate for downloads from Google Photos where the timestamp is otherwise
			// missing: common with iPhone videos, gifs/animations, and other "creations"
			if dsOpt.UseFileModTime {
				info, err := d.Info()
				if err == nil {
					// use the modtime if the timestamp is still missing OR if
					// the preferred timestamp is not within the specified range
					// and this one is
					modTime := info.ModTime()
					if item.Timestamp.IsZero() ||
						(!dsOpt.DateRange.Contains(item.Timestamp) && dsOpt.DateRange.Contains(modTime)) {
						item.Timestamp = modTime
					}
				}
			}

			// Media items are often manually organized and data that was originally
			// not digital (like scanned photos) might not have an exact date or time;
			// thus we can set a timeframe for these items, so instead of an item
			// being on a certain day at precisely midnight 00:00, we can mark it as
			// during that day, without certainty as to what time.
			item.SetTimeframe()

			ig := &timeline.Graph{Item: item}

			// if picture is attached, create related item for that
			if pic != nil && len(pic.Data) > 0 {
				embeddedPicName := "embedded" // you have a better idea?
				if pic.Ext != "" {
					embeddedPicName += "." + pic.Ext
				}

				picItem := &timeline.Item{
					Classification: timeline.ClassMedia,
					Owner:          owner,
					// nest the location under the media file's own path (not the
					// import root) so cover art from different files in the same
					// import does not end up sharing a single location
					IntermediateLocation: path.Join(fpath, embeddedPicName),
					Content: timeline.ItemData{
						Filename:  embeddedPicName,
						MediaType: pic.MIMEType,
						Data:      timeline.ByteData(pic.Data),
					},
					Metadata: timeline.Metadata{
						"Description": pic.Description,
						"Type":        pic.Type,
					},
				}

				// TODO: UI needs to support this
				ig.ToItem(RelCoverArt, picItem)
			}

			ConnectMotionPhoto(params.Log, dirEntry.FS, fpath, ig)

			// now assemble collection info; the mutex only guards the shared
			// collections map, so it is confined to this closure and released
			// before the (potentially blocking) pipeline send below, which would
			// otherwise stall every other worker waiting for the lock
			func() {
				collectionsMu.Lock()
				defer collectionsMu.Unlock()

				// if item is part of an album according to metadata, add it to the collection
				if albumStr, ok := item.Metadata["Album"].(string); ok && albumStr != "" {
					collName := albumStr
					if albumYear, ok := item.Metadata["Year"].(int); ok && albumYear > 0 {
						collName += fmt.Sprintf(" (%d)", albumYear)
					}

					coll, ok := collections[collName]
					if !ok {
						coll = &timeline.Item{
							Classification: timeline.ClassCollection,
							Content: timeline.ItemData{
								Data: timeline.StringData(collName),
							},
						}
					}

					// we can only be certain of the position of the track in the album if a track number
					// is available AND this is disc 1 (that is most common, so by default, we assume there
					// is just 1 disc, if unspecified)
					var position any
					if trackNum, ok := item.Metadata["Track"].(int); ok && trackNum > 0 {
						if disc, ok := item.Metadata["Disc"].(int); !ok || (disc == 0 || disc == 1) {
							position = trackNum
						}
					}

					ig.ToItemWithValue(timeline.RelInCollection, coll, position)

					collections[collName] = coll
				}

				// if enabled, add item to collection based on parent folder name;
				// the folder is derived from this file's own path so that each
				// folder encountered during the walk gets its own collection
				if dsOpt.FolderIsAlbum {
					dir := path.Dir(fpath)

					coll, ok := collections[dir]
					if !ok {
						coll = &timeline.Item{
							Classification: timeline.ClassCollection,
							Content: timeline.ItemData{
								Data: timeline.StringData(path.Base(dir)),
							},
							Owner: owner,
						}
					}

					ig.ToItem(timeline.RelInCollection, coll)

					collections[dir] = coll
				}
			}()

			params.Pipeline <- ig
		}()

		return nil
	})
	if err != nil {
		return err
	}

	// TODO: process the collections too (we still need to upgrade this logic after the schema rewrite)

	return nil
}
// ItemClassByExtension uses the file extension to return a best-guess item classification.
// The extension is compared case-insensitively against the known image, video,
// and audio extensions; any match is classified as media. The boolean result is
// false (with a zero Classification) if no matching classification could be
// found for the file extension.
func ItemClassByExtension(filename string) (timeline.Classification, bool) {
	ext := strings.ToLower(path.Ext(filename))

	// every known media kind currently maps to the same classification
	for _, known := range []map[string]struct{}{imageExts, videoExts, audioExts} {
		if _, found := known[ext]; found {
			return timeline.ClassMedia, true
		}
	}

	return timeline.Classification{}, false
}
// Extensions used for file recognition. Keys are lowercase and include the
// leading dot, matching what ItemClassByExtension looks up.
var (
	imageExts = map[string]struct{}{
		".arw":  {}, // x-sony-arw (Sony Alpha)
		".bmp":  {},
		".cr2":  {}, // x-canon-cr2 or x-dcraw
		".crw":  {}, // x-canon-crw
		".dcr":  {}, // x-kodak-dcr
		".dng":  {}, // x-adobe-dng
		".erf":  {}, // x-epson-erf
		".gif":  {},
		".heic": {},
		".heif": {},
		".hif":  {}, // fujifilm's heif extension
		".jpeg": {},
		".jpe":  {},
		".jpg":  {},
		".k25":  {}, // x-kodak-x25
		".kdc":  {}, // x-kodak-kdc
		".nef":  {}, // x-nikon-nef
		".orf":  {}, // x-olympus-orf
		".pbm":  {},
		".pef":  {}, // x-pentax-pef
		".pgm":  {},
		".png":  {},
		".pnm":  {},
		".ppm":  {},
		".raf":  {}, // x-fuji-raf
		".raw":  {}, // x-panasonic-raw - most likely an image, but could be anything
		".sr2":  {}, // x-sony-sr2
		".srf":  {}, // x-sony-srf
		".svg":  {},
		".tif":  {}, // short spelling of .tiff
		".tiff": {},
		".webp": {},
	}

	videoExts = map[string]struct{}{
		".3g2":  {},
		".3gp":  {},
		".3gpp": {},
		".asf":  {},
		".avi":  {},
		".divx": {},
		".flv":  {},
		".m2t":  {},
		".m2ts": {},
		".m4v":  {},
		".mkv":  {},
		".mov":  {},
		".mp":   {}, // One generation of Google Photos / Pixel phones motion pictures... sigh
		".mp4":  {},
		".mpeg": {},
		".mpg":  {},
		".mts":  {},
		".vob":  {},
		".wmv":  {},
	}

	audioExts = map[string]struct{}{
		".aa":   {},
		".aac":  {},
		".aax":  {},
		".aiff": {},
		".alac": {},
		".au":   {},
		".flac": {},
		".m4a":  {},
		".m4b":  {},
		".m4p":  {},
		".mogg": {},
		".mp3":  {},
		".oga":  {},
		".ogg":  {},
		".opus": {},
		".wav":  {},
		".wma":  {},
	}
)
// Relations defined by the media data source.
var (
// RelMotionPhoto describes a motion photo (live photo/picture, moving picture, etc.) relation.
// "<to> is a motion photo (aka 'live photo' or video) of <from>"
RelMotionPhoto = timeline.Relation{Label: "motion", Directed: true, Subordinating: true}
// RelCoverArt describes a relation for cover art.
// "<to> is the album art for <from>"
// FileImport uses it to attach a picture embedded in a media file to that file's item.
RelCoverArt = timeline.Relation{Label: "cover_art", Directed: true}
)