/* Timelinize Copyright (c) 2013 Matthew Holt This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package generic import ( "bufio" "errors" "fmt" "io" "net/textproto" "regexp" "sort" "strings" "time" "github.com/cozy/goexif2/exif" ) func (fi fileItem) timestamp() time.Time { // first try EXIF; we can ignore errors, since they should // just mean there's no EXIF or valid timestamp in the file ts, err := fi.timestampFromExif() if err == nil { return ts } // see if file has MIME header with Date field (emails generally do) ts, err = fi.timestampFromDateHeader() if err == nil { return ts } // see if the file path has any timestamp ts, err = fi.timestampFromFilePath() if err == nil { return ts } // as a last resort, fall back to file modification date // TODO: Nooooo don't do this info, err := fi.dirEntry.Info() if err == nil { return info.ModTime() } return time.Time{} } func (fi fileItem) timestampFromExif() (time.Time, error) { // TODO: apparently, there are ways to get EXIF from video files: // https://superuser.com/questions/1036704/is-there-something-like-exif-for-video // (exiftool can help) file, err := fi.fsys.Open(fi.path) if err != nil { return time.Time{}, fmt.Errorf("unable to open file to attempt reading EXIF: %w", err) } defer file.Close() ex, err := exif.Decode(file) if err != nil { return time.Time{}, err } return ex.DateTime() } func (fi fileItem) timestampFromDateHeader() (time.Time, error) { file, err := 
fi.fsys.Open(fi.path) if err != nil { return time.Time{}, err } defer file.Close() // date header should probably be in the first kilobyte of file const kb = 1024 bufr := bufio.NewReader(io.LimitReader(file, kb)) tp := textproto.NewReader(bufr) header, err := tp.ReadMIMEHeader() if err != nil { return time.Time{}, err } date := header.Get("Date") if date == "" { return time.Time{}, errors.New("headers found, but no Date field") } return time.Parse(time.RFC1123Z, date) } func (fi fileItem) timestampFromFilePath() (time.Time, error) { return TimestampFromFilePath(fi.path) } // TimestampFromFilePath finds all timestamps that can be found in any // of the programmed formats in the file path. Overlapping timestamps // are deduplicated, preferring the most specific values. Then one is // chosen to be returned; partial timestamps (one date, another time) // are combined if possible. func TimestampFromFilePath(fpath string) (time.Time, error) { const ( maxYearsAgo = 300 // how many years in the past to allow a date maxYearsAhead = 1 // how many years in the future to allow a date ) stdCanonicalPath := strings.ReplaceAll(fpath, "\\", "/") now := time.Now() type foundTimestamp struct { ts time.Time start, end int } var tsFound []foundTimestamp // first try to find as many timestamps as we can for _, tsPattern := range timestampPatterns { for _, matchPos := range tsPattern.re.FindAllStringIndex(stdCanonicalPath, -1) { start, end := matchPos[0], matchPos[1] match := stdCanonicalPath[start:end] // TODO: Experimental; if matching a 4-digit year, because 4-digit numbers // TODO: might be common, it should be the whole path component...? // if len(match) <= 4 { // } // time formats are case-sensitive with regards to some // components like "AM" versus "am"... avoid this nuisance // by uppercasing everything match = strings.ToUpper(match) // a time format like "2006_1_2" can't parse "2021_10_3" (Oct. 
3) // but works if not using underscores (maybe related to // https://github.com/golang/go/issues/11334) match = strings.ReplaceAll(match, "_", "-") format := strings.ReplaceAll(tsPattern.dateFormat, "_", "-") ts, err := time.ParseInLocation(format, match, time.Local) if err != nil { continue } // reject timestamp if it's a ridiculous amount in the past // or future; those are very most likely false positives if ts.Year() > 0 && (ts.Before(now.AddDate(-maxYearsAgo, 0, 0)) || ts.After(now.AddDate(maxYearsAhead, 0, 0))) { continue } tsFound = append(tsFound, foundTimestamp{ ts: ts, start: start, end: end, }) } } if len(tsFound) == 0 { return time.Time{}, fmt.Errorf("no timestamp found in file path: %s", fpath) } var candidateTimestamp []foundTimestamp // some timestamp formats overlap ("2006/1" is also in "2006/1/2", // but the latter is more specific), so find those which literally // overlap in the input string where they share a start or end // position OR one is wholly contained within another, so we can be // assured they are part of the same timestamp; then between those, // keep the more specific one candidates: for i := range tsFound { // tsI is our candidate timestamp, we'll compare it to every other // timestamp and see if we need to weed it out tsI := tsFound[i] for j := range tsFound { if j == i { continue } tsJ := tsFound[j] // tsJ is the competitor // we're looking for joint overlap where start or end are equal // or if one is entirely contained within the other; // disjoint overlap exists if start OR end of one is between start AND end of other; // hard to say which one is right, but likely at least one is wrong... 
// for example: "4 January 2022/3:59PM" has "2022/3" crossing into both, but is wrong if tsI.start == tsJ.start || tsI.end == tsJ.end || // same start or same end (tsI.start > tsJ.start && tsI.end < tsJ.end) || // tsJ contains tsI (tsI.start < tsJ.start && tsI.end > tsJ.end) || // tsI contains tsJ (tsI.start > tsJ.start && tsI.start < tsJ.end) || // tsI starts inside tsJ (disjoint overlap) (tsI.end > tsJ.start && tsI.end < tsJ.end) { // tsI ends inside tsJ (disjoint overlap) // if it was joint overlap, we can presume they are the same timestamp, // but likely have different components specified in them; keep the more // specific one, which SHOULD be the "later" or "higher" time value, // because more non-zero components add to the timestamp -- BUT I found // a counterexample: "1953/10-09-1953" has both Oct 9, 1953, and Oct 9, // 2019 ("10-09-19"), where the higher date is clearly wrong here, so I've // settled on always going with the longest substring match // if tsJ is more specific let tsI drop if (tsJ.end - tsJ.start) > (tsI.end - tsI.start) { continue candidates } } } // if we got here, the inner loop didn't skip this candidate, // so we can presumably use it for next phase candidateTimestamp = append(candidateTimestamp, tsI) } // if we ended up skipping all timestamps because they were // all positioned confusingly, keep them all and simply try // sorting (returning none when we found some seems unwise) if len(candidateTimestamp) == 0 { candidateTimestamp = tsFound } // we may have found multiple timestamps, for example one that // contains a date and another which contains time; try to find // them and combine them var tsDate, tsTime time.Time for _, ts := range candidateTimestamp { if (!zeroDate(ts.ts) && zeroTime(ts.ts)) && ts.ts.After(tsDate) { tsDate = ts.ts } if (zeroDate(ts.ts) && !zeroTime(ts.ts)) && timeOfDayIsLater(ts.ts, tsTime) { tsTime = ts.ts } } // TODO: we should still try to combine separate year, month, and day timestamps... somehow... 
if !tsDate.IsZero() && !tsTime.IsZero() { year, month, day := tsDate.Date() return tsTime.AddDate(year, int(month)-1, day-1).Local(), nil } // TODO: a date like "1959" should maybe set a timespan, from the first to the last second of that year, rather than just second 0 of that year? // if more than one timestamp remains, I dunno how to prefer one // over another since we've already taken care of overlapping // timestamps... but here's what I've found works well on a small // sample so far (some of which are in the test cases): // - Prefer longest substring match (most specific) // - Prefer last one (filename is likely more correct than prior path components) sort.Slice(candidateTimestamp, func(i, j int) bool { iLen := candidateTimestamp[i].end - candidateTimestamp[i].start jLen := candidateTimestamp[j].end - candidateTimestamp[j].start if iLen != jLen { return iLen > jLen } return candidateTimestamp[i].start > candidateTimestamp[j].start }) return candidateTimestamp[0].ts, nil } // zeroDate returns true if the date component of t is zero-valued // (after parsing, which sets the year to 0 if no year was parsed; default // time.Time structs set the year as 1 which is a little maddening). func zeroDate(t time.Time) bool { return t.Year() == 0 && t.Month() == time.January && t.Day() == 1 } // zeroTime returns true if the time component of t is zero-valued. func zeroTime(t time.Time) bool { return t.Hour() == 0 && t.Minute() == 0 && t.Second() == 0 } // timeOfDayIsLater returns true if t1 is at a later time of day than t2. func timeOfDayIsLater(t1, t2 time.Time) bool { t1BeginOfDay := time.Date(t1.Year(), t1.Month(), t1.Day(), 0, 0, 0, 0, time.UTC) t2BeginOfDay := time.Date(t2.Year(), t2.Month(), t2.Day(), 0, 0, 0, 0, time.UTC) return t1.Sub(t1BeginOfDay) > t2.Sub(t2BeginOfDay) } // timestampPatterns maps Go time format strings to the regexp that matches them. 
type timestampPattern struct {
	// dateFormat is the Go time layout used to parse a matched substring.
	dateFormat string

	// re locates substrings that might be in the dateFormat layout; it is
	// only a coarse filter, as a match may still fail to parse.
	re *regexp.Regexp
}

// timestampPatterns is the list of timestamp layouts that are searched for,
// each with a regexp that finds candidate substrings for that layout.
// Several patterns intentionally match overlapping text (for example
// "2006/1" also matches inside "2006/1/2").
var timestampPatterns = []timestampPattern{
	{dateFormat: "2006/1/2", re: regexp.MustCompile(`\d{4}/\d\d?/\d\d?`)},
	// the separator must be an escaped backslash followed by the \d class;
	// writing `\\d` would instead mean "a backslash and the letter d"
	{dateFormat: "2006\\1\\2", re: regexp.MustCompile(`\d{4}\\\d\d?\\\d\d?`)},
	{dateFormat: "1-2-06", re: regexp.MustCompile(`\d\d?-\d\d?-\d\d`)},     // NOTE: this is ambiguous depending on locale! Could be 2-1-06 as well
	{dateFormat: "1-2-2006", re: regexp.MustCompile(`\d\d?-\d\d?-\d{4}`)}, // NOTE: this is ambiguous depending on locale! Could be 2-1-2006 as well
	{dateFormat: "2006-1-2", re: regexp.MustCompile(`\d{4}-\d\d?-\d\d?`)},
	{dateFormat: "2006_1_2", re: regexp.MustCompile(`\d{4}_\d\d?_\d\d?`)},
	{dateFormat: "2 January 2006", re: regexp.MustCompile(`\d\d? \w+ \d{4}`)},
	{dateFormat: "2 Jan 2006", re: regexp.MustCompile(`\d\d? \w{3} \d{4}`)},
	// NOTE(review): this regexp requires a comma after the day but the
	// layout has none, so a match can never parse; one of the two is
	// presumably a typo -- confirm which before changing
	{dateFormat: "2 January 06", re: regexp.MustCompile(`\d\d?, \w+ \d{2}`)},
	{dateFormat: "January 2 2006", re: regexp.MustCompile(`\w+ \d\d? \d{4}`)},
	{dateFormat: "January 2, 2006", re: regexp.MustCompile(`\w+ \d\d?, \d{4}`)},
	{dateFormat: "Jan 2, 2006", re: regexp.MustCompile(`\w{3} \d\d?, \d{4}`)},
	{dateFormat: "Jan 2, 06", re: regexp.MustCompile(`\w{3} \d\d?, \d{2}`)},
	{dateFormat: "2006/January", re: regexp.MustCompile(`\d{4}/\w+`)},
	{dateFormat: "2006/2 January", re: regexp.MustCompile(`\d{4}/\d\d? \w+`)},
	{dateFormat: "2006/2 Jan", re: regexp.MustCompile(`\d{4}/\d\d? \w{3}`)},
	{dateFormat: "2 January", re: regexp.MustCompile(`\d\d? \w+`)},
	{dateFormat: "January 2", re: regexp.MustCompile(`\w+ \d\d?`)},
	{dateFormat: "January 2006", re: regexp.MustCompile(`\w+ \d{4}`)},
	{dateFormat: "January-2006", re: regexp.MustCompile(`\w+-\d{4}`)},
	{dateFormat: "January_2006", re: regexp.MustCompile(`\w+_\d{4}`)},
	{dateFormat: "Jan 2006", re: regexp.MustCompile(`\w{3} \d{4}`)},
	{dateFormat: "Jan_2006", re: regexp.MustCompile(`\w{3}_\d{4}`)},
	{dateFormat: "Jan-2006", re: regexp.MustCompile(`\w{3}-\d{4}`)},
	{dateFormat: "1-2006", re: regexp.MustCompile(`\d\d?-\d{4}`)},
	{dateFormat: "2006_January", re: regexp.MustCompile(`\d{4}_\w+`)},
	{dateFormat: "2006_Jan", re: regexp.MustCompile(`\d{4}_\w{3}`)},
	{dateFormat: "2006 January", re: regexp.MustCompile(`\d{4} \w+`)},
	{dateFormat: "2006 Jan", re: regexp.MustCompile(`\d{4} \w{3}`)},
	{dateFormat: "January", re: regexp.MustCompile(`\w+`)},
	{dateFormat: "Jan", re: regexp.MustCompile(`\w{3}`)},
	{dateFormat: "2006/1", re: regexp.MustCompile(`\d{4}/\d\d?`)},
	{dateFormat: "2006\\1", re: regexp.MustCompile(`\d{4}\\\d\d?`)},
	{dateFormat: "2006-1", re: regexp.MustCompile(`\d{4}-\d\d?`)},
	{dateFormat: "2006", re: regexp.MustCompile(`\d{4}`)},

	// TODO: these next few formats have flaky tests... (UPDATE MAY 10, 2022, I think I got the flakiness gone by fixing the nested for loops above) sometimes it fails because it doesn't choose the one with PM...
	{dateFormat: "15:04", re: regexp.MustCompile(`\d\d?:\d\d`)},
	{dateFormat: "3:04PM", re: regexp.MustCompile(`(?i)\d\d?:\d\d[AP]M`)},
	{dateFormat: "3:04 PM", re: regexp.MustCompile(`(?i)\d\d?:\d\d [AP]M`)},

	// TODO: would be nice to isolate these numbers somehow, i.e. surrounded by non-numbers or they are the whole component of a path...
	{dateFormat: "20060102150405", re: regexp.MustCompile(`\d{14}`)},
	{dateFormat: "200601021504", re: regexp.MustCompile(`\d{12}`)},
	{dateFormat: "20060102", re: regexp.MustCompile(`\d{8}`)},
	{dateFormat: "20060102_1504", re: regexp.MustCompile(`\d{8}_\d{4}`)},
	{dateFormat: "20060102-1504", re: regexp.MustCompile(`\d{8}-\d{4}`)},
	{dateFormat: "2006-1-2_15-4-5", re: regexp.MustCompile(`\d{4}-\d\d?-\d\d?_\d\d?-\d\d?-\d\d?`)},
}