/* Timelinize Copyright (c) 2013 Matthew Holt This program is free software: you can redistribute it and/or modify it under the terms of the GNU Affero General Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License for more details. You should have received a copy of the GNU Affero General Public License along with this program. If not, see . */ package timeline import ( "context" "encoding/json" "errors" "fmt" "io/fs" "path" "path/filepath" "sort" "time" "github.com/mholt/archives" ) type DataSourceRow struct { ID int64 `json:"id"` Name string `json:"name"` Title string `json:"title"` Description string `json:"description"` Media []byte `json:"-"` MediaType *string `json:"-"` Standard bool `json:"standard"` // Not part of the DB row, but useful to associate with // the struct so that future lookups can easily know // which repo has this data source in it; if multiple // repos do, any one should suffice as they should be // identical (in theory). RepoID string `json:"repo_id"` } // DataSource has information about a // data source that can be registered. // TODO: remove JSON tags? it shouldn't be serialized going forward, I don't think type DataSource struct { // A snake_cased name of the service // that uniquely identifies it from // all others. This is NOT the same // primary key used in the DB. Name string `json:"name"` // The human-readable or brand name of // the service. Title string `json:"title"` // The name of the image representing // this data source, relative to the // frontend/resources/images/data-sources // folder. 
// TODO: If we could get all icons to the same format (svg, ideally) we could remove this Icon string `json:"icon"` // Information that will help the user when choosing a data source. Description string `json:"description"` NewOptions func() any `json:"-"` NewFileImporter func() FileImporter `json:"-"` NewAPIImporter func() APIImporter `json:"-"` // // TODO: a way to declare what this data source needs, like SMS backup & restore needs the person_identity for the user this came from (their phone number) // // TODO: Maybe, if this is set, then we presume the data source requires a person identity to start with. // NewIdentity func(input Person, dataSourceOptions any) (Person, error) `json:"-"` } // UnmarshalOptions unmarshals the data source options into the data source's options type. func (ds DataSource) UnmarshalOptions(jsonOpt json.RawMessage) (any, error) { if ds.NewOptions == nil { return nil, nil } dsOpt := ds.NewOptions() if len(jsonOpt) == 0 { return dsOpt, nil } err := json.Unmarshal(jsonOpt, &dsOpt) if err != nil { return nil, fmt.Errorf("decoding data source options: %w", err) } return dsOpt, nil } // // authFunc gets the authentication function for this // // service. If s.Authenticate is set, it returns that; // // if s.OAuth2 is set, it uses a standard OAuth2 func. // func (ds DataSource) authFunc() AuthenticateFn { // if ds.Authenticate != nil { // return ds.Authenticate // } else if ds.OAuth2.ProviderID != "" { // return func(ctx context.Context, userID string, dataSourceOptions any) ([]byte, error) { // return authorizeWithOAuth2(ctx, ds.OAuth2) // } // } // return nil // } // RegisterDataSource registers ds as a data source. 
func RegisterDataSource(ds DataSource) error { if ds.Name == "" { return errors.New("missing ID") } if ds.Title == "" { return errors.New("missing title") } // register the data source if _, ok := dataSources[ds.Name]; ok { return fmt.Errorf("data source already registered: %s", ds.Name) } dataSources[ds.Name] = ds return nil } // AllDataSources returns all registered data sources sorted by ID strings. func AllDataSources() []DataSource { sources := make([]DataSource, 0, len(dataSources)) for _, ds := range dataSources { sources = append(sources, ds) } sort.Slice(sources, func(i, j int) bool { return sources[i].Name < sources[j].Name }) return sources } func (tl *Timeline) DataSources(ctx context.Context, targetDSName string) ([]DataSourceRow, error) { repoID := tl.id.String() var args []any q := "SELECT id, name, title, description, media, media_type, standard FROM data_sources" if targetDSName != "" { q += " WHERE name=? LIMIT 1" args = []any{targetDSName} } rows, err := tl.db.ReadPool.QueryContext(ctx, q, args...) if err != nil { return nil, err } defer rows.Close() var all []DataSourceRow for rows.Next() { ds := DataSourceRow{RepoID: repoID} err = rows.Scan(&ds.ID, &ds.Name, &ds.Title, &ds.Description, &ds.Media, &ds.MediaType, &ds.Standard) if err != nil { return nil, err } all = append(all, ds) } if err = rows.Err(); err != nil { return nil, err } return all, nil } func (tl *Timeline) DataSourceImage(ctx context.Context, dsName string) ([]byte, string, error) { var img []byte var mimeType string err := tl.db.ReadPool.QueryRowContext(ctx, "SELECT media, media_type FROM data_sources WHERE name=? LIMIT 1", dsName).Scan(&img, &mimeType) return img, mimeType, err } // TODO: WIP... type RecognizeParams struct { // TODO: Possible FastMode, which might result in less confident matches, but can go faster? } // DataSourceRecognition stores the result of whether a data source recognizes an input. 
type DataSourceRecognition struct { DataSource DataSource `json:"data_source"` Recognition } // Recognition is a type that indicates how well, if at all, an importer // recognized or supports an input, as well as any relevant information // regarding the data set that may be useful later or for storage. type Recognition struct { // TODO: rename to Score? Actually, just make a bool? Why would a source ever be unsure? (they shouldn't be traversing direcctories anyway) Confidence float64 `json:"confidence"` // If > 0, and the data source matches this much of all the entries // in a directory (sans hidden files), assign the entire directory // to the data source; 0 <= DirThreshold <= 1 DirThreshold float64 `json:"dir_threshold,omitempty"` // Optional; TODO: used? SnapshotDate *time.Time `json:"snapshot_date,omitempty"` // TODO: Add some sort of warning/notice/message if the recognizer has advice/info for the user } // DirEntry is a fs.DirEntry that represents a directory entry (file // or folder), and also carries the associated file system it came // from, the filename of the file (within the FS), and the root path // of the OS (as an OS-compatible filepath). // // TODO: This type kind of accidentally turned out to be an fs.FS, fs.StatFS, and fs.ReadDirFS, with FS being "SubFS'ed" by Filename. type DirEntry struct { fs.DirEntry // FS is a file system that can be used to access the // file represented by this DirEntry. During Recognize, // it is rooted at the file itself. During import, it // may be rooted at the file itself (if it's a directory) // or at the parent folder (if it's a file). Use // the Filename field to get the path of the DirEntry // within the FS. FS fs.FS // FSRoot is the root of the FS (OS-compatible filepath). FSRoot string // Filename is the name of the file in the FS represented // by this DirEntry. It is the fs.FS-compatible path that // can and should be used to access the file within the // associated FS. 
Thus, it is not an OS path, it has no // root component (no leading "/" or drive letter, for // example), may be either a directory or a file, and may // be "." or a true filename. To always get the true // filename, use the Name() method. (TODO: Verify this) Filename string } // Open opens the named file from the DirEntry's FS rooted at the DirEntry's Filename. func (d DirEntry) Open(filename string) (fs.File, error) { return d.FS.Open(path.Join(d.Filename, filename)) } // Stat stats the named file from the DirEntry's FS rooted at the DirEntry's Filename. func (d DirEntry) Stat(filename string) (fs.FileInfo, error) { return fs.Stat(d.FS, path.Join(d.Filename, filename)) } // ReadDir reads the direcftory at the named path from the DirEntry's FS rooted at the DirEntry's Filename. func (d DirEntry) ReadDir(dirName string) ([]fs.DirEntry, error) { return fs.ReadDir(d.FS, path.Join(d.Filename, dirName)) } // FileExists returns true if the named file exists in the DirEntry's FS starting from the DirEntry's filename. func (d DirEntry) FileExists(filename string) bool { return FileExistsFS(d.FS, path.Join(d.Filename, filename)) } // FullPath returns the full path of the directory entry, including the FS // root (if a known FS type from the archives package), and the filename // in the archive, as an OS filepath. func (d DirEntry) FullPath() string { var root string switch fsys := d.FS.(type) { case archives.FileFS: root = fsys.Path case archives.DirFS: root = string(fsys) case *archives.ArchiveFS: root = fsys.Path case *archives.DeepFS: root = fsys.Root default: root = d.FSRoot } return filepath.Join(root, filepath.FromSlash(d.Filename)) } // DataSourcesRecognize returns the list of data sources that reportedly // recognize the file described by the DirEntry. 
func DataSourcesRecognize(ctx context.Context, entry DirEntry, opts RecognizeParams) ([]DataSourceRecognition, error) { var results []DataSourceRecognition tryDataSource := func(ctx context.Context, ds DataSource) error { if err := ctx.Err(); err != nil { return err } if ds.NewFileImporter == nil { return nil } result, err := ds.NewFileImporter().Recognize(ctx, entry, opts) if err != nil { return fmt.Errorf("%s: %w", ds.Name, err) } if result.Confidence > 0 { results = append(results, DataSourceRecognition{ds, result}) } return nil } const maxDur = 120 * time.Second var cancel context.CancelFunc ctx, cancel = context.WithTimeout(ctx, maxDur) defer cancel() for _, ds := range dataSources { if ds.Name == "generic" { continue // this is only a special fallback data source, to be applied in special cases elsewhere } if err := ctx.Err(); err != nil { return nil, fmt.Errorf("trying data sources: %w", err) } if err := tryDataSource(ctx, ds); err != nil { return nil, err } } sort.Slice(results, func(i, j int) bool { return results[i].Confidence < results[j].Confidence }) return results, nil } // OAuth2 defines which OAuth2 provider a service // uses and which scopes it requires. type OAuth2 struct { // The ID of the service must be recognized // by the OAuth2 app configuration. ProviderID string `json:"provider_id,omitempty"` // The list of scopes to ask for during auth. Scopes []string `json:"scopes,omitempty"` } // TODO: unused? // AuthenticateFn is a function that authenticates userID with a service. // It returns the authorization or credentials needed to operate. The return // value should be byte-encoded so it can be stored in the DB to be reused. // To store arbitrary types, encode the value as a gob, for example. type AuthenticateFn func(ctx context.Context, userID string, dataSourceOptions any) ([]byte, error) // // NewClientFn is a function that returns a client which, given // // the account passed in, can interact with a service provider. 
// // It must honor context cancellation if there are any async calls. // type NewClientFn func(ctx context.Context, acc Account, dataSourceOptions any) (Client, error) // // Client is a type that can interact with a data source. // type Client interface { // // ListItems lists the items on the account. Items should be // // sent on itemChan as they are discovered, but related items // // should be combined onto a single ItemGraph so that their // // relationships can be stored. If the relationships are not // // discovered until later, that's OK: item processing is // // idempotent, so repeating an item from earlier should have // // no adverse effects. // // // // Implementations must honor the context's cancellation. If // // ctx.Done() is closed, the function should return. Typically, // // this is done by having an outer loop select over ctx.Done() // // and default, where the next page or set of items is handled // // in the default case. // // // // ListItems MUST close itemChan when returning. A // // `defer close(itemChan)` will usually suffice. Closing // // this channel signals to the processing goroutine that // // no more items are coming. // // // // Further options for listing items may be passed in opt. // // // // If opt.Filename is specified, the implementation is expected // // to open and list items from that file. If this is not // // supported, an error should be returned. Conversely, if a // // filename is not specified but required, an error should be // // returned. // // // // opt.Timeframe consists of two optional timestamp and/or item // // ID values. If set, item listings should be bounded in the // // respective direction by that timestamp / item ID. (Items // // are assumed to be part of a chronology; both timestamp and // // item ID *may be* provided, when possible, to accommodate // // data sources which do not constrain by timestamp but which // // do by item ID instead.) 
The respective time and item ID // // fields, if set, will not be in conflict, so either may be // // used if both are present. While it should be documented if // // timeframes are not supported, an error need not be returned // // if they cannot be honored. // // // // opt.Checkpoint consists of the last checkpoint for this // // account if the last call to ListItems did not finish and // // if a checkpoint was saved. If not nil, the checkpoint // // should be used to resume the listing instead of starting // // over from the beginning. Checkpoint values usually consist // // of page tokens or whatever state is required to resume. Call // // timeline.Checkpoint to set a checkpoint. Checkpoints are not // // required, but if the implementation sets checkpoints, it // // should be able to resume from one, too. // ListItems(ctx context.Context, itemChan chan<- *ItemGraph, opt ListingOptions) error // } // Timeframe represents a start and end time and/or // a start and end item, where either value could be // nil which means unbounded in that direction. // When items are used as the timeframe boundaries, // the ItemID fields will be populated. It is not // guaranteed that any particular field will be set // or unset just because other fields are set or unset. // However, if both Since or both Until fields are // set, that means the timestamp and items are // correlated; i.e. the Since timestamp is (approx.) // that of the item ID. Or, put another way: there // will never be conflicts among the fields which // are non-nil. // // A Contains method is provided to determine if a // time is within the timeframe, but because item IDs // are opaque strings, the respective data sources // are the only ones that can interpret their IDs and // determine if item IDs are within the timeframe. // (Most data sources use times, not item IDs, to // constrain time anyway.) 
//
// Since ~= "After", Until ~= "Before"
type Timeframe struct {
	Since *time.Time `json:"since,omitempty"`
	Until *time.Time `json:"until,omitempty"`

	// TODO: where are we actually enforcing these? are these still useful? (I think we used it for Twitter API results or maybe just any paginated API results IIRC?)
	SinceItemID *string `json:"since_item_id,omitempty"`
	UntilItemID *string `json:"until_item_id,omitempty"`
}

// IsEmpty returns true if the timeframe is not set in any way,
// i.e. all four of its (pointer) fields are nil.
func (tf Timeframe) IsEmpty() bool {
	return tf == (Timeframe{})
}

// String renders the timeframe in a struct-like textual form.
// Unset item IDs are rendered as empty strings.
func (tf Timeframe) String() string {
	// orEmpty dereferences an optional string, yielding "" for nil.
	orEmpty := func(s *string) string {
		if s == nil {
			return ""
		}
		return *s
	}
	return fmt.Sprintf("{Since:%s Until:%s SinceItemID:%s UntilItemID:%s}",
		tf.Since, tf.Until, orEmpty(tf.SinceItemID), orEmpty(tf.UntilItemID))
}

// Contains returns true if the given time ts is inside the timeframe tf.
// Only tf.Since and tf.Until are used; tf.SinceItemID and tf.UntilItemID
// are ignored.
//
// A zero-value timestamp is considered to be in all timeframes. TODO: It's so that we don't omit items from the timeline... Is that surprising though?
//
// If both Since and Until are set, then the time must be between those
// two times. If only Since is set, the time must be after Since. If only
// Until is set, the time must be before Until. If neither are set, true
// is always returned. The bounds are exclusive: a time equal to Since or
// Until is not contained.
func (tf Timeframe) Contains(ts time.Time) bool {
	switch {
	case ts.IsZero():
		return true
	case tf.Since != nil && !ts.After(*tf.Since):
		return false
	case tf.Until != nil && !ts.Before(*tf.Until):
		return false
	}
	return true
}

// ContainsItem returns true if the timeframe contains the item,
// according to its timestamp and timespan (start and end) values,
// with respect to strict mode.
If strict mode is enabled, both the // item's timestamp and timespan must be entirely inside the timeframe; // otherwise, a timeframe is considered to contain an item if part of // its timespan is within the timeframe. func (tf Timeframe) ContainsItem(it *Item, strict bool) bool { if it == nil { return false } if it.Timestamp.IsZero() { return true } if strict && tf.Since != nil && tf.Until != nil { return it.Timestamp.After(*tf.Since) && it.Timespan.Before(*tf.Until) } afterSince := tf.Since == nil || it.Timestamp.After(*tf.Since) beforeUntil := tf.Until == nil || it.Timestamp.Before(*tf.Until) return afterSince && beforeUntil } // FileImporter is a type that can import data from files or folders. // // Implementations MUST treat the input parameters as read-only; i.e. // the values should not be changed. Doing so will cause bugs. type FileImporter interface { // Recognize determines whether the data source supports the input described by the DirEntry. // It should be implemented in a way that is efficient to be called many times on multiple // files throughout a walk (pool buffers for reuse, skip work that doesn't need to be done, // etc; for example, ignore hidden files). // // Recognize should perform any combination of 3 primary recognition algorithms: // // 1. Filename match (fast and easy, but less reliable) // 2. Content match (ideally just read a file's header or other small amount; more reliable) // 3. Directory structure match (for directories, check for presence of certain files and // possibly verify by reading part of them; i.e. also perform 1 and/or 2 for expected files). // // Recognize MUST NOT walk/traverse a directory if possible; spot-checking specific known or // expected files within it is okay. The walking is already performed by the import planner. 
// // To keep import planning quick, opening the DirEntry or any other files in the FS should // only happen if necessary, due to the possibility that it is within a compressed tar file, // which is not efficient as it requires decompressing potentially most of the archive to // find the file to open. File extensions may be a good way to avoid unnecessary Open() calls. // //nolint:inamedparam Recognize(context.Context, DirEntry, RecognizeParams) (Recognition, error) // TODO: write godoc. It is expected that context cancellation will be honored (i.e. return // if the context has an error or its done channel is closed), and that the function will not // return until all sends to the pipeline channel have completed. No more sends may happen // after returning, since the receiving goroutines will be terminated (sends would be // deadlocked without a receiver) or the pipeline will be closed (sending would cause a // panic). SUBTLETY: If spawning goroutines which send to the pipeline, they must also // be waited upon to terminate before returning. // //nolint:inamedparam FileImport(context.Context, DirEntry, ImportParams) error } // TODO: unused? // APIImporter is a type that can import data via a remote service API. type APIImporter interface { Authenticate(ctx context.Context, acc Account, dsOpt any) error //nolint:inamedparam APIImport(context.Context, Account, ImportParams) error } // TODO: experimental type SizeEstimator interface { //nolint:inamedparam EstimateSize(context.Context, DirEntry, ImportParams) (int, error) } var dataSources = make(map[string]DataSource) // keyed by name (not DB row ID)