Apparently it's not only tweets.js that is tweet.js, it's also tweets_media that is tweet_media, sigh.
365 lines
12 KiB
Go
365 lines
12 KiB
Go
/*
|
|
Timelinize
|
|
Copyright (c) 2013 Matthew Holt
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU Affero General Public License as published
|
|
by the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU Affero General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Affero General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
// Package twitter implements a data source for importing and downloading data from X (formerly Twitter).
|
|
package twitter
|
|
|
|
import (
|
|
"context"
|
|
"errors"
|
|
"fmt"
|
|
"io/fs"
|
|
"path"
|
|
"time"
|
|
|
|
"github.com/timelinize/timelinize/timeline"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// var (
|
|
// oauth2 = timeline.OAuth2{
|
|
// ProviderID: "twitter",
|
|
// Scopes: []string{"tweet.read", "users.read", "offline.access"},
|
|
// }
|
|
|
|
// rateLimit = timeline.RateLimit{
|
|
// // TODO: from v1...
|
|
// // // from https://developer.twitter.com/en/docs/basics/rate-limits
|
|
// // // with some leeway since it's actually a pretty generous limit
|
|
// // RequestsPerHour: 5900,
|
|
// // as of December 2020, project caps allow pulling 500,000 tweets / mo.: https://developer.twitter.com/en/docs/projects/overview
|
|
// RequestsPerHour: 1800, // https://developer.twitter.com/en/docs/twitter-api/tweets/search/api-reference/get-tweets-search-recent
|
|
// }
|
|
// )
|
|
|
|
func init() {
|
|
err := timeline.RegisterDataSource(timeline.DataSource{
|
|
Name: "twitter",
|
|
Title: "Twitter",
|
|
Icon: "twitter.svg",
|
|
NewOptions: func() any { return new(Options) },
|
|
NewFileImporter: func() timeline.FileImporter { return new(Client) },
|
|
// NewAPIImporter: func() timeline.APIImporter { return new(Client) },
|
|
})
|
|
if err != nil {
|
|
timeline.Log.Fatal("registering data source", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// Client is a type that can process/import data from Twitter via API or file archive.
|
|
type Client struct {
|
|
// fsys fs.FS // if reading from an archive
|
|
// httpClient *http.Client
|
|
//nolint:dupword
|
|
// checkpoint checkpoint
|
|
// acc timeline.Account
|
|
owner timeline.Entity
|
|
// otherAccounts map[string]twitterAccount // keyed by user/account ID
|
|
}
|
|
|
|
func (c *Client) prepareTweet(_ context.Context, t *tweet, source string, opt Options) (skip bool, err error) {
|
|
// mark whether this tweet came from the API or an export file
|
|
t.source = source
|
|
|
|
// set the owner account information; this has to be done differently
|
|
// depending on the source (it's not embedded in the archive's tweets...)
|
|
switch t.source {
|
|
case srcArchive:
|
|
t.owner = c.owner
|
|
// case "api":
|
|
// if t.User != nil {
|
|
// if t.User.UserIDStr == c.owner.AttributeValue(identityAttribute) {
|
|
// // tweet author is the owner of the account - awesome
|
|
// t.owner = c.owner
|
|
// } else {
|
|
// // look up author's account info
|
|
// acc, ok := c.otherAccounts[t.User.UserIDStr]
|
|
// if !ok {
|
|
// acc, err = c.getAccountFromAPI("", t.User.UserIDStr)
|
|
// if err != nil {
|
|
// return false, fmt.Errorf("looking up tweet author's account information: %v", err)
|
|
// }
|
|
// // cache this for later
|
|
// if len(c.otherAccounts) > 2000 {
|
|
// for id := range c.otherAccounts {
|
|
// delete(c.otherAccounts, id)
|
|
// break
|
|
// }
|
|
// }
|
|
// c.otherAccounts[acc.id()] = acc
|
|
// }
|
|
// t.owner = acc.entity(ctx, c.fsys)
|
|
// }
|
|
// }
|
|
default:
|
|
return false, fmt.Errorf("unrecognized source: %s", t.source)
|
|
}
|
|
|
|
// skip empty tweets
|
|
if t.isEmpty() {
|
|
return true, nil
|
|
}
|
|
|
|
// skip tweets we aren't interested in
|
|
if !opt.Retweets && t.isRetweet() {
|
|
return true, nil
|
|
}
|
|
if !opt.Replies && t.InReplyToUserIDStr != "" && t.InReplyToUserIDStr != t.owner.AttributeValue(identityAttribute) {
|
|
// TODO: Replies should have more context, like what are we replying to, etc... the whole thread, even?
|
|
// this option is about replies to tweets other than our own (which are like a continuation of one thought)
|
|
return true, nil
|
|
}
|
|
|
|
// parse Twitter's time string into an actual time value
|
|
t.createdAtParsed, err = time.Parse("Mon Jan 2 15:04:05 -0700 2006", t.CreatedAt)
|
|
if err != nil {
|
|
return false, fmt.Errorf("parsing created_at time: %w", err)
|
|
}
|
|
|
|
return false, nil
|
|
}
|
|
|
|
func (c *Client) makeItemGraphFromTweet(_ context.Context, params timeline.ImportParams, t tweet, fsys fs.FS, _ Options) (*timeline.Graph, error) {
|
|
// oneMediaItem := t.hasExactlyOneMediaItem()
|
|
|
|
item := &timeline.Item{
|
|
ID: t.TweetIDStr,
|
|
Classification: timeline.ClassSocial,
|
|
Timestamp: t.createdAtParsed,
|
|
Location: t.location(),
|
|
Owner: t.owner,
|
|
Content: t.content(),
|
|
Metadata: timeline.Metadata{
|
|
"Retweets": int(t.RetweetCount),
|
|
"Likes": int(t.FavoriteCount),
|
|
},
|
|
}
|
|
ig := &timeline.Graph{Item: item}
|
|
|
|
hasText := t.text() != ""
|
|
|
|
// process the media items attached to the tweet
|
|
if t.ExtendedEntities != nil {
|
|
// TODO: use collection or attachments? (or both?) attachment relation is simpler, but collections preserves order
|
|
// var collItems []timeline.CollectionItem
|
|
|
|
for i, m := range t.ExtendedEntities.Media {
|
|
if i == 0 && !hasText {
|
|
// if tweet does not have any text, then the first media item was
|
|
// used as the main item's content
|
|
continue
|
|
}
|
|
|
|
m.parent = &t
|
|
|
|
dataFileName := m.fileName()
|
|
if dataFileName == "" {
|
|
// TODO: proper logging
|
|
// log.Printf("[ERROR][%s/%s] Tweet media has no data file name: %+v",
|
|
// DataSourceID, c.acc.User.UserID, m)
|
|
continue
|
|
}
|
|
|
|
switch t.source {
|
|
case srcArchive:
|
|
targetFileInArchive := path.Join("data", "tweets_media", dataFileName)
|
|
|
|
file, err := fsys.Open(targetFileInArchive)
|
|
if errors.Is(err, fs.ErrNotExist) {
|
|
targetFileInArchive = path.Join("data", "tweet_media", dataFileName) // archives from/until (?) 2022 use the singular word "tweet", similar to tweet.js vs. tweets.js
|
|
file, err = fsys.Open(targetFileInArchive)
|
|
}
|
|
if err == nil {
|
|
m.readCloser = file
|
|
} else {
|
|
params.Log.Error("could not open data file in archive", zap.String("filename", targetFileInArchive), zap.Error(err))
|
|
}
|
|
// case "api":
|
|
// mediaURL := m.getURL()
|
|
// if m.Type == "photo" {
|
|
// mediaURL += ":orig" // get original file, with metadata
|
|
// }
|
|
// resp, err := http.Get(mediaURL)
|
|
// if err != nil {
|
|
// return nil, fmt.Errorf("getting media resource %s: %v", m.MediaURLHTTPS, err)
|
|
// }
|
|
// if resp.StatusCode != http.StatusOK {
|
|
// return nil, fmt.Errorf("media resource returned HTTP status %s: %s", resp.Status, m.MediaURLHTTPS)
|
|
// }
|
|
// m.readCloser = resp.Body
|
|
|
|
default:
|
|
return nil, fmt.Errorf("unrecognized source value: must be api or archive: %s", t.source)
|
|
}
|
|
|
|
// if !oneMediaItem {
|
|
item := &timeline.Item{
|
|
ID: m.MediaIDStr,
|
|
Classification: timeline.ClassSocial,
|
|
Timestamp: m.parent.createdAtParsed,
|
|
Location: m.parent.location(),
|
|
Owner: m.owner(),
|
|
Content: m.content(),
|
|
}
|
|
ig.ToItem(timeline.RelAttachment, item)
|
|
// collItems = append(collItems, timeline.CollectionItem{
|
|
// Item: item,
|
|
// Position: i,
|
|
// })
|
|
// }
|
|
}
|
|
|
|
// if len(collItems) > 0 {
|
|
// ig.Collections = append(ig.Collections, timeline.Collection{
|
|
// OriginalID: "tweet_" + t.id(),
|
|
// Items: collItems,
|
|
// })
|
|
// }
|
|
}
|
|
|
|
// // if we're using the API, go ahead and get the
|
|
// // 'parent' tweet to which this tweet is a reply
|
|
// if t.source == "api" && t.InReplyToStatusIDStr != "" {
|
|
// inReplyToTweet, err := c.getTweetFromAPI(t.InReplyToStatusIDStr)
|
|
// if err != nil {
|
|
// return nil, fmt.Errorf("getting tweet that this tweet (%s) is in reply to (%s): %v",
|
|
// t.id(), t.InReplyToStatusIDStr, err)
|
|
// }
|
|
// skip, err := c.prepareTweet(ctx, &inReplyToTweet, "api", opt)
|
|
// if err != nil {
|
|
// return nil, fmt.Errorf("preparing reply-parent tweet: %v", err)
|
|
// }
|
|
// if !skip {
|
|
// repIG, err := c.makeItemGraphFromTweet(ctx, inReplyToTweet, fsys, opt)
|
|
// if err != nil {
|
|
// return nil, fmt.Errorf("making item from tweet that this tweet (%s) is in reply to (%s): %v",
|
|
// t.id(), inReplyToTweet.id(), err)
|
|
// }
|
|
// // TODO: is this relation backwards?
|
|
// ig.Edges = append(ig.Edges, timeline.Relationship{
|
|
// Relation: timeline.RelReply,
|
|
// To: repIG,
|
|
// })
|
|
// }
|
|
// }
|
|
|
|
// // if this tweet embeds/quotes/links to other tweets,
|
|
// // we should establish those relationships as well
|
|
// if t.source == "api" && t.Entities != nil {
|
|
// for _, urlEnt := range t.Entities.URLs {
|
|
// embeddedTweetID := getLinkedTweetID(urlEnt.ExpandedURL)
|
|
// if embeddedTweetID == "" {
|
|
// continue
|
|
// }
|
|
// embeddedTweet, err := c.getTweetFromAPI(embeddedTweetID)
|
|
// if err != nil {
|
|
// return nil, fmt.Errorf("getting tweet that this tweet (%s) embeds (%s): %v",
|
|
// t.id(), t.InReplyToStatusIDStr, err)
|
|
// }
|
|
// skip, err := c.prepareTweet(ctx, &embeddedTweet, "api", opt)
|
|
// if err != nil {
|
|
// return nil, fmt.Errorf("preparing embedded tweet: %v", err)
|
|
// }
|
|
// if !skip {
|
|
// embIG, err := c.makeItemGraphFromTweet(ctx, embeddedTweet, fsys, opt)
|
|
// if err != nil {
|
|
// return nil, fmt.Errorf("making item from tweet that this tweet (%s) embeds (%s): %v",
|
|
// t.id(), embeddedTweet.id(), err)
|
|
// }
|
|
// ig.Edges = append(ig.Edges, timeline.Relationship{
|
|
// Relation: timeline.RelQuotes,
|
|
// To: embIG,
|
|
// })
|
|
// }
|
|
// }
|
|
// }
|
|
|
|
return ig, nil
|
|
}
|
|
|
|
// Options customizes which tweets are turned into items.
type Options struct {
	// Retweets reports whether retweets are included.
	Retweets bool `json:"retweets"`

	// Replies reports whether replies to tweets that are not our own are
	// included; replies to our own tweets are a continuation of thought
	// and are not affected by this option.
	Replies bool `json:"replies"`
}
|
|
|
|
// type checkpoint struct {
|
|
// NextToken string
|
|
// }
|
|
|
|
// // save records the checkpoint.
|
|
// func (ch *checkpoint) save(ctx context.Context) {
|
|
// gobBytes, err := timeline.MarshalGob(ch)
|
|
// if err != nil {
|
|
// // TODO: proper logger
|
|
// // log.Printf("[ERROR][%s] Encoding checkpoint: %v", DataSourceID, err)
|
|
// }
|
|
// timeline.Checkpoint(ctx, gobBytes)
|
|
// }
|
|
|
|
// // load decodes the checkpoint.
|
|
// // TODO: see if we can just get the unmarshaled value and type-assert it
|
|
// // TODO: getting "gob: type mismatch: no fields matched compiling decoder for checkpoint" -- also, see if we can just save an any, why are we encoding as bytes first then marshaling as gob later?
|
|
// func (ch *checkpoint) load(checkpointGob []byte) {
|
|
// if len(checkpointGob) == 0 {
|
|
// return
|
|
// }
|
|
// err := timeline.UnmarshalGob(checkpointGob, ch)
|
|
// if err != nil {
|
|
// log.Printf("[ERROR][%s] Decoding checkpoint: %v", DataSourceID, err)
|
|
// }
|
|
// }
|
|
|
|
// // maxTweetID returns the higher of the two tweet IDs.
|
|
// // Errors parsing the strings as integers are ignored.
|
|
// // Empty string inputs are ignored so the other value
|
|
// // will win automatically. If both are empty, an empty
|
|
// // string is returned.
|
|
// func maxTweetID(id1, id2 string) string {
|
|
// if id1 == "" {
|
|
// return id2
|
|
// }
|
|
// if id2 == "" {
|
|
// return id1
|
|
// }
|
|
// id1int, _ := strconv.ParseInt(id1, 10, 64)
|
|
// id2int, _ := strconv.ParseInt(id2, 10, 64)
|
|
// if id1int > id2int {
|
|
// return id1
|
|
// }
|
|
// return id2
|
|
// }
|
|
|
|
// // getLinkedTweetID returns the ID of the tweet in
|
|
// // a link to a tweet, for example:
|
|
// // "https://twitter.com/foo/status/12345"
|
|
// // returns "12345". If the tweet ID cannot be found
|
|
// // or the URL does not match the right format,
|
|
// // an empty string is returned.
|
|
// func getLinkedTweetID(urlToTweet string) string {
|
|
// if !linkToTweetRE.MatchString(urlToTweet) {
|
|
// return ""
|
|
// }
|
|
// u, err := url.Parse(urlToTweet)
|
|
// if err != nil {
|
|
// return ""
|
|
// }
|
|
// return path.Base(u.Path)
|
|
// }
|
|
|
|
// var linkToTweetRE = regexp.MustCompile(`https?://twitter\.com/.*/status/[0-9]+`)
|