1
0
Fork 0
timelinize/datasources/twitter/archives.go
Matthew Holt b77238f86d
twitter: Try tweet.js if tweets.js isn't found
Thanks for the report. Apparently this occurs in archives made in 2022, possibly earlier.
2025-10-12 14:35:26 -06:00

407 lines
10 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
package twitter
import (
"context"
"encoding/json"
"errors"
"fmt"
"html"
"io"
"io/fs"
"net/url"
"path"
"strings"
"github.com/timelinize/timelinize/timeline"
)
// Recognize returns whether the input is supported.
//
// The input is recognized (with full confidence) when it contains a
// manifest file that can be opened, stripped of its JavaScript preface,
// and decoded as JSON. The manifest's archive generation date is reported
// as the snapshot date.
//
// A missing manifest is not an error; it simply yields an empty
// Recognition. Any other failure to open, read, or decode the manifest
// is returned to the caller rather than being silently discarded.
func (Client) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
	file, err := dirEntry.Open(manifestFile)
	if errors.Is(err, fs.ErrNotExist) {
		// no manifest: this is just not a Twitter archive
		return timeline.Recognition{}, nil
	}
	if err != nil {
		// a real I/O problem must be surfaced, not mistaken for "not recognized"
		return timeline.Recognition{}, err
	}
	defer file.Close()

	// consume non-JSON preface (JavaScript variable definition)
	if err := stripPreface(file); err != nil {
		return timeline.Recognition{}, err
	}

	var manifest archiveManifest
	if err := json.NewDecoder(file).Decode(&manifest); err != nil {
		return timeline.Recognition{}, err
	}

	return timeline.Recognition{
		Confidence:   1,
		SnapshotDate: &manifest.ArchiveInfo.GenerationDate,
	}, nil
}
// FileImport imports data from the input. It loads the archive owner's
// account, then sends the archive's tweets and finally its direct
// messages down the import pipeline.
func (c *Client) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
	opts := *params.DataSourceOptions.(*Options)
	archive := dirEntry.FS

	// the owner must be known before any items are created
	owner, err := c.loadOwnerAccountFromArchive(archive)
	if err != nil {
		return fmt.Errorf("unable to get owner account: %w", err)
	}
	c.owner = owner.entity(ctx, archive)

	// first pass - add tweets to timeline
	if err := c.processArchive(ctx, archive, params, c.makeItemGraphFromTweet, opts); err != nil {
		return fmt.Errorf("processing tweets: %w", err)
	}

	// TODO: not ready yet
	// // second pass - add tweet relationships to timeline
	// err = c.processArchive(ctx, fsys, itemChan, c.processReplyRelationFromArchive, dsOpt)
	// if err != nil {
	// return fmt.Errorf("processing tweets round 2: %v", err)
	// }

	if err := c.processDirectMessages(ctx, archive, params, opts); err != nil {
		return fmt.Errorf("processing direct messages: %w", err)
	}

	return nil
}
// processArchive opens the archive's tweets file (falling back to the
// file name seen in 2022 archives if the primary one does not exist),
// skips its JavaScript preface, and runs processFunc over every tweet
// it contains.
func (c *Client) processArchive(ctx context.Context, fsys fs.FS, params timeline.ImportParams, processFunc archiveProcessFn, opt Options) error {
	tweets, openErr := fsys.Open(tweetsFile)
	if errors.Is(openErr, fs.ErrNotExist) {
		// try the alternate file name before giving up
		tweets, openErr = fsys.Open(tweetsFile2022)
	}
	if openErr != nil {
		return openErr
	}
	defer tweets.Close()

	// consume non-JSON preface (JavaScript variable definition)
	if err := stripPreface(tweets); err != nil {
		return fmt.Errorf("reading tweet file preface: %w", err)
	}

	if err := c.processTweetsFromArchive(ctx, params, tweets, fsys, processFunc, opt); err != nil {
		return fmt.Errorf("processing tweet file: %w", err)
	}

	return nil
}
// processDirectMessages streams the conversations in the archive's direct
// messages file and sends one graph per message down the pipeline. Each
// graph holds the message item (owned by the sender), a "sent" relation
// to the recipient, and an attachment item for every media URL that can
// be parsed.
func (c *Client) processDirectMessages(ctx context.Context, fsys fs.FS, params timeline.ImportParams, _ Options) error {
	dmFile, err := fsys.Open(dmsFile)
	if err != nil {
		return err
	}
	defer dmFile.Close()

	// consume non-JSON preface (JavaScript variable definition)
	if err := stripPreface(dmFile); err != nil {
		return fmt.Errorf("reading direct messages file preface: %w", err)
	}

	decoder := json.NewDecoder(dmFile)

	// read array opening bracket '['
	if _, err := decoder.Token(); err != nil {
		return fmt.Errorf("decoding opening token: %w", err)
	}

	for decoder.More() {
		// stop promptly if the import was canceled
		if err := ctx.Err(); err != nil {
			return err
		}

		var conversation directMessages
		if err := decoder.Decode(&conversation); err != nil {
			return fmt.Errorf("decoding conversation element: %w", err)
		}

		for _, msg := range conversation.DMConversation.Messages {
			mc := msg.MessageCreate

			// sender returns a fresh entity identifying the message's author;
			// the message and each attachment get their own copy
			sender := func() timeline.Entity {
				return timeline.Entity{
					Attributes: []timeline.Attribute{
						{
							Name:     identityAttribute,
							Value:    mc.SenderID,
							Identity: true,
						},
					},
				}
			}

			// replace the annoying t.co links with the real links
			body := mc.Text
			for _, link := range mc.URLs {
				body = strings.ReplaceAll(body, link.URL, link.Expanded)
			}
			body = html.UnescapeString(body)

			graph := &timeline.Graph{
				Item: &timeline.Item{
					ID:             mc.ID,
					Classification: timeline.ClassMessage,
					Timestamp:      mc.CreatedAt,
					Owner:          sender(),
					Content: timeline.ItemData{
						Data: timeline.StringData(body),
					},
				},
			}

			// relate recipient
			graph.ToEntity(timeline.RelSent, &timeline.Entity{
				Attributes: []timeline.Attribute{
					{
						Name:     identityAttribute,
						Value:    mc.RecipientID,
						Identity: true,
					},
				},
			})

			// attach any media
			for _, mediaURL := range mc.MediaURLs {
				parsed, err := url.Parse(mediaURL)
				if err != nil {
					// unparseable media URL: skip just this attachment
					continue
				}

				segments := strings.Split(parsed.Path, "/")

				// the file name within the archive's media folder depends
				// on the shape of the URL path
				mediaPath := path.Join("data", "direct_messages_media")
				if len(segments) == 4 {
					// /dm_gif/... paths
					mediaPath = path.Join(mediaPath, mc.ID+"-"+path.Base(parsed.Path))
				} else if len(segments) == 5 {
					// /dm/... paths
					mediaPath = path.Join(mediaPath, segments[2]+"-"+segments[4])
				}

				graph.ToItem(timeline.RelAttachment, &timeline.Item{
					Classification: timeline.ClassMessage,
					Timestamp:      mc.CreatedAt,
					Owner:          sender(),
					Content: timeline.ItemData{
						Filename: segments[len(segments)-1],
						// opened lazily, only when the data is actually read
						Data: func(_ context.Context) (io.ReadCloser, error) {
							return fsys.Open(mediaPath)
						},
					},
				})
			}

			params.Pipeline <- graph
		}
	}

	return nil
}
// processTweetsFromArchive decodes a JSON array of tweet containers from
// file (whose JavaScript preface must already have been consumed) and,
// for each tweet, prepares it, runs processFunc on it, and sends any
// resulting graph down the pipeline.
//
// Tweets that prepareTweet flags to be skipped are ignored, as are tweets
// for which processFunc returns a nil graph. The loop stops with the
// context's error as soon as ctx is canceled, and with a wrapped error if
// decoding, preparing, or processing a tweet fails.
func (c *Client) processTweetsFromArchive(ctx context.Context, params timeline.ImportParams, file io.Reader, fsys fs.FS, processFunc archiveProcessFn, opt Options) error {
	dec := json.NewDecoder(file)

	// read array opening bracket '['
	if _, err := dec.Token(); err != nil {
		return fmt.Errorf("decoding opening token: %w", err)
	}

	for dec.More() {
		// honor cancellation between tweets, like the direct message loop does;
		// otherwise a canceled import keeps decoding the entire file
		if err := ctx.Err(); err != nil {
			return err
		}

		var container struct {
			Tweet tweet `json:"tweet"`
		}
		if err := dec.Decode(&container); err != nil {
			return fmt.Errorf("decoding tweet element: %w", err)
		}
		t := container.Tweet

		skip, err := c.prepareTweet(ctx, &t, "archive", opt)
		if err != nil {
			return fmt.Errorf("preparing tweet: %w", err)
		}
		if skip {
			continue
		}

		ig, err := processFunc(ctx, params, t, fsys, opt)
		if err != nil {
			return fmt.Errorf("processing tweet: %w", err)
		}

		// send the tweet(s) for processing
		if ig != nil {
			params.Pipeline <- ig
		}
	}

	return nil
}
// func (c *Client) processReplyRelationFromArchive(_ context.Context, t tweet, _ fs.FS, _ Options) (*timeline.Graph, error) {
// if t.InReplyToStatusIDStr == "" {
// // current tweet is not a reply, so no relationship to add
// return nil, nil
// }
// if t.InReplyToUserIDStr != "" && t.InReplyToUserIDStr != c.owner.AttributeValue(identityAttribute) {
// // from archives, we only support storing replies to self... (TODO:)
// return nil, nil
// }
// // TODO: with the new Relationship struct, we'll have to get the actual items, not just their IDs...
// // ig := &timeline.ItemGraph{
// // Relationships: []timeline.Relationship{
// // {
// // FromItemID: t.TweetIDStr,
// // ToItemID: t.InReplyToStatusIDStr,
// // Relation: timeline.RelReplyTo,
// // },
// // },
// // }
// return nil, errors.New("TODO: not implemented")
// }
// loadOwnerAccountFromArchive reads the archive owner's account info and
// then augments it with phone numbers and profile data from the archive.
// If any step fails, a zero-value account and that step's error are returned.
func (c *Client) loadOwnerAccountFromArchive(fsys fs.FS) (twitterAccount, error) {
	account, err := c.getAccountInfoFromArchive(fsys)
	if err != nil {
		return twitterAccount{}, err
	}

	// run the enrichment steps in order, stopping at the first failure
	enrichers := []func(fs.FS, *twitterAccount) error{
		c.addPhoneNumbersFromArchive,
		c.addProfileFromArchive,
	}
	for _, enrich := range enrichers {
		if err := enrich(fsys, &account); err != nil {
			return twitterAccount{}, err
		}
	}

	return account, nil
}
// getAccountInfoFromArchive decodes data/account.js from the archive and
// returns the account of its first entry. It is an error for the file to
// be missing, undecodable, or to contain no entries.
func (c *Client) getAccountInfoFromArchive(fsys fs.FS) (twitterAccount, error) {
	accountFile, err := fsys.Open("data/account.js")
	if err != nil {
		return twitterAccount{}, err
	}
	defer accountFile.Close()

	// consume non-JSON preface (JavaScript variable definition)
	if err := stripPreface(accountFile); err != nil {
		return twitterAccount{}, fmt.Errorf("reading account file preface: %w", err)
	}

	var entries twitterAccountFile
	if err := json.NewDecoder(accountFile).Decode(&entries); err != nil {
		return twitterAccount{}, fmt.Errorf("decoding account file: %w", err)
	}

	if len(entries) == 0 {
		return twitterAccount{}, errors.New("account file was empty")
	}
	return entries[0].Account, nil
}
// addPhoneNumbersFromArchive decodes data/phone-number.js into
// acc.PhoneNumbers. The file is optional: if it does not exist, acc is
// left untouched and nil is returned.
func (c *Client) addPhoneNumbersFromArchive(fsys fs.FS, acc *twitterAccount) error {
	phoneFile, err := fsys.Open("data/phone-number.js")
	switch {
	case errors.Is(err, fs.ErrNotExist):
		// nothing to add
		return nil
	case err != nil:
		return err
	}
	defer phoneFile.Close()

	// consume non-JSON preface (JavaScript variable definition)
	if err := stripPreface(phoneFile); err != nil {
		return fmt.Errorf("reading preface: %w", err)
	}

	return json.NewDecoder(phoneFile).Decode(&acc.PhoneNumbers)
}
// addProfileFromArchive decodes data/profile.js into acc.Profile. The
// file is optional: if it does not exist, acc is left untouched and nil
// is returned.
func (c *Client) addProfileFromArchive(fsys fs.FS, acc *twitterAccount) error {
	profileFile, err := fsys.Open("data/profile.js")
	switch {
	case errors.Is(err, fs.ErrNotExist):
		// nothing to add
		return nil
	case err != nil:
		return err
	}
	defer profileFile.Close()

	// consume non-JSON preface (JavaScript variable definition)
	if err := stripPreface(profileFile); err != nil {
		return fmt.Errorf("reading preface: %w", err)
	}

	return json.NewDecoder(profileFile).Decode(&acc.Profile)
}
// stripPreface consumes bytes from f up to and including the first '='.
// (Each .js file starts with a variable definition at the beginning.)
// It reads exactly one byte at a time so that nothing past the '=' is
// consumed, leaving f positioned for whoever reads from it next. If f
// ends before a '=' is seen, the read error (io.EOF) is returned.
func stripPreface(f io.Reader) error {
	var cur [1]byte
	for cur[0] != '=' {
		if _, err := io.ReadFull(f, cur[:]); err != nil {
			return err
		}
	}
	return nil
}
// archiveProcessFn is a function that processes a
// tweet from a Twitter export archive and returns
// a *timeline.Graph created from t. The caller in this
// file (processTweetsFromArchive) does not send a nil
// graph down the pipeline, and aborts the import with
// a wrapped error if a non-nil error is returned.
type archiveProcessFn func(ctx context.Context, params timeline.ImportParams, t tweet, fsys fs.FS, opt Options) (*timeline.Graph, error)
// Paths of files within a Twitter export archive, relative to the
// archive root.
const (
// manifestFile is opened by Recognize to detect an archive and
// read its generation date.
manifestFile = "data/manifest.js"
// dmsFile is the file read by processDirectMessages.
dmsFile = "data/direct-messages.js"
// tweetsFile2022 is the fallback used by processArchive when
// tweetsFile does not exist.
tweetsFile2022 = "data/tweet.js" // seen in an archive from 2022
// tweetsFile is the tweets file that processArchive tries first.
tweetsFile = "data/tweets.js"
)