787 lines
25 KiB
Go
787 lines
25 KiB
Go
/*
|
|
Timelinize
|
|
Copyright (c) 2013 Matthew Holt
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU Affero General Public License as published
|
|
by the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU Affero General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Affero General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
package facebook
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"io/fs"
|
|
"path"
|
|
"regexp"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/timelinize/timelinize/datasources/media"
|
|
"github.com/timelinize/timelinize/timeline"
|
|
"go.uber.org/zap"
|
|
)
|
|
|
|
// TODO: Add media from "album" folder (create Collections as well)
|
|
|
|
// Well-known file paths inside pre-2024 Facebook archive exports.
// (Facebook changed the archive layout around 2024; see the next
// const block for the newer layout.)
const (
	pre2024ProfileInfoPath             = "profile_information/profile_information.json"
	pre2024YourUncategorizedPhotosPath = "posts/your_uncategorized_photos.json"
	pre2024YourVideosPath              = "posts/your_videos.json"
	pre2024YourPostsPrefix             = "posts/your_posts_"
	pre2024MessagesPrefix              = "messages"
	// NOTE(review): this path says "instagram" even though this is the
	// facebook package — possibly copied from the Instagram importer;
	// confirm it is the intended messages prefix for 2025-era archives.
	messagesPrefix2025 = "your_instagram_activity/messages"
)
|
|
|
|
// Well-known file paths inside 2024-era Facebook archive exports,
// which nest everything under "your_facebook_activity" and
// "personal_information" directories.
const (
	year2024ProfileInfoPath             = "personal_information/profile_information/profile_information.json"
	year2024YourUncategorizedPhotosPath = "your_facebook_activity/posts/your_uncategorized_photos.json"
	year2024YourVideosPath              = "your_facebook_activity/posts/your_videos.json"
	year2024YourPostsPrefix             = "your_facebook_activity/posts/your_posts__check_ins__photos_and_videos_"
	year2024MessagesPrefix              = "your_facebook_activity/messages"
	year2024PostMediaPrefix             = "your_facebook_activity/posts/media"
	year2024AlbumPrefix                 = "your_facebook_activity/posts/album"
	year2024TaggedPlacesPath            = "your_facebook_activity/posts/places_you_have_been_tagged_in.json"
	year2024CheckInsPath                = "your_facebook_activity/posts/check-ins.json"
)
|
|
|
|
// Archive implements the importer for Facebook archives.
type Archive struct {
	// owner is the entity all imported items are attributed to;
	// populated by setOwnerEntity before any processing begins.
	owner timeline.Entity
}
|
|
|
|
// Recognize returns whether the input file is recognized.
|
|
func (Archive) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
|
|
// reject HTML-formatted archives, which we don't support
|
|
const oneDotHTML = "1.html"
|
|
if dirEntry.FileExists(pre2024YourPostsPrefix+oneDotHTML) ||
|
|
dirEntry.FileExists(year2024YourPostsPrefix+oneDotHTML) ||
|
|
dirEntry.FileExists(path.Join(year2024AlbumPrefix, oneDotHTML)) {
|
|
return timeline.Recognition{}, nil
|
|
}
|
|
|
|
if dirEntry.FileExists(pre2024ProfileInfoPath) ||
|
|
dirEntry.FileExists(year2024ProfileInfoPath) {
|
|
return timeline.Recognition{Confidence: 1}, nil
|
|
}
|
|
if dirEntry.FileExists("your_facebook_activity") {
|
|
return timeline.Recognition{Confidence: .95}, nil
|
|
}
|
|
if strings.HasPrefix(dirEntry.Name(), "facebook-") {
|
|
return timeline.Recognition{Confidence: .9}, nil
|
|
}
|
|
return timeline.Recognition{}, nil
|
|
}
|
|
|
|
// FileImport imports the data in the file.
func (a Archive) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
	// panics if the options are not *Options; the pipeline is expected
	// to always supply this data source's own options type
	dsOpt := params.DataSourceOptions.(*Options)

	// resolve the owner entity first so every item below is attributed to it
	if err := a.setOwnerEntity(ctx, dirEntry, dsOpt); err != nil {
		return err
	}

	// start with oldest supported archive version
	postsFilePrefix := pre2024YourPostsPrefix

	// posts: archives shard posts across numbered JSON files
	// (prefix_1.json, prefix_2.json, ...); iterate until the next file in
	// sequence does not exist (10000 is just a sanity bound)
	for i := 1; i < 10000; i++ {
		postsFilename := fmt.Sprintf("%s%d.json", postsFilePrefix, i)

		postsFile, err := dirEntry.Open(postsFilename)
		if errors.Is(err, fs.ErrNotExist) && postsFilePrefix == pre2024YourPostsPrefix {
			// try newer version
			postsFilePrefix = year2024YourPostsPrefix
			postsFilename = fmt.Sprintf("%s%d.json", postsFilePrefix, i)
			postsFile, err = dirEntry.Open(postsFilename)
		}
		if errors.Is(err, fs.ErrNotExist) {
			break // no more posts files
		}
		if err != nil {
			return err
		}

		// close explicitly rather than defer, so files aren't held open
		// across loop iterations
		err = a.processPostsFile(ctx, dirEntry, postsFile, params)
		postsFile.Close()
		if err != nil {
			return fmt.Errorf("processing %s: %w", postsFilename, err)
		}
	}

	// album media
	if err := a.processAlbumFiles(ctx, dirEntry, params, []string{
		year2024AlbumPrefix,
	}); err != nil {
		return fmt.Errorf("processing album folder: %w", err)
	}

	// post media (done separately since they may not be in the same archive as the JSON manifest)
	if err := a.processPostMedia(ctx, dirEntry, params, []string{
		year2024PostMediaPrefix,
		"your_activity_across_facebook", // also found this in a year 2024 archive, full of media files
	}); err != nil {
		return fmt.Errorf("processing post media: %w", err)
	}

	// uncategorized photos
	if err := a.processPhotosOrVideos(ctx, dirEntry, params, []string{
		pre2024YourUncategorizedPhotosPath,
		year2024YourUncategorizedPhotosPath,
	}, new(fbYourUncategorizedPhotos)); err != nil {
		return fmt.Errorf("processing uncategorized photos: %w", err)
	}

	// uncategorized videos
	if err := a.processPhotosOrVideos(ctx, dirEntry, params, []string{
		pre2024YourVideosPath,
		year2024YourVideosPath,
	}, new(fbYourVideos)); err != nil {
		return fmt.Errorf("processing videos: %w", err)
	}

	// tagged places
	if err := a.processTaggedPlaces(ctx, dirEntry, params, []string{
		year2024TaggedPlacesPath,
	}); err != nil {
		return fmt.Errorf("processing tagged places: %w", err)
	}

	// check-ins
	if err := a.processCheckins(ctx, dirEntry, params, []string{
		year2024CheckInsPath,
	}); err != nil {
		return fmt.Errorf("processing check-ins: %w", err)
	}

	// messages
	err := GetMessages("facebook", dirEntry, params)
	if err != nil {
		return err
	}

	return nil
}
|
|
|
|
func (a Archive) processPostsFile(ctx context.Context, d timeline.DirEntry, file fs.File, params timeline.ImportParams) error {
|
|
var posts yourPosts
|
|
if err := json.NewDecoder(file).Decode(&posts); err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, post := range posts {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
var postText string
|
|
for _, postData := range post.Data {
|
|
if postData.Post != "" {
|
|
postText = FixString(postData.Post)
|
|
break
|
|
}
|
|
}
|
|
|
|
item := &timeline.Item{
|
|
Classification: timeline.ClassSocial,
|
|
Timestamp: time.Unix(post.Timestamp, 0).UTC(), // these unix timestams are set in UTC
|
|
Owner: a.owner,
|
|
Content: timeline.ItemData{
|
|
Data: timeline.StringData(postText),
|
|
},
|
|
Metadata: timeline.Metadata{
|
|
"Title": post.Title,
|
|
},
|
|
}
|
|
ig := &timeline.Graph{Item: item}
|
|
|
|
const nameMatchIndex = 1
|
|
if matches := wroteOnOtherTimelineRegex.FindStringSubmatch(post.Title); len(matches) == nameMatchIndex+1 {
|
|
ig.ToEntity(timeline.RelSent, &timeline.Entity{
|
|
Name: matches[1],
|
|
Attributes: []timeline.Attribute{
|
|
{
|
|
Name: "facebook_name",
|
|
Value: matches[1],
|
|
Identifying: true,
|
|
},
|
|
},
|
|
})
|
|
}
|
|
|
|
// confusingly, an attachment object in this array can have multiple attachments
|
|
// of various types... I don't really know why they group them together, but we
|
|
// just store 1 point of data (whether it be a location, a media file, or text
|
|
// content, for example) per item
|
|
for _, attachmentGroup := range post.Attachments {
|
|
for _, attachment := range attachmentGroup.Data {
|
|
attachedItem := &timeline.Item{
|
|
Owner: a.owner,
|
|
Timestamp: item.Timestamp, // assume main item timestamp, can be changed later
|
|
Metadata: make(timeline.Metadata),
|
|
}
|
|
switch {
|
|
case attachment.Text != "":
|
|
text := FixString(attachment.Text)
|
|
if descStr, ok := attachedItem.Metadata["Description"].(string); ok && text != descStr {
|
|
attachedItem.Content.Data = timeline.StringData(text)
|
|
}
|
|
case attachment.Media.URI != "":
|
|
attachedItem.Classification = timeline.ClassSocial
|
|
attachment.Media.fillItem(attachedItem, d, postText, params.Log)
|
|
case attachment.ExternalContext.URL != "":
|
|
attachedItem.Content.Data = timeline.StringData(attachment.ExternalContext.Name)
|
|
attachedItem.Metadata["URL"] = attachment.ExternalContext.URL
|
|
attachedItem.Classification = timeline.ClassLocation
|
|
attachedItem.Timestamp = item.Timestamp
|
|
case attachment.Place.Name != "" ||
|
|
attachment.Place.Address != "" ||
|
|
attachment.Place.URL != "" ||
|
|
(attachment.Place.Coordinate.Latitude != 0 && attachment.Place.Coordinate.Longitude != 0):
|
|
// We don't know the context of this attachment; is it something like, "I visited this
|
|
// place, it was cool?" or is it like "Hey everyone, this place is having an event
|
|
// that you should go to" -- i.e. does it necessarily mean the owner is at this place?
|
|
// Maybe sometimes, but I don't know that we know for sure.
|
|
// I wonder if we should just store this as an entity and attach it
|
|
|
|
address := timeline.Attribute{
|
|
Name: "address",
|
|
Value: attachment.Place.Address,
|
|
}
|
|
if attachment.Place.Coordinate.Latitude != 0 && attachment.Place.Coordinate.Longitude != 0 {
|
|
address.Latitude = &attachment.Place.Coordinate.Latitude
|
|
address.Longitude = &attachment.Place.Coordinate.Longitude
|
|
}
|
|
|
|
place := &timeline.Entity{
|
|
Type: timeline.EntityPlace,
|
|
Name: attachment.Place.Name,
|
|
Attributes: []timeline.Attribute{address},
|
|
}
|
|
ig.ToEntity(timeline.RelAttachment, place)
|
|
}
|
|
|
|
// newDescription := strings.Join(allText, "\n")
|
|
// if existingDesc, ok := attachedItem.Metadata["Description"].(string); ok && existingDesc != "" {
|
|
// newDescription += "\n\n" + existingDesc
|
|
// }
|
|
// attachedItem.Metadata["Description"] = newDescription
|
|
|
|
if attachedItem.Content.Data != nil || attachedItem.Content.Filename != "" {
|
|
ig.ToItem(timeline.RelAttachment, attachedItem)
|
|
}
|
|
}
|
|
}
|
|
|
|
params.Pipeline <- ig
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// processAlbumFiles walks the given album folders, decoding each regular
// file as a JSON album manifest, and sends one media item per photo entry
// through the pipeline, linked into a collection item representing the album.
//
// NOTE(review): every regular file under the prefix is decoded as JSON; if
// an archive ever places non-JSON files (e.g. the media itself) here, the
// decode error aborts the whole walk — confirm against real archives.
func (a Archive) processAlbumFiles(ctx context.Context, tlDirEntry timeline.DirEntry, opt timeline.ImportParams, pathsToTry []string) error {
	for _, pathToTry := range pathsToTry {
		err := fs.WalkDir(tlDirEntry, pathToTry, func(fpath string, d fs.DirEntry, err error) error {
			if err != nil {
				return fmt.Errorf("visiting %s: %w", fpath, err)
			}
			if err := ctx.Err(); err != nil {
				return err
			}
			if d.IsDir() {
				return nil
			}

			f, err := tlDirEntry.Open(fpath)
			if err != nil {
				return err
			}
			// defer is per callback invocation, so the file is closed when
			// this visit returns, not at the end of the walk
			defer f.Close()

			var albumInfo fbAlbumMeta
			if err := json.NewDecoder(f).Decode(&albumInfo); err != nil {
				return fmt.Errorf("decoding album file: %s: %w", fpath, err)
			}

			for _, entry := range albumInfo.Photos {
				// entries without a URI have no media to import
				if entry.URI == "" {
					continue
				}
				it := &timeline.Item{
					Classification:       timeline.ClassMedia,
					IntermediateLocation: entry.URI,
					Owner:                a.owner,
					Timestamp:            time.Unix(entry.CreationTimestamp, 0).UTC(), // this is not when the photo was taken, but we'll get that later, if it's in the actual photo itself
					Content: timeline.ItemData{
						Filename: path.Base(entry.URI),
					},
					Metadata: timeline.Metadata{
						"Description": FixString(entry.Description),
					},
				}
				// key the item so re-imports (or the media file arriving in a
				// different archive part) merge instead of duplicating; prefer
				// existing timestamp/data since the real file carries better info
				it.Retrieval.SetKey(retrievalKey(tlDirEntry, entry.URI))
				it.Retrieval.FieldUpdatePolicies = map[string]timeline.FieldUpdatePolicy{
					"intermediate_location": timeline.UpdatePolicyPreferExisting,
					"timestamp":             timeline.UpdatePolicyPreferExisting,
					"location":              timeline.UpdatePolicyPreferIncoming,
					"owner":                 timeline.UpdatePolicyPreferIncoming,
					"data":                  timeline.UpdatePolicyPreferExisting,
					"metadata":              timeline.UpdatePolicyPreferIncoming,
				}

				g := &timeline.Graph{Item: it}

				// add photo to album
				g.ToItem(timeline.RelInCollection, &timeline.Item{
					Classification: timeline.ClassCollection,
					Content: timeline.ItemData{
						Data: timeline.StringData(albumInfo.Name),
					},
					Owner: a.owner,
					Metadata: timeline.Metadata{
						"Description": FixString(albumInfo.Description),
					},
				})

				opt.Pipeline <- g
			}

			return nil
		})
		if errors.Is(err, fs.ErrNotExist) {
			continue // it's valid for an archive not to have this data
		}
		if err != nil {
			return fmt.Errorf("could not walk known album folder: %s: %w", pathToTry, err)
		}
	}

	return nil
}
|
|
|
|
// processPostMedia walks the given media folders and sends each media file
// through the pipeline as its own item. This is done separately from the
// posts JSON because the media files may live in a different archive part
// than the manifest that references them; the retrieval key ties them back
// to the item created from the manifest.
func (a Archive) processPostMedia(ctx context.Context, tlDirEntry timeline.DirEntry, opt timeline.ImportParams, pathsToTry []string) error {
	for _, pathToTry := range pathsToTry {
		err := fs.WalkDir(tlDirEntry, pathToTry, func(fpath string, d fs.DirEntry, err error) error {
			if err != nil {
				return fmt.Errorf("visiting %s: %w", fpath, err)
			}
			if err := ctx.Err(); err != nil {
				return err
			}
			if d.IsDir() {
				return nil
			}

			info, err := d.Info()
			if err != nil {
				return err
			}

			// skip files that are clearly not media
			ext := strings.ToLower(path.Ext(info.Name()))
			switch ext {
			// in reality I've only seen .jpg, .mp4, and no extension, for valid media files
			case ".jpg", ".jpeg", ".gif", ".png", ".heic", "", ".mp4", ".mov":
			default:
				return nil
			}

			it := &timeline.Item{
				Classification:       timeline.ClassMedia,
				IntermediateLocation: fpath,
				Owner:                a.owner,
				Content: timeline.ItemData{
					Filename: d.Name(),
					Size:     uint64(info.Size()), //nolint:gosec // Sigh, yes, I know this can overflow... but will it really??
					// open lazily so the file is only read if/when the
					// pipeline actually wants its bytes
					Data: func(_ context.Context) (io.ReadCloser, error) {
						return tlDirEntry.Open(fpath)
					},
				},
			}

			// best-effort: a metadata failure is logged, not fatal
			_, err = media.ExtractAllMetadata(opt.Log, tlDirEntry, fpath, it, timeline.MetaMergeAppend)
			if err != nil {
				opt.Log.Error("extracting metadata from Facebook media",
					zap.String("file", fpath),
					zap.Error(err))
			}

			// same key as the manifest-derived item so the two merge; the
			// actual file's data wins, but metadata from the manifest is kept
			retKey := retrievalKey(tlDirEntry, fpath)
			it.Retrieval.SetKey(retKey)
			it.Retrieval.FieldUpdatePolicies = map[string]timeline.FieldUpdatePolicy{
				"data":     timeline.UpdatePolicyPreferIncoming,
				"metadata": timeline.UpdatePolicyKeepExisting,
			}

			opt.Pipeline <- &timeline.Graph{Item: it}

			return nil
		})
		if errors.Is(err, fs.ErrNotExist) {
			continue // it's valid for an archive not to have this data
		}
		if err != nil {
			return fmt.Errorf("could not open known post media folder: %w - tried: %s", err, pathToTry)
		}
	}

	return nil
}
|
|
|
|
func (a Archive) processPhotosOrVideos(ctx context.Context, tlDirEntry timeline.DirEntry, opt timeline.ImportParams, pathsToTry []string, unmarshalInto any) error {
|
|
for _, pathToTry := range pathsToTry {
|
|
file, err := tlDirEntry.Open(pathToTry)
|
|
if errors.Is(err, fs.ErrNotExist) {
|
|
continue // it's valid for an archive not to have this data
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("could not open known photos/videos folder: %w - tried: %s", err, pathToTry)
|
|
}
|
|
defer file.Close()
|
|
|
|
if err := json.NewDecoder(file).Decode(unmarshalInto); err != nil {
|
|
return err
|
|
}
|
|
|
|
var medias []fbArchiveMedia
|
|
switch v := unmarshalInto.(type) {
|
|
case *fbYourUncategorizedPhotos:
|
|
medias = v.OtherPhotosV2
|
|
case *fbYourVideos:
|
|
medias = v.VideosV2
|
|
}
|
|
|
|
for _, media := range medias {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
item := &timeline.Item{
|
|
Owner: a.owner,
|
|
Classification: timeline.ClassMedia,
|
|
}
|
|
|
|
media.fillItem(item, tlDirEntry, "", opt.Log)
|
|
|
|
opt.Pipeline <- &timeline.Graph{Item: item}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (a Archive) processTaggedPlaces(ctx context.Context, tlDirEntry timeline.DirEntry, params timeline.ImportParams, pathsToTry []string) error {
|
|
for _, pathToTry := range pathsToTry {
|
|
file, err := tlDirEntry.Open(pathToTry)
|
|
if errors.Is(err, fs.ErrNotExist) {
|
|
continue // it's valid for an archive not to have this data
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("could not open known tagged places folder: %w - tried: %s", err, pathToTry)
|
|
}
|
|
defer file.Close()
|
|
|
|
var taggedPlaces fbTaggedPlaces
|
|
if err := json.NewDecoder(file).Decode(&taggedPlaces); err != nil {
|
|
return err
|
|
}
|
|
|
|
for _, taggedPlace := range taggedPlaces {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
// TODO: a privacy-preserving way to get the coordinates would be good (possibly a lookup using the Facebook ID could do it)
|
|
|
|
visitItem := &timeline.Item{
|
|
Owner: a.owner,
|
|
Classification: timeline.ClassLocation, // TODO: Technically, we don't have coordinates, should we make a new class?? (leaning toward no: a location can still be a visit to a named place, even if coords unknown; the class just means this person was logged at being at a place)
|
|
}
|
|
place := &timeline.Entity{
|
|
Type: timeline.EntityPlace,
|
|
Attributes: []timeline.Attribute{
|
|
{
|
|
Name: "facebook_id",
|
|
Value: taggedPlace.FBID,
|
|
Identity: true,
|
|
},
|
|
},
|
|
}
|
|
|
|
for _, label := range taggedPlace.LabelValues {
|
|
switch label.Label {
|
|
case "Place name":
|
|
place.Name = label.Value
|
|
case "Visit time":
|
|
visitItem.Timestamp = time.Unix(int64(label.TimestampValue), 0).UTC()
|
|
case "Name of application used to tag this place":
|
|
visitItem.Metadata = timeline.Metadata{
|
|
"Source": label.Value,
|
|
}
|
|
}
|
|
}
|
|
|
|
g := &timeline.Graph{Item: visitItem}
|
|
g.ToEntity(timeline.RelVisit, place)
|
|
|
|
params.Pipeline <- g
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (a Archive) processCheckins(ctx context.Context, tlDirEntry timeline.DirEntry, params timeline.ImportParams, pathsToTry []string) error {
|
|
for _, pathToTry := range pathsToTry {
|
|
file, err := tlDirEntry.Open(pathToTry)
|
|
if errors.Is(err, fs.ErrNotExist) {
|
|
continue // it's valid for an archive not to have this data
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("could not open known check-ins folder: %w - tried: %s", err, pathToTry)
|
|
}
|
|
defer file.Close()
|
|
|
|
// the check-ins file, at least as of 2024-2025, has two possible formats: an array of objects,
|
|
// or if there's only 1 check-in, it will have just that single object (no array)... sigh
|
|
var checkIns fbCheckIns
|
|
if err := json.NewDecoder(file).Decode(&checkIns); err != nil {
|
|
// wasn't an array, try the single-object decode
|
|
file.Close()
|
|
file, err = tlDirEntry.Open(pathToTry)
|
|
if err != nil {
|
|
return fmt.Errorf("reopening check-in file to try alternate decoding target: %s: %w", pathToTry, err)
|
|
}
|
|
defer file.Close()
|
|
var checkIn fbCheckIn
|
|
if err := json.NewDecoder(file).Decode(&checkIn); err != nil {
|
|
return err
|
|
}
|
|
checkIns = fbCheckIns{checkIn}
|
|
}
|
|
|
|
for _, checkIn := range checkIns {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
visitItem := &timeline.Item{
|
|
Owner: a.owner,
|
|
Classification: timeline.ClassLocation,
|
|
Timestamp: time.Unix(int64(checkIn.Timestamp), 0).UTC(),
|
|
Metadata: timeline.Metadata{
|
|
"Facebook ID": checkIn.FBID,
|
|
},
|
|
}
|
|
|
|
place := &timeline.Entity{
|
|
Type: timeline.EntityPlace,
|
|
}
|
|
|
|
for _, label := range checkIn.LabelValues {
|
|
switch label.Label {
|
|
case "Message":
|
|
visitItem.Content.Data = timeline.StringData(strings.TrimSpace(label.Value))
|
|
case "Place tags":
|
|
for _, dictLabel := range label.Dict {
|
|
switch dictLabel.Label {
|
|
case "Coordinates":
|
|
lat, lon, err := parseCoordsFromDictString(dictLabel.Value)
|
|
if err != nil {
|
|
params.Log.Error("invalid check-in coordinates", zap.Error(err))
|
|
continue
|
|
}
|
|
if lat != 0 && lon != 0 {
|
|
visitItem.Location = timeline.Location{
|
|
Longitude: &lon,
|
|
Latitude: &lat,
|
|
}
|
|
}
|
|
case "Address":
|
|
place.Attributes = append(place.Attributes, timeline.Attribute{
|
|
Name: "address",
|
|
Value: strings.TrimSpace(dictLabel.Value),
|
|
})
|
|
case "Name":
|
|
place.Name = strings.TrimSpace(dictLabel.Value)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
g := &timeline.Graph{Item: visitItem}
|
|
if place.Name != "" || len(place.Attributes) > 0 {
|
|
g.ToEntity(timeline.RelVisit, place)
|
|
}
|
|
|
|
params.Pipeline <- g
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// loadProfileInfo loads the profile info found within the DirEntry.
|
|
// It is possible for there to be none, in which case an empty profileInfo
|
|
// will be returned with no error (because no error occurred while looking
|
|
// for it; it is valid for multi-archive exports to not contain any except
|
|
// in just one of the archives).
|
|
func (Archive) loadProfileInfo(tlDirEntry timeline.DirEntry) (profileInfo, error) {
|
|
for _, pathToTry := range []string{
|
|
year2024ProfileInfoPath,
|
|
pre2024ProfileInfoPath,
|
|
} {
|
|
file, err := tlDirEntry.Open(pathToTry)
|
|
if errors.Is(err, fs.ErrNotExist) {
|
|
continue
|
|
}
|
|
if err != nil {
|
|
return profileInfo{}, fmt.Errorf("could not open known places for the profile info: %w", err)
|
|
}
|
|
defer file.Close()
|
|
|
|
var profileInfo profileInfo
|
|
err = json.NewDecoder(file).Decode(&profileInfo)
|
|
return profileInfo, err
|
|
}
|
|
return profileInfo{}, nil
|
|
}
|
|
|
|
// setOwnerEntity determines the entity that owns the data in this archive
// and stores it in a.owner. The owner comes from the profile manifest if
// this archive contains one; otherwise the username must be supplied via
// options (or be discoverable from the repo owner's attributes). If both a
// manifest and a configured username exist, they must agree so imports are
// attributed consistently.
func (a *Archive) setOwnerEntity(ctx context.Context, d timeline.DirEntry, options *Options) error {
	// we might need to get the username from the data source options, since one is not available to us in the data
	if options.Username == "" {
		if repoOwner, ok := ctx.Value(timeline.RepoOwnerCtxKey).(timeline.Entity); ok {
			if accountUsername, ok := repoOwner.AttributeValue("facebook_username").(string); ok {
				options.Username = accountUsername
			}
		}
	}

	profileInfo, err := a.loadProfileInfo(d)
	if err != nil {
		return err
	}
	if profileInfo.ProfileV2.Username == "" {
		if options.Username == "" {
			return errors.New("account username is needed, and cannot be empty")
		}

		// this archive doesn't contain profile info; that's expected with multi-archive exports
		// for all the archives but one; so use the user-supplied username
		a.owner = timeline.Entity{
			Attributes: []timeline.Attribute{
				{
					Name:     "facebook_username",
					Value:    options.Username,
					Identity: true,
				},
			},
		}
		return nil
	}

	// these have to match to ensure the imported data is attributed consistently to the correct owner entity
	if profileInfo.ProfileV2.Username != options.Username {
		return fmt.Errorf("configured username (%s) does not match what is in the profile manifest: %q",
			options.Username, profileInfo.ProfileV2.Username)
	}

	name := FixString(profileInfo.ProfileV2.Name.FullName)
	a.owner = timeline.Entity{
		Name: name,
		Attributes: []timeline.Attribute{
			{
				Name:     "facebook_username",
				Value:    profileInfo.ProfileV2.Username,
				Identity: true, // the data export only gives us this info for the owner, so it is the identity, but only for this user
			},
			{
				Name:        "facebook_name",
				Value:       name,
				Identifying: true, // this is not a great identifier, but it's what we're given throughout the data archive
			},
			{
				Name:  timeline.AttributeGender,
				Value: strings.ToLower(profileInfo.ProfileV2.Gender.Pronoun),
			},
			{
				Name:  "birth_place",
				Value: profileInfo.ProfileV2.Hometown.Name,
			},
		},
	}

	// only add a birth date when month and day are present (year may be
	// withheld and defaults to 0)
	// NOTE(review): time.Local here makes the stored birth date depend on the
	// machine's time zone — confirm that is intended.
	if profileInfo.ProfileV2.Birthday.Month != 0 &&
		profileInfo.ProfileV2.Birthday.Day != 0 {
		bdate := time.Date(
			profileInfo.ProfileV2.Birthday.Year,
			time.Month(profileInfo.ProfileV2.Birthday.Month),
			profileInfo.ProfileV2.Birthday.Day,
			0, 0, 0, 0, time.Local)
		a.owner.Attributes = append(a.owner.Attributes, timeline.Attribute{
			Name:  "birth_date",
			Value: bdate,
		})
	}

	// current and previous email addresses both identify the owner
	for _, email := range profileInfo.ProfileV2.Emails.Emails {
		a.owner.Attributes = append(a.owner.Attributes, timeline.Attribute{
			Name:        timeline.AttributeEmail,
			Value:       email,
			Identifying: true,
		})
	}
	for _, email := range profileInfo.ProfileV2.Emails.PreviousEmails {
		a.owner.Attributes = append(a.owner.Attributes, timeline.Attribute{
			Name:        timeline.AttributeEmail,
			Value:       email,
			Identifying: true,
		})
	}
	for _, website := range profileInfo.ProfileV2.Websites {
		a.owner.Attributes = append(a.owner.Attributes, timeline.Attribute{
			Name:  "website",
			Value: website.Address,
		})
	}

	return nil
}
|
|
|
|
// FixString fixes a malformed string created by decoding UTF-8-encoded JSON string
// values as UTF-16 strings. JSON string values *should* be encoded as UTF-16:
// https://datatracker.ietf.org/doc/html/rfc7159#section-7 -- but as of January 2023,
// Facebook's account archive exporter encodes emoji incorrectly with UTF-8 escapes.
// For example, code point U+1F642 is encoded as "\u00f0\u009f\u0099\u0082" instead of
// "\uD83D\uDE42" -- resulting in garbage like "ð". This function transforms the string
// to runes, then back to bytes as long as their rune value is < 255.
//
// Thanks to Jorropo on the Gophers Slack for helping me figure this out.
func FixString(malformed string) string {
	const maxByte = 255
	// BUG FIX: the previous implementation pre-sized the output to the rune
	// count and skipped runes > 255 by `continue`, leaving a 0x00 byte at each
	// skipped index -- embedding stray NUL bytes in the result. Appending
	// instead drops such runes cleanly.
	final := make([]byte, 0, len(malformed))
	for _, r := range malformed {
		if r > maxByte {
			// this rune cannot be one of the mis-decoded UTF-8 bytes; drop it
			continue
		}
		final = append(final, byte(r))
	}
	return string(final)
}
|
|
|
|
// wroteOnOtherTimelineRegex extracts the recipient's name from post titles
// like "Alice wrote on Bob's timeline."
// BUG FIX: the trailing period was previously unescaped, so it matched any
// character (e.g. "timelineX"); escape it to match a literal "." only.
var wroteOnOtherTimelineRegex = regexp.MustCompile(`.* wrote on (.*)'s timeline\.`)
|