371 lines
11 KiB
Go
371 lines
11 KiB
Go
/*
|
|
Timelinize
|
|
Copyright (c) 2013 Matthew Holt
|
|
|
|
This program is free software: you can redistribute it and/or modify
|
|
it under the terms of the GNU Affero General Public License as published
|
|
by the Free Software Foundation, either version 3 of the License, or
|
|
(at your option) any later version.
|
|
|
|
This program is distributed in the hope that it will be useful,
|
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
GNU Affero General Public License for more details.
|
|
|
|
You should have received a copy of the GNU Affero General Public License
|
|
along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
*/
|
|
|
|
// Package smsbackuprestore implements a data source for the Android SMS Backup & Restore app by SyncTech:
|
|
// https://synctech.com.au/sms-backup-restore/
|
|
package smsbackuprestore
|
|
|
|
import (
|
|
"context"
|
|
"encoding/json"
|
|
"encoding/xml"
|
|
"errors"
|
|
"fmt"
|
|
"io"
|
|
"io/fs"
|
|
"path"
|
|
"path/filepath"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/timelinize/timelinize/timeline"
|
|
"go.uber.org/zap"
|
|
"golang.org/x/net/html/charset"
|
|
)
|
|
|
|
func init() {
|
|
err := timeline.RegisterDataSource(timeline.DataSource{
|
|
Name: "sms_backup_restore",
|
|
Title: "SMS Backup & Restore",
|
|
Icon: "sms_backup_restore.png",
|
|
NewOptions: func() any { return new(Options) },
|
|
NewFileImporter: func() timeline.FileImporter { return new(FileImporter) },
|
|
})
|
|
if err != nil {
|
|
timeline.Log.Fatal("registering data source", zap.Error(err))
|
|
}
|
|
}
|
|
|
|
// FileImporter can import the data from a file.
|
|
type FileImporter struct{}
|
|
|
|
// Recognize returns whether this input is supported.
|
|
func (FileImporter) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
|
|
// not a match if the file is a directory
|
|
if dirEntry.IsDir() {
|
|
return timeline.Recognition{}, nil
|
|
}
|
|
|
|
// skip unsupported file types
|
|
switch strings.ToLower(path.Ext(dirEntry.Name())) {
|
|
case ".xml", ".zip":
|
|
default:
|
|
return timeline.Recognition{}, nil
|
|
}
|
|
|
|
file, err := dirEntry.Open(".")
|
|
if err != nil {
|
|
return timeline.Recognition{}, fmt.Errorf("opening file: %w", err)
|
|
}
|
|
defer file.Close()
|
|
|
|
dec := xml.NewDecoder(file)
|
|
dec.CharsetReader = charset.NewReaderLabel // handle non-UTF-8 encodings
|
|
for {
|
|
// NOTE: I've seen JSON files successfully get a first token from the XML decoder
|
|
tkn, err := dec.Token()
|
|
if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) {
|
|
break // ignore short or empty files
|
|
}
|
|
var syntaxErr *xml.SyntaxError
|
|
if errors.As(err, &syntaxErr) {
|
|
break // invalid XML file
|
|
}
|
|
if err != nil {
|
|
// other errors indicate we're unsure whether we can recognize this
|
|
return timeline.Recognition{}, fmt.Errorf("parsing XML token: %w", err)
|
|
}
|
|
|
|
if startElem, ok := tkn.(xml.StartElement); ok {
|
|
if startElem.Name.Local == "smses" {
|
|
// has the start of the expected XML structure!
|
|
return timeline.Recognition{Confidence: 1}, nil
|
|
}
|
|
break
|
|
}
|
|
}
|
|
|
|
return timeline.Recognition{}, nil
|
|
}
|
|
|
|
// Options contains provider-specific options for using this data source.
|
|
type Options struct {
|
|
// The phone number from which this export file originated.
|
|
// SMS Backup & Restore does not provide any identifying
|
|
// information of the recipient of these messages AT ALL,
|
|
// so the user MUST supply their phone number.
|
|
OwnerPhoneNumber string `json:"owner_phone_number"`
|
|
|
|
// DefaultRegion is the region to assume for phone
|
|
// numbers that do not have an explicit country
|
|
// calling code. This value should be the ISO
|
|
// 3166-1 alpha-2 standard region code.
|
|
// Default: "US"
|
|
DefaultRegion string `json:"default_region,omitempty"`
|
|
}
|
|
|
|
// FileImport imports data from the input file.
|
|
func (imp *FileImporter) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
|
|
dsOpt := *params.DataSourceOptions.(*Options)
|
|
|
|
// we need the phone number of the phone that originated this file; if no phone number
|
|
// was given to the DS options, we try the timeline repo owner's phone number
|
|
if dsOpt.OwnerPhoneNumber == "" {
|
|
if repoOwner, ok := ctx.Value(timeline.RepoOwnerCtxKey).(timeline.Entity); ok {
|
|
if ownerPhone, ok := repoOwner.AttributeValue(timeline.AttributePhoneNumber).(string); ok {
|
|
dsOpt.OwnerPhoneNumber = ownerPhone
|
|
}
|
|
}
|
|
}
|
|
if dsOpt.OwnerPhoneNumber == "" {
|
|
return errors.New("originating phone number cannot be empty")
|
|
}
|
|
|
|
// standardize phone number, and ensure it is marked as identity
|
|
standardizedPhoneNum, err := timeline.NormalizePhoneNumber(ctx, dsOpt.OwnerPhoneNumber, dsOpt.DefaultRegion)
|
|
if err != nil {
|
|
return fmt.Errorf("standardizing owner's phone number '%s': %w", dsOpt.OwnerPhoneNumber, err)
|
|
}
|
|
dsOpt.OwnerPhoneNumber = standardizedPhoneNum
|
|
|
|
xmlFile, err := openFile(ctx, dirEntry)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer xmlFile.Close()
|
|
|
|
// can't decode a directory
|
|
info, err := xmlFile.Stat()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if info.IsDir() {
|
|
return nil
|
|
}
|
|
|
|
// load prior checkpoint, if set
|
|
var line, checkpoint int
|
|
if params.Checkpoint != nil {
|
|
err := json.Unmarshal(params.Checkpoint, &checkpoint)
|
|
if err != nil {
|
|
return fmt.Errorf("decoding checkpoint: %w", err)
|
|
}
|
|
}
|
|
|
|
// processing messages concurrently is not faster, based on my testing
|
|
|
|
dec := xml.NewDecoder(xmlFile)
|
|
dec.CharsetReader = charset.NewReaderLabel // handle non-UTF-8 encodings
|
|
for {
|
|
if err := ctx.Err(); err != nil {
|
|
return err
|
|
}
|
|
|
|
tkn, err := dec.Token()
|
|
if errors.Is(err, io.EOF) {
|
|
break
|
|
}
|
|
if err != nil {
|
|
return fmt.Errorf("decoding next XML token: %w", err)
|
|
}
|
|
|
|
if startElem, ok := tkn.(xml.StartElement); ok {
|
|
if startElem.Name.Local != "sms" && startElem.Name.Local != "mms" {
|
|
continue
|
|
}
|
|
|
|
// fast-forward to checkpoint if set and we haven't reached it already
|
|
if checkpoint > 0 && line <= checkpoint {
|
|
line++
|
|
continue
|
|
}
|
|
|
|
switch startElem.Name.Local {
|
|
case "sms":
|
|
var sms SMS
|
|
if err := dec.DecodeElement(&sms, &startElem); err != nil {
|
|
return fmt.Errorf("decoding XML element as SMS: %w", err)
|
|
}
|
|
imp.processSMS(line, sms, params, dsOpt)
|
|
case "mms":
|
|
var mms MMS
|
|
if err := dec.DecodeElement(&mms, &startElem); err != nil {
|
|
return fmt.Errorf("decoding XML element as MMS: %w", err)
|
|
}
|
|
imp.processMMS(ctx, line, mms, params, dsOpt)
|
|
}
|
|
|
|
line++
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
func (imp *FileImporter) processSMS(line int, sms SMS, opt timeline.ImportParams, dsOpt Options) {
|
|
if !sms.within(opt.Timeframe) {
|
|
return
|
|
}
|
|
|
|
sender, receiver := sms.people(dsOpt)
|
|
|
|
ig := &timeline.Graph{
|
|
Item: &timeline.Item{
|
|
Classification: timeline.ClassMessage,
|
|
Timestamp: time.UnixMilli(sms.Date).UTC(), // these unix timestamps represent the actual UTC date, not local time
|
|
Owner: sender,
|
|
Content: timeline.ItemData{
|
|
MediaType: "text/plain",
|
|
Data: timeline.StringData(strings.TrimSpace(sms.Body)),
|
|
},
|
|
Metadata: sms.metadata(),
|
|
},
|
|
Checkpoint: line,
|
|
}
|
|
|
|
ig.ToEntity(timeline.RelSent, &receiver)
|
|
|
|
opt.Pipeline <- ig
|
|
}
|
|
|
|
func (imp *FileImporter) processMMS(ctx context.Context, line int, mms MMS, opt timeline.ImportParams, dsOpt Options) {
|
|
if !mms.within(opt.Timeframe) {
|
|
return
|
|
}
|
|
|
|
sender, recipients := mms.people(ctx, dsOpt)
|
|
|
|
// the ordering of the parts is not guaranteed, and I've seen them
|
|
// switched around on different exports; I think it makes sense to
|
|
// prefer the part with text to be the "main" part as the root of
|
|
// the graph, with media being attachments, or kind of secondary;
|
|
// so move the text part to be first to have that guarantee.
|
|
mms.Parts.putTextPartFirst()
|
|
|
|
// TODO: what if the text part is empty? it results in a basically empty item,
|
|
// with the media being attachments. Should the first non-empty part be used
|
|
// as the main item instead?
|
|
|
|
var ig *timeline.Graph
|
|
for _, part := range mms.Parts.Part {
|
|
if part.Seq < 0 {
|
|
continue
|
|
}
|
|
|
|
// most MMS texts have useless rubbish filenames; ignore them since they waste space in the DB
|
|
filename := part.Filename
|
|
if _, ok := junkFilenames[filename]; ok {
|
|
filename = ""
|
|
}
|
|
|
|
node := &timeline.Item{
|
|
Classification: timeline.ClassMessage,
|
|
Timestamp: time.UnixMilli(mms.Date),
|
|
Owner: sender,
|
|
Content: timeline.ItemData{
|
|
MediaType: part.ContentType,
|
|
Filename: filename,
|
|
Data: part.data(),
|
|
},
|
|
Metadata: mms.metadata(),
|
|
}
|
|
|
|
if ig == nil {
|
|
ig = &timeline.Graph{Item: node, Checkpoint: line}
|
|
} else {
|
|
// TODO: this does not add a "sent" relation for the attachments,
|
|
// we'd have to traverse up to the root of the graph (usually the text
|
|
// node, if there is one) and then follow its "sent" edge to know
|
|
// who the attachment was sent to... smaller DB I guess, is that OK though?
|
|
ig.ToItem(timeline.RelAttachment, node)
|
|
}
|
|
}
|
|
|
|
// some MMS are empty (or only have Seq=-1); no content means nil ItemGraph
|
|
if ig == nil {
|
|
return
|
|
}
|
|
|
|
// add relations to make sure other participants in a group text
|
|
// are recorded; necessary if more than two participants
|
|
for i := range recipients {
|
|
ig.ToEntity(timeline.RelSent, &recipients[i])
|
|
}
|
|
|
|
opt.Pipeline <- ig
|
|
}
|
|
|
|
// TODO: update godoc etc...
|
|
// openFile opens the XML file at filename. However, as the Pro version
|
|
// of SMS Backup & Restore can compress them as .zip files, we also
|
|
// support that if the filename is a zip file. (The filename in the
|
|
// archive must be the same as the input filename without the .zip
|
|
// extension.)
|
|
func openFile(_ context.Context, dirEntry timeline.DirEntry) (fs.File, error) {
|
|
baseFilename := filepath.Base(dirEntry.Name())
|
|
|
|
// the pro version of the app can compress the .xml file into a .zip file
|
|
baseFilename = strings.TrimSuffix(baseFilename, ".zip")
|
|
|
|
return dirEntry.FS.Open(baseFilename)
|
|
}
|
|
|
|
// These filenames give us no information and waste space in the DB.
|
|
// And yes I have seen all of these myself.
|
|
var junkFilenames = map[string]struct{}{
|
|
null: {},
|
|
"0": {},
|
|
"text.000000.txt": {},
|
|
"text.000001.txt": {},
|
|
"text.000002.txt": {},
|
|
"text000001.txt": {},
|
|
"text000002.txt": {},
|
|
"text000003.txt": {},
|
|
"text.txt": {},
|
|
"text_0.txt": {},
|
|
"text_1.txt": {},
|
|
"text_2.txt": {},
|
|
"image000000.jpg": {},
|
|
}
|
|
|
|
// From https://synctech.com.au/sms-backup-restore/fields-in-xml-backup-files/ (ca. May 2022)
|
|
const (
|
|
unread = 0
|
|
read = 1
|
|
|
|
smsTypeReceived = 1
|
|
smsTypeSent = 2
|
|
smsTypeDraft = 3
|
|
smsTypeOutbox = 4
|
|
smsTypeFailed = 5
|
|
smsTypeQueued = 6
|
|
|
|
smsStatusNone = -1
|
|
smsStatusComplete = 0
|
|
smsStatusPending = 32
|
|
smsStatusFailed = 64
|
|
|
|
mmsMsgBoxReceived = 1
|
|
mmsMsgBoxSent = 2
|
|
mmsMsgBoxDraft = 3
|
|
mmsMsgBoxOutbox = 4
|
|
|
|
mmsAddrTypeBCC = 129
|
|
mmsAddrTypeCC = 130
|
|
mmsAddrTypeFrom = 137
|
|
mmsAddrTypeTo = 151
|
|
)
|