1
0
Fork 0
timelinize/datasources/smsbackuprestore/smsbackuprestore.go
2025-10-31 13:34:27 -06:00

371 lines
11 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Package smsbackuprestore implements a data source for the Android SMS Backup & Restore app by SyncTech:
// https://synctech.com.au/sms-backup-restore/
package smsbackuprestore
import (
"context"
"encoding/json"
"encoding/xml"
"errors"
"fmt"
"io"
"io/fs"
"path"
"path/filepath"
"strings"
"time"
"github.com/timelinize/timelinize/timeline"
"go.uber.org/zap"
"golang.org/x/net/html/charset"
)
// init registers the SMS Backup & Restore data source with the timeline
// package. A registration failure is logged at fatal level.
func init() {
	ds := timeline.DataSource{
		Name:            "sms_backup_restore",
		Title:           "SMS Backup & Restore",
		Icon:            "sms_backup_restore.png",
		NewOptions:      func() any { return &Options{} },
		NewFileImporter: func() timeline.FileImporter { return &FileImporter{} },
	}
	if err := timeline.RegisterDataSource(ds); err != nil {
		timeline.Log.Fatal("registering data source", zap.Error(err))
	}
}
// FileImporter can import the data from an SMS Backup & Restore file.
// It carries no state of its own; it is the type this package registers
// as its file importer.
type FileImporter struct{}
// Recognize reports whether the given entry looks like an SMS Backup &
// Restore export. Directories and files without a .xml or .zip extension
// are never matched. Otherwise the file is read as XML up to its first
// start element, and full confidence is returned if that element is
// <smses>. Empty, truncated, or syntactically invalid files are simply
// not matched; any other decoding failure is returned as an error.
func (FileImporter) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
	var noMatch timeline.Recognition

	// directories are never a match
	if dirEntry.IsDir() {
		return noMatch, nil
	}

	// only these file extensions are supported
	if ext := strings.ToLower(path.Ext(dirEntry.Name())); ext != ".xml" && ext != ".zip" {
		return noMatch, nil
	}

	file, err := dirEntry.Open(".")
	if err != nil {
		return noMatch, fmt.Errorf("opening file: %w", err)
	}
	defer file.Close()

	dec := xml.NewDecoder(file)
	dec.CharsetReader = charset.NewReaderLabel // handle non-UTF-8 encodings

	var syntaxErr *xml.SyntaxError
	for {
		// NOTE: I've seen JSON files successfully get a first token from the XML decoder
		tkn, err := dec.Token()
		switch {
		case errors.Is(err, io.EOF), errors.Is(err, io.ErrUnexpectedEOF):
			// ignore short or empty files
			return noMatch, nil
		case errors.As(err, &syntaxErr):
			// invalid XML file
			return noMatch, nil
		case err != nil:
			// other errors indicate we're unsure whether we can recognize this
			return noMatch, fmt.Errorf("parsing XML token: %w", err)
		}

		startElem, ok := tkn.(xml.StartElement)
		if !ok {
			// not a start element yet; keep reading tokens
			continue
		}

		// the first start element decides the outcome either way
		if startElem.Name.Local == "smses" {
			// has the start of the expected XML structure!
			return timeline.Recognition{Confidence: 1}, nil
		}
		return noMatch, nil
	}
}
// Options contains the data source-specific options for importing
// SMS Backup & Restore files.
type Options struct {
// The phone number from which this export file originated.
// SMS Backup & Restore does not provide any identifying
// information of the recipient of these messages AT ALL,
// so the importer needs the owner's phone number. If this
// is left empty, FileImport falls back to the phone number
// of the timeline repo owner, and fails if that is not
// available either.
OwnerPhoneNumber string `json:"owner_phone_number"`
// DefaultRegion is the region to assume for phone
// numbers that do not have an explicit country
// calling code. This value should be the ISO
// 3166-1 alpha-2 standard region code. It is passed
// to timeline.NormalizePhoneNumber when the owner's
// phone number is standardized.
// Default: "US"
DefaultRegion string `json:"default_region,omitempty"`
}
// FileImport imports the SMS and MMS messages from the input file.
//
// The owner's phone number is taken from the data source options or, if
// not set there, from the timeline repo owner found in ctx. The import
// fails if neither yields a number or if the number cannot be normalized.
//
// Messages are decoded one at a time from the XML stream. If
// params.Checkpoint holds a positive index, the <sms>/<mms> elements up
// to and including that index are skipped before processing resumes.
// An error is returned if the options have an unexpected type, the file
// cannot be opened or decoded, or ctx is canceled.
func (imp *FileImporter) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
	// work on a copy of the options so the normalization below does not
	// modify the caller's value; missing options are treated as defaults
	// rather than panicking on an unchecked type assertion
	var dsOpt Options
	switch opt := params.DataSourceOptions.(type) {
	case nil:
		// no options supplied; use the zero value
	case *Options:
		if opt != nil {
			dsOpt = *opt
		}
	default:
		return fmt.Errorf("unexpected data source options type %T", opt)
	}

	// we need the phone number of the phone that originated this file; if no phone number
	// was given to the DS options, we try the timeline repo owner's phone number
	if dsOpt.OwnerPhoneNumber == "" {
		if repoOwner, ok := ctx.Value(timeline.RepoOwnerCtxKey).(timeline.Entity); ok {
			if ownerPhone, ok := repoOwner.AttributeValue(timeline.AttributePhoneNumber).(string); ok {
				dsOpt.OwnerPhoneNumber = ownerPhone
			}
		}
	}
	if dsOpt.OwnerPhoneNumber == "" {
		return errors.New("originating phone number cannot be empty")
	}

	// standardize phone number, and ensure it is marked as identity
	standardizedPhoneNum, err := timeline.NormalizePhoneNumber(ctx, dsOpt.OwnerPhoneNumber, dsOpt.DefaultRegion)
	if err != nil {
		return fmt.Errorf("standardizing owner's phone number '%s': %w", dsOpt.OwnerPhoneNumber, err)
	}
	dsOpt.OwnerPhoneNumber = standardizedPhoneNum

	xmlFile, err := openFile(ctx, dirEntry)
	if err != nil {
		return err
	}
	defer xmlFile.Close()

	// can't decode a directory
	info, err := xmlFile.Stat()
	if err != nil {
		return err
	}
	if info.IsDir() {
		return nil
	}

	// load prior checkpoint, if set; line counts the <sms>/<mms> elements seen so far
	var line, checkpoint int
	if params.Checkpoint != nil {
		err := json.Unmarshal(params.Checkpoint, &checkpoint)
		if err != nil {
			return fmt.Errorf("decoding checkpoint: %w", err)
		}
	}

	// processing messages concurrently is not faster, based on my testing
	dec := xml.NewDecoder(xmlFile)
	dec.CharsetReader = charset.NewReaderLabel // handle non-UTF-8 encodings

	for {
		if err := ctx.Err(); err != nil {
			return err
		}

		tkn, err := dec.Token()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			return fmt.Errorf("decoding next XML token: %w", err)
		}

		if startElem, ok := tkn.(xml.StartElement); ok {
			// only message elements are of interest; everything else is passed over
			if startElem.Name.Local != "sms" && startElem.Name.Local != "mms" {
				continue
			}

			// fast-forward to checkpoint if set and we haven't reached it already
			if checkpoint > 0 && line <= checkpoint {
				line++
				continue
			}

			switch startElem.Name.Local {
			case "sms":
				var sms SMS
				if err := dec.DecodeElement(&sms, &startElem); err != nil {
					return fmt.Errorf("decoding XML element as SMS: %w", err)
				}
				imp.processSMS(line, sms, params, dsOpt)
			case "mms":
				var mms MMS
				if err := dec.DecodeElement(&mms, &startElem); err != nil {
					return fmt.Errorf("decoding XML element as MMS: %w", err)
				}
				imp.processMMS(ctx, line, mms, params, dsOpt)
			}

			line++
		}
	}

	return nil
}
// processSMS turns a single SMS into a message item owned by its sender,
// relates it to the receiver with timeline.RelSent, and sends the
// resulting graph down the pipeline with line as its checkpoint.
// Messages outside the configured timeframe are dropped.
func (imp *FileImporter) processSMS(line int, sms SMS, opt timeline.ImportParams, dsOpt Options) {
	if !sms.within(opt.Timeframe) {
		return
	}

	sender, receiver := sms.people(dsOpt)

	item := &timeline.Item{
		Classification: timeline.ClassMessage,
		// these unix timestamps represent the actual UTC date, not local time
		Timestamp: time.UnixMilli(sms.Date).UTC(),
		Owner:     sender,
		Content: timeline.ItemData{
			MediaType: "text/plain",
			Data:      timeline.StringData(strings.TrimSpace(sms.Body)),
		},
		Metadata: sms.metadata(),
	}

	graph := &timeline.Graph{Item: item, Checkpoint: line}
	graph.ToEntity(timeline.RelSent, &receiver)

	opt.Pipeline <- graph
}
// processMMS turns a single MMS into a graph of message items and sends
// it down the pipeline with line as its checkpoint. The first usable part
// (the text part, if there is one) becomes the root item; every other
// part is attached to it with timeline.RelAttachment. Each recipient is
// related to the root with timeline.RelSent. Messages outside the
// configured timeframe, and messages with no usable parts, are dropped.
func (imp *FileImporter) processMMS(ctx context.Context, line int, mms MMS, opt timeline.ImportParams, dsOpt Options) {
	if !mms.within(opt.Timeframe) {
		return
	}

	sender, recipients := mms.people(ctx, dsOpt)

	// the ordering of the parts is not guaranteed, and I've seen them
	// switched around on different exports; I think it makes sense to
	// prefer the part with text to be the "main" part as the root of
	// the graph, with media being attachments, or kind of secondary;
	// so move the text part to be first to have that guarantee.
	mms.Parts.putTextPartFirst()

	// these unix timestamps represent the actual UTC date, not local time
	// (same handling as for SMS); all parts share the message's timestamp
	timestamp := time.UnixMilli(mms.Date).UTC()

	// TODO: what if the text part is empty? it results in a basically empty item,
	// with the media being attachments. Should the first non-empty part be used
	// as the main item instead?
	var ig *timeline.Graph
	for _, part := range mms.Parts.Part {
		// parts with a negative sequence number are not treated as content
		if part.Seq < 0 {
			continue
		}

		// most MMS texts have useless rubbish filenames; ignore them since they waste space in the DB
		filename := part.Filename
		if _, ok := junkFilenames[filename]; ok {
			filename = ""
		}

		node := &timeline.Item{
			Classification: timeline.ClassMessage,
			Timestamp:      timestamp,
			Owner:          sender,
			Content: timeline.ItemData{
				MediaType: part.ContentType,
				Filename:  filename,
				Data:      part.data(),
			},
			Metadata: mms.metadata(),
		}

		if ig == nil {
			ig = &timeline.Graph{Item: node, Checkpoint: line}
		} else {
			// TODO: this does not add a "sent" relation for the attachments,
			// we'd have to traverse up to the root of the graph (usually the text
			// node, if there is one) and then follow its "sent" edge to know
			// who the attachment was sent to... smaller DB I guess, is that OK though?
			ig.ToItem(timeline.RelAttachment, node)
		}
	}

	// some MMS are empty (or only have Seq=-1); no content means nil ItemGraph
	if ig == nil {
		return
	}

	// add relations to make sure other participants in a group text
	// are recorded; necessary if more than two participants
	for i := range recipients {
		ig.ToEntity(timeline.RelSent, &recipients[i])
	}

	opt.Pipeline <- ig
}
// TODO: update godoc etc...

// openFile opens the XML export referred to by dirEntry. As the Pro
// version of SMS Backup & Restore can compress exports as .zip files,
// a trailing ".zip" is trimmed from the entry's base name and the
// resulting name is opened from the entry's file system. (The filename
// in the archive must be the same as the input filename without the
// .zip extension.)
func openFile(_ context.Context, dirEntry timeline.DirEntry) (fs.File, error) {
	// the pro version of the app can compress the .xml file into a .zip file
	name := strings.TrimSuffix(filepath.Base(dirEntry.Name()), ".zip")
	return dirEntry.FS.Open(name)
}
// junkFilenames is the set of MMS part filenames that give us no
// information and waste space in the DB; a part whose filename is in
// this set is stored without a filename.
// And yes I have seen all of these myself.
var junkFilenames = map[string]struct{}{
	"null":            {}, // the literal string "null", not a missing value
	"0":               {},
	"text.000000.txt": {},
	"text.000001.txt": {},
	"text.000002.txt": {},
	"text000001.txt":  {},
	"text000002.txt":  {},
	"text000003.txt":  {},
	"text.txt":        {},
	"text_0.txt":      {},
	"text_1.txt":      {},
	"text_2.txt":      {},
	"image000000.jpg": {},
}
// The values below are from
// https://synctech.com.au/sms-backup-restore/fields-in-xml-backup-files/ (ca. May 2022)

// Read state of a message.
const (
	unread = 0
	read   = 1
)

// SMS message types.
const (
	smsTypeReceived = 1
	smsTypeSent     = 2
	smsTypeDraft    = 3
	smsTypeOutbox   = 4
	smsTypeFailed   = 5
	smsTypeQueued   = 6
)

// SMS status codes.
const (
	smsStatusNone     = -1
	smsStatusComplete = 0
	smsStatusPending  = 32
	smsStatusFailed   = 64
)

// MMS message boxes.
const (
	mmsMsgBoxReceived = 1
	mmsMsgBoxSent     = 2
	mmsMsgBoxDraft    = 3
	mmsMsgBoxOutbox   = 4
)

// MMS address types.
const (
	mmsAddrTypeBCC  = 129
	mmsAddrTypeCC   = 130
	mmsAddrTypeFrom = 137
	mmsAddrTypeTo   = 151
)