1
0
Fork 0
timelinize/datasources/telegram/telegram.go
Matt Holt 746e5d6b5c
Refactored import flow, new import UI, thumbnails stored in timeline, etc. (close #3) (#43)
* Schema revisions for new import flow and thumbnails

* WIP settings

* WIP quick schema fix

* gallery: Image search using ML embeddings

Still very rough around the edges, but basically works.

'uv' gets auto-installed, but currently requires restarting Timelinize before it can be used.

Lots of tunings and optimizations are needed. There is much room for improvement.

Still migrating from imports -> jobs, so that part of the code and schema is still a mess.

* Implement search for similar items

* Finish import/planning rewrite; it compiles and tests pass

* Fix some bugs, probably introduce other bugs

* WIP new import planning page

* Fix Google Photos and Twitter recognition

* Finish most of import page UI; start button still WIP

* WIP: Start Import button

* Fixes to jobs, thumbnail job, import job, etc.

* Implement proper checkpointing support; jobs fixes
2024-12-06 11:03:29 -07:00

386 lines
9.1 KiB
Go

/*
Timelinize
Copyright (c) 2013 Matthew Holt
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as published
by the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Affero General Public License for more details.
You should have received a copy of the GNU Affero General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
*/
// Package telegram implements a data source for Telegram data exports.
package telegram
import (
"context"
"encoding/json"
"errors"
"fmt"
"io"
"path"
"strconv"
"strings"
"time"
"github.com/timelinize/timelinize/timeline"
"go.uber.org/zap"
)
// init registers the Telegram data source with the timeline package.
// Registration failure is fatal.
func init() {
	source := timeline.DataSource{
		Name:            "telegram",
		Title:           "Telegram",
		Icon:            "telegram.svg",
		NewOptions:      func() any { return new(Options) },
		NewFileImporter: func() timeline.FileImporter { return new(FileImporter) },
	}
	if err := timeline.RegisterDataSource(source); err != nil {
		timeline.Log.Fatal("registering data source", zap.Error(err))
	}
}
// FileImporter imports Telegram data exports. It is the file importer
// registered for the "telegram" data source and carries no state of its own.
type FileImporter struct {
}
// Recognize reports whether dirEntry looks like a Telegram export.
//
// The entry is recognized with full confidence when it contains a
// result.json that decodes into the export header with a non-empty "about"
// text and a non-zero owner user ID. Anything else, including a missing or
// undecodable file, yields a zero Recognition. The returned error is always nil.
func (FileImporter) Recognize(_ context.Context, dirEntry timeline.DirEntry, _ timeline.RecognizeParams) (timeline.Recognition, error) {
	var unrecognized timeline.Recognition

	export, openErr := dirEntry.FS.Open("result.json")
	if openErr != nil {
		return unrecognized, nil
	}
	defer export.Close()

	var header desktopExportHeader
	if decodeErr := json.NewDecoder(export).Decode(&header); decodeErr != nil {
		return unrecognized, nil
	}

	looksLikeExport := header.About != "" && header.PersonalInformation.UserID != 0
	if !looksLikeExport {
		return unrecognized, nil
	}
	return timeline.Recognition{Confidence: 1}, nil
}
// jsonToken is a JSON token together with the position ("context") it was
// read in, such as an object key, an object value or an array element.
type jsonToken struct {
	json.Token
	context string
}

// String renders the token for use as a path segment. Strings, numbers and
// booleans are rendered as text; delimiters and null render as the empty
// string. Any other token type falls back to its default formatting.
func (jt jsonToken) String() string {
	if jt.Token == nil {
		return ""
	}
	switch tok := jt.Token.(type) {
	case json.Delim:
		return ""
	case bool:
		return strconv.FormatBool(tok)
	case float64:
		return strconv.FormatFloat(tok, 'f', -1, 64)
	case int:
		return strconv.Itoa(tok)
	case string:
		return tok
	default:
		return fmt.Sprintf("%v", jt.Token)
	}
}
// jsonPath is the stack of tokens enclosing the current position in a JSON
// document, outermost first.
type jsonPath []jsonToken

// String joins the textual form of the tokens with dots. The first token is
// always included, even when it renders as the empty string; later tokens
// that render empty (delimiters, null) are left out. An empty path yields "".
func (jp jsonPath) String() string {
	var b strings.Builder
	for i, tok := range jp {
		segment := tok.String()
		if i == 0 {
			b.WriteString(segment)
			continue
		}
		if segment == "" {
			continue
		}
		b.WriteByte('.')
		b.WriteString(segment)
	}
	return b.String()
}
// FileImport imports the owner, the contacts and the personal-chat messages
// from the result.json file at the root of dirEntry and sends them down
// params.Pipeline.
//
// The file is walked token by token with a streaming decoder. While walking,
// the stack of enclosing tokens is kept in a jsonPath, whose string form
// (for example ".chats.list.messages") selects the values of interest; those
// values are then decoded directly from the stream:
//
//   - .personal_information describes the owner of the export
//   - .contacts.list yields one entity per contact
//   - .chats.list.type, .chats.list.id and .chats.list.messages describe
//     each chat and yield its messages
//
// Messages can only be attributed once the owner is known. If chats appear
// in the file before the personal information, they are skipped and the file
// is read again from the start as soon as the owner has been found.
//
// It returns an error if the file cannot be opened, if a value of interest
// cannot be decoded or has an unexpected JSON type, or if ctx is cancelled.
// A tokenization error during the generic walk is logged and ends the import
// without an error.
func (fimp *FileImporter) FileImport(ctx context.Context, dirEntry timeline.DirEntry, params timeline.ImportParams) error {
	const (
		jsonFilename = "result.json"
		arrayElement = "array" // position of a token directly inside a JSON array
	)

	f, err := dirEntry.FS.Open(jsonFilename)
	if err != nil {
		return err
	}
	defer func() { f.Close() }() // closure, because f is replaced if the file is read a second time

	dec := json.NewDecoder(f)

	var nesting jsonPath                               // tokens enclosing the current position
	var position string                                // where the next token sits relative to its parent
	var seenChats bool                                 // whether any chat has been encountered so far
	var contactsImported bool                          // whether .contacts.list has already been sent
	var currentChatType string                         // "type" of the chat being read
	var ownerEntity, currentChatEntity timeline.Entity // export owner and the other party of the current chat

	// valueConsumed records that the handler read a complete object value
	// straight from the decoder: the key it belonged to is popped and the
	// next token is expected to be a key (or the end of the object).
	valueConsumed := func() {
		if position == objectValue {
			nesting = nesting[:len(nesting)-1]
			position = objectKey
		}
	}

	// send puts g on the pipeline, giving up if ctx is cancelled first.
	send := func(g *timeline.Graph) error {
		select {
		case params.Pipeline <- g:
			return nil
		case <-ctx.Done():
			return ctx.Err()
		}
	}

	// handler is called before every token that is not an object key, with
	// the path of that token. It may consume the value from dec itself.
	handler := func(scope string) error {
		switch scope {
		case ".personal_information":
			// only need it once
			if len(ownerEntity.Attributes) > 0 {
				return nil
			}

			var pi personalInformation
			if err := dec.Decode(&pi); err != nil {
				return err
			}

			ownerEntity = timeline.Entity{
				Name: pi.FirstName + " " + pi.LastName,
				Attributes: []timeline.Attribute{
					{
						Name:     "telegram_id",
						Value:    strconv.FormatInt(pi.UserID, 10),
						Identity: true,
					},
					{
						Name:        timeline.AttributePhoneNumber,
						Value:       pi.PhoneNumber,
						Identifying: true,
					},
				},
			}

			if seenChats {
				// Chats that came before this point were skipped because the
				// owner was unknown. Start over with a fresh file handle AND
				// a fresh decoder so that they are visited again.
				reopened, err := dirEntry.FS.Open(jsonFilename)
				if err != nil {
					return err
				}
				f.Close() // the first pass is finished with this handle
				f = reopened
				dec = json.NewDecoder(f)
				nesting = nesting[:0]
				position = ""
				return nil
			}

			valueConsumed()

		case ".profile_pictures":
			params.Log.Debug("TODO: process profile pictures")

		case ".contacts.list":
			if contactsImported {
				return nil // second pass over the file; contacts were already sent
			}

			dec.Token() //nolint:errcheck // consume array opening '['
			for dec.More() {
				var c contact
				if err := dec.Decode(&c); err != nil {
					return err
				}

				if strings.HasPrefix(c.PhoneNumber, "001") {
					// I have no idea why there's an extra 00 here but it messes up the phone number parser
					c.PhoneNumber = strings.TrimPrefix(c.PhoneNumber, "00")
				}

				contactGraph := &timeline.Graph{
					Entity: &timeline.Entity{
						Name: c.FirstName + " " + c.LastName,
						Attributes: []timeline.Attribute{
							{
								Name:        timeline.AttributePhoneNumber,
								Value:       c.PhoneNumber,
								Identifying: true,
							},
						},
					},
				}
				if err := send(contactGraph); err != nil {
					return err
				}
			}
			dec.Token() //nolint:errcheck // consume closing ']'

			contactsImported = true
			valueConsumed()

		case ".chats.list.type",
			".chats.list.id",
			".chats.list.messages":
			seenChats = true

			if len(ownerEntity.Attributes) == 0 {
				return nil // have to get owner account info first
			}
			if position != objectValue {
				// Not at the value of the key itself: we are inside a value
				// that is being walked generically (e.g. the messages of a
				// skipped chat), so there is nothing to decode here.
				return nil
			}

			switch scope {
			case ".chats.list.type":
				next, err := dec.Token()
				if err != nil {
					return err
				}
				chatType, ok := next.(string)
				if !ok {
					return fmt.Errorf("chat type: expected a string but got %T", next)
				}
				currentChatType = chatType
				valueConsumed()
				return nil

			case ".chats.list.id":
				next, err := dec.Token()
				if err != nil {
					return err
				}
				chatID, ok := next.(float64)
				if !ok {
					return fmt.Errorf("chat id: expected a number but got %T", next)
				}
				// Start from a fresh entity so that the name learned in the
				// previous chat is not carried over into this one.
				currentChatEntity = timeline.Entity{
					Attributes: []timeline.Attribute{
						{
							Name:     "telegram_id",
							Value:    strconv.FormatFloat(chatID, 'f', -1, 64),
							Identity: true,
						},
					},
				}
				valueConsumed()
				return nil
			}

			// scope is .chats.list.messages from here on

			if currentChatType != "personal_chat" {
				// leave the array to the generic walk, which skips over it
				params.Log.Warn("chat type not yet supported", zap.String("chat_type", currentChatType))
				return nil
			}

			dec.Token() //nolint:errcheck // consume opening '['
			for dec.More() {
				var m message
				if err := dec.Decode(&m); err != nil {
					return err
				}
				m.fromID = strings.TrimPrefix(m.FromID, "user")

				// TODO: I'm not sure what other types there are, yet
				if m.Type != "message" {
					params.Log.Warn("TODO: Types other than 'message' not yet supported", zap.String("type", m.Type))
					continue
				}

				var sb strings.Builder
				for _, t := range m.TextEntities {
					sb.WriteString(t.Text)
				}

				dateTime, err := time.Parse("2006-01-02T15:04:05", m.Date)
				if err != nil {
					return fmt.Errorf("parsing message date %q: %w", m.Date, err)
				}

				// figure out who sent the message and who received it
				var from, to timeline.Entity
				if m.fromID == ownerEntity.AttributeValue("telegram_id") {
					from = ownerEntity
					to = currentChatEntity
				} else {
					currentChatEntity.Name = m.From
					from = currentChatEntity
					to = ownerEntity
				}

				it := &timeline.Item{
					// TODO: Telegram has message IDs, but they're autoincrement, and I don't trust that they're consistent across exports
					Classification: timeline.ClassMessage,
					Timestamp:      dateTime,
					Owner:          from,
					Content: timeline.ItemData{
						Data: timeline.StringData(sb.String()),
					},
				}

				ig := &timeline.Graph{Item: it}
				ig.ToEntity(timeline.RelSent, &to)

				if err := send(ig); err != nil {
					return err
				}
			}
			dec.Token() //nolint:errcheck // consume closing ']'

			valueConsumed()
		}

		return nil
	}

	for {
		if err := ctx.Err(); err != nil {
			return err
		}

		// work out where the next token sits, based on the innermost enclosing token
		position = ""
		if len(nesting) > 0 {
			above := nesting[len(nesting)-1]
			if v, ok := above.Token.(json.Delim); ok {
				switch v {
				case '{':
					position = objectKey
				case '[':
					position = arrayElement
				}
			} else if above.context == objectKey {
				position = objectValue
			}
		}

		if position != objectKey {
			// if not an object key, let handler get it
			if err := handler(nesting.String()); err != nil {
				return err
			}
		}

		next, err := dec.Token()
		if errors.Is(err, io.EOF) {
			break
		}
		if err != nil {
			params.Log.Error("decoding JSON archive object",
				zap.String("filename", path.Join(dirEntry.Filename, jsonFilename)),
				zap.Error(err))
			return nil
		}

		tkn := jsonToken{Token: next, context: position}

		if v, ok := tkn.Token.(json.Delim); ok {
			switch v {
			case '{', '[':
				nesting = append(nesting, tkn)
			case '}', ']':
				nesting = nesting[:len(nesting)-1]
				if len(nesting) > 0 {
					// if the closed container was the value of an object key, that key is done too
					priorContext := nesting[len(nesting)-1].context
					if priorContext != arrayElement && priorContext != objectValue {
						nesting = nesting[:len(nesting)-1]
					}
				}
			}
			continue
		}

		switch position {
		case objectKey:
			nesting = append(nesting, tkn) // a key: stays on the stack until its value has been read
		case objectValue:
			nesting = nesting[:len(nesting)-1] // a scalar value: its key is done
		}
	}

	return nil
}
// Options contains provider-specific options for using this data source.
// It currently declares no fields.
type Options struct {
// remember to use JSON tags when adding fields
}
// Positions a token can occupy inside a JSON object. FileImport uses them
// to track where the next token sits while walking result.json.
const (
// objectValue is the position right after an object key, where that key's value is expected.
objectValue = "object_value"
// objectKey is the position directly inside an object, where a key is expected.
objectKey = "object_key"
)