1
0
Fork 0
timelinize/datasources/whatsapp/splitter.go
TherionAcribus 6bd8471a93
whatsapp: Manually parse message lines; support more inputs (#181)
* WhatsApp > change import rules

* better confidence if whatsapp in the file name

* WhatsApp > Inline helper + remove helpers.go

* Replace regex with manual parser

* WhatsApp: tests for splitter

* Datasource > Whatsapp > remove magic numbers in splitter

* Datasource > Whatsapp > satisfy staticcheck in splitter + remove unused LRO helper

* Datasource > Whatsapp > fix unconvert warnings in splitter tests + magic numbers
2026-01-20 10:58:54 -07:00

210 lines
4.7 KiB
Go

package whatsapp
import (
"bufio"
"bytes"
"strconv"
)
// lro is the UTF-8 encoding (E2 80 8E) of U+200E, the LEFT-TO-RIGHT MARK, which
// may precede a message header. Despite the identifier, this is the "LRM" code
// point rather than U+202D (LRO).
var lro = []byte("\u200E")
// Sizes, bounds, and offsets used by the chat splitter and the header parser.
const (
	// scanOffset is the index at which chatSplit starts searching for the next
	// message start, so that a header at the very beginning of the buffer is
	// not matched.
	scanOffset = 4

	// splitTwo is both the limit handed to bytes.SplitN and the number of
	// parts a successful split must produce.
	splitTwo = 2

	// Exact byte lengths accepted for the textual date and time fields.
	dateLen    = 10 // YYYY?MM?DD or DD?MM?YYYY
	timeLenHM  = 5  // HH:MM
	timeLenHMS = 8  // HH:MM:SS

	// Inclusive ranges the numeric components of a date must fall within.
	minYear  = 1
	minMonth = 1
	maxMonth = 12
	minDay   = 1
	maxDay   = 31
)
// messageHeader is the result of parsing the leading "date, time, name" part
// of a single chat message.
type messageHeader struct {
	// HeaderLen is the number of bytes the header occupies in the parsed
	// input, counting the optional lro prefix and the ": " that follows the name.
	HeaderLen int
	// HasLRO reports whether the input began with the lro byte sequence.
	HasLRO bool
	// Date is the date text exactly as it appeared in the input.
	Date string
	// Time is the time text exactly as it appeared in the input.
	Time string
	// Name is the text found between the timestamp part and the first ": ".
	Name string
}
// chatSplit has the signature of a bufio.SplitFunc and cuts a chat export into
// one token per message. A token runs from the start of data up to, but not
// including, the next position at which a message header is recognized.
//
// When no further header is found and atEOF is false, it asks for more data by
// returning (0, nil, nil). When atEOF is true, the remaining bytes are returned
// as the last token together with bufio.ErrFinalToken. Empty input at EOF
// yields no token at all, so scanning an empty source does not produce a
// spurious empty message.
func chatSplit(data []byte, atEOF bool) (advance int, token []byte, err error) {
	// Nothing buffered and nothing more to read: finish without a token.
	if atEOF && len(data) == 0 {
		return 0, nil, nil
	}

	// Start a few bytes in to avoid matching a message start at the very
	// beginning of the buffer (that one belongs to the token being emitted).
	next := findMessageStart(data, scanOffset)
	if next >= 0 {
		return next, data[:next], nil
	}

	if atEOF {
		// We have the last message; return it and stop scanning.
		return 0, data, bufio.ErrFinalToken
	}

	// We need more data to find a whole message.
	return 0, nil, nil
}
// findMessageStart scans data from offset onward and returns the index of the
// first position where a message header parses successfully, or -1 when there
// is none. Only bytes that can open a header ('[', an ASCII digit, or a
// complete lro sequence) are handed to the header parser.
func findMessageStart(data []byte, offset int) int {
	for pos := offset; pos < len(data); pos++ {
		switch ch := data[pos]; {
		case ch == lro[0]:
			// The first lro byte alone is not enough; require the full sequence.
			if !bytes.HasPrefix(data[pos:], lro) {
				continue
			}
		case ch == '[' || ('0' <= ch && ch <= '9'):
			// Plausible first byte of a bracketed or bare date.
		default:
			continue
		}
		if isMessageStart(data[pos:]) {
			return pos
		}
	}
	return -1
}
// isMessageStart reports whether b begins with a message header that
// parseMessageHeader accepts; the parsed header itself is discarded.
func isMessageStart(b []byte) bool {
	if _, matched := parseMessageHeader(b); !matched {
		return false
	}
	return true
}
// parseMessageHeader tries to read a message header from the beginning of b.
// An optional leading lro sequence is skipped first. Two layouts are accepted:
//
//	[YYYY/MM/DD, HH:MM(:SS)] Name: message   (standard export)
//	DD/MM/YYYY, HH:MM(:SS) - Name: message   (mobile export variant)
//
// Either layout may carry any date form isDate accepts. On success it returns
// the populated header and true; otherwise the zero messageHeader and false.
// HeaderLen counts the lro prefix (if present) through the ": " after the name.
// The search for ": " covers all remaining bytes of b; it is not limited to
// the current line.
func parseMessageHeader(b []byte) (messageHeader, bool) {
	var none messageHeader
	totalLen := len(b)

	// Strip the optional mark, remembering how many bytes it took.
	prefixLen := 0
	hasLRO := bytes.HasPrefix(b, lro)
	if hasLRO {
		prefixLen = len(lro)
		b = b[prefixLen:]
	}
	if len(b) < dateLen {
		return none, false
	}

	var (
		date, clock []byte
		nameFrom    int // index in b of the first byte of the name
	)
	if b[0] == '[' {
		// Standard export: date and time sit between the brackets.
		closing := bytes.IndexByte(b, ']')
		if closing < 0 {
			return none, false
		}
		fields := bytes.SplitN(b[1:closing], []byte(", "), splitTwo)
		if len(fields) != splitTwo || !isDate(fields[0]) || !isTime(fields[1]) {
			return none, false
		}
		date, clock = fields[0], fields[1]

		// The closing bracket must be followed by a space and at least one more byte.
		nameFrom = closing + len("] ")
		if nameFrom >= len(b) || b[closing+1] != ' ' {
			return none, false
		}
	} else {
		// Mobile export variant: date, then time, then " - " before the name.
		dateAndRest := bytes.SplitN(b, []byte(", "), splitTwo)
		if len(dateAndRest) != splitTwo || !isDate(dateAndRest[0]) {
			return none, false
		}
		timeAndRest := bytes.SplitN(dateAndRest[1], []byte(" - "), splitTwo)
		if len(timeAndRest) != splitTwo || !isTime(timeAndRest[0]) {
			return none, false
		}
		date, clock = dateAndRest[0], timeAndRest[0]
		nameFrom = len(date) + len(", ") + len(clock) + len(" - ")
	}

	// The name must be non-empty and terminated by ": ".
	nameLen := bytes.Index(b[nameFrom:], []byte(": "))
	if nameLen <= 0 {
		return none, false
	}
	nameTo := nameFrom + nameLen

	headerLen := prefixLen + nameTo + len(": ")
	// sanity
	if headerLen > totalLen {
		return none, false
	}

	return messageHeader{
		HeaderLen: headerLen,
		HasLRO:    hasLRO,
		Date:      string(date),
		Time:      string(clock),
		Name:      string(b[nameFrom:nameTo]),
	}, true
}
// isDate reports whether b is exactly dateLen bytes laid out as YYYY?MM?DD or
// DD?MM?YYYY, where ? is '-', '/' or '.' and both separators are the same
// character. The year must be at least minYear, the month within
// minMonth..maxMonth and the day within minDay..maxDay; the day is not checked
// against the length of the particular month.
func isDate(b []byte) bool {
	if len(b) != dateLen {
		return false
	}

	isSep := func(c byte) bool { return c == '-' || c == '/' || c == '.' }

	// Decide the layout from where the first separator sits, then carve out
	// the three numeric fields.
	var yearPart, monthPart, dayPart []byte
	switch {
	case isSep(b[4]): // YYYY?MM?DD
		if b[7] != b[4] {
			return false
		}
		yearPart, monthPart, dayPart = b[0:4], b[5:7], b[8:10]
	case isSep(b[2]): // DD?MM?YYYY
		if b[5] != b[2] {
			return false
		}
		dayPart, monthPart, yearPart = b[0:2], b[3:5], b[6:10]
	default:
		return false
	}

	for _, field := range [][]byte{yearPart, monthPart, dayPart} {
		for _, c := range field {
			if !isDigit(c) {
				return false
			}
		}
	}

	// Every field is all ASCII digits at this point, so Atoi cannot fail.
	year, _ := strconv.Atoi(string(yearPart))
	month, _ := strconv.Atoi(string(monthPart))
	day, _ := strconv.Atoi(string(dayPart))

	if year < minYear {
		return false
	}
	if month < minMonth || month > maxMonth {
		return false
	}
	return day >= minDay && day <= maxDay
}
// isTime reports whether b is a 24-hour clock time written as HH:MM
// (timeLenHM bytes) or HH:MM:SS (timeLenHMS bytes). Besides the digit and
// colon layout, the values are range-checked (hour 00-23, minute 00-59,
// second 00-59) so that impossible times such as "99:99" are rejected, in the
// same way isDate rejects impossible months and days.
func isTime(b []byte) bool {
	const (
		decimalBase = 10
		maxHour     = 23
		maxMinute   = 59
		maxSecond   = 59
	)

	if len(b) != timeLenHM && len(b) != timeLenHMS {
		return false
	}

	// twoDigits decodes a pair of ASCII digits; ok is false if either byte is
	// not a digit.
	twoDigits := func(hi, lo byte) (value int, ok bool) {
		if !isDigit(hi) || !isDigit(lo) {
			return 0, false
		}
		return int(hi-'0')*decimalBase + int(lo-'0'), true
	}

	hour, ok := twoDigits(b[0], b[1])
	if !ok || hour > maxHour || b[2] != ':' {
		return false
	}
	minute, ok := twoDigits(b[3], b[4])
	if !ok || minute > maxMinute {
		return false
	}
	if len(b) == timeLenHM {
		return true
	}

	// HH:MM:SS — the length check above guarantees indices 5..7 exist.
	if b[5] != ':' {
		return false
	}
	second, ok := twoDigits(b[6], b[7])
	return ok && second <= maxSecond
}
// isDigit reports whether c is an ASCII decimal digit ('0' through '9').
func isDigit(c byte) bool {
	// Byte subtraction wraps around, so anything below '0' becomes a large
	// value and fails the single comparison.
	return c-'0' <= '9'-'0'
}