1
0
Fork 0
timelinize/datasources/whatsapp/splitter_test.go
TherionAcribus 6bd8471a93
whatsapp: Manually parse message lines; support more inputs (#181)
* WhatsApp > change import rules

* better confidence if whatsapp in the file name

* WhatsApp > Inline helper + remove helpers.go

* Replace regex with manual parser

* WhatsApp: tests for splitter

* Datasource > Whatsapp > remove magic numbers in splitter

* Datasource > Whatsapp > satisfy staticcheck in splitter + remove unused LRO helper

* Datasource > Whatsapp > fix unconvert warnings in splitter tests + magic numbers
2026-01-20 10:58:54 -07:00

204 lines
5.4 KiB
Go

package whatsapp
import (
"bufio"
"bytes"
"testing"
)
// TestParseMessageHeader drives parseMessageHeader with a table of single
// lines. For every line it checks the accept/reject result, and for accepted
// lines it also checks the extracted date, time and sender name, the reported
// header length, and the HasLRO flag.
func TestParseMessageHeader(t *testing.T) {
	// mark is the U+200E code point that the "LRO" cases put in front of the
	// header (Unicode names it LEFT-TO-RIGHT MARK; the parsed header exposes
	// its presence through the HasLRO field).
	const mark = "\u200E"

	type headerCase struct {
		name      string // subtest name
		line      string // raw input handed to the parser
		ok        bool   // whether the line should be accepted as a header
		date      string // expected Date field
		clock     string // expected Time field
		sender    string // expected Name field
		headerLen int    // expected HeaderLen, in bytes
		lro       bool   // expected HasLRO flag
	}

	cases := []headerCase{
		// Accepted: bracketed timestamps.
		{
			name:      "bracket YYYY/MM/DD HH:MM",
			line:      "[2024/12/31, 12:34] Alice: hello",
			ok:        true,
			date:      "2024/12/31",
			clock:     "12:34",
			sender:    "Alice",
			headerLen: len("[2024/12/31, 12:34] Alice: "),
		},
		{
			name:      "bracket DD/MM/YYYY HH:MM:SS",
			line:      "[31/12/2024, 12:34:56] Bob: hey",
			ok:        true,
			date:      "31/12/2024",
			clock:     "12:34:56",
			sender:    "Bob",
			headerLen: len("[31/12/2024, 12:34:56] Bob: "),
		},
		// Accepted: timestamps followed by " - ".
		{
			name:      "dash YYYY-MM-DD HH:MM",
			line:      "2024-12-31, 12:34 - Carol: hi",
			ok:        true,
			date:      "2024-12-31",
			clock:     "12:34",
			sender:    "Carol",
			headerLen: len("2024-12-31, 12:34 - Carol: "),
		},
		{
			name:      "dash DD.MM.YYYY HH:MM:SS",
			line:      "31.12.2024, 12:34:56 - Dave: yo",
			ok:        true,
			date:      "31.12.2024",
			clock:     "12:34:56",
			sender:    "Dave",
			headerLen: len("31.12.2024, 12:34:56 - Dave: "),
		},
		// Accepted: same two shapes with a leading U+200E, whose bytes are
		// expected to be counted in the header length.
		{
			name:      "LRO bracket",
			line:      mark + "[2024/12/31, 12:34] Eve: hi",
			ok:        true,
			date:      "2024/12/31",
			clock:     "12:34",
			sender:    "Eve",
			headerLen: len(mark + "[2024/12/31, 12:34] Eve: "),
			lro:       true,
		},
		{
			name:      "LRO dash",
			line:      mark + "2024-12-31, 12:34 - Frank: hi",
			ok:        true,
			date:      "2024-12-31",
			clock:     "12:34",
			sender:    "Frank",
			headerLen: len(mark + "2024-12-31, 12:34 - Frank: "),
			lro:       true,
		},
		// Rejected lines: only the ok result is checked for these.
		{name: "invalid date format", line: "[2024_12_31, 12:34] Bad: nope"},
		{name: "invalid date length", line: "[2024/12/3, 12:34] Bad: nope"},
		{name: "invalid time format", line: "[2024/12/31, 2:34] Bad: nope"},
		{name: "missing colon separator", line: "[2024/12/31, 12:34] Alice nope"},
		{name: "no name", line: "[2024/12/31, 12:34] : msg"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got, ok := parseMessageHeader([]byte(tc.line))
			if ok != tc.ok {
				t.Fatalf("ok mismatch: got %v want %v", ok, tc.ok)
			}
			if !tc.ok {
				// Correctly rejected; there are no fields to inspect.
				return
			}

			fieldsMatch := got.Date == tc.date && got.Time == tc.clock && got.Name == tc.sender
			if !fieldsMatch {
				t.Fatalf("header mismatch: got (%s,%s,%s) want (%s,%s,%s)", got.Date, got.Time, got.Name, tc.date, tc.clock, tc.sender)
			}
			if got.HeaderLen != tc.headerLen {
				t.Fatalf("header length mismatch: got %d want %d", got.HeaderLen, tc.headerLen)
			}
			if got.HasLRO != tc.lro {
				t.Fatalf("HasLRO mismatch: got %v want %v", got.HasLRO, tc.lro)
			}
		})
	}
}
// TestChatSplitTwoMessages checks that two consecutive bracket-style messages
// are split into exactly two tokens, each keeping its trailing newline.
func TestChatSplitTwoMessages(t *testing.T) {
	const (
		first  = "[2024/12/31, 12:34] Alice: hello\n"
		second = "[2024/12/31, 12:35] Bob: hi there\n"
	)

	tokens := scanTokens(t, first+second)
	if n := len(tokens); n != 2 {
		t.Fatalf("expected 2 tokens, got %d", n)
	}
	if got := tokens[0]; got != first {
		t.Fatalf("first token mismatch: got %q", got)
	}
	if got := tokens[1]; got != second {
		t.Fatalf("second token mismatch: got %q", got)
	}
}
// TestChatSplitTwoMessagesDash checks that two consecutive dash-style messages
// ("date, time - name: text") are split into exactly two tokens, each keeping
// its trailing newline.
func TestChatSplitTwoMessagesDash(t *testing.T) {
	const (
		first  = "2024-12-31, 12:34 - Alice: hello\n"
		second = "2024-12-31, 12:35 - Bob: hi there\n"
	)

	tokens := scanTokens(t, first+second)
	if n := len(tokens); n != 2 {
		t.Fatalf("expected 2 tokens, got %d", n)
	}
	if got := tokens[0]; got != first {
		t.Fatalf("first token mismatch: got %q", got)
	}
	if got := tokens[1]; got != second {
		t.Fatalf("second token mismatch: got %q", got)
	}
}
// TestChatSplitTwoMessagesLRO checks that two bracket-style messages, each
// prefixed with U+200E, are split into exactly two tokens and that the prefix
// and trailing newline stay part of each token.
func TestChatSplitTwoMessagesLRO(t *testing.T) {
	const (
		first  = "\u200E[2024/12/31, 12:34] Alice: hello\n"
		second = "\u200E[2024/12/31, 12:35] Bob: hi there\n"
	)

	tokens := scanTokens(t, first+second)
	if n := len(tokens); n != 2 {
		t.Fatalf("expected 2 tokens, got %d", n)
	}
	if got := tokens[0]; got != first {
		t.Fatalf("first token mismatch: got %q", got)
	}
	if got := tokens[1]; got != second {
		t.Fatalf("second token mismatch: got %q", got)
	}
}
// TestChatSplitMultilineMessage checks that a line without a message header
// stays attached to the message before it: the input has three lines but must
// produce only two tokens.
func TestChatSplitMultilineMessage(t *testing.T) {
	const (
		// The second line has no header, so it belongs to Alice's message.
		first  = "[2024/12/31, 12:34] Alice: hello\ncontinued line\n"
		second = "[2024/12/31, 12:35] Bob: hi\n"
	)

	tokens := scanTokens(t, first+second)
	if n := len(tokens); n != 2 {
		t.Fatalf("expected 2 tokens, got %d", n)
	}
	if got := tokens[0]; got != first {
		t.Fatalf("first token should contain multiline content, got %q", got)
	}
	if got := tokens[1]; got != second {
		t.Fatalf("second token mismatch: got %q", got)
	}
}
// TestChatSplitSingleMessage checks that an input holding exactly one message
// yields exactly one token, and that the token is the whole input, trailing
// newline included. The sibling split tests expect the final token at EOF to
// keep its newline, and this test holds the single-message case to the same
// standard rather than only checking that the token is non-empty.
func TestChatSplitSingleMessage(t *testing.T) {
	const msg = "[2024/12/31, 12:34] Alice: hello\n"

	tokens := scanTokens(t, msg)
	if len(tokens) != 1 {
		t.Fatalf("expected 1 token, got %d", len(tokens))
	}
	if tokens[0] != msg {
		t.Fatalf("token mismatch: got %q want %q", tokens[0], msg)
	}
}
// scanTokens runs input through a bufio.Scanner that uses chatSplit as its
// split function and returns every token the scanner produces, in order.
// Tokens are returned exactly as emitted, newlines included. A scanner error
// fails the calling test immediately.
func scanTokens(t *testing.T, input string) []string {
	t.Helper()

	sc := bufio.NewScanner(bytes.NewReader([]byte(input)))
	sc.Split(chatSplit)

	var out []string
	for sc.Scan() {
		// Text returns a newly allocated string, so each token stays valid
		// after the scanner reuses its internal buffer on the next Scan.
		out = append(out, sc.Text())
	}
	if err := sc.Err(); err != nil {
		t.Fatalf("scanner error: %v", err)
	}
	return out
}