Files
OpsLog/internal/adif/parser.go
T
2026-06-02 01:17:26 +02:00

178 lines
5.5 KiB
Go
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Package adif handles ADIF import and export (ADI text format).
//
// ADI tokenisation rules (per ADIF spec):
// - Free-form text is allowed up to the first <EOH> (header end).
// - After <EOH>, records are sequences of <FIELDNAME:LENGTH[:TYPE]>VALUE
// terminated by <EOR>.
// - The LENGTH is the byte count of the VALUE that immediately follows
// the closing '>' (no separator).
// - Tag names are case-insensitive.
// - Bytes between fields (whitespace, junk) are ignored.
package adif
import (
"bufio"
"fmt"
"io"
"strconv"
"strings"
"unicode/utf8"
)
// Record is a single ADIF record, mapping field name to field value.
//
// Keys are the lowercased field names taken from the tag (the part before
// the first ':'). Values are the field's bytes as a string, either converted
// directly or passed through the caller-supplied decoder when one is given
// to the parser.
type Record map[string]string
// Parse reads an ADI stream from r and calls fn for each record that follows
// the header, i.e. each time an <EOR> tag closes a non-empty set of fields.
//
// Field values are stored as their raw bytes converted to a string; no
// character-set decoding is applied. Header text preceding <EOH> is
// discarded. If fn returns a non-nil error, parsing stops and that error is
// returned to the caller. Reaching the end of the input returns nil.
func Parse(r io.Reader, fn func(Record) error) error {
	// A nil decoder tells parseWith to keep every value's bytes as-is.
	var noDecoder func([]byte) string
	return parseWith(r, noDecoder, fn)
}
// ParseWithDecoder behaves like Parse, except that every field value is
// passed through decodeValue (raw bytes in, string out) before it is stored
// in the Record. A nil decodeValue is equivalent to calling Parse.
//
// Decoding is applied per value rather than to the whole stream because an
// ADIF field length counts bytes in the file's native encoding. Wrapping r
// in a transcoding reader would move the byte boundaries: "<QTH:7>YAOUNDÉ"
// is 7 bytes in Windows-1252 but 8 bytes once converted to UTF-8, so a
// parser reading 7 bytes from the converted stream would cut É in half.
// Reading exactly N raw bytes first and decoding afterwards avoids that.
func ParseWithDecoder(r io.Reader, decodeValue func([]byte) string, fn func(Record) error) error {
	return parseWith(r, decodeValue, fn)
}
// parseWith is the tokeniser shared by Parse and ParseWithDecoder.
//
// It scans r for <NAME:LENGTH[:TYPE]> tags, reads the LENGTH bytes that
// follow each tag, and collects the fields into a Record that is handed to
// fn when <EOR> is reached. When decodeValue is non-nil it converts each
// value's raw bytes to a string; when it is nil the bytes are used as-is and
// a value that is not valid UTF-8 is repaired with extendToRunes.
//
// Header handling: if the very first byte of the stream is '<', the stream
// is treated as headerless (as the ADIF specification prescribes) and
// records are emitted from the start. Otherwise everything up to <EOH> is
// header text and is discarded. Files that begin with '<' but still carry a
// header are handled too, because <EOH> always resets the pending record.
//
// It returns nil at end of input (an unterminated trailing tag, or trailing
// fields not closed by <EOR>, are dropped silently), the error returned by
// fn if any, or a wrapped read error when a value is shorter than its
// declared length.
func parseWith(r io.Reader, decodeValue func([]byte) string, fn func(Record) error) error {
	br := bufio.NewReaderSize(r, 64*1024)

	// Peek does not consume input. io.EOF here just means an empty stream,
	// which the main loop below turns into a clean nil return.
	head, err := br.Peek(1)
	if err != nil && err != io.EOF {
		return err
	}
	headerDone := len(head) == 1 && head[0] == '<'

	rec := Record{}
	for {
		// Seek next '<'. Bytes before it are either header text or
		// inter-field whitespace; both are discardable.
		if err := seekByte(br, '<'); err != nil {
			if err == io.EOF {
				return nil
			}
			return err
		}
		spec, err := readUntilByte(br, '>')
		if err != nil {
			if err == io.EOF {
				return nil
			}
			return fmt.Errorf("unterminated tag: %w", err)
		}

		name, length := parseSpec(spec)
		switch name {
		case "eoh":
			// Anything collected so far was header data, not a record.
			headerDone = true
			rec = Record{}
			continue
		case "eor":
			if headerDone && len(rec) > 0 {
				if err := fn(rec); err != nil {
					return err
				}
			}
			rec = Record{}
			continue
		}
		if length <= 0 {
			continue
		}

		// The value bytes are consumed regardless of header state so the
		// scanner stays aligned; they are only stored after the header.
		val, err := readFieldValue(br, length)
		if err != nil {
			return fmt.Errorf("read field %s: %w", name, err)
		}

		// Repair character-count lengths. The ADIF spec says LENGTH is a
		// byte count, but some loggers write the CHARACTER count instead,
		// which truncates UTF-8 values mid-rune. Only attempted in UTF-8
		// mode (no decoder) and only when the naive read is invalid UTF-8.
		if decodeValue == nil && !utf8.Valid(val) {
			val = extendToRunes(br, val, length)
		}

		if !headerDone || name == "" {
			continue
		}
		if decodeValue != nil {
			rec[name] = decodeValue(val)
		} else {
			rec[name] = string(val)
		}
	}
}

// readFieldValue reads exactly length bytes from br.
//
// The declared length comes straight from the input, so it is not trusted
// for allocation: values up to maxPrealloc are read into a buffer of their
// declared size, while larger ones grow only as bytes actually arrive. A
// bogus tag such as <x:9000000000000000000> therefore yields a read error
// instead of a panic or a huge allocation. Like io.ReadFull, it returns
// io.EOF when no bytes were available and io.ErrUnexpectedEOF when the
// input ended part-way through the value.
func readFieldValue(br *bufio.Reader, length int) ([]byte, error) {
	const maxPrealloc = 1 << 20

	if length <= maxPrealloc {
		val := make([]byte, length)
		if _, err := io.ReadFull(br, val); err != nil {
			return nil, err
		}
		return val, nil
	}

	val, err := io.ReadAll(io.LimitReader(br, int64(length)))
	if err != nil {
		return nil, err
	}
	switch {
	case len(val) == 0:
		return nil, io.EOF
	case len(val) < length:
		return nil, io.ErrUnexpectedEOF
	}
	return val, nil
}
// extendToRunes repairs a value whose declared length was a character count
// instead of a byte count.
//
// have holds the bytes read so far (the caller passes the first wantRunes
// bytes, which were not valid UTF-8 because a multibyte rune was cut). Bytes
// are pulled from br one at a time and appended until one of these holds:
//
//   - have is valid UTF-8 and contains at least wantRunes runes;
//   - the next byte is '<', which starts the following tag and is pushed
//     back so the caller still sees it;
//   - br reports an error (including EOF);
//   - the extra-byte budget (maxExtra*wantRunes + maxExtra) is used up, so a
//     genuinely corrupt value cannot run away.
//
// The possibly extended slice is returned.
func extendToRunes(br *bufio.Reader, have []byte, wantRunes int) []byte {
	const maxExtra = 8

	for budget := maxExtra*wantRunes + maxExtra; budget > 0; budget-- {
		// Both conditions are needed. A trailing orphan lead byte counts as
		// one rune for utf8.RuneCount, so the count alone would stop one
		// continuation byte early; utf8.Valid forces that byte to be read.
		complete := utf8.Valid(have) && utf8.RuneCount(have) >= wantRunes
		if complete {
			return have
		}

		next, err := br.ReadByte()
		switch {
		case err != nil:
			return have
		case next == '<':
			// Belongs to the next tag: hand it back and stop.
			_ = br.UnreadByte()
			return have
		}
		have = append(have, next)
	}
	return have
}
// parseSpec breaks the text between '<' and '>' of a tag into its field name
// and declared value length. It accepts "callsign:5", "callsign:5:S" and bare
// control tags such as "eor".
//
// name is the part before the first ':', trimmed and lowercased. length is
// the integer between the first and second ':' (surrounding spaces allowed);
// it is 0 when that part is missing, not a number, or not positive. Anything
// after the second ':' (the type indicator) is ignored.
func parseSpec(spec string) (name string, length int) {
	body := strings.TrimSpace(spec)

	colon := strings.IndexByte(body, ':')
	if colon < 0 {
		// Control tag or field without a length.
		return strings.ToLower(body), 0
	}
	name = strings.ToLower(strings.TrimSpace(body[:colon]))

	// Isolate the length: everything up to the next ':' if there is one.
	digits := body[colon+1:]
	if next := strings.IndexByte(digits, ':'); next >= 0 {
		digits = digits[:next]
	}

	n, err := strconv.Atoi(strings.TrimSpace(digits))
	if err != nil || n <= 0 {
		return name, 0
	}
	return name, n
}
// seekByte consumes bytes from br up to and including the first occurrence
// of target, discarding them. It returns nil once target has been consumed,
// or the reader's error (io.EOF at end of input) if target never appears.
func seekByte(br *bufio.Reader, target byte) error {
	for {
		// ReadSlice discards in buffer-sized steps without copying. A full
		// buffer with no target in it just means "keep looking".
		_, err := br.ReadSlice(target)
		switch err {
		case nil:
			return nil
		case bufio.ErrBufferFull:
			continue
		default:
			return err
		}
	}
}
// readUntilByte reads from br until target is found and returns the bytes
// that preceded it as a string; target itself is consumed but not included.
// If the reader fails first (io.EOF at end of input), it returns whatever was
// read up to that point together with the error.
func readUntilByte(br *bufio.Reader, target byte) (string, error) {
	var sb strings.Builder
	for {
		frag, err := br.ReadSlice(target)
		switch err {
		case nil:
			// frag ends with target; keep everything before it.
			sb.Write(frag[:len(frag)-1])
			return sb.String(), nil
		case bufio.ErrBufferFull:
			// No target within one buffer: keep the chunk and continue.
			sb.Write(frag)
		default:
			sb.Write(frag)
			return sb.String(), err
		}
	}
}