// OpsLog/internal/adif/parser.go

// Package adif handles ADIF import and export (ADI text format).
//
// ADI tokenisation rules (per ADIF spec):
//   - Free-form text is allowed up to the first <EOH> (header end).
//   - After <EOH>, records are sequences of <FIELDNAME:LENGTH[:TYPE]>VALUE
//     terminated by <EOR>.
//   - The LENGTH is the byte count of the VALUE that immediately follows
//     the closing '>' (no separator).
//   - Tag names are case-insensitive.
//   - Bytes between fields (whitespace, junk) are ignored.
package adif

import (
	"bufio"
	"fmt"
	"io"
	"strconv"
	"strings"
	"unicode/utf8"
)

// Record is a single ADIF record: a map from field name to field value.
// Keys are the lowercased field names as produced by parseSpec (ADIF tag
// names are case-insensitive). Values are the field bytes converted to a
// string, either directly or through the caller-supplied decoder.
type Record map[string]string

// Parse reads an ADI stream from r and invokes fn once for every non-empty
// record that is terminated by <EOR> after the <EOH> marker has been seen.
// Returning a non-nil error from fn stops parsing and that error is
// returned to the caller as-is.
// The header (everything before <EOH>) is silently discarded.
//
// Field values are stored as the raw value bytes converted to a string; no
// character-set decoding is applied (see ParseWithDecoder for that).
func Parse(r io.Reader, fn func(Record) error) error {
	// A nil decoder selects the raw-bytes path in parseWith.
	return parseWith(r, nil, fn)
}

// ParseWithDecoder is like Parse but applies decodeValue to each field's
// raw bytes before storing as a string. ADIF field lengths are byte
// counts in the file's native encoding, so decoding MUST happen after
// reading exactly N bytes — wrapping the reader in a decoder would shift
// byte boundaries and chop multibyte chars in half (e.g. "<QTH:7>YAOUNDÉ"
// in Windows-1252 is 7 bytes; after upfront decoding it'd be 8 bytes of
// UTF-8 and the parser would only read the first 7, splitting É).
//
// Passing a nil decodeValue is equivalent to calling Parse. When a decoder
// is supplied, the UTF-8 character-count length repair performed for the
// nil-decoder case is not attempted; the decoder receives exactly the
// declared number of bytes.
func ParseWithDecoder(r io.Reader, decodeValue func([]byte) string, fn func(Record) error) error {
	return parseWith(r, decodeValue, fn)
}

// parseWith is the shared implementation behind Parse and ParseWithDecoder.
//
// It scans r for <...> tags, ignoring every byte between them. A tag with a
// positive LENGTH is followed by exactly LENGTH value bytes, which are
// consumed whether or not the header has ended. <EOH> ends the header and
// <EOR> ends a record; fn is called for each non-empty record completed
// after <EOH>. Consequences of that design:
//   - input that never contains <EOH> yields no records;
//   - fields after the last <EOR> (an unterminated final record) are dropped;
//   - fields with a missing, zero or unparsable LENGTH are not stored.
//
// decodeValue, when non-nil, converts each value's raw bytes to a string.
// When it is nil the bytes are used as-is and a value that is not valid
// UTF-8 is passed through extendToRunes to repair character-count lengths.
//
// The returned error is nil on a clean end of input (including EOF inside an
// unterminated tag), the error returned by fn, or a wrapped read error. A
// value cut short by EOF is reported as an error.
func parseWith(r io.Reader, decodeValue func([]byte) string, fn func(Record) error) error {
	br := bufio.NewReaderSize(r, 64*1024)

	rec := Record{}
	headerDone := false

	for {
		// Seek next '<'. Bytes before it are either header text or
		// inter-field whitespace — both discardable.
		if err := seekByte(br, '<'); err != nil {
			if err == io.EOF {
				return nil
			}
			return err
		}
		spec, err := readUntilByte(br, '>')
		if err != nil {
			if err == io.EOF {
				return nil
			}
			return fmt.Errorf("unterminated tag: %w", err)
		}
		// Resynchronise on a stray '<'. Free-form header text or junk
		// between fields may contain a lone '<' (e.g. "a < b"); the spec
		// would then run on into the next real tag ("b\n<EOH") and that
		// tag would go unrecognised. A tag name can never contain '<', so
		// only the text after the last one can be the actual tag.
		if i := strings.LastIndexByte(spec, '<'); i >= 0 {
			spec = spec[i+1:]
		}
		name, length := parseSpec(spec)
		switch name {
		case "eoh":
			headerDone = true
			rec = Record{}
			continue
		case "eor":
			if headerDone && len(rec) > 0 {
				if err := fn(rec); err != nil {
					return err
				}
			}
			rec = Record{}
			continue
		}
		if length <= 0 {
			continue
		}
		// Consume the value bytes regardless of header state; we only emit
		// records once we've crossed <EOH>.
		val, err := readValueChunked(br, length)
		if err != nil {
			return fmt.Errorf("read field %s: %w", name, err)
		}
		// Repair character-count lengths. The ADIF spec says LENGTH is a
		// byte count, but some loggers (notably Log4OM's UDP "ADIF
		// message") write the CHARACTER count instead. For UTF-8 values
		// with accented chars that truncates mid-rune — e.g. "<QTH:7>
		// Tóalmás" is 9 bytes but says 7, leaving an orphan byte that
		// renders as "Tóalm�". When we're in UTF-8 mode (no Windows-1252
		// decoder) and the naive byte read isn't valid UTF-8, keep reading
		// until the value holds `length` whole runes (or the next tag).
		if decodeValue == nil && !utf8.Valid(val) {
			val = extendToRunes(br, val, length)
		}
		if !headerDone || name == "" {
			continue
		}
		if decodeValue != nil {
			rec[name] = decodeValue(val)
		} else {
			rec[name] = string(val)
		}
	}
}

// readValueChunked reads exactly n bytes from br without trusting n for the
// up-front allocation. LENGTH comes straight from the input, so a corrupt or
// hostile tag such as "<x:99999999999>" must not make the parser allocate
// gigabytes before a single byte has been read. Values up to one chunk are
// read into an exactly-sized buffer; larger ones grow chunk by chunk, so
// memory use tracks the data actually present in the stream.
//
// Errors match io.ReadFull on a single n-byte buffer: io.EOF if no byte at
// all could be read, io.ErrUnexpectedEOF if the stream ended part-way.
func readValueChunked(br *bufio.Reader, n int) ([]byte, error) {
	const chunk = 64 * 1024
	if n <= chunk {
		val := make([]byte, n)
		if _, err := io.ReadFull(br, val); err != nil {
			return nil, err
		}
		return val, nil
	}
	val := make([]byte, 0, chunk)
	for len(val) < n {
		step := n - len(val)
		if step > chunk {
			step = chunk
		}
		start := len(val)
		val = append(val, make([]byte, step)...)
		if _, err := io.ReadFull(br, val[start:]); err != nil {
			// EOF on a later chunk still means the value was truncated.
			if err == io.EOF && start > 0 {
				err = io.ErrUnexpectedEOF
			}
			return nil, err
		}
	}
	return val, nil
}

// extendToRunes repairs a value whose declared length counted characters
// instead of bytes. On entry `have` contains the first `wantRunes` BYTES of
// the value and is not valid UTF-8 because a multibyte rune was split.
//
// Bytes are pulled from br one at a time and appended until one of:
//   - the value is valid UTF-8 and holds at least wantRunes runes;
//   - a '<' is seen — it is pushed back, since it starts the next tag;
//   - br returns an error (typically EOF);
//   - the size cap is reached, so a truly corrupt value cannot run away.
//
// The possibly extended slice is returned.
func extendToRunes(br *bufio.Reader, have []byte, wantRunes int) []byte {
	const maxExtra = 8 // at most ~4 extra bytes/rune for the few cut runes
	budget := len(have) + maxExtra*wantRunes + maxExtra

	// Both conditions are required. utf8.RuneCount counts an orphaned lead
	// byte (e.g. the D0 of a cut Cyrillic "а") as one rune, so a count-only
	// test would stop one continuation byte early and yield "Чайк�".
	complete := func(v []byte) bool {
		return utf8.Valid(v) && utf8.RuneCount(v) >= wantRunes
	}

	for len(have) < budget && !complete(have) {
		next, err := br.ReadByte()
		if err != nil {
			return have
		}
		if next == '<' {
			_ = br.UnreadByte() // start of the following tag — not ours
			return have
		}
		have = append(have, next)
	}
	return have
}

// parseSpec breaks the text between '<' and '>' of a tag — for example
// "callsign:5", "callsign:5:S" or "eor" — into its field name and value
// length. Surrounding whitespace is ignored, the name is lowercased, and
// anything after a second ':' (the type indicator) is disregarded. length
// is 0 when the tag has no length part or when that part is not a positive
// decimal integer.
func parseSpec(spec string) (name string, length int) {
	trimmed := strings.TrimSpace(spec)

	colon := strings.IndexByte(trimmed, ':')
	if colon < 0 {
		// Control tag such as "eor" / "eoh": name only.
		return strings.ToLower(trimmed), 0
	}
	name = strings.ToLower(strings.TrimSpace(trimmed[:colon]))

	// The length sits between the first and (optional) second colon.
	digits := trimmed[colon+1:]
	if next := strings.IndexByte(digits, ':'); next >= 0 {
		digits = digits[:next]
	}
	n, err := strconv.Atoi(strings.TrimSpace(digits))
	if err != nil || n <= 0 {
		return name, 0
	}
	return name, n
}

// seekByte consumes bytes from br up to and including the first occurrence
// of target. It returns nil once target has been consumed, or the error
// from br.ReadByte (io.EOF at end of input) if target was never found.
func seekByte(br *bufio.Reader, target byte) error {
	cur, err := br.ReadByte()
	for err == nil && cur != target {
		cur, err = br.ReadByte()
	}
	return err
}

// readUntilByte collects bytes from br until target is read. The target
// byte is consumed but not included in the returned string. If br fails
// first (io.EOF at end of input), the bytes gathered so far are returned
// together with that error.
func readUntilByte(br *bufio.Reader, target byte) (string, error) {
	var collected strings.Builder
	cur, err := br.ReadByte()
	for ; err == nil && cur != target; cur, err = br.ReadByte() {
		collected.WriteByte(cur)
	}
	return collected.String(), err
}