breakpilot-compliance/ai-compliance-sdk/internal/iace/distance_benchmark.go

package iace

import (
	"regexp"
	"strconv"
	"strings"
)

// Distance benchmark dimension: does the engine suggest the same numeric
// dimensions (mm gaps, mm/s speeds) as the professional (GT) for a session?
// The engine measures are partly GT-derived, so on the seed sessions this
// mainly guards DRIFT; its real value is NEW sessions, where the engine has not
// been fitted to the assessor. Pure + deterministic (no LLM) — parses prose.

// DistanceToken is one numeric dimension parsed from measure text.
//
// Value holds the parsed number, Unit the unit the number was tagged with, and
// Raw the whitespace-trimmed source text the token was matched from.
type DistanceToken struct {
	Value float64 `json:"value"`
	Unit  string  `json:"unit"` // "mm" | "mm/s"
	Raw   string  `json:"raw"`
}

// DistanceComparison reports engine vs GT dimensional coverage for one session.
//
// GTCount is the number of distinct GT dimensions and MatchedCount how many of
// them have an engine counterpart. AgreementPct is MatchedCount/GTCount as a
// percentage and stays 0 when there are no GT dimensions. Matched lists the
// GT-side tokens that were paired with an engine token.
type DistanceComparison struct {
	GTCount      int             `json:"gt_count"`
	MatchedCount int             `json:"matched_count"`
	AgreementPct float64         `json:"agreement_pct"`
	Matched      []DistanceToken `json:"matched"`
	GTOnly       []DistanceToken `json:"gt_only"`     // professional's dimensions with no engine counterpart (gaps)
	EngineOnly   []DistanceToken `json:"engine_only"` // engine dimensions with no GT counterpart
}

var (
	// distanceRe matches a number followed by the unit "mm" or "mm/s".
	// Group 1 is the number, group 2 is "/s" for speeds and empty otherwise.
	// Accepted number forms: German thousands grouping with an optional
	// decimal comma ("1.600", "1.600,5") or plain digits with an optional
	// comma/dot decimal part ("2,5", "0.500").
	// NOTE(review): there is no boundary check after the unit, so "mm²" or
	// "mm/min" are also picked up as plain "mm" — confirm this is acceptable.
	distanceRe = regexp.MustCompile(`([1-9]\d{0,2}(?:\.\d{3})+(?:,\d+)?|\d+(?:[,.]\d+)?)\s*mm(/s)?`)

	// thousandsRe recognises dot-grouped German thousands, optionally followed
	// by a decimal comma. The first group must not start with 0: a string like
	// "0.500" is a decimal fraction, never a grouped integer.
	thousandsRe = regexp.MustCompile(`^[1-9]\d{0,2}(\.\d{3})+(,\d+)?$`)
)

// normalizeNumber converts a number captured by distanceRe into a float64.
//
// Dots are dropped when s is written with German thousands grouping
// ("1.600" -> 1600, "1.600,5" -> 1600.5); in every case a remaining comma is
// treated as the decimal separator ("2,5" -> 2.5). It returns 0 when s cannot
// be parsed or overflows float64, so callers that skip zero values also skip
// unusable input instead of carrying an infinity into the result.
func normalizeNumber(s string) float64 {
	if thousandsRe.MatchString(s) {
		s = strings.ReplaceAll(s, ".", "") // German thousands separator
	}
	s = strings.ReplaceAll(s, ",", ".") // German decimal separator
	v, err := strconv.ParseFloat(s, 64)
	if err != nil {
		// Syntax or range error: report "no usable value".
		return 0
	}
	return v
}

// extractDistanceTokens scans every text for "<number> mm" / "<number> mm/s"
// occurrences and returns them in order of first appearance.
//
// Tokens whose value normalizes to 0 are dropped. Two tokens count as the same
// dimension when they share a unit and their values agree after formatting to
// one decimal place; only the first occurrence is kept. The result is nil when
// nothing was found.
func extractDistanceTokens(texts []string) []DistanceToken {
	var tokens []DistanceToken
	emitted := make(map[string]bool)

	for _, text := range texts {
		matches := distanceRe.FindAllStringSubmatch(text, -1)
		for _, groups := range matches {
			value := normalizeNumber(groups[1])
			if value == 0 {
				continue
			}

			// Group 2 carries the optional "/s" suffix that marks a speed.
			unit := "mm/s"
			if groups[2] != "/s" {
				unit = "mm"
			}

			dedupKey := unit + ":" + strconv.FormatFloat(value, 'f', 1, 64)
			if emitted[dedupKey] {
				continue
			}
			emitted[dedupKey] = true

			tokens = append(tokens, DistanceToken{
				Value: value,
				Unit:  unit,
				Raw:   strings.TrimSpace(groups[0]),
			})
		}
	}
	return tokens
}

// tokensMatch reports whether a and b describe the same dimension: identical
// unit and values that differ by strictly less than 0.05.
func tokensMatch(a, b DistanceToken) bool {
	const tolerance = 0.05

	if a.Unit != b.Unit {
		return false
	}
	diff := a.Value - b.Value
	// Equivalent to |diff| < tolerance without needing an absolute-value helper.
	return diff < tolerance && diff > -tolerance
}

// CompareDistances extracts the dimensions from the professional's texts
// (gtTexts) and from the engine's texts (engineTexts) and pairs them up.
//
// Each GT token claims the first still-unclaimed engine token it matches. GT
// tokens without a partner end up in GTOnly, unclaimed engine tokens in
// EngineOnly. AgreementPct is the share of GT tokens that found a partner, in
// percent, and remains 0 when there are no GT tokens.
func CompareDistances(gtTexts, engineTexts []string) DistanceComparison {
	gtTokens := extractDistanceTokens(gtTexts)
	engTokens := extractDistanceTokens(engineTexts)

	// The slices start out empty but non-nil so they serialize as [] in JSON.
	result := DistanceComparison{
		GTCount:    len(gtTokens),
		Matched:    []DistanceToken{},
		GTOnly:     []DistanceToken{},
		EngineOnly: []DistanceToken{},
	}

	// claimed[i] records that engTokens[i] is already paired with a GT token.
	claimed := make([]bool, len(engTokens))
	for _, want := range gtTokens {
		partner := -1
		for i := range engTokens {
			if claimed[i] || !tokensMatch(want, engTokens[i]) {
				continue
			}
			partner = i
			break
		}
		if partner < 0 {
			result.GTOnly = append(result.GTOnly, want)
			continue
		}
		claimed[partner] = true
		result.Matched = append(result.Matched, want)
	}
	result.MatchedCount = len(result.Matched)

	for i, used := range claimed {
		if used {
			continue
		}
		result.EngineOnly = append(result.EngineOnly, engTokens[i])
	}

	if result.GTCount == 0 {
		return result
	}
	result.AgreementPct = float64(result.MatchedCount) / float64(result.GTCount) * 100
	return result
}

// CompareSessionDistances is the benchmark-facing entry point. It gathers the
// prose to scan — every GT entry's measures plus its comment when non-empty,
// and every mitigation's name and description — and hands both sets to
// CompareDistances. A nil gt contributes no GT text.
func CompareSessionDistances(gt *GroundTruth, mitigations []Mitigation) DistanceComparison {
	var professional []string
	if gt != nil {
		for _, entry := range gt.Entries {
			professional = append(professional, entry.Measures...)
			if entry.Comment == "" {
				continue
			}
			professional = append(professional, entry.Comment)
		}
	}

	var engine []string
	for i := range mitigations {
		engine = append(engine, mitigations[i].Name)
		engine = append(engine, mitigations[i].Description)
	}

	return CompareDistances(professional, engine)
}