feat(iace): add distance dimension to GT benchmark
CompareBenchmark now also compares the engine's numeric dimensions (mm gaps, mm/s speeds) against the professional's GT measures: parses distance tokens from both sides (German thousands/decimal aware), reports matched / gt_only (gaps) / engine_only + an agreement %. Surfaces as result.distances on the existing benchmark endpoint. Deterministic, no LLM. On the GT-derived seed sessions it mainly guards DRIFT; its real value is new sessions. Real-GT test pins that the engine covers the Bremse (250 mm/s, 250/850 mm) and Kistenhub (25/120 mm, 150/75 mm/s) headline dimensions. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,137 @@
|
||||
package iace
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// Distance benchmark dimension: does the engine suggest the same numeric
// dimensions (mm gaps, mm/s speeds) as the professional (ground truth, GT)
// for a session? The engine measures are partly GT-derived, so on the seed
// sessions this mainly guards against drift; its real value lies in new
// sessions, where the engine has not been fitted to the assessor. The
// comparison is pure and deterministic (no LLM): it parses the measure prose.
|
||||
|
||||
// DistanceToken describes a single numeric dimension that was parsed out of
// measure text.
type DistanceToken struct {
	// Value is the parsed magnitude, expressed in Unit.
	Value float64 `json:"value"`

	// Unit is either "mm" or "mm/s".
	Unit string `json:"unit"`

	// Raw is the source text the token was parsed from.
	Raw string `json:"raw"`
}
|
||||
|
||||
// DistanceComparison reports engine vs GT dimensional coverage for one session.
|
||||
type DistanceComparison struct {
|
||||
GTCount int `json:"gt_count"`
|
||||
MatchedCount int `json:"matched_count"`
|
||||
AgreementPct float64 `json:"agreement_pct"`
|
||||
Matched []DistanceToken `json:"matched"`
|
||||
GTOnly []DistanceToken `json:"gt_only"` // Fachmann-Maße ohne Engine-Entsprechung (Lücken)
|
||||
EngineOnly []DistanceToken `json:"engine_only"` // Engine-Maße ohne GT-Entsprechung
|
||||
}
|
||||
|
||||
// distanceRe finds a number followed by "mm" or "mm/s" in free text.
// Capture group 1 is the number as written; capture group 2 is "/s" for
// speeds and empty for plain lengths. Accepted number forms:
//   - German thousands grouping with an optional decimal part:
//     "1.600", "1.600.000", "1.600,5"
//   - plain integers and decimals with "," or "." as the decimal separator:
//     "250", "2,5", "2.5"
var distanceRe = regexp.MustCompile(`([1-9]\d{0,2}(?:\.\d{3})+(?:,\d+)?|\d+(?:[,.]\d+)?)\s*mm(/s)?`)

// thousandsRe recognises a number written with German thousands grouping.
// The leading group must not start with 0, so "0.250" is read as the decimal
// 0.25 rather than as 250.
var thousandsRe = regexp.MustCompile(`^[1-9]\d{0,2}(?:\.\d{3})+(?:,\d+)?$`)

// normalizeNumber converts a number token captured by distanceRe into a
// float64.
//
// Dots are dropped when s uses German thousands grouping ("1.600" -> 1600);
// a comma is always read as the decimal separator ("2,5" -> 2.5,
// "1.600,5" -> 1600.5). It returns 0 when s cannot be parsed.
func normalizeNumber(s string) float64 {
	if thousandsRe.MatchString(s) {
		s = strings.ReplaceAll(s, ".", "") // German thousands separator
	}
	s = strings.ReplaceAll(s, ",", ".") // German decimal separator

	v, err := strconv.ParseFloat(s, 64)
	if err != nil {
		// Best effort: report an unparsable or out-of-range token as 0
		// instead of leaking a partial or infinite value.
		return 0
	}
	return v
}
|
||||
|
||||
// extractDistanceTokens pulls the distinct (value,unit) dimensions out of prose.
|
||||
func extractDistanceTokens(texts []string) []DistanceToken {
|
||||
seen := map[string]bool{}
|
||||
var out []DistanceToken
|
||||
for _, t := range texts {
|
||||
for _, m := range distanceRe.FindAllStringSubmatch(t, -1) {
|
||||
unit := "mm"
|
||||
if m[2] == "/s" {
|
||||
unit = "mm/s"
|
||||
}
|
||||
val := normalizeNumber(m[1])
|
||||
if val == 0 {
|
||||
continue
|
||||
}
|
||||
key := unit + ":" + strconv.FormatFloat(val, 'f', 1, 64)
|
||||
if seen[key] {
|
||||
continue
|
||||
}
|
||||
seen[key] = true
|
||||
out = append(out, DistanceToken{Value: val, Unit: unit, Raw: strings.TrimSpace(m[0])})
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func tokensMatch(a, b DistanceToken) bool {
|
||||
if a.Unit != b.Unit {
|
||||
return false
|
||||
}
|
||||
d := a.Value - b.Value
|
||||
if d < 0 {
|
||||
d = -d
|
||||
}
|
||||
return d < 0.05
|
||||
}
|
||||
|
||||
// CompareDistances matches the professional's dimensions (gtTexts) against the
|
||||
// engine's (engineTexts) and reports coverage + the gaps in both directions.
|
||||
func CompareDistances(gtTexts, engineTexts []string) DistanceComparison {
|
||||
gt := extractDistanceTokens(gtTexts)
|
||||
eng := extractDistanceTokens(engineTexts)
|
||||
res := DistanceComparison{
|
||||
GTCount: len(gt),
|
||||
Matched: []DistanceToken{},
|
||||
GTOnly: []DistanceToken{},
|
||||
EngineOnly: []DistanceToken{},
|
||||
}
|
||||
engMatched := make([]bool, len(eng))
|
||||
for _, g := range gt {
|
||||
found := false
|
||||
for i, e := range eng {
|
||||
if !engMatched[i] && tokensMatch(g, e) {
|
||||
found, engMatched[i] = true, true
|
||||
break
|
||||
}
|
||||
}
|
||||
if found {
|
||||
res.MatchedCount++
|
||||
res.Matched = append(res.Matched, g)
|
||||
} else {
|
||||
res.GTOnly = append(res.GTOnly, g)
|
||||
}
|
||||
}
|
||||
for i, e := range eng {
|
||||
if !engMatched[i] {
|
||||
res.EngineOnly = append(res.EngineOnly, e)
|
||||
}
|
||||
}
|
||||
if res.GTCount > 0 {
|
||||
res.AgreementPct = float64(res.MatchedCount) / float64(res.GTCount) * 100
|
||||
}
|
||||
return res
|
||||
}
|
||||
|
||||
// CompareSessionDistances is the benchmark-facing helper: it pulls the measure
|
||||
// prose from the GT entries and the engine mitigations and compares them.
|
||||
func CompareSessionDistances(gt *GroundTruth, mitigations []Mitigation) DistanceComparison {
|
||||
var gtTexts []string
|
||||
if gt != nil {
|
||||
for _, e := range gt.Entries {
|
||||
gtTexts = append(gtTexts, e.Measures...)
|
||||
if e.Comment != "" {
|
||||
gtTexts = append(gtTexts, e.Comment)
|
||||
}
|
||||
}
|
||||
}
|
||||
var engTexts []string
|
||||
for _, m := range mitigations {
|
||||
engTexts = append(engTexts, m.Name, m.Description)
|
||||
}
|
||||
return CompareDistances(gtTexts, engTexts)
|
||||
}
|
||||
Reference in New Issue
Block a user