From 0d7194ef89b353301f235e3de5f427a8a25c3366 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 11 Jun 2026 14:59:47 +0200 Subject: [PATCH] feat(iace): add distance dimension to GT benchmark CompareBenchmark now also compares the engine's numeric dimensions (mm gaps, mm/s speeds) against the professional's GT measures: parses distance tokens from both sides (German thousands/decimal aware), reports matched / gt_only (gaps) / engine_only + an agreement %. Surfaces as result.distances on the existing benchmark endpoint. Deterministic, no LLM. On the GT-derived seed sessions it mainly guards DRIFT; its real value is new sessions. Real-GT test pins that the engine covers the Bremse (250 mm/s, 250/850 mm) and Kistenhub (25/120 mm, 150/75 mm/s) headline dimensions. Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/benchmark_matcher.go | 9 +- .../internal/iace/benchmark_types.go | 40 ++--- .../internal/iace/distance_benchmark.go | 137 ++++++++++++++++++ .../internal/iace/distance_benchmark_test.go | 88 +++++++++++ 4 files changed, 252 insertions(+), 22 deletions(-) create mode 100644 ai-compliance-sdk/internal/iace/distance_benchmark.go create mode 100644 ai-compliance-sdk/internal/iace/distance_benchmark_test.go diff --git a/ai-compliance-sdk/internal/iace/benchmark_matcher.go b/ai-compliance-sdk/internal/iace/benchmark_matcher.go index 603a9f01..b9f3940d 100644 --- a/ai-compliance-sdk/internal/iace/benchmark_matcher.go +++ b/ai-compliance-sdk/internal/iace/benchmark_matcher.go @@ -149,6 +149,8 @@ func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigatio coverage = float64(len(matched)) / float64(len(gt.Entries)) } + dist := CompareSessionDistances(gt, mitigations) + return &BenchmarkResult{ CoverageScore: coverage, MeasureCoverage: measCov, @@ -159,6 +161,7 @@ func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigatio ExtraInEngine: extra, CategoryBreakdown: breakdown, RiskRankPairs: rankPairs, + Distances: &dist, 
} } @@ -439,9 +442,9 @@ func buildRiskRankPairs(matched []HazardMatchPair) []RiskRankPair { // Sort by GT risk descending to get GT rank type ranked struct { - idx int - gtRisk int - name string + idx int + gtRisk int + name string } items := make([]ranked, len(matched)) for i, m := range matched { diff --git a/ai-compliance-sdk/internal/iace/benchmark_types.go b/ai-compliance-sdk/internal/iace/benchmark_types.go index 34b147ab..93094312 100644 --- a/ai-compliance-sdk/internal/iace/benchmark_types.go +++ b/ai-compliance-sdk/internal/iace/benchmark_types.go @@ -16,23 +16,23 @@ type GroundTruth struct { // GroundTruthEntry represents a single hazard from a professional risk assessment. type GroundTruthEntry struct { - Nr string `json:"nr"` - HazardGroup string `json:"hazard_group"` - HazardGroupApplicable bool `json:"hazard_group_applicable"` - HazardSubgroup string `json:"hazard_subgroup"` - HazardType string `json:"hazard_type"` - HazardCause string `json:"hazard_cause"` - LifecyclePhases []string `json:"lifecycle_phases"` - ComponentZone string `json:"component_zone"` - RiskIn GTRisk `json:"risk_in"` - PLr *GTPLr `json:"plr,omitempty"` - Measures []string `json:"measures"` - MeasureType string `json:"measure_type"` - RiskOut GTRisk `json:"risk_out"` - NormReferences []string `json:"norm_references"` - Sufficient bool `json:"sufficient"` - Comment string `json:"comment,omitempty"` - ReductionSteps []GTReductionStep `json:"reduction_steps,omitempty"` + Nr string `json:"nr"` + HazardGroup string `json:"hazard_group"` + HazardGroupApplicable bool `json:"hazard_group_applicable"` + HazardSubgroup string `json:"hazard_subgroup"` + HazardType string `json:"hazard_type"` + HazardCause string `json:"hazard_cause"` + LifecyclePhases []string `json:"lifecycle_phases"` + ComponentZone string `json:"component_zone"` + RiskIn GTRisk `json:"risk_in"` + PLr *GTPLr `json:"plr,omitempty"` + Measures []string `json:"measures"` + MeasureType string `json:"measure_type"` + RiskOut 
GTRisk `json:"risk_out"` + NormReferences []string `json:"norm_references"` + Sufficient bool `json:"sufficient"` + Comment string `json:"comment,omitempty"` + ReductionSteps []GTReductionStep `json:"reduction_steps,omitempty"` } // GTRisk represents the EN 62061 additive risk: R = (F + W + P) * S. @@ -81,8 +81,10 @@ type BenchmarkResult struct { CategoryBreakdown []CategoryScore `json:"category_breakdown"` RiskRankPairs []RiskRankPair `json:"risk_rank_pairs"` // Risk-number comparison (tool vs professional) per matched hazard + aggregate. - RiskComparison []RiskComparisonPair `json:"risk_comparison,omitempty"` - RiskAgreement RiskAgreement `json:"risk_agreement"` + RiskComparison []RiskComparisonPair `json:"risk_comparison,omitempty"` + RiskAgreement RiskAgreement `json:"risk_agreement"` + // Dimensional comparison: do the engine's mm/mm-s values match the GT's? + Distances *DistanceComparison `json:"distances,omitempty"` } // HazardMatchPair links a GT entry to an engine hazard. diff --git a/ai-compliance-sdk/internal/iace/distance_benchmark.go b/ai-compliance-sdk/internal/iace/distance_benchmark.go new file mode 100644 index 00000000..d06a0b45 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/distance_benchmark.go @@ -0,0 +1,137 @@ +package iace + +import ( + "regexp" + "strconv" + "strings" +) + +// Distance benchmark dimension: does the engine suggest the same numeric +// dimensions (mm gaps, mm/s speeds) as the professional (GT) for a session? +// The engine measures are partly GT-derived, so on the seed sessions this +// mainly guards DRIFT; its real value is NEW sessions, where the engine has not +// been fitted to the assessor. Pure + deterministic (no LLM) — parses prose. + +// DistanceToken is one numeric dimension parsed from measure text. 
+type DistanceToken struct { + Value float64 `json:"value"` + Unit string `json:"unit"` // "mm" | "mm/s" + Raw string `json:"raw"` +} + +// DistanceComparison reports engine vs GT dimensional coverage for one session. +type DistanceComparison struct { + GTCount int `json:"gt_count"` + MatchedCount int `json:"matched_count"` + AgreementPct float64 `json:"agreement_pct"` + Matched []DistanceToken `json:"matched"` + GTOnly []DistanceToken `json:"gt_only"` // Fachmann-Maße ohne Engine-Entsprechung (Lücken) + EngineOnly []DistanceToken `json:"engine_only"` // Engine-Maße ohne GT-Entsprechung +} + +// matches a number (incl. German thousands "1.600" / decimal "2,5") + mm[/s]. +var distanceRe = regexp.MustCompile(`(\d{1,3}(?:\.\d{3})+|\d+(?:[,.]\d+)?)\s*mm(/s)?`) +var thousandsRe = regexp.MustCompile(`^\d{1,3}(\.\d{3})+$`) + +func normalizeNumber(s string) float64 { + if thousandsRe.MatchString(s) { + s = strings.ReplaceAll(s, ".", "") // German thousands separator + } else { + s = strings.ReplaceAll(s, ",", ".") // German decimal separator + } + v, _ := strconv.ParseFloat(s, 64) + return v +} + +// extractDistanceTokens pulls the distinct (value,unit) dimensions out of prose. 
+func extractDistanceTokens(texts []string) []DistanceToken { + seen := map[string]bool{} + var out []DistanceToken + for _, t := range texts { + for _, m := range distanceRe.FindAllStringSubmatch(t, -1) { + unit := "mm" + if m[2] == "/s" { + unit = "mm/s" + } + val := normalizeNumber(m[1]) + if val == 0 { + continue + } + key := unit + ":" + strconv.FormatFloat(val, 'f', 1, 64) + if seen[key] { + continue + } + seen[key] = true + out = append(out, DistanceToken{Value: val, Unit: unit, Raw: strings.TrimSpace(m[0])}) + } + } + return out +} + +func tokensMatch(a, b DistanceToken) bool { + if a.Unit != b.Unit { + return false + } + d := a.Value - b.Value + if d < 0 { + d = -d + } + return d < 0.05 +} + +// CompareDistances matches the professional's dimensions (gtTexts) against the +// engine's (engineTexts) and reports coverage + the gaps in both directions. +func CompareDistances(gtTexts, engineTexts []string) DistanceComparison { + gt := extractDistanceTokens(gtTexts) + eng := extractDistanceTokens(engineTexts) + res := DistanceComparison{ + GTCount: len(gt), + Matched: []DistanceToken{}, + GTOnly: []DistanceToken{}, + EngineOnly: []DistanceToken{}, + } + engMatched := make([]bool, len(eng)) + for _, g := range gt { + found := false + for i, e := range eng { + if !engMatched[i] && tokensMatch(g, e) { + found, engMatched[i] = true, true + break + } + } + if found { + res.MatchedCount++ + res.Matched = append(res.Matched, g) + } else { + res.GTOnly = append(res.GTOnly, g) + } + } + for i, e := range eng { + if !engMatched[i] { + res.EngineOnly = append(res.EngineOnly, e) + } + } + if res.GTCount > 0 { + res.AgreementPct = float64(res.MatchedCount) / float64(res.GTCount) * 100 + } + return res +} + +// CompareSessionDistances is the benchmark-facing helper: it pulls the measure +// prose from the GT entries and the engine mitigations and compares them. 
+func CompareSessionDistances(gt *GroundTruth, mitigations []Mitigation) DistanceComparison { + var gtTexts []string + if gt != nil { + for _, e := range gt.Entries { + gtTexts = append(gtTexts, e.Measures...) + if e.Comment != "" { + gtTexts = append(gtTexts, e.Comment) + } + } + } + var engTexts []string + for _, m := range mitigations { + engTexts = append(engTexts, m.Name, m.Description) + } + return CompareDistances(gtTexts, engTexts) +} diff --git a/ai-compliance-sdk/internal/iace/distance_benchmark_test.go b/ai-compliance-sdk/internal/iace/distance_benchmark_test.go new file mode 100644 index 00000000..8824793d --- /dev/null +++ b/ai-compliance-sdk/internal/iace/distance_benchmark_test.go @@ -0,0 +1,88 @@ +package iace + +import ( + "encoding/json" + "os" + "strconv" + "testing" +) + +func TestExtractDistanceTokens_Normalisation(t *testing.T) { + toks := extractDistanceTokens([]string{ + "Abstand >= 25 mm und max. 250 mm/s", + "Hand-Speed 1.600 mm/s", // German thousands → 1600 + "Querschnitt 2,5 mm", // German decimal → 2.5 + "850mm ohne Leerzeichen", + }) + got := map[string]bool{} + for _, tk := range toks { + got[tk.Unit+":"+strconv.FormatFloat(tk.Value, 'f', 1, 64)] = true + } + for _, want := range []string{"mm:25.0", "mm/s:250.0", "mm/s:1600.0", "mm:2.5", "mm:850.0"} { + if !got[want] { + t.Errorf("expected token %s, got %+v", want, toks) + } + } +} + +func TestCompareDistances_MatchesAndGaps(t *testing.T) { + gt := []string{"Abstand >= 25 mm", "max. 250 mm/s", "min. 
850 mm", "<= 150 mm/s"} + eng := []string{"Spalt 25 mm Fingerschutz", "Teach 250 mm/s", "850 mm Tunnel"} + cmp := CompareDistances(gt, eng) + if cmp.GTCount != 4 || cmp.MatchedCount != 3 { + t.Fatalf("expected 3/4 matched, got %d/%d", cmp.MatchedCount, cmp.GTCount) + } + if len(cmp.GTOnly) != 1 || cmp.GTOnly[0].Value != 150 { + t.Errorf("expected 150 mm/s as the gap, got %+v", cmp.GTOnly) + } +} + +// Real GT sessions: the engine library must cover the professional's headline +// dimensions (the engine measures were authored from these sessions). +func TestCompareSessionDistances_RealGT(t *testing.T) { + var engTexts []string + for _, m := range GetProtectiveMeasureLibrary() { + engTexts = append(engTexts, m.Name, m.Description) + } + + cases := []struct { + file string + must []DistanceToken + }{ + {"testdata/ground_truth_bremse.json", []DistanceToken{ + {Value: 250, Unit: "mm/s"}, {Value: 250, Unit: "mm"}, {Value: 850, Unit: "mm"}, + }}, + {"testdata/ground_truth_kistenhub.json", []DistanceToken{ + {Value: 25, Unit: "mm"}, {Value: 120, Unit: "mm"}, + {Value: 150, Unit: "mm/s"}, {Value: 75, Unit: "mm/s"}, // filled by M603/M605 + }}, + } + + for _, tc := range cases { + raw, err := os.ReadFile(tc.file) + if err != nil { + t.Fatalf("read %s: %v", tc.file, err) + } + var gt GroundTruth + if err := json.Unmarshal(raw, &gt); err != nil { + t.Fatalf("parse %s: %v", tc.file, err) + } + var gtTexts []string + for _, e := range gt.Entries { + gtTexts = append(gtTexts, e.Measures...) + } + cmp := CompareDistances(gtTexts, engTexts) + for _, want := range tc.must { + matched := false + for _, m := range cmp.Matched { + if m.Unit == want.Unit && m.Value == want.Value { + matched = true + break + } + } + if !matched { + t.Errorf("%s: engine should cover %.0f %s but it is a gap", tc.file, want.Value, want.Unit) + } + } + } +}