feat(iace): add distance dimension to GT benchmark
CompareBenchmark now also compares the engine's numeric dimensions (mm gaps, mm/s speeds) against the professional's GT measures: parses distance tokens from both sides (German thousands/decimal aware), reports matched / gt_only (gaps) / engine_only + an agreement %. Surfaces as result.distances on the existing benchmark endpoint. Deterministic, no LLM. On the GT-derived seed sessions it mainly guards DRIFT; its real value is new sessions. Real-GT test pins that the engine covers the Bremse (250 mm/s, 250/850 mm) and Kistenhub (25/120 mm, 150/75 mm/s) headline dimensions. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -149,6 +149,8 @@ func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigatio
|
|||||||
coverage = float64(len(matched)) / float64(len(gt.Entries))
|
coverage = float64(len(matched)) / float64(len(gt.Entries))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
dist := CompareSessionDistances(gt, mitigations)
|
||||||
|
|
||||||
return &BenchmarkResult{
|
return &BenchmarkResult{
|
||||||
CoverageScore: coverage,
|
CoverageScore: coverage,
|
||||||
MeasureCoverage: measCov,
|
MeasureCoverage: measCov,
|
||||||
@@ -159,6 +161,7 @@ func CompareBenchmark(gt *GroundTruth, hazards []Hazard, mitigations []Mitigatio
|
|||||||
ExtraInEngine: extra,
|
ExtraInEngine: extra,
|
||||||
CategoryBreakdown: breakdown,
|
CategoryBreakdown: breakdown,
|
||||||
RiskRankPairs: rankPairs,
|
RiskRankPairs: rankPairs,
|
||||||
|
Distances: &dist,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -439,9 +442,9 @@ func buildRiskRankPairs(matched []HazardMatchPair) []RiskRankPair {
|
|||||||
|
|
||||||
// Sort by GT risk descending to get GT rank
|
// Sort by GT risk descending to get GT rank
|
||||||
type ranked struct {
|
type ranked struct {
|
||||||
idx int
|
idx int
|
||||||
gtRisk int
|
gtRisk int
|
||||||
name string
|
name string
|
||||||
}
|
}
|
||||||
items := make([]ranked, len(matched))
|
items := make([]ranked, len(matched))
|
||||||
for i, m := range matched {
|
for i, m := range matched {
|
||||||
|
|||||||
@@ -16,23 +16,23 @@ type GroundTruth struct {
|
|||||||
|
|
||||||
// GroundTruthEntry represents a single hazard from a professional risk assessment.
|
// GroundTruthEntry represents a single hazard from a professional risk assessment.
|
||||||
type GroundTruthEntry struct {
|
type GroundTruthEntry struct {
|
||||||
Nr string `json:"nr"`
|
Nr string `json:"nr"`
|
||||||
HazardGroup string `json:"hazard_group"`
|
HazardGroup string `json:"hazard_group"`
|
||||||
HazardGroupApplicable bool `json:"hazard_group_applicable"`
|
HazardGroupApplicable bool `json:"hazard_group_applicable"`
|
||||||
HazardSubgroup string `json:"hazard_subgroup"`
|
HazardSubgroup string `json:"hazard_subgroup"`
|
||||||
HazardType string `json:"hazard_type"`
|
HazardType string `json:"hazard_type"`
|
||||||
HazardCause string `json:"hazard_cause"`
|
HazardCause string `json:"hazard_cause"`
|
||||||
LifecyclePhases []string `json:"lifecycle_phases"`
|
LifecyclePhases []string `json:"lifecycle_phases"`
|
||||||
ComponentZone string `json:"component_zone"`
|
ComponentZone string `json:"component_zone"`
|
||||||
RiskIn GTRisk `json:"risk_in"`
|
RiskIn GTRisk `json:"risk_in"`
|
||||||
PLr *GTPLr `json:"plr,omitempty"`
|
PLr *GTPLr `json:"plr,omitempty"`
|
||||||
Measures []string `json:"measures"`
|
Measures []string `json:"measures"`
|
||||||
MeasureType string `json:"measure_type"`
|
MeasureType string `json:"measure_type"`
|
||||||
RiskOut GTRisk `json:"risk_out"`
|
RiskOut GTRisk `json:"risk_out"`
|
||||||
NormReferences []string `json:"norm_references"`
|
NormReferences []string `json:"norm_references"`
|
||||||
Sufficient bool `json:"sufficient"`
|
Sufficient bool `json:"sufficient"`
|
||||||
Comment string `json:"comment,omitempty"`
|
Comment string `json:"comment,omitempty"`
|
||||||
ReductionSteps []GTReductionStep `json:"reduction_steps,omitempty"`
|
ReductionSteps []GTReductionStep `json:"reduction_steps,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// GTRisk represents the EN 62061 additive risk: R = (F + W + P) * S.
|
// GTRisk represents the EN 62061 additive risk: R = (F + W + P) * S.
|
||||||
@@ -81,8 +81,10 @@ type BenchmarkResult struct {
|
|||||||
CategoryBreakdown []CategoryScore `json:"category_breakdown"`
|
CategoryBreakdown []CategoryScore `json:"category_breakdown"`
|
||||||
RiskRankPairs []RiskRankPair `json:"risk_rank_pairs"`
|
RiskRankPairs []RiskRankPair `json:"risk_rank_pairs"`
|
||||||
// Risk-number comparison (tool vs professional) per matched hazard + aggregate.
|
// Risk-number comparison (tool vs professional) per matched hazard + aggregate.
|
||||||
RiskComparison []RiskComparisonPair `json:"risk_comparison,omitempty"`
|
RiskComparison []RiskComparisonPair `json:"risk_comparison,omitempty"`
|
||||||
RiskAgreement RiskAgreement `json:"risk_agreement"`
|
RiskAgreement RiskAgreement `json:"risk_agreement"`
|
||||||
|
// Dimensional comparison: do the engine's mm/mm-s values match the GT's?
|
||||||
|
Distances *DistanceComparison `json:"distances,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
// HazardMatchPair links a GT entry to an engine hazard.
|
// HazardMatchPair links a GT entry to an engine hazard.
|
||||||
|
|||||||
@@ -0,0 +1,137 @@
|
|||||||
|
package iace
|
||||||
|
|
||||||
|
import (
|
||||||
|
"regexp"
|
||||||
|
"strconv"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// Distance benchmark dimension: does the engine suggest the same numeric
|
||||||
|
// dimensions (mm gaps, mm/s speeds) as the professional (GT) for a session?
|
||||||
|
// The engine measures are partly GT-derived, so on the seed sessions this
|
||||||
|
// mainly guards DRIFT; its real value is NEW sessions, where the engine has not
|
||||||
|
// been fitted to the assessor. Pure + deterministic (no LLM) — parses prose.
|
||||||
|
|
||||||
|
// DistanceToken is a single numeric dimension (a length or a speed) that was
// parsed out of free-text measure prose.
type DistanceToken struct {
	// Value is the parsed magnitude, expressed in Unit.
	Value float64 `json:"value"`
	// Unit is either "mm" (length) or "mm/s" (speed).
	Unit string `json:"unit"`
	// Raw is the source text the token was parsed from.
	Raw string `json:"raw"`
}
|
||||||
|
|
||||||
|
// DistanceComparison reports engine vs GT dimensional coverage for one session.
|
||||||
|
type DistanceComparison struct {
|
||||||
|
GTCount int `json:"gt_count"`
|
||||||
|
MatchedCount int `json:"matched_count"`
|
||||||
|
AgreementPct float64 `json:"agreement_pct"`
|
||||||
|
Matched []DistanceToken `json:"matched"`
|
||||||
|
GTOnly []DistanceToken `json:"gt_only"` // Fachmann-Maße ohne Engine-Entsprechung (Lücken)
|
||||||
|
EngineOnly []DistanceToken `json:"engine_only"` // Engine-Maße ohne GT-Entsprechung
|
||||||
|
}
|
||||||
|
|
||||||
|
// distanceRe finds a number followed by a millimetre unit in prose. Group 1 is
// the numeric literal; group 2 is "/s" for speeds and empty for plain lengths.
//
// Accepted number forms:
//   - German thousands grouping, optionally with a decimal comma: "1.600", "1.600,5"
//   - plain integers and decimals with either separator: "850", "2,5", "2.5"
//
// The grouped form must start with a non-zero digit so that "0.250" is read as
// a decimal instead of the thousands-grouped number 250. A bare "mm" must end
// at a word boundary so that other units sharing the prefix ("mm2", "mmHg")
// are not reported as lengths.
var distanceRe = regexp.MustCompile(`([1-9]\d{0,2}(?:\.\d{3})+(?:,\d+)?|\d+(?:[,.]\d+)?)\s*mm(?:(/s)|\b)`)

// thousandsRe recognises a complete German thousands-grouped literal, with an
// optional decimal-comma fraction. It mirrors the first alternative of
// distanceRe.
var thousandsRe = regexp.MustCompile(`^[1-9]\d{0,2}(?:\.\d{3})+(?:,\d+)?$`)

// normalizeNumber converts a numeric literal written in German or plain
// notation into a float64.
//
// In a thousands-grouped literal the dots are separators and are removed; in
// every literal a comma is the decimal separator. "1.600" is therefore read as
// 1600 (heuristic: exactly three digits after each dot), while "2.5" and "2,5"
// both yield 2.5.
//
// It returns 0 when the literal cannot be parsed; callers treat 0 as "no
// usable dimension".
func normalizeNumber(s string) float64 {
	if thousandsRe.MatchString(s) {
		s = strings.ReplaceAll(s, ".", "") // German thousands separator
	}
	s = strings.ReplaceAll(s, ",", ".") // German decimal separator

	v, err := strconv.ParseFloat(s, 64)
	if err != nil {
		// Malformed or out-of-range literal: report "no value" rather than
		// leaking a partial or infinite result into the comparison.
		return 0
	}
	return v
}
|
||||||
|
|
||||||
|
// extractDistanceTokens pulls the distinct (value,unit) dimensions out of prose.
|
||||||
|
func extractDistanceTokens(texts []string) []DistanceToken {
|
||||||
|
seen := map[string]bool{}
|
||||||
|
var out []DistanceToken
|
||||||
|
for _, t := range texts {
|
||||||
|
for _, m := range distanceRe.FindAllStringSubmatch(t, -1) {
|
||||||
|
unit := "mm"
|
||||||
|
if m[2] == "/s" {
|
||||||
|
unit = "mm/s"
|
||||||
|
}
|
||||||
|
val := normalizeNumber(m[1])
|
||||||
|
if val == 0 {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
key := unit + ":" + strconv.FormatFloat(val, 'f', 1, 64)
|
||||||
|
if seen[key] {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
seen[key] = true
|
||||||
|
out = append(out, DistanceToken{Value: val, Unit: unit, Raw: strings.TrimSpace(m[0])})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func tokensMatch(a, b DistanceToken) bool {
|
||||||
|
if a.Unit != b.Unit {
|
||||||
|
return false
|
||||||
|
}
|
||||||
|
d := a.Value - b.Value
|
||||||
|
if d < 0 {
|
||||||
|
d = -d
|
||||||
|
}
|
||||||
|
return d < 0.05
|
||||||
|
}
|
||||||
|
|
||||||
|
// CompareDistances matches the professional's dimensions (gtTexts) against the
|
||||||
|
// engine's (engineTexts) and reports coverage + the gaps in both directions.
|
||||||
|
func CompareDistances(gtTexts, engineTexts []string) DistanceComparison {
|
||||||
|
gt := extractDistanceTokens(gtTexts)
|
||||||
|
eng := extractDistanceTokens(engineTexts)
|
||||||
|
res := DistanceComparison{
|
||||||
|
GTCount: len(gt),
|
||||||
|
Matched: []DistanceToken{},
|
||||||
|
GTOnly: []DistanceToken{},
|
||||||
|
EngineOnly: []DistanceToken{},
|
||||||
|
}
|
||||||
|
engMatched := make([]bool, len(eng))
|
||||||
|
for _, g := range gt {
|
||||||
|
found := false
|
||||||
|
for i, e := range eng {
|
||||||
|
if !engMatched[i] && tokensMatch(g, e) {
|
||||||
|
found, engMatched[i] = true, true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if found {
|
||||||
|
res.MatchedCount++
|
||||||
|
res.Matched = append(res.Matched, g)
|
||||||
|
} else {
|
||||||
|
res.GTOnly = append(res.GTOnly, g)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
for i, e := range eng {
|
||||||
|
if !engMatched[i] {
|
||||||
|
res.EngineOnly = append(res.EngineOnly, e)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if res.GTCount > 0 {
|
||||||
|
res.AgreementPct = float64(res.MatchedCount) / float64(res.GTCount) * 100
|
||||||
|
}
|
||||||
|
return res
|
||||||
|
}
|
||||||
|
|
||||||
|
// CompareSessionDistances is the benchmark-facing helper: it pulls the measure
|
||||||
|
// prose from the GT entries and the engine mitigations and compares them.
|
||||||
|
func CompareSessionDistances(gt *GroundTruth, mitigations []Mitigation) DistanceComparison {
|
||||||
|
var gtTexts []string
|
||||||
|
if gt != nil {
|
||||||
|
for _, e := range gt.Entries {
|
||||||
|
gtTexts = append(gtTexts, e.Measures...)
|
||||||
|
if e.Comment != "" {
|
||||||
|
gtTexts = append(gtTexts, e.Comment)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
var engTexts []string
|
||||||
|
for _, m := range mitigations {
|
||||||
|
engTexts = append(engTexts, m.Name, m.Description)
|
||||||
|
}
|
||||||
|
return CompareDistances(gtTexts, engTexts)
|
||||||
|
}
|
||||||
@@ -0,0 +1,88 @@
|
|||||||
|
package iace
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"strconv"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestExtractDistanceTokens_Normalisation(t *testing.T) {
|
||||||
|
toks := extractDistanceTokens([]string{
|
||||||
|
"Abstand >= 25 mm und max. 250 mm/s",
|
||||||
|
"Hand-Speed 1.600 mm/s", // German thousands → 1600
|
||||||
|
"Querschnitt 2,5 mm", // German decimal → 2.5
|
||||||
|
"850mm ohne Leerzeichen",
|
||||||
|
})
|
||||||
|
got := map[string]bool{}
|
||||||
|
for _, tk := range toks {
|
||||||
|
got[tk.Unit+":"+strconv.FormatFloat(tk.Value, 'f', 1, 64)] = true
|
||||||
|
}
|
||||||
|
for _, want := range []string{"mm:25.0", "mm/s:250.0", "mm/s:1600.0", "mm:2.5", "mm:850.0"} {
|
||||||
|
if !got[want] {
|
||||||
|
t.Errorf("expected token %s, got %+v", want, toks)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestCompareDistances_MatchesAndGaps(t *testing.T) {
|
||||||
|
gt := []string{"Abstand >= 25 mm", "max. 250 mm/s", "min. 850 mm", "<= 150 mm/s"}
|
||||||
|
eng := []string{"Spalt 25 mm Fingerschutz", "Teach 250 mm/s", "850 mm Tunnel"}
|
||||||
|
cmp := CompareDistances(gt, eng)
|
||||||
|
if cmp.GTCount != 4 || cmp.MatchedCount != 3 {
|
||||||
|
t.Fatalf("expected 3/4 matched, got %d/%d", cmp.MatchedCount, cmp.GTCount)
|
||||||
|
}
|
||||||
|
if len(cmp.GTOnly) != 1 || cmp.GTOnly[0].Value != 150 {
|
||||||
|
t.Errorf("expected 150 mm/s as the gap, got %+v", cmp.GTOnly)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Real GT sessions: the engine library must cover the professional's headline
|
||||||
|
// dimensions (the engine measures were authored from these sessions).
|
||||||
|
func TestCompareSessionDistances_RealGT(t *testing.T) {
|
||||||
|
var engTexts []string
|
||||||
|
for _, m := range GetProtectiveMeasureLibrary() {
|
||||||
|
engTexts = append(engTexts, m.Name, m.Description)
|
||||||
|
}
|
||||||
|
|
||||||
|
cases := []struct {
|
||||||
|
file string
|
||||||
|
must []DistanceToken
|
||||||
|
}{
|
||||||
|
{"testdata/ground_truth_bremse.json", []DistanceToken{
|
||||||
|
{Value: 250, Unit: "mm/s"}, {Value: 250, Unit: "mm"}, {Value: 850, Unit: "mm"},
|
||||||
|
}},
|
||||||
|
{"testdata/ground_truth_kistenhub.json", []DistanceToken{
|
||||||
|
{Value: 25, Unit: "mm"}, {Value: 120, Unit: "mm"},
|
||||||
|
{Value: 150, Unit: "mm/s"}, {Value: 75, Unit: "mm/s"}, // filled by M603/M605
|
||||||
|
}},
|
||||||
|
}
|
||||||
|
|
||||||
|
for _, tc := range cases {
|
||||||
|
raw, err := os.ReadFile(tc.file)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read %s: %v", tc.file, err)
|
||||||
|
}
|
||||||
|
var gt GroundTruth
|
||||||
|
if err := json.Unmarshal(raw, >); err != nil {
|
||||||
|
t.Fatalf("parse %s: %v", tc.file, err)
|
||||||
|
}
|
||||||
|
var gtTexts []string
|
||||||
|
for _, e := range gt.Entries {
|
||||||
|
gtTexts = append(gtTexts, e.Measures...)
|
||||||
|
}
|
||||||
|
cmp := CompareDistances(gtTexts, engTexts)
|
||||||
|
for _, want := range tc.must {
|
||||||
|
matched := false
|
||||||
|
for _, m := range cmp.Matched {
|
||||||
|
if m.Unit == want.Unit && m.Value == want.Value {
|
||||||
|
matched = true
|
||||||
|
break
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !matched {
|
||||||
|
t.Errorf("%s: engine should cover %.0f %s but it is a gap", tc.file, want.Value, want.Unit)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user