Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/gt_benchmark_harness_test.go
T
Benjamin Admin 005a2ed711 feat(iace): generic cross-domain leak gates + norm vocab reconciliation
- Domain-gate ~15 foreign machine classes (pool, amusement, paint booth,
  tank farm, reactor, lathe/chips, saw, film/carton, robot, mobile cab,
  asbestos, playground swing) in pattern_domain_gates.go so ungated hazard
  patterns stop leaking into unrelated machines; matching emit keywords
  added in keyword_dictionary.go (gate+emit share one vocabulary).
- Extend the cross-domain precision guard to 6 machine classes (press,
  cobot, motor, welding + the 2 GTs) with per-case homeDomains, so a
  machine's own domain terms are never flagged. GT coverage stays 100%.
- Reconcile the fine-grained norm machine-type vocabulary (455 keys) with
  the 68 canonical dropdown keys via canonicalMachineType() family folding
  in matchNorm — welding 0->17, robotics_cobot 0->6, press 8->13,
  circular_saw 1->35 machine-specific C-norms. Pattern gating left strict.
- Fix initialize?force=true summary index-shift that mislabeled counts
  (reported matched-patterns as "hazards"); now uses named step vars.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-10 22:29:10 +02:00

401 lines
17 KiB
Go

package iace
import (
"encoding/json"
"os"
"path/filepath"
"sort"
"strings"
"testing"
)
// ============================================================================
// Cross-GT real-narrative benchmark harness.
//
// Unlike gt_kistenhub_test.go (which feeds a hand-built MatchInput), this
// harness runs the FULL production pipeline: machine narrative → ParseNarrative
// → MatchInput → engine.Match → CompareBenchmark. That is exactly the path a
// real project WITHOUT ground truth takes, so it measures what actually ships.
//
// It runs every registered GT through the same code and prints per-GT plus a
// side-by-side table, so a generic engine change can be checked against ALL
// ground truths at once (no overfitting to a single machine).
// ============================================================================
// gtCase describes one ground-truth benchmark fixture.
type gtCase struct {
name string
path string
machineType string
// narrative is the machine description fed to ParseNarrative. We read it
// from the GT JSON's machine_description field; if absent we fall back to
// the GT's generic description. Authored narratives are intentionally NOT
// keyword-stuffed — they represent how an engineer would describe the
// machine, so the benchmark stays honest about extraction quality.
narrativeOverride string
// homeDomains lists the foreignDomainTerms domains that are NATIVE to this
// machine, so the cross-domain precision guard does not flag a press's own
// "stoessel"/"werkzeugeinbauraum" or a robot cell's "roboterzelle" as a leak.
// Empty for machines whose domain has no entry in foreignDomainTerms.
homeDomains []string
}
// gtBenchmarkCases is the registry the harness iterates over. Add a new GT
// here and it is automatically cross-validated against every engine change.
var gtBenchmarkCases = []gtCase{
{
name: "Bremse (Roboterzelle)",
path: "ground_truth_bremse.json",
machineType: "robotics_cobot",
narrativeOverride: "Automatisierte Roboterzelle zur Handhabung und Bearbeitung von " +
"Bremsscheiben. Ein Industrieroboter mit Greifer entnimmt Bremsscheiben vom " +
"Foerderband und legt sie in eine Bearbeitungsstation mit Drehtisch. Die Zelle ist " +
"mit Schutzzaun, verriegelter Schutztuer und Lichtgitter gesichert. Antrieb ueber " +
"Servomotoren und Frequenzumrichter, Steuerung ueber Sicherheits-SPS und Bedienpult. " +
"Pneumatische Greifer und Spannvorrichtungen. Betrieb im Automatikbetrieb, Einrichten " +
"und Einlernen (Teachen), Wartung und Stoerungsbeseitigung. Gefaehrdungen durch " +
"Quetschen und Einzug bei Roboterbewegung, elektrische Energie und Druckluft.",
homeDomains: []string{"robot"},
},
{
name: "Kistenhub (Hebevorrichtung)",
path: "ground_truth_kistenhub.json",
machineType: "lift",
narrativeOverride: "Mobiles, fahrbares Kistenhubgeraet zum Heben und Positionieren von " +
"Kisten und Lasten. Eine elektrisch angetriebene Hubplattform (Scherenhubtisch) hebt " +
"die Last ueber ein Hubwerk. Antrieb ueber Elektromotor, Schaltschrank und Steuerung " +
"mit Bedienpult. Das Geraet steht auf einem fahrbaren Fahrwerk mit Lenkrollen, daher " +
"sind Standsicherheit und Kippgefahr relevant. Bediener heben Kisten manuell auf die " +
"Plattform. Betrieb, manuelle Bedienung, Wartung, Reinigung und Transport. Elektrische " +
"Gefaehrdungen durch Netzanschluss, Schaltschrank und Leitungen.",
},
}
// precisionOnlyCases are real machines from breakpilot-core/docs-src that have a
// Grenzen description but NO expert GT hazard list, so they cannot be coverage-
// benchmarked — only checked for cross-domain precision (no foreign-domain
// nonsense). They diversify the gating guard beyond the 2 ground truths (lift +
// robot cell) across a press, a cobot, a motor and a welding system. Each leak
// they would otherwise produce (pool, carousel, paint booth, tank farm, lathe
// chuck, band saw, robot-into-press ...) is now a permanent regression guard.
var precisionOnlyCases = []gtCase{
{
name: "Kniehebelpresse (Presse)",
machineType: "mechanical_press",
homeDomains: []string{"press"},
narrativeOverride: "Vollautomatische Kniehebelpresse zur Kaltmassivumformung metallischer " +
"Rohlinge. Eine Transferanlage fuehrt Rohlinge ueber ein Foerderband in die Presse, wo sie " +
"in mehreren Stufen im Werkzeugeinbauraum zwischen Ober- und Unterwerkzeug umgeformt werden. " +
"Stoessel mit Schwungradantrieb, Hydraulikoel und Druckluft im System, integrierte " +
"Schmieranlage und Absaugung. Schutzumhausung mit verriegelten Tueren. Elektrische " +
"Versorgung 400 V, Steuerung ueber SPS. Betrieb vollautomatisch, Einrichten und Umruesten, " +
"Instandhaltung. Impulslaerm und heisse Werkstuecke beim Pressvorgang.",
},
{
name: "Eigenbauzelle (Cobot)",
machineType: "robotics_cobot",
homeDomains: []string{"robot"},
narrativeOverride: "Arbeitstisch mit integriertem kollaborierendem Roboterarm (Cobot) zur " +
"Bestueckung von Maschinen. Ein Sicherheitsscanner setzt den Roboterarm bei Annaeherung " +
"still. Programmierung ueber Touchscreen. Spannungsversorgung 230 V. Quetsch- und " +
"Stossgefahr im Roboterarbeitsraum durch Bewegung des Roboterarms. Betrieb kollaborierend " +
"und nicht kollaborierend, Teachen und Programmieren, Reinigung, Instandhaltung.",
},
{
name: "Elektromotoren (Antrieb)",
machineType: "general_industry",
homeDomains: nil,
narrativeOverride: "Gleichstrom- und Asynchronmotoren mit oder ohne integriertes Getriebe als " +
"Antrieb in Maschinen. Energieversorgung 24 bis 400 V Gleich- und Wechselstrom. Rotierende " +
"Welle und bewegliche Teile des Motors, Gehaeuse mit Stromschlag- und Erhitzungsgefahr, " +
"elektrische Anschluesse, Uebertemperaturueberwachung und Schutzleiter. Betrieb, Montage, " +
"Reinigung, Instandhaltung, Demontage.",
},
{
name: "Schwingarm (Rundschweissanlage)",
machineType: "welding",
homeDomains: []string{"welding"},
narrativeOverride: "Rundschweissanlage Schwingarm als Auf-Tisch-Version zum Schweissen von " +
"Rundnaehten. Pneumatisch bewegter Brennerarm, Anschluss an MIG/MAG- und TIG-Stromquellen, " +
"maximaler Schweissstrom 350 A. Werkstuecke werden in zwei Backenfuttern eingespannt und " +
"pneumatisch gesichert, rotierende Werkstueckaufnahme mit Reitstock. Formiergas durch die " +
"Hohlwelle. Leitfaehige Gehaeuseoberflaechen, Brenner mit Verbrennungsgefahr. Bedienung " +
"ueber Fusspedal, integrierte Steuerung.",
},
}
// readGTNarrative extracts a machine narrative from the raw GT JSON, trying the
// richer machine_description field before the generic description.
func readGTNarrative(t *testing.T, path string) (gt GroundTruth, narrative, machineName string) {
t.Helper()
raw, err := os.ReadFile(filepath.Join("testdata", path))
if err != nil {
t.Fatalf("read GT %s: %v", path, err)
}
if err := json.Unmarshal(raw, &gt); err != nil {
t.Fatalf("parse GT %s: %v", path, err)
}
var extra struct {
MachineName string `json:"machine_name"`
MachineDescription string `json:"machine_description"`
}
_ = json.Unmarshal(raw, &extra)
narrative = extra.MachineDescription
if narrative == "" {
narrative = gt.Description
}
return gt, narrative, extra.MachineName
}
// parseResultToMatchInput converts the deterministic narrative parse into the
// engine's MatchInput, mirroring what the production handler does.
func parseResultToMatchInput(pr ParseResult, machineType string) MatchInput {
compIDs := make([]string, 0, len(pr.Components))
for _, c := range pr.Components {
compIDs = append(compIDs, c.LibraryID)
}
energyIDs := make([]string, 0, len(pr.EnergySources))
for _, e := range pr.EnergySources {
energyIDs = append(energyIDs, e.SourceID)
}
mt := []string{}
if machineType != "" {
mt = []string{machineType}
}
return MatchInput{
ComponentLibraryIDs: compIDs,
EnergySourceIDs: energyIDs,
LifecyclePhases: pr.LifecyclePhases,
CustomTags: pr.CustomTags,
OperationalStates: pr.OperationalStates,
StateTransitions: pr.StateTransitions,
HumanRoles: pr.Roles,
MachineTypes: mt,
}
}
// runGTCase runs the full narrative→measures pipeline for one GT and returns
// the benchmark result plus the parse result for extraction-quality reporting.
func runGTCase(t *testing.T, c gtCase) (*BenchmarkResult, ParseResult) {
gt, narrative, _ := readGTNarrative(t, c.path)
if c.narrativeOverride != "" {
narrative = c.narrativeOverride
}
pr := ParseNarrative(narrative, c.machineType)
input := parseResultToMatchInput(pr, c.machineType)
engine := NewPatternEngine()
out := engine.Match(input)
hazards, mitigations := patternsToHazardsAndMitigations(out)
return CompareBenchmark(&gt, hazards, mitigations), pr
}
// TestGT_RealNarrativeBenchmark runs every registered GT through the real
// pipeline and prints a side-by-side comparison. Reporting only (no hard
// thresholds yet) — run with:
//
// go test -v -vet=off -run TestGT_RealNarrativeBenchmark ./internal/iace/
func TestGT_RealNarrativeBenchmark(t *testing.T) {
type row struct {
name string
comps, energy, tags int
gtN, matched, extra int
coverage, precision, measC float64
}
var rows []row
for _, c := range gtBenchmarkCases {
res, pr := runGTCase(t, c)
precision := 0.0
if res.TotalEngine > 0 {
precision = float64(len(res.MatchedPairs)) / float64(res.TotalEngine)
}
rows = append(rows, row{
name: c.name,
comps: len(pr.Components),
energy: len(pr.EnergySources),
tags: len(pr.CustomTags),
gtN: res.TotalGT,
matched: len(res.MatchedPairs),
extra: len(res.ExtraInEngine),
coverage: res.CoverageScore,
precision: precision,
measC: res.MeasureCoverage,
})
t.Logf("=== %s (machine_type=%s) ===", c.name, c.machineType)
t.Logf(" Narrative extraction: %d components, %d energy sources, %d custom tags",
len(pr.Components), len(pr.EnergySources), len(pr.CustomTags))
t.Logf(" Coverage: %.1f%% (%d/%d) | Precision: %.1f%% | Measure: %.1f%% | Extras: %d",
res.CoverageScore*100, len(res.MatchedPairs), res.TotalGT,
precision*100, res.MeasureCoverage*100, len(res.ExtraInEngine))
sample := res.ExtraInEngine
if len(sample) > 18 {
sample = sample[:18]
}
t.Logf(" --- Extra-Sample (unmatched engine hazards) ---")
for _, e := range sample {
t.Logf(" [%s] %s", e.Category, abbrev(e.Name, 70))
}
}
t.Logf("\n=== Cross-GT summary (real narrative pipeline) ===")
t.Logf(" %-28s %5s %5s %5s | %8s %9s %8s", "GT", "comp", "enrg", "tags", "coverage", "precision", "measure")
for _, r := range rows {
t.Logf(" %-28s %5d %5d %5d | %7.1f%% %8.1f%% %7.1f%%",
r.name, r.comps, r.energy, r.tags, r.coverage*100, r.precision*100, r.measC*100)
}
// Regression guard: the real narrative pipeline (what ships for projects
// without a GT) must keep high recall on both validated machines.
const coverageFloor = 0.90
for _, r := range rows {
if r.coverage < coverageFloor {
t.Errorf("%s: real-pipeline coverage %.1f%% below floor %.0f%%",
r.name, r.coverage*100, coverageFloor*100)
}
}
}
// foreignDomainTerms are machine-specific terms that betray a pattern's home
// domain. If a pattern's own scenario/name contains one of these but the
// pattern fires for an unrelated machine (a lift, a robot cell), it has leaked
// across domains — the precision bug. Used to prioritise capability-domain
// gating by real leak frequency, not guesswork.
var foreignDomainTerms = map[string]string{
"spritzgie": "plastics", "extruder": "plastics", "kunststoffschmelze": "plastics",
"spinnmaschine": "textile", "webmaschine": "textile", "spinnerei": "textile",
"zweiwalzenwerk": "rolling", "walzwerk": "rolling", "kalander": "rolling",
"gondel": "wind_lift", "pv-modul": "solar", "photovoltaik": "solar", "pv-anlage": "solar",
"presse": "press", "schliesseinheit": "plastics",
"drehmaschine": "cnc", "fraesmaschine": "cnc", "schleifscheibe": "grinding",
"traktor": "agri", "harvester": "agri", "maehdrescher": "agri", "ballenpresse": "agri",
"schweissen": "welding", "lichtbogenschweiss": "welding",
"rolltreppe": "escalator", "fahrtreppe": "escalator",
"spinnerei ": "textile", "extrusion": "plastics",
// construction / mobile machinery
"radlader": "construction", "bagger": "construction", "mobilkran": "crane",
"betonpump": "construction", "strassenwalze": "construction", "strassenbau": "construction",
// press / forming tool space
"werkzeugeinbauraum": "press", "stoessel": "press", "oberwerkzeug": "press",
"unterwerkzeug": "press", "abfuellstempel": "filling",
// machining coolant
"kss-": "machining", "kuehlschmierstoff": "machining",
// confined space / bulk material
"silo": "bulk", "gaerbehaelter": "bulk", "getreidesilo": "bulk", "mehlsilo": "bulk",
"schuettgut": "bulk", "sauerstoffmangel": "confined_space", "erstickung": "confined_space",
// medical
"patient": "medical", "sterilis": "medical", "defibrill": "medical",
// outdoor / biological / cold
"zecke": "outdoor", "hantavirus": "outdoor", "schimmel": "environmental",
"nagerkot": "outdoor", "winterarbeit": "outdoor", "tiefkuehl": "cold", "unterkuehl": "cold",
// playground / fitness
"klettergeraet": "playground", "spielplatz": "playground", "kraftstation": "fitness",
"bankdrueck": "fitness", "kniebeug": "fitness",
"schaukelkette": "playground", "nestschaukel": "playground",
// palletizer
"palettierer": "palletizer",
// aquatic / pool
"schwimmbecken": "aquatic", "schwimmbad": "aquatic", "beckenumrandung": "aquatic",
"massageduese": "aquatic", "schwimmbadtechnik": "aquatic", "sprungturm": "aquatic",
// amusement
"karussell": "amusement", "fahrgeschaeft": "amusement", "riesenrad": "amusement",
// mobile machine with driver cab
"fahrersitz": "mobile_cab", "fahrerkabine": "mobile_cab", "fahrerstand": "mobile_cab",
// coating / paint booth
"lackier": "coating", "loesemitteldampf": "coating", "pulverbeschicht": "coating",
// ex process / tank farm
"tanklager": "exproc", "raffinerie": "exproc",
// chemical reactor
"reaktor": "chem", "mischbereich": "chem", "exotherme reaktion": "chem",
// oxygen / gas supply
"sauerstoffanreicherung": "o2", "sauerstoff-versorgung": "o2",
// lathe / chip machining
"drehfutter": "cnc", "spannfutterbacke": "cnc", "spaeneflug": "cnc",
"spanflug": "cnc", "spindelumgebung": "cnc",
// sawing
"bandsaege": "sawing", "saegeband": "sawing",
// film / carton converting
"folienwickler": "converting", "folientrennbereich": "converting", "kartonschneider": "converting",
// blow molding (plastics)
"blasformwerkzeug": "plastics", "blasstation": "plastics",
// textile cutting
"stoffauflage": "textile", "konfektionierung": "textile",
// asbestos legacy
"asbest": "asbestos",
// robot (home for cobot/robot-cell cases via homeDomains)
"roboterzelle": "robot", "schwenkbereich roboter": "robot", "roboter-arbeitsraum": "robot",
}
// TestGT_DomainLeakage names the patterns that leak across domains. For each GT
// it runs the real pipeline, then flags every fired pattern whose own scenario
// text references a foreign machine. The output is the prioritised gating list
// for capability-domain hardening.
//
// go test -v -vet=off -run TestGT_DomainLeakage ./internal/iace/
func TestGT_DomainLeakage(t *testing.T) {
leakCount := map[string]int{} // patternID → #GTs it leaked into
leakInfo := map[string]string{}
for _, c := range gtBenchmarkCases {
_, narrative, _ := readGTNarrative(t, c.path)
if c.narrativeOverride != "" {
narrative = c.narrativeOverride
}
home := make(map[string]bool, len(c.homeDomains))
for _, d := range c.homeDomains {
home[d] = true
}
pr := ParseNarrative(narrative, c.machineType)
out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType))
var leaks []string
for _, pm := range out.MatchedPatterns {
text := normalizeDE(pm.PatternName + " " + pm.ScenarioDE)
for term, domain := range foreignDomainTerms {
if home[domain] {
continue // native to this machine — not a leak
}
if strings.Contains(text, term) {
leaks = append(leaks, pm.PatternID)
leakCount[pm.PatternID]++
leakInfo[pm.PatternID] = domain + " :: " + abbrev(pm.ScenarioDE, 55)
break
}
}
}
sort.Strings(leaks)
t.Logf("=== %s (machine_type=%s): %d/%d fired patterns leaked from foreign domains ===",
c.name, c.machineType, len(leaks), len(out.MatchedPatterns))
}
type lk struct {
id, info string
n int
}
var all []lk
for id, n := range leakCount {
all = append(all, lk{id, leakInfo[id], n})
}
sort.Slice(all, func(i, j int) bool {
if all[i].n != all[j].n {
return all[i].n > all[j].n
}
return all[i].id < all[j].id
})
t.Logf("\n--- Leaking patterns (prioritised; n=#GTs affected) ---")
t.Logf("Total distinct leaking patterns: %d", len(all))
for _, x := range all {
t.Logf(" n=%d %-9s [%s]", x.n, x.id, x.info)
}
// Regression guard: no domain-specific pattern may fire for an unrelated
// machine. A new leak means a pattern naming a foreign machine lacks its
// domain capability gate (pattern_domain_gates.go).
if len(all) > 0 {
t.Errorf("cross-domain leakage must be 0; %d patterns leaked. "+
"Add the betraying term → domain tag in pattern_domain_gates.go (and emit it in keyword_dictionary.go).",
len(all))
}
}