package iace

import (
	"encoding/json"
	"os"
	"path/filepath"
	"sort"
	"strings"
	"testing"
)

// ============================================================================
// Cross-GT real-narrative benchmark harness.
//
// Unlike gt_kistenhub_test.go (which feeds a hand-built MatchInput), this
// harness runs the FULL production pipeline: machine narrative → ParseNarrative
// → MatchInput → engine.Match → CompareBenchmark. That is exactly the path a
// real project WITHOUT ground truth takes, so it measures what actually ships.
//
// It runs every registered GT through the same code and prints per-GT plus a
// side-by-side table, so a generic engine change can be checked against ALL
// ground truths at once (no overfitting to a single machine).
// ============================================================================

// gtCase describes one ground-truth benchmark fixture.
type gtCase struct {
	// name is the human-readable label used in log output and failure messages.
	name string
	// path is the GT JSON file name; readGTNarrative resolves it relative to
	// the testdata directory.
	path string
	// machineType is handed to ParseNarrative and, when non-empty, becomes the
	// single entry of MatchInput.MachineTypes.
	machineType string
	// narrativeOverride, when non-empty, replaces the narrative read from the
	// GT JSON (its machine_description field, falling back to the GT's generic
	// description) before it is fed to ParseNarrative. Authored narratives are
	// intentionally NOT keyword-stuffed — they represent how an engineer would
	// describe the machine, so the benchmark stays honest about extraction
	// quality.
	narrativeOverride string
	// homeDomains lists the foreignDomainTerms domains that are NATIVE to this
	// machine, so the cross-domain precision guard does not flag a press's own
	// "stoessel"/"werkzeugeinbauraum" or a robot cell's "roboterzelle" as a leak.
	// Empty for machines whose domain has no entry in foreignDomainTerms.
	homeDomains []string
}

// gtBenchmarkCases is the registry the harness iterates over. Add a new GT
// here and it is automatically cross-validated against every engine change.
var gtBenchmarkCases = []gtCase{
	{
		name:        "Bremse (Roboterzelle)",
		path:        "ground_truth_bremse.json",
		machineType: "robotics_cobot",
		narrativeOverride: "Automatisierte Roboterzelle zur Handhabung und Bearbeitung von " +
			"Bremsscheiben. Ein Industrieroboter mit Greifer entnimmt Bremsscheiben vom " +
			"Foerderband und legt sie in eine Bearbeitungsstation mit Drehtisch. Die Zelle ist " +
			"mit Schutzzaun, verriegelter Schutztuer und Lichtgitter gesichert. Antrieb ueber " +
			"Servomotoren und Frequenzumrichter, Steuerung ueber Sicherheits-SPS und Bedienpult. " +
			"Pneumatische Greifer und Spannvorrichtungen. Betrieb im Automatikbetrieb, Einrichten " +
			"und Einlernen (Teachen), Wartung und Stoerungsbeseitigung. Gefaehrdungen durch " +
			"Quetschen und Einzug bei Roboterbewegung, elektrische Energie und Druckluft.",
		// "robot" terms in foreignDomainTerms are native to this cell.
		homeDomains: []string{"robot"},
	},
	{
		// No homeDomains: the lift has no native entry in foreignDomainTerms.
		name:        "Kistenhub (Hebevorrichtung)",
		path:        "ground_truth_kistenhub.json",
		machineType: "lift",
		narrativeOverride: "Mobiles, fahrbares Kistenhubgeraet zum Heben und Positionieren von " +
			"Kisten und Lasten. Eine elektrisch angetriebene Hubplattform (Scherenhubtisch) hebt " +
			"die Last ueber ein Hubwerk. Antrieb ueber Elektromotor, Schaltschrank und Steuerung " +
			"mit Bedienpult. Das Geraet steht auf einem fahrbaren Fahrwerk mit Lenkrollen, daher " +
			"sind Standsicherheit und Kippgefahr relevant. Bediener heben Kisten manuell auf die " +
			"Plattform. Betrieb, manuelle Bedienung, Wartung, Reinigung und Transport. Elektrische " +
			"Gefaehrdungen durch Netzanschluss, Schaltschrank und Leitungen.",
	},
}

// precisionOnlyCases are real machines from breakpilot-core/docs-src that have a
// Grenzen description but NO expert GT hazard list, so they cannot be coverage-
// benchmarked — only checked for cross-domain precision (no foreign-domain
// nonsense). They diversify the gating guard beyond the 2 ground truths (lift +
// robot cell) across a press, a cobot, a motor and a welding system.
Each leak // they would otherwise produce (pool, carousel, paint booth, tank farm, lathe // chuck, band saw, robot-into-press ...) is now a permanent regression guard. var precisionOnlyCases = []gtCase{ { name: "Kniehebelpresse (Presse)", machineType: "mechanical_press", homeDomains: []string{"press"}, narrativeOverride: "Vollautomatische Kniehebelpresse zur Kaltmassivumformung metallischer " + "Rohlinge. Eine Transferanlage fuehrt Rohlinge ueber ein Foerderband in die Presse, wo sie " + "in mehreren Stufen im Werkzeugeinbauraum zwischen Ober- und Unterwerkzeug umgeformt werden. " + "Stoessel mit Schwungradantrieb, Hydraulikoel und Druckluft im System, integrierte " + "Schmieranlage und Absaugung. Schutzumhausung mit verriegelten Tueren. Elektrische " + "Versorgung 400 V, Steuerung ueber SPS. Betrieb vollautomatisch, Einrichten und Umruesten, " + "Instandhaltung. Impulslaerm und heisse Werkstuecke beim Pressvorgang.", }, { name: "Eigenbauzelle (Cobot)", machineType: "robotics_cobot", homeDomains: []string{"robot"}, narrativeOverride: "Arbeitstisch mit integriertem kollaborierendem Roboterarm (Cobot) zur " + "Bestueckung von Maschinen. Ein Sicherheitsscanner setzt den Roboterarm bei Annaeherung " + "still. Programmierung ueber Touchscreen. Spannungsversorgung 230 V. Quetsch- und " + "Stossgefahr im Roboterarbeitsraum durch Bewegung des Roboterarms. Betrieb kollaborierend " + "und nicht kollaborierend, Teachen und Programmieren, Reinigung, Instandhaltung.", }, { name: "Elektromotoren (Antrieb)", machineType: "general_industry", homeDomains: nil, narrativeOverride: "Gleichstrom- und Asynchronmotoren mit oder ohne integriertes Getriebe als " + "Antrieb in Maschinen. Energieversorgung 24 bis 400 V Gleich- und Wechselstrom. Rotierende " + "Welle und bewegliche Teile des Motors, Gehaeuse mit Stromschlag- und Erhitzungsgefahr, " + "elektrische Anschluesse, Uebertemperaturueberwachung und Schutzleiter. 
Betrieb, Montage, " + "Reinigung, Instandhaltung, Demontage.", }, { name: "Schwingarm (Rundschweissanlage)", machineType: "welding", homeDomains: []string{"welding"}, narrativeOverride: "Rundschweissanlage Schwingarm als Auf-Tisch-Version zum Schweissen von " + "Rundnaehten. Pneumatisch bewegter Brennerarm, Anschluss an MIG/MAG- und TIG-Stromquellen, " + "maximaler Schweissstrom 350 A. Werkstuecke werden in zwei Backenfuttern eingespannt und " + "pneumatisch gesichert, rotierende Werkstueckaufnahme mit Reitstock. Formiergas durch die " + "Hohlwelle. Leitfaehige Gehaeuseoberflaechen, Brenner mit Verbrennungsgefahr. Bedienung " + "ueber Fusspedal, integrierte Steuerung.", }, } // readGTNarrative extracts a machine narrative from the raw GT JSON, trying the // richer machine_description field before the generic description. func readGTNarrative(t *testing.T, path string) (gt GroundTruth, narrative, machineName string) { t.Helper() raw, err := os.ReadFile(filepath.Join("testdata", path)) if err != nil { t.Fatalf("read GT %s: %v", path, err) } if err := json.Unmarshal(raw, >); err != nil { t.Fatalf("parse GT %s: %v", path, err) } var extra struct { MachineName string `json:"machine_name"` MachineDescription string `json:"machine_description"` } _ = json.Unmarshal(raw, &extra) narrative = extra.MachineDescription if narrative == "" { narrative = gt.Description } return gt, narrative, extra.MachineName } // parseResultToMatchInput converts the deterministic narrative parse into the // engine's MatchInput, mirroring what the production handler does. 
func parseResultToMatchInput(pr ParseResult, machineType string) MatchInput { compIDs := make([]string, 0, len(pr.Components)) for _, c := range pr.Components { compIDs = append(compIDs, c.LibraryID) } energyIDs := make([]string, 0, len(pr.EnergySources)) for _, e := range pr.EnergySources { energyIDs = append(energyIDs, e.SourceID) } mt := []string{} if machineType != "" { mt = []string{machineType} } return MatchInput{ ComponentLibraryIDs: compIDs, EnergySourceIDs: energyIDs, LifecyclePhases: pr.LifecyclePhases, CustomTags: pr.CustomTags, OperationalStates: pr.OperationalStates, StateTransitions: pr.StateTransitions, HumanRoles: pr.Roles, MachineTypes: mt, } } // runGTCase runs the full narrative→measures pipeline for one GT and returns // the benchmark result plus the parse result for extraction-quality reporting. func runGTCase(t *testing.T, c gtCase) (*BenchmarkResult, ParseResult) { gt, narrative, _ := readGTNarrative(t, c.path) if c.narrativeOverride != "" { narrative = c.narrativeOverride } pr := ParseNarrative(narrative, c.machineType) input := parseResultToMatchInput(pr, c.machineType) engine := NewPatternEngine() out := engine.Match(input) hazards, mitigations := patternsToHazardsAndMitigations(out) return CompareBenchmark(>, hazards, mitigations), pr } // TestGT_RealNarrativeBenchmark runs every registered GT through the real // pipeline and prints a side-by-side comparison. 
Reporting only (no hard // thresholds yet) — run with: // // go test -v -vet=off -run TestGT_RealNarrativeBenchmark ./internal/iace/ func TestGT_RealNarrativeBenchmark(t *testing.T) { type row struct { name string comps, energy, tags int gtN, matched, extra int coverage, precision, measC float64 } var rows []row for _, c := range gtBenchmarkCases { res, pr := runGTCase(t, c) precision := 0.0 if res.TotalEngine > 0 { precision = float64(len(res.MatchedPairs)) / float64(res.TotalEngine) } rows = append(rows, row{ name: c.name, comps: len(pr.Components), energy: len(pr.EnergySources), tags: len(pr.CustomTags), gtN: res.TotalGT, matched: len(res.MatchedPairs), extra: len(res.ExtraInEngine), coverage: res.CoverageScore, precision: precision, measC: res.MeasureCoverage, }) t.Logf("=== %s (machine_type=%s) ===", c.name, c.machineType) t.Logf(" Narrative extraction: %d components, %d energy sources, %d custom tags", len(pr.Components), len(pr.EnergySources), len(pr.CustomTags)) t.Logf(" Coverage: %.1f%% (%d/%d) | Precision: %.1f%% | Measure: %.1f%% | Extras: %d", res.CoverageScore*100, len(res.MatchedPairs), res.TotalGT, precision*100, res.MeasureCoverage*100, len(res.ExtraInEngine)) sample := res.ExtraInEngine if len(sample) > 18 { sample = sample[:18] } t.Logf(" --- Extra-Sample (unmatched engine hazards) ---") for _, e := range sample { t.Logf(" [%s] %s", e.Category, abbrev(e.Name, 70)) } } t.Logf("\n=== Cross-GT summary (real narrative pipeline) ===") t.Logf(" %-28s %5s %5s %5s | %8s %9s %8s", "GT", "comp", "enrg", "tags", "coverage", "precision", "measure") for _, r := range rows { t.Logf(" %-28s %5d %5d %5d | %7.1f%% %8.1f%% %7.1f%%", r.name, r.comps, r.energy, r.tags, r.coverage*100, r.precision*100, r.measC*100) } // Regression guard: the real narrative pipeline (what ships for projects // without a GT) must keep high recall on both validated machines. 
const coverageFloor = 0.90 for _, r := range rows { if r.coverage < coverageFloor { t.Errorf("%s: real-pipeline coverage %.1f%% below floor %.0f%%", r.name, r.coverage*100, coverageFloor*100) } } } // foreignDomainTerms are machine-specific terms that betray a pattern's home // domain. If a pattern's own scenario/name contains one of these but the // pattern fires for an unrelated machine (a lift, a robot cell), it has leaked // across domains — the precision bug. Used to prioritise capability-domain // gating by real leak frequency, not guesswork. var foreignDomainTerms = map[string]string{ "spritzgie": "plastics", "extruder": "plastics", "kunststoffschmelze": "plastics", "spinnmaschine": "textile", "webmaschine": "textile", "spinnerei": "textile", "zweiwalzenwerk": "rolling", "walzwerk": "rolling", "kalander": "rolling", "gondel": "wind_lift", "pv-modul": "solar", "photovoltaik": "solar", "pv-anlage": "solar", "presse": "press", "schliesseinheit": "plastics", "drehmaschine": "cnc", "fraesmaschine": "cnc", "schleifscheibe": "grinding", "traktor": "agri", "harvester": "agri", "maehdrescher": "agri", "ballenpresse": "agri", "schweissen": "welding", "lichtbogenschweiss": "welding", "rolltreppe": "escalator", "fahrtreppe": "escalator", "spinnerei ": "textile", "extrusion": "plastics", // construction / mobile machinery "radlader": "construction", "bagger": "construction", "mobilkran": "crane", "betonpump": "construction", "strassenwalze": "construction", "strassenbau": "construction", // press / forming tool space "werkzeugeinbauraum": "press", "stoessel": "press", "oberwerkzeug": "press", "unterwerkzeug": "press", "abfuellstempel": "filling", // machining coolant "kss-": "machining", "kuehlschmierstoff": "machining", // confined space / bulk material "silo": "bulk", "gaerbehaelter": "bulk", "getreidesilo": "bulk", "mehlsilo": "bulk", "schuettgut": "bulk", "sauerstoffmangel": "confined_space", "erstickung": "confined_space", // medical "patient": "medical", 
"sterilis": "medical", "defibrill": "medical", // outdoor / biological / cold "zecke": "outdoor", "hantavirus": "outdoor", "schimmel": "environmental", "nagerkot": "outdoor", "winterarbeit": "outdoor", "tiefkuehl": "cold", "unterkuehl": "cold", // playground / fitness "klettergeraet": "playground", "spielplatz": "playground", "kraftstation": "fitness", "bankdrueck": "fitness", "kniebeug": "fitness", "schaukelkette": "playground", "nestschaukel": "playground", // palletizer "palettierer": "palletizer", // aquatic / pool "schwimmbecken": "aquatic", "schwimmbad": "aquatic", "beckenumrandung": "aquatic", "massageduese": "aquatic", "schwimmbadtechnik": "aquatic", "sprungturm": "aquatic", // amusement "karussell": "amusement", "fahrgeschaeft": "amusement", "riesenrad": "amusement", // mobile machine with driver cab "fahrersitz": "mobile_cab", "fahrerkabine": "mobile_cab", "fahrerstand": "mobile_cab", // coating / paint booth "lackier": "coating", "loesemitteldampf": "coating", "pulverbeschicht": "coating", // ex process / tank farm "tanklager": "exproc", "raffinerie": "exproc", // chemical reactor "reaktor": "chem", "mischbereich": "chem", "exotherme reaktion": "chem", // oxygen / gas supply "sauerstoffanreicherung": "o2", "sauerstoff-versorgung": "o2", // lathe / chip machining "drehfutter": "cnc", "spannfutterbacke": "cnc", "spaeneflug": "cnc", "spanflug": "cnc", "spindelumgebung": "cnc", // sawing "bandsaege": "sawing", "saegeband": "sawing", // film / carton converting "folienwickler": "converting", "folientrennbereich": "converting", "kartonschneider": "converting", // blow molding (plastics) "blasformwerkzeug": "plastics", "blasstation": "plastics", // textile cutting "stoffauflage": "textile", "konfektionierung": "textile", // asbestos legacy "asbest": "asbestos", // robot (home for cobot/robot-cell cases via homeDomains) "roboterzelle": "robot", "schwenkbereich roboter": "robot", "roboter-arbeitsraum": "robot", } // TestGT_DomainLeakage names the patterns that 
leak across domains. For each GT // it runs the real pipeline, then flags every fired pattern whose own scenario // text references a foreign machine. The output is the prioritised gating list // for capability-domain hardening. // // go test -v -vet=off -run TestGT_DomainLeakage ./internal/iace/ func TestGT_DomainLeakage(t *testing.T) { leakCount := map[string]int{} // patternID → #GTs it leaked into leakInfo := map[string]string{} for _, c := range gtBenchmarkCases { _, narrative, _ := readGTNarrative(t, c.path) if c.narrativeOverride != "" { narrative = c.narrativeOverride } home := make(map[string]bool, len(c.homeDomains)) for _, d := range c.homeDomains { home[d] = true } pr := ParseNarrative(narrative, c.machineType) out := NewPatternEngine().Match(parseResultToMatchInput(pr, c.machineType)) var leaks []string for _, pm := range out.MatchedPatterns { text := normalizeDE(pm.PatternName + " " + pm.ScenarioDE) for term, domain := range foreignDomainTerms { if home[domain] { continue // native to this machine — not a leak } if strings.Contains(text, term) { leaks = append(leaks, pm.PatternID) leakCount[pm.PatternID]++ leakInfo[pm.PatternID] = domain + " :: " + abbrev(pm.ScenarioDE, 55) break } } } sort.Strings(leaks) t.Logf("=== %s (machine_type=%s): %d/%d fired patterns leaked from foreign domains ===", c.name, c.machineType, len(leaks), len(out.MatchedPatterns)) } type lk struct { id, info string n int } var all []lk for id, n := range leakCount { all = append(all, lk{id, leakInfo[id], n}) } sort.Slice(all, func(i, j int) bool { if all[i].n != all[j].n { return all[i].n > all[j].n } return all[i].id < all[j].id }) t.Logf("\n--- Leaking patterns (prioritised; n=#GTs affected) ---") t.Logf("Total distinct leaking patterns: %d", len(all)) for _, x := range all { t.Logf(" n=%d %-9s [%s]", x.n, x.id, x.info) } // Regression guard: no domain-specific pattern may fire for an unrelated // machine. 
A new leak means a pattern naming a foreign machine lacks its // domain capability gate (pattern_domain_gates.go). if len(all) > 0 { t.Errorf("cross-domain leakage must be 0; %d patterns leaked. "+ "Add the betraying term → domain tag in pattern_domain_gates.go (and emit it in keyword_dictionary.go).", len(all)) } }