From 7287e989a6fe02a74456c7eaf1fea3e26a6d24d0 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 00:30:10 +0200 Subject: [PATCH 01/11] fix(ai-sdk): battery hazards require a battery, not generic stored_energy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HP753 (lithium thermal runaway), HP754 (battery off-gassing) and HP755 (HV battery shock) were gated on stored_energy, which a frequency converter (C034, DC-link capacitors) legitimately carries — so they leaked into any machine with a VFD (surfaced by the dishwasher after the Frequenzumrichter narrative). Now require the "battery" tag; add lithium/batteriespeicher synonyms so real battery-storage machines still emit it. GT #3 100% recall unchanged, battery themes gone from the dishwasher log; Kistenhub 97.1% and Bremse pinned mappings unchanged. Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/hazard_patterns_specific_machines.go | 6 +++--- ai-compliance-sdk/internal/iace/keyword_dictionary.go | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ai-compliance-sdk/internal/iace/hazard_patterns_specific_machines.go b/ai-compliance-sdk/internal/iace/hazard_patterns_specific_machines.go index f89d2af9..afde4519 100644 --- a/ai-compliance-sdk/internal/iace/hazard_patterns_specific_machines.go +++ b/ai-compliance-sdk/internal/iace/hazard_patterns_specific_machines.go @@ -375,7 +375,7 @@ func GetSpecificMachinePatterns() []HazardPattern { // ================================================================ { ID: "HP753", NameDE: "Thermal Runaway bei Lithium-Batterie", NameEN: "Thermal runaway of lithium battery", - RequiredComponentTags: []string{"stored_energy", "high_temperature"}, + RequiredComponentTags: []string{"battery", "high_temperature"}, RequiredEnergyTags: []string{"electrical_energy", "thermal"}, GeneratedHazardCats: []string{"thermal_hazard", "electrical_hazard"}, SuggestedMeasureIDs: []string{"M005", "M141"}, @@ -390,7 +390,7 
@@ func GetSpecificMachinePatterns() []HazardPattern { }, { ID: "HP754", NameDE: "Ausgasung giftiger Daempfe aus Batterie", NameEN: "Toxic gas emission from battery", - RequiredComponentTags: []string{"stored_energy", "chemical_risk"}, + RequiredComponentTags: []string{"battery", "chemical_risk"}, RequiredEnergyTags: []string{}, GeneratedHazardCats: []string{"material_environmental"}, SuggestedMeasureIDs: []string{"M005", "M141"}, @@ -405,7 +405,7 @@ func GetSpecificMachinePatterns() []HazardPattern { }, { ID: "HP755", NameDE: "Elektrischer Schlag an Hochvolt-Batteriespeicher", NameEN: "Electric shock from high-voltage battery storage", - RequiredComponentTags: []string{"stored_energy", "electrical_part"}, + RequiredComponentTags: []string{"battery", "electrical_part"}, RequiredEnergyTags: []string{"electrical_energy"}, GeneratedHazardCats: []string{"electrical_hazard"}, SuggestedMeasureIDs: []string{"M082", "M141"}, diff --git a/ai-compliance-sdk/internal/iace/keyword_dictionary.go b/ai-compliance-sdk/internal/iace/keyword_dictionary.go index 0522bd6a..5db122ba 100644 --- a/ai-compliance-sdk/internal/iace/keyword_dictionary.go +++ b/ai-compliance-sdk/internal/iace/keyword_dictionary.go @@ -137,7 +137,7 @@ func GetKeywordDictionary() []KeywordEntry { {Keywords: []string{"kreiselmaeher", "scheibenmaeher", "maehwerk"}, ExtraTags: []string{"agri_mower"}}, {Keywords: []string{"spruehduese", "spritzduese", "spruehkopf"}, ExtraTags: []string{"spray_nozzle"}}, {Keywords: []string{"galvanikbad", "tauchbad", "beizbad", "chemiebad"}, ExtraTags: []string{"chemical_bath"}}, - {Keywords: []string{"batterie", "akku", "akkumulator", "traktionsbatterie"}, ExtraTags: []string{"battery"}}, + {Keywords: []string{"batterie", "akku", "akkumulator", "traktionsbatterie", "lithium", "batteriespeicher", "hochvoltbatterie", "lithium-batterie"}, ExtraTags: []string{"battery"}}, {Keywords: []string{"heizelement", "heizpatrone", "heizband"}, ExtraTags: []string{"heating_element"}}, {Keywords: 
[]string{"uv-lampe", "uv-strahler", "uv-c-strahler"}, ExtraTags: []string{"uv_source"}}, {Keywords: []string{"roentgen", "radioaktiv", "strahlenquelle", "gammastrahl", "isotop"}, ExtraTags: []string{"radiation_source"}}, From 33790bb5e7afa22ffbaf51402ffa403cfc8c30c2 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 00:41:41 +0200 Subject: [PATCH 02/11] fix(ai-sdk): pneumatic restenergy hazard requires actual pneumatics HP1717 was gated on the generic stored_energy tag (carried by a frequency converter's DC link) + pneumatic_pressure (emitted by "Boiler unter Druck"), so it leaked into the dishwasher despite the absence of any pneumatics. Require pneumatic_part instead. The Bremse pin is a static pattern->measure check (unaffected); full suite incl. Bremse coverage and Kistenhub 97.1% unchanged. Co-Authored-By: Claude Opus 4.7 --- ai-compliance-sdk/internal/iace/hazard_patterns_gt_bremse.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ai-compliance-sdk/internal/iace/hazard_patterns_gt_bremse.go b/ai-compliance-sdk/internal/iace/hazard_patterns_gt_bremse.go index 3d70e684..bf1a4215 100644 --- a/ai-compliance-sdk/internal/iace/hazard_patterns_gt_bremse.go +++ b/ai-compliance-sdk/internal/iace/hazard_patterns_gt_bremse.go @@ -157,7 +157,7 @@ func GetGTBremseHazardPatterns() []HazardPattern { // ════════════════════════════════════════════════════════════════ { ID: "HP1717", NameDE: "Verletzung durch unvermittelt austretende pneumatische Restenergie", NameEN: "Injury from unexpectedly released pneumatic stored energy", - RequiredComponentTags: []string{"stored_energy"}, + RequiredComponentTags: []string{"pneumatic_part"}, RequiredEnergyTags: []string{"pneumatic_pressure"}, GeneratedHazardCats: []string{"mechanical_hazard"}, SuggestedMeasureIDs: []string{"M485", "M534", "M527"}, From 8f89fbf8a7403657c5d4e93e8586ac9bf5b99999 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 00:47:13 +0200 Subject: [PATCH 03/11] 
feat(ai-sdk): order the hazard log by ISO 12100 hazard group ListHazards returned hazards in pattern-firing order, which reads as a jumble. Sort by EN ISO 12100 hazard group (A. Mechanisch, B. Elektrisch, C. Thermisch, D. Pneumatik/Hydraulik, E. Laerm, F. Ergonomie, G. Stoffe, H. Software/Steuerung, I. Cyber, J. KI), stable within a group. Matches the frontend CATEGORY_LABELS. Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/hazard_category_order.go | 50 +++++++++++++++++++ .../internal/iace/store_hazards.go | 1 + 2 files changed, 51 insertions(+) create mode 100644 ai-compliance-sdk/internal/iace/hazard_category_order.go diff --git a/ai-compliance-sdk/internal/iace/hazard_category_order.go b/ai-compliance-sdk/internal/iace/hazard_category_order.go new file mode 100644 index 00000000..d2a39e75 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/hazard_category_order.go @@ -0,0 +1,50 @@ +package iace + +import "sort" + +// EN ISO 12100 hazard-group ordering for the hazard log. Without it the log is +// returned in pattern-firing order, which reads as a jumble. This groups the +// hazards top-down by type (A. Mechanisch, B. Elektrisch, C. Thermisch, …), +// matching the frontend CATEGORY_LABELS. +var isoCategoryRank = map[string]int{ + // A. Mechanisch + "mechanical_hazard": 10, "mechanical": 10, "maintenance_hazard": 11, + // B. Elektrisch + "electrical_hazard": 20, "electrical": 20, "emc_hazard": 21, + // C. Thermisch + "thermal_hazard": 30, "thermal": 30, "high_temperature": 31, "fire_explosion": 32, + // D. Pneumatik / Hydraulik + "pneumatic_hydraulic": 40, + // E. Laerm / Vibration + "noise_hazard": 50, "noise_vibration": 50, "vibration_hazard": 51, + // F. Ergonomie + "ergonomic_hazard": 60, "ergonomic": 60, + // G. Stoffe / Umwelt + "material_environmental": 70, "chemical_risk": 71, "radiation_hazard": 72, + // H. 
Software / Steuerung (funktionale Sicherheit) + "software_control": 80, "software_fault": 80, "safety_function_failure": 81, + "configuration_error": 82, "sensor_fault": 83, "hmi_error": 84, "mode_confusion": 85, + "communication_failure": 86, "update_failure": 87, + // I. Cyber / Netzwerk (zur Ordnungs-Vollstaendigkeit; im CE-Log ausgeschlossen) + "unauthorized_access": 90, "firmware_corruption": 91, "cyber_resilience": 92, + "cyber_network": 93, "logging_audit_failure": 94, "sensor_spoofing": 95, + // J. KI-spezifisch + "ai_specific": 100, "ai_misclassification": 100, "false_classification": 100, + "model_drift": 100, "data_poisoning": 100, "unintended_bias": 100, +} + +func categoryRank(cat string) int { + if r, ok := isoCategoryRank[cat]; ok { + return r + } + return 999 // unknown categories last +} + +// SortHazardsByISO12100 groups hazards by ISO 12100 hazard group. Stable: the +// relative order within a group (creation/priority order from the engine) is +// preserved. +func SortHazardsByISO12100(hazards []Hazard) { + sort.SliceStable(hazards, func(i, j int) bool { + return categoryRank(hazards[i].Category) < categoryRank(hazards[j].Category) + }) +} diff --git a/ai-compliance-sdk/internal/iace/store_hazards.go b/ai-compliance-sdk/internal/iace/store_hazards.go index 904e36cd..3bff2ab0 100644 --- a/ai-compliance-sdk/internal/iace/store_hazards.go +++ b/ai-compliance-sdk/internal/iace/store_hazards.go @@ -160,6 +160,7 @@ func (s *Store) ListHazards(ctx context.Context, projectID uuid.UUID) ([]Hazard, hazards = append(hazards, h) } + SortHazardsByISO12100(hazards) return hazards, nil } From a8c61eb320c81c168e2e0e02a28804373589ab33 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 01:35:14 +0200 Subject: [PATCH 04/11] fix(ai-sdk): warewashing-scoped supersession of generic thermal duplicates The generic hot-surface patterns HP016 (high_temperature) and HP018 (actuator burn) fire for dishwashers via broad tags and duplicate the precise 
warewashing pattern HP2201 (Boiler/Tank/Spuelkammer). Suppress HP016/HP018 only when dom_warewashing is present, so the specific pattern wins and the duplicate is dropped. Scoped to the domain tag -> Kistenhub/Bremse and every non-warewashing machine keep the generic patterns unchanged. Warewashing recall stays 100% (25/25), precision 90% -> 92.6% (2 dupes removed). Bremse 26 pins and Kistenhub benchmark unaffected. Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/pattern_enclosure.go | 21 +++++++++++++++++++ .../internal/iace/pattern_engine.go | 5 +++++ 2 files changed, 26 insertions(+) diff --git a/ai-compliance-sdk/internal/iace/pattern_enclosure.go b/ai-compliance-sdk/internal/iace/pattern_enclosure.go index a10cdf0e..343db431 100644 --- a/ai-compliance-sdk/internal/iace/pattern_enclosure.go +++ b/ai-compliance-sdk/internal/iace/pattern_enclosure.go @@ -42,3 +42,24 @@ func guardedLifecycles(p HazardPattern, tagSet map[string]bool) []string { } return p.ApplicableLifecycles } + +// Domain-specific supersession. +// +// A generic pattern that fires via a broad tag (e.g. high_temperature) can +// duplicate a domain-specific pattern that describes the same hazard more +// precisely. When the domain is present, the specific pattern wins and the +// generic duplicate is dropped. Scoped to the domain tag, so machines outside +// the domain keep the generic pattern — regression-safe by construction. +// +// HP016 (generic hot surfaces) -> HP2201 (Boiler/Tank/Spuelkammer) +// HP018 (actuator burn) -> HP2201 (same contact-burn hazard) +var genericSupersededByWarewashing = map[string]bool{ + "HP016": true, + "HP018": true, +} + +// supersededByDomainSpecific reports whether a generic pattern is replaced by a +// more precise equivalent that the project's domain already provides. 
+func supersededByDomainSpecific(p HazardPattern, tagSet map[string]bool) bool { + return tagSet["dom_warewashing"] && genericSupersededByWarewashing[p.ID] +} diff --git a/ai-compliance-sdk/internal/iace/pattern_engine.go b/ai-compliance-sdk/internal/iace/pattern_engine.go index cc37a560..3aacfc2d 100644 --- a/ai-compliance-sdk/internal/iace/pattern_engine.go +++ b/ai-compliance-sdk/internal/iace/pattern_engine.go @@ -416,6 +416,11 @@ func patternMatches(p HazardPattern, tagSet map[string]bool, input MatchInput) b return false } + // Domain-specific supersession (generic duplicate replaced by a precise one). + if supersededByDomainSpecific(p, tagSet) { + return false + } + return true } From 80862e707303a046b6660510f3ba43cecb3900ec Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 08:13:14 +0200 Subject: [PATCH 05/11] fix(ai-sdk): supersede foreign-framed stored-energy duplicate for warewashing HP013 (stored electrical energy) fires for dishwashers via the broad stored_energy tag but its zone is framed for Batteriefaecher/USV-Anlagen, which a dishwasher does not have. The precise residual-voltage pattern HP144 (Frequenzumrichter/Zwischenkreis, Priority 90) already fires and covers the same hazard. Add HP013 to the warewashing-scoped supersession set so the duplicate is dropped only when dom_warewashing is present. Warewashing recall stays 100% (25/25), precision 92.6% -> 96.2%. Kistenhub/Bremse keep HP013 (no dom_warewashing); 26 Bremse pins + benchmark unaffected. 
Co-Authored-By: Claude Opus 4.7 --- ai-compliance-sdk/internal/iace/pattern_enclosure.go | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/ai-compliance-sdk/internal/iace/pattern_enclosure.go b/ai-compliance-sdk/internal/iace/pattern_enclosure.go index 343db431..b9c3e675 100644 --- a/ai-compliance-sdk/internal/iace/pattern_enclosure.go +++ b/ai-compliance-sdk/internal/iace/pattern_enclosure.go @@ -51,11 +51,16 @@ func guardedLifecycles(p HazardPattern, tagSet map[string]bool) []string { // generic duplicate is dropped. Scoped to the domain tag, so machines outside // the domain keep the generic pattern — regression-safe by construction. // -// HP016 (generic hot surfaces) -> HP2201 (Boiler/Tank/Spuelkammer) -// HP018 (actuator burn) -> HP2201 (same contact-burn hazard) +// HP016 (generic hot surfaces) -> HP2201 (Boiler/Tank/Spuelkammer) +// HP018 (actuator burn) -> HP2201 (same contact-burn hazard) +// HP013 (stored electrical NRG) -> HP144 (residual voltage; HP013's zone is +// framed for Batteriefaecher/USV-Anlagen a +// dishwasher does not have, HP144 is the +// Frequenzumrichter/Zwischenkreis variant) var genericSupersededByWarewashing = map[string]bool{ "HP016": true, "HP018": true, + "HP013": true, } // supersededByDomainSpecific reports whether a generic pattern is replaced by a From 8674b2cd9a04965458e9abae6d4893cf5f1e1ed0 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 08:43:32 +0200 Subject: [PATCH 06/11] feat(ai-sdk): offline dedup-candidate proposer + deterministic GT wall (P2 slice 1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit First thin slice of the offline library-improvement proposer. DEV-TIME ONLY, propose-only — it never mutates the pattern library or the runtime. - FindDedupCandidates (proposer_dedup.go): structural near-duplicate detection over the fired patterns (category + measure/zone/scenario overlap). 
Bakes in the P1 lesson: only same-category pairs compare, and pairs with different operational states are never proposed (normal-operation vs maintenance are legitimately distinct, e.g. HP011 vs HP077). - ScreenSupersession (proposer_screen.go): the wall. A proposal is safe only if (1) dropping the hazard does not reduce GT recall AND (2) keep/drop do not credit DIFFERENT GT entries. Check 2 catches distinct hazards that merely share measures (HP2201 hot surface GT 1.3 vs HP2202 hot ware GT 1.4) which recall alone would wave through. On real warewashing output: 3 candidates -> 1 BLOCKED (distinct GT), 2 RECALL-SAFE for human/LLM review (the update + winding/friction near-dupes). Nothing auto-applied. All 3 GTs unaffected (read-only). The LLM judgement and a CLI/file queue are slice 2. Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/gt_warewashing_test.go | 54 ++++++- .../internal/iace/proposer_dedup.go | 152 ++++++++++++++++++ .../internal/iace/proposer_dedup_test.go | 67 ++++++++ .../internal/iace/proposer_screen.go | 61 +++++++ 4 files changed, 330 insertions(+), 4 deletions(-) create mode 100644 ai-compliance-sdk/internal/iace/proposer_dedup.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_dedup_test.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_screen.go diff --git a/ai-compliance-sdk/internal/iace/gt_warewashing_test.go b/ai-compliance-sdk/internal/iace/gt_warewashing_test.go index 6644bd36..71e8d960 100644 --- a/ai-compliance-sdk/internal/iace/gt_warewashing_test.go +++ b/ai-compliance-sdk/internal/iace/gt_warewashing_test.go @@ -45,7 +45,7 @@ var warewashingCyberCategories = map[string]bool{ // warewashingEngineOutput runs the production chain and returns the filtered // hazards/mitigations the user would see for the UC-M. 
-func warewashingEngineOutput() ([]Hazard, []Mitigation, int) { +func warewashingEngineOutput() ([]Hazard, []Mitigation, []PatternMatch) { res := ParseNarrative(warewashingNarrative, "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)") var compIDs, compNames []string @@ -94,7 +94,7 @@ func warewashingEngineOutput() ([]Hazard, []Mitigation, int) { filtered := *out filtered.MatchedPatterns = kept hazards, mitigations := patternsToHazardsAndMitigations(&filtered) - return hazards, mitigations, len(kept) + return hazards, mitigations, kept } func TestWarewashing_GTCoverage(t *testing.T) { @@ -119,8 +119,8 @@ func TestWarewashing_GTCoverage(t *testing.T) { t.Logf("Parsed components: %v", cn) } - hazards, mitigations, nPatterns := warewashingEngineOutput() - t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", nPatterns, len(hazards)) + hazards, mitigations, keptPatterns := warewashingEngineOutput() + t.Logf("Engine: %d patterns kept (relevance+cyber filter) -> %d hazards", len(keptPatterns), len(hazards)) result := CompareBenchmark(&gt, hazards, mitigations) precision := 0.0 @@ -180,3 +180,49 @@ func TestWarewashing_GTCoverage(t *testing.T) { t.Errorf("warewashing recall below 40%% floor: %.1f%%", result.CoverageScore*100) } } + +// TestWarewashing_DedupProposer exercises the offline dedup-candidate proposer +// end-to-end on the real warewashing engine output: detect candidates, screen +// each against the GT, and log the human-review queue. It asserts the WALL is +// self-consistent — a PASS verdict may never coincide with a recall drop. 
+func TestWarewashing_DedupProposer(t *testing.T) { + raw, err := os.ReadFile(filepath.Join("testdata", "ground_truth_warewashing.json")) + if err != nil { + t.Fatalf("read GT: %v", err) + } + var gt GroundTruth + if err := json.Unmarshal(raw, &gt); err != nil { + t.Fatalf("parse GT: %v", err) + } + + hazards, mits, kept := warewashingEngineOutput() + // 0.25 is a deliberately permissive candidate threshold: the proposer is meant + // to over-surface, because the deterministic GT wall below (and a human, and in + // slice 2 an LLM) is the precision filter — not the detector. + candidates := FindDedupCandidates(kept, 0.25) + t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept)) + + safe, blocked := 0, 0 + for _, c := range candidates { + sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName) + var verdict string + switch { + case sr.RecallAfter < sr.RecallBefore: + verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1 + case sr.DistinctGT: + verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1 + default: + verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1 + } + t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s", + verdict, c.KeepPattern, c.DropPattern, c.Score, + sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale) + + // The wall must be sound: Safe implies recall preserved AND not distinct. 
+ if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) { + t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern) + } + } + t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied", + safe, blocked) +} diff --git a/ai-compliance-sdk/internal/iace/proposer_dedup.go b/ai-compliance-sdk/internal/iace/proposer_dedup.go new file mode 100644 index 00000000..9ed0ccb8 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_dedup.go @@ -0,0 +1,152 @@ +package iace + +import ( + "fmt" + "math" + "regexp" + "sort" + "strings" +) + +// Offline dedup-candidate proposer (P2, type 1). DEV-TIME ONLY. +// +// It inspects the patterns that fired for one machine and proposes which look +// like duplicates, so a human (later an LLM) can decide a supersession/merge. It +// NEVER mutates the pattern library or the runtime — it only surfaces candidates. +// The deterministic GT screen (ScreenSupersession, proposer_screen.go) is the +// wall that proves a proposal is safe before a human ever sees it. +// +// Detection here is purely structural (category + zone + measure + scenario +// overlap) and therefore reproducible. Two safety rules bake in what P1 taught +// us about the dishwasher review: +// - only patterns with the SAME primary category are ever compared; +// - a pair with DIFFERENT operational states is NEVER proposed, because +// normal-operation and maintenance are legitimately distinct contexts with +// different protective measures (e.g. HP011 vs HP077). Merging them would +// erase the maintenance view. + +// DedupCandidate is a proposed near-duplicate pattern pair for one machine class. 
+type DedupCandidate struct { + KeepPattern string `json:"keep_pattern"` // higher-priority survivor + DropPattern string `json:"drop_pattern"` // supersession target + KeepName string `json:"keep_name"` + KeepHazardName string `json:"keep_hazard_name"` // keep pattern ScenarioDE (for the GT-distinctness screen) + DropName string `json:"drop_name"` // == generated hazard Name (ScenarioDE) of the drop pattern + Category string `json:"category"` + ZoneJaccard float64 `json:"zone_jaccard"` + MeasureJaccard float64 `json:"measure_jaccard"` + ScenarioJaccard float64 `json:"scenario_jaccard"` + Score float64 `json:"score"` + Rationale string `json:"rationale"` +} + +// FindDedupCandidates compares the fired patterns pairwise and returns near-dup +// candidates whose combined overlap score meets threshold, deterministically +// ordered (score desc, then drop-pattern id). The combined score weights measure +// overlap highest (shared measures are the strongest duplicate signal), then zone +// and scenario equally. 
+func FindDedupCandidates(fired []PatternMatch, threshold float64) []DedupCandidate { + var out []DedupCandidate + for i := 0; i < len(fired); i++ { + for j := i + 1; j < len(fired); j++ { + a, b := fired[i], fired[j] + ca := primaryCat(a) + if ca == "" || ca != primaryCat(b) { + continue + } + if !sameOpStateSet(a.OperationalStates, b.OperationalStates) { + continue // legitimate lifecycle variants — never propose a merge + } + zj := tokenJaccard(zoneTokenSet(a.ZoneDE), zoneTokenSet(b.ZoneDE)) + mj := tokenJaccard(toSet(a.SuggestedMeasureIDs), toSet(b.SuggestedMeasureIDs)) + sj := tokenJaccard(wordTokenSet(a.ScenarioDE), wordTokenSet(b.ScenarioDE)) + score := 0.4*mj + 0.3*zj + 0.3*sj + if score < threshold { + continue + } + keep, drop := a, b + if b.Priority > a.Priority { + keep, drop = b, a + } + out = append(out, DedupCandidate{ + KeepPattern: keep.PatternID, DropPattern: drop.PatternID, + KeepName: keep.PatternName, KeepHazardName: keep.ScenarioDE, DropName: drop.ScenarioDE, + Category: ca, ZoneJaccard: round2(zj), MeasureJaccard: round2(mj), + ScenarioJaccard: round2(sj), Score: round2(score), + Rationale: fmt.Sprintf( + "same category %q · measure overlap %.0f%% · zone overlap %.0f%% · scenario overlap %.0f%% → keep %s (P%d), supersede %s (P%d)", + ca, mj*100, zj*100, sj*100, keep.PatternID, keep.Priority, drop.PatternID, drop.Priority), + }) + } + } + sort.SliceStable(out, func(i, j int) bool { + if out[i].Score != out[j].Score { + return out[i].Score > out[j].Score + } + return out[i].DropPattern < out[j].DropPattern + }) + return out +} + +func primaryCat(pm PatternMatch) string { + if len(pm.HazardCats) == 0 { + return "" + } + return pm.HazardCats[0] +} + +func sameOpStateSet(a, b []string) bool { + sa, sb := toSet(a), toSet(b) + if len(sa) != len(sb) { + return false + } + for k := range sa { + if !sb[k] { + return false + } + } + return true +} + +var proposerWordSplit = regexp.MustCompile(`[^\p{L}]+`) + +// zoneTokenSet splits a comma-separated zone 
string into its component terms. +func zoneTokenSet(zone string) map[string]bool { + out := map[string]bool{} + for _, part := range strings.Split(strings.ToLower(zone), ",") { + if t := strings.TrimSpace(part); len([]rune(t)) >= 3 { + out[t] = true + } + } + return out +} + +// wordTokenSet tokenises free text into words of length >= 4 (drops connectives). +func wordTokenSet(s string) map[string]bool { + out := map[string]bool{} + for _, w := range proposerWordSplit.Split(strings.ToLower(s), -1) { + if len([]rune(w)) >= 4 { + out[w] = true + } + } + return out +} + +func tokenJaccard(a, b map[string]bool) float64 { + if len(a) == 0 && len(b) == 0 { + return 0 + } + inter := 0 + for k := range a { + if b[k] { + inter++ + } + } + union := len(a) + len(b) - inter + if union == 0 { + return 0 + } + return float64(inter) / float64(union) +} + +func round2(x float64) float64 { return math.Round(x*100) / 100 } diff --git a/ai-compliance-sdk/internal/iace/proposer_dedup_test.go b/ai-compliance-sdk/internal/iace/proposer_dedup_test.go new file mode 100644 index 00000000..d3418305 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_dedup_test.go @@ -0,0 +1,67 @@ +package iace + +import "testing" + +func mkPM(id, cat, zone, scenario string, prio int, measures, opstates []string) PatternMatch { + return PatternMatch{ + PatternID: id, PatternName: id, Priority: prio, + HazardCats: []string{cat}, ZoneDE: zone, ScenarioDE: scenario, + SuggestedMeasureIDs: measures, OperationalStates: opstates, + } +} + +func TestFindDedupCandidates_FindsOverlappingPair(t *testing.T) { + fired := []PatternMatch{ + mkPM("HPa", "update_failure", "Steuerung, SPS", "Software-Update der Steuerung scheitert nach Abbruch", 80, + []string{"M138", "M146"}, nil), + mkPM("HPb", "update_failure", "Steuerung, Antriebsregler", "Software-Update der Steuerung schlaegt fehl", 75, + []string{"M138", "M146", "M141"}, nil), + mkPM("HPc", "mechanical_hazard", "Tuer", "Quetschen der Finger an der Tuer", 70, + 
[]string{"M003"}, nil), + } + got := FindDedupCandidates(fired, 0.4) + if len(got) != 1 { + t.Fatalf("want 1 candidate, got %d: %+v", len(got), got) + } + // Higher-priority pattern survives, lower one is the drop target. + if got[0].KeepPattern != "HPa" || got[0].DropPattern != "HPb" { + t.Errorf("want keep HPa / drop HPb, got keep %s / drop %s", got[0].KeepPattern, got[0].DropPattern) + } + if got[0].DropName != "Software-Update der Steuerung schlaegt fehl" { + t.Errorf("DropName must equal drop pattern ScenarioDE, got %q", got[0].DropName) + } +} + +func TestFindDedupCandidates_LifecycleGuard(t *testing.T) { + // Same category, zone and measures — but normal-operation vs maintenance. + // These are legitimate variants (HP011 vs HP077) and must NOT be proposed. + fired := []PatternMatch{ + mkPM("HP011", "electrical_hazard", "Schaltschrank, Klemmenkasten", "Person beruehrt spannungsfuehrende Teile", 95, + []string{"M481", "M482"}, nil), + mkPM("HP077", "electrical_hazard", "Schaltschrank, Klemmenkasten", "Person beruehrt spannungsfuehrende Teile", 80, + []string{"M481", "M482"}, []string{"maintenance"}), + } + if got := FindDedupCandidates(fired, 0.4); len(got) != 0 { + t.Fatalf("lifecycle guard failed: want 0 candidates, got %d: %+v", len(got), got) + } +} + +func TestFindDedupCandidates_DifferentCategoryIgnored(t *testing.T) { + fired := []PatternMatch{ + mkPM("HPa", "thermal_hazard", "Boiler", "Heisse Oberflaeche am Boiler", 80, []string{"M071"}, nil), + mkPM("HPb", "mechanical_hazard", "Boiler", "Heisse Oberflaeche am Boiler", 80, []string{"M071"}, nil), + } + if got := FindDedupCandidates(fired, 0.3); len(got) != 0 { + t.Fatalf("cross-category pair must not be proposed, got %d", len(got)) + } +} + +func TestFindDedupCandidates_BelowThresholdDropped(t *testing.T) { + fired := []PatternMatch{ + mkPM("HPa", "mechanical_hazard", "Tuer", "Quetschen an der Tuer", 80, []string{"M003"}, nil), + mkPM("HPb", "mechanical_hazard", "Foerderband", "Einzug am Foerderband", 
80, []string{"M540"}, nil), + } + if got := FindDedupCandidates(fired, 0.4); len(got) != 0 { + t.Fatalf("disjoint pair must be below threshold, got %d: %+v", len(got), got) + } +} diff --git a/ai-compliance-sdk/internal/iace/proposer_screen.go b/ai-compliance-sdk/internal/iace/proposer_screen.go new file mode 100644 index 00000000..f7f582b5 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_screen.go @@ -0,0 +1,61 @@ +package iace + +import "github.com/google/uuid" + +// ScreenResult is the deterministic GT verdict for one proposed supersession. +type ScreenResult struct { + RecallBefore float64 `json:"recall_before"` + RecallAfter float64 `json:"recall_after"` + KeepGT string `json:"keep_gt,omitempty"` // GT entry the keeper credits (if any) + DropGT string `json:"drop_gt,omitempty"` // GT entry the drop credits (if any) + DistinctGT bool `json:"distinct_gt"` // keep & drop credit DIFFERENT GT entries -> distinct hazards + Safe bool `json:"safe"` // recall preserved AND not distinct +} + +// ScreenSupersession is the WALL between "propose" and "decide". A proposal is +// safe only if BOTH deterministic checks pass: +// +// 1. RECALL is not reduced when the drop-hazard (and its mitigations) is removed +// — otherwise the drop is load-bearing for GT coverage. +// 2. The two hazards do NOT credit DIFFERENT ground-truth entries. Recall alone +// is necessary but not sufficient: two genuinely distinct hazards that share +// the same measures (e.g. hot boiler surface vs hot ware on unloading) keep +// recall at 100% when one is dropped, yet must NOT be merged. If keep and +// drop each match a different GT entry, they are distinct. +// +// Whatever survives both is still only RECALL-SAFE — a candidate for a human (and +// in slice 2, an LLM) to confirm semantically. Deterministic; reuses +// CompareBenchmark; touches neither the library nor the runtime. 
+func ScreenSupersession(gt *GroundTruth, hazards []Hazard, mits []Mitigation, keepHazardName, dropHazardName string) ScreenResult { + before := CompareBenchmark(gt, hazards, mits) + + gtOf := map[string]string{} + for _, p := range before.MatchedPairs { + gtOf[p.EngineHazard.Name] = p.GTEntry.Nr + } + keepGT, dropGT := gtOf[keepHazardName], gtOf[dropHazardName] + distinct := keepGT != "" && dropGT != "" && keepGT != dropGT + + kept := make([]Hazard, 0, len(hazards)) + dropped := map[uuid.UUID]bool{} + for _, h := range hazards { + if h.Name == dropHazardName { + dropped[h.ID] = true + continue + } + kept = append(kept, h) + } + keptMits := make([]Mitigation, 0, len(mits)) + for _, m := range mits { + if !dropped[m.HazardID] { + keptMits = append(keptMits, m) + } + } + after := CompareBenchmark(gt, kept, keptMits) + + return ScreenResult{ + RecallBefore: before.CoverageScore, RecallAfter: after.CoverageScore, + KeepGT: keepGT, DropGT: dropGT, DistinctGT: distinct, + Safe: after.CoverageScore >= before.CoverageScore && !distinct, + } +} From 0ce4794767535cc075128325a19b43c237859905 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 08:56:04 +0200 Subject: [PATCH 07/11] feat(ai-sdk): pluggable LLM judgment over recall-safe dedup candidates (P2 slice 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the semantic judgement layer on top of the slice-1 detector + GT wall. DEV-TIME, propose-only — nothing mutates the library or runtime. - CandidateJudge interface with two implementations: HeuristicJudge (deterministic default/fallback, used in tests) and LLMJudge (offline, over the shared llm.ProviderRegistry via the LLMCompleter adapter). LLMJudge degrades to "uncertain" on any transport/parse error — it can never break a run. - BuildJudgePrompt: the ISO 12100 same-vs-distinct prompt, unit-tested deterministically even though the call is not. 
- RenderProposalQueue: markdown human-review queue with a suggested action per candidate (supersede / keep both / needs review). On real warewashing output the heuristic punts to "uncertain — needs the LLM judge" for exactly the two recall-safe near-dupes (HP807/HP033 update, HP101/HP096 winding-vs-friction), making the LLM's role explicit. All 3 GTs unaffected (read-only). Live qwen wiring + a CLI/file queue are slice 3. Co-Authored-By: Claude Opus 4.7 --- .../internal/iace/gt_warewashing_test.go | 43 +++-- .../internal/iace/proposer_judge.go | 174 ++++++++++++++++++ .../internal/iace/proposer_judge_test.go | 104 +++++++++++ .../internal/iace/proposer_queue.go | 47 +++++ 4 files changed, 351 insertions(+), 17 deletions(-) create mode 100644 ai-compliance-sdk/internal/iace/proposer_judge.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_judge_test.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_queue.go diff --git a/ai-compliance-sdk/internal/iace/gt_warewashing_test.go b/ai-compliance-sdk/internal/iace/gt_warewashing_test.go index 71e8d960..c42ff010 100644 --- a/ai-compliance-sdk/internal/iace/gt_warewashing_test.go +++ b/ai-compliance-sdk/internal/iace/gt_warewashing_test.go @@ -1,6 +1,7 @@ package iace import ( + "context" "encoding/json" "os" "path/filepath" @@ -196,33 +197,41 @@ func TestWarewashing_DedupProposer(t *testing.T) { } hazards, mits, kept := warewashingEngineOutput() + byID := map[string]PatternMatch{} + for _, pm := range kept { + byID[pm.PatternID] = pm + } // 0.25 is a deliberately permissive candidate threshold: the proposer is meant - // to over-surface, because the deterministic GT wall below (and a human, and in - // slice 2 an LLM) is the precision filter — not the detector. + // to over-surface, because the deterministic GT wall below (and a human, and the + // LLM judge) is the precision filter — not the detector. 
candidates := FindDedupCandidates(kept, 0.25) t.Logf("Proposer: %d dedup candidate(s) from %d fired patterns", len(candidates), len(kept)) - safe, blocked := 0, 0 + // Deterministic judge in the test; the dev-time CLI swaps in LLMJudge. + judge := HeuristicJudge{} + var judged []JudgedProposal + blocked := 0 for _, c := range candidates { sr := ScreenSupersession(&gt, hazards, mits, c.KeepHazardName, c.DropName) - var verdict string switch { case sr.RecallAfter < sr.RecallBefore: - verdict, blocked = "BLOCK (recall-load-bearing)", blocked+1 + t.Logf("[BLOCK recall-load-bearing] keep %s / drop %s", c.KeepPattern, c.DropPattern) + blocked++ case sr.DistinctGT: - verdict, blocked = "BLOCK (distinct GT "+sr.KeepGT+" vs "+sr.DropGT+")", blocked+1 + t.Logf("[BLOCK distinct GT %s vs %s] keep %s / drop %s", sr.KeepGT, sr.DropGT, c.KeepPattern, c.DropPattern) + blocked++ default: - verdict, safe = "RECALL-SAFE (needs semantic review)", safe+1 - } - t.Logf("[%s] keep %s / drop %s score=%.2f recall %.1f%%->%.1f%% | %s", - verdict, c.KeepPattern, c.DropPattern, c.Score, - sr.RecallBefore*100, sr.RecallAfter*100, c.Rationale) - - // The wall must be sound: Safe implies recall preserved AND not distinct.
- if sr.Safe && (sr.RecallAfter < sr.RecallBefore || sr.DistinctGT) { - t.Errorf("screen inconsistent for drop %s: Safe but recall dropped or distinct GT", c.DropPattern) + if !sr.Safe { + t.Errorf("RECALL-SAFE branch but ScreenResult.Safe=false for drop %s", c.DropPattern) + } + v, conf, rat := judge.Judge(context.Background(), c, byID[c.KeepPattern], byID[c.DropPattern]) + judged = append(judged, JudgedProposal{ + Candidate: c, Screen: sr, Verdict: v, Confidence: conf, Rationale: rat, Judge: judge.Name(), + }) } } - t.Logf("Proposer summary: %d RECALL-SAFE candidate(s) for human/LLM review, %d BLOCKED by the GT wall — propose-only, nothing auto-applied", - safe, blocked) + + t.Logf("\n%s", RenderProposalQueue("Gewerbliche Geschirrspuelmaschine (vernetzt)", judged)) + t.Logf("Proposer summary: %d candidate(s) in queue (judge=%s), %d BLOCKED by the GT wall — propose-only, nothing auto-applied", + len(judged), judge.Name(), blocked) } diff --git a/ai-compliance-sdk/internal/iace/proposer_judge.go b/ai-compliance-sdk/internal/iace/proposer_judge.go new file mode 100644 index 00000000..d068656e --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_judge.go @@ -0,0 +1,174 @@ +package iace + +import ( + "context" + "encoding/json" + "fmt" + "strings" + + "github.com/breakpilot/ai-compliance-sdk/internal/llm" +) + +// Semantic judgement over RECALL-SAFE dedup candidates (P2 slice 2). DEV-TIME, +// propose-only. The deterministic GT wall (proposer_screen.go) has already +// removed candidates that would drop recall or that credit different GT entries; +// the judge only adds an opinion on whether the survivors are truly the same +// hazard, plus a rationale, for the human review queue. It NEVER mutates anything. +// +// The judge is pluggable behind CandidateJudge so the runtime/tests stay +// deterministic (HeuristicJudge) while the dev-time CLI can plug in the +// non-deterministic LLM (LLMJudge over the shared llm.ProviderRegistry). 
+ +const ( + VerdictDuplicate = "duplicate" + VerdictDistinct = "distinct" + VerdictUncertain = "uncertain" +) + +// JudgedProposal is one candidate with its GT-wall result and the judge's opinion. +type JudgedProposal struct { + Candidate DedupCandidate `json:"candidate"` + Screen ScreenResult `json:"screen"` + Verdict string `json:"verdict"` + Confidence string `json:"confidence"` + Rationale string `json:"rationale"` + Judge string `json:"judge"` +} + +// CandidateJudge decides whether two near-duplicate patterns are the same hazard. +type CandidateJudge interface { + Name() string + Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (verdict, confidence, rationale string) +} + +// HeuristicJudge is the deterministic default/fallback. It only ever returns "low" +// confidence — it is a placeholder for the LLM, and it deliberately punts to +// "uncertain" on the hard cases (low text overlap, shared measures) so the queue +// makes clear exactly where the LLM earns its keep. +type HeuristicJudge struct{} + +func (HeuristicJudge) Name() string { return "heuristic" } + +func (HeuristicJudge) Judge(_ context.Context, c DedupCandidate, _, _ PatternMatch) (string, string, string) { + switch { + case c.ScenarioJaccard >= 0.5 || (c.ZoneJaccard >= 0.5 && c.MeasureJaccard >= 0.5): + return VerdictDuplicate, "low", "structural: high scenario, or combined zone+measure, overlap" + case c.MeasureJaccard >= 0.99 && c.ZoneJaccard == 0 && c.ScenarioJaccard < 0.3: + return VerdictDistinct, "low", "structural: identical measures but no zone/scenario overlap — likely distinct hazards sharing generic measures" + default: + return VerdictUncertain, "low", "structural signal inconclusive — needs the LLM judge" + } +} + +// LLMJudge asks an offline model to make the semantic call. Non-deterministic, so +// it lives only in the dev-time tool, never in tests or the runtime. It degrades +// to "uncertain" on any transport or parse error — it must never break the run. 
+type LLMJudge struct { + Completer LLMCompleter + MachineClass string +} + +func (LLMJudge) Name() string { return "llm" } + +func (j LLMJudge) Judge(ctx context.Context, c DedupCandidate, a, b PatternMatch) (string, string, string) { + system, user := BuildJudgePrompt(j.MachineClass, a, b) + raw, err := j.Completer.Complete(ctx, system, user) + if err != nil { + return VerdictUncertain, "low", "LLM error: " + err.Error() + } + return parseJudgeJSON(raw) +} + +// BuildJudgePrompt is the real LLM artifact — built and unit-tested deterministically +// even though the call itself is not. It frames the ISO 12100 same-vs-distinct +// question and forces a JSON answer. +func BuildJudgePrompt(machineClass string, a, b PatternMatch) (system, user string) { + system = "Du bist Sachverstaendiger fuer Maschinensicherheit nach EN ISO 12100. " + + "Entscheide, ob zwei generierte Gefaehrdungen fuer DIESE Maschine DIESELBE Gefaehrdung " + + "beschreiben (Dublette) oder fachlich VERSCHIEDENE Gefaehrdungen sind, die nur zufaellig " + + "dieselben Schutzmassnahmen teilen. Verschieden, wenn Wirkort, Ausloeser oder " + + "Schadensmechanismus abweichen — auch bei gleicher Kategorie und gleichen Massnahmen. 
" + + "Antworte AUSSCHLIESSLICH als JSON: " + + `{"verdict":"duplicate|distinct|uncertain","confidence":"high|medium|low","rationale":"..."}.` + user = fmt.Sprintf(`Maschinenklasse: %s + +Gefaehrdung A (%s): + Name: %s + Kategorie: %s + Zone: %s + Szenario: %s + Ausloeser: %s + Schaden: %s + Massnahmen: %s + +Gefaehrdung B (%s): + Name: %s + Kategorie: %s + Zone: %s + Szenario: %s + Ausloeser: %s + Schaden: %s + Massnahmen: %s + +Sind A und B dieselbe Gefaehrdung fuer diese Maschine?`, + machineClass, + a.PatternID, a.PatternName, primaryCat(a), a.ZoneDE, a.ScenarioDE, a.TriggerDE, a.HarmDE, strings.Join(a.SuggestedMeasureIDs, ", "), + b.PatternID, b.PatternName, primaryCat(b), b.ZoneDE, b.ScenarioDE, b.TriggerDE, b.HarmDE, strings.Join(b.SuggestedMeasureIDs, ", ")) + return system, user +} + +func parseJudgeJSON(raw string) (verdict, confidence, rationale string) { + start, end := strings.Index(raw, "{"), strings.LastIndex(raw, "}") + if start < 0 || end <= start { + return VerdictUncertain, "low", "unparseable LLM output" + } + var v struct { + Verdict string `json:"verdict"` + Confidence string `json:"confidence"` + Rationale string `json:"rationale"` + } + if err := json.Unmarshal([]byte(raw[start:end+1]), &v); err != nil { + return VerdictUncertain, "low", "unparseable LLM JSON: " + err.Error() + } + switch v.Verdict { + case VerdictDuplicate, VerdictDistinct, VerdictUncertain: + default: + v.Verdict = VerdictUncertain + } + if v.Confidence == "" { + v.Confidence = "low" + } + return v.Verdict, v.Confidence, v.Rationale +} + +// LLMCompleter is the minimal text-in/text-out the LLM judge needs. Tests pass a +// stub; the dev-time tool passes a registry-backed adapter (NewRegistryCompleter). 
+type LLMCompleter interface { + Complete(ctx context.Context, system, user string) (string, error) +} + +type registryCompleter struct { + reg *llm.ProviderRegistry + model string +} + +// NewRegistryCompleter adapts the shared llm.ProviderRegistry to LLMCompleter so +// the proposer can reuse the platform's offline model wiring (e.g. self-hosted qwen). +func NewRegistryCompleter(reg *llm.ProviderRegistry, model string) LLMCompleter { + return &registryCompleter{reg: reg, model: model} +} + +func (rc *registryCompleter) Complete(ctx context.Context, system, user string) (string, error) { + resp, err := rc.reg.Chat(ctx, &llm.ChatRequest{ + Model: rc.model, + Messages: []llm.Message{ + {Role: "system", Content: system}, + {Role: "user", Content: user}, + }, + Temperature: 0, + }) + if err != nil { + return "", err + } + return resp.Message.Content, nil +} diff --git a/ai-compliance-sdk/internal/iace/proposer_judge_test.go b/ai-compliance-sdk/internal/iace/proposer_judge_test.go new file mode 100644 index 00000000..fdfc043a --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_judge_test.go @@ -0,0 +1,104 @@ +package iace + +import ( + "context" + "errors" + "strings" + "testing" +) + +func TestHeuristicJudge_Verdicts(t *testing.T) { + tests := []struct { + name string + zone, meas float64 + scenario float64 + wantVerdict string + }{ + {"high scenario overlap -> duplicate", 0, 0.3, 0.6, VerdictDuplicate}, + {"high zone+measure -> duplicate", 0.6, 0.6, 0.1, VerdictDuplicate}, + {"identical measures, no text -> distinct", 0, 1.0, 0.0, VerdictDistinct}, + {"shared measures, low text -> uncertain", 0, 0.67, 0.19, VerdictUncertain}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + c := DedupCandidate{ZoneJaccard: tt.zone, MeasureJaccard: tt.meas, ScenarioJaccard: tt.scenario} + v, conf, _ := HeuristicJudge{}.Judge(context.Background(), c, PatternMatch{}, PatternMatch{}) + if v != tt.wantVerdict { + t.Errorf("verdict: want %s, got %s",
tt.wantVerdict, v) + } + if conf != "low" { + t.Errorf("heuristic confidence must be low, got %s", conf) + } + }) + } +} + +func TestBuildJudgePrompt_ContainsKeyFacts(t *testing.T) { + a := PatternMatch{PatternID: "HPa", PatternName: "Heisse Flaeche", HazardCats: []string{"thermal_hazard"}, + ZoneDE: "Boiler", ScenarioDE: "Beruehrung heisser Boiler", SuggestedMeasureIDs: []string{"M071"}} + b := PatternMatch{PatternID: "HPb", PatternName: "Heisses Spuelgut", HazardCats: []string{"thermal_hazard"}, + ZoneDE: "Spuelgut", ScenarioDE: "Beruehrung heisses Geschirr", SuggestedMeasureIDs: []string{"M071"}} + system, user := BuildJudgePrompt("Geschirrspuelmaschine", a, b) + + for _, want := range []string{"EN ISO 12100", "JSON", "verdict"} { + if !strings.Contains(system, want) { + t.Errorf("system prompt missing %q", want) + } + } + for _, want := range []string{"Geschirrspuelmaschine", "HPa", "HPb", "Boiler", "Spuelgut", "thermal_hazard"} { + if !strings.Contains(user, want) { + t.Errorf("user prompt missing %q", want) + } + } +} + +type fakeCompleter struct { + out string + err error +} + +func (f fakeCompleter) Complete(_ context.Context, _, _ string) (string, error) { return f.out, f.err } + +func TestLLMJudge_ParsesAndDegrades(t *testing.T) { + cand := DedupCandidate{KeepPattern: "HPa", DropPattern: "HPb"} + + // Well-formed JSON, even wrapped in chatter, parses. + j := LLMJudge{Completer: fakeCompleter{out: "Sicher. {\"verdict\":\"distinct\",\"confidence\":\"high\",\"rationale\":\"andere Wirkorte\"}"}, MachineClass: "x"} + if v, conf, r := j.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictDistinct || conf != "high" || r != "andere Wirkorte" { + t.Errorf("parse: got %s/%s/%q", v, conf, r) + } + + // Unknown verdict value normalises to uncertain. 
+ j2 := LLMJudge{Completer: fakeCompleter{out: `{"verdict":"maybe","confidence":"medium","rationale":"x"}`}} + if v, _, _ := j2.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain { + t.Errorf("unknown verdict must normalise to uncertain, got %s", v) + } + + // Transport error degrades gracefully, never panics. + j3 := LLMJudge{Completer: fakeCompleter{err: errors.New("connection refused")}} + if v, _, r := j3.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain || !strings.Contains(r, "LLM error") { + t.Errorf("error path: got %s / %q", v, r) + } + + // Garbage (no JSON) degrades to uncertain. + j4 := LLMJudge{Completer: fakeCompleter{out: "no json here"}} + if v, _, _ := j4.Judge(context.Background(), cand, PatternMatch{}, PatternMatch{}); v != VerdictUncertain { + t.Errorf("garbage must degrade to uncertain, got %s", v) + } +} + +func TestRenderProposalQueue_ShowsActions(t *testing.T) { + proposals := []JudgedProposal{ + { + Candidate: DedupCandidate{KeepPattern: "HP807", DropPattern: "HP033", Category: "update_failure", Score: 0.32}, + Screen: ScreenResult{RecallBefore: 1, RecallAfter: 1}, + Verdict: VerdictDuplicate, Confidence: "medium", Rationale: "same update failure", Judge: "llm", + }, + } + out := RenderProposalQueue("Geschirrspuelmaschine", proposals) + for _, want := range []string{"HP807", "HP033", "update_failure", "supersession", "Propose-only"} { + if !strings.Contains(out, want) { + t.Errorf("queue missing %q\n%s", want, out) + } + } +} diff --git a/ai-compliance-sdk/internal/iace/proposer_queue.go b/ai-compliance-sdk/internal/iace/proposer_queue.go new file mode 100644 index 00000000..6d7a1aa3 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_queue.go @@ -0,0 +1,47 @@ +package iace + +import ( + "fmt" + "strings" +) + +// RenderProposalQueue turns judged dedup proposals into the human-review queue +// (markdown). Deterministic. 
Nothing here applies a change — every entry is a +// suggestion for a human to confirm, edit, commit, and pin with a GT case. +func RenderProposalQueue(machine string, proposals []JudgedProposal) string { + var b strings.Builder + fmt.Fprintf(&b, "# Dedup proposal queue — %s\n\n", machine) + fmt.Fprintf(&b, "%d candidate(s) survived the deterministic GT wall. Propose-only — nothing is applied automatically.\n\n", len(proposals)) + + for i, p := range proposals { + c := p.Candidate + fmt.Fprintf(&b, "## %d. keep %s ⊃ drop %s [%s → %s (%s)]\n", + i+1, c.KeepPattern, c.DropPattern, p.Judge, p.Verdict, p.Confidence) + fmt.Fprintf(&b, "- category %s · score %.2f (measures %.0f%%, zone %.0f%%, scenario %.0f%%)\n", + c.Category, c.Score, c.MeasureJaccard*100, c.ZoneJaccard*100, c.ScenarioJaccard*100) + fmt.Fprintf(&b, "- GT recall %.1f%% → %.1f%% when %s is dropped (wall: %s)\n", + p.Screen.RecallBefore*100, p.Screen.RecallAfter*100, c.DropPattern, wallNote(p.Screen)) + fmt.Fprintf(&b, "- keep: %s\n- drop: %s\n", c.KeepHazardName, c.DropName) + fmt.Fprintf(&b, "- judge rationale: %s\n", p.Rationale) + fmt.Fprintf(&b, "- suggested action: %s\n\n", suggestedAction(p)) + } + return b.String() +} + +func wallNote(s ScreenResult) string { + if s.DistinctGT { + return fmt.Sprintf("distinct GT %s vs %s", s.KeepGT, s.DropGT) + } + return "recall-safe" +} + +func suggestedAction(p JudgedProposal) string { + switch p.Verdict { + case VerdictDuplicate: + return fmt.Sprintf("add %s to a supersession set, then a human confirms + commits + pins a GT case", p.Candidate.DropPattern) + case VerdictDistinct: + return "keep both — judge considers them distinct hazards" + default: + return "needs human (or higher-confidence LLM) review — no automatic action" + } +} From 8440ddfecb227eec30cc35f993e3e9a7c737dbd5 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 09:10:43 +0200 Subject: [PATCH 08/11] feat(ai-sdk): runnable iace-audit propose CLI + live LLM wiring (P2 slice 3) 
Makes the offline proposer runnable end-to-end. - BuildProposerInput (proposer_input.go): non-test engine->hazards path. The PatternMatch->Hazard converter is lifted out of the GT test files into production scope so both the tests and the CLI share one pipeline. - iace-audit propose []: detect candidates -> GT-screen survivors (when a ground truth is given) -> judge (HeuristicJudge by default, LLMJudge over ollama when IACE_PROPOSE_LLM=1) -> write the human-review queue to audit-reports/proposals.{md,json}. Propose-only. Smoke run on a dishwasher narrative: 32 fired -> 3 candidates -> queue with a confident duplicate, a confident distinct, and one punted to the LLM judge; GT wall recall-safe. Live qwen is opt-in via env; the heuristic default keeps the tool runnable (and CI deterministic) without a model. Proposal types 2-4 (foreign-framing gates, vocab->tag, coverage blind spots) remain for slice 4. Co-Authored-By: Claude Opus 4.7 --- ai-compliance-sdk/cmd/iace-audit/main.go | 4 +- ai-compliance-sdk/cmd/iace-audit/propose.go | 141 ++++++++++++++++++ .../internal/iace/gt_kistenhub_test.go | 61 -------- .../internal/iace/proposer_input.go | 123 +++++++++++++++ .../internal/iace/proposer_input_test.go | 25 ++++ 5 files changed, 292 insertions(+), 62 deletions(-) create mode 100644 ai-compliance-sdk/cmd/iace-audit/propose.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_input.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_input_test.go diff --git a/ai-compliance-sdk/cmd/iace-audit/main.go b/ai-compliance-sdk/cmd/iace-audit/main.go index eda07173..edc53001 100644 --- a/ai-compliance-sdk/cmd/iace-audit/main.go +++ b/ai-compliance-sdk/cmd/iace-audit/main.go @@ -34,6 +34,8 @@ func main() { cmdEcho(os.Args[2:]) case "hierarchy": cmdHierarchy(os.Args[2:]) + case "propose": + cmdPropose(os.Args[2:]) default: usage() os.Exit(2) @@ -41,7 +43,7 @@ func main() { } func usage() { - fmt.Fprintln(os.Stderr, "Usage: iace-audit [args]") + 
fmt.Fprintln(os.Stderr, "Usage: iace-audit [args]") } func cmdReachability(_ []string) { diff --git a/ai-compliance-sdk/cmd/iace-audit/propose.go b/ai-compliance-sdk/cmd/iace-audit/propose.go new file mode 100644 index 00000000..45667432 --- /dev/null +++ b/ai-compliance-sdk/cmd/iace-audit/propose.go @@ -0,0 +1,141 @@ +package main + +import ( + "context" + "encoding/json" + "fmt" + "os" + "strconv" + + "github.com/breakpilot/ai-compliance-sdk/internal/iace" + "github.com/breakpilot/ai-compliance-sdk/internal/llm" +) + +type narrativeInput struct { + MachineType string `json:"machine_type"` + Narrative string `json:"narrative"` + MachineTypes []string `json:"machine_types,omitempty"` +} + +// cmdPropose — Method P: offline dedup-candidate proposer. +// +// iace-audit propose [] +// +// Detect near-duplicate patterns, screen survivors against a ground truth (if +// given), judge them (heuristic by default, LLM when enabled), and write the +// human-review queue to audit-reports/proposals.{md,json}. Propose-only — it +// writes a report and never mutates the pattern library. 
+// +// Env: +// +// IACE_PROPOSE_THRESHOLD candidate score threshold (default 0.30) +// IACE_PROPOSE_LLM=1 use the offline LLM judge instead of the heuristic +// OLLAMA_URL ollama base URL (default http://localhost:11434) +// SELF_HOSTED_LLM_MODEL model name (default qwen2.5:32b-instruct) +func cmdPropose(args []string) { + if len(args) < 1 { + fmt.Fprintln(os.Stderr, "propose: usage: iace-audit propose []") + os.Exit(2) + } + + var in narrativeInput + must(readJSONFile(args[0], &in)) + if in.Narrative == "" { + fmt.Fprintln(os.Stderr, "propose: narrative is empty") + os.Exit(2) + } + + var gt *iace.GroundTruth + if len(args) >= 2 { + var g iace.GroundTruth + must(readJSONFile(args[1], &g)) + gt = &g + } + + threshold := envFloat("IACE_PROPOSE_THRESHOLD", 0.30) + hazards, mits, fired := iace.BuildProposerInput(in.Narrative, in.MachineType, in.MachineTypes) + candidates := iace.FindDedupCandidates(fired, threshold) + + byID := make(map[string]iace.PatternMatch, len(fired)) + for _, pm := range fired { + byID[pm.PatternID] = pm + } + + judge := selectJudge(in.MachineType) + ctx := context.Background() + + var proposals []iace.JudgedProposal + blocked := 0 + for _, c := range candidates { + var sr iace.ScreenResult + if gt != nil { + sr = iace.ScreenSupersession(gt, hazards, mits, c.KeepHazardName, c.DropName) + if sr.RecallAfter < sr.RecallBefore || sr.DistinctGT { + blocked++ + continue + } + } + v, conf, rat := judge.Judge(ctx, c, byID[c.KeepPattern], byID[c.DropPattern]) + proposals = append(proposals, iace.JudgedProposal{ + Candidate: c, Screen: sr, Verdict: v, Confidence: conf, Rationale: rat, Judge: judge.Name(), + }) + } + + writeText("audit-reports/proposals.md", iace.RenderProposalQueue(in.MachineType, proposals)) + writeJSON("audit-reports/proposals.json", proposals) + + printSummary("Method P — Dedup Proposer ("+judge.Name()+")", map[string]int{ + "fired_patterns": len(fired), + "candidates": len(candidates), + "in_queue": len(proposals), + "gt_blocked": 
blocked, + }) + if gt == nil { + fmt.Fprintln(os.Stderr, "note: no ground truth provided — GT wall NOT applied (candidates not recall-screened)") + } +} + +func selectJudge(machineClass string) iace.CandidateJudge { + if os.Getenv("IACE_PROPOSE_LLM") != "1" { + return iace.HeuristicJudge{} + } + base := envStr("OLLAMA_URL", "http://localhost:11434") + model := envStr("SELF_HOSTED_LLM_MODEL", "qwen2.5:32b-instruct") + reg := llm.NewProviderRegistry("ollama", "") + reg.Register(llm.NewOllamaAdapter(base, model)) + fmt.Printf("using LLM judge (ollama %s, model %s)\n", base, model) + return iace.LLMJudge{Completer: iace.NewRegistryCompleter(reg, model), MachineClass: machineClass} +} + +func readJSONFile(path string, v any) error { + raw, err := os.ReadFile(path) + if err != nil { + return err + } + return json.Unmarshal(raw, v) +} + +func writeText(path, content string) { + _ = os.MkdirAll("audit-reports", 0o755) + if err := os.WriteFile(path, []byte(content), 0o644); err != nil { + fmt.Fprintln(os.Stderr, "warn: could not write", path, err) + return + } + fmt.Println("→ wrote", path) +} + +func envStr(key, def string) string { + if v := os.Getenv(key); v != "" { + return v + } + return def +} + +func envFloat(key string, def float64) float64 { + if v := os.Getenv(key); v != "" { + if f, err := strconv.ParseFloat(v, 64); err == nil { + return f + } + } + return def +} diff --git a/ai-compliance-sdk/internal/iace/gt_kistenhub_test.go b/ai-compliance-sdk/internal/iace/gt_kistenhub_test.go index 2212d1da..1144adfb 100644 --- a/ai-compliance-sdk/internal/iace/gt_kistenhub_test.go +++ b/ai-compliance-sdk/internal/iace/gt_kistenhub_test.go @@ -7,8 +7,6 @@ import ( "path/filepath" "sort" "testing" - - "github.com/google/uuid" ) // TestKistenhub_GTCoverage runs the Kistenhubgeraet ground truth (37 entries) @@ -110,65 +108,6 @@ func TestKistenhub_GTCoverage(t *testing.T) { // patternsToHazardsAndMitigations converts a pattern match output into the // Hazard/Mitigation shapes 
that CompareBenchmark expects. Mirrors what // iace_handler_init.go does in production but without DB writes. -func patternsToHazardsAndMitigations(out *MatchOutput) ([]Hazard, []Mitigation) { - hazards := make([]Hazard, 0, len(out.MatchedPatterns)) - patternToHazard := make(map[string]uuid.UUID, len(out.MatchedPatterns)) - - for _, pm := range out.MatchedPatterns { - cat := "" - if len(pm.HazardCats) > 0 { - cat = pm.HazardCats[0] - } - zone := pm.ZoneDE - lifecycle := "" - if len(pm.ApplicableLifecycles) > 0 { - lifecycle = pm.ApplicableLifecycles[0] - } - h := Hazard{ - ID: uuid.New(), - Name: pm.ScenarioDE, - Category: cat, - Description: pm.ScenarioDE, - Scenario: pm.ScenarioDE, - TriggerEvent: pm.TriggerDE, - PossibleHarm: pm.HarmDE, - AffectedPerson: pm.AffectedDE, - HazardousZone: zone, - LifecyclePhase: lifecycle, - } - if h.Name == "" { - h.Name = pm.PatternName - } - hazards = append(hazards, h) - patternToHazard[pm.PatternID] = h.ID - } - - measureNames := make(map[string]string) - for _, m := range GetProtectiveMeasureLibrary() { - measureNames[m.ID] = m.Name - } - - var mitigations []Mitigation - for _, sm := range out.SuggestedMeasures { - name := measureNames[sm.MeasureID] - if name == "" { - name = sm.MeasureID - } - for _, srcPattern := range sm.SourcePatterns { - hid, ok := patternToHazard[srcPattern] - if !ok { - continue - } - mitigations = append(mitigations, Mitigation{ - ID: uuid.New(), - HazardID: hid, - Name: name, - }) - } - } - return hazards, mitigations -} - func abbrev(s string, max int) string { if len(s) <= max { return s diff --git a/ai-compliance-sdk/internal/iace/proposer_input.go b/ai-compliance-sdk/internal/iace/proposer_input.go new file mode 100644 index 00000000..c92c484e --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_input.go @@ -0,0 +1,123 @@ +package iace + +import "github.com/google/uuid" + +// Non-test plumbing for the offline proposer (P2 slice 3): run the engine for a +// narrative and produce the fired 
patterns + the engine-built hazards/mitigations +// the dedup proposer and GT screen consume. This is the same pipeline the GT +// benchmark tests use, lifted out of test scope so the dev-time CLI can call it. + +// universalLifecyclePhases are appended so patterns gated to a specific lifecycle +// (maintenance/cleaning/setup/fault clearing) still fire — the proposer wants the +// full hazard picture, not only normal-operation hazards. +var universalLifecyclePhases = []string{"normal_operation", "maintenance", "cleaning", "setup", "fault_clearing"} + +// BuildProposerInput parses a narrative, runs the pattern engine, keeps the +// narrative-relevant patterns, and returns the hazards, mitigations and fired +// patterns. NOTE: it does not apply the CE cyber-category skip, so the proposer +// view may include cyber/AI hazards that the CE log excludes — harmless for the +// GT recall screen (they match no CE ground-truth entry). +func BuildProposerInput(narrative, machineType string, extraMachineTypes []string) ([]Hazard, []Mitigation, []PatternMatch) { + res := ParseNarrative(narrative, machineType) + + var compIDs, compNames, energyIDs []string + for _, c := range res.Components { + if c.Negated { + continue + } + compIDs = append(compIDs, c.LibraryID) + compNames = append(compNames, c.NameDE) + } + for _, e := range res.EnergySources { + energyIDs = append(energyIDs, e.SourceID) + } + + machineTypes := append([]string{}, extraMachineTypes...) + if machineType != "" { + machineTypes = append(machineTypes, machineType) + } + lifecycles := append(append([]string{}, res.LifecyclePhases...), universalLifecyclePhases...) 
+ + out := NewPatternEngine().Match(MatchInput{ + ComponentLibraryIDs: compIDs, + EnergySourceIDs: energyIDs, + LifecyclePhases: lifecycles, + CustomTags: res.CustomTags, + OperationalStates: res.OperationalStates, + StateTransitions: res.StateTransitions, + HumanRoles: res.Roles, + MachineTypes: machineTypes, + }) + + kept := make([]PatternMatch, 0, len(out.MatchedPatterns)) + for _, pm := range out.MatchedPatterns { + if IsPatternRelevant(pm, narrative, compNames) { + kept = append(kept, pm) + } + } + filtered := *out + filtered.MatchedPatterns = kept + hazards, mits := patternsToHazardsAndMitigations(&filtered) + return hazards, mits, kept +} + +// patternsToHazardsAndMitigations converts engine output into the hazard/mitigation +// entities the benchmark + proposer compare on. Simplified vs InitializeProject +// (no risk estimation, no norm refs) — it only needs category/zone/scenario/measures. +func patternsToHazardsAndMitigations(out *MatchOutput) ([]Hazard, []Mitigation) { + hazards := make([]Hazard, 0, len(out.MatchedPatterns)) + patternToHazard := make(map[string]uuid.UUID, len(out.MatchedPatterns)) + + for _, pm := range out.MatchedPatterns { + cat := "" + if len(pm.HazardCats) > 0 { + cat = pm.HazardCats[0] + } + lifecycle := "" + if len(pm.ApplicableLifecycles) > 0 { + lifecycle = pm.ApplicableLifecycles[0] + } + h := Hazard{ + ID: uuid.New(), + Name: pm.ScenarioDE, + Category: cat, + Description: pm.ScenarioDE, + Scenario: pm.ScenarioDE, + TriggerEvent: pm.TriggerDE, + PossibleHarm: pm.HarmDE, + AffectedPerson: pm.AffectedDE, + HazardousZone: pm.ZoneDE, + LifecyclePhase: lifecycle, + } + if h.Name == "" { + h.Name = pm.PatternName + } + hazards = append(hazards, h) + patternToHazard[pm.PatternID] = h.ID + } + + measureNames := make(map[string]string) + for _, m := range GetProtectiveMeasureLibrary() { + measureNames[m.ID] = m.Name + } + + var mitigations []Mitigation + for _, sm := range out.SuggestedMeasures { + name := measureNames[sm.MeasureID] + if 
name == "" { + name = sm.MeasureID + } + for _, srcPattern := range sm.SourcePatterns { + hid, ok := patternToHazard[srcPattern] + if !ok { + continue + } + mitigations = append(mitigations, Mitigation{ + ID: uuid.New(), + HazardID: hid, + Name: name, + }) + } + } + return hazards, mitigations +} diff --git a/ai-compliance-sdk/internal/iace/proposer_input_test.go b/ai-compliance-sdk/internal/iace/proposer_input_test.go new file mode 100644 index 00000000..720f88a1 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_input_test.go @@ -0,0 +1,25 @@ +package iace + +import "testing" + +func TestBuildProposerInput_WarewashingFires(t *testing.T) { + hazards, _, fired := BuildProposerInput( + warewashingNarrative, + "Gewerbliche Untertisch-Geschirrspuelmaschine (vernetzt)", + []string{"food_processing"}, + ) + if len(fired) == 0 || len(hazards) == 0 { + t.Fatalf("want fired patterns + hazards, got %d patterns / %d hazards", len(fired), len(hazards)) + } + has := func(id string) bool { + for _, pm := range fired { + if pm.PatternID == id { + return true + } + } + return false + } + if !has("HP2201") { + t.Errorf("warewashing-specific HP2201 must fire via BuildProposerInput") + } +} From 662aec209a66eb87f88b4aea80a7e5cb5d0d162c Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 09:30:00 +0200 Subject: [PATCH 09/11] feat(ai-sdk): foreign-framing proposer (P2 slice 4, type 2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Surfaces fired patterns whose zone names terms the machine's narrative never mentions — foreign framing that leaks through terms not yet in domainGateTerms (once a term is a gate term, the ghost-pattern invariant already fences it out). - FindFramingCandidates (proposer_framing.go): per fired pattern, zone terms with no narrative echo (minus a generic hazard-location stoplist). 
Echo matching is bidirectional to survive German compounding (narrative "Steuerung" echoes zone "Steuerungssystem"). Heuristic verdict foreign (fully orphan) / plausible (partial). Over-surfaces by design — human/LLM is the precision filter. - Wired into iace-audit propose -> audit-reports/framing.{md,json}, threshold via IACE_FRAMING_MIN_ORPHAN (default 0.6). Honest finding: genuine wrong-MACHINE framing (Walzen, Transportbaender) no longer fires thanks to the machine-type gate; the residual is mostly cyber/control patterns with generic-industrial zone vocabulary, candidates for re-framing. Proposal types 3-4 (vocab->tag, coverage blind spots) remain for slice 5. Co-Authored-By: Claude Opus 4.7 --- ai-compliance-sdk/cmd/iace-audit/propose.go | 6 + .../internal/iace/proposer_framing.go | 154 ++++++++++++++++++ .../internal/iace/proposer_framing_test.go | 33 ++++ 3 files changed, 193 insertions(+) create mode 100644 ai-compliance-sdk/internal/iace/proposer_framing.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_framing_test.go diff --git a/ai-compliance-sdk/cmd/iace-audit/propose.go b/ai-compliance-sdk/cmd/iace-audit/propose.go index 45667432..dfab7646 100644 --- a/ai-compliance-sdk/cmd/iace-audit/propose.go +++ b/ai-compliance-sdk/cmd/iace-audit/propose.go @@ -84,11 +84,17 @@ func cmdPropose(args []string) { writeText("audit-reports/proposals.md", iace.RenderProposalQueue(in.MachineType, proposals)) writeJSON("audit-reports/proposals.json", proposals) + // Type 2: foreign-framing candidates (zone terms with no narrative echo). 
+ framing := iace.FindFramingCandidates(fired, in.Narrative, envFloat("IACE_FRAMING_MIN_ORPHAN", 0.6)) + writeText("audit-reports/framing.md", iace.RenderFramingQueue(in.MachineType, framing)) + writeJSON("audit-reports/framing.json", framing) + printSummary("Method P — Dedup Proposer ("+judge.Name()+")", map[string]int{ "fired_patterns": len(fired), "candidates": len(candidates), "in_queue": len(proposals), "gt_blocked": blocked, + "framing_flags": len(framing), }) if gt == nil { fmt.Fprintln(os.Stderr, "note: no ground truth provided — GT wall NOT applied (candidates not recall-screened)") diff --git a/ai-compliance-sdk/internal/iace/proposer_framing.go b/ai-compliance-sdk/internal/iace/proposer_framing.go new file mode 100644 index 00000000..687a2f40 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_framing.go @@ -0,0 +1,154 @@ +package iace + +import ( + "fmt" + "sort" + "strings" +) + +// Foreign-framing proposer (P2 slice 4, type 2). DEV-TIME, propose-only. +// +// A pattern can fire for a machine yet describe its hazard with a zone text +// framed for a DIFFERENT machine (e.g. a dishwasher hazard whose zone names +// "Walzen, Transportbaender" or "Bearbeitungszone"). Such foreign framing leaks +// through terms that are NOT yet in domainGateTerms — once a term is a gate term, +// the ghost-pattern invariant already fences the pattern out. So we surface the +// candidates structurally: zone terms a fired pattern names that the machine's +// narrative never mentions (minus generic hazard-location vocabulary). A human +// (or the LLM) then decides: add a dom_* gate term, or re-frame the zone text. +// +// This OVER-surfaces by design — the human/LLM is the precision filter, not the +// detector (same contract as the dedup proposer). + +// genericHazardStop are hazard-LOCATION words that legitimately appear in zones +// without being echoed in a narrative — they are not evidence of foreign framing. 
+//
+// Keys must be lowercase: partEchoed looks up the whitespace-separated words of
+// zone parts, and zoneParts has already lowercased those.
+var genericHazardStop = map[string]bool{
+	"quetschstelle": true, "einzugstelle": true, "einzugsstelle": true, "scherstelle": true,
+	"schneidstelle": true, "stossstelle": true, "fangstelle": true, "klemmstelle": true,
+	"gefahrbereich": true, "gefahrenbereich": true, "gefahrstelle": true, "gefahrenstelle": true,
+	"arbeitsbereich": true, "wirkbereich": true, "schutzbereich": true, "umgebung": true,
+	"bereich": true, "zugang": true, "oberflaeche": true, "oberflaechen": true,
+	"gehaeuse": true, "bauteil": true, "bauteile": true, "komponente": true, "maschine": true,
+}
+
+// FramingCandidate is a fired pattern whose zone text looks foreign for the machine.
+// It is the element type of the slice FindFramingCandidates returns, and the JSON
+// tags define the shape written to audit-reports/framing.json.
+type FramingCandidate struct {
+	// Pattern is the PatternID of the fired pattern.
+	Pattern string `json:"pattern"`
+	// Name is the PatternName of the fired pattern.
+	Name string `json:"name"`
+	// Category is whatever primaryCat yields for the fired pattern.
+	Category string `json:"category"`
+	// Zone is the pattern's ZoneDE text, copied unmodified.
+	Zone string `json:"zone"`
+	// OrphanTerms are the (lowercased) zone parts for which partEchoed reported
+	// no echo in the narrative.
+	OrphanTerms []string `json:"orphan_terms"`
+	// OrphanFraction is len(OrphanTerms) divided by the number of zone parts,
+	// passed through round2 (presumably two decimals — confirm in round2).
+	OrphanFraction float64 `json:"orphan_fraction"`
+	Verdict string `json:"verdict"` // heuristic lean: foreign | plausible
+	// Evidence is a human-readable one-line summary ("n/m zone terms have no
+	// narrative echo: ...") for the review queue.
+	Evidence string `json:"evidence"`
+}
+
+// FindFramingCandidates returns fired patterns whose zone is mostly not echoed in
+// the narrative, sorted by orphan fraction descending (deterministic).
+//
+// A pattern is reported only when at least one of its zone parts is orphaned and
+// the orphaned share of its zone parts is not below minFraction. Patterns whose
+// zone yields no parts are skipped. Ties on the stored OrphanFraction are broken
+// by ascending pattern ID.
+func FindFramingCandidates(fired []PatternMatch, narrative string, minFraction float64) []FramingCandidate {
+	lowered := strings.ToLower(narrative)
+
+	// Narrative words of five or more runes act as stems for the reverse
+	// (narrative-word-inside-zone-word) direction of the echo match.
+	var stems []string
+	for _, word := range proposerWordSplit.Split(lowered, -1) {
+		if len([]rune(word)) < 5 {
+			continue
+		}
+		stems = append(stems, word)
+	}
+
+	var candidates []FramingCandidate
+	for _, match := range fired {
+		terms := zoneParts(match.ZoneDE)
+		total := len(terms)
+		if total == 0 {
+			continue
+		}
+
+		var unechoed []string
+		for _, term := range terms {
+			if partEchoed(term, lowered, stems) {
+				continue
+			}
+			unechoed = append(unechoed, term)
+		}
+		missing := len(unechoed)
+		if missing == 0 {
+			continue
+		}
+
+		// The threshold is applied to the exact share; only the stored value is rounded.
+		share := float64(missing) / float64(total)
+		if share < minFraction {
+			continue
+		}
+
+		candidates = append(candidates, FramingCandidate{
+			Pattern:        match.PatternID,
+			Name:           match.PatternName,
+			Category:       primaryCat(match),
+			Zone:           match.ZoneDE,
+			OrphanTerms:    unechoed,
+			OrphanFraction: round2(share),
+			Verdict:        framingHeuristicVerdict(share),
+			Evidence:       fmt.Sprintf("%d/%d zone terms have no narrative echo: %s", missing, total, strings.Join(unechoed, ", ")),
+		})
+	}
+
+	// Highest orphan fraction first; pattern ID ascending keeps the order deterministic.
+	sort.SliceStable(candidates, func(a, b int) bool {
+		left, right := candidates[a], candidates[b]
+		if left.OrphanFraction == right.OrphanFraction {
+			return left.Pattern < right.Pattern
+		}
+		return left.OrphanFraction > right.OrphanFraction
+	})
+	return candidates
+}
+
+// framingHeuristicVerdict maps an orphan fraction to the heuristic lean shown in
+// the review queue: "foreign" when (practically) no zone part is echoed by the
+// narrative, "plausible" for a partial echo that a human still has to confirm.
+func framingHeuristicVerdict(frac float64) string {
+	switch {
+	case frac >= 0.99:
+		return "foreign"
+	default:
+		return "plausible"
+	}
+}
+
+// zoneParts splits a zone string into significant terms on commas, slashes,
+// parentheses and semicolons, lowercased, length >= 4.
+func zoneParts(zone string) []string { + fields := strings.FieldsFunc(strings.ToLower(zone), func(r rune) bool { + return r == ',' || r == '/' || r == ';' || r == '(' || r == ')' + }) + var out []string + for _, f := range fields { + if t := strings.TrimSpace(f); len([]rune(t)) >= 4 { + out = append(out, t) + } + } + return out +} + +// partEchoed reports whether a zone part is reflected in the narrative. Matching +// is bidirectional to survive German compounding: a zone word echoes if it is a +// generic hazard term, if it is a substring of the narrative, OR if any narrative +// stem (>= 5 chars) is a substring of the zone word (so narrative "Steuerung" +// echoes zone "Steuerungssystem"). +func partEchoed(part, narrative string, narStems []string) bool { + for _, w := range strings.Fields(part) { + if genericHazardStop[w] { + return true + } + if len([]rune(w)) < 4 { + continue + } + if strings.Contains(narrative, w) { + return true + } + for _, ns := range narStems { + if strings.Contains(w, ns) { + return true + } + } + } + return false +} + +// RenderFramingQueue renders foreign-framing candidates as a markdown review queue. +func RenderFramingQueue(machine string, candidates []FramingCandidate) string { + var b strings.Builder + fmt.Fprintf(&b, "# Foreign-framing review queue — %s\n\n", machine) + fmt.Fprintf(&b, "%d fired pattern(s) name zone terms the narrative never mentions. Propose-only — a human (or the LLM) decides: add a dom_* gate term, or re-frame the zone.\n\n", len(candidates)) + for i, c := range candidates { + fmt.Fprintf(&b, "## %d. 
%s — %s [%s, orphan %.0f%%]\n", i+1, c.Pattern, c.Name, c.Verdict, c.OrphanFraction*100) + fmt.Fprintf(&b, "- category: %s\n- zone: %s\n", c.Category, c.Zone) + fmt.Fprintf(&b, "- orphan terms (no narrative echo): %s\n", strings.Join(c.OrphanTerms, ", ")) + fmt.Fprintf(&b, "- suggested action: %s\n\n", framingAction(c.Verdict)) + } + return b.String() +} + +func framingAction(verdict string) string { + if verdict == "foreign" { + return "likely foreign-framed — propose a dom_* gate term for the orphan term(s), or re-frame the zone; human confirms + commits + pins a GT case" + } + return "partial echo — likely generic vocabulary; human to confirm whether any orphan term is a foreign-machine component" +} diff --git a/ai-compliance-sdk/internal/iace/proposer_framing_test.go b/ai-compliance-sdk/internal/iace/proposer_framing_test.go new file mode 100644 index 00000000..0acbb29e --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_framing_test.go @@ -0,0 +1,33 @@ +package iace + +import "testing" + +func TestFindFramingCandidates_FlagsForeignZone(t *testing.T) { + narrative := "Gewerbliche Geschirrspuelmaschine mit Boiler und Tank. Die Tuer ist verriegelt." + fired := []PatternMatch{ + mkPM("HPforeign", "mechanical_hazard", "Walzen, Transportbaender, Bearbeitungszone", "Einzug", 80, nil, nil), + mkPM("HPlocal", "thermal_hazard", "Boiler, Tank, Tuer", "Verbrennung", 80, nil, nil), + mkPM("HPgeneric", "mechanical_hazard", "Quetschstelle, Gefahrbereich", "Quetschen", 80, nil, nil), + } + got := FindFramingCandidates(fired, narrative, 0.6) + if len(got) != 1 || got[0].Pattern != "HPforeign" { + t.Fatalf("want only HPforeign flagged, got %+v", got) + } + if got[0].Verdict != "foreign" { + t.Errorf("fully-orphan zone should be 'foreign', got %s", got[0].Verdict) + } +} + +func TestFindFramingCandidates_PartialEchoIsPlausible(t *testing.T) { + narrative := "Maschine mit Boiler und Tank." 
+ fired := []PatternMatch{ + mkPM("HPx", "thermal_hazard", "Boiler, Tank, Auspuffleitung", "x", 80, nil, nil), + } + got := FindFramingCandidates(fired, narrative, 0.3) + if len(got) != 1 { + t.Fatalf("want 1 candidate (1/3 orphan >= 0.3), got %d", len(got)) + } + if got[0].Verdict != "plausible" || len(got[0].OrphanTerms) != 1 || got[0].OrphanTerms[0] != "auspuffleitung" { + t.Errorf("want plausible + orphan [auspuffleitung], got %s %v", got[0].Verdict, got[0].OrphanTerms) + } +} From c13aa9183a97e12a8ed8093ec0a7f237bf75d3ac Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 09:51:12 +0200 Subject: [PATCH 10/11] feat(ai-sdk): vocab->tag proposer (P2 slice 5, type 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extends Method C: for each unknown narrative token that pattern text names, suggest the keyword_dictionary tag = the RequiredComponentTags shared by the naming patterns (ranked by frequency, kept only when shared by >=40% of them, top 3). Surfaces real dictionary gaps like "zwischenkreis" -> stored_energy and "updates" -> has_software, which close coverage without hand-editing the dict. Two precision fixes to Method C while here: - patternsMentioning now matches WHOLE WORDS, not substrings — substring matching flagged fragments like "stehen" inside "entstehen" and produced nonsensical tag suggestions. - a token is only proposed with a tag if one is shared by >=40% of its naming patterns, so diffuse common verbs (spread across categories) drop out. Wired into iace-audit propose -> audit-reports/vocab.{md,json}. Residual common-verb noise is left to the human/LLM filter rather than a hand-grown stopword list. Type 4 (coverage blind spots) + P3 (pin accepted proposals into a GT case) remain for slice 6. 
Co-Authored-By: Claude Opus 4.7 --- ai-compliance-sdk/cmd/iace-audit/propose.go | 31 ++++++++ .../internal/iace/audit/stubs.go | 4 + .../internal/iace/audit/vocabulary.go | 79 +++++++++++++++++-- .../iace/audit/vocabulary_proposer_test.go | 36 +++++++++ 4 files changed, 143 insertions(+), 7 deletions(-) create mode 100644 ai-compliance-sdk/internal/iace/audit/vocabulary_proposer_test.go diff --git a/ai-compliance-sdk/cmd/iace-audit/propose.go b/ai-compliance-sdk/cmd/iace-audit/propose.go index dfab7646..75452b0f 100644 --- a/ai-compliance-sdk/cmd/iace-audit/propose.go +++ b/ai-compliance-sdk/cmd/iace-audit/propose.go @@ -6,8 +6,10 @@ import ( "fmt" "os" "strconv" + "strings" "github.com/breakpilot/ai-compliance-sdk/internal/iace" + "github.com/breakpilot/ai-compliance-sdk/internal/iace/audit" "github.com/breakpilot/ai-compliance-sdk/internal/llm" ) @@ -89,12 +91,25 @@ func cmdPropose(args []string) { writeText("audit-reports/framing.md", iace.RenderFramingQueue(in.MachineType, framing)) writeJSON("audit-reports/framing.json", framing) + // Type 3: vocab->tag proposals (unknown narrative tokens that pattern text + // names as a whole word, with a dominant shared required tag). 
+ vocab := audit.RunVocabulary(map[string]any{"narrative": in.Narrative}) + var vgaps []audit.DictionarySuggestion + for _, s := range vocab.SuggestedDictionaryEntries { + if len(s.SuggestedTags) > 0 { + vgaps = append(vgaps, s) + } + } + writeText("audit-reports/vocab.md", renderVocabQueue(in.MachineType, vgaps)) + writeJSON("audit-reports/vocab.json", vgaps) + printSummary("Method P — Dedup Proposer ("+judge.Name()+")", map[string]int{ "fired_patterns": len(fired), "candidates": len(candidates), "in_queue": len(proposals), "gt_blocked": blocked, "framing_flags": len(framing), + "vocab_gaps": len(vgaps), }) if gt == nil { fmt.Fprintln(os.Stderr, "note: no ground truth provided — GT wall NOT applied (candidates not recall-screened)") @@ -145,3 +160,19 @@ func envFloat(key string, def float64) float64 { } return def } + +func renderVocabQueue(machine string, entries []audit.DictionarySuggestion) string { + var b strings.Builder + fmt.Fprintf(&b, "# Vocab→tag review queue — %s\n\n", machine) + fmt.Fprintf(&b, "%d unknown token(s) appear in pattern text but map to no dictionary tag. Propose-only — a human (or the LLM) confirms the tag, then adds a keyword_dictionary entry and pins a GT case.\n\n", len(entries)) + for i, s := range entries { + tag := "" + if len(s.SuggestedTags) > 0 { + tag = s.SuggestedTags[0] + } + fmt.Fprintf(&b, "## %d. 
\"%s\" → suggested tag(s): %s\n", i+1, s.Token, strings.Join(s.SuggestedTags, ", ")) + fmt.Fprintf(&b, "- named by %d pattern(s): %s\n", len(s.PatternIDs), strings.Join(s.PatternIDs, ", ")) + fmt.Fprintf(&b, "- suggested action: add keyword_dictionary entry {%q → %s} so narratives mentioning it trigger those patterns; human confirms\n\n", s.Token, tag) + } + return b.String() +} diff --git a/ai-compliance-sdk/internal/iace/audit/stubs.go b/ai-compliance-sdk/internal/iace/audit/stubs.go index 66661168..cd318a14 100644 --- a/ai-compliance-sdk/internal/iace/audit/stubs.go +++ b/ai-compliance-sdk/internal/iace/audit/stubs.go @@ -36,6 +36,10 @@ type DictionarySuggestion struct { Token string `json:"token"` Field string `json:"field"` PatternIDs []string `json:"pattern_ids"` + // SuggestedTags are the RequiredComponentTags shared by the naming patterns, + // ranked by frequency — the candidate tags a keyword_dictionary entry for this + // token would emit so narratives mentioning it can trigger those patterns. 
+ SuggestedTags []string `json:"suggested_tags,omitempty"` } type VocabularyReport struct { diff --git a/ai-compliance-sdk/internal/iace/audit/vocabulary.go b/ai-compliance-sdk/internal/iace/audit/vocabulary.go index b97b427f..a237b58c 100644 --- a/ai-compliance-sdk/internal/iace/audit/vocabulary.go +++ b/ai-compliance-sdk/internal/iace/audit/vocabulary.go @@ -66,14 +66,19 @@ func runVocabulary(form map[string]any) VocabularyReport { // For each unknown token check if any pattern names it patterns := iace.AllPatterns() + byID := make(map[string]iace.HazardPattern, len(patterns)) + for _, p := range patterns { + byID[p.ID] = p + } for _, tok := range report.UnknownTokens { hits := patternsMentioning(tok, patterns) if len(hits) == 0 { continue } report.SuggestedDictionaryEntries = append(report.SuggestedDictionaryEntries, DictionarySuggestion{ - Token: tok, - PatternIDs: hits, + Token: tok, + PatternIDs: hits, + SuggestedTags: suggestTagsFor(hits, byID), }) } sort.Slice(report.SuggestedDictionaryEntries, func(i, j int) bool { @@ -129,18 +134,24 @@ func dictTokenHit(tok string, dict map[string]bool) bool { return false } -// patternsMentioning returns up to 8 pattern IDs whose scenario/trigger/ -// harm/zone text contains the token (case-insensitive substring). +// patternsMentioning returns up to 8 pattern IDs whose scenario/trigger/harm/ +// zone text names the token as a WHOLE WORD. Whole-word (not substring) matching +// is essential: a substring match flags common fragments like "stehen" inside +// "entstehen", producing spurious hits and nonsensical tag suggestions. 
func patternsMentioning(tok string, patterns []iace.HazardPattern) []string { tokLower := strings.ToLower(tok) seen := map[string]bool{} var out []string for _, p := range patterns { hay := strings.ToLower(p.ScenarioDE + " " + p.TriggerDE + " " + p.HarmDE + " " + p.ZoneDE + " " + p.NameDE) - if !strings.Contains(hay, tokLower) { - continue + matched := false + for _, w := range tokenRE.FindAllString(hay, -1) { + if w == tokLower { + matched = true + break + } } - if seen[p.ID] { + if !matched || seen[p.ID] { continue } seen[p.ID] = true @@ -151,3 +162,57 @@ func patternsMentioning(tok string, patterns []iace.HazardPattern) []string { } return out } + +// suggestTagsFor returns the RequiredComponentTags shared across the naming +// patterns, ranked by how many of them require each tag (ties broken by name), +// top 3. These are the candidate tags a dictionary entry for the token should +// emit so a narrative mentioning the token can trigger those patterns. +func suggestTagsFor(ids []string, byID map[string]iace.HazardPattern) []string { + freq := map[string]int{} + total := 0 + for _, id := range ids { + p, ok := byID[id] + if !ok { + continue + } + total++ + seen := map[string]bool{} + for _, tag := range p.RequiredComponentTags { + if seen[tag] { + continue + } + seen[tag] = true + freq[tag]++ + } + } + if total == 0 { + return nil + } + type tf struct { + tag string + n int + } + ranked := make([]tf, 0, len(freq)) + for t, n := range freq { + ranked = append(ranked, tf{t, n}) + } + sort.Slice(ranked, func(i, j int) bool { + if ranked[i].n != ranked[j].n { + return ranked[i].n > ranked[j].n + } + return ranked[i].tag < ranked[j].tag + }) + // Only suggest a tag shared by >= 40% of the naming patterns. Diffuse tokens + // (common verbs spread across categories) get no dominant tag and are dropped. 
+ var out []string + for _, x := range ranked { + if float64(x.n)/float64(total) < 0.4 { + break + } + out = append(out, x.tag) + if len(out) >= 3 { + break + } + } + return out +} diff --git a/ai-compliance-sdk/internal/iace/audit/vocabulary_proposer_test.go b/ai-compliance-sdk/internal/iace/audit/vocabulary_proposer_test.go new file mode 100644 index 00000000..8f8b0a59 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/audit/vocabulary_proposer_test.go @@ -0,0 +1,36 @@ +package audit + +import ( + "testing" + + "github.com/breakpilot/ai-compliance-sdk/internal/iace" +) + +func TestSuggestTagsFor_RanksSharedRequiredTags(t *testing.T) { + byID := map[string]iace.HazardPattern{ + "P1": {ID: "P1", RequiredComponentTags: []string{"backflow_risk", "dom_warewashing"}}, + "P2": {ID: "P2", RequiredComponentTags: []string{"backflow_risk"}}, + "P3": {ID: "P3", RequiredComponentTags: []string{"sharp_edge"}}, + } + got := suggestTagsFor([]string{"P1", "P2", "P3"}, byID) + if len(got) == 0 || got[0] != "backflow_risk" { + t.Fatalf("want backflow_risk ranked first (2 patterns), got %v", got) + } +} + +func TestSuggestTagsFor_TopThreeStableAlpha(t *testing.T) { + byID := map[string]iace.HazardPattern{ + "P1": {ID: "P1", RequiredComponentTags: []string{"d", "b", "a", "c"}}, + } + got := suggestTagsFor([]string{"P1"}, byID) + if len(got) != 3 || got[0] != "a" || got[1] != "b" || got[2] != "c" { + t.Fatalf("want stable alpha top-3 [a b c], got %v", got) + } +} + +func TestSuggestTagsFor_UnknownPatternIgnored(t *testing.T) { + byID := map[string]iace.HazardPattern{} + if got := suggestTagsFor([]string{"missing"}, byID); len(got) != 0 { + t.Fatalf("want empty for unknown patterns, got %v", got) + } +} From 4d225f73a8dc1bbf81d8ddd7159c13edefb91c19 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Thu, 25 Jun 2026 10:03:10 +0200 Subject: [PATCH 11/11] feat(ai-sdk): coverage blind-spot proposer (P2 slice 6, type 4) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit Completes the proposer's four types. - FindCoverageGaps (proposer_coverage.go): deterministic — which EN ISO 12100 hazard groups A-G did the engine leave with zero hazards for this machine? An empty group is a structural blind-spot signal (the machine may truly lack it, or a pattern/GT case is missing). Useful with no model at all. - ProposeMissingHazards + BuildCoveragePrompt: optional LLM expansion of each gap into specific expected-but-missing hazards a safety assessor would name (propose-only, reuses LLMCompleter, degrades to nil on any error). - Wired into iace-audit propose -> audit-reports/coverage.{md,json}. On the dishwasher: D. Pneumatik (truly absent — nothing invented), E. Laerm (borderline), F. Ergonomie (a genuine gap: manual loading the engine did not produce). P3 (pin an accepted proposal into a GT case) remains as a human-in-the- loop follow-up. Co-Authored-By: Claude Opus 4.7 --- ai-compliance-sdk/cmd/iace-audit/propose.go | 10 ++ .../internal/iace/proposer_coverage.go | 143 ++++++++++++++++++ .../internal/iace/proposer_coverage_test.go | 59 ++++++++ 3 files changed, 212 insertions(+) create mode 100644 ai-compliance-sdk/internal/iace/proposer_coverage.go create mode 100644 ai-compliance-sdk/internal/iace/proposer_coverage_test.go diff --git a/ai-compliance-sdk/cmd/iace-audit/propose.go b/ai-compliance-sdk/cmd/iace-audit/propose.go index 75452b0f..2c3e03f9 100644 --- a/ai-compliance-sdk/cmd/iace-audit/propose.go +++ b/ai-compliance-sdk/cmd/iace-audit/propose.go @@ -103,6 +103,15 @@ func cmdPropose(args []string) { writeText("audit-reports/vocab.md", renderVocabQueue(in.MachineType, vgaps)) writeJSON("audit-reports/vocab.json", vgaps) + // Type 4: coverage blind-spots (empty ISO 12100 groups A-G) + LLM expansion. 
+ gaps := iace.FindCoverageGaps(hazards) + var missing []iace.MissingHazard + if lj, ok := judge.(iace.LLMJudge); ok { + missing = iace.ProposeMissingHazards(ctx, lj.Completer, in.MachineType, in.Narrative, hazards, gaps) + } + writeText("audit-reports/coverage.md", iace.RenderCoverageQueue(in.MachineType, gaps, missing)) + writeJSON("audit-reports/coverage.json", gaps) + printSummary("Method P — Dedup Proposer ("+judge.Name()+")", map[string]int{ "fired_patterns": len(fired), "candidates": len(candidates), @@ -110,6 +119,7 @@ func cmdPropose(args []string) { "gt_blocked": blocked, "framing_flags": len(framing), "vocab_gaps": len(vgaps), + "coverage_gaps": len(gaps), }) if gt == nil { fmt.Fprintln(os.Stderr, "note: no ground truth provided — GT wall NOT applied (candidates not recall-screened)") diff --git a/ai-compliance-sdk/internal/iace/proposer_coverage.go b/ai-compliance-sdk/internal/iace/proposer_coverage.go new file mode 100644 index 00000000..836c2165 --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_coverage.go @@ -0,0 +1,143 @@ +package iace + +import ( + "context" + "encoding/json" + "fmt" + "strings" +) + +// Coverage blind-spot proposer (P2 slice 6, type 4). DEV-TIME, propose-only. +// +// Deterministic skeleton: which EN ISO 12100 hazard groups (A-G, the classic CE +// groups; H-J are control/CRA and routinely routed elsewhere) did the engine +// leave with ZERO hazards for this machine? An empty group is a structural +// blind-spot signal — the machine may genuinely lack that hazard, or a pattern +// may be missing. The LLM then expands each gap into specific expected-but-missing +// hazards a safety assessor would name, for a human to confirm into a new pattern +// or GT case. The gaps alone are useful without any model. + +type isoGroup struct { + Key string + Label string + Cats []string +} + +var iso12100Groups = []isoGroup{ + {"mechanical", "A. 
Mechanisch", []string{"mechanical_hazard", "mechanical", "maintenance_hazard"}}, + {"electrical", "B. Elektrisch", []string{"electrical_hazard", "electrical", "emc_hazard"}}, + {"thermal", "C. Thermisch", []string{"thermal_hazard", "thermal", "high_temperature", "fire_explosion"}}, + {"pneumatic_hydraulic", "D. Pneumatik/Hydraulik", []string{"pneumatic_hydraulic"}}, + {"noise_vibration", "E. Laerm/Vibration", []string{"noise_hazard", "noise_vibration", "vibration_hazard"}}, + {"ergonomic", "F. Ergonomie", []string{"ergonomic_hazard", "ergonomic"}}, + {"material", "G. Stoffe/Umwelt", []string{"material_environmental", "chemical_risk", "radiation_hazard"}}, +} + +// CoverageGap is an ISO 12100 hazard group with no engine hazard. +type CoverageGap struct { + Group string `json:"group"` + Key string `json:"key"` + Note string `json:"note"` +} + +// FindCoverageGaps returns the A-G hazard groups that produced zero hazards. +func FindCoverageGaps(hazards []Hazard) []CoverageGap { + present := make(map[string]bool, len(hazards)) + for _, h := range hazards { + present[h.Category] = true + } + var gaps []CoverageGap + for _, g := range iso12100Groups { + covered := false + for _, c := range g.Cats { + if present[c] { + covered = true + break + } + } + if !covered { + gaps = append(gaps, CoverageGap{ + Group: g.Label, Key: g.Key, + Note: "no engine hazard in this ISO 12100 group — verify the machine truly lacks it, or a pattern is missing", + }) + } + } + return gaps +} + +// MissingHazard is an LLM-proposed hazard a safety assessor would expect. +type MissingHazard struct { + Group string `json:"group"` + Hazard string `json:"hazard"` + Why string `json:"why"` +} + +// ProposeMissingHazards asks the LLM to expand the empty groups into specific +// expected hazards. Returns nil without a completer or on any error — propose-only, +// never breaks the run. 
+func ProposeMissingHazards(ctx context.Context, completer LLMCompleter, machineClass, narrative string, produced []Hazard, gaps []CoverageGap) []MissingHazard {
+	// Without empty groups or without a model there is nothing to ask.
+	if len(gaps) == 0 || completer == nil {
+		return nil
+	}
+	sysPrompt, userPrompt := BuildCoveragePrompt(machineClass, narrative, produced, gaps)
+	reply, err := completer.Complete(ctx, sysPrompt, userPrompt)
+	if err == nil {
+		return parseMissingHazards(reply)
+	}
+	// Propose-only: a failed completion degrades to "no proposals".
+	return nil
+}
+
+// BuildCoveragePrompt assembles the system and user prompt for the "which
+// expected hazards are missing?" question. The user prompt carries the machine
+// class, the narrative, the distinct non-empty categories of the produced hazards
+// (in first-seen order) and the labels of the empty groups.
+func BuildCoveragePrompt(machineClass, narrative string, produced []Hazard, gaps []CoverageGap) (system, user string) {
+	// Distinct, non-empty categories in the order they first occur.
+	categories := make([]string, 0, len(produced))
+	known := make(map[string]bool, len(produced))
+	for _, hz := range produced {
+		cat := hz.Category
+		if cat == "" || known[cat] {
+			continue
+		}
+		known[cat] = true
+		categories = append(categories, cat)
+	}
+
+	groups := make([]string, len(gaps))
+	for i := range gaps {
+		groups[i] = gaps[i].Group
+	}
+
+	system = "Du bist Sachverstaendiger fuer Maschinensicherheit nach EN ISO 12100. " +
+		"Dir werden eine Maschine, die bereits erkannten Gefaehrdungen und Gefaehrdungsgruppen OHNE Eintrag genannt. " +
+		"Nenne nur Gefaehrdungen, die ein Sachverstaendiger fuer DIESE Maschine ERWARTET, die aber FEHLEN. " +
+		"Erfinde nichts Maschinenfremdes. Antworte AUSSCHLIESSLICH als JSON-Array: " +
+		`[{"group":"...","hazard":"...","why":"..."}].`
+	user = fmt.Sprintf("Maschinenklasse: %s\n\nBeschreibung:\n%s\n\nBereits erkannte Kategorien: %s\n\nGruppen OHNE Eintrag (Fokus): %s\n\nWelche erwarteten Gefaehrdungen fehlen?",
+		machineClass, narrative, strings.Join(categories, ", "), strings.Join(groups, ", "))
+	return system, user
+}
+
+// parseMissingHazards extracts the JSON array spanning the first "[" to the last
+// "]" of raw, so surrounding chatter from the model is tolerated. It returns nil
+// when no such span exists or the span does not decode.
+func parseMissingHazards(raw string) []MissingHazard {
+	open := strings.Index(raw, "[")
+	if open < 0 {
+		return nil
+	}
+	closing := strings.LastIndex(raw, "]")
+	if closing <= open {
+		return nil
+	}
+	var parsed []MissingHazard
+	if json.Unmarshal([]byte(raw[open:closing+1]), &parsed) != nil {
+		return nil
+	}
+	return parsed
+}
+
+// RenderCoverageQueue renders the deterministic gaps plus any
LLM-proposed missing +// hazards as a markdown review queue. +func RenderCoverageQueue(machine string, gaps []CoverageGap, missing []MissingHazard) string { + var b strings.Builder + fmt.Fprintf(&b, "# Coverage blind-spot queue — %s\n\n", machine) + fmt.Fprintf(&b, "%d ISO 12100 group(s) (A-G) have no engine hazard. Propose-only — a human confirms whether the machine truly lacks it or a pattern/GT case is missing.\n\n", len(gaps)) + for _, g := range gaps { + fmt.Fprintf(&b, "- **%s** — %s\n", g.Group, g.Note) + } + if len(missing) > 0 { + fmt.Fprintf(&b, "\n## LLM-proposed expected-but-missing hazards (%d)\n\n", len(missing)) + for i, m := range missing { + fmt.Fprintf(&b, "%d. [%s] %s\n - why: %s\n", i+1, m.Group, m.Hazard, m.Why) + } + } + return b.String() +} diff --git a/ai-compliance-sdk/internal/iace/proposer_coverage_test.go b/ai-compliance-sdk/internal/iace/proposer_coverage_test.go new file mode 100644 index 00000000..7e442d1c --- /dev/null +++ b/ai-compliance-sdk/internal/iace/proposer_coverage_test.go @@ -0,0 +1,59 @@ +package iace + +import ( + "context" + "strings" + "testing" +) + +func TestFindCoverageGaps(t *testing.T) { + hazards := []Hazard{ + {Category: "mechanical_hazard"}, + {Category: "thermal_hazard"}, + {Category: "electrical_hazard"}, + {Category: "material_environmental"}, + } + gapKeys := map[string]bool{} + for _, g := range FindCoverageGaps(hazards) { + gapKeys[g.Key] = true + } + for _, want := range []string{"pneumatic_hydraulic", "noise_vibration", "ergonomic"} { + if !gapKeys[want] { + t.Errorf("expected gap %s", want) + } + } + for _, notWant := range []string{"mechanical", "thermal", "electrical", "material"} { + if gapKeys[notWant] { + t.Errorf("did not expect gap %s (covered)", notWant) + } + } +} + +func TestBuildCoveragePrompt_ContainsContext(t *testing.T) { + produced := []Hazard{{Category: "thermal_hazard"}} + gaps := []CoverageGap{{Group: "F. 
Ergonomie", Key: "ergonomic"}} + system, user := BuildCoveragePrompt("Geschirrspuelmaschine", "Eine Spuelmaschine mit Tank.", produced, gaps) + if !strings.Contains(system, "EN ISO 12100") || !strings.Contains(system, "JSON") { + t.Errorf("system prompt missing framing") + } + for _, want := range []string{"Geschirrspuelmaschine", "thermal_hazard", "F. Ergonomie", "Spuelmaschine mit Tank"} { + if !strings.Contains(user, want) { + t.Errorf("user prompt missing %q", want) + } + } +} + +func TestProposeMissingHazards_ParsesAndDegrades(t *testing.T) { + gaps := []CoverageGap{{Group: "F. Ergonomie", Key: "ergonomic"}} + c := fakeCompleter{out: `Hier: [{"group":"F. Ergonomie","hazard":"Heben schwerer Koerbe","why":"manuelles Beladen"}] fertig`} + got := ProposeMissingHazards(context.Background(), c, "x", "n", nil, gaps) + if len(got) != 1 || got[0].Hazard != "Heben schwerer Koerbe" { + t.Fatalf("parse: got %+v", got) + } + if ProposeMissingHazards(context.Background(), nil, "x", "n", nil, gaps) != nil { + t.Errorf("nil completer must return nil") + } + if ProposeMissingHazards(context.Background(), fakeCompleter{err: context.DeadlineExceeded}, "x", "n", nil, gaps) != nil { + t.Errorf("error must return nil") + } +}