Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/audit/echo.go
T
Benjamin Admin f534b52817 feat(iace): pattern audit suite + library hygiene wave
Add cmd/iace-audit CLI with 5 deterministic methods that find engine
gaps without ground truth:

- A reachability: 1058 patterns vs achievable tag universe
- B consistency: components vs their declared hazard categories
- C vocabulary: limits-form tokens vs keyword dictionary
- D echo: limits-form sentences vs generated hazards (jaccard)
- E hierarchy: hazards vs ISO 12100 design/protection/info levels

Library fixes triggered by A+B+C findings:

- tag_resolver: synonym map for electrical/pneumatic/hydraulic aliases
- component_library: crush_point + EN03 (gravitational) on C014/C128
  (Hubwerk family) - fixes HP1014/1015/1017/1018 which were silently
  weakly_reachable. noise_source added on 7 components (C006/C011/
  C017/C020/C031/C041/C096). electrical_part on 10 drive components
  (C031/C032/C033/C034/C035/C036/C037/C038/C077/C092). cyber tag
  on 10 sensors (C081-C090) + 3 IT components (C111/C112/C116) +
  KI module C119 (ai_model added). pneumatic_part+hydraulic_part
  on valves C091/C093, hydraulic_part+chemical_risk on pump C097,
  moving_part on motion controller C075
- keyword_dictionary: EN03 added to aufzug/lift/hubwerk/hubgeraet
  (was wrongly EN04-only). New keyword entries for hub-action verbs:
  absenken/senken/anheben/heben + hubhoehe/hubweg/hubgeschwindig

Audit impact:
- A: weakly_reachable 409 -> 358 (-51 patterns now fully reachable)
- B: incomplete components 46 -> 30 (-16, -33%)
- HP1018 (Person unter absenkendem Maschinenteil eingeklemmt):
  weakly_reachable -> reachable

Why: methods A/B/C surfaced that the Kistenhubgeraet test project
generated 0 crush-under-load hazards despite OSHA 1910.212(a)(3) +
EN ISO 12100 6.3.5.5 explicitly requiring them. Three orthogonal
bugs (missing crush_point tag, wrong energy source mapping, missing
action verbs in dictionary) silently disabled the entire lift crush
pattern family.
2026-05-21 10:51:08 +02:00

162 lines
4.5 KiB
Go

package audit
import (
"regexp"
"sort"
"strings"
)
// init installs runEcho as the implementation behind runEchoImpl.
// NOTE(review): runEchoImpl is declared outside this file view —
// presumably a package-level function hook; confirm at its declaration.
//
// The echo audit checks if each meaningful phrase from the limits-form is
// echoed by at least one generated hazard. A phrase that names a concrete
// scenario, fault, or constraint must reappear (semantically) in some
// hazard's name, scenario, or description. Phrases without echo are gaps:
// the engineer documented the risk but the engine never lifted it into
// the hazard register.
//
// Echo detection here is a lightweight Jaccard overlap of content tokens
// (not embeddings) — robust enough for the demonstrative diagnostic and
// keeps the audit fully deterministic without an external model. The
// caller can later swap in a vector-based scorer.
func init() {
runEchoImpl = runEcho
}
// echoFields lists the limits-form fields that the echo audit inspects.
// key is the lookup key inside the limits form; label is the German
// display name copied into each orphaned-phrase entry.
//
// Only freeform fields in which engineers describe risks are listed.
// List/enum fields (operating_modes, person_groups, industry_sectors)
// are deliberately left out because they carry no narrative phrases.
var echoFields = []struct {
	key   string
	label string
}{
	{key: "general_description", label: "Allg. Beschreibung"},
	{key: "intended_purpose", label: "Bestimmungsgemaesse Verwendung"},
	{key: "variants", label: "Varianten"},
	{key: "foreseeable_misuses", label: "Vorhersehbare Fehlanwendung"},
	{key: "spatial_limits", label: "Raeumliche Grenzen"},
	{key: "temporal_limits", label: "Zeitliche Grenzen"},
	{key: "operating_conditions", label: "Betriebsbedingungen"},
	{key: "energy_supply", label: "Energieversorgung"},
	{key: "mechanical_interfaces", label: "Mechanische Schnittstellen"},
	{key: "electrical_interfaces", label: "Elektrische Schnittstellen"},
	{key: "software_interfaces", label: "Software-Schnittstellen"},
	{key: "pneumatic_hydraulic_interfaces", label: "Pneumatik/Hydraulik"},
	{key: "qualification_requirements", label: "Personenqualifikation"},
}
var (
	// sentenceSplit matches the separators between sentences: one of
	// '.', '!' or '?' followed by whitespace, or a run of newlines.
	sentenceSplit = regexp.MustCompile(`[.!?]\s+|\n+`)

	// wordRE matches candidate content words: runs of at least four
	// letters drawn from ASCII letters and the German umlauts/eszett.
	wordRE = regexp.MustCompile(`[a-zäöüßA-ZÄÖÜ]{4,}`)
)

// echoThreshold is the minimum Jaccard overlap between a sentence's
// content tokens and a hazard's content tokens for the sentence to count
// as echoed. The value was tuned by hand, without a labeled corpus; the
// audit reports the best score of every orphaned phrase so that a human
// can re-tune it if needed.
const echoThreshold = 0.18
// runEcho audits the freeform limits-form fields listed in echoFields
// against the generated hazards.
//
// form is passed through unwrapLimits, so it may be the limits form
// itself or a wrapper carrying it under "limits_form". Each hazard is
// flattened with joinHazardText and reduced to a content-token set.
//
// Every field value is split into sentences; a sentence counts as echoed
// when its best Jaccard score against any hazard reaches echoThreshold,
// otherwise it is recorded as an orphaned phrase together with that best
// score. The returned report lists orphaned phrases by ascending score;
// phrases with equal scores keep the order in which they appear in the
// form.
func runEcho(form map[string]any, hazards []map[string]any) EchoReport {
	limits := unwrapLimits(form)

	// Tokenise each hazard once; every sentence is compared against all
	// of them.
	hazardTokens := make([]map[string]bool, 0, len(hazards))
	for _, h := range hazards {
		hazardTokens = append(hazardTokens, contentTokenSet(joinHazardText(h)))
	}

	report := EchoReport{}
	for _, fld := range echoFields {
		// A missing or non-string field yields "" and is skipped below.
		raw, _ := limits[fld.key].(string)
		raw = strings.TrimSpace(raw)
		if raw == "" {
			continue
		}
		for _, sent := range sentenceSplit.Split(raw, -1) {
			sent = strings.TrimSpace(sent)
			if len(sent) < 30 {
				// Skip very short fragments (length is measured in bytes).
				continue
			}
			// NOTE: counted before the token check below, so a sentence
			// with fewer than three content tokens is part of
			// TotalPhrases but of neither Echoed nor Orphaned.
			report.TotalPhrases++
			st := contentTokenSet(sent)
			if len(st) < 3 {
				continue
			}
			bestScore := 0.0
			for _, ht := range hazardTokens {
				if score := jaccard(st, ht); score > bestScore {
					bestScore = score
				}
			}
			if bestScore >= echoThreshold {
				report.Echoed++
				continue
			}
			report.Orphaned++
			report.OrphanedPhrases = append(report.OrphanedPhrases, OrphanedPhrase{
				Field:     fld.label,
				Phrase:    sent,
				BestScore: bestScore,
			})
		}
	}

	// Lowest scores first — most clearly orphaned. The sort is stable
	// because ties are common (every phrase without any overlap scores
	// 0) and tied phrases should stay in form order.
	sort.SliceStable(report.OrphanedPhrases, func(i, j int) bool {
		return report.OrphanedPhrases[i].BestScore < report.OrphanedPhrases[j].BestScore
	})
	return report
}
// unwrapLimits returns the map stored under "limits_form" when form
// holds one of type map[string]any; in every other case form itself is
// returned unchanged.
func unwrapLimits(form map[string]any) map[string]any {
	nested, isMap := form["limits_form"].(map[string]any)
	if !isMap {
		return form
	}
	return nested
}
// joinHazardText concatenates the textual fields of a hazard into one
// space-separated string. Fields are visited in a fixed order; a field
// that is absent or not a string is skipped, while an empty string is
// still included.
func joinHazardText(h map[string]any) string {
	textKeys := [...]string{
		"name",
		"description",
		"scenario",
		"trigger_event",
		"possible_harm",
		"hazardous_zone",
		"category",
		"sub_category",
	}
	collected := make([]string, 0, len(textKeys))
	for i := range textKeys {
		s, isString := h[textKeys[i]].(string)
		if !isString {
			continue
		}
		collected = append(collected, s)
	}
	return strings.Join(collected, " ")
}
// contentTokenSet extracts the set of content tokens from s: every
// wordRE match is lower-cased and kept unless it appears in stopWords.
// The result is never nil, even when no token survives.
func contentTokenSet(s string) map[string]bool {
	matches := wordRE.FindAllString(s, -1)
	set := make(map[string]bool, len(matches))
	for i := range matches {
		tok := strings.ToLower(matches[i])
		if !stopWords[tok] {
			set[tok] = true
		}
	}
	return set
}
// jaccard returns |a ∩ b| / |a ∪ b| for two token sets, or 0 when either
// set is empty. A key of a counts as shared when b maps it to true.
func jaccard(a, b map[string]bool) float64 {
	switch {
	case len(a) == 0, len(b) == 0:
		return 0
	}

	shared := 0
	for tok := range a {
		if b[tok] {
			shared++
		}
	}

	// Inclusion–exclusion: shared tokens would otherwise be counted twice.
	if total := len(a) + len(b) - shared; total != 0 {
		return float64(shared) / float64(total)
	}
	return 0
}