feat(iace): pattern audit suite + library hygiene wave
Add cmd/iace-audit CLI with 5 deterministic methods that find engine gaps without ground truth: - A reachability: 1058 patterns vs achievable tag universe - B consistency: components vs their declared hazard categories - C vocabulary: limits-form tokens vs keyword dictionary - D echo: limits-form sentences vs generated hazards (jaccard) - E hierarchy: hazards vs ISO 12100 design/protection/info levels Library fixes triggered by A+B+C findings: - tag_resolver: synonym map for electrical/pneumatic/hydraulic aliases - component_library: crush_point + EN03 (gravitational) on C014/C128 (Hubwerk family) - fixes HP1014/1015/1017/1018 which were silently weakly_reachable. noise_source added on 7 components (C006/C011/ C017/C020/C031/C041/C096). electrical_part on 8 drive components (C031/C032/C033/C034/C035/C036/C037/C038/C077/C092). cyber tag on 10 sensors (C081-C090) + 3 IT components (C111/C112/C116) + KI module C119 (ai_model added). pneumatic_part+hydraulic_part on valves C091/C093, hydraulic_part+chemical_risk on pump C097, moving_part on motion controller C075 - keyword_dictionary: EN03 added to aufzug/lift/hubwerk/hubgeraet (was wrongly EN04-only). New keyword entries for hub-action verbs: absenken/senken/anheben/heben + hubhoehe/hubweg/hubgeschwindig Audit impact: - A: weakly_reachable 409 -> 358 (-51 patterns now fully reachable) - B: incomplete components 46 -> 30 (-16, -33%) - HP1018 (Person unter absenkendem Maschinenteil eingeklemmt): weakly_reachable -> reachable Why: methods A/B/C surfaced that the Kistenhubgeraet test project generated 0 crush-under-load hazards despite OSHA 1910.212(a)(3) + EN ISO 12100 6.3.5.5 explicitly requiring them. Three orthogonal bugs (missing crush_point tag, wrong energy source mapping, missing action verbs in dictionary) silently disabled the entire lift crush pattern family.
This commit is contained in:
@@ -0,0 +1,153 @@
|
||||
package audit
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/breakpilot/ai-compliance-sdk/internal/iace"
|
||||
)
|
||||
|
||||
// runVocabularyImpl takes a limits-form payload (the structured machine
|
||||
// description filled in by the engineer) and asks: which of its words
|
||||
// are unknown to the keyword dictionary yet appear in any pattern's
|
||||
// scenario/trigger/harm/zone text? Each such word is a dictionary gap —
|
||||
// the engineer typed a term that some pattern is waiting for, but the
|
||||
// parser cannot translate it into a tag.
|
||||
func init() {
|
||||
runVocabularyImpl = runVocabulary
|
||||
}
|
||||
|
||||
var tokenRE = regexp.MustCompile(`[a-zäöüßA-ZÄÖÜ]{4,}`)
|
||||
|
||||
// German + English stop words that show up in any narrative but carry
|
||||
// no engineering meaning. Kept short on purpose — we only want to drop
|
||||
// obvious filler.
|
||||
var stopWords = map[string]bool{
|
||||
"oder": true, "und": true, "auch": true, "wenn": true, "wird": true,
|
||||
"werden": true, "kann": true, "koennen": true, "soll": true, "muss": true,
|
||||
"sind": true, "eine": true, "einer": true, "einem": true, "einen": true,
|
||||
"diese": true, "dieser": true, "dieses": true, "diesem": true, "diesen": true,
|
||||
"durch": true, "nach": true, "ueber": true, "unter": true, "zwischen": true,
|
||||
"nicht": true, "ohne": true, "fuer": true, "bzw": true, "etc": true,
|
||||
"sowie": true, "siehe": true, "etwa": true, "ggf": true, "the": true,
|
||||
"with": true, "from": true, "this": true, "that": true, "have": true,
|
||||
"insbesondere": true, "ausschliesslich": true, "ebenfalls": true,
|
||||
"jeweils": true, "weitere": true, "weiteren": true, "weiterer": true,
|
||||
}
|
||||
|
||||
func runVocabulary(form map[string]any) VocabularyReport {
|
||||
limits, ok := form["limits_form"].(map[string]any)
|
||||
if !ok {
|
||||
// Form may already be the inner object
|
||||
limits = form
|
||||
}
|
||||
|
||||
tokens := map[string]bool{}
|
||||
for _, v := range limits {
|
||||
extractTokens(v, tokens)
|
||||
}
|
||||
report := VocabularyReport{UniqueTokens: len(tokens)}
|
||||
|
||||
dictTokens := dictionaryVocabulary()
|
||||
|
||||
for tok := range tokens {
|
||||
if stopWords[tok] {
|
||||
continue
|
||||
}
|
||||
if dictTokenHit(tok, dictTokens) {
|
||||
report.KnownTokens = append(report.KnownTokens, tok)
|
||||
} else {
|
||||
report.UnknownTokens = append(report.UnknownTokens, tok)
|
||||
}
|
||||
}
|
||||
sort.Strings(report.KnownTokens)
|
||||
sort.Strings(report.UnknownTokens)
|
||||
|
||||
// For each unknown token check if any pattern names it
|
||||
patterns := iace.AllPatterns()
|
||||
for _, tok := range report.UnknownTokens {
|
||||
hits := patternsMentioning(tok, patterns)
|
||||
if len(hits) == 0 {
|
||||
continue
|
||||
}
|
||||
report.SuggestedDictionaryEntries = append(report.SuggestedDictionaryEntries, DictionarySuggestion{
|
||||
Token: tok,
|
||||
PatternIDs: hits,
|
||||
})
|
||||
}
|
||||
sort.Slice(report.SuggestedDictionaryEntries, func(i, j int) bool {
|
||||
return len(report.SuggestedDictionaryEntries[i].PatternIDs) > len(report.SuggestedDictionaryEntries[j].PatternIDs)
|
||||
})
|
||||
return report
|
||||
}
|
||||
|
||||
func extractTokens(v any, out map[string]bool) {
|
||||
switch x := v.(type) {
|
||||
case string:
|
||||
for _, m := range tokenRE.FindAllString(x, -1) {
|
||||
out[strings.ToLower(m)] = true
|
||||
}
|
||||
case []any:
|
||||
for _, e := range x {
|
||||
extractTokens(e, out)
|
||||
}
|
||||
case map[string]any:
|
||||
for _, e := range x {
|
||||
extractTokens(e, out)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// dictionaryVocabulary builds the lowercase set of all keyword strings
|
||||
// that the parser will recognize, including normalized forms (umlauts
|
||||
// replaced like in the keyword dictionary).
|
||||
func dictionaryVocabulary() map[string]bool {
|
||||
out := map[string]bool{}
|
||||
for _, kw := range iace.GetKeywordDictionary() {
|
||||
for _, k := range kw.Keywords {
|
||||
out[strings.ToLower(k)] = true
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// dictTokenHit returns true if the token would be matched by any
|
||||
// dictionary entry. Dictionary entries can be substrings, so we treat
|
||||
// the dict as a set of stem-like matchers: a token is "known" if it
|
||||
// equals a dict word OR contains a dict word as substring OR the dict
|
||||
// word contains the token.
|
||||
func dictTokenHit(tok string, dict map[string]bool) bool {
|
||||
if dict[tok] {
|
||||
return true
|
||||
}
|
||||
for d := range dict {
|
||||
if strings.Contains(tok, d) || strings.Contains(d, tok) {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// patternsMentioning returns up to 8 pattern IDs whose scenario/trigger/
|
||||
// harm/zone text contains the token (case-insensitive substring).
|
||||
func patternsMentioning(tok string, patterns []iace.HazardPattern) []string {
|
||||
tokLower := strings.ToLower(tok)
|
||||
seen := map[string]bool{}
|
||||
var out []string
|
||||
for _, p := range patterns {
|
||||
hay := strings.ToLower(p.ScenarioDE + " " + p.TriggerDE + " " + p.HarmDE + " " + p.ZoneDE + " " + p.NameDE)
|
||||
if !strings.Contains(hay, tokLower) {
|
||||
continue
|
||||
}
|
||||
if seen[p.ID] {
|
||||
continue
|
||||
}
|
||||
seen[p.ID] = true
|
||||
out = append(out, p.ID)
|
||||
if len(out) >= 8 {
|
||||
break
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
Reference in New Issue
Block a user