f534b52817
Add cmd/iace-audit CLI with 5 deterministic methods that find engine gaps without ground truth: - A reachability: 1058 patterns vs achievable tag universe - B consistency: components vs their declared hazard categories - C vocabulary: limits-form tokens vs keyword dictionary - D echo: limits-form sentences vs generated hazards (jaccard) - E hierarchy: hazards vs ISO 12100 design/protection/info levels Library fixes triggered by A+B+C findings: - tag_resolver: synonym map for electrical/pneumatic/hydraulic aliases - component_library: crush_point + EN03 (gravitational) on C014/C128 (Hubwerk family) - fixes HP1014/1015/1017/1018 which were silently weakly_reachable. noise_source added on 7 components (C006/C011/ C017/C020/C031/C041/C096). electrical_part on 8 drive components (C031/C032/C033/C034/C035/C036/C037/C038/C077/C092). cyber tag on 10 sensors (C081-C090) + 3 IT components (C111/C112/C116) + KI module C119 (ai_model added). pneumatic_part+hydraulic_part on valves C091/C093, hydraulic_part+chemical_risk on pump C097, moving_part on motion controller C075 - keyword_dictionary: EN03 added to aufzug/lift/hubwerk/hubgeraet (was wrongly EN04-only). New keyword entries for hub-action verbs: absenken/senken/anheben/heben + hubhoehe/hubweg/hubgeschwindig Audit impact: - A: weakly_reachable 409 -> 358 (-51 patterns now fully reachable) - B: incomplete components 46 -> 30 (-16, -33%) - HP1018 (Person unter absenkendem Maschinenteil eingeklemmt): weakly_reachable -> reachable Why: methods A/B/C surfaced that the Kistenhubgeraet test project generated 0 crush-under-load hazards despite OSHA 1910.212(a)(3) + EN ISO 12100 6.3.5.5 explicitly requiring them. Three orthogonal bugs (missing crush_point tag, wrong energy source mapping, missing action verbs in dictionary) silently disabled the entire lift crush pattern family.
154 lines
4.6 KiB
Go
154 lines
4.6 KiB
Go
package audit
|
|
|
|
import (
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
|
|
"github.com/breakpilot/ai-compliance-sdk/internal/iace"
|
|
)
|
|
|
|
// runVocabularyImpl takes a limits-form payload (the structured machine
|
|
// description filled in by the engineer) and asks: which of its words
|
|
// are unknown to the keyword dictionary yet appear in any pattern's
|
|
// scenario/trigger/harm/zone text? Each such word is a dictionary gap —
|
|
// the engineer typed a term that some pattern is waiting for, but the
|
|
// parser cannot translate it into a tag.
|
|
func init() {
|
|
runVocabularyImpl = runVocabulary
|
|
}
|
|
|
|
// tokenRE matches one candidate token: a run of at least four letters
// drawn from a-z, A-Z, the German umlauts ä/ö/ü (either case) and ß.
// Shorter words, digits and punctuation never produce a token.
var tokenRE = regexp.MustCompile(`[a-zäöüßA-ZÄÖÜ]{4,}`)
|
|
|
|
// stopWords holds German and English filler words that show up in any
// narrative but carry no engineering meaning. Kept short on purpose: the
// goal is only to drop obvious filler.
//
// Lookups are exact matches on the lower-cased token. Tokens keep their
// umlauts and ß (they are lower-cased, not transliterated), so words that
// exist in both spellings are listed twice: once transliterated
// ("ueber") and once as typed with umlauts ("über"). Entries shorter
// than four letters are inert while tokenRE requires at least four
// letters; they are kept so the list stays valid if that limit changes.
var stopWords = map[string]bool{
	"oder": true, "und": true, "auch": true, "wenn": true, "wird": true,
	"werden": true, "kann": true, "koennen": true, "soll": true, "muss": true,
	"sind": true, "eine": true, "einer": true, "einem": true, "einen": true,
	"diese": true, "dieser": true, "dieses": true, "diesem": true, "diesen": true,
	"durch": true, "nach": true, "ueber": true, "unter": true, "zwischen": true,
	"nicht": true, "ohne": true, "fuer": true, "bzw": true, "etc": true,
	"sowie": true, "siehe": true, "etwa": true, "ggf": true, "the": true,
	"with": true, "from": true, "this": true, "that": true, "have": true,
	"insbesondere": true, "ausschliesslich": true, "ebenfalls": true,
	"jeweils": true, "weitere": true, "weiteren": true, "weiterer": true,

	// Umlaut/ß spellings of the transliterated entries above.
	"können": true, "über": true, "für": true, "ausschließlich": true,
}
|
|
|
|
func runVocabulary(form map[string]any) VocabularyReport {
|
|
limits, ok := form["limits_form"].(map[string]any)
|
|
if !ok {
|
|
// Form may already be the inner object
|
|
limits = form
|
|
}
|
|
|
|
tokens := map[string]bool{}
|
|
for _, v := range limits {
|
|
extractTokens(v, tokens)
|
|
}
|
|
report := VocabularyReport{UniqueTokens: len(tokens)}
|
|
|
|
dictTokens := dictionaryVocabulary()
|
|
|
|
for tok := range tokens {
|
|
if stopWords[tok] {
|
|
continue
|
|
}
|
|
if dictTokenHit(tok, dictTokens) {
|
|
report.KnownTokens = append(report.KnownTokens, tok)
|
|
} else {
|
|
report.UnknownTokens = append(report.UnknownTokens, tok)
|
|
}
|
|
}
|
|
sort.Strings(report.KnownTokens)
|
|
sort.Strings(report.UnknownTokens)
|
|
|
|
// For each unknown token check if any pattern names it
|
|
patterns := iace.AllPatterns()
|
|
for _, tok := range report.UnknownTokens {
|
|
hits := patternsMentioning(tok, patterns)
|
|
if len(hits) == 0 {
|
|
continue
|
|
}
|
|
report.SuggestedDictionaryEntries = append(report.SuggestedDictionaryEntries, DictionarySuggestion{
|
|
Token: tok,
|
|
PatternIDs: hits,
|
|
})
|
|
}
|
|
sort.Slice(report.SuggestedDictionaryEntries, func(i, j int) bool {
|
|
return len(report.SuggestedDictionaryEntries[i].PatternIDs) > len(report.SuggestedDictionaryEntries[j].PatternIDs)
|
|
})
|
|
return report
|
|
}
|
|
|
|
func extractTokens(v any, out map[string]bool) {
|
|
switch x := v.(type) {
|
|
case string:
|
|
for _, m := range tokenRE.FindAllString(x, -1) {
|
|
out[strings.ToLower(m)] = true
|
|
}
|
|
case []any:
|
|
for _, e := range x {
|
|
extractTokens(e, out)
|
|
}
|
|
case map[string]any:
|
|
for _, e := range x {
|
|
extractTokens(e, out)
|
|
}
|
|
}
|
|
}
|
|
|
|
// dictionaryVocabulary builds the lowercase set of all keyword strings
|
|
// that the parser will recognize, including normalized forms (umlauts
|
|
// replaced like in the keyword dictionary).
|
|
func dictionaryVocabulary() map[string]bool {
|
|
out := map[string]bool{}
|
|
for _, kw := range iace.GetKeywordDictionary() {
|
|
for _, k := range kw.Keywords {
|
|
out[strings.ToLower(k)] = true
|
|
}
|
|
}
|
|
return out
|
|
}
|
|
|
|
// dictTokenHit reports whether tok would be matched by any dictionary
// entry. Dictionary entries can be substrings, so the dict is treated as
// a set of stem-like matchers: a token is "known" if it equals a dict
// word, contains a dict word as a substring, or is itself a substring of
// a dict word.
//
// Only the keys of dict are consulted. Empty strings never match: an
// empty token is reported as unknown, and an empty dict key is skipped,
// because the empty string is a substring of everything and a single
// stray empty keyword would otherwise mark every token as known.
//
// NOTE(review): the comparison is literal. A token typed with umlauts
// ("hubhöhe") does not match a transliterated keyword ("hubhoehe");
// confirm whether the parser normalizes umlauts before matching.
func dictTokenHit(tok string, dict map[string]bool) bool {
	if tok == "" {
		return false
	}
	if _, ok := dict[tok]; ok {
		return true
	}
	for d := range dict {
		if d == "" {
			continue
		}
		if strings.Contains(tok, d) || strings.Contains(d, tok) {
			return true
		}
	}
	return false
}
|
|
|
|
// patternsMentioning returns up to 8 pattern IDs whose scenario/trigger/
|
|
// harm/zone text contains the token (case-insensitive substring).
|
|
func patternsMentioning(tok string, patterns []iace.HazardPattern) []string {
|
|
tokLower := strings.ToLower(tok)
|
|
seen := map[string]bool{}
|
|
var out []string
|
|
for _, p := range patterns {
|
|
hay := strings.ToLower(p.ScenarioDE + " " + p.TriggerDE + " " + p.HarmDE + " " + p.ZoneDE + " " + p.NameDE)
|
|
if !strings.Contains(hay, tokLower) {
|
|
continue
|
|
}
|
|
if seen[p.ID] {
|
|
continue
|
|
}
|
|
seen[p.ID] = true
|
|
out = append(out, p.ID)
|
|
if len(out) >= 8 {
|
|
break
|
|
}
|
|
}
|
|
return out
|
|
}
|