f534b52817
Add cmd/iace-audit CLI with 5 deterministic methods that find engine gaps without ground truth: - A reachability: 1058 patterns vs achievable tag universe - B consistency: components vs their declared hazard categories - C vocabulary: limits-form tokens vs keyword dictionary - D echo: limits-form sentences vs generated hazards (jaccard) - E hierarchy: hazards vs ISO 12100 design/protection/info levels Library fixes triggered by A+B+C findings: - tag_resolver: synonym map for electrical/pneumatic/hydraulic aliases - component_library: crush_point + EN03 (gravitational) on C014/C128 (Hubwerk family) - fixes HP1014/1015/1017/1018 which were silently weakly_reachable. noise_source added on 7 components (C006/C011/ C017/C020/C031/C041/C096). electrical_part on 8 drive components (C031/C032/C033/C034/C035/C036/C037/C038/C077/C092). cyber tag on 10 sensors (C081-C090) + 3 IT components (C111/C112/C116) + KI module C119 (ai_model added). pneumatic_part+hydraulic_part on valves C091/C093, hydraulic_part+chemical_risk on pump C097, moving_part on motion controller C075 - keyword_dictionary: EN03 added to aufzug/lift/hubwerk/hubgeraet (was wrongly EN04-only). New keyword entries for hub-action verbs: absenken/senken/anheben/heben + hubhoehe/hubweg/hubgeschwindig Audit impact: - A: weakly_reachable 409 -> 358 (-51 patterns now fully reachable) - B: incomplete components 46 -> 30 (-16, -33%) - HP1018 (Person unter absenkendem Maschinenteil eingeklemmt): weakly_reachable -> reachable Why: methods A/B/C surfaced that the Kistenhubgeraet test project generated 0 crush-under-load hazards despite OSHA 1910.212(a)(3) + EN ISO 12100 6.3.5.5 explicitly requiring them. Three orthogonal bugs (missing crush_point tag, wrong energy source mapping, missing action verbs in dictionary) silently disabled the entire lift crush pattern family.
162 lines
4.5 KiB
Go
162 lines
4.5 KiB
Go
package audit
|
|
|
|
import (
|
|
"regexp"
|
|
"sort"
|
|
"strings"
|
|
)
|
|
|
|
// runEchoImpl checks if each meaningful phrase from the limits-form is
|
|
// echoed by at least one generated hazard. A phrase that names a concrete
|
|
// scenario, fault, or constraint must reappear (semantically) in some
|
|
// hazard's name, scenario, or description. Phrases without echo are gaps:
|
|
// the engineer documented the risk but the engine never lifted it into
|
|
// the hazard register.
|
|
//
|
|
// Echo detection here is a lightweight Jaccard overlap of content tokens
|
|
// (not embeddings) — robust enough for the demonstrative diagnostic and
|
|
// keeps the audit fully deterministic without an external model. The
|
|
// caller can later swap in a vector-based scorer.
|
|
func init() {
|
|
runEchoImpl = runEcho
|
|
}
|
|
|
|
// echoFields lists the limits-form fields that the echo audit inspects.
// key is the field name looked up in the limits-form map; label is the
// human-readable name attached to orphaned phrases in the report.
//
// Only the freeform fields in which engineers describe risks are audited.
// List/enum fields (operating_modes, person_groups, industry_sectors) are
// out of scope because they carry no narrative phrases.
var echoFields = []struct {
	key   string
	label string
}{
	{key: "general_description", label: "Allg. Beschreibung"},
	{key: "intended_purpose", label: "Bestimmungsgemaesse Verwendung"},
	{key: "variants", label: "Varianten"},
	{key: "foreseeable_misuses", label: "Vorhersehbare Fehlanwendung"},
	{key: "spatial_limits", label: "Raeumliche Grenzen"},
	{key: "temporal_limits", label: "Zeitliche Grenzen"},
	{key: "operating_conditions", label: "Betriebsbedingungen"},
	{key: "energy_supply", label: "Energieversorgung"},
	{key: "mechanical_interfaces", label: "Mechanische Schnittstellen"},
	{key: "electrical_interfaces", label: "Elektrische Schnittstellen"},
	{key: "software_interfaces", label: "Software-Schnittstellen"},
	{key: "pneumatic_hydraulic_interfaces", label: "Pneumatik/Hydraulik"},
	{key: "qualification_requirements", label: "Personenqualifikation"},
}
|
|
|
|
var (
	// sentenceSplit cuts freeform text into sentence-like fragments: at
	// '.', '!' or '?' followed by whitespace, or at one or more newlines.
	sentenceSplit = regexp.MustCompile(`[.!?]\s+|\n+`)

	// wordRE matches runs of at least four letters (ASCII letters plus the
	// German umlauts and sharp s); shorter words never become tokens.
	wordRE = regexp.MustCompile(`[a-zäöüßA-ZÄÖÜ]{4,}`)
)

// echoThreshold is the minimum Jaccard overlap between a sentence's content
// tokens and a hazard's content tokens at which the sentence counts as
// echoed. The value was tuned by hand, without a labeled corpus, to give
// meaningful results; the audit reports the actual best score for every
// orphaned phrase so that a human can re-tune it if needed.
const echoThreshold = 0.18
|
|
|
|
func runEcho(form map[string]any, hazards []map[string]any) EchoReport {
|
|
limits := unwrapLimits(form)
|
|
|
|
// Precompute hazard token bags once
|
|
type bag struct {
|
|
tokens map[string]bool
|
|
text string
|
|
}
|
|
var hazardBags []bag
|
|
for _, h := range hazards {
|
|
txt := joinHazardText(h)
|
|
toks := contentTokenSet(txt)
|
|
hazardBags = append(hazardBags, bag{tokens: toks, text: txt})
|
|
}
|
|
|
|
report := EchoReport{}
|
|
for _, fld := range echoFields {
|
|
raw, _ := limits[fld.key].(string)
|
|
raw = strings.TrimSpace(raw)
|
|
if raw == "" {
|
|
continue
|
|
}
|
|
for _, sent := range sentenceSplit.Split(raw, -1) {
|
|
sent = strings.TrimSpace(sent)
|
|
if len(sent) < 30 {
|
|
// Skip very short fragments
|
|
continue
|
|
}
|
|
report.TotalPhrases++
|
|
st := contentTokenSet(sent)
|
|
if len(st) < 3 {
|
|
continue
|
|
}
|
|
bestScore := 0.0
|
|
for _, hb := range hazardBags {
|
|
score := jaccard(st, hb.tokens)
|
|
if score > bestScore {
|
|
bestScore = score
|
|
}
|
|
}
|
|
if bestScore >= echoThreshold {
|
|
report.Echoed++
|
|
continue
|
|
}
|
|
report.Orphaned++
|
|
report.OrphanedPhrases = append(report.OrphanedPhrases, OrphanedPhrase{
|
|
Field: fld.label,
|
|
Phrase: sent,
|
|
BestScore: bestScore,
|
|
})
|
|
}
|
|
}
|
|
|
|
sort.Slice(report.OrphanedPhrases, func(i, j int) bool {
|
|
// Lowest scores first — most clearly orphaned
|
|
return report.OrphanedPhrases[i].BestScore < report.OrphanedPhrases[j].BestScore
|
|
})
|
|
return report
|
|
}
|
|
|
|
// unwrapLimits returns the map that holds the limits-form fields. When form
// carries a nested map under the "limits_form" key, that nested map is
// returned; in every other case (key absent, or its value is not a
// map[string]any) form itself is returned unchanged.
func unwrapLimits(form map[string]any) map[string]any {
	switch nested := form["limits_form"].(type) {
	case map[string]any:
		return nested
	default:
		return form
	}
}
|
|
|
|
// joinHazardText concatenates the textual fields of a hazard into a single
// space-separated string. The fields are visited in a fixed order; a field
// that is absent or not a string is skipped, while an empty string still
// takes part in the join (and therefore still contributes a separator).
func joinHazardText(h map[string]any) string {
	fields := [...]string{
		"name",
		"description",
		"scenario",
		"trigger_event",
		"possible_harm",
		"hazardous_zone",
		"category",
		"sub_category",
	}

	var sb strings.Builder
	wrote := false
	for _, key := range fields {
		text, isString := h[key].(string)
		if !isString {
			continue
		}
		if wrote {
			sb.WriteByte(' ')
		}
		sb.WriteString(text)
		wrote = true
	}
	return sb.String()
}
|
|
|
|
func contentTokenSet(s string) map[string]bool {
|
|
out := map[string]bool{}
|
|
for _, m := range wordRE.FindAllString(s, -1) {
|
|
w := strings.ToLower(m)
|
|
if stopWords[w] {
|
|
continue
|
|
}
|
|
out[w] = true
|
|
}
|
|
return out
|
|
}
|
|
|
|
// jaccard returns the Jaccard similarity |a ∩ b| / |a ∪ b| of two token
// sets, a value in [0, 1]. It returns 0 when either set is empty.
//
// An element counts as shared when it is a key of a and maps to true in b.
func jaccard(a, b map[string]bool) float64 {
	sizeA, sizeB := len(a), len(b)
	if sizeA == 0 || sizeB == 0 {
		return 0
	}

	shared := 0
	for tok := range a {
		if !b[tok] {
			continue
		}
		shared++
	}

	// shared never exceeds sizeA, so the union size is at least sizeB >= 1
	// and the division below is always defined.
	return float64(shared) / float64(sizeA+sizeB-shared)
}
|