package audit

import (
	"regexp"
	"sort"
	"strings"
)

// runEchoImpl checks if each meaningful phrase from the limits-form is
// echoed by at least one generated hazard. A phrase that names a concrete
// scenario, fault, or constraint must reappear (semantically) in some
// hazard's name, scenario, or description. Phrases without echo are gaps:
// the engineer documented the risk but the engine never lifted it into
// the hazard register.
//
// Echo detection here is a lightweight Jaccard overlap of content tokens
// (not embeddings) — robust enough for the demonstrative diagnostic and
// keeps the audit fully deterministic without an external model. The
// caller can later swap in a vector-based scorer.
//
// init assigns runEcho to the runEchoImpl hook when the package is
// initialised. NOTE(review): runEchoImpl is declared outside this view;
// presumably it is a package-level function variable — confirm.
func init() {
	runEchoImpl = runEcho
}

// Significant limits-form fields. Each item is (key, label): key is the
// map key looked up in the limits form, label is the display name that
// runEcho copies into OrphanedPhrase.Field. We only
// audit the freeform fields where engineers describe risks — list/enum
// fields (operating_modes, person_groups, industry_sectors) are out of
// scope because they carry no narrative phrases.
var echoFields = []struct {
	key   string
	label string
}{
	{"general_description", "Allg. Beschreibung"},
	{"intended_purpose", "Bestimmungsgemaesse Verwendung"},
	{"variants", "Varianten"},
	{"foreseeable_misuses", "Vorhersehbare Fehlanwendung"},
	{"spatial_limits", "Raeumliche Grenzen"},
	{"temporal_limits", "Zeitliche Grenzen"},
	{"operating_conditions", "Betriebsbedingungen"},
	{"energy_supply", "Energieversorgung"},
	{"mechanical_interfaces", "Mechanische Schnittstellen"},
	{"electrical_interfaces", "Elektrische Schnittstellen"},
	{"software_interfaces", "Software-Schnittstellen"},
	{"pneumatic_hydraulic_interfaces", "Pneumatik/Hydraulik"},
	{"qualification_requirements", "Personenqualifikation"},
}

// sentenceSplit separates a freeform field into phrases: it matches a
// '.', '!' or '?' followed by whitespace, or a run of newlines.
var sentenceSplit = regexp.MustCompile(`[.!?]\s+|\n+`)

// wordRE matches candidate content words: runs of at least four letters
// drawn from ASCII a-z/A-Z plus the German ä, ö, ü, ß, Ä, Ö, Ü. Digits,
// hyphens and all other characters end a word.
var wordRE = regexp.MustCompile(`[a-zäöüßA-ZÄÖÜ]{4,}`)

// echoThreshold — minimum Jaccard overlap (between sentence content
// tokens and a hazard's content tokens) at or above which the sentence is
// considered echoed.
Tuned by hand to give meaningful results without a // labeled corpus; the audit reports the actual best score for each // orphaned phrase so a human can re-tune if needed. const echoThreshold = 0.18 func runEcho(form map[string]any, hazards []map[string]any) EchoReport { limits := unwrapLimits(form) // Precompute hazard token bags once type bag struct { tokens map[string]bool text string } var hazardBags []bag for _, h := range hazards { txt := joinHazardText(h) toks := contentTokenSet(txt) hazardBags = append(hazardBags, bag{tokens: toks, text: txt}) } report := EchoReport{} for _, fld := range echoFields { raw, _ := limits[fld.key].(string) raw = strings.TrimSpace(raw) if raw == "" { continue } for _, sent := range sentenceSplit.Split(raw, -1) { sent = strings.TrimSpace(sent) if len(sent) < 30 { // Skip very short fragments continue } report.TotalPhrases++ st := contentTokenSet(sent) if len(st) < 3 { continue } bestScore := 0.0 for _, hb := range hazardBags { score := jaccard(st, hb.tokens) if score > bestScore { bestScore = score } } if bestScore >= echoThreshold { report.Echoed++ continue } report.Orphaned++ report.OrphanedPhrases = append(report.OrphanedPhrases, OrphanedPhrase{ Field: fld.label, Phrase: sent, BestScore: bestScore, }) } } sort.Slice(report.OrphanedPhrases, func(i, j int) bool { // Lowest scores first — most clearly orphaned return report.OrphanedPhrases[i].BestScore < report.OrphanedPhrases[j].BestScore }) return report } func unwrapLimits(form map[string]any) map[string]any { if inner, ok := form["limits_form"].(map[string]any); ok { return inner } return form } func joinHazardText(h map[string]any) string { parts := []string{} for _, k := range []string{"name", "description", "scenario", "trigger_event", "possible_harm", "hazardous_zone", "category", "sub_category"} { if v, ok := h[k].(string); ok { parts = append(parts, v) } } return strings.Join(parts, " ") } func contentTokenSet(s string) map[string]bool { out := map[string]bool{} for _, m 
:= range wordRE.FindAllString(s, -1) { w := strings.ToLower(m) if stopWords[w] { continue } out[w] = true } return out } func jaccard(a, b map[string]bool) float64 { if len(a) == 0 || len(b) == 0 { return 0 } inter := 0 for x := range a { if b[x] { inter++ } } union := len(a) + len(b) - inter if union == 0 { return 0 } return float64(inter) / float64(union) }