Files
breakpilot-compliance/ai-compliance-sdk/internal/iace/narrative_parser.go
T
Benjamin Admin afb3f83f30 feat(iace): cross-domain precision overhaul + component review + schema reconcile
Engine precision (stop foreign-machine patterns leaking into a project):
- Wire project.MachineType into the engine machine-type gate (empty input no
  longer fires every machine class — press/cnc/excavator/crane/medical...).
- Capability-domain gating extended by 7 domains (outdoor, ventilation,
  machining, bulk, palletizer, playground, fitness) so domain-specific hazards
  only fire when the narrative names that domain; emitted via keyword_dictionary.
- Relevance backstop moved into iace (single gating contract, testable), and its
  dominant false-anchor class removed (a long pattern word no longer matches a
  short common token; prepositions/leitung added to the generic stoplist).
- New guard tests: TestCrossDomainPrecision (full pipeline, 0 foreign per GT) and
  TestPatternReachability now asserts 0 dead patterns. Both GTs keep coverage 1.0.

Reachability fix: the 51 dead patterns required electrical/pneumatic/hydraulic
tags nothing produced — renamed to the canonical electrical_energy/
pneumatic_pressure/hydraulic_pressure/hydraulic_part.

Component review (negation is best-effort + expert-correctable):
- Parser surfaces negated components (ComponentMatch.Negated) instead of dropping
  them; negated contribute no tags/energy → no phantom hazards.
- presence_status (vorhanden|nicht_vorhanden|geloescht) + ce_marked on components;
  only `vorhanden` feed matching. CE+safety-relevant flags the PL/SIL obligation.
- Force re-seed preserves the expert's component decisions instead of wiping them.
- Tag-based component→hazard assignment (was: all on the first component).
- Negation-aware narrative parsing ("keine Pneumatik" no longer extracts it).

Local-dev DB: ai-sdk sets search_path=compliance,core,public; reconcile migrations
152-156 bring the consolidated local iace tables to the current schema + add the
presence_status/ce_marked columns. Machine-type vocabulary endpoint for the form.

[migration-approved]

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-10 17:15:55 +02:00

386 lines
12 KiB
Go

package iace
import (
	"regexp"
	"sort"
	"strconv"
	"strings"
)
// ComponentMatch is one component-library entry that ParseNarrative
// recognised in a narrative.
type ComponentMatch struct {
	// LibraryID is the ID of the matched component-library entry.
	LibraryID string `json:"library_id"`
	// NameDE is the German name taken from the library entry.
	NameDE string `json:"name_de"`
	// MatchedOn is the dictionary keyword that triggered the match.
	MatchedOn string `json:"matched_on"`
	// Tags are the tags of the library entry.
	Tags []string `json:"tags"`
	// Confidence is the per-match confidence score.
	Confidence float64 `json:"confidence"`
	// Negated is a best-effort verdict: the keyword was only found inside a
	// negated clause (e.g. "keine Pneumatik"). Such components are still
	// reported so an expert can review them, but they do NOT feed tags or
	// energy sources into pattern matching.
	Negated bool `json:"negated"`
}
// EnergyMatch is one energy source that was recognised in a narrative, either
// through a dictionary keyword or through a technical specification.
type EnergyMatch struct {
	// SourceID is the ID of the energy source.
	SourceID string `json:"source_id"`
	// NameDE is the German display name.
	NameDE string `json:"name_de"`
	// MatchedOn is the keyword or raw spec text that produced the match.
	MatchedOn string `json:"matched_on"`
	// Value is an optional magnitude, e.g. "20000 kN" or "400 V".
	Value string `json:"value,omitempty"`
	// Severity is the derived severity on a 1-5 scale.
	Severity int `json:"severity"`
}
// TechSpec is a single number-plus-unit specification pulled out of the
// narrative text.
type TechSpec struct {
	// Value is the parsed numeric magnitude.
	Value float64 `json:"value"`
	// Unit is the unit exactly as captured by the pattern (e.g. "kN").
	Unit string `json:"unit"`
	// Raw is the complete matched text, number and unit together.
	Raw string `json:"raw"`
}
// ParseResult bundles everything ParseNarrative extracted from one machine
// narrative.
type ParseResult struct {
	// Components are the recognised library components (negated ones included).
	Components []ComponentMatch `json:"components"`
	// EnergySources are the energy sources found via keywords or tech specs.
	EnergySources []EnergyMatch `json:"energy_sources"`
	// LifecyclePhases are the lifecycle phase IDs mentioned in the text.
	LifecyclePhases []string `json:"lifecycle_phases"`
	// Roles are the role IDs mentioned in the text.
	Roles []string `json:"roles"`
	// CustomTags is the union of all tags collected while parsing.
	CustomTags []string `json:"custom_tags"`
	// TechSpecs are the numeric specifications found in the text.
	TechSpecs []TechSpec `json:"tech_specs"`
	// Confidence is the overall confidence of the parse, at most 1.0.
	Confidence float64 `json:"confidence"`
	// OperationalStates are the operational state IDs mentioned in the text.
	OperationalStates []string `json:"operational_states,omitempty"`
	// StateTransitions are the "from→to" transitions mentioned in the text.
	StateTransitions []string `json:"state_transitions,omitempty"`
}
// techSpecPattern matches a numeric value followed by an engineering unit.
//
// Capture group 1 is the number (digits with optional "." / "," separators),
// capture group 2 is the unit exactly as listed in the alternation.
//
// The trailing \b (ASCII word boundary) keeps a unit from matching as the
// prefix of an ordinary word: without it "100 Ventile" is read as 100 V,
// "5 kWh" as 5 kW and "500 kNm" as 500 kN. The optional, non-capturing
// "olt"/"AC"/"DC" suffix keeps the common spellings "400 Volt", "230VAC" and
// "24VDC" matching; the suffix becomes part of the whole match but not of
// the captured unit.
var techSpecPattern = regexp.MustCompile(`(\d[\d.,]*)\s*(kN|Tonnen|tonnen|kJ|kW|MW|V|kV|Hz|°C|bar|mm|m³/h|/min|U/min|rpm|m/s)(?:olt|AC|DC)?\b`)
// lifecycleKeywords maps German keyword stems to lifecycle phase IDs.
// ParseNarrative looks the keys up as substrings of the lower-cased,
// umlaut-folded narrative, which is why most keys are word stems.
var lifecycleKeywords = map[string]string{
	// Production
	"betrieb":          "normal_operation",
	"normalbetrieb":    "normal_operation",
	"automatikbetrieb": "auto_operation",

	// Setup and changeover
	"einricht":        "setup",
	"umruest":         "changeover",
	"werkzeugwechsel": "changeover",

	// Maintenance, repair, cleaning
	"wartung":     "maintenance",
	"instandhalt": "maintenance",
	"instandsetz": "repair",
	"reparatur":   "repair",
	"reinig":      "cleaning",

	// Fault clearing
	"stoerungsbeseitig": "fault_clearing",
	"stoerung":          "fault_clearing",
	"fehlersuche":       "fault_clearing",
	"klemm":             "fault_clearing",
	"blockier":          "fault_clearing",
	"stau":              "fault_clearing",

	// Transport, assembly, commissioning
	"transport":      "transport",
	"montage":        "assembly",
	"installation":   "assembly",
	"inbetriebnahme": "commissioning",

	// End of life
	"ausserbetriebnahme": "decommissioning",
	"demontage":          "disposal",
	"entsorgung":         "disposal",
}
// roleKeywords maps German keyword stems to role IDs. ParseNarrative looks
// the keys up as substrings of the lower-cased, umlaut-folded narrative.
var roleKeywords = map[string]string{
	// Operating staff
	"bedienpersonal": "operator",
	"bediener":       "operator",
	"werker":         "operator",
	"einrichter":     "setup_personnel",
	"programmierer":  "programmer",

	// Technical staff
	"instandhalt":      "maintenance_tech",
	"wartungspersonal": "maintenance_tech",
	"elektrofachkraft": "electrical_tech",

	// Supervision and support
	"aufsichtsperson":    "supervisor",
	"reinigungspersonal": "cleaning_staff",

	// External or inexperienced persons
	"besucher":     "visitor",
	"fremdfirma":   "contractor",
	"auszubildend": "trainee",
	"leiharbeiter": "temp_worker",
}
// operationalStateKeywords maps German keyword stems to operational state
// IDs. ParseNarrative looks the keys up as substrings of the lower-cased,
// umlaut-folded narrative.
var operationalStateKeywords = map[string]string{
	// Start-up and homing
	"hochfahren":    "startup",
	"anlauf":        "startup",
	"anfahren":      "startup",
	"referenzfahrt": "homing",
	"referenzpunkt": "homing",

	// Automatic vs. manual operation
	"automatikbetrieb": "automatic_operation",
	"automatisch":      "automatic_operation",
	"handbetrieb":      "manual_operation",
	"manuell":          "manual_operation",
	"tippbetrieb":      "manual_operation",

	// Teaching / programming
	"teach":           "teach_mode",
	"einrichtbetrieb": "teach_mode",
	"programmier":     "teach_mode",

	// Service
	"wartung":        "maintenance",
	"instandhaltung": "maintenance",
	"reinigung":      "cleaning",

	// Emergency stop and recovery
	"not-halt":             "emergency_stop",
	"nothalt":              "emergency_stop",
	"notabschaltung":       "emergency_stop",
	"wiederanlauf":         "recovery_mode",
	"wiederinbetriebnahme": "recovery_mode",
	"quittier":             "recovery_mode",
}
// stateTransitionKeywords maps German phrases to state transitions written
// as "from→to". ParseNarrative looks the keys up as substrings of the
// lower-cased, umlaut-folded narrative.
var stateTransitionKeywords = map[string]string{
	"automatischer anlauf":      "startup→automatic_operation",
	"betriebsartwechsel":        "manual_operation→automatic_operation",
	"unerwarteter wiederanlauf": "maintenance→automatic_operation",
	"wiederanlauf nach not":     "emergency_stop→recovery_mode",
}
// narrativeFolder rewrites German umlauts and ß to ASCII digraphs so that the
// narrative and the dictionary keywords compare equal regardless of spelling
// ("Störung" vs. "Stoerung").
var narrativeFolder = strings.NewReplacer("ä", "ae", "ö", "oe", "ü", "ue", "ß", "ss")

// foldNarrativeText lower-cases s and folds umlauts/ß to ASCII digraphs.
func foldNarrativeText(s string) string {
	return narrativeFolder.Replace(strings.ToLower(s))
}

// pickNarrativeKeyword selects the keyword that represents a dictionary entry
// in the folded text lower. A keyword with at least one un-negated occurrence
// wins; otherwise the first keyword that occurs at all is returned with
// negated=true. ok is false when none of the keywords occurs.
func pickNarrativeKeyword(lower string, keywords []string) (kw string, negated, ok bool) {
	for _, cand := range keywords {
		norm := foldNarrativeText(cand)
		if !strings.Contains(lower, norm) {
			continue
		}
		if hasUnnegatedOccurrence(lower, norm) {
			return cand, false, true
		}
		if !ok {
			// Remember the first (negated) hit as a fallback, but keep
			// looking: another keyword of the same entry may be positive.
			kw, negated, ok = cand, true, true
		}
	}
	return kw, negated, ok
}

// sortedNarrativeKeys returns the keys of set in ascending order, or nil for
// an empty set (so an empty result still encodes the same way as before).
func sortedNarrativeKeys(set map[string]bool) []string {
	if len(set) == 0 {
		return nil
	}
	keys := make([]string, 0, len(set))
	for k := range set {
		keys = append(keys, k)
	}
	sort.Strings(keys)
	return keys
}

// narrativeKeywordHits returns the distinct, sorted IDs of all keywords that
// occur as substrings of the folded text lower.
func narrativeKeywordHits(lower string, keywords map[string]string) []string {
	hits := make(map[string]bool)
	for kw, id := range keywords {
		if strings.Contains(lower, foldNarrativeText(kw)) {
			hits[id] = true
		}
	}
	return sortedNarrativeKeys(hits)
}

// ParseNarrative extracts components, energy sources, lifecycle phases,
// roles, operational states, state transitions, tags and technical
// specifications from a machine description. It needs no LLM and is
// deterministic: components, energy sources and tech specs are reported in
// dictionary/text order, all other lists are sorted.
//
// machineType is optional. When a non-empty type is given, dictionary entries
// that are restricted to machine types are only considered if that type is in
// their list; only the first variadic value is used.
//
// Negation is best-effort: a component whose keywords only occur in negated
// clauses ("keine Pneumatik") is reported with Negated=true and contributes no
// tags or energy sources. As soon as any keyword, of the same or of another
// dictionary entry, names the component positively, it counts as present.
func ParseNarrative(text string, machineType ...string) ParseResult {
	result := ParseResult{}
	if text == "" {
		return result
	}

	lower := foldNarrativeText(text)

	// 1. Technical specifications are read from the raw text because the
	// unit spellings are case-sensitive.
	result.TechSpecs = extractTechSpecs(text)

	// 2. Dictionary keywords → components, energy sources, tags.
	compLib := GetComponentLibrary()
	compMap := make(map[string]ComponentLibraryEntry, len(compLib))
	for _, c := range compLib {
		compMap[c.ID] = c
	}
	componentAt := make(map[string]int) // component ID → index in result.Components
	seenEnergy := make(map[string]bool)
	tagSet := make(map[string]bool)

	var mType string
	if len(machineType) > 0 {
		mType = machineType[0]
	}

	for _, entry := range GetKeywordDictionary() {
		// Skip entries that are restricted to other machine types.
		if mType != "" && len(entry.MachineTypes) > 0 {
			allowed := false
			for _, mt := range entry.MachineTypes {
				if mt == mType {
					allowed = true
					break
				}
			}
			if !allowed {
				continue
			}
		}

		kw, negated, ok := pickNarrativeKeyword(lower, entry.Keywords)
		if !ok {
			continue
		}

		for _, cid := range entry.ComponentIDs {
			idx, known := componentAt[cid]
			switch {
			case !known:
				comp := compMap[cid]
				componentAt[cid] = len(result.Components)
				result.Components = append(result.Components, ComponentMatch{
					LibraryID:  cid,
					NameDE:     comp.NameDE,
					MatchedOn:  kw,
					Tags:       comp.Tags,
					Confidence: 0.8,
					Negated:    negated,
				})
				if !negated {
					for _, t := range comp.Tags {
						tagSet[t] = true
					}
				}
			case !negated && result.Components[idx].Negated:
				// The component was first seen in a negated clause but is
				// now named positively: treat it as present.
				hit := &result.Components[idx]
				hit.Negated = false
				hit.MatchedOn = kw
				for _, t := range hit.Tags {
					tagSet[t] = true
				}
			}
		}

		if negated {
			// Negated entries must not create phantom hazards.
			continue
		}
		for _, eid := range entry.EnergyIDs {
			if seenEnergy[eid] {
				continue
			}
			seenEnergy[eid] = true
			result.EnergySources = append(result.EnergySources, EnergyMatch{
				SourceID:  eid,
				NameDE:    eid, // Will be enriched by caller
				MatchedOn: kw,
			})
		}
		for _, t := range entry.ExtraTags {
			tagSet[t] = true
		}
	}

	// 3. Derive further energy sources and tags from the tech specs.
	for _, spec := range result.TechSpecs {
		deriveEnergyFromSpec(spec, &result, seenEnergy, tagSet)
	}

	// 4.-7. Keyword tables. Map iteration order is random, so every list is
	// sorted to keep the output stable between calls.
	result.LifecyclePhases = narrativeKeywordHits(lower, lifecycleKeywords)
	result.Roles = narrativeKeywordHits(lower, roleKeywords)
	result.OperationalStates = narrativeKeywordHits(lower, operationalStateKeywords)
	result.StateTransitions = narrativeKeywordHits(lower, stateTransitionKeywords)

	// 8. All collected tags, sorted.
	result.CustomTags = sortedNarrativeKeys(tagSet)

	// 9. Overall confidence: component count normalised so that 15
	// components (or more) give 1.0.
	if n := len(result.Components); n > 0 {
		result.Confidence = float64(n) / 15.0
		if result.Confidence > 1.0 {
			result.Confidence = 1.0
		}
	}
	return result
}
// extractTechSpecs returns every number-plus-unit specification that
// techSpecPattern finds in text, in order of appearance. Matches whose
// number cannot be parsed are skipped (best-effort extraction). The result is
// nil when nothing was found.
func extractTechSpecs(text string) []TechSpec {
	var specs []TechSpec
	for _, m := range techSpecPattern.FindAllStringSubmatch(text, -1) {
		val, err := parseTechSpecNumber(m[1])
		if err != nil {
			// Not a well-formed number (e.g. "1.2.3"): ignore this match.
			continue
		}
		specs = append(specs, TechSpec{
			Value: val,
			Unit:  m[2],
			Raw:   m[0],
		})
	}
	return specs
}

// parseTechSpecNumber converts a captured number to a float. German notation
// is the default ("20.000" = 20000, "7,5" = 7.5, "1.234,5" = 1234.5), but a
// dot is only treated as a thousands separator when it really groups three
// digits, so "6.5" stays 6.5 instead of becoming 65. English "1,234.5" is
// understood as well.
func parseTechSpecNumber(raw string) (float64, error) {
	// The pattern may swallow sentence punctuation after the digits.
	num := strings.TrimRight(raw, ".,")
	dots := strings.Count(num, ".")
	commas := strings.Count(num, ",")

	switch {
	case dots > 0 && commas > 0:
		// Both separators: whichever comes last is the decimal mark.
		if strings.LastIndex(num, ",") > strings.LastIndex(num, ".") {
			num = strings.ReplaceAll(num, ".", "")
			num = strings.Replace(num, ",", ".", 1)
		} else {
			num = strings.ReplaceAll(num, ",", "")
		}
	case commas == 1:
		// German decimal comma.
		num = strings.Replace(num, ",", ".", 1)
	case commas > 1:
		if isThousandsGrouped(num, ",") {
			num = strings.ReplaceAll(num, ",", "")
		}
	case dots > 0:
		if isThousandsGrouped(num, ".") {
			num = strings.ReplaceAll(num, ".", "")
		}
		// Otherwise a single dot is a decimal point; several irregular
		// dots are left in place and rejected by ParseFloat.
	}
	return strconv.ParseFloat(num, 64)
}

// isThousandsGrouped reports whether num is written as 1-3 leading digits
// (not starting with 0) followed by one or more sep-separated groups of
// exactly three digits, e.g. "20.000" or "1.234.567".
func isThousandsGrouped(num, sep string) bool {
	groups := strings.Split(num, sep)
	if len(groups) < 2 {
		return false
	}
	head := groups[0]
	if len(head) == 0 || len(head) > 3 || head[0] == '0' {
		return false
	}
	for _, g := range groups[1:] {
		if len(g) != 3 {
			return false
		}
	}
	return true
}
// deriveEnergyFromSpec translates one technical specification into energy
// sources and severity tags. Depending on the unit and on fixed thresholds it
// registers an energy source via addEnergy (with the raw spec text as the
// match) and sets tags in tags. Specs with other units, or below the
// thresholds, change nothing.
//
//   - kN / Tonnen: > 100 → EN01 + high_force; > 1000 also crush_point
//   - V / kV: >= 400 V or any kV → EN05 + high_voltage;
//     50 V up to below 400 V → EN05 + electrical_part
//   - °C: > 60 → EN06 + high_temperature; > 100 also thermal_accumulation
//   - bar: > 10 → EN07 + high_pressure
//   - kW / MW: > 1 kW → EN02 + rotating_part (MW is converted to kW first)
//   - /min, U/min, rpm: > 100 → EN02 + rotating_part; > 500 also high_speed
//   - kJ: > 10 → EN03 + stored_energy
func deriveEnergyFromSpec(spec TechSpec, result *ParseResult, seen map[string]bool, tags map[string]bool) {
	switch spec.Unit {
	case "kN", "Tonnen", "tonnen":
		if spec.Value > 100 {
			addEnergy(result, seen, "EN01", spec.Raw)
			tags["high_force"] = true
			if spec.Value > 1000 {
				tags["crush_point"] = true
			}
		}
	case "V", "kV":
		switch {
		case spec.Value >= 400 || spec.Unit == "kV":
			addEnergy(result, seen, "EN05", spec.Raw)
			tags["high_voltage"] = true
		case spec.Value >= 50:
			addEnergy(result, seen, "EN05", spec.Raw)
			tags["electrical_part"] = true
		}
	case "°C":
		if spec.Value > 60 {
			addEnergy(result, seen, "EN06", spec.Raw)
			tags["high_temperature"] = true
			if spec.Value > 100 {
				tags["thermal_accumulation"] = true
			}
		}
	case "bar":
		if spec.Value > 10 {
			addEnergy(result, seen, "EN07", spec.Raw)
			tags["high_pressure"] = true
		}
	case "kW", "MW":
		// Compare in kW: without the conversion "0.5 MW" (500 kW) would
		// fall below the 1 kW threshold.
		kw := spec.Value
		if spec.Unit == "MW" {
			kw *= 1000
		}
		if kw > 1 {
			addEnergy(result, seen, "EN02", spec.Raw)
			tags["rotating_part"] = true
		}
	case "/min", "U/min", "rpm":
		if spec.Value > 100 {
			addEnergy(result, seen, "EN02", spec.Raw)
			tags["rotating_part"] = true
			if spec.Value > 500 {
				tags["high_speed"] = true
			}
		}
	case "kJ":
		if spec.Value > 10 {
			addEnergy(result, seen, "EN03", spec.Raw)
			tags["stored_energy"] = true
		}
	}
}
// addEnergy appends an energy source with the given id to
// result.EnergySources unless seen already contains that id; the id is
// recorded in seen so each source is added at most once. matchedOn is stored
// as the text that produced the match.
func addEnergy(result *ParseResult, seen map[string]bool, id, matchedOn string) {
	if seen[id] {
		return
	}
	seen[id] = true
	match := EnergyMatch{SourceID: id, MatchedOn: matchedOn}
	result.EnergySources = append(result.EnergySources, match)
}