diff --git a/ai-compliance-sdk/internal/ucca/pattern_loader.go b/ai-compliance-sdk/internal/ucca/pattern_loader.go
new file mode 100644
index 0000000..b3f9d9d
--- /dev/null
+++ b/ai-compliance-sdk/internal/ucca/pattern_loader.go
@@ -0,0 +1,260 @@
+package ucca
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"runtime"
+	"strings"
+
+	"gopkg.in/yaml.v3"
+)
+
+// ControlPattern represents a reusable control pattern template.
+// Pattern ID format: CP-{DOMAIN}-{NNN} (e.g. CP-AUTH-001).
+type ControlPattern struct {
+	// Identity and taxonomy.
+	ID       string `yaml:"id" json:"id"`             // unique pattern ID, e.g. "CP-AUTH-001"
+	Name     string `yaml:"name" json:"name"`         // machine-readable snake_case name
+	NameDE   string `yaml:"name_de" json:"name_de"`   // human-readable German name
+	Domain   string `yaml:"domain" json:"domain"`     // domain code, e.g. "AUTH", "CRYP", "DATA"
+	Category string `yaml:"category" json:"category"` // functional category, e.g. "authentication"
+
+	// Description and the *_template fields instantiated per obligation;
+	// templates may contain {placeholder} / {placeholder:default} markers.
+	Description           string   `yaml:"description" json:"description"`
+	ObjectiveTemplate     string   `yaml:"objective_template" json:"objective_template"`
+	RationaleTemplate     string   `yaml:"rationale_template" json:"rationale_template"`
+	RequirementsTemplate  []string `yaml:"requirements_template" json:"requirements_template"`
+	TestProcedureTemplate []string `yaml:"test_procedure_template" json:"test_procedure_template"`
+	EvidenceTemplate      []string `yaml:"evidence_template" json:"evidence_template"`
+
+	// Defaults applied when a control is generated from this pattern.
+	SeverityDefault             string `yaml:"severity_default" json:"severity_default"`                                               // one of: low, medium, high, critical
+	ImplementationEffortDefault string `yaml:"implementation_effort_default,omitempty" json:"implementation_effort_default,omitempty"` // one of: s, m, l, xl
+
+	// Cross-references and matching metadata.
+	OpenAnchorRefs          []AnchorRef `yaml:"open_anchor_refs,omitempty" json:"open_anchor_refs,omitempty"` // links to open frameworks (e.g. NIST, OWASP)
+	ObligationMatchKeywords []string    `yaml:"obligation_match_keywords" json:"obligation_match_keywords"`   // lower-case keywords for obligation matching
+	Tags                    []string    `yaml:"tags" json:"tags"`
+	ComposableWith          []string    `yaml:"composable_with,omitempty" json:"composable_with,omitempty"` // IDs of patterns that combine well with this one
+}
+
+// AnchorRef links a pattern to an open-source framework reference.
+type AnchorRef struct { + Framework string `yaml:"framework" json:"framework"` + Ref string `yaml:"ref" json:"ref"` +} + +// patternFile is the top-level YAML structure. +type patternFile struct { + Version string `yaml:"version"` + Description string `yaml:"description"` + Patterns []ControlPattern `yaml:"patterns"` +} + +// ControlPatternIndex provides fast lookup of control patterns. +type ControlPatternIndex struct { + ByID map[string]*ControlPattern + ByDomain map[string][]*ControlPattern + ByCategory map[string][]*ControlPattern + ByTag map[string][]*ControlPattern + ByKeyword map[string][]*ControlPattern // keyword -> patterns (for obligation matching) + All []*ControlPattern +} + +// LoadControlPatterns loads all YAML pattern files from the control_patterns directory. +func LoadControlPatterns() (*ControlPatternIndex, error) { + dir, err := findPatternsDir() + if err != nil { + return nil, err + } + + entries, err := os.ReadDir(dir) + if err != nil { + return nil, fmt.Errorf("failed to read patterns directory: %w", err) + } + + var allPatterns []ControlPattern + for _, entry := range entries { + if entry.IsDir() { + continue + } + name := entry.Name() + if strings.HasPrefix(name, "_") { + continue // skip schema and metadata files + } + if !strings.HasSuffix(name, ".yaml") && !strings.HasSuffix(name, ".yml") { + continue + } + + data, err := os.ReadFile(filepath.Join(dir, name)) + if err != nil { + return nil, fmt.Errorf("failed to read %s: %w", name, err) + } + + var pf patternFile + if err := yaml.Unmarshal(data, &pf); err != nil { + return nil, fmt.Errorf("failed to parse %s: %w", name, err) + } + + allPatterns = append(allPatterns, pf.Patterns...) 
+ } + + if len(allPatterns) == 0 { + return nil, fmt.Errorf("no control patterns found in %s", dir) + } + + idx, err := buildPatternIndex(allPatterns) + if err != nil { + return nil, err + } + + return idx, nil +} + +func findPatternsDir() (string, error) { + candidates := []string{ + "policies/control_patterns", + "../policies/control_patterns", + "../../policies/control_patterns", + } + + _, filename, _, ok := runtime.Caller(0) + if ok { + srcDir := filepath.Dir(filename) + candidates = append(candidates, + filepath.Join(srcDir, "../../policies/control_patterns"), + ) + } + + for _, p := range candidates { + abs, err := filepath.Abs(p) + if err != nil { + continue + } + info, err := os.Stat(abs) + if err == nil && info.IsDir() { + return abs, nil + } + } + + return "", fmt.Errorf("control_patterns directory not found in any candidate path") +} + +func buildPatternIndex(patterns []ControlPattern) (*ControlPatternIndex, error) { + idx := &ControlPatternIndex{ + ByID: make(map[string]*ControlPattern), + ByDomain: make(map[string][]*ControlPattern), + ByCategory: make(map[string][]*ControlPattern), + ByTag: make(map[string][]*ControlPattern), + ByKeyword: make(map[string][]*ControlPattern), + } + + for i := range patterns { + p := &patterns[i] + + // Validate ID uniqueness + if _, exists := idx.ByID[p.ID]; exists { + return nil, fmt.Errorf("duplicate pattern ID: %s", p.ID) + } + + idx.ByID[p.ID] = p + idx.ByDomain[p.Domain] = append(idx.ByDomain[p.Domain], p) + idx.ByCategory[p.Category] = append(idx.ByCategory[p.Category], p) + idx.All = append(idx.All, p) + + for _, tag := range p.Tags { + idx.ByTag[tag] = append(idx.ByTag[tag], p) + } + + for _, kw := range p.ObligationMatchKeywords { + lower := strings.ToLower(kw) + idx.ByKeyword[lower] = append(idx.ByKeyword[lower], p) + } + } + + return idx, nil +} + +// GetPattern returns a pattern by its ID (e.g. "CP-AUTH-001"). 
+func (idx *ControlPatternIndex) GetPattern(id string) (*ControlPattern, bool) { + p, ok := idx.ByID[strings.ToUpper(id)] + return p, ok +} + +// GetPatternsByDomain returns all patterns for a domain (e.g. "AUTH"). +func (idx *ControlPatternIndex) GetPatternsByDomain(domain string) []*ControlPattern { + return idx.ByDomain[strings.ToUpper(domain)] +} + +// GetPatternsByCategory returns all patterns for a category (e.g. "authentication"). +func (idx *ControlPatternIndex) GetPatternsByCategory(category string) []*ControlPattern { + return idx.ByCategory[strings.ToLower(category)] +} + +// GetPatternsByTag returns all patterns with a given tag. +func (idx *ControlPatternIndex) GetPatternsByTag(tag string) []*ControlPattern { + return idx.ByTag[strings.ToLower(tag)] +} + +// MatchByKeywords returns patterns whose obligation_match_keywords overlap with +// the given text. Returns matches sorted by score (number of keyword hits) descending. +func (idx *ControlPatternIndex) MatchByKeywords(text string) []PatternMatch { + textLower := strings.ToLower(text) + scores := make(map[string]int) + + for kw, patterns := range idx.ByKeyword { + if strings.Contains(textLower, kw) { + for _, p := range patterns { + scores[p.ID]++ + } + } + } + + if len(scores) == 0 { + return nil + } + + // Collect and sort by score descending + matches := make([]PatternMatch, 0, len(scores)) + for id, score := range scores { + p := idx.ByID[id] + matches = append(matches, PatternMatch{ + Pattern: p, + KeywordHits: score, + TotalKeywords: len(p.ObligationMatchKeywords), + }) + } + + // Simple insertion sort (small N) + for i := 1; i < len(matches); i++ { + for j := i; j > 0 && matches[j].KeywordHits > matches[j-1].KeywordHits; j-- { + matches[j], matches[j-1] = matches[j-1], matches[j] + } + } + + return matches +} + +// PatternMatch represents a keyword-based match result. 
+type PatternMatch struct { + Pattern *ControlPattern + KeywordHits int + TotalKeywords int +} + +// Score returns the match score as a ratio of hits to total keywords. +func (m PatternMatch) Score() float64 { + if m.TotalKeywords == 0 { + return 0 + } + return float64(m.KeywordHits) / float64(m.TotalKeywords) +} + +// ValidatePatternID checks if a pattern ID exists in the index. +func (idx *ControlPatternIndex) ValidatePatternID(id string) bool { + _, ok := idx.ByID[strings.ToUpper(id)] + return ok +} + +// Domains returns the list of unique domains that have patterns. +func (idx *ControlPatternIndex) Domains() []string { + domains := make([]string, 0, len(idx.ByDomain)) + for d := range idx.ByDomain { + domains = append(domains, d) + } + return domains +} diff --git a/ai-compliance-sdk/internal/ucca/pattern_loader_test.go b/ai-compliance-sdk/internal/ucca/pattern_loader_test.go new file mode 100644 index 0000000..743b779 --- /dev/null +++ b/ai-compliance-sdk/internal/ucca/pattern_loader_test.go @@ -0,0 +1,384 @@ +package ucca + +import ( + "strings" + "testing" +) + +func TestLoadControlPatterns_ValidFiles(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Expected no error, got %v", err) + } + if idx == nil { + t.Fatal("Expected non-nil index") + } + if len(idx.All) != 50 { + t.Errorf("Expected 50 patterns, got %d", len(idx.All)) + } +} + +func TestLoadControlPatterns_NoDuplicateIDs(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + seen := make(map[string]bool) + for _, p := range idx.All { + if seen[p.ID] { + t.Errorf("Duplicate pattern ID: %s", p.ID) + } + seen[p.ID] = true + } +} + +func TestControlPatternIndex_GetPattern(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + tests := []struct { + name string + id string + expected bool + }{ + {"existing pattern CP-AUTH-001", 
"CP-AUTH-001", true}, + {"existing pattern CP-CRYP-001", "CP-CRYP-001", true}, + {"lowercase lookup", "cp-auth-001", true}, + {"non-existing pattern", "CP-FAKE-999", false}, + {"empty id", "", false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p, ok := idx.GetPattern(tt.id) + if ok != tt.expected { + t.Errorf("GetPattern(%q): expected found=%v, got found=%v", tt.id, tt.expected, ok) + } + if ok && p.ID == "" { + t.Error("Pattern found but has empty ID") + } + }) + } +} + +func TestControlPatternIndex_GetPatternsByDomain(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + tests := []struct { + domain string + minCount int + }{ + {"AUTH", 3}, + {"CRYP", 3}, + {"DATA", 5}, + {"SEC", 3}, + {"COMP", 5}, + {"LOG", 2}, + {"INC", 3}, + {"AI", 2}, + } + + for _, tt := range tests { + t.Run(tt.domain, func(t *testing.T) { + patterns := idx.GetPatternsByDomain(tt.domain) + if len(patterns) < tt.minCount { + t.Errorf("Domain %s: expected at least %d patterns, got %d", + tt.domain, tt.minCount, len(patterns)) + } + }) + } + + emptyPatterns := idx.GetPatternsByDomain("NOPE") + if len(emptyPatterns) != 0 { + t.Errorf("Expected 0 patterns for unknown domain, got %d", len(emptyPatterns)) + } +} + +func TestControlPatternIndex_GetPatternsByCategory(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + authPatterns := idx.GetPatternsByCategory("authentication") + if len(authPatterns) < 3 { + t.Errorf("Expected at least 3 authentication patterns, got %d", len(authPatterns)) + } + + encPatterns := idx.GetPatternsByCategory("encryption") + if len(encPatterns) < 3 { + t.Errorf("Expected at least 3 encryption patterns, got %d", len(encPatterns)) + } +} + +func TestControlPatternIndex_GetPatternsByTag(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) 
+ } + + dpPatterns := idx.GetPatternsByTag("data_protection") + if len(dpPatterns) < 3 { + t.Errorf("Expected at least 3 data_protection tagged patterns, got %d", len(dpPatterns)) + } + + secPatterns := idx.GetPatternsByTag("security") + if len(secPatterns) >= 1 { + // At least 1 pattern tagged with "security" — good + } +} + +func TestControlPatternIndex_MatchByKeywords(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + tests := []struct { + name string + text string + expectPatternID string + }{ + { + "password related text", + "Die Passwortrichtlinie muss sicherstellen, dass Anmeldedaten geschuetzt sind", + "CP-AUTH-001", + }, + { + "encryption text", + "Verschluesselung ruhender Daten muss mit AES-256 erfolgen", + "CP-CRYP-001", + }, + { + "incident response text", + "Ein Vorfall-Reaktionsplan muss fuer Sicherheitsvorfaelle bereitstehen", + "CP-INC-001", + }, + { + "DSGVO consent text", + "Die Einwilligung der betroffenen Person muss freiwillig erfolgen", + "CP-DATA-004", + }, + { + "AI risk text", + "KI-Systeme mit hohem Risiko muessen einer Konformitaetsbewertung unterzogen werden", + "CP-AI-001", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + matches := idx.MatchByKeywords(tt.text) + if len(matches) == 0 { + t.Fatalf("Expected at least 1 match for text: %s", tt.text[:50]) + } + + // Check if the expected pattern is in top 3 matches + found := false + for i, m := range matches { + if i >= 3 { + break + } + if m.Pattern.ID == tt.expectPatternID { + found = true + break + } + } + if !found { + topIDs := make([]string, 0, 3) + for i, m := range matches { + if i >= 3 { + break + } + topIDs = append(topIDs, m.Pattern.ID) + } + t.Errorf("Expected %s in top 3, got %v", tt.expectPatternID, topIDs) + } + }) + } +} + +func TestControlPatternIndex_MatchByKeywords_NoMatch(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to 
load patterns: %v", err) + } + + matches := idx.MatchByKeywords("xyzzy foobar baz completely unrelated text") + if len(matches) != 0 { + t.Errorf("Expected 0 matches for unrelated text, got %d", len(matches)) + } +} + +func TestPatternMatch_Score(t *testing.T) { + match := PatternMatch{ + KeywordHits: 3, + TotalKeywords: 7, + } + + score := match.Score() + expected := 3.0 / 7.0 + if score < expected-0.01 || score > expected+0.01 { + t.Errorf("Expected score ~%.3f, got %.3f", expected, score) + } + + zeroMatch := PatternMatch{ + KeywordHits: 0, + TotalKeywords: 0, + } + if zeroMatch.Score() != 0 { + t.Errorf("Expected 0 score for zero keywords, got %f", zeroMatch.Score()) + } +} + +func TestControlPatternIndex_ValidatePatternID(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + if !idx.ValidatePatternID("CP-AUTH-001") { + t.Error("Expected CP-AUTH-001 to be valid") + } + if idx.ValidatePatternID("CP-FAKE-999") { + t.Error("Expected CP-FAKE-999 to be invalid") + } +} + +func TestControlPatternIndex_Domains(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + domains := idx.Domains() + if len(domains) < 5 { + t.Errorf("Expected at least 5 domains, got %d: %v", len(domains), domains) + } + + // Check critical domains are present + domainSet := make(map[string]bool) + for _, d := range domains { + domainSet[d] = true + } + + for _, required := range []string{"AUTH", "CRYP", "DATA", "SEC", "COMP"} { + if !domainSet[required] { + t.Errorf("Expected domain %s to be present", required) + } + } +} + +func TestControlPattern_FieldsNotEmpty(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + for _, p := range idx.All { + t.Run(p.ID, func(t *testing.T) { + if p.ID == "" { + t.Error("Empty ID") + } + if p.Name == "" { + t.Error("Empty Name") + } + if p.NameDE == 
"" { + t.Error("Empty NameDE") + } + if p.Domain == "" { + t.Error("Empty Domain") + } + if p.Category == "" { + t.Error("Empty Category") + } + if len(p.Description) < 20 { + t.Errorf("Description too short: %d chars", len(p.Description)) + } + if len(p.ObjectiveTemplate) < 20 { + t.Errorf("ObjectiveTemplate too short: %d chars", len(p.ObjectiveTemplate)) + } + if len(p.RationaleTemplate) < 20 { + t.Errorf("RationaleTemplate too short: %d chars", len(p.RationaleTemplate)) + } + if len(p.RequirementsTemplate) < 2 { + t.Errorf("Not enough requirements: %d", len(p.RequirementsTemplate)) + } + if len(p.TestProcedureTemplate) < 1 { + t.Errorf("Not enough test procedures: %d", len(p.TestProcedureTemplate)) + } + if len(p.EvidenceTemplate) < 1 { + t.Errorf("Not enough evidence items: %d", len(p.EvidenceTemplate)) + } + if len(p.ObligationMatchKeywords) < 3 { + t.Errorf("Not enough keywords: %d", len(p.ObligationMatchKeywords)) + } + if len(p.Tags) < 1 { + t.Errorf("Not enough tags: %d", len(p.Tags)) + } + + validSeverities := map[string]bool{"low": true, "medium": true, "high": true, "critical": true} + if !validSeverities[p.SeverityDefault] { + t.Errorf("Invalid severity: %s", p.SeverityDefault) + } + }) + } +} + +func TestControlPattern_IDDomainConsistency(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + for _, p := range idx.All { + parts := strings.Split(p.ID, "-") + if len(parts) != 3 { + t.Errorf("Pattern %s: expected 3 parts in ID, got %d", p.ID, len(parts)) + continue + } + idDomain := parts[1] + if idDomain != p.Domain { + t.Errorf("Pattern %s: ID domain '%s' != field domain '%s'", p.ID, idDomain, p.Domain) + } + } +} + +func TestControlPattern_ComposableWithValid(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + for _, p := range idx.All { + for _, ref := range p.ComposableWith { + if _, ok := idx.ByID[ref]; !ok 
{ + t.Errorf("Pattern %s: composable_with ref '%s' does not exist", p.ID, ref) + } + if ref == p.ID { + t.Errorf("Pattern %s: composable_with contains self-reference", p.ID) + } + } + } +} + +func TestControlPattern_KeywordsLowercase(t *testing.T) { + idx, err := LoadControlPatterns() + if err != nil { + t.Fatalf("Failed to load patterns: %v", err) + } + + for _, p := range idx.All { + for _, kw := range p.ObligationMatchKeywords { + if kw != strings.ToLower(kw) { + t.Errorf("Pattern %s: keyword should be lowercase: '%s'", p.ID, kw) + } + } + } +} diff --git a/ai-compliance-sdk/policies/control_patterns/_pattern_schema.json b/ai-compliance-sdk/policies/control_patterns/_pattern_schema.json new file mode 100644 index 0000000..90d68d5 --- /dev/null +++ b/ai-compliance-sdk/policies/control_patterns/_pattern_schema.json @@ -0,0 +1,128 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://breakpilot.ai/schemas/control-pattern-v1", + "title": "Control Pattern Schema", + "description": "Schema for YAML control pattern definitions. Pattern ID format: CP-{DOMAIN}-{NNN}", + "type": "object", + "required": ["version", "patterns"], + "properties": { + "version": { + "type": "string", + "pattern": "^[0-9]+\\.[0-9]+$" + }, + "description": { + "type": "string" + }, + "patterns": { + "type": "array", + "items": { "$ref": "#/$defs/ControlPattern" }, + "minItems": 1 + } + }, + "$defs": { + "ControlPattern": { + "type": "object", + "required": [ + "id", "name", "name_de", "domain", "category", "description", + "objective_template", "rationale_template", "requirements_template", + "test_procedure_template", "evidence_template", "severity_default", + "obligation_match_keywords", "tags" + ], + "properties": { + "id": { + "type": "string", + "pattern": "^CP-[A-Z]+-[0-9]{3}$", + "description": "Unique pattern ID. 
Format: CP-{DOMAIN}-{NNN}" + }, + "name": { + "type": "string", + "pattern": "^[a-z][a-z0-9_]*$", + "description": "Machine-readable name (snake_case)" + }, + "name_de": { + "type": "string", + "description": "Human-readable German name" + }, + "domain": { + "type": "string", + "enum": ["AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC", "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"], + "description": "Domain code matching DOMAIN_KEYWORDS in control_generator.py" + }, + "category": { + "type": "string", + "description": "Functional category (e.g. authentication, encryption, incident)" + }, + "description": { + "type": "string", + "minLength": 20, + "description": "Brief description of what this pattern covers" + }, + "objective_template": { + "type": "string", + "minLength": 20, + "description": "Template for the control objective. May contain {placeholders}." + }, + "rationale_template": { + "type": "string", + "minLength": 20, + "description": "Template explaining why this control matters." + }, + "requirements_template": { + "type": "array", + "items": { "type": "string" }, + "minItems": 2, + "description": "Template requirements. May contain {placeholder:default} syntax." 
+ }, + "test_procedure_template": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + }, + "evidence_template": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + }, + "severity_default": { + "type": "string", + "enum": ["low", "medium", "high", "critical"] + }, + "implementation_effort_default": { + "type": "string", + "enum": ["s", "m", "l", "xl"] + }, + "open_anchor_refs": { + "type": "array", + "items": { + "type": "object", + "required": ["framework", "ref"], + "properties": { + "framework": { "type": "string" }, + "ref": { "type": "string" } + } + } + }, + "obligation_match_keywords": { + "type": "array", + "items": { "type": "string" }, + "minItems": 3, + "description": "Keywords for matching obligations to this pattern (de + en)" + }, + "tags": { + "type": "array", + "items": { "type": "string" }, + "minItems": 1 + }, + "composable_with": { + "type": "array", + "items": { + "type": "string", + "pattern": "^CP-[A-Z]+-[0-9]{3}$" + }, + "description": "Pattern IDs that combine well with this one" + } + }, + "additionalProperties": false + } + } +} diff --git a/ai-compliance-sdk/policies/control_patterns/core_patterns.yaml b/ai-compliance-sdk/policies/control_patterns/core_patterns.yaml new file mode 100644 index 0000000..94b2717 --- /dev/null +++ b/ai-compliance-sdk/policies/control_patterns/core_patterns.yaml @@ -0,0 +1,1361 @@ +version: "1.0" +description: > + 30 Core/Universal Control Patterns — domain-uebergreifende Sicherheits- und + Compliance-Massnahmen, die fuer nahezu jede Organisation relevant sind. 
+ +patterns: + + # ========================================================================= + # AUTH Domain — Identity & Access Management (4 Patterns) + # ========================================================================= + + - id: CP-AUTH-001 + name: password_policy + name_de: Passwortrichtlinie + domain: AUTH + category: authentication + description: > + Mindestanforderungen an Passwort-Komplexitaet, -Rotation und -Speicherung + zur Reduktion von Credential-basierten Angriffen. + objective_template: > + Sicherstellen, dass Passwortrichtlinien ein angemessenes Sicherheitsniveau + gewaehrleisten und Brute-Force-Angriffe erschweren. + rationale_template: > + Schwache Passwoerter sind der haeufigste Angriffsvektor. Eine verbindliche + Passwortrichtlinie reduziert das Risiko kompromittierter Zugaenge erheblich. + requirements_template: + - "Mindestlaenge von {min_length:12} Zeichen fuer alle Benutzerkonten" + - "Komplexitaetsanforderungen: Gross-/Kleinbuchstaben, Ziffern, Sonderzeichen" + - "Ausschluss gaengiger Passwoerter gegen Wortlisten (z.B. 
HIBP)" + - "Passwort-Hashing mit {algorithm:bcrypt/argon2} und Salt" + - "Keine periodische Zwangsrotation ohne Anlass (NIST-konform)" + test_procedure_template: + - "Pruefung der Passwort-Policy-Konfiguration im Identity-Provider" + - "Testregistrierung mit zu schwachem Passwort (muss abgelehnt werden)" + - "Verifikation des Hashing-Algorithmus in der Datenbank" + evidence_template: + - "Passwortrichtlinie (Dokument)" + - "Screenshots: Policy-Konfiguration im IdP" + - "Penetrationstest-Bericht (Credential-Bereich)" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-63B", ref: "Section 5.1.1" } + - { framework: "OWASP ASVS", ref: "V2.1" } + obligation_match_keywords: + - passwort + - authentifizierung + - zugang + - credential + - password + - kennwort + - anmeldedaten + tags: [authentication, password, credential] + composable_with: [CP-AUTH-002, CP-AUTH-003] + + - id: CP-AUTH-002 + name: multi_factor_authentication + name_de: Multi-Faktor-Authentifizierung + domain: AUTH + category: authentication + description: > + Absicherung von Benutzerkonten durch mindestens zwei unabhaengige + Authentifizierungsfaktoren aus unterschiedlichen Kategorien. + objective_template: > + Privilegierte und risikobehaftete Zugaenge durch mindestens zwei + unabhaengige Authentisierungsfaktoren schuetzen. + rationale_template: > + Einzelfaktor-Authentisierung bietet ungenuegenden Schutz gegen Phishing, + Credential Stuffing und Session Hijacking. MFA reduziert das Risiko + kompromittierter Konten um ueber 99%. 
+ requirements_template: + - "Mindestens zwei Faktoren aus unterschiedlichen Kategorien (Wissen, Besitz, Biometrie)" + - "TOTP oder FIDO2/WebAuthn als zweiter Faktor unterstuetzt" + - "Recovery Codes sicher generiert und verschluesselt gespeichert" + - "MFA-Bypass nur mit dokumentierter Ausnahme und zeitlicher Begrenzung" + - "MFA-Enrollment bei Erstanmeldung erzwungen" + test_procedure_template: + - "Admin-Login ohne zweiten Faktor muss abgelehnt werden" + - "TOTP-Codes mit falschem Shared Secret muessen abgelehnt werden" + - "Recovery Codes nach einmaliger Nutzung invalidiert" + evidence_template: + - "MFA-Policy (Dokument)" + - "IdP-Konfiguration (Screenshots)" + - "MFA-Abdeckungsbericht (% der Nutzer mit aktivem MFA)" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V2.8" } + - { framework: "NIST SP 800-63B", ref: "Section 4" } + obligation_match_keywords: + - multi-faktor + - zwei-faktor + - mfa + - 2fa + - authentifizierung + - two-factor + - authentication + tags: [authentication, mfa, identity] + composable_with: [CP-AUTH-001, CP-AUTH-003] + + - id: CP-AUTH-003 + name: session_management + name_de: Sitzungsverwaltung + domain: AUTH + category: authentication + description: > + Sichere Verwaltung von Benutzersitzungen inkl. Token-Erzeugung, + Timeout-Konfiguration und Invalidierung bei Logout oder Inaktivitaet. + objective_template: > + Sitzungstoken sicher erzeugen, uebertragen und nach Inaktivitaet oder + Logout zuverlaessig invalidieren. + rationale_template: > + Unsichere Sitzungsverwaltung ermoeglicht Session Hijacking, Replay-Angriffe + und unbefugten Zugriff auf authentifizierte Bereiche. + requirements_template: + - "Sitzungstoken mit kryptographisch sicherem Zufallsgenerator erzeugt (min. 
128 Bit Entropie)" + - "Session-Timeout nach {idle_timeout:30} Minuten Inaktivitaet" + - "Absolute Session-Lifetime von maximal {max_lifetime:8} Stunden" + - "Token-Invalidierung bei Logout serverseitig durchgesetzt" + - "Secure-, HttpOnly- und SameSite-Flags auf Session-Cookies gesetzt" + test_procedure_template: + - "Pruefung der Token-Entropie (min. 128 Bit)" + - "Verifizierung des Inaktivitaets-Timeouts" + - "Test: Nach Logout kein Zugriff mit altem Token moeglich" + evidence_template: + - "Session-Management-Konfiguration" + - "Penetrationstest-Bericht (Session-Bereich)" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V3.1" } + - { framework: "NIST SP 800-63B", ref: "Section 7" } + obligation_match_keywords: + - sitzung + - session + - token + - timeout + - anmeldung + - abmeldung + - logout + tags: [authentication, session, token] + composable_with: [CP-AUTH-001, CP-AUTH-002] + + - id: CP-ACC-001 + name: access_control + name_de: Zugriffskontrolle + domain: ACC + category: authorization + description: > + Rollenbasierte Zugriffskontrolle (RBAC) mit dem Prinzip der + geringsten Berechtigung fuer alle Systeme und Daten. + objective_template: > + Zugriff auf Systeme und Daten nach dem Need-to-Know-Prinzip beschraenken + und durch rollenbasierte Berechtigungen durchsetzen. + rationale_template: > + Unkontrollierter Zugriff erhoehrt das Risiko von Datenlecks, unautorisierten + Aenderungen und Compliance-Verstoessen. RBAC reduziert die Angriffsflaeche + und vereinfacht die Berechtigungsverwaltung. + requirements_template: + - "Rollenbasiertes Zugriffsmodell (RBAC) fuer alle Systeme implementiert" + - "Least-Privilege-Prinzip: Nur die fuer die Aufgabe notwendigen Rechte" + - "Regelmaessige Berechtigungsrezertifizierung (min. 
{review_interval:halbjaehrlich})" + - "Entzug von Berechtigungen bei Rollenwechsel oder Austritt innerhalb von {revoke_hours:24} Stunden" + - "Administratorrechte nur fuer explizit benannte Personen" + test_procedure_template: + - "Pruefung des Rollenmodells auf Least-Privilege-Konformitaet" + - "Stichprobe: Berechtigungen eines Nutzers nach Abteilungswechsel" + - "Audit-Log der letzten Berechtigungsaenderungen pruefen" + evidence_template: + - "Berechtigungskonzept (Dokument)" + - "Rezertifizierungsprotokoll" + - "Rollenzuordnungs-Matrix" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-53", ref: "AC-2, AC-3, AC-6" } + - { framework: "OWASP ASVS", ref: "V4.1" } + obligation_match_keywords: + - zugriff + - berechtigung + - autorisierung + - rolle + - access + - permission + - authorization + - rbac + tags: [access_control, authorization, rbac] + composable_with: [CP-AUTH-001, CP-AUTH-002] + + # ========================================================================= + # CRYP Domain — Cryptographic Operations (3 Patterns) + # ========================================================================= + + - id: CP-CRYP-001 + name: encryption_at_rest + name_de: Verschluesselung ruhender Daten + domain: CRYP + category: encryption + description: > + Verschluesselung gespeicherter Daten (Datenbanken, Dateisysteme, Backups) + zum Schutz vor unberechtigtem physischem Zugriff. + objective_template: > + Alle personenbezogenen und vertraulichen Daten im Ruhezustand durch + angemessene Verschluesselung schuetzen. + rationale_template: > + Unverschluesselte Speichermedien sind bei Diebstahl, unsachgemaesser + Entsorgung oder Cloud-Fehlkonfiguration unmittelbar kompromittiert. 
+ requirements_template: + - "AES-256 oder vergleichbar fuer alle personenbezogenen Daten" + - "Datenbank-Level Encryption (TDE) oder Filesystem-Verschluesselung" + - "Backup-Medien verschluesselt gespeichert" + - "Schluesselmanagement gemaess CP-CRYP-003" + test_procedure_template: + - "Pruefung der Verschluesselungskonfiguration in Datenbank und Storage" + - "Versuch, auf rohe Datenbankdateien ohne Key zuzugreifen" + - "Verifizierung der Backup-Verschluesselung" + evidence_template: + - "Verschluesselungsrichtlinie" + - "Konfigurationsnachweise (DB, Storage)" + - "Schluesselmanagement-Dokumentation" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-111", ref: "Section 4" } + - { framework: "OWASP ASVS", ref: "V6.2" } + obligation_match_keywords: + - verschluesselung + - encryption + - ruhende daten + - at rest + - speicherung + - kryptographisch + - chiffrierung + tags: [encryption, data_protection, storage] + composable_with: [CP-CRYP-002, CP-CRYP-003] + + - id: CP-CRYP-002 + name: encryption_in_transit + name_de: Transportverschluesselung + domain: CRYP + category: encryption + description: > + Verschluesselung aller Daten waehrend der Uebertragung durch TLS 1.2+ + oder vergleichbare Protokolle. + objective_template: > + Alle Daten waehrend der Uebertragung durch aktuelle Transportverschluesselung + gegen Abhoeren und Manipulation schuetzen. + rationale_template: > + Unverschluesselte Kommunikation ermoeglicht Man-in-the-Middle-Angriffe, + Abhoeren sensibler Daten und Manipulation von Nachrichten. + requirements_template: + - "TLS 1.2 oder hoeher fuer alle externen Verbindungen" + - "TLS 1.3 bevorzugt wo technisch moeglich" + - "Deaktivierung veralteter Protokolle (SSLv3, TLS 1.0/1.1)" + - "HSTS-Header mit min. {hsts_max_age:31536000} Sekunden" + - "Certificate Pinning fuer kritische API-Verbindungen evaluiert" + test_procedure_template: + - "TLS-Scan aller oeffentlichen Endpunkte (z.B. 
testssl.sh)" + - "Pruefung auf deaktivierte Legacy-Protokolle" + - "Verifizierung der HSTS-Header" + evidence_template: + - "TLS-Scan-Ergebnis" + - "Webserver-/Load-Balancer-Konfiguration" + - "HSTS-Konfigurationsnachweis" + severity_default: high + implementation_effort_default: s + open_anchor_refs: + - { framework: "NIST SP 800-52", ref: "Rev. 2" } + - { framework: "OWASP ASVS", ref: "V9.1" } + obligation_match_keywords: + - transport + - uebertragung + - tls + - ssl + - verschluesselung + - kommunikation + - transit + - verbindung + tags: [encryption, transport, tls] + composable_with: [CP-CRYP-001, CP-CRYP-003] + + - id: CP-CRYP-003 + name: key_management + name_de: Schluesselmanagement + domain: CRYP + category: encryption + description: > + Sichere Erzeugung, Speicherung, Rotation und Vernichtung kryptographischer + Schluessel ueber den gesamten Lebenszyklus. + objective_template: > + Kryptographische Schluessel sicher erzeugen, speichern, regelmaessig + rotieren und bei Bedarf sicher vernichten. + rationale_template: > + Selbst starke Verschluesselung ist wertlos, wenn Schluessel unsicher + gespeichert, nie rotiert oder nicht ordnungsgemaess vernichtet werden. + requirements_template: + - "Schluessel in Hardware Security Module (HSM) oder Vault gespeichert" + - "Automatische Rotation alle {rotation_days:365} Tage" + - "Schluessel-Backup verschluesselt und getrennt gelagert" + - "Sichere Vernichtung: Cryptographic Erasure bei Ausserbetriebnahme" + - "Zugriff auf Schluessel nur fuer explizit autorisierte Dienste" + test_procedure_template: + - "Pruefung der Schluessel-Speicherorte (kein Klartext in Code/Config)" + - "Verifizierung der Rotationszyklen" + - "Nachweis der Schluessel-Zugriffskontrolle" + evidence_template: + - "Schluesselmanagement-Policy" + - "HSM/Vault-Konfiguration" + - "Rotationsprotokoll" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-57", ref: "Part 1, Rev. 
5" } + - { framework: "OWASP ASVS", ref: "V6.4" } + obligation_match_keywords: + - schluessel + - key management + - rotation + - kryptographisch + - hsm + - vault + - zertifikat + tags: [encryption, key_management, cryptography] + composable_with: [CP-CRYP-001, CP-CRYP-002] + + # ========================================================================= + # DATA Domain — Data Governance (5 Patterns) + # ========================================================================= + + - id: CP-DATA-001 + name: data_classification + name_de: Datenklassifizierung + domain: DATA + category: data_protection + description: > + Systematische Einstufung aller Datenbestaende nach Schutzbedarf + und Sensitivitaet als Grundlage fuer angemessene Schutzmassnahmen. + objective_template: > + Alle Datenbestaende nach einem einheitlichen Schema klassifizieren + und den Schutzbedarf dokumentieren. + rationale_template: > + Ohne Klassifizierung werden entweder alle Daten gleich behandelt + (uebermaessiger Aufwand) oder sensible Daten unzureichend geschuetzt. + requirements_template: + - "Klassifizierungsschema mit mindestens 4 Stufen (oeffentlich, intern, vertraulich, streng vertraulich)" + - "Alle Datenbestaende klassifiziert und im Dateninventar dokumentiert" + - "Automatische Klassifizierung fuer strukturierte Daten wo moeglich" + - "Regelmaessige Ueberpruefung der Klassifizierung (min. jaehrlich)" + test_procedure_template: + - "Stichprobe: 10 Datenbestaende auf korrekte Klassifizierung pruefen" + - "Pruefung ob neue Projekte ein Klassifizierungsverfahren durchlaufen" + evidence_template: + - "Datenklassifizierungsrichtlinie" + - "Dateninventar mit Schutzbedarfseinstufung" + - "Klassifizierungsprotokoll" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-60", ref: "Vol. 1, Rev. 
1" } + obligation_match_keywords: + - klassifizierung + - schutzbedarf + - datenkategorie + - classification + - datenarten + - sensibilitaet + - vertraulich + tags: [data_protection, classification, governance] + composable_with: [CP-DATA-002, CP-DATA-003] + + - id: CP-DATA-002 + name: data_minimization + name_de: Datenminimierung + domain: DATA + category: data_protection + description: > + Beschraenkung der Datenerhebung und -verarbeitung auf das fuer den + jeweiligen Zweck erforderliche Mass (Art. 5 Abs. 1 lit. c DSGVO). + objective_template: > + Sicherstellen, dass nur die fuer den definierten Zweck notwendigen + personenbezogenen Daten erhoben und verarbeitet werden. + rationale_template: > + Uebermaessige Datenerhebung erhoeht die Angriffsflaeche, das + Schadenspotential bei Datenpannen und das Risiko von DSGVO-Bussgeldern. + requirements_template: + - "Fuer jede Datenerhebung ist der Zweck dokumentiert" + - "Erhobene Datenfelder sind auf das Notwendige reduziert" + - "Optionale Felder sind klar als solche gekennzeichnet" + - "Regelmaessige Ueberpruefung bestehender Datenbestaende auf Notwendigkeit" + test_procedure_template: + - "Review aller Formulare und Eingabemasken auf ueberfluessige Felder" + - "Pruefung des VVT auf Zweckbindung jeder Verarbeitung" + evidence_template: + - "Verarbeitungsverzeichnis (VVT) mit Zweckangaben" + - "Review-Protokoll Datenminimierung" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "ENISA Guidelines", ref: "Data Protection by Design" } + obligation_match_keywords: + - datenminimierung + - datensparsamkeit + - zweckbindung + - erforderlich + - minimization + - purpose limitation + - notwendig + tags: [data_protection, minimization, gdpr] + composable_with: [CP-DATA-001, CP-DATA-003] + + - id: CP-DATA-003 + name: retention_policy + name_de: Aufbewahrungsfristen und Loeschkonzept + domain: DATA + category: data_protection + description: > + Definition und Durchsetzung von 
Aufbewahrungsfristen fuer alle + Datenkategorien mit automatisierter Loeschung nach Fristablauf. + objective_template: > + Aufbewahrungsfristen fuer alle Datenkategorien definieren und die + fristgerechte Loeschung technisch und organisatorisch sicherstellen. + rationale_template: > + Ohne definierte Loeschfristen wachsen Datenbestaende unbegrenzt, erhoehen + das Risiko bei Datenpannen und verstoessen gegen Art. 5 Abs. 1 lit. e DSGVO. + requirements_template: + - "Loeschkonzept fuer alle Datenkategorien dokumentiert" + - "Aufbewahrungsfristen je Kategorie definiert und begruendet" + - "Automatisierte Loeschprozesse implementiert wo technisch moeglich" + - "Protokollierung aller Loeschvorgaenge" + - "Regelmaessige Pruefung auf abgelaufene Aufbewahrungsfristen" + test_procedure_template: + - "Stichprobe: 5 Datenkategorien auf korrekte Loeschfristen pruefen" + - "Nachweis automatisierter Loeschlaeufe (Logs)" + - "Pruefung ob geloeschte Daten tatsaechlich nicht wiederherstellbar sind" + evidence_template: + - "Loeschkonzept" + - "Loeschprotokolle" + - "Konfiguration automatisierter Loeschprozesse" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "ENISA Guidelines", ref: "Data Retention" } + obligation_match_keywords: + - loeschung + - aufbewahrung + - retention + - speicherbegrenzung + - loeschfrist + - deletion + - speicherdauer + tags: [data_protection, retention, deletion] + composable_with: [CP-DATA-001, CP-DATA-002] + + - id: CP-DATA-004 + name: consent_management + name_de: Einwilligungsmanagement + domain: DATA + category: data_protection + description: > + Systematische Einholung, Dokumentation und Verwaltung von Einwilligungen + fuer die Verarbeitung personenbezogener Daten. + objective_template: > + Einwilligungen rechtskonform einholen, dokumentieren und jederzeit + den Nachweis einer gueltigen Einwilligung fuehren koennen. 
+ rationale_template: > + Ohne nachweisbare Einwilligung fehlt die Rechtsgrundlage fuer + einwilligungsbasierte Verarbeitungen, was zu Bussgeldern fuehrt. + requirements_template: + - "Granulare Einwilligungsoption fuer jeden Verarbeitungszweck" + - "Einwilligung muss freiwillig, informiert, spezifisch und eindeutig sein" + - "Widerruf jederzeit moeglich und ebenso einfach wie Erteilung" + - "Versionierte Speicherung aller Einwilligungen mit Zeitstempel" + - "Nachweis der Einwilligung exportierbar fuer Aufsichtsbehoerden" + test_procedure_template: + - "Pruefung des Consent-Dialogs auf Granularitaet und Freiwilligkeit" + - "Test: Widerruf einer Einwilligung und Verifizierung der Umsetzung" + - "Export der Consent-Historie eines Testnutzers" + evidence_template: + - "Einwilligungsmanagement-Dokumentation" + - "Consent-Logs mit Zeitstempel" + - "Screenshots: Consent-Dialog" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "ENISA Guidelines", ref: "Consent Management" } + obligation_match_keywords: + - einwilligung + - consent + - zustimmung + - widerruf + - opt-in + - opt-out + - freiwillig + tags: [data_protection, consent, gdpr] + composable_with: [CP-DATA-002, CP-COMP-003] + + - id: CP-DATA-005 + name: privacy_impact_assessment + name_de: Datenschutz-Folgenabschaetzung + domain: DATA + category: data_protection + description: > + Durchfuehrung einer Datenschutz-Folgenabschaetzung (DSFA) bei + Verarbeitungen mit hohem Risiko fuer die Rechte betroffener Personen. + objective_template: > + Risiken fuer die Rechte und Freiheiten betroffener Personen systematisch + bewerten und durch angemessene Massnahmen minimieren. + rationale_template: > + Art. 35 DSGVO verpflichtet zur DSFA bei Verarbeitungen mit voraussichtlich + hohem Risiko. Ohne DSFA drohen Bussgelder und unerkannte Datenschutzrisiken. + requirements_template: + - "DSFA-Pflicht fuer alle Verarbeitungen mit hohem Risiko (Art. 
35 DSGVO)" + - "Systematische Risikobewertung mit dokumentierter Methodik" + - "Einbeziehung des Datenschutzbeauftragten" + - "Dokumentation der Abhilfemassnahmen und Restrisiken" + - "Konsultation der Aufsichtsbehoerde bei hohem Restrisiko (Art. 36 DSGVO)" + test_procedure_template: + - "Pruefung ob fuer alle Hochrisiko-Verarbeitungen eine DSFA vorliegt" + - "Review einer DSFA auf Vollstaendigkeit und Nachvollziehbarkeit" + evidence_template: + - "DSFA-Dokumentation" + - "Risikobewertungsmatrix" + - "Massnahmenplan" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "ENISA Guidelines", ref: "DPIA Framework" } + obligation_match_keywords: + - folgenabschaetzung + - dsfa + - dpia + - risikobewertung + - hohes risiko + - impact assessment + - datenschutzrisiko + tags: [data_protection, dpia, risk_assessment] + composable_with: [CP-COMP-001, CP-COMP-004] + + # ========================================================================= + # LOG Domain — Logging & Monitoring (2 Patterns) + # ========================================================================= + + - id: CP-LOG-001 + name: audit_logging + name_de: Audit-Protokollierung + domain: LOG + category: logging + description: > + Lueckenlose Protokollierung sicherheitsrelevanter Ereignisse fuer + forensische Analyse, Compliance-Nachweis und Anomalie-Erkennung. + objective_template: > + Sicherheitsrelevante Ereignisse vollstaendig und manipulationssicher + protokollieren, um Vorfaelle nachvollziehen und nachweisen zu koennen. + rationale_template: > + Ohne Audit-Logs koennen Sicherheitsvorfaelle nicht erkannt, analysiert + oder nachgewiesen werden. Aufsichtsbehoerden verlangen nachvollziehbare + Protokolle fuer Compliance-Pruefungen. 
+ requirements_template: + - "Protokollierung aller Anmelde-/Abmeldevorgaenge" + - "Protokollierung aller Berechtigungsaenderungen" + - "Protokollierung aller Zugriffe auf personenbezogene Daten" + - "Log-Integritaet durch Tamper-Detection (z.B. Signierung oder WORM-Speicher)" + - "Aufbewahrung fuer mindestens {retention_months:12} Monate" + - "Keine personenbezogenen Daten in Log-Nachrichten (Pseudonymisierung)" + test_procedure_template: + - "Stichprobe: 5 sicherheitsrelevante Aktionen im Audit-Log verifizieren" + - "Pruefung der Log-Integritaet (Manipulationsversuch)" + - "Verifizierung der Log-Aufbewahrungsdauer" + evidence_template: + - "Audit-Logging-Policy" + - "Log-Beispiele (anonymisiert)" + - "SIEM-Konfiguration" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-92", ref: "Section 4" } + - { framework: "OWASP ASVS", ref: "V7.1" } + obligation_match_keywords: + - protokollierung + - logging + - audit + - nachvollziehbarkeit + - ueberwachung + - audit trail + - aufzeichnung + tags: [logging, audit, monitoring] + composable_with: [CP-LOG-002] + + - id: CP-LOG-002 + name: monitoring_alerting + name_de: Monitoring und Alarmierung + domain: LOG + category: logging + description: > + Kontinuierliche Ueberwachung von Systemen und Netzwerken mit + automatischer Alarmierung bei Anomalien und Sicherheitsvorfaellen. + objective_template: > + Sicherheitsrelevante Anomalien in Echtzeit erkennen und autorisiertes + Personal automatisch alarmieren. + rationale_template: > + Ohne proaktives Monitoring werden Sicherheitsvorfaelle erst nach + erheblichem Schaden erkannt. Fruehe Erkennung reduziert den Schaden + und die Reaktionszeit signifikant. 
+ requirements_template: + - "SIEM oder vergleichbares Monitoring-System im Einsatz" + - "Alarmierung bei fehlgeschlagenen Login-Versuchen (Schwelle: {failed_login_threshold:5} in {time_window:5} Minuten)" + - "Alarmierung bei ungewoehnlichen Datenzugriffen" + - "Eskalationsprozess mit definierten Reaktionszeiten" + - "Regelmaessige Ueberpruefung und Anpassung der Alarmregeln" + test_procedure_template: + - "Simulation eines Brute-Force-Angriffs — Alarmierung pruefen" + - "Pruefung der Eskalationskette und Reaktionszeiten" + - "Review der letzten 10 Alarme auf korrekte Behandlung" + evidence_template: + - "Monitoring-Konzept" + - "SIEM-Dashboards (Screenshots)" + - "Alarm-Statistik der letzten 3 Monate" + severity_default: medium + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-137", ref: "ISCM Framework" } + obligation_match_keywords: + - monitoring + - ueberwachung + - alarmierung + - erkennung + - anomalie + - detection + - alerting + tags: [monitoring, alerting, siem] + composable_with: [CP-LOG-001, CP-INC-001] + + # ========================================================================= + # INC Domain — Incident Response (2 Patterns) + # ========================================================================= + + - id: CP-INC-001 + name: incident_response + name_de: Vorfallreaktion + domain: INC + category: incident + description: > + Strukturierter Prozess fuer die Erkennung, Eindaemmung, Beseitigung + und Nachbereitung von Sicherheitsvorfaellen. + objective_template: > + Sicherheitsvorfaelle schnell erkennen, eindaemmen und strukturiert + beheben, um Schaden zu minimieren und Meldepflichten einzuhalten. + rationale_template: > + Ohne definierten Incident-Response-Prozess verzoegern sich Reaktionen, + Meldepflichten (72h DSGVO, 24h NIS2) werden verpasst und der Schaden + vergroessert sich. 
+ requirements_template: + - "Incident-Response-Plan dokumentiert und allen Beteiligten bekannt" + - "Klare Rollen und Verantwortlichkeiten im Incident-Team" + - "Eskalationsstufen mit definierten Zeitfenstern" + - "Meldepflicht-Checkliste (72h DSGVO, 24h NIS2)" + - "Post-Incident-Review (Lessons Learned) nach jedem Vorfall" + - "Regelmaessige Uebungen (min. {exercise_interval:jaehrlich})" + test_procedure_template: + - "Tabletop-Uebung: Simulierter Datenschutzvorfall durchspielen" + - "Pruefung der Meldekette und Reaktionszeiten" + - "Review der letzten Post-Incident-Berichte" + evidence_template: + - "Incident-Response-Plan" + - "Uebungsprotokolle" + - "Post-Incident-Berichte" + severity_default: critical + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-61", ref: "Rev. 2" } + obligation_match_keywords: + - vorfall + - incident + - sicherheitsvorfall + - datenpanne + - breach + - meldepflicht + - reaktion + - response + tags: [incident, response, breach] + composable_with: [CP-INC-002, CP-LOG-002] + + - id: CP-INC-002 + name: backup_recovery + name_de: Datensicherung und Wiederherstellung + domain: INC + category: continuity + description: > + Regelmaessige Datensicherung mit gepruefter Wiederherstellungsfaehigkeit + zur Sicherstellung der Verfuegbarkeit und Integritaet. + objective_template: > + Kritische Daten regelmaessig sichern und die Wiederherstellung + innerhalb definierter Zeitfenster (RTO/RPO) gewaehrleisten. + rationale_template: > + Datenverlust durch Ransomware, Hardwaredefekte oder menschliche Fehler + kann ohne funktionierende Backups existenzbedrohend sein. + requirements_template: + - "Automatisierte Backups fuer alle kritischen Systeme" + - "Recovery Point Objective (RPO): max. {rpo:24} Stunden" + - "Recovery Time Objective (RTO): max. {rto:4} Stunden fuer kritische Systeme" + - "3-2-1-Regel: 3 Kopien, 2 Medien, 1 Off-Site" + - "Verschluesselung aller Backup-Medien" + - "Regelmaessige Restore-Tests (min. 
{restore_test_interval:quartalsweise})" + test_procedure_template: + - "Restore-Test: Vollstaendige Wiederherstellung eines kritischen Systems" + - "Pruefung der Backup-Verschluesselung" + - "Verifizierung der RPO/RTO-Einhaltung" + evidence_template: + - "Backup-Konzept" + - "Restore-Test-Protokolle" + - "Backup-Job-Logs" + severity_default: critical + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-34", ref: "Rev. 1" } + obligation_match_keywords: + - backup + - sicherung + - wiederherstellung + - recovery + - verfuegbarkeit + - restore + - datenverlust + tags: [backup, recovery, continuity] + composable_with: [CP-INC-001, CP-CRYP-001] + + # ========================================================================= + # SEC Domain — System Security (4 Patterns) + # ========================================================================= + + - id: CP-SEC-001 + name: patch_management + name_de: Patch-Management + domain: SEC + category: system + description: > + Systematischer Prozess zur zeitnahen Identifikation, Bewertung und + Installation von Sicherheitsupdates auf allen Systemen. + objective_template: > + Sicherheitsupdates zeitnah identifizieren, bewerten und auf allen + Systemen installieren, um bekannte Schwachstellen zu schliessen. + rationale_template: > + Ungepatchte Systeme sind der zweithaeufigste Angriffsvektor nach + schwachen Passwoertern. Viele erfolgreiche Angriffe nutzen + Schwachstellen, fuer die bereits Patches verfuegbar sind. + requirements_template: + - "Kritische Patches innerhalb von {critical_sla:72} Stunden nach Veroeffentlichung" + - "Hohe Patches innerhalb von {high_sla:7} Tagen" + - "Automatisierter Patch-Scan (min. 
taeglich)" + - "Patch-Dokumentation mit Risikobewertung" + - "Rollback-Verfahren fuer fehlgeschlagene Patches" + test_procedure_template: + - "Patch-Compliance-Scan aller Systeme" + - "Pruefung der Patch-SLAs anhand der letzten 10 kritischen Patches" + evidence_template: + - "Patch-Management-Richtlinie" + - "Patch-Compliance-Bericht" + - "Patch-Installationsprotokoll" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-40", ref: "Rev. 4" } + obligation_match_keywords: + - patch + - update + - schwachstelle + - aktualisierung + - sicherheitsupdate + - vulnerability + - software update + tags: [patching, vulnerability, system] + composable_with: [CP-SEC-002, CP-SEC-003] + + - id: CP-SEC-002 + name: vulnerability_scanning + name_de: Schwachstellenanalyse + domain: SEC + category: system + description: > + Regelmaessige automatisierte Schwachstellenscans aller Systeme + und Anwendungen mit strukturiertem Behebungsprozess. + objective_template: > + Schwachstellen in Systemen und Anwendungen fruehzeitig erkennen + und nach Risikobewertung priorisiert beheben. + rationale_template: > + Ohne regelmaessige Scans bleiben Schwachstellen unentdeckt und + erhoehen das Angriffsrisiko. Automatisierte Scans decken bekannte + CVEs zuverlaessig auf. + requirements_template: + - "Automatisierter Schwachstellenscan (min. {scan_interval:woechentlich})" + - "Priorisierung nach CVSS-Score und Geschaeftsrelevanz" + - "Kritische Schwachstellen (CVSS >= 9.0) innerhalb von {critical_fix:48} Stunden behoben" + - "Tracking aller offenen Schwachstellen im Ticketsystem" + - "Regelmaessige manuelle Penetrationstests (min. 
jaehrlich") + test_procedure_template: + - "Review des letzten Schwachstellenberichts" + - "Pruefung der Behebungsquote und -geschwindigkeit" + - "Stichprobe: 3 kritische Schwachstellen auf korrekte Behebung pruefen" + evidence_template: + - "Schwachstellenscan-Bericht" + - "Penetrationstest-Bericht" + - "Schwachstellen-Tracking (Ticketsystem)" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-115", ref: "Technical Guide" } + - { framework: "OWASP ASVS", ref: "V14.2" } + obligation_match_keywords: + - schwachstelle + - scan + - vulnerability + - penetrationstest + - sicherheitspruefung + - cve + - security assessment + tags: [vulnerability, scanning, security] + composable_with: [CP-SEC-001, CP-SEC-003] + + - id: CP-SEC-003 + name: secure_configuration + name_de: Sichere Konfiguration + domain: SEC + category: system + description: > + Haertung aller Systeme durch Entfernung unnoetiger Dienste, Aenderung + von Default-Credentials und Anwendung von Sicherheitsbaselines. + objective_template: > + Alle Systeme gemaess anerkannter Sicherheitsbaselines haerten und + unsichere Standardkonfigurationen eliminieren. + rationale_template: > + Standardkonfigurationen enthalten oft unsichere Defaults, aktivierte + Debug-Dienste und bekannte Standardpasswoerter, die trivial ausnutzbar sind. + requirements_template: + - "Sicherheitsbaseline fuer jedes Betriebssystem und jede Middleware definiert" + - "Default-Credentials bei Inbetriebnahme geaendert" + - "Nicht benoetigte Dienste und Ports deaktiviert" + - "Konfigurationsdrift-Erkennung implementiert" + - "Infrastructure-as-Code mit versionierten Konfigurationen" + test_procedure_template: + - "Baseline-Compliance-Scan (z.B. 
CIS Benchmark)" + - "Pruefung auf aktive Default-Credentials" + - "Port-Scan auf unerwartete offene Dienste" + evidence_template: + - "Haertungsrichtlinie" + - "CIS-Benchmark-Scan-Ergebnis" + - "Konfigurationsmanagement-Dokumentation" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-123", ref: "Server Security Guide" } + obligation_match_keywords: + - konfiguration + - haertung + - hardening + - baseline + - default + - sicherheitskonfiguration + - configuration + tags: [configuration, hardening, system] + composable_with: [CP-SEC-001, CP-SEC-002] + + - id: CP-NET-001 + name: network_segmentation + name_de: Netzwerksegmentierung + domain: NET + category: network + description: > + Aufteilung des Netzwerks in sicherheitstechnisch getrennte Zonen + zur Eindaemmung von Lateral Movement bei Sicherheitsvorfaellen. + objective_template: > + Netzwerk in sicherheitstechnisch getrennte Zonen aufteilen, um die + Ausbreitung von Angriffen zu begrenzen und sensible Systeme zu isolieren. + rationale_template: > + Flache Netzwerke ermoeglichen es Angreifern, sich nach initialem + Zugriff lateral durch das gesamte Netzwerk zu bewegen. + requirements_template: + - "Trennung in mindestens 3 Zonen: DMZ, intern, Management" + - "Firewall-Regeln zwischen allen Zonen definiert und dokumentiert" + - "Datenbankserver nicht direkt aus dem Internet erreichbar" + - "Micro-Segmentierung fuer kritische Anwendungen evaluiert" + - "Regelmaessige Ueberpruefung der Firewall-Regeln (min. 
{review_interval:halbjaehrlich})" + test_procedure_template: + - "Netzwerk-Topologie-Review auf korrekte Segmentierung" + - "Port-Scan zwischen Zonen auf unerlaubte Verbindungen" + - "Pruefung der Firewall-Regel-Dokumentation" + evidence_template: + - "Netzwerksegmentierungsplan" + - "Firewall-Regelwerk" + - "Netzwerk-Scan-Ergebnisse" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-41", ref: "Rev. 1" } + obligation_match_keywords: + - netzwerk + - segmentierung + - firewall + - zone + - network + - segmentation + - isolation + tags: [network, segmentation, firewall] + composable_with: [CP-CRYP-002] + + # ========================================================================= + # COMP Domain — Compliance & Governance (5 Patterns) + # ========================================================================= + + - id: CP-COMP-001 + name: risk_assessment + name_de: Risikobewertung + domain: COMP + category: risk + description: > + Systematische Identifikation, Bewertung und Behandlung von + Informationssicherheits- und Datenschutzrisiken. + objective_template: > + Risiken fuer die Informationssicherheit und den Datenschutz + systematisch identifizieren, bewerten und angemessen behandeln. + rationale_template: > + Ohne systematische Risikobewertung werden Ressourcen nicht + risikobasiert eingesetzt und kritische Risiken bleiben unerkannt. + requirements_template: + - "Risikobewertungsmethodik definiert und dokumentiert" + - "Alle wesentlichen Assets und Verarbeitungen in der Risikobewertung erfasst" + - "Risikobewertung nach Eintrittswahrscheinlichkeit und Schadenshoehe" + - "Risikomatrix mit definierten Akzeptanzschwellen" + - "Risikobehandlungsplan fuer alle nicht-akzeptierten Risiken" + - "Regelmaessige Aktualisierung (min. 
{review_interval:jaehrlich})" + test_procedure_template: + - "Review der Risikobewertungsmethodik auf Vollstaendigkeit" + - "Stichprobe: 5 Risiken auf angemessene Bewertung und Behandlung pruefen" + - "Pruefung der Aktualitaet (letztes Review-Datum)" + evidence_template: + - "Risikobewertungsmethodik" + - "Risikoregister" + - "Risikobehandlungsplan" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-30", ref: "Rev. 1" } + obligation_match_keywords: + - risiko + - bewertung + - risk + - assessment + - risikoanalyse + - bedrohung + - gefaehrdung + tags: [risk, assessment, governance] + composable_with: [CP-DATA-005, CP-COMP-004] + + - id: CP-COMP-002 + name: change_management + name_de: Aenderungsmanagement + domain: COMP + category: governance + description: > + Kontrollierter Prozess fuer alle Aenderungen an IT-Systemen, + Konfigurationen und sicherheitsrelevanten Prozessen. + objective_template: > + Alle Aenderungen an IT-Systemen und Prozessen kontrolliert + durchfuehren, um ungeplante Auswirkungen zu vermeiden. + rationale_template: > + Unkontrollierte Aenderungen sind eine der haeufigsten Ursachen fuer + Ausfaelle und Sicherheitsvorfaelle. Ein strukturierter Prozess + reduziert dieses Risiko erheblich. 
+ requirements_template: + - "Change-Request-Verfahren fuer alle Aenderungen an Produktivsystemen" + - "Risikobewertung und Genehmigung vor Umsetzung" + - "Rollback-Plan fuer jede Aenderung dokumentiert" + - "Vier-Augen-Prinzip fuer kritische Aenderungen" + - "Dokumentation aller Aenderungen (wer, was, wann, warum)" + test_procedure_template: + - "Stichprobe: 5 Aenderungen auf Einhaltung des Verfahrens pruefen" + - "Pruefung ob Emergency Changes nachtraeglich genehmigt wurden" + evidence_template: + - "Change-Management-Richtlinie" + - "Change-Log / Ticketsystem" + - "Genehmigungsprotokolle" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-128", ref: "Configuration Management" } + obligation_match_keywords: + - aenderung + - change + - konfiguration + - release + - deployment + - freigabe + - genehmigung + tags: [change_management, governance, process] + composable_with: [CP-SEC-003] + + - id: CP-COMP-003 + name: awareness_training + name_de: Sensibilisierung und Schulung + domain: COMP + category: personnel + description: > + Regelmaessige Schulungen und Sensibilisierungsmassnahmen fuer alle + Mitarbeiter zu Informationssicherheit und Datenschutz. + objective_template: > + Alle Mitarbeiter fuer Informationssicherheits- und Datenschutzrisiken + sensibilisieren und handlungsfaehig machen. + rationale_template: > + Der Mensch ist das schwaechste Glied in der Sicherheitskette. + Regelmaessige Schulungen reduzieren Phishing-Erfolgsraten und + unbeabsichtigte Datenschutzverletzungen nachweislich. + requirements_template: + - "Pflicht-Schulung fuer alle neuen Mitarbeiter vor Systemzugriff" + - "Jaehrliche Auffrischungsschulung fuer alle Mitarbeiter" + - "Phishing-Simulationen (min. 
{phishing_sim_interval:quartalsweise})" + - "Spezialschulungen fuer IT-Personal und Fuehrungskraefte" + - "Nachweis-System fuer absolvierte Schulungen" + test_procedure_template: + - "Schulungsquote pruefen (Ziel: >= {target_rate:95}%)" + - "Ergebnisse der letzten Phishing-Simulation auswerten" + - "Stichprobe: 5 Mitarbeiter auf aktuelle Schulungsnachweise pruefen" + evidence_template: + - "Schulungskonzept" + - "Schulungsnachweise / Teilnehmerlisten" + - "Phishing-Simulationsergebnisse" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-50", ref: "Security Awareness Training" } + obligation_match_keywords: + - schulung + - sensibilisierung + - awareness + - training + - mitarbeiter + - fortbildung + - unterweisung + tags: [training, awareness, personnel] + composable_with: [CP-INC-001] + + - id: CP-COMP-004 + name: compliance_governance + name_de: Compliance-Governance + domain: COMP + category: governance + description: > + Uebergreifendes Governance-Framework fuer die Steuerung und + Ueberwachung aller Compliance-Anforderungen der Organisation. + objective_template: > + Ein uebergreifendes Governance-Framework etablieren, das die + Einhaltung aller relevanten Compliance-Anforderungen steuert + und ueberwacht. + rationale_template: > + Ohne zentrales Compliance-Governance fehlt der Ueberblick ueber + regulatorische Anforderungen, Verantwortlichkeiten und den + Umsetzungsstand. + requirements_template: + - "Compliance-Organisation mit klaren Rollen und Verantwortlichkeiten" + - "Regulatorisches Register: Alle anwendbaren Vorschriften erfasst" + - "Regelmaessige Compliance-Bewertung (min. 
{review_interval:halbjaehrlich})" + - "Eskalationsprozess fuer Compliance-Verstoesse" + - "Berichtswesen an die Geschaeftsfuehrung" + test_procedure_template: + - "Pruefung des regulatorischen Registers auf Vollstaendigkeit" + - "Review der letzten Compliance-Bewertung" + - "Pruefung des Eskalationsprozesses" + evidence_template: + - "Compliance-Organisation (Organigramm)" + - "Regulatorisches Register" + - "Compliance-Berichte an die Geschaeftsfuehrung" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST Cybersecurity Framework", ref: "GV.OC" } + obligation_match_keywords: + - compliance + - governance + - konformitaet + - aufsicht + - vorschrift + - regulierung + - ueberwachung + tags: [compliance, governance, organization] + composable_with: [CP-COMP-001, CP-COMP-005] + + - id: CP-COMP-005 + name: supplier_assessment + name_de: Lieferanten- und Auftragnehmer-Bewertung + domain: COMP + category: supply_chain + description: > + Bewertung und Ueberwachung von Lieferanten und Auftragsverarbeitern + hinsichtlich Informationssicherheit und Datenschutz. + objective_template: > + Lieferanten und Auftragsverarbeiter vor Beauftragung und regelmaessig + danach auf Informationssicherheit und Datenschutz pruefen. + rationale_template: > + Auftragsverarbeiter-Haftung (Art. 28 DSGVO) erfordert nachweisbare + Pruefung. Supply-Chain-Angriffe nutzen schwache Glieder in der + Lieferkette aus. + requirements_template: + - "Sicherheitsbewertung vor Beauftragung (Due Diligence)" + - "Auftragsverarbeitungsvertrag (AVV) gemaess Art. 28 DSGVO" + - "Regelmaessige Ueberpruefung der Lieferanten (min. 
{review_interval:jaehrlich})" + - "Subunternehmer-Klausel mit Vorab-Genehmigungspflicht" + - "Exit-Strategie und Datenrueckgabe bei Vertragsende" + test_procedure_template: + - "Stichprobe: 5 Lieferanten auf vorhandenen AVV pruefen" + - "Pruefung der letzten Lieferanten-Bewertung" + - "Nachweis der Exit-Strategie fuer kritische Lieferanten" + evidence_template: + - "Lieferanten-Bewertungsbogen" + - "Auftragsverarbeitungsvertraege (AVVs)" + - "Lieferanten-Audit-Berichte" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-161", ref: "Rev. 1" } + - { framework: "ENISA Guidelines", ref: "Supply Chain Security" } + obligation_match_keywords: + - lieferant + - auftragnehmer + - auftragsverarbeiter + - supplier + - vendor + - avv + - unterauftrag + - supply chain + tags: [supply_chain, vendor, assessment] + composable_with: [CP-COMP-004] + + # ========================================================================= + # GOV Domain — Public Sector & Regulatory (1 Pattern) + # ========================================================================= + + - id: CP-GOV-001 + name: regulatory_reporting + name_de: Regulatorische Berichtspflichten + domain: GOV + category: compliance + description: > + Einhaltung gesetzlicher Berichts- und Meldepflichten gegenueber + Aufsichtsbehoerden, einschliesslich Fristen und Formanforderungen. + objective_template: > + Alle regulatorischen Berichts- und Meldepflichten identifizieren, + terminieren und fristgerecht erfuellen. + rationale_template: > + Versaeumte Meldepflichten koennen zu empfindlichen Bussgeldern + fuehren (z.B. Art. 83 DSGVO, NIS2-Meldepflichten). 
+ requirements_template: + - "Register aller Meldepflichten mit Fristen und Ansprechpartnern" + - "Automatische Fristenueberwachung und Erinnerungen" + - "Meldewege zu allen relevanten Aufsichtsbehoerden dokumentiert" + - "Melde-Templates fuer die haeufigsten Vorfallarten vorbereitet" + - "Schulung der meldeverantwortlichen Personen" + test_procedure_template: + - "Pruefung des Meldepflichten-Registers auf Vollstaendigkeit" + - "Simulation: Datenschutzvorfall — Meldekette bis Behoerde pruefen" + evidence_template: + - "Meldepflichten-Register" + - "Melde-Templates" + - "Fristen-Monitoring-Konfiguration" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "ENISA Guidelines", ref: "Incident Reporting" } + obligation_match_keywords: + - meldepflicht + - berichtspflicht + - aufsichtsbehoerde + - reporting + - notification + - frist + - behoerde + tags: [regulatory, reporting, notification] + composable_with: [CP-INC-001] + + # ========================================================================= + # AI Domain — AI Governance (2 Patterns) + # ========================================================================= + + - id: CP-AI-001 + name: ai_risk_management + name_de: KI-Risikomanagement + domain: AI + category: governance + description: > + Risikobewertung und -management fuer den Einsatz von KI-Systemen + gemaess dem risikobasierten Ansatz des EU AI Act. + objective_template: > + Risiken des KI-Einsatzes systematisch bewerten, klassifizieren und + durch angemessene Massnahmen auf ein akzeptables Niveau reduzieren. + rationale_template: > + Der EU AI Act fordert ein risikobasiertes Managementsystem fuer + Hochrisiko-KI-Systeme. Ohne systematisches KI-Risikomanagement + drohen Verbote und Bussgelder bis 35 Mio. EUR. 
+ requirements_template: + - "KI-Inventar: Alle eingesetzten KI-Systeme erfasst und klassifiziert" + - "Risikoklassifizierung nach EU AI Act (unakzeptabel, hoch, begrenzt, minimal)" + - "Fuer Hochrisiko-KI: Konformitaetsbewertung nach Anhang VI/VII" + - "Human-Oversight-Massnahmen fuer alle KI-gestuetzten Entscheidungen" + - "Dokumentation der KI-Modelle und Trainingsdaten" + - "Regelmaessige Bias- und Fairness-Audits" + test_procedure_template: + - "Pruefung des KI-Inventars auf Vollstaendigkeit" + - "Review der Risikoklassifizierung eines Hochrisiko-KI-Systems" + - "Nachweis der Human-Oversight-Implementierung" + evidence_template: + - "KI-Inventar" + - "KI-Risikobewertung" + - "Konformitaetsbewertungsbericht" + - "Human-Oversight-Dokumentation" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST AI RMF", ref: "AI 100-1" } + obligation_match_keywords: + - ki + - kuenstliche intelligenz + - ai + - artificial intelligence + - algorithmus + - machine learning + - hochrisiko + - ai act + tags: [ai, risk_management, governance] + composable_with: [CP-AI-002, CP-COMP-001] + + - id: CP-AI-002 + name: ai_transparency + name_de: KI-Transparenz + domain: AI + category: governance + description: > + Transparenzpflichten beim Einsatz von KI-Systemen gegenueber + Nutzern, Betroffenen und Aufsichtsbehoerden. + objective_template: > + Betroffene Personen ueber den Einsatz von KI-Systemen informieren + und die Nachvollziehbarkeit von KI-Entscheidungen sicherstellen. + rationale_template: > + Art. 13/14 DSGVO und Art. 52 AI Act fordern Transparenz beim Einsatz + automatisierter Entscheidungssysteme. Intransparente KI untergraebt + Vertrauen und verletzt Betroffenenrechte. + requirements_template: + - "Kennzeichnung aller KI-gestuetzten Interaktionen fuer Nutzer" + - "Erklaerbarkeit: Wesentliche Entscheidungsfaktoren nachvollziehbar" + - "Informationspflicht bei automatisierten Einzelentscheidungen (Art. 
22 DSGVO)" + - "Dokumentation der Modellarchitektur und -leistung" + - "Recht auf Anfechtung KI-gestuetzter Entscheidungen" + test_procedure_template: + - "Pruefung der KI-Kennzeichnung in der Benutzeroberflaeche" + - "Test: Erklaerung einer KI-Entscheidung anfordern" + - "Review der KI-Dokumentation auf Vollstaendigkeit" + evidence_template: + - "KI-Transparenzbericht" + - "Screenshots: KI-Kennzeichnung in der Anwendung" + - "Modell-Dokumentation" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST AI RMF", ref: "MAP 5" } + obligation_match_keywords: + - transparenz + - erklaerbarkeit + - nachvollziehbarkeit + - kennzeichnung + - transparency + - explainability + - automated decision + tags: [ai, transparency, explainability] + composable_with: [CP-AI-001, CP-DATA-004] + + # ========================================================================= + # Remaining Universal Patterns (2 Patterns) + # ========================================================================= + + - id: CP-SEC-005 + name: physical_security + name_de: Physische Sicherheit + domain: SEC + category: physical + description: > + Schutz von Raeumlichkeiten, Hardware und Datentraegern vor + unbefugtem physischem Zugriff, Diebstahl und Umgebungsrisiken. + objective_template: > + IT-Infrastruktur und Datentraeger durch angemessene physische + Sicherheitsmassnahmen vor unbefugtem Zugriff schuetzen. + rationale_template: > + Physischer Zugriff auf Server, Netzwerkkomponenten oder Datentraeger + ermoeglicht die Umgehung aller logischen Sicherheitsmassnahmen. 
+ requirements_template: + - "Zutrittskontrolle fuer Serverraeume und Rechenzentren" + - "Besucherregelung mit Begleitung in sensiblen Bereichen" + - "Sichere Aufbewahrung und Entsorgung von Datentraegern" + - "Umgebungsueberwachung (Temperatur, Feuchte, Wasser, Rauch)" + - "USV und Notstromversorgung fuer kritische Systeme" + test_procedure_template: + - "Physischer Zugangsversuch ohne Berechtigung" + - "Pruefung der Besucherprotokolle" + - "Verifizierung der Umgebungsueberwachung" + evidence_template: + - "Zutrittskontrollkonzept" + - "Besucherprotokolle" + - "USV-Wartungsprotokolle" + severity_default: medium + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-53", ref: "PE-1 through PE-20" } + obligation_match_keywords: + - physisch + - zutritt + - physical + - gebaeude + - serverraum + - rechenzentrum + - datentraeger + tags: [physical, access, infrastructure] + composable_with: [CP-ACC-001] + + - id: CP-INC-003 + name: continuity_planning + name_de: Notfallplanung und Betriebskontinuitaet + domain: INC + category: continuity + description: > + Planung und regelmaessige Uebung von Massnahmen zur Aufrechterhaltung + oder schnellen Wiederherstellung kritischer Geschaeftsprozesse. + objective_template: > + Kritische Geschaeftsprozesse auch bei Stoerungen oder Katastrophen + aufrechterhalten oder schnellstmoeglich wiederherstellen. + rationale_template: > + Ohne BCM-Planung fuehren ungeplante Ausfaelle zu unkoordinierten + Reaktionen, verlaengerten Ausfallzeiten und potenziellem Datenverlust. + requirements_template: + - "Business-Impact-Analyse (BIA) fuer alle kritischen Prozesse" + - "Notfallplan mit klaren Anweisungen, Kontaktdaten und Eskalation" + - "Redundanz fuer kritische Systeme (Failover, Load Balancing)" + - "Regelmaessige BCM-Uebungen (min. 
{exercise_interval:jaehrlich})" + - "Dokumentierte Recovery-Reihenfolge nach Prioritaet" + test_procedure_template: + - "BCM-Uebung: Simulierter Ausfall eines kritischen Systems" + - "Pruefung der BIA auf Aktualitaet" + - "Verifizierung der Failover-Mechanismen" + evidence_template: + - "Business-Impact-Analyse" + - "Notfallplan" + - "BCM-Uebungsprotokolle" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-34", ref: "Rev. 1" } + obligation_match_keywords: + - notfall + - kontinuitaet + - continuity + - disaster + - ausfallsicherheit + - wiederanlauf + - bcm + - emergency + tags: [continuity, disaster_recovery, planning] + composable_with: [CP-INC-001, CP-INC-002] diff --git a/ai-compliance-sdk/policies/control_patterns/domain_it_security.yaml b/ai-compliance-sdk/policies/control_patterns/domain_it_security.yaml new file mode 100644 index 0000000..0dd2c8a --- /dev/null +++ b/ai-compliance-sdk/policies/control_patterns/domain_it_security.yaml @@ -0,0 +1,917 @@ +version: "1.0" +description: > + 20 IT-Security Domain Patterns — spezialisierte Patterns fuer sichere + Softwareentwicklung, API-Sicherheit, Container, Cloud und DevSecOps. + +patterns: + + # ========================================================================= + # SDLC — Secure Software Development (5 Patterns) + # ========================================================================= + + - id: CP-SEC-006 + name: secure_sdlc + name_de: Sicherer Software-Entwicklungslebenszyklus + domain: SEC + category: application + description: > + Integration von Sicherheitsanforderungen in alle Phasen des + Software-Entwicklungslebenszyklus (Design, Implementierung, Test, Deployment). + objective_template: > + Sicherheitsanforderungen systematisch in alle Phasen der + Softwareentwicklung integrieren (Security by Design). 
+ rationale_template: > + Sicherheitsmaengel, die erst nach dem Deployment entdeckt werden, + kosten bis zu 30x mehr als fruehe Erkennung. Ein sicherer SDLC + reduziert Schwachstellen und beschleunigt die Time-to-Market. + requirements_template: + - "Sicherheitsanforderungen in User Stories / Tickets erfasst" + - "Threat Modeling fuer neue Features und Architekturentscheidungen" + - "Security Champions in jedem Entwicklungsteam" + - "Sicherheits-Abnahme vor Produktiv-Deployment" + - "Security-Retrospektiven bei Schwachstellenfunden" + test_procedure_template: + - "Review: 5 aktuelle User Stories auf Sicherheitsanforderungen pruefen" + - "Nachweis eines Threat Models fuer eine neue Komponente" + - "Pruefung ob Security Champion je Team benannt ist" + evidence_template: + - "Secure SDLC Policy" + - "Threat-Model-Dokumentation" + - "Security-Champion-Liste" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "OWASP SAMM", ref: "v2.0" } + - { framework: "NIST SP 800-218", ref: "SSDF" } + obligation_match_keywords: + - entwicklung + - sdlc + - software + - design + - security by design + - threat model + - development + tags: [sdlc, development, security_by_design] + composable_with: [CP-SEC-007, CP-SEC-008] + + - id: CP-SEC-007 + name: code_review + name_de: Code-Review und statische Analyse + domain: SEC + category: application + description: > + Systematische Pruefung von Quellcode durch Peer Reviews und + automatisierte statische Analyse (SAST) vor Zusammenfuehrung. + objective_template: > + Sicherheitsschwachstellen im Quellcode durch manuelle Reviews + und automatisierte statische Analyse vor dem Merge erkennen. + rationale_template: > + Automatisierte SAST-Tools erkennen bekannte Schwachstellenmuster + zuverlaessig, waehrend manuelle Reviews Logikfehler und Design- + Schwaechen aufdecken, die Tools uebersehen. 
+ requirements_template: + - "SAST-Tool in CI/CD-Pipeline integriert (Build bricht bei kritischen Findings)" + - "Peer Review fuer alle Aenderungen vor Merge in Hauptbranch" + - "Mindestens ein Reviewer mit Security-Expertise fuer sicherheitsrelevante Aenderungen" + - "SAST-Regelwerk regelmaessig aktualisiert" + - "False-Positive-Management mit dokumentierter Begruendung" + test_procedure_template: + - "Pruefung der SAST-Integration in CI/CD" + - "Review: 5 aktuelle Merge Requests auf Peer-Review-Nachweis pruefen" + - "Stichprobe: SAST-Findings auf Behandlung pruefen" + evidence_template: + - "Code-Review-Richtlinie" + - "SAST-Konfiguration und -Berichte" + - "Merge-Request-Statistik mit Review-Nachweis" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V14.1" } + - { framework: "NIST SP 800-218", ref: "PW.7" } + obligation_match_keywords: + - code review + - quellcode + - statische analyse + - sast + - review + - pruefung + - source code + tags: [code_review, sast, development] + composable_with: [CP-SEC-006, CP-SEC-008] + + - id: CP-SEC-008 + name: dependency_management + name_de: Abhaengigkeitsmanagement + domain: SEC + category: application + description: > + Verwaltung und Ueberwachung von Softwareabhaengigkeiten (Libraries, + Frameworks) auf bekannte Schwachstellen und Lizenzkonformitaet. + objective_template: > + Alle Softwareabhaengigkeiten inventarisieren, auf bekannte + Schwachstellen ueberwachen und zeitnah aktualisieren. + rationale_template: > + Supply-Chain-Angriffe ueber kompromittierte Abhaengigkeiten nehmen + stark zu. SBOM-Pflichten (CRA, NIS2) erfordern vollstaendige + Transparenz ueber eingesetzte Komponenten. 
+ requirements_template: + - "Software Bill of Materials (SBOM) fuer alle Anwendungen" + - "Automatisierte Schwachstellenpruefung aller Abhaengigkeiten (SCA) in CI/CD" + - "Kritische Schwachstellen in Abhaengigkeiten innerhalb von {critical_sla:72} Stunden behoben" + - "Lizenzpruefung aller Abhaengigkeiten gegen Whitelist" + - "Lock-Files versioniert und integritaetsgeschuetzt" + test_procedure_template: + - "SBOM auf Vollstaendigkeit pruefen" + - "SCA-Bericht: Offene Schwachstellen in Abhaengigkeiten" + - "Lizenz-Compliance-Bericht pruefen" + evidence_template: + - "SBOM (CycloneDX oder SPDX Format)" + - "SCA-Bericht" + - "Lizenz-Compliance-Bericht" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V14.2" } + - { framework: "NIST SP 800-218", ref: "PS.3" } + obligation_match_keywords: + - abhaengigkeit + - dependency + - sbom + - library + - framework + - supply chain + - komponente + - sca + tags: [dependency, sbom, supply_chain] + composable_with: [CP-SEC-006, CP-COMP-005] + + - id: CP-SEC-009 + name: input_validation + name_de: Eingabevalidierung + domain: SEC + category: application + description: > + Validierung und Bereinigung aller Eingabedaten an Systemgrenzen + zum Schutz vor Injection-Angriffen und Datenkorrumpierung. + objective_template: > + Alle Eingabedaten an Systemgrenzen validieren und bereinigen, + um Injection-Angriffe und Datenkorrumpierung zu verhindern. + rationale_template: > + Injection-Schwachstellen (SQL, XSS, Command Injection) bleiben + seit ueber einem Jahrzehnt in den OWASP Top 10. Konsequente + Eingabevalidierung ist die effektivste Gegenmassnahme. 
+ requirements_template: + - "Whitelist-Validierung fuer alle Benutzereingaben" + - "Parametrisierte Queries fuer alle Datenbankzugriffe (kein String-Concat)" + - "Output-Encoding kontextabhaengig (HTML, JS, URL, SQL)" + - "Maximale Laengen und Typbeschraenkungen fuer alle Eingabefelder" + - "Server-seitige Validierung (Client-seitige Validierung allein genuegt nicht)" + test_procedure_template: + - "DAST-Scan auf Injection-Schwachstellen" + - "Code-Review: Stichprobe auf parametrisierte Queries" + - "Test: XSS-Payload in Eingabefelder eingeben" + evidence_template: + - "DAST-Scan-Bericht" + - "Code-Review-Ergebnis (Injection-Bereich)" + severity_default: critical + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V5.1, V5.2, V5.3" } + - { framework: "OWASP Top 10", ref: "A03:2021 Injection" } + obligation_match_keywords: + - eingabe + - validierung + - input + - injection + - xss + - sanitization + - bereinigung + tags: [input_validation, injection, application_security] + composable_with: [CP-SEC-010, CP-SEC-007] + + - id: CP-SEC-010 + name: error_handling + name_de: Fehlerbehandlung und Informationspreisgabe + domain: SEC + category: application + description: > + Sichere Fehlerbehandlung, die keine sensitiven Informationen + in Fehlermeldungen, Stack Traces oder Logs preisgibt. + objective_template: > + Fehler sicher behandeln, sodass Nutzern hilfreiche aber keine + sicherheitsrelevanten Informationen angezeigt werden. + rationale_template: > + Detaillierte Fehlermeldungen in Produktion offenbaren Technologie-Stack, + Datenbankstruktur und interne Pfade — wertvolle Informationen fuer + Angreifer zur Angriffsvorbereitung. 
+ requirements_template: + - "Generische Fehlermeldungen fuer Endnutzer in Produktion" + - "Detaillierte Fehler nur in internen Logs (nicht in API-Responses)" + - "Keine Stack Traces, SQL-Queries oder interne Pfade in Responses" + - "Custom Error Pages fuer HTTP 4xx/5xx" + - "Zentrales Exception-Handling in allen Anwendungen" + test_procedure_template: + - "Provozierung von Fehlern und Pruefung der Response auf sensitive Daten" + - "Pruefung der Error Pages auf Informationspreisgabe" + - "Review: Exception-Handling-Muster im Code" + evidence_template: + - "Error-Handling-Richtlinie" + - "Penetrationstest-Bericht (Information Disclosure)" + severity_default: medium + implementation_effort_default: s + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V7.4" } + - { framework: "OWASP Top 10", ref: "A05:2021 Security Misconfiguration" } + obligation_match_keywords: + - fehler + - error + - fehlermeldung + - informationspreisgabe + - information disclosure + - stack trace + - exception + tags: [error_handling, information_disclosure, application_security] + composable_with: [CP-SEC-009, CP-LOG-001] + + # ========================================================================= + # API & Web Security (3 Patterns) + # ========================================================================= + + - id: CP-SEC-011 + name: api_security + name_de: API-Sicherheit + domain: SEC + category: application + description: > + Absicherung von APIs durch Authentifizierung, Rate Limiting, + Eingabevalidierung und Zugriffskontrolle. + objective_template: > + Alle APIs durch angemessene Authentifizierung, Autorisierung, + Rate Limiting und Eingabevalidierung schuetzen. + rationale_template: > + APIs sind die primaere Angriffsflaeche moderner Anwendungen. + Ungeschuetzte APIs ermoeglichen Massendatenabfluss, Missbrauch + und Denial-of-Service. 
+ requirements_template: + - "Authentifizierung fuer alle nicht-oeffentlichen API-Endpunkte" + - "Autorisierung auf Objektebene (BOLA/IDOR-Schutz)" + - "Rate Limiting und Throttling implementiert" + - "API-Schema-Validierung (OpenAPI/JSON Schema)" + - "API-Versionierung und Deprecation-Policy" + - "CORS-Policy restriktiv konfiguriert" + test_procedure_template: + - "API-Sicherheits-Scan (z.B. OWASP ZAP)" + - "Test: Zugriff auf fremde Ressourcen via IDOR" + - "Test: Rate Limit ueberschreiten" + evidence_template: + - "API-Sicherheitsrichtlinie" + - "API-Security-Scan-Bericht" + - "OpenAPI-Spezifikation" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP API Security Top 10", ref: "2023 Edition" } + - { framework: "OWASP ASVS", ref: "V13.1" } + obligation_match_keywords: + - api + - schnittstelle + - endpunkt + - rest + - graphql + - interface + - webservice + tags: [api, security, web] + composable_with: [CP-AUTH-003, CP-SEC-009] + + - id: CP-SEC-012 + name: csrf_protection + name_de: CSRF-Schutz + domain: SEC + category: application + description: > + Schutz vor Cross-Site Request Forgery durch Token-basierte + Validierung und SameSite-Cookie-Attribute. + objective_template: > + Webanwendungen gegen Cross-Site Request Forgery schuetzen, sodass + unautorisierte Aktionen im Namen authentifizierter Nutzer verhindert werden. + rationale_template: > + CSRF ermoeglicht es Angreifern, authentifizierte Nutzer zu + unbeabsichtigten Aktionen zu verleiten — z.B. Passwortaenderung + oder Datenloeschung. 
+ requirements_template: + - "CSRF-Token fuer alle zustandsaendernden Requests (POST, PUT, DELETE)" + - "SameSite=Strict oder Lax fuer Session-Cookies" + - "Double-Submit-Cookie-Pattern als Alternative wo CSRF-Tokens nicht moeglich" + - "Pruefung des Origin- oder Referer-Headers fuer kritische Aktionen" + test_procedure_template: + - "Test: Zustandsaendernden Request ohne CSRF-Token senden" + - "Pruefung der SameSite-Cookie-Attribute" + - "DAST-Scan: CSRF-Findings auswerten" + evidence_template: + - "DAST-Scan-Bericht (CSRF-Bereich)" + - "Cookie-Konfiguration" + severity_default: medium + implementation_effort_default: s + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V4.2.2" } + - { framework: "OWASP Top 10", ref: "A01:2021 Broken Access Control" } + obligation_match_keywords: + - csrf + - cross-site + - request forgery + - samesite + - token + - formular + tags: [csrf, web_security, application_security] + composable_with: [CP-SEC-009, CP-SEC-011] + + - id: CP-SEC-013 + name: content_security_policy + name_de: Content Security Policy + domain: SEC + category: application + description: > + Implementierung einer Content Security Policy (CSP) zum Schutz + vor XSS, Clickjacking und anderen Browser-basierten Angriffen. + objective_template: > + Browser-basierte Angriffe durch eine restriktive Content Security + Policy (CSP) und Security-Header begrenzen. + rationale_template: > + CSP ist die effektivste clientseitige Verteidigung gegen XSS-Angriffe + und reduziert die Auswirkungen von Injection-Schwachstellen erheblich. + requirements_template: + - "CSP-Header mit restriktiven Direktiven (kein unsafe-inline ohne Nonce)" + - "X-Frame-Options oder frame-ancestors gegen Clickjacking" + - "X-Content-Type-Options: nosniff" + - "Referrer-Policy konfiguriert" + - "CSP-Report-URI fuer Monitoring konfiguriert" + test_procedure_template: + - "Security-Header-Scan (z.B. 
securityheaders.com)" + - "Test: Inline-Script-Ausfuehrung ohne Nonce (muss blockiert werden)" + - "Review: CSP-Violation-Reports der letzten 30 Tage" + evidence_template: + - "Security-Header-Scan-Ergebnis" + - "CSP-Konfiguration" + - "CSP-Violation-Report-Statistik" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V14.4" } + obligation_match_keywords: + - csp + - content security + - header + - xss + - clickjacking + - browser + - security header + tags: [csp, web_security, headers] + composable_with: [CP-SEC-009, CP-SEC-012] + + # ========================================================================= + # Container & Cloud Security (3 Patterns) + # ========================================================================= + + - id: CP-SEC-014 + name: container_security + name_de: Container-Sicherheit + domain: SEC + category: system + description: > + Absicherung von Container-Umgebungen (Docker, Kubernetes) durch + Image-Scanning, Laufzeitschutz und sichere Konfiguration. + objective_template: > + Container-Umgebungen durch Image-Haertung, Laufzeitschutz und + Netzwerk-Policies sicher konfigurieren und betreiben. + rationale_template: > + Container mit bekannten Schwachstellen oder ueberprivilegierter + Konfiguration sind ein haeufiger Angriffsvektor. Die geteilte + Kernel-Architektur erfordert besondere Sicherheitsmassnahmen. 
+ requirements_template: + - "Base-Image-Scanning in CI/CD-Pipeline" + - "Keine Container mit Root-Privileges in Produktion" + - "Read-Only-Filesystem wo moeglich" + - "Netzwerk-Policies zwischen Container-Namespaces" + - "Image-Registry nur aus vertrauenswuerdigen Quellen" + - "Automatische Rebuilds bei bekannten CVEs in Base Images" + test_procedure_template: + - "Container-Image-Scan aller Produktiv-Images" + - "Pruefung: Kein Container laeuft als Root" + - "Pruefung der Netzwerk-Policies" + evidence_template: + - "Container-Sicherheitsrichtlinie" + - "Image-Scan-Bericht" + - "Kubernetes-/Docker-Konfiguration" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-190", ref: "Container Security Guide" } + obligation_match_keywords: + - container + - docker + - kubernetes + - k8s + - image + - pod + - deployment + tags: [container, docker, kubernetes] + composable_with: [CP-SEC-003, CP-SEC-015] + + - id: CP-SEC-015 + name: cloud_security_posture + name_de: Cloud-Sicherheitslage + domain: SEC + category: system + description: > + Kontinuierliche Ueberwachung und Durchsetzung von Sicherheitsrichtlinien + in Cloud-Umgebungen (IaaS, PaaS, SaaS). + objective_template: > + Cloud-Ressourcen durch automatisierte Richtlinienpruefung und + Konfigurationsmanagement sicher konfigurieren und ueberwachen. + rationale_template: > + Cloud-Fehlkonfigurationen (offene S3-Buckets, oeffentlich erreichbare + Datenbanken) sind fuer einen Grossteil der Cloud-Datenpannen verantwortlich. 
+ requirements_template: + - "Cloud Security Posture Management (CSPM) Tool im Einsatz" + - "Automatisierte Pruefung auf oeffentlich erreichbare Ressourcen" + - "Verschluesselung aller Cloud-Storage-Dienste erzwungen" + - "IAM-Policies nach Least-Privilege konfiguriert" + - "Cloud-Trail / Audit-Logging aktiviert" + - "Multi-Region-Backup fuer kritische Daten" + test_procedure_template: + - "CSPM-Scan der Cloud-Umgebung" + - "Pruefung auf oeffentlich erreichbare Ressourcen" + - "Review der IAM-Policies auf ueberprivilegierte Accounts" + evidence_template: + - "CSPM-Bericht" + - "Cloud-IAM-Policy-Review" + - "Cloud-Audit-Log-Konfiguration" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-144", ref: "Cloud Computing Guide" } + obligation_match_keywords: + - cloud + - iaas + - paas + - saas + - aws + - azure + - gcp + - cloud security + tags: [cloud, cspm, infrastructure] + composable_with: [CP-SEC-014, CP-CRYP-001] + + - id: CP-SEC-016 + name: infrastructure_as_code + name_de: Infrastruktur als Code + domain: SEC + category: system + description: > + Verwaltung der gesamten Infrastruktur als versionierter Code mit + Sicherheitspruefung vor dem Deployment. + objective_template: > + Infrastruktur-Konfigurationen als versionierten Code verwalten + und vor dem Deployment automatisiert auf Sicherheit pruefen. + rationale_template: > + IaC ermoeglicht reproduzierbare, auditierbare und sicherheitsgepruefte + Infrastruktur. Manuelle Konfiguration fuehrt zu Configuration Drift + und nicht-nachvollziehbaren Aenderungen. + requirements_template: + - "Alle Infrastruktur-Konfigurationen in Git versioniert" + - "IaC-Security-Scanner in CI/CD (z.B. 
tfsec, checkov)" + - "Keine manuellen Aenderungen an Produktiv-Infrastruktur" + - "Review-Prozess fuer Infrastruktur-Aenderungen" + - "State-Dateien verschluesselt und zugriffsbeschraenkt" + test_procedure_template: + - "IaC-Security-Scan-Ergebnis pruefen" + - "Vergleich: Ist-Zustand vs. Code-Zustand (Drift-Detection)" + - "Review: Letzte 5 Infrastruktur-Aenderungen auf Audit-Trail pruefen" + evidence_template: + - "IaC-Repository (Zugangsnachweis)" + - "IaC-Security-Scan-Bericht" + - "Drift-Detection-Ergebnis" + severity_default: medium + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-128", ref: "Configuration Management" } + obligation_match_keywords: + - infrastruktur + - terraform + - ansible + - konfiguration + - infrastructure + - iac + - deployment + - provisioning + tags: [iac, infrastructure, devops] + composable_with: [CP-SEC-003, CP-COMP-002] + + # ========================================================================= + # Identity & Secrets (3 Patterns) + # ========================================================================= + + - id: CP-AUTH-005 + name: secrets_management + name_de: Secrets-Management + domain: AUTH + category: authentication + description: > + Sichere Verwaltung von Secrets (API-Keys, Datenbank-Passwoerter, + Zertifikate) ueber ihren gesamten Lebenszyklus. + objective_template: > + Secrets sicher speichern, verteilen und rotieren — keine Secrets + im Quellcode, in Umgebungsvariablen auf Disk oder in Logs. + rationale_template: > + Geleakte Secrets in Git-Repositories sind eine der haeufigsten + Ursachen fuer Datenpannen. Automatisiertes Secrets-Management + eliminiert menschliche Fehler bei der Secret-Verwaltung. + requirements_template: + - "Secrets in einem dedizierten Secrets Manager (Vault, AWS SM, etc.)" + - "Keine Secrets in Quellcode, Docker-Images oder CI/CD-Logs" + - "Pre-Commit-Hook: Automatische Secret-Erkennung (z.B. 
gitleaks)" + - "Automatische Rotation fuer alle maschinellen Secrets" + - "Audit-Log fuer jeden Secret-Zugriff" + test_procedure_template: + - "Repository-Scan auf eingebettete Secrets (gitleaks, trufflehog)" + - "Pruefung der Secrets-Manager-Konfiguration" + - "Stichprobe: 5 Secrets auf Rotationshistorie pruefen" + evidence_template: + - "Secrets-Management-Richtlinie" + - "Secret-Scan-Bericht (Repository)" + - "Secrets-Manager-Audit-Log" + severity_default: critical + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V6.4" } + - { framework: "NIST SP 800-57", ref: "Part 1" } + obligation_match_keywords: + - secret + - api key + - passwort + - credential + - token + - zertifikat + - schluessel + - vault + tags: [secrets, vault, credential] + composable_with: [CP-CRYP-003, CP-SEC-006] + + - id: CP-AUTH-006 + name: service_authentication + name_de: Service-zu-Service-Authentifizierung + domain: AUTH + category: authentication + description: > + Authentifizierung und Autorisierung zwischen internen Diensten + (Microservices) durch mTLS, JWT oder Service Mesh. + objective_template: > + Interne Service-Kommunikation authentifizieren und autorisieren, + um Lateral Movement bei kompromittierten Diensten zu verhindern. + rationale_template: > + Ohne Service-Authentifizierung kann ein kompromittierter Dienst + ungehindert auf alle anderen internen Dienste zugreifen. 
+ requirements_template: + - "Mutual TLS (mTLS) oder JWT-basierte Authentifizierung zwischen Services" + - "Service-Identitaeten zentral verwaltet und automatisch rotiert" + - "Autorisierung auf API-Ebene (nicht nur Netzwerk-Ebene)" + - "Service Mesh oder API Gateway fuer zentrales Policy-Enforcement" + test_procedure_template: + - "Test: Nicht-authentifizierter Service-Zugriff (muss abgelehnt werden)" + - "Pruefung der mTLS-Konfiguration" + - "Review: Service-Autorisierungs-Policies" + evidence_template: + - "Service-Authentifizierungskonzept" + - "mTLS/JWT-Konfiguration" + - "Service-Mesh-Policies" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "NIST SP 800-204", ref: "Microservices Security" } + obligation_match_keywords: + - service + - microservice + - mtls + - service mesh + - internal + - api gateway + - dienst + tags: [service_auth, microservices, mtls] + composable_with: [CP-CRYP-002, CP-SEC-011] + + - id: CP-AUTH-007 + name: identity_lifecycle + name_de: Identitaets-Lebenszyklus + domain: AUTH + category: identity + description: > + Verwaltung des vollstaendigen Lebenszyklus digitaler Identitaeten + vom Onboarding ueber Rollenaenderungen bis zum Offboarding. + objective_template: > + Digitale Identitaeten vom Onboarding bis zum Offboarding + lueckenlos und zeitnah verwalten. + rationale_template: > + Verwaiste Konten nach Mitarbeiter-Austritt sind ein haeufiger + Angriffsvektor. Unvollstaendiges Offboarding fuehrt zu + unautorisiertem Zugriff und Compliance-Verstoessen. 
+ requirements_template: + - "Automatisiertes Onboarding mit Standardrechten je Rolle" + - "Offboarding-Checkliste mit Entzug aller Zugaenge innerhalb {offboard_hours:24} Stunden" + - "Automatische Deaktivierung bei Vertragsende" + - "Regelmaessige Kontenbereinigung auf verwaiste Accounts" + - "Gastkonten mit automatischem Ablaufdatum" + test_procedure_template: + - "Stichprobe: 3 kuerzlich ausgeschiedene Mitarbeiter — alle Zugaenge deaktiviert?" + - "Scan auf verwaiste Konten (keine Anmeldung > 90 Tage)" + - "Pruefung der Offboarding-Checkliste" + evidence_template: + - "Onboarding-/Offboarding-Prozessbeschreibung" + - "Bericht verwaiste Konten" + - "Offboarding-Checklisten (Stichprobe)" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-53", ref: "AC-2, PS-4" } + obligation_match_keywords: + - identitaet + - onboarding + - offboarding + - lebenszyklus + - lifecycle + - konto + - account + - benutzer + tags: [identity, lifecycle, onboarding] + composable_with: [CP-ACC-001, CP-AUTH-001] + + # ========================================================================= + # DevSecOps & CI/CD (3 Patterns) + # ========================================================================= + + - id: CP-SEC-017 + name: cicd_security + name_de: CI/CD-Pipeline-Sicherheit + domain: SEC + category: application + description: > + Absicherung der CI/CD-Pipeline gegen Manipulation, Credential-Leaks + und Supply-Chain-Angriffe. + objective_template: > + CI/CD-Pipelines gegen Manipulation absichern und als vertrauenswuerdige + Deployment-Kette etablieren. + rationale_template: > + Eine kompromittierte CI/CD-Pipeline ermoeglicht das Einschleusen von + Schadcode in Produktionssysteme ueber den vertrauenswuerdigen Build-Prozess. 
+ requirements_template: + - "Pipeline-Konfiguration versioniert und nur ueber Review aenderbar" + - "Secrets in CI/CD ueber Secrets Manager (nicht als Umgebungsvariablen)" + - "Build-Artefakte signiert und integritaetsgeprueft" + - "Minimale Berechtigungen fuer CI/CD-Runner" + - "Audit-Log aller Pipeline-Ausfuehrungen" + test_procedure_template: + - "Review: Pipeline-Konfiguration auf Secrets-Exposition pruefen" + - "Pruefung der Runner-Berechtigungen" + - "Pruefung der Artefakt-Signierung" + evidence_template: + - "CI/CD-Sicherheitsrichtlinie" + - "Pipeline-Konfiguration (Git-Review)" + - "Artefakt-Signierungs-Nachweis" + severity_default: high + implementation_effort_default: m + open_anchor_refs: + - { framework: "NIST SP 800-218", ref: "PO.3, PS.1" } + obligation_match_keywords: + - pipeline + - ci/cd + - build + - deployment + - continuous + - integration + - delivery + tags: [cicd, pipeline, devsecops] + composable_with: [CP-SEC-006, CP-AUTH-005] + + - id: CP-SEC-018 + name: dast_scanning + name_de: Dynamische Anwendungssicherheitstests + domain: SEC + category: application + description: > + Automatisierte dynamische Sicherheitstests (DAST) gegen laufende + Anwendungen zur Erkennung von Laufzeit-Schwachstellen. + objective_template: > + Laufende Anwendungen automatisiert auf Sicherheitsschwachstellen + testen, die nur zur Laufzeit erkennbar sind. + rationale_template: > + DAST erkennt Schwachstellen, die statische Analyse nicht findet: + Fehlkonfigurationen, fehlende Security-Header, CORS-Probleme und + Authentifizierungsfehler im Deployment-Kontext. + requirements_template: + - "Automatisierter DAST-Scan (min. 
{scan_interval:woechentlich}) gegen Staging" + - "Vollstaendiger DAST-Scan vor jedem Major Release" + - "Kritische DAST-Findings blockieren Produktiv-Deployment" + - "Authentifizierte Scans fuer geschuetzte Bereiche" + - "DAST-Tool-Konfiguration regelmaessig aktualisiert" + test_procedure_template: + - "Review des letzten DAST-Berichts" + - "Pruefung: Werden kritische Findings tatsaechlich behoben?" + - "Verifizierung der Scan-Abdeckung (alle Endpunkte)" + evidence_template: + - "DAST-Scan-Bericht" + - "Behebungsprotokoll fuer DAST-Findings" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V14.2" } + - { framework: "NIST SP 800-53", ref: "SA-11" } + obligation_match_keywords: + - dast + - dynamisch + - sicherheitstest + - penetrationstest + - runtime + - scanning + - web scanner + tags: [dast, scanning, testing] + composable_with: [CP-SEC-007, CP-SEC-002] + + - id: CP-LOG-003 + name: secure_logging_no_pii + name_de: Datenschutzkonformes Logging + domain: LOG + category: logging + description: > + Sicherstellung, dass Log-Daten keine personenbezogenen Daten + enthalten und dennoch fuer forensische Analyse nutzbar sind. + objective_template: > + Log-Daten so gestalten, dass sie fuer Sicherheitsanalyse und + Debugging nutzbar sind, ohne personenbezogene Daten preiszugeben. + rationale_template: > + Logs mit personenbezogenen Daten schaffen neue DSGVO-Verarbeitungen, + erhoehen die Angriffsflaeche und erschweren die Weitergabe an + Dritte (z.B. SOC-Provider). 
+ requirements_template: + - "Pseudonymisierung aller Nutzer-IDs in Logs" + - "Kein Logging von Passwoertern, Tokens, Kreditkartendaten" + - "IP-Adressen maximal gekuerzt geloggt (Privacy-Modus)" + - "Log-Retention gemaess Loeschkonzept" + - "Automatisierte PII-Erkennung in Log-Streams" + test_procedure_template: + - "Stichprobe: 100 Log-Eintraege auf PII-Freiheit pruefen" + - "Test: Login mit Passwort — Passwort darf nicht in Logs erscheinen" + - "Pruefung der Log-Retention-Konfiguration" + evidence_template: + - "Logging-Richtlinie (DSGVO-konform)" + - "Log-Stichproben-Analyse" + - "PII-Scanner-Konfiguration" + severity_default: medium + implementation_effort_default: m + open_anchor_refs: + - { framework: "OWASP ASVS", ref: "V7.1.3" } + - { framework: "ENISA Guidelines", ref: "Privacy-preserving Logging" } + obligation_match_keywords: + - logging + - personenbezogen + - protokollierung + - pii + - pseudonymisierung + - datenschutz + - log + tags: [logging, privacy, gdpr] + composable_with: [CP-LOG-001, CP-DATA-002] + + # ========================================================================= + # Data & Privacy (3 Patterns) + # ========================================================================= + + - id: CP-DATA-006 + name: data_subject_rights + name_de: Betroffenenrechte + domain: DATA + category: data_protection + description: > + Prozesse zur Erfuellung der Betroffenenrechte (Auskunft, Berichtigung, + Loeschung, Datenportabilitaet) gemaess Art. 15-22 DSGVO. + objective_template: > + Betroffenenrechte fristgerecht (max. 30 Tage) erfuellen und den + gesamten Prozess nachweisbar dokumentieren. + rationale_template: > + Die Nichterfuellung von Betroffenenrechten ist einer der haeufigsten + DSGVO-Beschwerdegruende bei Aufsichtsbehoerden und kann zu + empfindlichen Bussgeldern fuehren. 
+ requirements_template: + - "Anfrage-Kanal fuer Betroffenenrechte eingerichtet und dokumentiert" + - "Identitaetspruefung vor Auskunftserteilung" + - "Fristenueberwachung: Antwort innerhalb von {response_days:30} Tagen" + - "Automatisierte Datenextraktion fuer Auskunfts- und Portabilitaetsanfragen" + - "Loeschprozess fuer alle Systeme (inkl. Backups) definiert" + - "Dokumentation aller bearbeiteten Anfragen" + test_procedure_template: + - "Test: Auskunftsanfrage stellen und Antwort auf Vollstaendigkeit pruefen" + - "Test: Loeschanfrage stellen und Umsetzung in allen Systemen verifizieren" + - "Pruefung der Fristeneinhaltung der letzten 10 Anfragen" + evidence_template: + - "Betroffenenrechte-Prozessbeschreibung" + - "Bearbeitungsprotokoll (anonymisiert)" + - "Anfrageformular / Kontaktkanal-Dokumentation" + severity_default: high + implementation_effort_default: l + open_anchor_refs: + - { framework: "ENISA Guidelines", ref: "Data Subject Rights" } + obligation_match_keywords: + - betroffenenrechte + - auskunft + - loeschung + - berichtigung + - datenportabilitaet + - widerspruch + - data subject + tags: [data_protection, dsr, gdpr] + composable_with: [CP-DATA-003, CP-DATA-004] + + - id: CP-DATA-007 + name: data_transfer_safeguards + name_de: Datentransfer-Schutzmassnahmen + domain: DATA + category: data_protection + description: > + Schutzmassnahmen fuer die Uebermittlung personenbezogener Daten + in Drittlaender (SCC, Angemessenheitsbeschluss, BCR). + objective_template: > + Personenbezogene Daten nur unter Einhaltung der gesetzlichen + Uebermittlungsgarantien in Drittlaender transferieren. + rationale_template: > + Seit Schrems II erfordern Datentransfers in die USA und andere + Drittlaender zusaetzliche Schutzmassnahmen. Verstoesse fuehren + zu hohen Bussgeldern (z.B. Meta: 1,2 Mrd. EUR). 
+ requirements_template: + - "Register aller Drittland-Datentransfers" + - "Transfer Impact Assessment (TIA) fuer jeden Drittland-Transfer" + - "Standardvertragsklauseln (SCC) oder Angemessenheitsbeschluss" + - "Supplementary Measures wo erforderlich" + - "Regelmaessige Neubewertung (bei Rechtsaenderung oder Schrems-Entscheid)" + test_procedure_template: + - "Pruefung des Drittland-Transfer-Registers auf Vollstaendigkeit" + - "Review: TIA fuer einen Drittland-Transfer" + - "Stichprobe: SCC fuer 3 Auftragsverarbeiter pruefen" + evidence_template: + - "Drittland-Transfer-Register" + - "Transfer Impact Assessments" + - "Standardvertragsklauseln (unterzeichnet)" + severity_default: critical + implementation_effort_default: l + open_anchor_refs: + - { framework: "ENISA Guidelines", ref: "International Data Transfers" } + obligation_match_keywords: + - drittland + - transfer + - uebermittlung + - scc + - angemessenheit + - third country + - schrems + - binding corporate rules + tags: [data_transfer, international, gdpr] + composable_with: [CP-COMP-005, CP-DATA-004] + + - id: CP-DATA-008 + name: disposal_procedure + name_de: Sichere Datentraegerentsorgung + domain: DATA + category: data_protection + description: > + Sichere Vernichtung oder Bereinigung von Datentraegern bei + Ausserbetriebnahme, Rueckgabe oder Entsorgung. + objective_template: > + Daten auf ausgemusterten Datentraegern unwiederbringlich vernichten, + bevor diese das Unternehmen verlassen. + rationale_template: > + Unsachgemaess entsorgte Datentraeger sind eine haeufige Quelle + fuer Datenpannen. Selbst formatierte Festplatten koennen + wiederhergestellt werden. 
+ requirements_template: + - "Zertifizierte Datenvernichtung fuer alle ausgemusterten Datentraeger" + - "NIST SP 800-88 konforme Bereinigung (Clear, Purge oder Destroy)" + - "Vernichtungsprotokoll mit Seriennummer und Methode" + - "Verschluesselte Datentraeger: Cryptographic Erasure als Minimum" + - "Regelmaessiges Audit der Entsorgungsprozesse" + test_procedure_template: + - "Pruefung der Vernichtungsprotokolle der letzten 6 Monate" + - "Stichprobe: Versuch, Daten von bereinigtem Datentraeger wiederherzustellen" + evidence_template: + - "Datentraegerentsorgungsrichtlinie" + - "Vernichtungsprotokolle / -zertifikate" + severity_default: medium + implementation_effort_default: s + open_anchor_refs: + - { framework: "NIST SP 800-88", ref: "Rev. 1" } + obligation_match_keywords: + - entsorgung + - vernichtung + - datentraeger + - disposal + - loeschung + - bereinigung + - sanitization + tags: [disposal, data_destruction, physical] + composable_with: [CP-SEC-005, CP-DATA-003] diff --git a/backend-compliance/compliance/api/__init__.py b/backend-compliance/compliance/api/__init__.py index d456d7a..dc36085 100644 --- a/backend-compliance/compliance/api/__init__.py +++ b/backend-compliance/compliance/api/__init__.py @@ -53,6 +53,7 @@ _ROUTER_MODULES = [ "wiki_routes", "canonical_control_routes", "control_generator_routes", + "crosswalk_routes", "process_task_routes", "evidence_check_routes", ] diff --git a/backend-compliance/compliance/api/crosswalk_routes.py b/backend-compliance/compliance/api/crosswalk_routes.py new file mode 100644 index 0000000..ae5ee7f --- /dev/null +++ b/backend-compliance/compliance/api/crosswalk_routes.py @@ -0,0 +1,623 @@ +""" +FastAPI routes for the Multi-Layer Control Architecture. + +Pattern Library, Obligation Extraction, Crosswalk Matrix, and Migration endpoints. 
+ +Endpoints: + GET /v1/canonical/patterns — All patterns (with filters) + GET /v1/canonical/patterns/{pattern_id} — Single pattern + GET /v1/canonical/patterns/{pattern_id}/controls — Controls for a pattern + + POST /v1/canonical/obligations/extract — Extract obligations from text + GET /v1/canonical/crosswalk — Query crosswalk matrix + GET /v1/canonical/crosswalk/stats — Coverage statistics + + POST /v1/canonical/migrate/decompose — Pass 0a: Obligation extraction + POST /v1/canonical/migrate/compose-atomic — Pass 0b: Atomic control composition + POST /v1/canonical/migrate/link-obligations — Pass 1: Obligation linkage + POST /v1/canonical/migrate/classify-patterns — Pass 2: Pattern classification + POST /v1/canonical/migrate/triage — Pass 3: Quality triage + POST /v1/canonical/migrate/backfill-crosswalk — Pass 4: Crosswalk backfill + POST /v1/canonical/migrate/deduplicate — Pass 5: Deduplication + GET /v1/canonical/migrate/status — Migration progress + GET /v1/canonical/migrate/decomposition-status — Decomposition progress +""" + +import json +import logging +from typing import Optional, List + +from fastapi import APIRouter, HTTPException, Query +from pydantic import BaseModel +from sqlalchemy import text + +from database import SessionLocal + +logger = logging.getLogger(__name__) +router = APIRouter(prefix="/v1/canonical", tags=["crosswalk"]) + + +# ============================================================================= +# REQUEST / RESPONSE MODELS +# ============================================================================= + + +class PatternResponse(BaseModel): + id: str + name: str + name_de: str + domain: str + category: str + description: str + objective_template: str + severity_default: str + implementation_effort_default: str = "m" + tags: list = [] + composable_with: list = [] + open_anchor_refs: list = [] + controls_count: int = 0 + + +class PatternListResponse(BaseModel): + patterns: List[PatternResponse] + total: int + + +class 
PatternDetailResponse(PatternResponse): + rationale_template: str = "" + requirements_template: list = [] + test_procedure_template: list = [] + evidence_template: list = [] + obligation_match_keywords: list = [] + + +class ObligationExtractRequest(BaseModel): + text: str + regulation_code: Optional[str] = None + article: Optional[str] = None + paragraph: Optional[str] = None + + +class ObligationExtractResponse(BaseModel): + obligation_id: Optional[str] = None + obligation_title: Optional[str] = None + obligation_text: Optional[str] = None + method: str = "none" + confidence: float = 0.0 + regulation_id: Optional[str] = None + pattern_id: Optional[str] = None + pattern_confidence: float = 0.0 + + +class CrosswalkRow(BaseModel): + regulation_code: str = "" + article: Optional[str] = None + obligation_id: Optional[str] = None + pattern_id: Optional[str] = None + master_control_id: Optional[str] = None + confidence: float = 0.0 + source: str = "auto" + + +class CrosswalkQueryResponse(BaseModel): + rows: List[CrosswalkRow] + total: int + + +class CrosswalkStatsResponse(BaseModel): + total_rows: int = 0 + regulations_covered: int = 0 + obligations_linked: int = 0 + patterns_used: int = 0 + controls_linked: int = 0 + coverage_by_regulation: dict = {} + + +class MigrationRequest(BaseModel): + limit: int = 0 # 0 = no limit + + +class MigrationResponse(BaseModel): + status: str = "completed" + stats: dict = {} + + +class MigrationStatusResponse(BaseModel): + total_controls: int = 0 + has_obligation: int = 0 + has_pattern: int = 0 + fully_linked: int = 0 + deprecated: int = 0 + coverage_obligation_pct: float = 0.0 + coverage_pattern_pct: float = 0.0 + coverage_full_pct: float = 0.0 + + +class DecompositionStatusResponse(BaseModel): + rich_controls: int = 0 + decomposed_controls: int = 0 + total_candidates: int = 0 + validated: int = 0 + rejected: int = 0 + composed: int = 0 + atomic_controls: int = 0 + decomposition_pct: float = 0.0 + composition_pct: float = 0.0 + + +# 
============================================================================= +# PATTERN LIBRARY ENDPOINTS +# ============================================================================= + + +@router.get("/patterns", response_model=PatternListResponse) +async def list_patterns( + domain: Optional[str] = Query(None, description="Filter by domain (e.g. AUTH, CRYP)"), + category: Optional[str] = Query(None, description="Filter by category"), + tag: Optional[str] = Query(None, description="Filter by tag"), +): + """List all control patterns with optional filters.""" + from compliance.services.pattern_matcher import PatternMatcher + + matcher = PatternMatcher() + matcher._load_patterns() + matcher._build_keyword_index() + + patterns = matcher._patterns + + if domain: + patterns = [p for p in patterns if p.domain == domain.upper()] + if category: + patterns = [p for p in patterns if p.category == category.lower()] + if tag: + patterns = [p for p in patterns if tag.lower() in [t.lower() for t in p.tags]] + + # Count controls per pattern from DB + control_counts = _get_pattern_control_counts() + + response_patterns = [] + for p in patterns: + response_patterns.append(PatternResponse( + id=p.id, + name=p.name, + name_de=p.name_de, + domain=p.domain, + category=p.category, + description=p.description, + objective_template=p.objective_template, + severity_default=p.severity_default, + implementation_effort_default=p.implementation_effort_default, + tags=p.tags, + composable_with=p.composable_with, + open_anchor_refs=p.open_anchor_refs, + controls_count=control_counts.get(p.id, 0), + )) + + return PatternListResponse(patterns=response_patterns, total=len(response_patterns)) + + +@router.get("/patterns/{pattern_id}", response_model=PatternDetailResponse) +async def get_pattern(pattern_id: str): + """Get a single control pattern by ID.""" + from compliance.services.pattern_matcher import PatternMatcher + + matcher = PatternMatcher() + matcher._load_patterns() + + pattern = 
matcher.get_pattern(pattern_id) + if not pattern: + raise HTTPException(status_code=404, detail=f"Pattern {pattern_id} not found") + + control_counts = _get_pattern_control_counts() + + return PatternDetailResponse( + id=pattern.id, + name=pattern.name, + name_de=pattern.name_de, + domain=pattern.domain, + category=pattern.category, + description=pattern.description, + objective_template=pattern.objective_template, + rationale_template=pattern.rationale_template, + requirements_template=pattern.requirements_template, + test_procedure_template=pattern.test_procedure_template, + evidence_template=pattern.evidence_template, + severity_default=pattern.severity_default, + implementation_effort_default=pattern.implementation_effort_default, + tags=pattern.tags, + composable_with=pattern.composable_with, + open_anchor_refs=pattern.open_anchor_refs, + obligation_match_keywords=pattern.obligation_match_keywords, + controls_count=control_counts.get(pattern.id, 0), + ) + + +@router.get("/patterns/{pattern_id}/controls") +async def get_pattern_controls( + pattern_id: str, + limit: int = Query(50, ge=1, le=500), + offset: int = Query(0, ge=0), +): + """Get controls generated from a specific pattern.""" + db = SessionLocal() + try: + result = db.execute( + text(""" + SELECT id, control_id, title, objective, severity, + release_state, category, obligation_ids + FROM canonical_controls + WHERE pattern_id = :pattern_id + AND release_state NOT IN ('deprecated') + ORDER BY control_id + LIMIT :limit OFFSET :offset + """), + {"pattern_id": pattern_id.upper(), "limit": limit, "offset": offset}, + ) + rows = result.fetchall() + + count_result = db.execute( + text(""" + SELECT count(*) FROM canonical_controls + WHERE pattern_id = :pattern_id + AND release_state NOT IN ('deprecated') + """), + {"pattern_id": pattern_id.upper()}, + ) + total = count_result.fetchone()[0] + + controls = [] + for row in rows: + obl_ids = row[7] + if isinstance(obl_ids, str): + try: + obl_ids = 
json.loads(obl_ids) + except (json.JSONDecodeError, TypeError): + obl_ids = [] + controls.append({ + "id": str(row[0]), + "control_id": row[1], + "title": row[2], + "objective": row[3], + "severity": row[4], + "release_state": row[5], + "category": row[6], + "obligation_ids": obl_ids or [], + }) + + return {"controls": controls, "total": total} + finally: + db.close() + + +# ============================================================================= +# OBLIGATION EXTRACTION ENDPOINT +# ============================================================================= + + +@router.post("/obligations/extract", response_model=ObligationExtractResponse) +async def extract_obligation(req: ObligationExtractRequest): + """Extract obligation from text using 3-tier strategy, then match to pattern.""" + from compliance.services.obligation_extractor import ObligationExtractor + from compliance.services.pattern_matcher import PatternMatcher + + extractor = ObligationExtractor() + await extractor.initialize() + + obligation = await extractor.extract( + chunk_text=req.text, + regulation_code=req.regulation_code or "", + article=req.article, + paragraph=req.paragraph, + ) + + # Also match to pattern + matcher = PatternMatcher() + matcher._load_patterns() + matcher._build_keyword_index() + + pattern_text = obligation.obligation_text or obligation.obligation_title or req.text[:500] + pattern_result = matcher._tier1_keyword(pattern_text, obligation.regulation_id) + + return ObligationExtractResponse( + obligation_id=obligation.obligation_id, + obligation_title=obligation.obligation_title, + obligation_text=obligation.obligation_text, + method=obligation.method, + confidence=obligation.confidence, + regulation_id=obligation.regulation_id, + pattern_id=pattern_result.pattern_id if pattern_result else None, + pattern_confidence=pattern_result.confidence if pattern_result else 0, + ) + + +# ============================================================================= +# CROSSWALK MATRIX 
ENDPOINTS +# ============================================================================= + + +@router.get("/crosswalk", response_model=CrosswalkQueryResponse) +async def query_crosswalk( + regulation_code: Optional[str] = Query(None), + article: Optional[str] = Query(None), + obligation_id: Optional[str] = Query(None), + pattern_id: Optional[str] = Query(None), + limit: int = Query(100, ge=1, le=1000), + offset: int = Query(0, ge=0), +): + """Query the crosswalk matrix with filters.""" + db = SessionLocal() + try: + conditions = ["1=1"] + params = {"limit": limit, "offset": offset} + + if regulation_code: + conditions.append("regulation_code = :reg") + params["reg"] = regulation_code + if article: + conditions.append("article = :art") + params["art"] = article + if obligation_id: + conditions.append("obligation_id = :obl") + params["obl"] = obligation_id + if pattern_id: + conditions.append("pattern_id = :pat") + params["pat"] = pattern_id + + where = " AND ".join(conditions) + + result = db.execute( + text(f""" + SELECT regulation_code, article, obligation_id, + pattern_id, master_control_id, confidence, source + FROM crosswalk_matrix + WHERE {where} + ORDER BY regulation_code, article + LIMIT :limit OFFSET :offset + """), + params, + ) + rows = result.fetchall() + + count_result = db.execute( + text(f"SELECT count(*) FROM crosswalk_matrix WHERE {where}"), + params, + ) + total = count_result.fetchone()[0] + + crosswalk_rows = [ + CrosswalkRow( + regulation_code=r[0] or "", + article=r[1], + obligation_id=r[2], + pattern_id=r[3], + master_control_id=r[4], + confidence=float(r[5] or 0), + source=r[6] or "auto", + ) + for r in rows + ] + + return CrosswalkQueryResponse(rows=crosswalk_rows, total=total) + finally: + db.close() + + +@router.get("/crosswalk/stats", response_model=CrosswalkStatsResponse) +async def crosswalk_stats(): + """Get crosswalk coverage statistics.""" + db = SessionLocal() + try: + row = db.execute(text(""" + SELECT + count(*) AS total, + 
count(DISTINCT regulation_code) FILTER (WHERE regulation_code != '') AS regs, + count(DISTINCT obligation_id) FILTER (WHERE obligation_id IS NOT NULL) AS obls, + count(DISTINCT pattern_id) FILTER (WHERE pattern_id IS NOT NULL) AS pats, + count(DISTINCT master_control_id) FILTER (WHERE master_control_id IS NOT NULL) AS ctrls + FROM crosswalk_matrix + """)).fetchone() + + # Coverage by regulation + reg_rows = db.execute(text(""" + SELECT regulation_code, count(*) AS cnt + FROM crosswalk_matrix + WHERE regulation_code != '' + GROUP BY regulation_code + ORDER BY cnt DESC + """)).fetchall() + + coverage = {r[0]: r[1] for r in reg_rows} + + return CrosswalkStatsResponse( + total_rows=row[0], + regulations_covered=row[1], + obligations_linked=row[2], + patterns_used=row[3], + controls_linked=row[4], + coverage_by_regulation=coverage, + ) + finally: + db.close() + + +# ============================================================================= +# MIGRATION ENDPOINTS +# ============================================================================= + + +@router.post("/migrate/decompose", response_model=MigrationResponse) +async def migrate_decompose(req: MigrationRequest): + """Pass 0a: Extract obligation candidates from rich controls.""" + from compliance.services.decomposition_pass import DecompositionPass + + db = SessionLocal() + try: + decomp = DecompositionPass(db=db) + stats = await decomp.run_pass0a(limit=req.limit) + return MigrationResponse(status="completed", stats=stats) + except Exception as e: + logger.error("Decomposition pass 0a failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +@router.post("/migrate/compose-atomic", response_model=MigrationResponse) +async def migrate_compose_atomic(req: MigrationRequest): + """Pass 0b: Compose atomic controls from obligation candidates.""" + from compliance.services.decomposition_pass import DecompositionPass + + db = SessionLocal() + try: + decomp = DecompositionPass(db=db) 
+ stats = await decomp.run_pass0b(limit=req.limit) + return MigrationResponse(status="completed", stats=stats) + except Exception as e: + logger.error("Decomposition pass 0b failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +@router.post("/migrate/link-obligations", response_model=MigrationResponse) +async def migrate_link_obligations(req: MigrationRequest): + """Pass 1: Link controls to obligations via source_citation article.""" + from compliance.services.pipeline_adapter import MigrationPasses + + db = SessionLocal() + try: + migration = MigrationPasses(db=db) + await migration.initialize() + stats = await migration.run_pass1_obligation_linkage(limit=req.limit) + return MigrationResponse(status="completed", stats=stats) + except Exception as e: + logger.error("Migration pass 1 failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +@router.post("/migrate/classify-patterns", response_model=MigrationResponse) +async def migrate_classify_patterns(req: MigrationRequest): + """Pass 2: Classify controls into patterns via keyword matching.""" + from compliance.services.pipeline_adapter import MigrationPasses + + db = SessionLocal() + try: + migration = MigrationPasses(db=db) + await migration.initialize() + stats = await migration.run_pass2_pattern_classification(limit=req.limit) + return MigrationResponse(status="completed", stats=stats) + except Exception as e: + logger.error("Migration pass 2 failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +@router.post("/migrate/triage", response_model=MigrationResponse) +async def migrate_triage(): + """Pass 3: Quality triage — categorize by linkage completeness.""" + from compliance.services.pipeline_adapter import MigrationPasses + + db = SessionLocal() + try: + migration = MigrationPasses(db=db) + stats = migration.run_pass3_quality_triage() + return MigrationResponse(status="completed", 
stats=stats) + except Exception as e: + logger.error("Migration pass 3 failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +@router.post("/migrate/backfill-crosswalk", response_model=MigrationResponse) +async def migrate_backfill_crosswalk(): + """Pass 4: Create crosswalk rows for linked controls.""" + from compliance.services.pipeline_adapter import MigrationPasses + + db = SessionLocal() + try: + migration = MigrationPasses(db=db) + stats = migration.run_pass4_crosswalk_backfill() + return MigrationResponse(status="completed", stats=stats) + except Exception as e: + logger.error("Migration pass 4 failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +@router.post("/migrate/deduplicate", response_model=MigrationResponse) +async def migrate_deduplicate(): + """Pass 5: Mark duplicate controls (same obligation + pattern).""" + from compliance.services.pipeline_adapter import MigrationPasses + + db = SessionLocal() + try: + migration = MigrationPasses(db=db) + stats = migration.run_pass5_deduplication() + return MigrationResponse(status="completed", stats=stats) + except Exception as e: + logger.error("Migration pass 5 failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +@router.get("/migrate/status", response_model=MigrationStatusResponse) +async def migration_status(): + """Get overall migration progress.""" + from compliance.services.pipeline_adapter import MigrationPasses + + db = SessionLocal() + try: + migration = MigrationPasses(db=db) + status = migration.migration_status() + return MigrationStatusResponse(**status) + except Exception as e: + logger.error("Migration status failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +@router.get("/migrate/decomposition-status", response_model=DecompositionStatusResponse) +async def decomposition_status(): + """Get decomposition progress 
(Pass 0a/0b).""" + from compliance.services.decomposition_pass import DecompositionPass + + db = SessionLocal() + try: + decomp = DecompositionPass(db=db) + status = decomp.decomposition_status() + return DecompositionStatusResponse(**status) + except Exception as e: + logger.error("Decomposition status failed: %s", e) + raise HTTPException(status_code=500, detail=str(e)) + finally: + db.close() + + +# ============================================================================= +# HELPERS +# ============================================================================= + + +def _get_pattern_control_counts() -> dict[str, int]: + """Get count of controls per pattern_id from DB.""" + db = SessionLocal() + try: + result = db.execute(text(""" + SELECT pattern_id, count(*) AS cnt + FROM canonical_controls + WHERE pattern_id IS NOT NULL AND pattern_id != '' + AND release_state NOT IN ('deprecated') + GROUP BY pattern_id + """)) + return {row[0]: row[1] for row in result.fetchall()} + except Exception: + return {} + finally: + db.close() diff --git a/backend-compliance/compliance/services/control_composer.py b/backend-compliance/compliance/services/control_composer.py new file mode 100644 index 0000000..41cf1e4 --- /dev/null +++ b/backend-compliance/compliance/services/control_composer.py @@ -0,0 +1,546 @@ +"""Control Composer — Pattern + Obligation → Master Control. + +Takes an obligation (from ObligationExtractor) and a matched control pattern +(from PatternMatcher), then uses LLM to compose a structured, actionable +Master Control. Replaces the old Stage 3 (STRUCTURE/REFORM) with a +pattern-guided approach. 
+ +Three composition modes based on license rules: + Rule 1: Obligation + Pattern + original text → full control + Rule 2: Obligation + Pattern + original text + citation → control + Rule 3: Obligation + Pattern (NO original text) → reformulated control + +Fallback: No pattern match → basic generation (tagged needs_pattern_assignment) + +Part of the Multi-Layer Control Architecture (Phase 6 of 8). +""" + +import json +import logging +import os +from dataclasses import dataclass, field +from typing import Optional + +from compliance.services.obligation_extractor import ( + ObligationMatch, + _llm_ollama, + _parse_json, +) +from compliance.services.pattern_matcher import ( + ControlPattern, + PatternMatchResult, +) + +logger = logging.getLogger(__name__) + +OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b") + +# Valid values for generated control fields +VALID_SEVERITIES = {"low", "medium", "high", "critical"} +VALID_EFFORTS = {"s", "m", "l", "xl"} +VALID_VERIFICATION = {"code_review", "document", "tool", "hybrid"} + + +@dataclass +class ComposedControl: + """A Master Control composed from an obligation + pattern.""" + + # Core fields (match canonical_controls schema) + control_id: str = "" + title: str = "" + objective: str = "" + rationale: str = "" + scope: dict = field(default_factory=dict) + requirements: list = field(default_factory=list) + test_procedure: list = field(default_factory=list) + evidence: list = field(default_factory=list) + severity: str = "medium" + risk_score: float = 5.0 + implementation_effort: str = "m" + open_anchors: list = field(default_factory=list) + release_state: str = "draft" + tags: list = field(default_factory=list) + # 3-Rule License fields + license_rule: Optional[int] = None + source_original_text: Optional[str] = None + source_citation: Optional[dict] = None + customer_visible: bool = True + # Classification + verification_method: Optional[str] = None + category: Optional[str] = None + target_audience: 
Optional[list] = None + # Pattern + Obligation linkage + pattern_id: Optional[str] = None + obligation_ids: list = field(default_factory=list) + # Metadata + generation_metadata: dict = field(default_factory=dict) + composition_method: str = "pattern_guided" # pattern_guided | fallback + + def to_dict(self) -> dict: + """Serialize for DB storage or API response.""" + return { + "control_id": self.control_id, + "title": self.title, + "objective": self.objective, + "rationale": self.rationale, + "scope": self.scope, + "requirements": self.requirements, + "test_procedure": self.test_procedure, + "evidence": self.evidence, + "severity": self.severity, + "risk_score": self.risk_score, + "implementation_effort": self.implementation_effort, + "open_anchors": self.open_anchors, + "release_state": self.release_state, + "tags": self.tags, + "license_rule": self.license_rule, + "source_original_text": self.source_original_text, + "source_citation": self.source_citation, + "customer_visible": self.customer_visible, + "verification_method": self.verification_method, + "category": self.category, + "target_audience": self.target_audience, + "pattern_id": self.pattern_id, + "obligation_ids": self.obligation_ids, + "generation_metadata": self.generation_metadata, + "composition_method": self.composition_method, + } + + +class ControlComposer: + """Composes Master Controls from obligations + patterns. + + Usage:: + + composer = ControlComposer() + + control = await composer.compose( + obligation=obligation_match, + pattern_result=pattern_match_result, + chunk_text="...", + license_rule=1, + source_citation={...}, + ) + """ + + async def compose( + self, + obligation: ObligationMatch, + pattern_result: PatternMatchResult, + chunk_text: Optional[str] = None, + license_rule: int = 3, + source_citation: Optional[dict] = None, + regulation_code: Optional[str] = None, + ) -> ComposedControl: + """Compose a Master Control from obligation + pattern. 
+ + Args: + obligation: The extracted obligation (from ObligationExtractor). + pattern_result: The matched pattern (from PatternMatcher). + chunk_text: Original RAG chunk text (only used for Rules 1-2). + license_rule: 1=free, 2=citation, 3=restricted. + source_citation: Citation metadata for Rule 2. + regulation_code: Source regulation code. + + Returns: + ComposedControl ready for storage. + """ + pattern = pattern_result.pattern if pattern_result else None + + if pattern: + control = await self._compose_with_pattern( + obligation, pattern, chunk_text, license_rule, source_citation, + ) + else: + control = await self._compose_fallback( + obligation, chunk_text, license_rule, source_citation, + ) + + # Set linkage fields + control.pattern_id = pattern.id if pattern else None + if obligation.obligation_id: + control.obligation_ids = [obligation.obligation_id] + + # Set license fields + control.license_rule = license_rule + if license_rule in (1, 2) and chunk_text: + control.source_original_text = chunk_text + if license_rule == 2 and source_citation: + control.source_citation = source_citation + if license_rule == 3: + control.customer_visible = False + control.source_original_text = None + control.source_citation = None + + # Build metadata + control.generation_metadata = { + "composition_method": control.composition_method, + "pattern_id": control.pattern_id, + "pattern_confidence": round(pattern_result.confidence, 3) if pattern_result else 0, + "pattern_method": pattern_result.method if pattern_result else "none", + "obligation_id": obligation.obligation_id, + "obligation_method": obligation.method, + "obligation_confidence": round(obligation.confidence, 3), + "license_rule": license_rule, + "regulation_code": regulation_code, + } + + # Validate and fix fields + _validate_control(control) + + return control + + async def compose_batch( + self, + items: list[dict], + ) -> list[ComposedControl]: + """Compose multiple controls. 
+ + Args: + items: List of dicts with keys: obligation, pattern_result, + chunk_text, license_rule, source_citation, regulation_code. + + Returns: + List of ComposedControl instances. + """ + results = [] + for item in items: + control = await self.compose( + obligation=item["obligation"], + pattern_result=item.get("pattern_result", PatternMatchResult()), + chunk_text=item.get("chunk_text"), + license_rule=item.get("license_rule", 3), + source_citation=item.get("source_citation"), + regulation_code=item.get("regulation_code"), + ) + results.append(control) + return results + + # ----------------------------------------------------------------------- + # Pattern-guided composition + # ----------------------------------------------------------------------- + + async def _compose_with_pattern( + self, + obligation: ObligationMatch, + pattern: ControlPattern, + chunk_text: Optional[str], + license_rule: int, + source_citation: Optional[dict], + ) -> ComposedControl: + """Use LLM to fill the pattern template with obligation-specific details.""" + prompt = _build_compose_prompt(obligation, pattern, chunk_text, license_rule) + system_prompt = _compose_system_prompt(license_rule) + + llm_result = await _llm_ollama(prompt, system_prompt) + if not llm_result: + return self._compose_from_template(obligation, pattern) + + parsed = _parse_json(llm_result) + if not parsed: + return self._compose_from_template(obligation, pattern) + + control = ComposedControl( + title=parsed.get("title", pattern.name_de)[:255], + objective=parsed.get("objective", pattern.objective_template), + rationale=parsed.get("rationale", pattern.rationale_template), + requirements=_ensure_list(parsed.get("requirements", pattern.requirements_template)), + test_procedure=_ensure_list(parsed.get("test_procedure", pattern.test_procedure_template)), + evidence=_ensure_list(parsed.get("evidence", pattern.evidence_template)), + severity=parsed.get("severity", pattern.severity_default), + 
implementation_effort=parsed.get("implementation_effort", pattern.implementation_effort_default), + category=parsed.get("category", pattern.category), + tags=_ensure_list(parsed.get("tags", pattern.tags)), + target_audience=_ensure_list(parsed.get("target_audience", [])), + verification_method=parsed.get("verification_method"), + open_anchors=_anchors_from_pattern(pattern), + composition_method="pattern_guided", + ) + + return control + + def _compose_from_template( + self, + obligation: ObligationMatch, + pattern: ControlPattern, + ) -> ComposedControl: + """Fallback: fill template directly without LLM (when LLM fails).""" + obl_title = obligation.obligation_title or "" + obl_text = obligation.obligation_text or "" + + title = f"{pattern.name_de}" + if obl_title: + title = f"{pattern.name_de} — {obl_title}" + + objective = pattern.objective_template + if obl_text and len(obl_text) > 20: + objective = f"{pattern.objective_template} Bezug: {obl_text[:200]}" + + return ComposedControl( + title=title[:255], + objective=objective, + rationale=pattern.rationale_template, + requirements=list(pattern.requirements_template), + test_procedure=list(pattern.test_procedure_template), + evidence=list(pattern.evidence_template), + severity=pattern.severity_default, + implementation_effort=pattern.implementation_effort_default, + category=pattern.category, + tags=list(pattern.tags), + open_anchors=_anchors_from_pattern(pattern), + composition_method="template_only", + ) + + # ----------------------------------------------------------------------- + # Fallback (no pattern) + # ----------------------------------------------------------------------- + + async def _compose_fallback( + self, + obligation: ObligationMatch, + chunk_text: Optional[str], + license_rule: int, + source_citation: Optional[dict], + ) -> ComposedControl: + """Generate a control without a pattern template (old-style).""" + prompt = _build_fallback_prompt(obligation, chunk_text, license_rule) + system_prompt = 
_compose_system_prompt(license_rule) + + llm_result = await _llm_ollama(prompt, system_prompt) + parsed = _parse_json(llm_result) if llm_result else {} + + obl_text = obligation.obligation_text or "" + + control = ComposedControl( + title=parsed.get("title", obl_text[:100] if obl_text else "Untitled Control")[:255], + objective=parsed.get("objective", obl_text[:500]), + rationale=parsed.get("rationale", "Aus gesetzlicher Pflicht abgeleitet."), + requirements=_ensure_list(parsed.get("requirements", [])), + test_procedure=_ensure_list(parsed.get("test_procedure", [])), + evidence=_ensure_list(parsed.get("evidence", [])), + severity=parsed.get("severity", "medium"), + implementation_effort=parsed.get("implementation_effort", "m"), + category=parsed.get("category"), + tags=_ensure_list(parsed.get("tags", [])), + target_audience=_ensure_list(parsed.get("target_audience", [])), + verification_method=parsed.get("verification_method"), + composition_method="fallback", + release_state="needs_review", + ) + + return control + + +# --------------------------------------------------------------------------- +# Prompt builders +# --------------------------------------------------------------------------- + + +def _compose_system_prompt(license_rule: int) -> str: + """Build the system prompt based on license rule.""" + if license_rule == 3: + return ( + "Du bist ein Security-Compliance-Experte. Deine Aufgabe ist es, " + "eigenstaendige Security Controls zu formulieren. " + "Du formulierst IMMER in eigenen Worten. " + "KOPIERE KEINE Saetze aus dem Quelltext. " + "Verwende eigene Begriffe und Struktur. " + "NENNE NICHT die Quelle. Keine proprietaeren Bezeichner. " + "Antworte NUR mit validem JSON." + ) + return ( + "Du bist ein Security-Compliance-Experte. " + "Erstelle ein praxisorientiertes, umsetzbares Security Control. " + "Antworte NUR mit validem JSON." 
+ ) + + +def _build_compose_prompt( + obligation: ObligationMatch, + pattern: ControlPattern, + chunk_text: Optional[str], + license_rule: int, +) -> str: + """Build the LLM prompt for pattern-guided composition.""" + obl_section = _obligation_section(obligation) + pattern_section = _pattern_section(pattern) + + if license_rule == 3: + context_section = "KONTEXT: Intern analysiert (keine Quellenangabe)." + elif chunk_text: + context_section = f"KONTEXT (Originaltext):\n{chunk_text[:2000]}" + else: + context_section = "KONTEXT: Kein Originaltext verfuegbar." + + return f"""Erstelle ein PRAXISORIENTIERTES Security Control. + +{obl_section} + +{pattern_section} + +{context_section} + +AUFGABE: +Fuelle das Muster mit pflicht-spezifischen Details. +Das Ergebnis muss UMSETZBAR sein — keine Gesetzesparaphrase. +Formuliere konkret und handlungsorientiert. + +Antworte als JSON: +{{ + "title": "Kurzer praegnanter Titel (max 100 Zeichen, deutsch)", + "objective": "Was soll erreicht werden? (1-3 Saetze)", + "rationale": "Warum ist das wichtig? (1-2 Saetze)", + "requirements": ["Konkrete Anforderung 1", "Anforderung 2", ...], + "test_procedure": ["Pruefschritt 1", "Pruefschritt 2", ...], + "evidence": ["Nachweis 1", "Nachweis 2", ...], + "severity": "low|medium|high|critical", + "implementation_effort": "s|m|l|xl", + "category": "{pattern.category}", + "tags": ["tag1", "tag2"], + "target_audience": ["unternehmen", "behoerden", "entwickler"], + "verification_method": "code_review|document|tool|hybrid" +}}""" + + +def _build_fallback_prompt( + obligation: ObligationMatch, + chunk_text: Optional[str], + license_rule: int, +) -> str: + """Build the LLM prompt for fallback composition (no pattern).""" + obl_section = _obligation_section(obligation) + + if license_rule == 3: + context_section = "KONTEXT: Intern analysiert (keine Quellenangabe)." 
+ elif chunk_text: + context_section = f"KONTEXT (Originaltext):\n{chunk_text[:2000]}" + else: + context_section = "KONTEXT: Kein Originaltext verfuegbar." + + return f"""Erstelle ein Security Control aus der folgenden Pflicht. + +{obl_section} + +{context_section} + +AUFGABE: +Formuliere ein umsetzbares Security Control. +Keine Gesetzesparaphrase — konkrete Massnahmen beschreiben. + +Antworte als JSON: +{{ + "title": "Kurzer praegnanter Titel (max 100 Zeichen, deutsch)", + "objective": "Was soll erreicht werden? (1-3 Saetze)", + "rationale": "Warum ist das wichtig? (1-2 Saetze)", + "requirements": ["Konkrete Anforderung 1", "Anforderung 2", ...], + "test_procedure": ["Pruefschritt 1", "Pruefschritt 2", ...], + "evidence": ["Nachweis 1", "Nachweis 2", ...], + "severity": "low|medium|high|critical", + "implementation_effort": "s|m|l|xl", + "category": "one of: authentication, encryption, data_protection, etc.", + "tags": ["tag1", "tag2"], + "target_audience": ["unternehmen"], + "verification_method": "code_review|document|tool|hybrid" +}}""" + + +def _obligation_section(obligation: ObligationMatch) -> str: + """Format the obligation for the prompt.""" + parts = ["PFLICHT (was das Gesetz verlangt):"] + if obligation.obligation_title: + parts.append(f" Titel: {obligation.obligation_title}") + if obligation.obligation_text: + parts.append(f" Beschreibung: {obligation.obligation_text[:500]}") + if obligation.obligation_id: + parts.append(f" ID: {obligation.obligation_id}") + if obligation.regulation_id: + parts.append(f" Rechtsgrundlage: {obligation.regulation_id}") + if not obligation.obligation_text and not obligation.obligation_title: + parts.append(" (Keine spezifische Pflicht extrahiert)") + return "\n".join(parts) + + +def _pattern_section(pattern: ControlPattern) -> str: + """Format the pattern for the prompt.""" + reqs = "\n ".join(f"- {r}" for r in pattern.requirements_template[:5]) + tests = "\n ".join(f"- {t}" for t in pattern.test_procedure_template[:3]) + 
return f"""MUSTER (wie man es typischerweise umsetzt): + Pattern: {pattern.name_de} ({pattern.id}) + Domain: {pattern.domain} + Ziel-Template: {pattern.objective_template} + Anforderungs-Template: + {reqs} + Pruefverfahren-Template: + {tests}""" + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _ensure_list(value) -> list: + """Ensure a value is a list of strings.""" + if isinstance(value, list): + return [str(v) for v in value if v] + if isinstance(value, str): + return [value] + return [] + + +def _anchors_from_pattern(pattern: ControlPattern) -> list: + """Convert pattern's open_anchor_refs to control anchor format.""" + anchors = [] + for ref in pattern.open_anchor_refs: + anchors.append({ + "framework": ref.get("framework", ""), + "control_id": ref.get("ref", ""), + "title": "", + "alignment_score": 0.8, + }) + return anchors + + +def _validate_control(control: ComposedControl) -> None: + """Validate and fix control field values.""" + # Severity + if control.severity not in VALID_SEVERITIES: + control.severity = "medium" + + # Implementation effort + if control.implementation_effort not in VALID_EFFORTS: + control.implementation_effort = "m" + + # Verification method + if control.verification_method and control.verification_method not in VALID_VERIFICATION: + control.verification_method = None + + # Risk score + if not (0 <= control.risk_score <= 10): + control.risk_score = _severity_to_risk(control.severity) + + # Title length + if len(control.title) > 255: + control.title = control.title[:252] + "..." + + # Ensure minimum content + if not control.objective: + control.objective = control.title + if not control.rationale: + control.rationale = "Aus regulatorischer Anforderung abgeleitet." 
+ if not control.requirements: + control.requirements = ["Anforderung gemaess Pflichtbeschreibung umsetzen"] + if not control.test_procedure: + control.test_procedure = ["Umsetzung der Anforderungen pruefen"] + if not control.evidence: + control.evidence = ["Dokumentation der Umsetzung"] + + +def _severity_to_risk(severity: str) -> float: + """Map severity to a default risk score.""" + return { + "critical": 9.0, + "high": 7.0, + "medium": 5.0, + "low": 3.0, + }.get(severity, 5.0) diff --git a/backend-compliance/compliance/services/decomposition_pass.py b/backend-compliance/compliance/services/decomposition_pass.py new file mode 100644 index 0000000..2f9ce4b --- /dev/null +++ b/backend-compliance/compliance/services/decomposition_pass.py @@ -0,0 +1,854 @@ +"""Decomposition Pass — Split Rich Controls into Atomic Controls. + +Pass 0 of the Multi-Layer Control Architecture migration. Runs BEFORE +Passes 1-5 (obligation linkage, pattern classification, etc.). + +Two sub-passes: + Pass 0a: Obligation Extraction — extract individual normative obligations + from a Rich Control using LLM with strict guardrails. + Pass 0b: Atomic Control Composition — turn each obligation candidate + into a standalone atomic control record. + +Plus a Quality Gate that validates extraction results. + +Guardrails (the 6 rules): + 1. Only normative statements (müssen, sicherzustellen, verpflichtet, ...) + 2. One main verb per obligation + 3. Test obligations separate from operational obligations + 4. Reporting obligations separate + 5. Don't split at evidence level + 6. 
Parent link always preserved +""" + +import json +import logging +import re +import uuid +from dataclasses import dataclass, field +from typing import Optional + +from sqlalchemy import text +from sqlalchemy.orm import Session + +logger = logging.getLogger(__name__) + + +# --------------------------------------------------------------------------- +# Normative signal detection (Rule 1) +# --------------------------------------------------------------------------- + +_NORMATIVE_SIGNALS = [ + r"\bmüssen\b", r"\bmuss\b", r"\bhat\s+sicherzustellen\b", + r"\bhaben\s+sicherzustellen\b", r"\bsind\s+verpflichtet\b", + r"\bist\s+verpflichtet\b", r"\bist\s+zu\s+\w+en\b", + r"\bsind\s+zu\s+\w+en\b", r"\bhat\s+zu\s+\w+en\b", + r"\bhaben\s+zu\s+\w+en\b", r"\bsoll\b", r"\bsollen\b", + r"\bgewährleisten\b", r"\bsicherstellen\b", + r"\bshall\b", r"\bmust\b", r"\brequired\b", + r"\bshould\b", r"\bensure\b", +] +_NORMATIVE_RE = re.compile("|".join(_NORMATIVE_SIGNALS), re.IGNORECASE) + +_RATIONALE_SIGNALS = [ + r"\bda\s+", r"\bweil\b", r"\bgrund\b", r"\berwägung", + r"\bbecause\b", r"\breason\b", r"\brationale\b", + r"\bkönnen\s+.*\s+verursachen\b", r"\bführt\s+zu\b", +] +_RATIONALE_RE = re.compile("|".join(_RATIONALE_SIGNALS), re.IGNORECASE) + +_TEST_SIGNALS = [ + r"\btesten\b", r"\btest\b", r"\bprüfung\b", r"\bprüfen\b", + r"\bgetestet\b", r"\bwirksamkeit\b", r"\baudit\b", + r"\bregelmäßig\b.*\b(prüf|test|kontroll)", + r"\beffectiveness\b", r"\bverif", +] +_TEST_RE = re.compile("|".join(_TEST_SIGNALS), re.IGNORECASE) + +_REPORTING_SIGNALS = [ + r"\bmelden\b", r"\bmeldung\b", r"\bunterricht", + r"\binformieren\b", r"\bbenachricht", r"\bnotif", + r"\breport\b", r"\bbehörd", +] +_REPORTING_RE = re.compile("|".join(_REPORTING_SIGNALS), re.IGNORECASE) + + +# --------------------------------------------------------------------------- +# Data classes +# --------------------------------------------------------------------------- + + +@dataclass +class ObligationCandidate: + """A single 
normative obligation extracted from a Rich Control.""" + + candidate_id: str = "" + parent_control_uuid: str = "" + obligation_text: str = "" + action: str = "" + object_: str = "" + condition: Optional[str] = None + normative_strength: str = "must" + is_test_obligation: bool = False + is_reporting_obligation: bool = False + extraction_confidence: float = 0.0 + quality_flags: dict = field(default_factory=dict) + release_state: str = "extracted" + + def to_dict(self) -> dict: + return { + "candidate_id": self.candidate_id, + "parent_control_uuid": self.parent_control_uuid, + "obligation_text": self.obligation_text, + "action": self.action, + "object": self.object_, + "condition": self.condition, + "normative_strength": self.normative_strength, + "is_test_obligation": self.is_test_obligation, + "is_reporting_obligation": self.is_reporting_obligation, + "extraction_confidence": self.extraction_confidence, + "quality_flags": self.quality_flags, + "release_state": self.release_state, + } + + +@dataclass +class AtomicControlCandidate: + """An atomic control composed from a single ObligationCandidate.""" + + candidate_id: str = "" + parent_control_uuid: str = "" + obligation_candidate_id: str = "" + title: str = "" + objective: str = "" + requirements: list = field(default_factory=list) + test_procedure: list = field(default_factory=list) + evidence: list = field(default_factory=list) + severity: str = "medium" + category: str = "" + domain: str = "" + source_regulation: str = "" + source_article: str = "" + + def to_dict(self) -> dict: + return { + "candidate_id": self.candidate_id, + "parent_control_uuid": self.parent_control_uuid, + "obligation_candidate_id": self.obligation_candidate_id, + "title": self.title, + "objective": self.objective, + "requirements": self.requirements, + "test_procedure": self.test_procedure, + "evidence": self.evidence, + "severity": self.severity, + "category": self.category, + "domain": self.domain, + } + + +# 
--------------------------------------------------------------------------- +# Quality Gate +# --------------------------------------------------------------------------- + + +def quality_gate(candidate: ObligationCandidate) -> dict: + """Validate an obligation candidate. Returns quality flags dict. + + Checks: + has_normative_signal: text contains normative language + single_action: only one main action (heuristic) + not_rationale: not just a justification/reasoning + not_evidence_only: not just an evidence requirement + min_length: text is long enough to be meaningful + has_parent_link: references back to parent control + """ + txt = candidate.obligation_text + flags = {} + + # 1. Normative signal + flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(txt)) + + # 2. Single action heuristic — count "und" / "and" / "sowie" splits + # that connect different verbs (imperfect but useful) + multi_verb_re = re.compile( + r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren" + r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b", + re.IGNORECASE, + ) + flags["single_action"] = not bool(multi_verb_re.search(txt)) + + # 3. Not rationale + normative_count = len(_NORMATIVE_RE.findall(txt)) + rationale_count = len(_RATIONALE_RE.findall(txt)) + flags["not_rationale"] = normative_count >= rationale_count + + # 4. Not evidence-only (evidence fragments are typically short noun phrases) + evidence_only_re = re.compile( + r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)", + re.IGNORECASE, + ) + flags["not_evidence_only"] = not bool(evidence_only_re.match(txt.strip())) + + # 5. Min length + flags["min_length"] = len(txt.strip()) >= 20 + + # 6. 
Parent link + flags["has_parent_link"] = bool(candidate.parent_control_uuid) + + return flags + + +def passes_quality_gate(flags: dict) -> bool: + """Check if all critical quality flags pass.""" + critical = ["has_normative_signal", "not_evidence_only", "min_length", "has_parent_link"] + return all(flags.get(k, False) for k in critical) + + +# --------------------------------------------------------------------------- +# LLM Prompts +# --------------------------------------------------------------------------- + + +_PASS0A_SYSTEM_PROMPT = """\ +Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \ +in einzelne atomare Pflichten. + +REGELN (STRIKT EINHALTEN): +1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \ +sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \ +ist zu testen, shall, must, required. +2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung. +3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true). +4. Meldepflichten SEPARAT (is_reporting_obligation=true). +5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \ +eigenes Control, sondern Evidence). +6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \ +— NICHT extrahieren. + +Antworte NUR mit einem JSON-Array. Keine Erklärungen.""" + + +def _build_pass0a_prompt( + title: str, objective: str, requirements: str, + test_procedure: str, source_ref: str +) -> str: + return f"""\ +Analysiere das folgende Control und extrahiere alle einzelnen normativen \ +Pflichten als JSON-Array. 
+ +CONTROL: +Titel: {title} +Ziel: {objective} +Anforderungen: {requirements} +Prüfverfahren: {test_procedure} +Quellreferenz: {source_ref} + +Antworte als JSON-Array: +[ + {{ + "obligation_text": "Kurze, präzise Formulierung der Pflicht", + "action": "Hauptverb/Handlung", + "object": "Gegenstand der Pflicht", + "condition": "Auslöser/Bedingung oder null", + "normative_strength": "must", + "is_test_obligation": false, + "is_reporting_obligation": false + }} +]""" + + +_PASS0B_SYSTEM_PROMPT = """\ +Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \ +normativen Pflicht ein praxisorientiertes, atomares Security Control. + +Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase. +Antworte NUR als JSON. Keine Erklärungen.""" + + +def _build_pass0b_prompt( + obligation_text: str, action: str, object_: str, + parent_title: str, parent_category: str, source_ref: str, +) -> str: + return f"""\ +Erstelle aus der folgenden Pflicht ein atomares Control. + +PFLICHT: {obligation_text} +HANDLUNG: {action} +GEGENSTAND: {object_} + +KONTEXT (Ursprungs-Control): +Titel: {parent_title} +Kategorie: {parent_category} +Quellreferenz: {source_ref} + +Antworte als JSON: +{{ + "title": "Kurzer Titel (max 80 Zeichen, deutsch)", + "objective": "Was muss erreicht werden? 
(1-2 Sätze)", + "requirements": ["Konkrete Anforderung 1", "Anforderung 2"], + "test_procedure": ["Prüfschritt 1", "Prüfschritt 2"], + "evidence": ["Nachweis 1", "Nachweis 2"], + "severity": "critical|high|medium|low", + "category": "security|privacy|governance|operations|finance|reporting" +}}""" + + +# --------------------------------------------------------------------------- +# Parse helpers +# --------------------------------------------------------------------------- + + +def _parse_json_array(text: str) -> list[dict]: + """Extract a JSON array from LLM response text.""" + # Try direct parse + try: + result = json.loads(text) + if isinstance(result, list): + return result + if isinstance(result, dict): + return [result] + except json.JSONDecodeError: + pass + + # Try extracting JSON array block + match = re.search(r"\[[\s\S]*\]", text) + if match: + try: + result = json.loads(match.group()) + if isinstance(result, list): + return result + except json.JSONDecodeError: + pass + + return [] + + +def _parse_json_object(text: str) -> dict: + """Extract a JSON object from LLM response text.""" + try: + result = json.loads(text) + if isinstance(result, dict): + return result + except json.JSONDecodeError: + pass + + match = re.search(r"\{[\s\S]*\}", text) + if match: + try: + result = json.loads(match.group()) + if isinstance(result, dict): + return result + except json.JSONDecodeError: + pass + + return {} + + +def _ensure_list(val) -> list: + """Ensure value is a list.""" + if isinstance(val, list): + return val + if isinstance(val, str): + return [val] if val else [] + return [] + + +# --------------------------------------------------------------------------- +# Decomposition Pass +# --------------------------------------------------------------------------- + + +class DecompositionPass: + """Pass 0: Decompose Rich Controls into atomic candidates. 
+ + Usage:: + + decomp = DecompositionPass(db=session) + stats_0a = await decomp.run_pass0a(limit=100) + stats_0b = await decomp.run_pass0b(limit=100) + """ + + def __init__(self, db: Session): + self.db = db + + # ------------------------------------------------------------------- + # Pass 0a: Obligation Extraction + # ------------------------------------------------------------------- + + async def run_pass0a(self, limit: int = 0) -> dict: + """Extract obligation candidates from rich controls. + + Processes controls that have NOT been decomposed yet + (no rows in obligation_candidates for that control). + """ + from compliance.services.obligation_extractor import _llm_ollama + + # Find rich controls not yet decomposed + query = """ + SELECT cc.id, cc.control_id, cc.title, cc.objective, + cc.requirements, cc.test_procedure, + cc.source_citation, cc.category + FROM canonical_controls cc + WHERE cc.release_state NOT IN ('deprecated') + AND cc.parent_control_uuid IS NULL + AND NOT EXISTS ( + SELECT 1 FROM obligation_candidates oc + WHERE oc.parent_control_uuid = cc.id + ) + ORDER BY cc.created_at + """ + if limit > 0: + query += f" LIMIT {limit}" + + rows = self.db.execute(text(query)).fetchall() + + stats = { + "controls_processed": 0, + "obligations_extracted": 0, + "obligations_validated": 0, + "obligations_rejected": 0, + "controls_skipped_empty": 0, + "errors": 0, + } + + for row in rows: + control_uuid = str(row[0]) + control_id = row[1] or "" + title = row[2] or "" + objective = row[3] or "" + requirements = row[4] or "" + test_procedure = row[5] or "" + source_citation = row[6] or "" + category = row[7] or "" + + # Format requirements/test_procedure if JSON + req_str = _format_field(requirements) + test_str = _format_field(test_procedure) + source_str = _format_citation(source_citation) + + if not title and not objective and not req_str: + stats["controls_skipped_empty"] += 1 + continue + + try: + prompt = _build_pass0a_prompt( + title=title, + 
objective=objective, + requirements=req_str, + test_procedure=test_str, + source_ref=source_str, + ) + + llm_response = await _llm_ollama( + prompt=prompt, + system_prompt=_PASS0A_SYSTEM_PROMPT, + ) + + raw_obligations = _parse_json_array(llm_response) + + if not raw_obligations: + # Fallback: treat the whole control as one obligation + raw_obligations = [{ + "obligation_text": objective or title, + "action": "sicherstellen", + "object": title, + "condition": None, + "normative_strength": "must", + "is_test_obligation": False, + "is_reporting_obligation": False, + }] + + for idx, raw in enumerate(raw_obligations): + cand = ObligationCandidate( + candidate_id=f"OC-{control_id}-{idx + 1:02d}", + parent_control_uuid=control_uuid, + obligation_text=raw.get("obligation_text", ""), + action=raw.get("action", ""), + object_=raw.get("object", ""), + condition=raw.get("condition"), + normative_strength=raw.get("normative_strength", "must"), + is_test_obligation=bool(raw.get("is_test_obligation", False)), + is_reporting_obligation=bool(raw.get("is_reporting_obligation", False)), + ) + + # Auto-detect test/reporting if LLM missed it + if not cand.is_test_obligation and _TEST_RE.search(cand.obligation_text): + cand.is_test_obligation = True + if not cand.is_reporting_obligation and _REPORTING_RE.search(cand.obligation_text): + cand.is_reporting_obligation = True + + # Quality gate + flags = quality_gate(cand) + cand.quality_flags = flags + cand.extraction_confidence = _compute_extraction_confidence(flags) + + if passes_quality_gate(flags): + cand.release_state = "validated" + stats["obligations_validated"] += 1 + else: + cand.release_state = "rejected" + stats["obligations_rejected"] += 1 + + # Write to DB + self._write_obligation_candidate(cand) + stats["obligations_extracted"] += 1 + + stats["controls_processed"] += 1 + + except Exception as e: + logger.error("Pass 0a failed for %s: %s", control_id, e) + stats["errors"] += 1 + + self.db.commit() + logger.info("Pass 0a: %s", 
stats) + return stats + + # ------------------------------------------------------------------- + # Pass 0b: Atomic Control Composition + # ------------------------------------------------------------------- + + async def run_pass0b(self, limit: int = 0) -> dict: + """Compose atomic controls from validated obligation candidates. + + Processes obligation_candidates with release_state='validated' + that don't have a corresponding atomic control yet. + """ + from compliance.services.obligation_extractor import _llm_ollama + + query = """ + SELECT oc.id, oc.candidate_id, oc.parent_control_uuid, + oc.obligation_text, oc.action, oc.object, + oc.is_test_obligation, oc.is_reporting_obligation, + cc.title AS parent_title, + cc.category AS parent_category, + cc.source_citation AS parent_citation, + cc.severity AS parent_severity, + cc.control_id AS parent_control_id + FROM obligation_candidates oc + JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid + WHERE oc.release_state = 'validated' + AND NOT EXISTS ( + SELECT 1 FROM canonical_controls ac + WHERE ac.parent_control_uuid = oc.parent_control_uuid + AND ac.decomposition_method = 'pass0b' + AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%' + ) + """ + if limit > 0: + query += f" LIMIT {limit}" + + rows = self.db.execute(text(query)).fetchall() + + stats = { + "candidates_processed": 0, + "controls_created": 0, + "llm_failures": 0, + "errors": 0, + } + + for row in rows: + oc_id = str(row[0]) + candidate_id = row[1] or "" + parent_uuid = str(row[2]) + obligation_text = row[3] or "" + action = row[4] or "" + object_ = row[5] or "" + is_test = row[6] + is_reporting = row[7] + parent_title = row[8] or "" + parent_category = row[9] or "" + parent_citation = row[10] or "" + parent_severity = row[11] or "medium" + parent_control_id = row[12] or "" + + source_str = _format_citation(parent_citation) + + try: + prompt = _build_pass0b_prompt( + obligation_text=obligation_text, + action=action, + object_=object_, + 
parent_title=parent_title, + parent_category=parent_category, + source_ref=source_str, + ) + + llm_response = await _llm_ollama( + prompt=prompt, + system_prompt=_PASS0B_SYSTEM_PROMPT, + ) + + parsed = _parse_json_object(llm_response) + + if not parsed or not parsed.get("title"): + # Template fallback — no LLM needed + atomic = _template_fallback( + obligation_text=obligation_text, + action=action, + object_=object_, + parent_title=parent_title, + parent_severity=parent_severity, + parent_category=parent_category, + is_test=is_test, + is_reporting=is_reporting, + ) + stats["llm_failures"] += 1 + else: + atomic = AtomicControlCandidate( + title=parsed.get("title", "")[:200], + objective=parsed.get("objective", "")[:2000], + requirements=_ensure_list(parsed.get("requirements", [])), + test_procedure=_ensure_list(parsed.get("test_procedure", [])), + evidence=_ensure_list(parsed.get("evidence", [])), + severity=_normalize_severity(parsed.get("severity", parent_severity)), + category=parsed.get("category", parent_category), + ) + + atomic.parent_control_uuid = parent_uuid + atomic.obligation_candidate_id = candidate_id + + # Generate control_id from parent + seq = self._next_atomic_seq(parent_control_id) + atomic.candidate_id = f"{parent_control_id}-A{seq:02d}" + + # Write to canonical_controls + self._write_atomic_control(atomic, parent_uuid, candidate_id) + + # Mark obligation candidate as composed + self.db.execute( + text(""" + UPDATE obligation_candidates + SET release_state = 'composed' + WHERE id = CAST(:oc_id AS uuid) + """), + {"oc_id": oc_id}, + ) + + stats["controls_created"] += 1 + stats["candidates_processed"] += 1 + + except Exception as e: + logger.error("Pass 0b failed for %s: %s", candidate_id, e) + stats["errors"] += 1 + + self.db.commit() + logger.info("Pass 0b: %s", stats) + return stats + + # ------------------------------------------------------------------- + # Decomposition Status + # 
------------------------------------------------------------------- + + def decomposition_status(self) -> dict: + """Return decomposition progress.""" + row = self.db.execute(text(""" + SELECT + (SELECT count(*) FROM canonical_controls + WHERE parent_control_uuid IS NULL + AND release_state NOT IN ('deprecated')) AS rich_controls, + (SELECT count(DISTINCT parent_control_uuid) FROM obligation_candidates) AS decomposed_controls, + (SELECT count(*) FROM obligation_candidates) AS total_candidates, + (SELECT count(*) FROM obligation_candidates WHERE release_state = 'validated') AS validated, + (SELECT count(*) FROM obligation_candidates WHERE release_state = 'rejected') AS rejected, + (SELECT count(*) FROM obligation_candidates WHERE release_state = 'composed') AS composed, + (SELECT count(*) FROM canonical_controls WHERE parent_control_uuid IS NOT NULL) AS atomic_controls + """)).fetchone() + + return { + "rich_controls": row[0], + "decomposed_controls": row[1], + "total_candidates": row[2], + "validated": row[3], + "rejected": row[4], + "composed": row[5], + "atomic_controls": row[6], + "decomposition_pct": round(row[1] / max(row[0], 1) * 100, 1), + "composition_pct": round(row[5] / max(row[3], 1) * 100, 1), + } + + # ------------------------------------------------------------------- + # DB Writers + # ------------------------------------------------------------------- + + def _write_obligation_candidate(self, cand: ObligationCandidate) -> None: + """Insert an obligation candidate into the DB.""" + self.db.execute( + text(""" + INSERT INTO obligation_candidates ( + parent_control_uuid, candidate_id, + obligation_text, action, object, condition, + normative_strength, is_test_obligation, + is_reporting_obligation, extraction_confidence, + quality_flags, release_state + ) VALUES ( + CAST(:parent_uuid AS uuid), :candidate_id, + :obligation_text, :action, :object, :condition, + :normative_strength, :is_test, :is_reporting, + :confidence, :quality_flags, :release_state + ) 
+ """), + { + "parent_uuid": cand.parent_control_uuid, + "candidate_id": cand.candidate_id, + "obligation_text": cand.obligation_text, + "action": cand.action, + "object": cand.object_, + "condition": cand.condition, + "normative_strength": cand.normative_strength, + "is_test": cand.is_test_obligation, + "is_reporting": cand.is_reporting_obligation, + "confidence": cand.extraction_confidence, + "quality_flags": json.dumps(cand.quality_flags), + "release_state": cand.release_state, + }, + ) + + def _write_atomic_control( + self, atomic: AtomicControlCandidate, + parent_uuid: str, candidate_id: str, + ) -> None: + """Insert an atomic control into canonical_controls.""" + self.db.execute( + text(""" + INSERT INTO canonical_controls ( + control_id, title, objective, requirements, + test_procedure, evidence, severity, category, + release_state, parent_control_uuid, + decomposition_method, + generation_metadata + ) VALUES ( + :control_id, :title, :objective, + :requirements, :test_procedure, :evidence, + :severity, :category, 'draft', + CAST(:parent_uuid AS uuid), 'pass0b', + :gen_meta + ) + """), + { + "control_id": atomic.candidate_id, + "title": atomic.title, + "objective": atomic.objective, + "requirements": json.dumps(atomic.requirements), + "test_procedure": json.dumps(atomic.test_procedure), + "evidence": json.dumps(atomic.evidence), + "severity": atomic.severity, + "category": atomic.category, + "parent_uuid": parent_uuid, + "gen_meta": json.dumps({ + "decomposition_source": candidate_id, + "decomposition_method": "pass0b", + }), + }, + ) + + def _next_atomic_seq(self, parent_control_id: str) -> int: + """Get the next sequence number for atomic controls under a parent.""" + result = self.db.execute( + text(""" + SELECT count(*) FROM canonical_controls + WHERE parent_control_uuid = ( + SELECT id FROM canonical_controls + WHERE control_id = :parent_id + LIMIT 1 + ) + """), + {"parent_id": parent_control_id}, + ).fetchone() + return (result[0] if result else 0) + 1 
+ + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _format_field(value) -> str: + """Format a requirements/test_procedure field for the LLM prompt.""" + if not value: + return "" + if isinstance(value, str): + try: + parsed = json.loads(value) + if isinstance(parsed, list): + return "\n".join(f"- {item}" for item in parsed) + return value + except (json.JSONDecodeError, TypeError): + return value + if isinstance(value, list): + return "\n".join(f"- {item}" for item in value) + return str(value) + + +def _format_citation(citation) -> str: + """Format source_citation for display.""" + if not citation: + return "" + if isinstance(citation, str): + try: + c = json.loads(citation) + if isinstance(c, dict): + parts = [] + if c.get("source"): + parts.append(c["source"]) + if c.get("article"): + parts.append(c["article"]) + if c.get("paragraph"): + parts.append(c["paragraph"]) + return " ".join(parts) if parts else citation + except (json.JSONDecodeError, TypeError): + return citation + return str(citation) + + +def _compute_extraction_confidence(flags: dict) -> float: + """Compute confidence score from quality flags.""" + score = 0.0 + weights = { + "has_normative_signal": 0.30, + "single_action": 0.20, + "not_rationale": 0.20, + "not_evidence_only": 0.15, + "min_length": 0.10, + "has_parent_link": 0.05, + } + for flag, weight in weights.items(): + if flags.get(flag, False): + score += weight + return round(score, 2) + + +def _normalize_severity(val: str) -> str: + """Normalize severity value.""" + val = (val or "medium").lower().strip() + if val in ("critical", "high", "medium", "low"): + return val + return "medium" + + +def _template_fallback( + obligation_text: str, action: str, object_: str, + parent_title: str, parent_severity: str, parent_category: str, + is_test: bool, is_reporting: bool, +) -> AtomicControlCandidate: + """Create an 
atomic control candidate from template when LLM fails.""" + if is_test: + title = f"Test: {object_[:60]}" if object_ else f"Test: {action[:60]}" + test_proc = [f"Prüfung der {object_ or action}"] + evidence = ["Testprotokoll", "Prüfbericht"] + elif is_reporting: + title = f"Meldepflicht: {object_[:60]}" if object_ else f"Meldung: {action[:60]}" + test_proc = ["Prüfung des Meldeprozesses", "Stichprobe gemeldeter Vorfälle"] + evidence = ["Meldeprozess-Dokumentation", "Meldeformulare"] + else: + title = f"{action.capitalize()}: {object_[:60]}" if object_ else parent_title[:80] + test_proc = [f"Prüfung der {action}"] + evidence = ["Dokumentation", "Konfigurationsnachweis"] + + return AtomicControlCandidate( + title=title[:200], + objective=obligation_text[:2000], + requirements=[obligation_text] if obligation_text else [], + test_procedure=test_proc, + evidence=evidence, + severity=_normalize_severity(parent_severity), + category=parent_category, + ) diff --git a/backend-compliance/compliance/services/obligation_extractor.py b/backend-compliance/compliance/services/obligation_extractor.py new file mode 100644 index 0000000..d9fd793 --- /dev/null +++ b/backend-compliance/compliance/services/obligation_extractor.py @@ -0,0 +1,562 @@ +"""Obligation Extractor — 3-Tier Chunk-to-Obligation Linking. + +Maps RAG chunks to obligations from the v2 obligation framework using +three tiers (fastest first): + + Tier 1: EXACT MATCH — regulation_code + article → obligation_id (~40%) + Tier 2: EMBEDDING — chunk text vs. obligation descriptions (~30%) + Tier 3: LLM EXTRACT — local Ollama extracts obligation text (~25%) + +Part of the Multi-Layer Control Architecture (Phase 4 of 8). 
+""" + +import json +import logging +import os +import re +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +import httpx + +logger = logging.getLogger(__name__) + +EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087") +OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") +OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b") +LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180")) + +# Embedding similarity thresholds for Tier 2 +EMBEDDING_MATCH_THRESHOLD = 0.80 +EMBEDDING_CANDIDATE_THRESHOLD = 0.60 + +# --------------------------------------------------------------------------- +# Regulation code mapping: RAG chunk codes → obligation file regulation IDs +# --------------------------------------------------------------------------- + +_REGULATION_CODE_TO_ID = { + # DSGVO + "eu_2016_679": "dsgvo", + "dsgvo": "dsgvo", + "gdpr": "dsgvo", + # AI Act + "eu_2024_1689": "ai_act", + "ai_act": "ai_act", + "aiact": "ai_act", + # NIS2 + "eu_2022_2555": "nis2", + "nis2": "nis2", + "bsig": "nis2", + # BDSG + "bdsg": "bdsg", + # TTDSG + "ttdsg": "ttdsg", + # DSA + "eu_2022_2065": "dsa", + "dsa": "dsa", + # Data Act + "eu_2023_2854": "data_act", + "data_act": "data_act", + # EU Machinery + "eu_2023_1230": "eu_machinery", + "eu_machinery": "eu_machinery", + # DORA + "eu_2022_2554": "dora", + "dora": "dora", +} + + +@dataclass +class ObligationMatch: + """Result of obligation extraction.""" + + obligation_id: Optional[str] = None + obligation_title: Optional[str] = None + obligation_text: Optional[str] = None + method: str = "none" # exact_match | embedding_match | llm_extracted | inferred + confidence: float = 0.0 + regulation_id: Optional[str] = None # e.g. 
"dsgvo" + + def to_dict(self) -> dict: + return { + "obligation_id": self.obligation_id, + "obligation_title": self.obligation_title, + "obligation_text": self.obligation_text, + "method": self.method, + "confidence": self.confidence, + "regulation_id": self.regulation_id, + } + + +@dataclass +class _ObligationEntry: + """Internal representation of a loaded obligation.""" + + id: str + title: str + description: str + regulation_id: str + articles: list[str] = field(default_factory=list) # normalized: ["art. 30", "§ 38"] + embedding: list[float] = field(default_factory=list) + + +class ObligationExtractor: + """3-Tier obligation extraction from RAG chunks. + + Usage:: + + extractor = ObligationExtractor() + await extractor.initialize() # loads obligations + embeddings + + match = await extractor.extract( + chunk_text="...", + regulation_code="eu_2016_679", + article="Art. 30", + paragraph="Abs. 1", + ) + """ + + def __init__(self): + self._article_lookup: dict[str, list[str]] = {} # "dsgvo/art. 
30" → ["DSGVO-OBL-001"] + self._obligations: dict[str, _ObligationEntry] = {} # id → entry + self._obligation_embeddings: list[list[float]] = [] + self._obligation_ids: list[str] = [] + self._initialized = False + + async def initialize(self) -> None: + """Load all obligations from v2 JSON files and compute embeddings.""" + if self._initialized: + return + + self._load_obligations() + await self._compute_embeddings() + self._initialized = True + logger.info( + "ObligationExtractor initialized: %d obligations, %d article lookups, %d embeddings", + len(self._obligations), + len(self._article_lookup), + sum(1 for e in self._obligation_embeddings if e), + ) + + async def extract( + self, + chunk_text: str, + regulation_code: str, + article: Optional[str] = None, + paragraph: Optional[str] = None, + ) -> ObligationMatch: + """Extract obligation from a chunk using 3-tier strategy.""" + if not self._initialized: + await self.initialize() + + reg_id = _normalize_regulation(regulation_code) + + # Tier 1: Exact match via article lookup + if article: + match = self._tier1_exact(reg_id, article) + if match: + return match + + # Tier 2: Embedding similarity + match = await self._tier2_embedding(chunk_text, reg_id) + if match: + return match + + # Tier 3: LLM extraction + match = await self._tier3_llm(chunk_text, regulation_code, article) + return match + + # ----------------------------------------------------------------------- + # Tier 1: Exact Match + # ----------------------------------------------------------------------- + + def _tier1_exact(self, reg_id: Optional[str], article: str) -> Optional[ObligationMatch]: + """Look up obligation by regulation + article.""" + if not reg_id: + return None + + norm_article = _normalize_article(article) + key = f"{reg_id}/{norm_article}" + + obl_ids = self._article_lookup.get(key) + if not obl_ids: + return None + + # Take the first match (highest priority) + obl_id = obl_ids[0] + entry = self._obligations.get(obl_id) + if not entry: 
+ return None + + return ObligationMatch( + obligation_id=entry.id, + obligation_title=entry.title, + obligation_text=entry.description, + method="exact_match", + confidence=1.0, + regulation_id=reg_id, + ) + + # ----------------------------------------------------------------------- + # Tier 2: Embedding Match + # ----------------------------------------------------------------------- + + async def _tier2_embedding( + self, chunk_text: str, reg_id: Optional[str] + ) -> Optional[ObligationMatch]: + """Find nearest obligation by embedding similarity.""" + if not self._obligation_embeddings: + return None + + chunk_embedding = await _get_embedding(chunk_text[:2000]) + if not chunk_embedding: + return None + + best_idx = -1 + best_score = 0.0 + + for i, obl_emb in enumerate(self._obligation_embeddings): + if not obl_emb: + continue + # Prefer same-regulation matches + obl_id = self._obligation_ids[i] + entry = self._obligations.get(obl_id) + score = _cosine_sim(chunk_embedding, obl_emb) + + # Domain bonus: +0.05 if same regulation + if entry and reg_id and entry.regulation_id == reg_id: + score += 0.05 + + if score > best_score: + best_score = score + best_idx = i + + if best_idx < 0: + return None + + # Remove domain bonus for threshold comparison + raw_score = best_score + obl_id = self._obligation_ids[best_idx] + entry = self._obligations.get(obl_id) + if entry and reg_id and entry.regulation_id == reg_id: + raw_score -= 0.05 + + if raw_score >= EMBEDDING_MATCH_THRESHOLD: + return ObligationMatch( + obligation_id=entry.id if entry else obl_id, + obligation_title=entry.title if entry else None, + obligation_text=entry.description if entry else None, + method="embedding_match", + confidence=round(min(raw_score, 1.0), 3), + regulation_id=entry.regulation_id if entry else reg_id, + ) + + return None + + # ----------------------------------------------------------------------- + # Tier 3: LLM Extraction + # 
----------------------------------------------------------------------- + + async def _tier3_llm( + self, chunk_text: str, regulation_code: str, article: Optional[str] + ) -> ObligationMatch: + """Use local LLM to extract the obligation from the chunk.""" + prompt = f"""Analysiere den folgenden Gesetzestext und extrahiere die zentrale rechtliche Pflicht. + +Text: +{chunk_text[:3000]} + +Quelle: {regulation_code} {article or ''} + +Antworte NUR als JSON: +{{ + "obligation_text": "Die zentrale Pflicht in einem Satz", + "actor": "Wer muss handeln (z.B. Verantwortlicher, Auftragsverarbeiter)", + "action": "Was muss getan werden", + "normative_strength": "muss|soll|kann" +}}""" + + system_prompt = ( + "Du bist ein Rechtsexperte fuer EU-Datenschutz- und Digitalrecht. " + "Extrahiere die zentrale rechtliche Pflicht aus Gesetzestexten. " + "Antworte ausschliesslich als JSON." + ) + + result_text = await _llm_ollama(prompt, system_prompt) + if not result_text: + return ObligationMatch( + method="llm_extracted", + confidence=0.0, + regulation_id=_normalize_regulation(regulation_code), + ) + + parsed = _parse_json(result_text) + obligation_text = parsed.get("obligation_text", result_text[:500]) + + return ObligationMatch( + obligation_id=None, + obligation_title=None, + obligation_text=obligation_text, + method="llm_extracted", + confidence=0.60, + regulation_id=_normalize_regulation(regulation_code), + ) + + # ----------------------------------------------------------------------- + # Initialization helpers + # ----------------------------------------------------------------------- + + def _load_obligations(self) -> None: + """Load all obligation files from v2 framework.""" + v2_dir = _find_obligations_dir() + if not v2_dir: + logger.warning("Obligations v2 directory not found — Tier 1 disabled") + return + + manifest_path = v2_dir / "_manifest.json" + if not manifest_path.exists(): + logger.warning("Manifest not found at %s", manifest_path) + return + + with 
open(manifest_path) as f: + manifest = json.load(f) + + for reg_info in manifest.get("regulations", []): + reg_id = reg_info["id"] + reg_file = v2_dir / reg_info["file"] + if not reg_file.exists(): + logger.warning("Regulation file not found: %s", reg_file) + continue + + with open(reg_file) as f: + data = json.load(f) + + for obl in data.get("obligations", []): + obl_id = obl["id"] + entry = _ObligationEntry( + id=obl_id, + title=obl.get("title", ""), + description=obl.get("description", ""), + regulation_id=reg_id, + ) + + # Build article lookup from legal_basis + for basis in obl.get("legal_basis", []): + article_raw = basis.get("article", "") + if article_raw: + norm_art = _normalize_article(article_raw) + key = f"{reg_id}/{norm_art}" + if key not in self._article_lookup: + self._article_lookup[key] = [] + self._article_lookup[key].append(obl_id) + entry.articles.append(norm_art) + + self._obligations[obl_id] = entry + + logger.info( + "Loaded %d obligations from %d regulations", + len(self._obligations), + len(manifest.get("regulations", [])), + ) + + async def _compute_embeddings(self) -> None: + """Compute embeddings for all obligation descriptions.""" + if not self._obligations: + return + + self._obligation_ids = list(self._obligations.keys()) + texts = [ + f"{self._obligations[oid].title}: {self._obligations[oid].description}" + for oid in self._obligation_ids + ] + + logger.info("Computing embeddings for %d obligations...", len(texts)) + self._obligation_embeddings = await _get_embeddings_batch(texts) + valid = sum(1 for e in self._obligation_embeddings if e) + logger.info("Got %d/%d valid embeddings", valid, len(texts)) + + # ----------------------------------------------------------------------- + # Stats + # ----------------------------------------------------------------------- + + def stats(self) -> dict: + """Return initialization statistics.""" + return { + "total_obligations": len(self._obligations), + "article_lookups": 
len(self._article_lookup), + "embeddings_valid": sum(1 for e in self._obligation_embeddings if e), + "regulations": list( + {e.regulation_id for e in self._obligations.values()} + ), + "initialized": self._initialized, + } + + +# --------------------------------------------------------------------------- +# Module-level helpers (reusable by other modules) +# --------------------------------------------------------------------------- + + +def _normalize_regulation(regulation_code: str) -> Optional[str]: + """Map a RAG regulation_code to obligation framework regulation ID.""" + if not regulation_code: + return None + code = regulation_code.lower().strip() + + # Direct lookup + if code in _REGULATION_CODE_TO_ID: + return _REGULATION_CODE_TO_ID[code] + + # Prefix matching for families + for prefix, reg_id in [ + ("eu_2016_679", "dsgvo"), + ("eu_2024_1689", "ai_act"), + ("eu_2022_2555", "nis2"), + ("eu_2022_2065", "dsa"), + ("eu_2023_2854", "data_act"), + ("eu_2023_1230", "eu_machinery"), + ("eu_2022_2554", "dora"), + ]: + if code.startswith(prefix): + return reg_id + + return None + + +def _normalize_article(article: str) -> str: + """Normalize article references for consistent lookup. + + Examples: + "Art. 30" → "art. 30" + "§ 38 BDSG" → "§ 38" + "Article 10" → "art. 10" + "Art. 30 Abs. 1" → "art. 30" + "Artikel 35" → "art. 35" + """ + if not article: + return "" + s = article.strip() + + # Remove trailing law name: "§ 38 BDSG" → "§ 38" + s = re.sub(r"\s+(DSGVO|BDSG|TTDSG|DSA|NIS2|DORA|AI.?Act)\s*$", "", s, flags=re.IGNORECASE) + + # Remove paragraph references: "Art. 30 Abs. 1" → "Art. 30" + s = re.sub(r"\s+(Abs|Absatz|para|paragraph|lit|Satz)\.?\s+.*$", "", s, flags=re.IGNORECASE) + + # Normalize "Article" / "Artikel" → "Art." + s = re.sub(r"^(Article|Artikel)\s+", "Art. 
", s, flags=re.IGNORECASE) + + return s.lower().strip() + + +def _cosine_sim(a: list[float], b: list[float]) -> float: + """Compute cosine similarity between two vectors.""" + if not a or not b or len(a) != len(b): + return 0.0 + dot = sum(x * y for x, y in zip(a, b)) + norm_a = sum(x * x for x in a) ** 0.5 + norm_b = sum(x * x for x in b) ** 0.5 + if norm_a == 0 or norm_b == 0: + return 0.0 + return dot / (norm_a * norm_b) + + +def _find_obligations_dir() -> Optional[Path]: + """Locate the obligations v2 directory.""" + candidates = [ + Path(__file__).resolve().parent.parent.parent.parent + / "ai-compliance-sdk" / "policies" / "obligations" / "v2", + Path("/app/ai-compliance-sdk/policies/obligations/v2"), + Path("ai-compliance-sdk/policies/obligations/v2"), + ] + for p in candidates: + if p.is_dir() and (p / "_manifest.json").exists(): + return p + return None + + +async def _get_embedding(text: str) -> list[float]: + """Get embedding vector for a single text.""" + try: + async with httpx.AsyncClient(timeout=10.0) as client: + resp = await client.post( + f"{EMBEDDING_URL}/embed", + json={"texts": [text]}, + ) + resp.raise_for_status() + embeddings = resp.json().get("embeddings", []) + return embeddings[0] if embeddings else [] + except Exception: + return [] + + +async def _get_embeddings_batch( + texts: list[str], batch_size: int = 32 +) -> list[list[float]]: + """Get embeddings for multiple texts in batches.""" + all_embeddings: list[list[float]] = [] + for i in range(0, len(texts), batch_size): + batch = texts[i : i + batch_size] + try: + async with httpx.AsyncClient(timeout=30.0) as client: + resp = await client.post( + f"{EMBEDDING_URL}/embed", + json={"texts": batch}, + ) + resp.raise_for_status() + embeddings = resp.json().get("embeddings", []) + all_embeddings.extend(embeddings) + except Exception as e: + logger.warning("Batch embedding failed for %d texts: %s", len(batch), e) + all_embeddings.extend([[] for _ in batch]) + return all_embeddings + + +async 
def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str: + """Call local Ollama for LLM extraction.""" + messages = [] + if system_prompt: + messages.append({"role": "system", "content": system_prompt}) + messages.append({"role": "user", "content": prompt}) + + payload = { + "model": OLLAMA_MODEL, + "messages": messages, + "stream": False, + "options": {"num_predict": 512}, + "think": False, + } + + try: + async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client: + resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload) + if resp.status_code != 200: + logger.error( + "Ollama chat failed %d: %s", resp.status_code, resp.text[:300] + ) + return "" + data = resp.json() + return data.get("message", {}).get("content", "") + except Exception as e: + logger.warning("Ollama call failed: %s", e) + return "" + + +def _parse_json(text: str) -> dict: + """Extract JSON from LLM response text.""" + # Try direct parse + try: + return json.loads(text) + except json.JSONDecodeError: + pass + + # Try extracting JSON block + match = re.search(r"\{[^{}]*\}", text, re.DOTALL) + if match: + try: + return json.loads(match.group()) + except json.JSONDecodeError: + pass + + return {} diff --git a/backend-compliance/compliance/services/pattern_matcher.py b/backend-compliance/compliance/services/pattern_matcher.py new file mode 100644 index 0000000..60b3987 --- /dev/null +++ b/backend-compliance/compliance/services/pattern_matcher.py @@ -0,0 +1,532 @@ +"""Pattern Matcher — Obligation-to-Control-Pattern Linking. + +Maps obligations (from the ObligationExtractor) to control patterns +using two tiers: + + Tier 1: KEYWORD MATCH — obligation_match_keywords from patterns (~70%) + Tier 2: EMBEDDING — cosine similarity with domain bonus (~25%) + +Part of the Multi-Layer Control Architecture (Phase 5 of 8). 
+""" + +import logging +import os +from dataclasses import dataclass, field +from pathlib import Path +from typing import Optional + +import yaml + +from compliance.services.obligation_extractor import ( + _cosine_sim, + _get_embedding, + _get_embeddings_batch, +) + +logger = logging.getLogger(__name__) + +# Minimum keyword score to accept a match (at least 2 keyword hits) +KEYWORD_MATCH_MIN_HITS = 2 +# Embedding threshold for Tier 2 +EMBEDDING_PATTERN_THRESHOLD = 0.75 +# Domain bonus when regulation maps to the pattern's domain +DOMAIN_BONUS = 0.10 + +# Map regulation IDs to pattern domains that are likely relevant +_REGULATION_DOMAIN_AFFINITY = { + "dsgvo": ["DATA", "COMP", "GOV"], + "bdsg": ["DATA", "COMP"], + "ttdsg": ["DATA"], + "ai_act": ["AI", "COMP", "DATA"], + "nis2": ["SEC", "INC", "NET", "LOG", "CRYP"], + "dsa": ["DATA", "COMP"], + "data_act": ["DATA", "COMP"], + "eu_machinery": ["SEC", "COMP"], + "dora": ["SEC", "INC", "FIN", "COMP"], +} + + +@dataclass +class ControlPattern: + """Python representation of a control pattern from YAML.""" + + id: str + name: str + name_de: str + domain: str + category: str + description: str + objective_template: str + rationale_template: str + requirements_template: list[str] = field(default_factory=list) + test_procedure_template: list[str] = field(default_factory=list) + evidence_template: list[str] = field(default_factory=list) + severity_default: str = "medium" + implementation_effort_default: str = "m" + obligation_match_keywords: list[str] = field(default_factory=list) + tags: list[str] = field(default_factory=list) + composable_with: list[str] = field(default_factory=list) + open_anchor_refs: list[dict] = field(default_factory=list) + + +@dataclass +class PatternMatchResult: + """Result of pattern matching.""" + + pattern: Optional[ControlPattern] = None + pattern_id: Optional[str] = None + method: str = "none" # keyword | embedding | combined | none + confidence: float = 0.0 + keyword_hits: int = 0 + 
total_keywords: int = 0 + embedding_score: float = 0.0 + domain_bonus_applied: bool = False + composable_patterns: list[str] = field(default_factory=list) + + def to_dict(self) -> dict: + return { + "pattern_id": self.pattern_id, + "method": self.method, + "confidence": round(self.confidence, 3), + "keyword_hits": self.keyword_hits, + "total_keywords": self.total_keywords, + "embedding_score": round(self.embedding_score, 3), + "domain_bonus_applied": self.domain_bonus_applied, + "composable_patterns": self.composable_patterns, + } + + +class PatternMatcher: + """Links obligations to control patterns using keyword + embedding matching. + + Usage:: + + matcher = PatternMatcher() + await matcher.initialize() + + result = await matcher.match( + obligation_text="Fuehrung eines Verarbeitungsverzeichnisses...", + regulation_id="dsgvo", + ) + print(result.pattern_id) # e.g. "CP-COMP-001" + print(result.confidence) # e.g. 0.85 + """ + + def __init__(self): + self._patterns: list[ControlPattern] = [] + self._by_id: dict[str, ControlPattern] = {} + self._by_domain: dict[str, list[ControlPattern]] = {} + self._keyword_index: dict[str, list[str]] = {} # keyword → [pattern_ids] + self._pattern_embeddings: list[list[float]] = [] + self._pattern_ids: list[str] = [] + self._initialized = False + + async def initialize(self) -> None: + """Load patterns from YAML and compute embeddings.""" + if self._initialized: + return + + self._load_patterns() + self._build_keyword_index() + await self._compute_embeddings() + self._initialized = True + logger.info( + "PatternMatcher initialized: %d patterns, %d keywords, %d embeddings", + len(self._patterns), + len(self._keyword_index), + sum(1 for e in self._pattern_embeddings if e), + ) + + async def match( + self, + obligation_text: str, + regulation_id: Optional[str] = None, + top_n: int = 1, + ) -> PatternMatchResult: + """Match obligation text to the best control pattern. 
+ + Args: + obligation_text: The obligation description to match against. + regulation_id: Source regulation (for domain bonus). + top_n: Number of top results to consider for composability. + + Returns: + PatternMatchResult with the best match. + """ + if not self._initialized: + await self.initialize() + + if not obligation_text or not self._patterns: + return PatternMatchResult() + + # Tier 1: Keyword matching + keyword_result = self._tier1_keyword(obligation_text, regulation_id) + + # Tier 2: Embedding matching + embedding_result = await self._tier2_embedding(obligation_text, regulation_id) + + # Combine scores: prefer keyword match, boost with embedding if available + best = self._combine_results(keyword_result, embedding_result) + + # Attach composable patterns + if best.pattern: + best.composable_patterns = [ + pid for pid in best.pattern.composable_with + if pid in self._by_id + ] + + return best + + async def match_top_n( + self, + obligation_text: str, + regulation_id: Optional[str] = None, + n: int = 3, + ) -> list[PatternMatchResult]: + """Return top-N pattern matches sorted by confidence descending.""" + if not self._initialized: + await self.initialize() + + if not obligation_text or not self._patterns: + return [] + + keyword_scores = self._keyword_scores(obligation_text, regulation_id) + embedding_scores = await self._embedding_scores(obligation_text, regulation_id) + + # Merge scores + all_pattern_ids = set(keyword_scores.keys()) | set(embedding_scores.keys()) + results: list[PatternMatchResult] = [] + + for pid in all_pattern_ids: + pattern = self._by_id.get(pid) + if not pattern: + continue + + kw_score = keyword_scores.get(pid, (0, 0, 0.0)) # (hits, total, score) + emb_score = embedding_scores.get(pid, (0.0, False)) # (score, bonus_applied) + + kw_hits, kw_total, kw_confidence = kw_score + emb_confidence, bonus_applied = emb_score + + # Combined confidence: max of keyword and embedding, with boost if both + if kw_confidence > 0 and 
emb_confidence > 0: + combined = max(kw_confidence, emb_confidence) + 0.05 + method = "combined" + elif kw_confidence > 0: + combined = kw_confidence + method = "keyword" + else: + combined = emb_confidence + method = "embedding" + + results.append(PatternMatchResult( + pattern=pattern, + pattern_id=pid, + method=method, + confidence=min(combined, 1.0), + keyword_hits=kw_hits, + total_keywords=kw_total, + embedding_score=emb_confidence, + domain_bonus_applied=bonus_applied, + composable_patterns=[ + p for p in pattern.composable_with if p in self._by_id + ], + )) + + # Sort by confidence descending + results.sort(key=lambda r: r.confidence, reverse=True) + return results[:n] + + # ----------------------------------------------------------------------- + # Tier 1: Keyword Match + # ----------------------------------------------------------------------- + + def _tier1_keyword( + self, obligation_text: str, regulation_id: Optional[str] + ) -> Optional[PatternMatchResult]: + """Match by counting keyword hits in the obligation text.""" + scores = self._keyword_scores(obligation_text, regulation_id) + if not scores: + return None + + # Find best match + best_pid = max(scores, key=lambda pid: scores[pid][2]) + hits, total, confidence = scores[best_pid] + + if hits < KEYWORD_MATCH_MIN_HITS: + return None + + pattern = self._by_id.get(best_pid) + if not pattern: + return None + + # Check domain bonus + bonus_applied = False + if regulation_id and self._domain_matches(pattern.domain, regulation_id): + confidence = min(confidence + DOMAIN_BONUS, 1.0) + bonus_applied = True + + return PatternMatchResult( + pattern=pattern, + pattern_id=best_pid, + method="keyword", + confidence=confidence, + keyword_hits=hits, + total_keywords=total, + domain_bonus_applied=bonus_applied, + ) + + def _keyword_scores( + self, text: str, regulation_id: Optional[str] + ) -> dict[str, tuple[int, int, float]]: + """Compute keyword match scores for all patterns. 
+ + Returns dict: pattern_id → (hits, total_keywords, confidence). + """ + text_lower = text.lower() + hits_by_pattern: dict[str, int] = {} + + for keyword, pattern_ids in self._keyword_index.items(): + if keyword in text_lower: + for pid in pattern_ids: + hits_by_pattern[pid] = hits_by_pattern.get(pid, 0) + 1 + + result: dict[str, tuple[int, int, float]] = {} + for pid, hits in hits_by_pattern.items(): + pattern = self._by_id.get(pid) + if not pattern: + continue + total = len(pattern.obligation_match_keywords) + confidence = hits / total if total > 0 else 0.0 + result[pid] = (hits, total, confidence) + + return result + + # ----------------------------------------------------------------------- + # Tier 2: Embedding Match + # ----------------------------------------------------------------------- + + async def _tier2_embedding( + self, obligation_text: str, regulation_id: Optional[str] + ) -> Optional[PatternMatchResult]: + """Match by embedding similarity against pattern objective_templates.""" + scores = await self._embedding_scores(obligation_text, regulation_id) + if not scores: + return None + + best_pid = max(scores, key=lambda pid: scores[pid][0]) + emb_score, bonus_applied = scores[best_pid] + + if emb_score < EMBEDDING_PATTERN_THRESHOLD: + return None + + pattern = self._by_id.get(best_pid) + if not pattern: + return None + + return PatternMatchResult( + pattern=pattern, + pattern_id=best_pid, + method="embedding", + confidence=min(emb_score, 1.0), + embedding_score=emb_score, + domain_bonus_applied=bonus_applied, + ) + + async def _embedding_scores( + self, obligation_text: str, regulation_id: Optional[str] + ) -> dict[str, tuple[float, bool]]: + """Compute embedding similarity scores for all patterns. + + Returns dict: pattern_id → (score, domain_bonus_applied). 
+ """ + if not self._pattern_embeddings: + return {} + + chunk_embedding = await _get_embedding(obligation_text[:2000]) + if not chunk_embedding: + return {} + + result: dict[str, tuple[float, bool]] = {} + for i, pat_emb in enumerate(self._pattern_embeddings): + if not pat_emb: + continue + pid = self._pattern_ids[i] + pattern = self._by_id.get(pid) + if not pattern: + continue + + score = _cosine_sim(chunk_embedding, pat_emb) + + # Domain bonus + bonus_applied = False + if regulation_id and self._domain_matches(pattern.domain, regulation_id): + score += DOMAIN_BONUS + bonus_applied = True + + result[pid] = (score, bonus_applied) + + return result + + # ----------------------------------------------------------------------- + # Score combination + # ----------------------------------------------------------------------- + + def _combine_results( + self, + keyword_result: Optional[PatternMatchResult], + embedding_result: Optional[PatternMatchResult], + ) -> PatternMatchResult: + """Combine keyword and embedding results into the best match.""" + if not keyword_result and not embedding_result: + return PatternMatchResult() + + if not keyword_result: + return embedding_result + if not embedding_result: + return keyword_result + + # Both matched — check if they agree + if keyword_result.pattern_id == embedding_result.pattern_id: + # Same pattern: boost confidence + combined_confidence = min( + max(keyword_result.confidence, embedding_result.confidence) + 0.05, + 1.0, + ) + return PatternMatchResult( + pattern=keyword_result.pattern, + pattern_id=keyword_result.pattern_id, + method="combined", + confidence=combined_confidence, + keyword_hits=keyword_result.keyword_hits, + total_keywords=keyword_result.total_keywords, + embedding_score=embedding_result.embedding_score, + domain_bonus_applied=( + keyword_result.domain_bonus_applied + or embedding_result.domain_bonus_applied + ), + ) + + # Different patterns: pick the one with higher confidence + if 
keyword_result.confidence >= embedding_result.confidence: + return keyword_result + return embedding_result + + # ----------------------------------------------------------------------- + # Domain affinity + # ----------------------------------------------------------------------- + + @staticmethod + def _domain_matches(pattern_domain: str, regulation_id: str) -> bool: + """Check if a pattern's domain has affinity with a regulation.""" + affine_domains = _REGULATION_DOMAIN_AFFINITY.get(regulation_id, []) + return pattern_domain in affine_domains + + # ----------------------------------------------------------------------- + # Initialization helpers + # ----------------------------------------------------------------------- + + def _load_patterns(self) -> None: + """Load control patterns from YAML files.""" + patterns_dir = _find_patterns_dir() + if not patterns_dir: + logger.warning("Control patterns directory not found") + return + + for yaml_file in sorted(patterns_dir.glob("*.yaml")): + if yaml_file.name.startswith("_"): + continue + try: + with open(yaml_file) as f: + data = yaml.safe_load(f) + if not data or "patterns" not in data: + continue + for p in data["patterns"]: + pattern = ControlPattern( + id=p["id"], + name=p["name"], + name_de=p["name_de"], + domain=p["domain"], + category=p["category"], + description=p["description"], + objective_template=p["objective_template"], + rationale_template=p["rationale_template"], + requirements_template=p.get("requirements_template", []), + test_procedure_template=p.get("test_procedure_template", []), + evidence_template=p.get("evidence_template", []), + severity_default=p.get("severity_default", "medium"), + implementation_effort_default=p.get("implementation_effort_default", "m"), + obligation_match_keywords=p.get("obligation_match_keywords", []), + tags=p.get("tags", []), + composable_with=p.get("composable_with", []), + open_anchor_refs=p.get("open_anchor_refs", []), + ) + self._patterns.append(pattern) + 
self._by_id[pattern.id] = pattern + domain_list = self._by_domain.setdefault(pattern.domain, []) + domain_list.append(pattern) + except Exception as e: + logger.error("Failed to load %s: %s", yaml_file.name, e) + + logger.info("Loaded %d patterns from %s", len(self._patterns), patterns_dir) + + def _build_keyword_index(self) -> None: + """Build reverse index: keyword → [pattern_ids].""" + for pattern in self._patterns: + for kw in pattern.obligation_match_keywords: + lower_kw = kw.lower() + if lower_kw not in self._keyword_index: + self._keyword_index[lower_kw] = [] + self._keyword_index[lower_kw].append(pattern.id) + + async def _compute_embeddings(self) -> None: + """Compute embeddings for all pattern objective templates.""" + if not self._patterns: + return + + self._pattern_ids = [p.id for p in self._patterns] + texts = [ + f"{p.name_de}: {p.objective_template}" + for p in self._patterns + ] + + logger.info("Computing embeddings for %d patterns...", len(texts)) + self._pattern_embeddings = await _get_embeddings_batch(texts) + valid = sum(1 for e in self._pattern_embeddings if e) + logger.info("Got %d/%d valid pattern embeddings", valid, len(texts)) + + # ----------------------------------------------------------------------- + # Public helpers + # ----------------------------------------------------------------------- + + def get_pattern(self, pattern_id: str) -> Optional[ControlPattern]: + """Get a pattern by its ID.""" + return self._by_id.get(pattern_id.upper()) + + def get_patterns_by_domain(self, domain: str) -> list[ControlPattern]: + """Get all patterns for a domain.""" + return self._by_domain.get(domain.upper(), []) + + def stats(self) -> dict: + """Return matcher statistics.""" + return { + "total_patterns": len(self._patterns), + "domains": list(self._by_domain.keys()), + "keywords": len(self._keyword_index), + "embeddings_valid": sum(1 for e in self._pattern_embeddings if e), + "initialized": self._initialized, + } + + +def _find_patterns_dir() -> 
Optional[Path]: + """Locate the control_patterns directory.""" + candidates = [ + Path(__file__).resolve().parent.parent.parent.parent + / "ai-compliance-sdk" / "policies" / "control_patterns", + Path("/app/ai-compliance-sdk/policies/control_patterns"), + Path("ai-compliance-sdk/policies/control_patterns"), + ] + for p in candidates: + if p.is_dir(): + return p + return None diff --git a/backend-compliance/compliance/services/pipeline_adapter.py b/backend-compliance/compliance/services/pipeline_adapter.py new file mode 100644 index 0000000..50f3a39 --- /dev/null +++ b/backend-compliance/compliance/services/pipeline_adapter.py @@ -0,0 +1,670 @@ +"""Pipeline Adapter — New 10-Stage Pipeline Integration. + +Bridges the existing 7-stage control_generator pipeline with the new +multi-layer components (ObligationExtractor, PatternMatcher, ControlComposer). + +New pipeline flow: + chunk → license_classify + → obligation_extract (Stage 4 — NEW) + → pattern_match (Stage 5 — NEW) + → control_compose (Stage 6 — replaces old Stage 3) + → harmonize → anchor → store + crosswalk → mark processed + +Can be used in two modes: + 1. INLINE: Called from _process_batch() to enrich the pipeline + 2. STANDALONE: Process chunks directly through new stages + +Part of the Multi-Layer Control Architecture (Phase 7 of 8). 
@dataclass
class PipelineChunk:
    """Input chunk for the new pipeline stages.

    chunk_hash is lazily filled by compute_hash(); license_rule defaults to
    the most restrictive tier used elsewhere in this module (3).
    """

    text: str
    collection: str = ""
    regulation_code: str = ""
    article: Optional[str] = None
    paragraph: Optional[str] = None
    license_rule: int = 3
    license_info: dict = field(default_factory=dict)
    source_citation: Optional[dict] = None
    chunk_hash: str = ""

    def compute_hash(self) -> str:
        """Return the SHA-256 of the chunk text, computing it at most once."""
        if not self.chunk_hash:
            self.chunk_hash = hashlib.sha256(self.text.encode()).hexdigest()
        return self.chunk_hash


@dataclass
class PipelineResult:
    """Result of processing a chunk through the new pipeline."""

    chunk: PipelineChunk
    obligation: ObligationMatch = field(default_factory=ObligationMatch)
    pattern_result: PatternMatchResult = field(default_factory=PatternMatchResult)
    control: Optional[ComposedControl] = None
    crosswalk_written: bool = False
    error: Optional[str] = None

    def to_dict(self) -> dict:
        """Serialize the result (nested objects via their own to_dict)."""
        return {
            "chunk_hash": self.chunk.chunk_hash,
            "obligation": self.obligation.to_dict() if self.obligation else None,
            "pattern": self.pattern_result.to_dict() if self.pattern_result else None,
            "control": self.control.to_dict() if self.control else None,
            "crosswalk_written": self.crosswalk_written,
            "error": self.error,
        }


class PipelineAdapter:
    """Integrates ObligationExtractor + PatternMatcher + ControlComposer.

    Usage::

        adapter = PipelineAdapter(db)
        await adapter.initialize()

        result = await adapter.process_chunk(PipelineChunk(
            text="...",
            regulation_code="eu_2016_679",
            article="Art. 30",
            license_rule=1,
        ))
    """

    def __init__(self, db: Optional[Session] = None):
        self.db = db
        self._extractor = ObligationExtractor()
        self._matcher = PatternMatcher()
        self._composer = ControlComposer()
        self._initialized = False

    async def initialize(self) -> None:
        """Initialize all sub-components (idempotent)."""
        if self._initialized:
            return
        await self._extractor.initialize()
        await self._matcher.initialize()
        self._initialized = True
        logger.info("PipelineAdapter initialized")

    async def process_chunk(self, chunk: PipelineChunk) -> PipelineResult:
        """Process a single chunk through the new 3-stage pipeline.

        Stage 4: Obligation Extract
        Stage 5: Pattern Match
        Stage 6: Control Compose

        Errors are captured on the result (result.error) rather than raised.
        """
        if not self._initialized:
            await self.initialize()

        chunk.compute_hash()
        outcome = PipelineResult(chunk=chunk)

        try:
            # Stage 4: Obligation Extract
            outcome.obligation = await self._extractor.extract(
                chunk_text=chunk.text,
                regulation_code=chunk.regulation_code,
                article=chunk.article,
                paragraph=chunk.paragraph,
            )

            # Stage 5: Pattern Match — fall back to title, then raw text.
            match_text = (
                outcome.obligation.obligation_text
                or outcome.obligation.obligation_title
                or chunk.text[:500]
            )
            outcome.pattern_result = await self._matcher.match(
                obligation_text=match_text,
                regulation_id=outcome.obligation.regulation_id,
            )

            # Stage 6: Control Compose — raw text is only passed on for
            # permissive license tiers (1 and 2).
            outcome.control = await self._composer.compose(
                obligation=outcome.obligation,
                pattern_result=outcome.pattern_result,
                chunk_text=chunk.text if chunk.license_rule in (1, 2) else None,
                license_rule=chunk.license_rule,
                source_citation=chunk.source_citation,
                regulation_code=chunk.regulation_code,
            )

        except Exception as e:
            logger.error("Pipeline processing failed: %s", e)
            outcome.error = str(e)

        return outcome

    async def process_batch(self, chunks: list[PipelineChunk]) -> list[PipelineResult]:
        """Process multiple chunks through the pipeline, sequentially."""
        return [await self.process_chunk(chunk) for chunk in chunks]

    def write_crosswalk(self, result: PipelineResult, control_uuid: str) -> bool:
        """Write obligation_extraction + crosswalk_matrix rows for a processed chunk.

        Called AFTER the control is stored in canonical_controls. Commits on
        success, rolls back and returns False on any failure.
        """
        if not self.db or not result.control:
            return False

        chunk = result.chunk
        ob = result.obligation
        pm = result.pattern_result

        try:
            # 1. Write obligation_extraction row
            self.db.execute(
                text("""
                    INSERT INTO obligation_extractions (
                        chunk_hash, collection, regulation_code,
                        article, paragraph, obligation_id,
                        obligation_text, confidence, extraction_method,
                        pattern_id, pattern_match_score, control_uuid
                    ) VALUES (
                        :chunk_hash, :collection, :regulation_code,
                        :article, :paragraph, :obligation_id,
                        :obligation_text, :confidence, :extraction_method,
                        :pattern_id, :pattern_match_score,
                        CAST(:control_uuid AS uuid)
                    )
                """),
                {
                    "chunk_hash": chunk.chunk_hash,
                    "collection": chunk.collection,
                    "regulation_code": chunk.regulation_code,
                    "article": chunk.article,
                    "paragraph": chunk.paragraph,
                    "obligation_id": ob.obligation_id if ob else None,
                    # Truncate to the column's practical budget.
                    "obligation_text": (
                        ob.obligation_text[:2000]
                        if ob and ob.obligation_text
                        else None
                    ),
                    "confidence": ob.confidence if ob else 0,
                    "extraction_method": ob.method if ob else "none",
                    "pattern_id": pm.pattern_id if pm else None,
                    "pattern_match_score": pm.confidence if pm else 0,
                    "control_uuid": control_uuid,
                },
            )

            # 2. Write crosswalk_matrix row
            self.db.execute(
                text("""
                    INSERT INTO crosswalk_matrix (
                        regulation_code, article, paragraph,
                        obligation_id, pattern_id,
                        master_control_id, master_control_uuid,
                        confidence, source
                    ) VALUES (
                        :regulation_code, :article, :paragraph,
                        :obligation_id, :pattern_id,
                        :master_control_id,
                        CAST(:master_control_uuid AS uuid),
                        :confidence, :source
                    )
                """),
                {
                    "regulation_code": chunk.regulation_code,
                    "article": chunk.article,
                    "paragraph": chunk.paragraph,
                    "obligation_id": ob.obligation_id if ob else None,
                    "pattern_id": pm.pattern_id if pm else None,
                    "master_control_id": result.control.control_id,
                    "master_control_uuid": control_uuid,
                    # Combined confidence is the weaker of the two stages.
                    "confidence": min(
                        ob.confidence if ob else 0,
                        pm.confidence if pm else 0,
                    ),
                    "source": "auto",
                },
            )

            # 3. Update canonical_controls with pattern_id + obligation_ids
            if result.control.pattern_id or result.control.obligation_ids:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET pattern_id = COALESCE(:pattern_id, pattern_id),
                            obligation_ids = COALESCE(:obligation_ids, obligation_ids)
                        WHERE id = CAST(:control_uuid AS uuid)
                    """),
                    {
                        "pattern_id": result.control.pattern_id,
                        "obligation_ids": json.dumps(result.control.obligation_ids),
                        "control_uuid": control_uuid,
                    },
                )

            self.db.commit()
            result.crosswalk_written = True
            return True

        except Exception as e:
            logger.error("Failed to write crosswalk: %s", e)
            self.db.rollback()
            return False

    def stats(self) -> dict:
        """Return component statistics."""
        return {
            "extractor": self._extractor.stats(),
            "matcher": self._matcher.stats(),
            "initialized": self._initialized,
        }
    """Non-destructive migration passes for existing controls.

    Pass 1: Obligation Linkage (deterministic, article→obligation lookup)
    Pass 2: Pattern Classification (keyword-based matching)
    Pass 3: Quality Triage (categorize by linkage completeness)
    Pass 4: Crosswalk Backfill (write crosswalk rows for linked controls)
    Pass 5: Deduplication (mark duplicate controls)

    Usage::

        migration = MigrationPasses(db)
        await migration.initialize()

        result = await migration.run_pass1_obligation_linkage(limit=100)
        result = await migration.run_pass2_pattern_classification(limit=100)
        result = migration.run_pass3_quality_triage()
        result = migration.run_pass4_crosswalk_backfill()
        result = migration.run_pass5_deduplication()
    """

    def __init__(self, db: Session):
        # Each pass commits on the shared session; callers own the session
        # lifecycle.
        self.db = db
        self._extractor = ObligationExtractor()
        self._matcher = PatternMatcher()
        self._initialized = False

    async def initialize(self) -> None:
        """Initialize extractors (loads obligations + patterns)."""
        if self._initialized:
            return
        # NOTE(review): reaches into private members of the extractor and
        # matcher to load data without computing embeddings — confirm this
        # stays in sync with their own initialize() implementations.
        self._extractor._load_obligations()
        self._matcher._load_patterns()
        self._matcher._build_keyword_index()
        self._initialized = True

    # -------------------------------------------------------------------
    # Pass 1: Obligation Linkage (deterministic)
    # -------------------------------------------------------------------

    async def run_pass1_obligation_linkage(self, limit: int = 0) -> dict:
        """Link existing controls to obligations via source_citation article.

        For each control with source_citation → extract regulation + article
        → look up in obligation framework → set obligation_ids.

        Returns counters: total / linked / no_match / no_citation.
        """
        if not self._initialized:
            await self.initialize()

        # Only non-deprecated controls that are not yet linked.
        query = """
            SELECT id, control_id, source_citation, generation_metadata
            FROM canonical_controls
            WHERE release_state NOT IN ('deprecated')
            AND (obligation_ids IS NULL OR obligation_ids = '[]')
        """
        if limit > 0:
            # NOTE(review): limit is interpolated directly; safe only because
            # it is an int parameter — consider a bound parameter instead.
            query += f" LIMIT {limit}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {"total": len(rows), "linked": 0, "no_match": 0, "no_citation": 0}

        for row in rows:
            control_uuid = str(row[0])
            control_id = row[1]
            citation = row[2]
            metadata = row[3]

            # Extract regulation + article from citation or metadata
            reg_code, article = _extract_regulation_article(citation, metadata)
            if not reg_code:
                stats["no_citation"] += 1
                continue

            # Tier 1: Exact match (deterministic lookup only — no embeddings
            # or LLM in this pass).
            match = self._extractor._tier1_exact(reg_code, article or "")
            if match and match.obligation_id:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET obligation_ids = :obl_ids
                        WHERE id = CAST(:uuid AS uuid)
                    """),
                    {
                        "obl_ids": json.dumps([match.obligation_id]),
                        "uuid": control_uuid,
                    },
                )
                stats["linked"] += 1
            else:
                stats["no_match"] += 1

        self.db.commit()
        logger.info("Pass 1: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Pass 2: Pattern Classification (keyword-based)
    # -------------------------------------------------------------------

    async def run_pass2_pattern_classification(self, limit: int = 0) -> dict:
        """Classify existing controls into patterns via keyword matching.

        For each control without pattern_id → keyword-match title+objective
        against pattern library → assign best match.

        Returns counters: total / classified / no_match.
        """
        if not self._initialized:
            await self.initialize()

        query = """
            SELECT id, control_id, title, objective
            FROM canonical_controls
            WHERE release_state NOT IN ('deprecated')
            AND (pattern_id IS NULL OR pattern_id = '')
        """
        if limit > 0:
            query += f" LIMIT {limit}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {"total": len(rows), "classified": 0, "no_match": 0}

        for row in rows:
            control_uuid = str(row[0])
            title = row[2] or ""
            objective = row[3] or ""

            # Keyword match against title + objective; no regulation context.
            match_text = f"{title} {objective}"
            result = self._matcher._tier1_keyword(match_text, None)

            # Require at least 2 keyword hits to accept the classification.
            if result and result.pattern_id and result.keyword_hits >= 2:
                self.db.execute(
                    text("""
                        UPDATE canonical_controls
                        SET pattern_id = :pattern_id
                        WHERE id = CAST(:uuid AS uuid)
                    """),
                    {
                        "pattern_id": result.pattern_id,
                        "uuid": control_uuid,
                    },
                )
                stats["classified"] += 1
            else:
                stats["no_match"] += 1

        self.db.commit()
        logger.info("Pass 2: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Pass 3: Quality Triage
    # -------------------------------------------------------------------

    def run_pass3_quality_triage(self) -> dict:
        """Categorize controls by linkage completeness.

        Sets generation_metadata.triage_status:
        - "review": has both obligation_id + pattern_id
        - "needs_obligation": has pattern_id but no obligation_id
        - "needs_pattern": has obligation_id but no pattern_id
        - "legacy_unlinked": has neither

        Returns per-category row counts.
        """
        # The four WHERE clauses are mutually exclusive, so each control
        # receives exactly one triage_status.
        categories = {
            "review": """
                UPDATE canonical_controls
                SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                    '{triage_status}', '"review"'
                )
                WHERE release_state NOT IN ('deprecated')
                AND obligation_ids IS NOT NULL AND obligation_ids != '[]'
                AND pattern_id IS NOT NULL AND pattern_id != ''
            """,
            "needs_obligation": """
                UPDATE canonical_controls
                SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                    '{triage_status}', '"needs_obligation"'
                )
                WHERE release_state NOT IN ('deprecated')
                AND (obligation_ids IS NULL OR obligation_ids = '[]')
                AND pattern_id IS NOT NULL AND pattern_id != ''
            """,
            "needs_pattern": """
                UPDATE canonical_controls
                SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                    '{triage_status}', '"needs_pattern"'
                )
                WHERE release_state NOT IN ('deprecated')
                AND obligation_ids IS NOT NULL AND obligation_ids != '[]'
                AND (pattern_id IS NULL OR pattern_id = '')
            """,
            "legacy_unlinked": """
                UPDATE canonical_controls
                SET generation_metadata = jsonb_set(
                    COALESCE(generation_metadata::jsonb, '{}'::jsonb),
                    '{triage_status}', '"legacy_unlinked"'
                )
                WHERE release_state NOT IN ('deprecated')
                AND (obligation_ids IS NULL OR obligation_ids = '[]')
                AND (pattern_id IS NULL OR pattern_id = '')
            """,
        }

        stats = {}
        for category, sql in categories.items():
            result = self.db.execute(text(sql))
            stats[category] = result.rowcount

        self.db.commit()
        logger.info("Pass 3: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Pass 4: Crosswalk Backfill
    #
------------------------------------------------------------------- + + def run_pass4_crosswalk_backfill(self) -> dict: + """Create crosswalk_matrix rows for controls with obligation + pattern. + + Only creates rows that don't already exist. + """ + result = self.db.execute(text(""" + INSERT INTO crosswalk_matrix ( + regulation_code, obligation_id, pattern_id, + master_control_id, master_control_uuid, + confidence, source + ) + SELECT + COALESCE( + (generation_metadata::jsonb->>'source_regulation'), + '' + ) AS regulation_code, + obl.value::text AS obligation_id, + cc.pattern_id, + cc.control_id, + cc.id, + 0.80, + 'migrated' + FROM canonical_controls cc, + jsonb_array_elements_text( + COALESCE(cc.obligation_ids::jsonb, '[]'::jsonb) + ) AS obl(value) + WHERE cc.release_state NOT IN ('deprecated') + AND cc.pattern_id IS NOT NULL AND cc.pattern_id != '' + AND cc.obligation_ids IS NOT NULL AND cc.obligation_ids != '[]' + AND NOT EXISTS ( + SELECT 1 FROM crosswalk_matrix cw + WHERE cw.master_control_uuid = cc.id + AND cw.obligation_id = obl.value::text + ) + """)) + + rows_inserted = result.rowcount + self.db.commit() + logger.info("Pass 4: %d crosswalk rows inserted", rows_inserted) + return {"rows_inserted": rows_inserted} + + # ------------------------------------------------------------------- + # Pass 5: Deduplication + # ------------------------------------------------------------------- + + def run_pass5_deduplication(self) -> dict: + """Mark duplicate controls (same obligation + same pattern). + + Groups controls by (obligation_id, pattern_id), keeps the one with + highest evidence_confidence (or newest), marks rest as deprecated. 
+ """ + # Find groups with duplicates + groups = self.db.execute(text(""" + SELECT cc.pattern_id, + obl.value::text AS obligation_id, + array_agg(cc.id ORDER BY cc.evidence_confidence DESC NULLS LAST, cc.created_at DESC) AS ids, + count(*) AS cnt + FROM canonical_controls cc, + jsonb_array_elements_text( + COALESCE(cc.obligation_ids::jsonb, '[]'::jsonb) + ) AS obl(value) + WHERE cc.release_state NOT IN ('deprecated') + AND cc.pattern_id IS NOT NULL AND cc.pattern_id != '' + GROUP BY cc.pattern_id, obl.value::text + HAVING count(*) > 1 + """)).fetchall() + + stats = {"groups_found": len(groups), "controls_deprecated": 0} + + for group in groups: + ids = group[2] # Array of UUIDs, first is the keeper + if len(ids) <= 1: + continue + + # Keep first (highest confidence), deprecate rest + deprecate_ids = ids[1:] + for dep_id in deprecate_ids: + self.db.execute( + text(""" + UPDATE canonical_controls + SET release_state = 'deprecated', + generation_metadata = jsonb_set( + COALESCE(generation_metadata::jsonb, '{}'::jsonb), + '{deprecated_reason}', '"duplicate_same_obligation_pattern"' + ) + WHERE id = CAST(:uuid AS uuid) + AND release_state != 'deprecated' + """), + {"uuid": str(dep_id)}, + ) + stats["controls_deprecated"] += 1 + + self.db.commit() + logger.info("Pass 5: %s", stats) + return stats + + def migration_status(self) -> dict: + """Return overall migration progress.""" + row = self.db.execute(text(""" + SELECT + count(*) AS total, + count(*) FILTER (WHERE obligation_ids IS NOT NULL AND obligation_ids != '[]') AS has_obligation, + count(*) FILTER (WHERE pattern_id IS NOT NULL AND pattern_id != '') AS has_pattern, + count(*) FILTER ( + WHERE obligation_ids IS NOT NULL AND obligation_ids != '[]' + AND pattern_id IS NOT NULL AND pattern_id != '' + ) AS fully_linked, + count(*) FILTER (WHERE release_state = 'deprecated') AS deprecated + FROM canonical_controls + """)).fetchone() + + return { + "total_controls": row[0], + "has_obligation": row[1], + "has_pattern": 
row[2], + "fully_linked": row[3], + "deprecated": row[4], + "coverage_obligation_pct": round(row[1] / max(row[0], 1) * 100, 1), + "coverage_pattern_pct": round(row[2] / max(row[0], 1) * 100, 1), + "coverage_full_pct": round(row[3] / max(row[0], 1) * 100, 1), + } + + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + + +def _extract_regulation_article( + citation: Optional[str], metadata: Optional[str] +) -> tuple[Optional[str], Optional[str]]: + """Extract regulation_code and article from control's citation/metadata.""" + from compliance.services.obligation_extractor import _normalize_regulation + + reg_code = None + article = None + + # Try citation first (JSON string or dict) + if citation: + try: + c = json.loads(citation) if isinstance(citation, str) else citation + if isinstance(c, dict): + article = c.get("article") or c.get("source_article") + # Try to get regulation from source field + source = c.get("source", "") + if source: + reg_code = _normalize_regulation(source) + except (json.JSONDecodeError, TypeError): + pass + + # Try metadata + if metadata and not reg_code: + try: + m = json.loads(metadata) if isinstance(metadata, str) else metadata + if isinstance(m, dict): + src_reg = m.get("source_regulation", "") + if src_reg: + reg_code = _normalize_regulation(src_reg) + if not article: + article = m.get("source_article") + except (json.JSONDecodeError, TypeError): + pass + + return reg_code, article diff --git a/backend-compliance/migrations/060_crosswalk_matrix.sql b/backend-compliance/migrations/060_crosswalk_matrix.sql new file mode 100644 index 0000000..b86ac5a --- /dev/null +++ b/backend-compliance/migrations/060_crosswalk_matrix.sql @@ -0,0 +1,120 @@ +-- Migration 060: Multi-Layer Control Architecture — DB Schema +-- Adds obligation_extractions, control_patterns, and crosswalk_matrix tables. 
-- Extends canonical_controls with pattern_id and obligation_ids columns.
--
-- Part of the Multi-Layer Control Architecture (Phase 1 of 8).
-- See: Legal Source → Obligation → Control Pattern → Master Control → Customer Instance

-- =============================================================================
-- 1. Obligation Extractions
-- Tracks how each RAG chunk was linked to an obligation (exact, embedding, LLM).
-- =============================================================================

CREATE TABLE IF NOT EXISTS obligation_extractions (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    chunk_hash VARCHAR(64) NOT NULL,          -- SHA-256 hex of the chunk text
    collection VARCHAR(100) NOT NULL,
    regulation_code VARCHAR(100) NOT NULL,
    article VARCHAR(100),
    paragraph VARCHAR(100),
    -- obligation_id/pattern_id are soft references into the YAML libraries,
    -- not FK-constrained columns.
    obligation_id VARCHAR(50),
    obligation_text TEXT,
    confidence NUMERIC(3,2) CHECK (confidence >= 0 AND confidence <= 1),
    extraction_method VARCHAR(30) NOT NULL
        CHECK (extraction_method IN ('exact_match', 'embedding_match', 'llm_extracted', 'inferred')),
    pattern_id VARCHAR(50),
    pattern_match_score NUMERIC(3,2) CHECK (pattern_match_score >= 0 AND pattern_match_score <= 1),
    control_uuid UUID REFERENCES canonical_controls(id),
    job_id UUID REFERENCES canonical_generation_jobs(id),
    created_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_oe_obligation ON obligation_extractions(obligation_id);
CREATE INDEX IF NOT EXISTS idx_oe_pattern ON obligation_extractions(pattern_id);
CREATE INDEX IF NOT EXISTS idx_oe_control ON obligation_extractions(control_uuid);
CREATE INDEX IF NOT EXISTS idx_oe_regulation ON obligation_extractions(regulation_code);
CREATE INDEX IF NOT EXISTS idx_oe_chunk ON obligation_extractions(chunk_hash);
CREATE INDEX IF NOT EXISTS idx_oe_method ON obligation_extractions(extraction_method);

COMMENT ON TABLE obligation_extractions IS
    'Tracks chunk-to-obligation linkage from the 3-tier extraction pipeline (exact/embedding/LLM)';

-- =============================================================================
-- 2. Control Patterns Registry
-- DB mirror of the YAML pattern library for SQL queries and joins.
-- =============================================================================

CREATE TABLE IF NOT EXISTS control_patterns (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    pattern_id VARCHAR(50) UNIQUE NOT NULL,   -- format: CP-{DOMAIN}-{NNN}
    name VARCHAR(255) NOT NULL,
    name_de VARCHAR(255),
    domain VARCHAR(10) NOT NULL,
    category VARCHAR(50),
    description TEXT,
    template_objective TEXT,
    template_rationale TEXT,
    template_requirements JSONB DEFAULT '[]',
    template_test_procedure JSONB DEFAULT '[]',
    template_evidence JSONB DEFAULT '[]',
    severity_default VARCHAR(20)
        CHECK (severity_default IN ('low', 'medium', 'high', 'critical')),
    implementation_effort_default VARCHAR(2)
        CHECK (implementation_effort_default IN ('s', 'm', 'l', 'xl')),
    obligation_match_keywords JSONB DEFAULT '[]',
    tags JSONB DEFAULT '[]',
    open_anchor_refs JSONB DEFAULT '[]',
    composable_with JSONB DEFAULT '[]',
    version VARCHAR(10) DEFAULT '1.0',
    created_at TIMESTAMPTZ DEFAULT NOW(),
    updated_at TIMESTAMPTZ DEFAULT NOW()
);

CREATE INDEX IF NOT EXISTS idx_cp_domain ON control_patterns(domain);
CREATE INDEX IF NOT EXISTS idx_cp_category ON control_patterns(category);
CREATE INDEX IF NOT EXISTS idx_cp_pattern_id ON control_patterns(pattern_id);

COMMENT ON TABLE control_patterns IS
    'Registry of control patterns (DB mirror of YAML library). Pattern ID format: CP-{DOMAIN}-{NNN}';

-- =============================================================================
-- 3. Crosswalk Matrix
-- The "golden thread" from legal source through to implementation.
-- =============================================================================

CREATE TABLE IF NOT EXISTS crosswalk_matrix (
    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
    regulation_code VARCHAR(100) NOT NULL,
    article VARCHAR(100),
    paragraph VARCHAR(100),
    obligation_id VARCHAR(50),
    pattern_id VARCHAR(50),
    master_control_id VARCHAR(20),
    master_control_uuid UUID REFERENCES canonical_controls(id),
    tom_control_id VARCHAR(30),
    confidence NUMERIC(3,2) CHECK (confidence >= 0 AND confidence <= 1),
    -- provenance of the row: hand-curated, pipeline-written, or backfilled
    source VARCHAR(30) DEFAULT 'auto'
        CHECK (source IN ('manual', 'auto', 'migrated')),
    created_at TIMESTAMPTZ DEFAULT NOW()
);

-- NOTE(review): no uniqueness constraint on (master_control_uuid,
-- obligation_id); the backfill pass guards with NOT EXISTS but the inline
-- pipeline could insert duplicates — consider a unique index.
CREATE INDEX IF NOT EXISTS idx_cw_regulation ON crosswalk_matrix(regulation_code, article);
CREATE INDEX IF NOT EXISTS idx_cw_obligation ON crosswalk_matrix(obligation_id);
CREATE INDEX IF NOT EXISTS idx_cw_pattern ON crosswalk_matrix(pattern_id);
CREATE INDEX IF NOT EXISTS idx_cw_control ON crosswalk_matrix(master_control_id);
CREATE INDEX IF NOT EXISTS idx_cw_tom ON crosswalk_matrix(tom_control_id);

COMMENT ON TABLE crosswalk_matrix IS
    'Golden thread: regulation → article → obligation → pattern → master control → TOM';

-- =============================================================================
-- 4.
Extend canonical_controls with pattern + obligation linkage +-- ============================================================================= + +ALTER TABLE canonical_controls + ADD COLUMN IF NOT EXISTS pattern_id VARCHAR(50); + +ALTER TABLE canonical_controls + ADD COLUMN IF NOT EXISTS obligation_ids JSONB DEFAULT '[]'; + +CREATE INDEX IF NOT EXISTS idx_cc_pattern ON canonical_controls(pattern_id); diff --git a/backend-compliance/migrations/061_obligation_candidates.sql b/backend-compliance/migrations/061_obligation_candidates.sql new file mode 100644 index 0000000..57468a1 --- /dev/null +++ b/backend-compliance/migrations/061_obligation_candidates.sql @@ -0,0 +1,49 @@ +-- Migration 061: Obligation Candidates + Decomposition Tracking +-- Supports Pass 0a (Obligation Extraction from Rich Controls) and +-- Pass 0b (Atomic Control Composition). +-- +-- Part of the Multi-Layer Control Architecture — Decomposition Pass. + +-- ============================================================================= +-- 1. Obligation Candidates +-- Individual normative obligations extracted from Rich Controls (Pass 0a). 
+-- ============================================================================= + +CREATE TABLE IF NOT EXISTS obligation_candidates ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id), + candidate_id VARCHAR(30) NOT NULL, + obligation_text TEXT NOT NULL, + action VARCHAR(500), + object TEXT, + condition TEXT, + normative_strength VARCHAR(20) DEFAULT 'must' + CHECK (normative_strength IN ('must', 'should', 'may')), + is_test_obligation BOOLEAN DEFAULT FALSE, + is_reporting_obligation BOOLEAN DEFAULT FALSE, + extraction_confidence NUMERIC(3,2) DEFAULT 0.0 + CHECK (extraction_confidence >= 0 AND extraction_confidence <= 1), + quality_flags JSONB DEFAULT '{}', + release_state VARCHAR(30) DEFAULT 'extracted' + CHECK (release_state IN ('extracted', 'validated', 'rejected', 'composed')), + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_oc_parent ON obligation_candidates(parent_control_uuid); +CREATE INDEX IF NOT EXISTS idx_oc_state ON obligation_candidates(release_state); +CREATE INDEX IF NOT EXISTS idx_oc_candidate ON obligation_candidates(candidate_id); + +COMMENT ON TABLE obligation_candidates IS + 'Individual normative obligations extracted from Rich Controls via Pass 0a decomposition'; + +-- ============================================================================= +-- 2. 
Extend canonical_controls for decomposition tracking +-- ============================================================================= + +ALTER TABLE canonical_controls + ADD COLUMN IF NOT EXISTS parent_control_uuid UUID REFERENCES canonical_controls(id); + +ALTER TABLE canonical_controls + ADD COLUMN IF NOT EXISTS decomposition_method VARCHAR(30); + +CREATE INDEX IF NOT EXISTS idx_cc_parent ON canonical_controls(parent_control_uuid); diff --git a/backend-compliance/tests/test_control_composer.py b/backend-compliance/tests/test_control_composer.py new file mode 100644 index 0000000..6ab3ef0 --- /dev/null +++ b/backend-compliance/tests/test_control_composer.py @@ -0,0 +1,890 @@ +"""Tests for Control Composer — Phase 6 of Multi-Layer Control Architecture. + +Validates: +- ComposedControl dataclass and serialization +- Pattern-guided composition (Tier 1) +- Template-only fallback (when LLM fails) +- Fallback composition (no pattern) +- License rule handling (Rules 1, 2, 3) +- Prompt building +- Field validation and fixing +- Batch composition +- Edge cases: empty inputs, missing data, malformed LLM responses +""" + +import json +from unittest.mock import AsyncMock, patch + +import pytest + +from compliance.services.control_composer import ( + ComposedControl, + ControlComposer, + _anchors_from_pattern, + _build_compose_prompt, + _build_fallback_prompt, + _compose_system_prompt, + _ensure_list, + _obligation_section, + _pattern_section, + _severity_to_risk, + _validate_control, +) +from compliance.services.obligation_extractor import ObligationMatch +from compliance.services.pattern_matcher import ControlPattern, PatternMatchResult + + +# ============================================================================= +# Fixtures +# ============================================================================= + + +def _make_obligation( + obligation_id="DSGVO-OBL-001", + title="Verarbeitungsverzeichnis fuehren", + text="Fuehrung eines Verzeichnisses aller 
Verarbeitungstaetigkeiten.",
+    method="exact_match",
+    confidence=1.0,
+    regulation_id="dsgvo",
+) -> ObligationMatch:
+    return ObligationMatch(
+        obligation_id=obligation_id,
+        obligation_title=title,
+        obligation_text=text,
+        method=method,
+        confidence=confidence,
+        regulation_id=regulation_id,
+    )
+
+
+def _make_pattern(
+    pattern_id="CP-COMP-001",
+    name="compliance_governance",
+    name_de="Compliance-Governance",
+    domain="COMP",
+    category="compliance",
+) -> ControlPattern:
+    """Build a representative ControlPattern with German template texts."""
+    return ControlPattern(
+        id=pattern_id,
+        name=name,
+        name_de=name_de,
+        domain=domain,
+        category=category,
+        description="Compliance management and governance framework",
+        objective_template="Sicherstellen, dass ein wirksames Compliance-Management existiert.",
+        rationale_template="Ohne Governance fehlt die Grundlage fuer Compliance.",
+        requirements_template=[
+            "Compliance-Verantwortlichkeiten definieren",
+            "Regelmaessige Compliance-Bewertungen durchfuehren",
+            "Dokumentationspflichten einhalten",
+        ],
+        test_procedure_template=[
+            "Pruefung der Compliance-Organisation",
+            "Stichproben der Dokumentation",
+        ],
+        evidence_template=[
+            "Compliance-Handbuch",
+            "Pruefberichte",
+        ],
+        severity_default="high",
+        implementation_effort_default="l",
+        obligation_match_keywords=["compliance", "governance", "konformitaet"],
+        tags=["compliance", "governance"],
+        composable_with=["CP-COMP-002"],
+        open_anchor_refs=[
+            {"framework": "ISO 27001", "ref": "A.18"},
+            {"framework": "NIST CSF", "ref": "GV.OC"},
+        ],
+    )
+
+
+def _make_pattern_result(pattern=None, confidence=0.85, method="keyword") -> PatternMatchResult:
+    """Wrap a pattern in a PatternMatchResult; defaults to _make_pattern()."""
+    if pattern is None:
+        pattern = _make_pattern()
+    return PatternMatchResult(
+        pattern=pattern,
+        pattern_id=pattern.id,
+        method=method,
+        confidence=confidence,
+        keyword_hits=4,
+        total_keywords=7,
+    )
+
+
+def _llm_success_response() -> str:
+    """Return a JSON string mimicking a well-formed LLM composition answer."""
+    return json.dumps({
+        "title": "Compliance-Governance fuer Verarbeitungstaetigkeiten",
+        "objective": "Sicherstellen, dass alle Verarbeitungstaetigkeiten dokumentiert und ueberwacht werden.",
+        "rationale": "Die DSGVO verlangt ein Verarbeitungsverzeichnis als Grundlage der Rechenschaftspflicht.",
+        "requirements": [
+            "Verarbeitungsverzeichnis gemaess Art. 30 DSGVO fuehren",
+            "Regelmaessige Aktualisierung bei Aenderungen",
+            "Verantwortlichkeiten fuer die Pflege zuweisen",
+        ],
+        "test_procedure": [
+            "Vollstaendigkeit des Verzeichnisses pruefen",
+            "Aktualitaet der Eintraege verifizieren",
+        ],
+        "evidence": [
+            "Verarbeitungsverzeichnis",
+            "Aenderungsprotokoll",
+        ],
+        "severity": "high",
+        "implementation_effort": "m",
+        "category": "compliance",
+        "tags": ["dsgvo", "verarbeitungsverzeichnis", "governance"],
+        "target_audience": ["unternehmen", "behoerden"],
+        "verification_method": "document",
+    })
+
+
+# =============================================================================
+# Tests: ComposedControl
+# =============================================================================
+
+
+class TestComposedControl:
+    """Tests for the ComposedControl dataclass."""
+
+    def test_defaults(self):
+        # Pins every documented default of the dataclass.
+        c = ComposedControl()
+        assert c.control_id == ""
+        assert c.title == ""
+        assert c.severity == "medium"
+        assert c.risk_score == 5.0
+        assert c.implementation_effort == "m"
+        assert c.release_state == "draft"
+        assert c.license_rule is None
+        assert c.customer_visible is True
+        assert c.pattern_id is None
+        assert c.obligation_ids == []
+        assert c.composition_method == "pattern_guided"
+
+    def test_to_dict_keys(self):
+        # Exact-set comparison: catches both missing and extra serialized keys.
+        c = ComposedControl()
+        d = c.to_dict()
+        expected_keys = {
+            "control_id", "title", "objective", "rationale", "scope",
+            "requirements", "test_procedure", "evidence", "severity",
+            "risk_score", "implementation_effort", "open_anchors",
+            "release_state", "tags", "license_rule", "source_original_text",
+            "source_citation", "customer_visible", "verification_method",
+            "category", "target_audience", "pattern_id", "obligation_ids",
+            "generation_metadata", "composition_method",
+        }
+        assert set(d.keys()) == expected_keys
+
+    def test_to_dict_values(self):
+        c = ComposedControl(
+            title="Test Control",
+            pattern_id="CP-AUTH-001",
+            obligation_ids=["DSGVO-OBL-001"],
+            severity="high",
+            license_rule=1,
+        )
+        d = c.to_dict()
+        assert d["title"] == "Test Control"
+        assert d["pattern_id"] == "CP-AUTH-001"
+        assert d["obligation_ids"] == ["DSGVO-OBL-001"]
+        assert d["severity"] == "high"
+        assert d["license_rule"] == 1
+
+
+# =============================================================================
+# Tests: _ensure_list
+# =============================================================================
+
+
+class TestEnsureList:
+    """Coercion helper: anything in, list of non-empty strings out."""
+
+    def test_list_passthrough(self):
+        assert _ensure_list(["a", "b"]) == ["a", "b"]
+
+    def test_string_to_list(self):
+        assert _ensure_list("hello") == ["hello"]
+
+    def test_none_to_empty(self):
+        assert _ensure_list(None) == []
+
+    def test_empty_list(self):
+        assert _ensure_list([]) == []
+
+    def test_filters_empty_values(self):
+        assert _ensure_list(["a", "", "b"]) == ["a", "b"]
+
+    def test_converts_to_strings(self):
+        assert _ensure_list([1, 2, 3]) == ["1", "2", "3"]
+
+
+# =============================================================================
+# Tests: _anchors_from_pattern
+# =============================================================================
+
+
+class TestAnchorsFromPattern:
+    """Pattern anchor refs -> open_anchors dicts (framework/control_id/score)."""
+
+    def test_converts_anchors(self):
+        pattern = _make_pattern()
+        anchors = _anchors_from_pattern(pattern)
+        assert len(anchors) == 2
+        assert anchors[0]["framework"] == "ISO 27001"
+        # "ref" is renamed to "control_id"; 0.8 appears to be a fixed default
+        # alignment score — TODO confirm in _anchors_from_pattern.
+        assert anchors[0]["control_id"] == "A.18"
+        assert anchors[0]["alignment_score"] == 0.8
+
+    def test_empty_anchors(self):
+        pattern = _make_pattern()
+        pattern.open_anchor_refs = []
+        anchors = _anchors_from_pattern(pattern)
+        assert anchors == []
+
+
+# =============================================================================
+# Tests: _severity_to_risk
+# =============================================================================
+
+class TestSeverityToRisk:
+    """Severity-string to numeric risk-score mapping."""
+
+    def test_critical(self):
+        assert _severity_to_risk("critical") == 9.0
+
+    def test_high(self):
+        assert _severity_to_risk("high") == 7.0
+
+    def test_medium(self):
+        assert _severity_to_risk("medium") == 5.0
+
+    def test_low(self):
+        assert _severity_to_risk("low") == 3.0
+
+    def test_unknown(self):
+        # Unknown severities fall back to the medium score.
+        assert _severity_to_risk("xyz") == 5.0
+
+
+# =============================================================================
+# Tests: _validate_control
+# =============================================================================
+
+
+class TestValidateControl:
+    """_validate_control repairs invalid fields in place."""
+
+    def test_fixes_invalid_severity(self):
+        c = ComposedControl(severity="extreme")
+        _validate_control(c)
+        assert c.severity == "medium"
+
+    def test_keeps_valid_severity(self):
+        c = ComposedControl(severity="critical")
+        _validate_control(c)
+        assert c.severity == "critical"
+
+    def test_fixes_invalid_effort(self):
+        c = ComposedControl(implementation_effort="xxl")
+        _validate_control(c)
+        assert c.implementation_effort == "m"
+
+    def test_fixes_invalid_verification(self):
+        c = ComposedControl(verification_method="magic")
+        _validate_control(c)
+        assert c.verification_method is None
+
+    def test_keeps_valid_verification(self):
+        c = ComposedControl(verification_method="code_review")
+        _validate_control(c)
+        assert c.verification_method == "code_review"
+
+    def test_fixes_risk_score_out_of_range(self):
+        c = ComposedControl(risk_score=15.0, severity="high")
+        _validate_control(c)
+        assert c.risk_score == 7.0  # from severity
+
+    def test_truncates_long_title(self):
+        # 255 matches the DB VARCHAR(255) limit for titles.
+        c = ComposedControl(title="A" * 300)
+        _validate_control(c)
+        assert len(c.title) <= 255
+
+    def test_ensures_minimum_content(self):
+        c = ComposedControl(
+            title="Test",
+            objective="",
+            rationale="",
+            requirements=[],
+            test_procedure=[],
+            evidence=[],
+        )
+        _validate_control(c)
+        assert c.objective == "Test"  # falls back to title
+        assert c.rationale != ""
+        assert len(c.requirements) >= 1
+        assert len(c.test_procedure) >= 1
+        assert len(c.evidence) >= 1
+
+
+# =============================================================================
+# Tests: Prompt builders
+# =============================================================================
+
+
+class TestPromptBuilders:
+    """German prompt sections; assertions pin marker phrases, not full text."""
+
+    def test_compose_system_prompt_rule1(self):
+        prompt = _compose_system_prompt(1)
+        assert "praxisorientiertes" in prompt
+        assert "KOPIERE KEINE" not in prompt
+
+    def test_compose_system_prompt_rule3(self):
+        # Rule 3 adds the do-not-copy / do-not-name-the-source instructions.
+        prompt = _compose_system_prompt(3)
+        assert "KOPIERE KEINE" in prompt
+        assert "NENNE NICHT die Quelle" in prompt
+
+    def test_obligation_section_full(self):
+        obl = _make_obligation()
+        section = _obligation_section(obl)
+        assert "PFLICHT" in section
+        assert "Verarbeitungsverzeichnis" in section
+        assert "DSGVO-OBL-001" in section
+        assert "dsgvo" in section
+
+    def test_obligation_section_minimal(self):
+        obl = ObligationMatch()
+        section = _obligation_section(obl)
+        assert "Keine spezifische Pflicht" in section
+
+    def test_pattern_section(self):
+        pattern = _make_pattern()
+        section = _pattern_section(pattern)
+        assert "MUSTER" in section
+        assert "Compliance-Governance" in section
+        assert "CP-COMP-001" in section
+        assert "Compliance-Verantwortlichkeiten" in section
+
+    def test_build_compose_prompt_rule1(self):
+        obl = _make_obligation()
+        pattern = _make_pattern()
+        prompt = _build_compose_prompt(obl, pattern, "Original text here", 1)
+        assert "PFLICHT" in prompt
+        assert "MUSTER" in prompt
+        assert "KONTEXT (Originaltext)" in prompt
+        assert "Original text here" in prompt
+
+    def test_build_compose_prompt_rule3(self):
+        # Rule 3 must keep the restricted source text out of the prompt.
+        obl = _make_obligation()
+        pattern = _make_pattern()
+        prompt = _build_compose_prompt(obl, pattern, "Secret text", 3)
+        assert "Intern analysiert" in prompt
+        assert "Secret text" not in prompt
+
+    def test_build_fallback_prompt(self):
+        obl = _make_obligation()
+        prompt = _build_fallback_prompt(obl, "Chunk text", 1)
+        assert "PFLICHT" in prompt
+        assert "KONTEXT (Originaltext)" in prompt
+
+    def test_build_fallback_prompt_no_chunk(self):
+        obl = _make_obligation()
+        prompt = _build_fallback_prompt(obl, None, 1)
+        assert "Kein Originaltext" in prompt
+
+
+# =============================================================================
+# Tests: ControlComposer — Pattern-guided composition
+# =============================================================================
+
+
+class TestComposeWithPattern:
+    """Tests for pattern-guided control composition."""
+
+    def setup_method(self):
+        self.composer = ControlComposer()
+        self.obligation = _make_obligation()
+        self.pattern_result = _make_pattern_result()
+
+    @pytest.mark.asyncio
+    async def test_compose_success_rule1(self):
+        """Successful LLM composition with Rule 1."""
+        # The Ollama client is patched at module level so no network is hit.
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                chunk_text="Der Verantwortliche fuehrt ein Verzeichnis...",
+                license_rule=1,
+            )
+
+        assert control.composition_method == "pattern_guided"
+        assert control.title != ""
+        assert "Verarbeitungstaetigkeiten" in control.objective
+        assert len(control.requirements) >= 2
+        assert len(control.test_procedure) >= 1
+        assert len(control.evidence) >= 1
+        assert control.severity == "high"
+        assert control.category == "compliance"
+
+    @pytest.mark.asyncio
+    async def test_compose_sets_linkage(self):
+        """Pattern and obligation IDs should be set."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                license_rule=1,
+            )
+
+        assert control.pattern_id == "CP-COMP-001"
+        assert control.obligation_ids == ["DSGVO-OBL-001"]
+
+    @pytest.mark.asyncio
+    async def
test_compose_sets_metadata(self):
+        """Generation metadata should include composition details."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                license_rule=1,
+                regulation_code="eu_2016_679",
+            )
+
+        meta = control.generation_metadata
+        assert meta["composition_method"] == "pattern_guided"
+        assert meta["pattern_id"] == "CP-COMP-001"
+        assert meta["pattern_confidence"] == 0.85
+        assert meta["obligation_id"] == "DSGVO-OBL-001"
+        assert meta["license_rule"] == 1
+        assert meta["regulation_code"] == "eu_2016_679"
+
+    @pytest.mark.asyncio
+    async def test_compose_rule1_stores_original(self):
+        """Rule 1: original text should be stored."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                chunk_text="Original DSGVO text",
+                license_rule=1,
+            )
+
+        assert control.license_rule == 1
+        assert control.source_original_text == "Original DSGVO text"
+        assert control.customer_visible is True
+
+    @pytest.mark.asyncio
+    async def test_compose_rule2_stores_citation(self):
+        """Rule 2: citation should be stored."""
+        citation = {
+            "source": "OWASP ASVS",
+            "license": "CC-BY-SA-4.0",
+            "license_notice": "OWASP Foundation",
+        }
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                chunk_text="OWASP text",
+                license_rule=2,
+                source_citation=citation,
+            )
+
+        assert control.license_rule == 2
+        assert control.source_original_text == "OWASP text"
+        assert control.source_citation == citation
+        assert control.customer_visible is True
+
+    @pytest.mark.asyncio
+    async def test_compose_rule3_no_original(self):
+        """Rule 3: no original text, not customer visible."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                chunk_text="BSI restricted text",
+                license_rule=3,
+            )
+
+        assert control.license_rule == 3
+        assert control.source_original_text is None
+        assert control.source_citation is None
+        assert control.customer_visible is False
+
+
+# =============================================================================
+# Tests: ControlComposer — Template-only fallback (LLM fails)
+# =============================================================================
+
+
+class TestTemplateOnlyFallback:
+    """Tests for template-only composition when LLM fails."""
+
+    def setup_method(self):
+        self.composer = ControlComposer()
+        self.obligation = _make_obligation()
+        self.pattern_result = _make_pattern_result()
+
+    @pytest.mark.asyncio
+    async def test_template_fallback_on_empty_llm(self):
+        """When LLM returns empty, should use template directly."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value="",
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                license_rule=1,
+            )
+
+        assert control.composition_method == "template_only"
+        assert "Compliance-Governance" in control.title
+        assert control.severity == "high"  # from pattern
+        assert len(control.requirements) >= 2  # from pattern template
+
+    @pytest.mark.asyncio
+    async def test_template_fallback_on_invalid_json(self):
+        """When LLM returns non-JSON, should use template."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value="This is not JSON at all",
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                license_rule=1,
+            )
+
+        assert control.composition_method == "template_only"
+
+    @pytest.mark.asyncio
+    async def test_template_includes_obligation_title(self):
+        """Template fallback should include obligation title in control title."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value="",
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                license_rule=1,
+            )
+
+        assert "Verarbeitungsverzeichnis" in control.title
+
+    @pytest.mark.asyncio
+    async def test_template_has_open_anchors(self):
+        """Template fallback should include pattern anchors."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value="",
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=self.pattern_result,
+                license_rule=1,
+            )
+
+        assert len(control.open_anchors) == 2
+        frameworks = [a["framework"] for a in control.open_anchors]
+        assert "ISO 27001" in frameworks
+
+
+# =============================================================================
+# Tests: ControlComposer — Fallback (no pattern)
+# =============================================================================
+
+
+class TestFallbackNoPattern:
+    """Tests for fallback composition without a pattern."""
+
+    def setup_method(self):
+        self.composer = ControlComposer()
+        self.obligation = _make_obligation()
+
+    @pytest.mark.asyncio
+    async def test_fallback_with_llm(self):
+        """Fallback should work with LLM response."""
+        response = json.dumps({
+            "title": "Verarbeitungsverzeichnis",
+            "objective": "Verzeichnis fuehren",
+            "rationale": "DSGVO Art. 30",
+            "requirements": ["VVT anlegen"],
+            "test_procedure": ["VVT pruefen"],
+            "evidence": ["VVT Dokument"],
+            "severity": "high",
+        })
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=response,
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=PatternMatchResult(),  # No pattern
+                license_rule=1,
+            )
+
+        assert control.composition_method == "fallback"
+        assert control.pattern_id is None
+        assert control.release_state == "needs_review"
+
+    @pytest.mark.asyncio
+    async def test_fallback_llm_fails(self):
+        """Fallback with LLM failure should still produce a control."""
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value="",
+        ):
+            control = await self.composer.compose(
+                obligation=self.obligation,
+                pattern_result=PatternMatchResult(),
+                license_rule=1,
+            )
+
+        assert control.composition_method == "fallback"
+        assert control.title != ""
+        # Validation ensures minimum content
+        assert len(control.requirements) >= 1
+        assert len(control.test_procedure) >= 1
+
+    @pytest.mark.asyncio
+    async def test_fallback_no_obligation_text(self):
+        """Fallback with empty obligation should still work."""
+        empty_obl = ObligationMatch()
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value="",
+        ):
+            control = await self.composer.compose(
+                obligation=empty_obl,
+                pattern_result=PatternMatchResult(),
+                license_rule=3,
+            )
+
+        assert control.title != ""
+        assert control.customer_visible is False
+
+
+# =============================================================================
+# Tests: ControlComposer — Batch composition
+# =============================================================================
+
+
+class TestComposeBatch:
+    """Tests for batch composition."""
+
+    @pytest.mark.asyncio
+    async def test_batch_returns_list(self):
+        composer = ControlComposer()
+        items =
[
+            {
+                "obligation": _make_obligation(),
+                "pattern_result": _make_pattern_result(),
+                "license_rule": 1,
+            },
+            {
+                "obligation": _make_obligation(
+                    obligation_id="NIS2-OBL-001",
+                    title="Incident Meldepflicht",
+                    regulation_id="nis2",
+                ),
+                "pattern_result": PatternMatchResult(),
+                "license_rule": 3,
+            },
+        ]
+
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            results = await composer.compose_batch(items)
+
+        # Order is preserved: item 0 had a pattern, item 1 did not.
+        assert len(results) == 2
+        assert results[0].pattern_id == "CP-COMP-001"
+        assert results[1].pattern_id is None
+
+    @pytest.mark.asyncio
+    async def test_batch_empty(self):
+        composer = ControlComposer()
+        results = await composer.compose_batch([])
+        assert results == []
+
+
+# =============================================================================
+# Tests: Validation integration
+# =============================================================================
+
+
+class TestValidationIntegration:
+    """Tests that validation runs during compose."""
+
+    @pytest.mark.asyncio
+    async def test_compose_validates_severity(self):
+        """Invalid severity from LLM should be fixed."""
+        response = json.dumps({
+            "title": "Test",
+            "objective": "Test",
+            "severity": "EXTREME",
+        })
+        composer = ControlComposer()
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=response,
+        ):
+            control = await composer.compose(
+                obligation=_make_obligation(),
+                pattern_result=_make_pattern_result(),
+                license_rule=1,
+            )
+
+        assert control.severity in {"low", "medium", "high", "critical"}
+
+    @pytest.mark.asyncio
+    async def test_compose_ensures_minimum_content(self):
+        """Empty requirements from LLM should be filled with defaults."""
+        response = json.dumps({
+            "title": "Test",
+            "objective": "Test objective",
+            "requirements": [],
+        })
+        composer = ControlComposer()
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=response,
+        ):
+            control = await composer.compose(
+                obligation=_make_obligation(),
+                pattern_result=_make_pattern_result(),
+                license_rule=1,
+            )
+
+        assert len(control.requirements) >= 1
+
+
+# =============================================================================
+# Tests: License rule edge cases
+# =============================================================================
+
+
+class TestLicenseRuleEdgeCases:
+    """Tests for license rule handling edge cases."""
+
+    @pytest.mark.asyncio
+    async def test_rule1_no_chunk_text(self):
+        """Rule 1 without chunk text: original_text should be None."""
+        composer = ControlComposer()
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await composer.compose(
+                obligation=_make_obligation(),
+                pattern_result=_make_pattern_result(),
+                chunk_text=None,
+                license_rule=1,
+            )
+
+        assert control.license_rule == 1
+        assert control.source_original_text is None
+        assert control.customer_visible is True
+
+    @pytest.mark.asyncio
+    async def test_rule2_no_citation(self):
+        """Rule 2 without citation: citation should be None."""
+        composer = ControlComposer()
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await composer.compose(
+                obligation=_make_obligation(),
+                pattern_result=_make_pattern_result(),
+                chunk_text="Some text",
+                license_rule=2,
+                source_citation=None,
+            )
+
+        assert control.license_rule == 2
+        assert control.source_citation is None
+
+    @pytest.mark.asyncio
+    async def test_rule3_overrides_chunk_and_citation(self):
+        """Rule 3 should always clear original text and citation."""
+        composer = ControlComposer()
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await composer.compose(
+                obligation=_make_obligation(),
+                pattern_result=_make_pattern_result(),
+                chunk_text="This should be cleared",
+                license_rule=3,
+                source_citation={"source": "BSI"},
+            )
+
+        assert control.source_original_text is None
+        assert control.source_citation is None
+        assert control.customer_visible is False
+
+
+# =============================================================================
+# Tests: Obligation without ID
+# =============================================================================
+
+
+class TestObligationWithoutId:
+    """Tests for handling obligations without a known ID."""
+
+    @pytest.mark.asyncio
+    async def test_llm_extracted_obligation(self):
+        """LLM-extracted obligation (no ID) should still compose."""
+        obl = ObligationMatch(
+            obligation_id=None,
+            obligation_title=None,
+            obligation_text="Pflicht zur Meldung von Sicherheitsvorfaellen",
+            method="llm_extracted",
+            confidence=0.60,
+            regulation_id="nis2",
+        )
+        composer = ControlComposer()
+        with patch(
+            "compliance.services.control_composer._llm_ollama",
+            new_callable=AsyncMock,
+            return_value=_llm_success_response(),
+        ):
+            control = await composer.compose(
+                obligation=obl,
+                pattern_result=_make_pattern_result(),
+                license_rule=1,
+            )
+
+        assert control.obligation_ids == []  # No ID to link
+        assert control.pattern_id == "CP-COMP-001"
+        assert control.generation_metadata["obligation_method"] == "llm_extracted"
diff --git a/backend-compliance/tests/test_control_patterns.py b/backend-compliance/tests/test_control_patterns.py
new file mode 100644
index 0000000..cc69bd7
--- /dev/null
+++ b/backend-compliance/tests/test_control_patterns.py
@@ -0,0 +1,504 @@
+"""Tests for Control Pattern Library (Phase 2).
+
+Validates:
+- JSON Schema structure
+- YAML pattern files against schema
+- Pattern ID uniqueness and format
+- Domain/category consistency
+- Keyword coverage
+- Cross-references (composable_with)
+- Template quality (min lengths, no placeholders without defaults)
+"""
+
+import json
+import re
+from pathlib import Path
+from collections import Counter
+
+import pytest
+import yaml
+
+# NOTE(review): assumes this test file lives three levels below the repo root
+# (tests/ -> backend-compliance/ -> repo) — confirm if the layout changes.
+REPO_ROOT = Path(__file__).resolve().parent.parent.parent
+PATTERNS_DIR = REPO_ROOT / "ai-compliance-sdk" / "policies" / "control_patterns"
+SCHEMA_FILE = PATTERNS_DIR / "_pattern_schema.json"
+CORE_FILE = PATTERNS_DIR / "core_patterns.yaml"
+IT_SEC_FILE = PATTERNS_DIR / "domain_it_security.yaml"
+
+VALID_DOMAINS = [
+    "AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC",
+    "INC", "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT",
+]
+
+VALID_SEVERITIES = ["low", "medium", "high", "critical"]
+VALID_EFFORTS = ["s", "m", "l", "xl"]
+
+# Pattern IDs: CP-{DOMAIN}-{NNN}; machine names: lower snake_case.
+PATTERN_ID_RE = re.compile(r"^CP-[A-Z]+-[0-9]{3}$")
+NAME_RE = re.compile(r"^[a-z][a-z0-9_]*$")
+
+
+# =============================================================================
+# Fixtures
+# =============================================================================
+
+
+@pytest.fixture
+def schema():
+    """Load the JSON schema."""
+    assert SCHEMA_FILE.exists(), f"Schema file not found: {SCHEMA_FILE}"
+    with open(SCHEMA_FILE) as f:
+        return json.load(f)
+
+
+@pytest.fixture
+def core_patterns():
+    """Load core patterns."""
+    assert CORE_FILE.exists(), f"Core patterns file not found: {CORE_FILE}"
+    with open(CORE_FILE) as f:
+        data = yaml.safe_load(f)
+        return data["patterns"]
+
+
+@pytest.fixture
+def it_sec_patterns():
+    """Load IT security patterns."""
+    assert IT_SEC_FILE.exists(), f"IT security patterns file not found: {IT_SEC_FILE}"
+    with open(IT_SEC_FILE) as f:
+        data = yaml.safe_load(f)
+        return data["patterns"]
+
+
+@pytest.fixture
+def all_patterns(core_patterns, it_sec_patterns):
+    """Combined list of all patterns."""
+    return core_patterns + it_sec_patterns
+
+
+# =============================================================================
+# Schema Tests
+# =============================================================================
+
+
+class TestPatternSchema:
+    """Validate the JSON Schema file itself."""
+
+    def test_schema_exists(self):
+        assert SCHEMA_FILE.exists()
+
+    def test_schema_is_valid_json(self, schema):
+        assert "$schema" in schema
+        assert "properties" in schema
+
+    def test_schema_defines_pattern(self, schema):
+        assert "ControlPattern" in schema.get("$defs", {})
+
+    def test_schema_requires_key_fields(self, schema):
+        pattern_def = schema["$defs"]["ControlPattern"]
+        required = pattern_def["required"]
+        for field in [
+            "id", "name", "name_de", "domain", "category",
+            "description", "objective_template", "rationale_template",
+            "requirements_template", "test_procedure_template",
+            "evidence_template", "severity_default",
+            "obligation_match_keywords", "tags",
+        ]:
+            assert field in required, f"Missing required field in schema: {field}"
+
+    def test_schema_domain_enum(self, schema):
+        # Schema enum and this file's VALID_DOMAINS must stay in sync.
+        pattern_def = schema["$defs"]["ControlPattern"]
+        domain_enum = pattern_def["properties"]["domain"]["enum"]
+        assert set(domain_enum) == set(VALID_DOMAINS)
+
+
+# =============================================================================
+# File Structure Tests
+# =============================================================================
+
+
+class TestFileStructure:
+    """Validate YAML file structure."""
+
+    def test_core_file_exists(self):
+        assert CORE_FILE.exists()
+
+    def test_it_sec_file_exists(self):
+        assert IT_SEC_FILE.exists()
+
+    def test_core_has_version(self):
+        with open(CORE_FILE) as f:
+            data = yaml.safe_load(f)
+        assert "version" in data
+        assert data["version"] == "1.0"
+
+    def test_it_sec_has_version(self):
+        with open(IT_SEC_FILE) as f:
+            data = yaml.safe_load(f)
+        assert "version" in data
+        assert data["version"] == "1.0"
+
+    def
test_core_has_description(self): + with open(CORE_FILE) as f: + data = yaml.safe_load(f) + assert "description" in data + assert len(data["description"]) > 20 + + def test_it_sec_has_description(self): + with open(IT_SEC_FILE) as f: + data = yaml.safe_load(f) + assert "description" in data + assert len(data["description"]) > 20 + + +# ============================================================================= +# Pattern Count Tests +# ============================================================================= + + +class TestPatternCounts: + """Verify expected number of patterns.""" + + def test_core_has_30_patterns(self, core_patterns): + assert len(core_patterns) == 30, ( + f"Expected 30 core patterns, got {len(core_patterns)}" + ) + + def test_it_sec_has_20_patterns(self, it_sec_patterns): + assert len(it_sec_patterns) == 20, ( + f"Expected 20 IT security patterns, got {len(it_sec_patterns)}" + ) + + def test_total_is_50(self, all_patterns): + assert len(all_patterns) == 50, ( + f"Expected 50 total patterns, got {len(all_patterns)}" + ) + + +# ============================================================================= +# Pattern ID Tests +# ============================================================================= + + +class TestPatternIDs: + """Validate pattern ID format and uniqueness.""" + + def test_all_ids_match_format(self, all_patterns): + for p in all_patterns: + assert PATTERN_ID_RE.match(p["id"]), ( + f"Invalid pattern ID format: {p['id']} (expected CP-DOMAIN-NNN)" + ) + + def test_all_ids_unique(self, all_patterns): + ids = [p["id"] for p in all_patterns] + duplicates = [id for id, count in Counter(ids).items() if count > 1] + assert not duplicates, f"Duplicate pattern IDs: {duplicates}" + + def test_all_names_unique(self, all_patterns): + names = [p["name"] for p in all_patterns] + duplicates = [n for n, count in Counter(names).items() if count > 1] + assert not duplicates, f"Duplicate pattern names: {duplicates}" + + def 
test_id_domain_matches_domain_field(self, all_patterns): + """The domain in the ID (CP-{DOMAIN}-NNN) should match the domain field.""" + for p in all_patterns: + id_domain = p["id"].split("-")[1] + assert id_domain == p["domain"], ( + f"Pattern {p['id']}: ID domain '{id_domain}' != field domain '{p['domain']}'" + ) + + def test_all_names_are_snake_case(self, all_patterns): + for p in all_patterns: + assert NAME_RE.match(p["name"]), ( + f"Pattern {p['id']}: name '{p['name']}' is not snake_case" + ) + + +# ============================================================================= +# Domain & Category Tests +# ============================================================================= + + +class TestDomainCategories: + """Validate domain and category assignments.""" + + def test_all_domains_valid(self, all_patterns): + for p in all_patterns: + assert p["domain"] in VALID_DOMAINS, ( + f"Pattern {p['id']}: invalid domain '{p['domain']}'" + ) + + def test_domain_coverage(self, all_patterns): + """At least 5 different domains should be covered.""" + domains = {p["domain"] for p in all_patterns} + assert len(domains) >= 5, ( + f"Only {len(domains)} domains covered: {domains}" + ) + + def test_all_have_category(self, all_patterns): + for p in all_patterns: + assert p.get("category"), ( + f"Pattern {p['id']}: missing category" + ) + + def test_category_not_empty(self, all_patterns): + for p in all_patterns: + assert len(p["category"]) >= 3, ( + f"Pattern {p['id']}: category too short: '{p['category']}'" + ) + + +# ============================================================================= +# Template Quality Tests +# ============================================================================= + + +class TestTemplateQuality: + """Validate template content quality.""" + + def test_description_min_length(self, all_patterns): + for p in all_patterns: + desc = p["description"].strip() + assert len(desc) >= 30, ( + f"Pattern {p['id']}: description too short ({len(desc)} 
chars)" + ) + + def test_objective_min_length(self, all_patterns): + for p in all_patterns: + obj = p["objective_template"].strip() + assert len(obj) >= 30, ( + f"Pattern {p['id']}: objective_template too short ({len(obj)} chars)" + ) + + def test_rationale_min_length(self, all_patterns): + for p in all_patterns: + rat = p["rationale_template"].strip() + assert len(rat) >= 30, ( + f"Pattern {p['id']}: rationale_template too short ({len(rat)} chars)" + ) + + def test_requirements_min_count(self, all_patterns): + for p in all_patterns: + reqs = p["requirements_template"] + assert len(reqs) >= 2, ( + f"Pattern {p['id']}: needs at least 2 requirements, got {len(reqs)}" + ) + + def test_requirements_not_empty(self, all_patterns): + for p in all_patterns: + for i, req in enumerate(p["requirements_template"]): + assert len(req.strip()) >= 10, ( + f"Pattern {p['id']}: requirement {i} too short" + ) + + def test_test_procedure_min_count(self, all_patterns): + for p in all_patterns: + tests = p["test_procedure_template"] + assert len(tests) >= 1, ( + f"Pattern {p['id']}: needs at least 1 test procedure" + ) + + def test_evidence_min_count(self, all_patterns): + for p in all_patterns: + evidence = p["evidence_template"] + assert len(evidence) >= 1, ( + f"Pattern {p['id']}: needs at least 1 evidence item" + ) + + def test_name_de_exists(self, all_patterns): + for p in all_patterns: + assert p.get("name_de"), ( + f"Pattern {p['id']}: missing German name (name_de)" + ) + assert len(p["name_de"]) >= 5, ( + f"Pattern {p['id']}: name_de too short: '{p['name_de']}'" + ) + + +# ============================================================================= +# Severity & Effort Tests +# ============================================================================= + + +class TestSeverityEffort: + """Validate severity and effort assignments.""" + + def test_all_have_valid_severity(self, all_patterns): + for p in all_patterns: + assert p["severity_default"] in VALID_SEVERITIES, ( + 
f"Pattern {p['id']}: invalid severity '{p['severity_default']}'" + ) + + def test_all_have_effort(self, all_patterns): + for p in all_patterns: + if "implementation_effort_default" in p: + assert p["implementation_effort_default"] in VALID_EFFORTS, ( + f"Pattern {p['id']}: invalid effort '{p['implementation_effort_default']}'" + ) + + def test_severity_distribution(self, all_patterns): + """At least 2 different severity levels should be used.""" + severities = {p["severity_default"] for p in all_patterns} + assert len(severities) >= 2, ( + f"Only {len(severities)} severity levels used: {severities}" + ) + + +# ============================================================================= +# Keyword Tests +# ============================================================================= + + +class TestKeywords: + """Validate obligation match keywords.""" + + def test_all_have_keywords(self, all_patterns): + for p in all_patterns: + kws = p["obligation_match_keywords"] + assert len(kws) >= 3, ( + f"Pattern {p['id']}: needs at least 3 keywords, got {len(kws)}" + ) + + def test_keywords_not_empty(self, all_patterns): + for p in all_patterns: + for kw in p["obligation_match_keywords"]: + assert len(kw.strip()) >= 2, ( + f"Pattern {p['id']}: empty or too short keyword: '{kw}'" + ) + + def test_keywords_lowercase(self, all_patterns): + for p in all_patterns: + for kw in p["obligation_match_keywords"]: + assert kw == kw.lower(), ( + f"Pattern {p['id']}: keyword should be lowercase: '{kw}'" + ) + + def test_has_german_and_english_keywords(self, all_patterns): + """Each pattern should have keywords in both languages (spot check).""" + # At minimum, keywords should have a mix (not all German, not all English) + for p in all_patterns: + kws = p["obligation_match_keywords"] + assert len(kws) >= 3, ( + f"Pattern {p['id']}: too few keywords for bilingual coverage" + ) + + +# ============================================================================= +# Tags Tests +# 
============================================================================= + + +class TestTags: + """Validate tags.""" + + def test_all_have_tags(self, all_patterns): + for p in all_patterns: + assert len(p["tags"]) >= 1, ( + f"Pattern {p['id']}: needs at least 1 tag" + ) + + def test_tags_are_strings(self, all_patterns): + for p in all_patterns: + for tag in p["tags"]: + assert isinstance(tag, str) and len(tag) >= 2, ( + f"Pattern {p['id']}: invalid tag: {tag}" + ) + + +# ============================================================================= +# Open Anchor Tests +# ============================================================================= + + +class TestOpenAnchors: + """Validate open anchor references.""" + + def test_most_have_anchors(self, all_patterns): + """At least 80% of patterns should have open anchor references.""" + with_anchors = sum( + 1 for p in all_patterns + if p.get("open_anchor_refs") and len(p["open_anchor_refs"]) >= 1 + ) + ratio = with_anchors / len(all_patterns) + assert ratio >= 0.80, ( + f"Only {with_anchors}/{len(all_patterns)} ({ratio:.0%}) patterns have " + f"open anchor references (need >= 80%)" + ) + + def test_anchor_structure(self, all_patterns): + for p in all_patterns: + for anchor in p.get("open_anchor_refs", []): + assert "framework" in anchor, ( + f"Pattern {p['id']}: anchor missing 'framework'" + ) + assert "ref" in anchor, ( + f"Pattern {p['id']}: anchor missing 'ref'" + ) + + +# ============================================================================= +# Composability Tests +# ============================================================================= + + +class TestComposability: + """Validate composable_with references.""" + + def test_composable_refs_are_valid_ids(self, all_patterns): + all_ids = {p["id"] for p in all_patterns} + for p in all_patterns: + for ref in p.get("composable_with", []): + assert PATTERN_ID_RE.match(ref), ( + f"Pattern {p['id']}: composable_with ref '{ref}' is not valid ID format" 
+ ) + assert ref in all_ids, ( + f"Pattern {p['id']}: composable_with ref '{ref}' does not exist" + ) + + def test_no_self_references(self, all_patterns): + for p in all_patterns: + composable = p.get("composable_with", []) + assert p["id"] not in composable, ( + f"Pattern {p['id']}: composable_with contains self-reference" + ) + + +# ============================================================================= +# Cross-File Consistency Tests +# ============================================================================= + + +class TestCrossFileConsistency: + """Validate consistency between core and IT security files.""" + + def test_no_id_overlap(self, core_patterns, it_sec_patterns): + core_ids = {p["id"] for p in core_patterns} + it_sec_ids = {p["id"] for p in it_sec_patterns} + overlap = core_ids & it_sec_ids + assert not overlap, f"ID overlap between files: {overlap}" + + def test_no_name_overlap(self, core_patterns, it_sec_patterns): + core_names = {p["name"] for p in core_patterns} + it_sec_names = {p["name"] for p in it_sec_patterns} + overlap = core_names & it_sec_names + assert not overlap, f"Name overlap between files: {overlap}" + + +# ============================================================================= +# Placeholder Syntax Tests +# ============================================================================= + + +class TestPlaceholderSyntax: + """Validate {placeholder:default} syntax in templates.""" + + PLACEHOLDER_RE = re.compile(r"\{(\w+)(?::([^}]+))?\}") + + def test_placeholders_have_defaults(self, all_patterns): + """All placeholders in requirements should have defaults.""" + for p in all_patterns: + for req in p["requirements_template"]: + for match in self.PLACEHOLDER_RE.finditer(req): + placeholder = match.group(1) + default = match.group(2) + # Placeholders should have defaults + assert default is not None, ( + f"Pattern {p['id']}: placeholder '{{{placeholder}}}' has no default value" + ) diff --git 
a/backend-compliance/tests/test_crosswalk_routes.py b/backend-compliance/tests/test_crosswalk_routes.py new file mode 100644 index 0000000..080ba4d --- /dev/null +++ b/backend-compliance/tests/test_crosswalk_routes.py @@ -0,0 +1,1131 @@ +"""Tests for Multi-Layer Control Architecture routes (crosswalk_routes.py). + +Covers: +- Pydantic model validation (Pattern, Obligation, Crosswalk, Migration) +- Pattern Library endpoints (list, get, get controls) +- Obligation extraction endpoint +- Crosswalk query + stats endpoints +- Migration pass endpoints (1-5) + status +- Helper: _get_pattern_control_counts +""" + +import json +import pytest +from unittest.mock import MagicMock, patch, AsyncMock +from typing import Optional + +from fastapi import FastAPI +from fastapi.testclient import TestClient + +from compliance.api.crosswalk_routes import ( + PatternResponse, + PatternListResponse, + PatternDetailResponse, + ObligationExtractRequest, + ObligationExtractResponse, + CrosswalkRow, + CrosswalkQueryResponse, + CrosswalkStatsResponse, + MigrationRequest, + MigrationResponse, + MigrationStatusResponse, + DecompositionStatusResponse, + router, + _get_pattern_control_counts, +) + + +# --------------------------------------------------------------------------- +# TestClient setup +# --------------------------------------------------------------------------- + +_app = FastAPI() +_app.include_router(router, prefix="/api/compliance") +_client = TestClient(_app) + + +# --------------------------------------------------------------------------- +# MODEL TESTS +# --------------------------------------------------------------------------- + + +class TestPatternResponse: + """Tests for PatternResponse model.""" + + def test_basic_creation(self): + resp = PatternResponse( + id="CP-AUTH-001", + name="password_policy", + name_de="Passwortrichtlinie", + domain="AUTH", + category="authentication", + description="Password policy requirements", + objective_template="Ensure passwords meet 
complexity standards.", + severity_default="high", + ) + assert resp.id == "CP-AUTH-001" + assert resp.domain == "AUTH" + assert resp.severity_default == "high" + + def test_default_values(self): + resp = PatternResponse( + id="CP-AUTH-001", + name="test", + name_de="Test", + domain="AUTH", + category="auth", + description="desc", + objective_template="obj", + severity_default="medium", + ) + assert resp.implementation_effort_default == "m" + assert resp.tags == [] + assert resp.composable_with == [] + assert resp.open_anchor_refs == [] + assert resp.controls_count == 0 + + def test_full_model(self): + resp = PatternResponse( + id="CP-CRYP-001", + name="encryption_at_rest", + name_de="Verschluesselung ruhender Daten", + domain="CRYP", + category="encryption", + description="Encrypt data at rest", + objective_template="Ensure all stored data is encrypted.", + severity_default="critical", + implementation_effort_default="l", + tags=["encryption", "storage"], + composable_with=["CP-CRYP-002"], + open_anchor_refs=[{"framework": "NIST", "ref": "SC-28"}], + controls_count=42, + ) + assert resp.controls_count == 42 + assert len(resp.tags) == 2 + assert resp.implementation_effort_default == "l" + + +class TestPatternDetailResponse: + """Tests for PatternDetailResponse model (extends PatternResponse).""" + + def test_has_extended_fields(self): + resp = PatternDetailResponse( + id="CP-AUTH-001", + name="mfa", + name_de="MFA", + domain="AUTH", + category="authentication", + description="Multi-factor authentication", + objective_template="Require MFA.", + severity_default="high", + rationale_template="Passwords alone are insufficient.", + requirements_template=["Require TOTP or hardware key"], + test_procedure_template=["Test login without MFA"], + evidence_template=["MFA config screenshot"], + obligation_match_keywords=["authentifizierung", "mfa"], + ) + assert resp.rationale_template == "Passwords alone are insufficient." 
+ assert len(resp.requirements_template) == 1 + assert len(resp.obligation_match_keywords) == 2 + + def test_defaults(self): + resp = PatternDetailResponse( + id="CP-AUTH-001", + name="test", + name_de="Test", + domain="AUTH", + category="auth", + description="desc", + objective_template="obj", + severity_default="medium", + ) + assert resp.rationale_template == "" + assert resp.requirements_template == [] + assert resp.test_procedure_template == [] + assert resp.evidence_template == [] + assert resp.obligation_match_keywords == [] + + +class TestObligationModels: + """Tests for ObligationExtractRequest/Response models.""" + + def test_request_minimal(self): + req = ObligationExtractRequest(text="Ein Verantwortlicher muss...") + assert req.text == "Ein Verantwortlicher muss..." + assert req.regulation_code is None + assert req.article is None + assert req.paragraph is None + + def test_request_full(self): + req = ObligationExtractRequest( + text="Art. 32 DSGVO", + regulation_code="eu_2016_679", + article="Art. 32", + paragraph="Abs. 1", + ) + assert req.regulation_code == "eu_2016_679" + assert req.paragraph == "Abs. 
1" + + def test_response_defaults(self): + resp = ObligationExtractResponse() + assert resp.obligation_id is None + assert resp.method == "none" + assert resp.confidence == 0.0 + assert resp.pattern_id is None + assert resp.pattern_confidence == 0.0 + + def test_response_full(self): + resp = ObligationExtractResponse( + obligation_id="DSGVO-OBL-001", + obligation_title="Verzeichnis der Verarbeitungstaetigkeiten", + obligation_text="Der Verantwortliche muss...", + method="exact_match", + confidence=1.0, + regulation_id="dsgvo", + pattern_id="CP-GOV-001", + pattern_confidence=0.85, + ) + assert resp.obligation_id == "DSGVO-OBL-001" + assert resp.method == "exact_match" + assert resp.confidence == 1.0 + assert resp.pattern_confidence == 0.85 + + +class TestCrosswalkModels: + """Tests for CrosswalkRow and CrosswalkQueryResponse models.""" + + def test_row_defaults(self): + row = CrosswalkRow() + assert row.regulation_code == "" + assert row.article is None + assert row.obligation_id is None + assert row.confidence == 0.0 + assert row.source == "auto" + + def test_row_full(self): + row = CrosswalkRow( + regulation_code="eu_2016_679", + article="Art. 
32", + obligation_id="DSGVO-OBL-002", + pattern_id="CP-CRYP-001", + master_control_id="CRYP-001", + confidence=0.95, + source="manual", + ) + assert row.regulation_code == "eu_2016_679" + assert row.confidence == 0.95 + + def test_query_response(self): + resp = CrosswalkQueryResponse( + rows=[ + CrosswalkRow(regulation_code="eu_2016_679"), + CrosswalkRow(regulation_code="eu_2022_2554"), + ], + total=100, + ) + assert len(resp.rows) == 2 + assert resp.total == 100 + + +class TestCrosswalkStatsResponse: + """Tests for CrosswalkStatsResponse model.""" + + def test_defaults(self): + resp = CrosswalkStatsResponse() + assert resp.total_rows == 0 + assert resp.regulations_covered == 0 + assert resp.obligations_linked == 0 + assert resp.patterns_used == 0 + assert resp.controls_linked == 0 + assert resp.coverage_by_regulation == {} + + def test_full(self): + resp = CrosswalkStatsResponse( + total_rows=500, + regulations_covered=9, + obligations_linked=200, + patterns_used=45, + controls_linked=350, + coverage_by_regulation={"eu_2016_679": 150, "eu_2022_2554": 80}, + ) + assert resp.total_rows == 500 + assert resp.coverage_by_regulation["eu_2016_679"] == 150 + + +class TestMigrationModels: + """Tests for MigrationRequest/Response/Status models.""" + + def test_request_default(self): + req = MigrationRequest() + assert req.limit == 0 + + def test_request_with_limit(self): + req = MigrationRequest(limit=100) + assert req.limit == 100 + + def test_response_defaults(self): + resp = MigrationResponse() + assert resp.status == "completed" + assert resp.stats == {} + + def test_status_defaults(self): + s = MigrationStatusResponse() + assert s.total_controls == 0 + assert s.coverage_obligation_pct == 0.0 + assert s.coverage_pattern_pct == 0.0 + assert s.coverage_full_pct == 0.0 + + +# --------------------------------------------------------------------------- +# HELPER TESTS +# --------------------------------------------------------------------------- + + +class 
TestGetPatternControlCounts: + """Tests for _get_pattern_control_counts helper.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_returns_counts(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + mock_result = MagicMock() + mock_result.fetchall.return_value = [ + ("CP-AUTH-001", 15), + ("CP-CRYP-001", 8), + ] + mock_db.execute.return_value = mock_result + + counts = _get_pattern_control_counts() + assert counts == {"CP-AUTH-001": 15, "CP-CRYP-001": 8} + mock_db.close.assert_called_once() + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_returns_empty_on_error(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + mock_db.execute.side_effect = Exception("DB down") + + counts = _get_pattern_control_counts() + assert counts == {} + mock_db.close.assert_called_once() + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_returns_empty_when_no_patterns(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + mock_result = MagicMock() + mock_result.fetchall.return_value = [] + mock_db.execute.return_value = mock_result + + counts = _get_pattern_control_counts() + assert counts == {} + + +# --------------------------------------------------------------------------- +# PATTERN LIBRARY ENDPOINT TESTS +# --------------------------------------------------------------------------- + + +class _FakePattern: + """Minimal pattern stub for list_patterns / get_pattern tests.""" + + def __init__(self, id, name="test", name_de="Test", domain="AUTH", + category="auth", description="desc", + objective_template="obj", severity_default="medium", + implementation_effort_default="m", tags=None, + composable_with=None, open_anchor_refs=None, + rationale_template="", requirements_template=None, + test_procedure_template=None, evidence_template=None, + obligation_match_keywords=None): + self.id = id + self.name 
= name + self.name_de = name_de + self.domain = domain + self.category = category + self.description = description + self.objective_template = objective_template + self.severity_default = severity_default + self.implementation_effort_default = implementation_effort_default + self.tags = tags or [] + self.composable_with = composable_with or [] + self.open_anchor_refs = open_anchor_refs or [] + self.rationale_template = rationale_template + self.requirements_template = requirements_template or [] + self.test_procedure_template = test_procedure_template or [] + self.evidence_template = evidence_template or [] + self.obligation_match_keywords = obligation_match_keywords or [] + + +class TestListPatternsEndpoint: + """Tests for GET /patterns.""" + + @patch("compliance.api.crosswalk_routes._get_pattern_control_counts") + @patch("compliance.api.crosswalk_routes.PatternMatcher", create=True) + def test_list_all_patterns(self, mock_matcher_import, mock_counts): + """Patch the PatternMatcher class used inside the endpoint.""" + fake_patterns = [ + _FakePattern("CP-AUTH-001", domain="AUTH", tags=["auth"]), + _FakePattern("CP-CRYP-001", domain="CRYP", tags=["encryption"]), + ] + mock_counts.return_value = {"CP-AUTH-001": 5} + + with patch("compliance.services.pattern_matcher.PatternMatcher") as MockPM: + instance = MagicMock() + instance._patterns = fake_patterns + MockPM.return_value = instance + + resp = _client.get("/api/compliance/v1/canonical/patterns") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 2 + assert len(data["patterns"]) == 2 + assert data["patterns"][0]["id"] == "CP-AUTH-001" + assert data["patterns"][0]["controls_count"] == 5 + assert data["patterns"][1]["controls_count"] == 0 + + @patch("compliance.api.crosswalk_routes._get_pattern_control_counts") + def test_filter_by_domain(self, mock_counts): + fake_patterns = [ + _FakePattern("CP-AUTH-001", domain="AUTH"), + _FakePattern("CP-CRYP-001", domain="CRYP"), + ] + 
mock_counts.return_value = {} + + with patch("compliance.services.pattern_matcher.PatternMatcher") as MockPM: + instance = MagicMock() + instance._patterns = fake_patterns + MockPM.return_value = instance + + resp = _client.get("/api/compliance/v1/canonical/patterns?domain=auth") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 1 + assert data["patterns"][0]["id"] == "CP-AUTH-001" + + @patch("compliance.api.crosswalk_routes._get_pattern_control_counts") + def test_filter_by_category(self, mock_counts): + fake_patterns = [ + _FakePattern("CP-AUTH-001", category="authentication"), + _FakePattern("CP-CRYP-001", category="encryption"), + ] + mock_counts.return_value = {} + + with patch("compliance.services.pattern_matcher.PatternMatcher") as MockPM: + instance = MagicMock() + instance._patterns = fake_patterns + MockPM.return_value = instance + + resp = _client.get("/api/compliance/v1/canonical/patterns?category=encryption") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 1 + assert data["patterns"][0]["id"] == "CP-CRYP-001" + + @patch("compliance.api.crosswalk_routes._get_pattern_control_counts") + def test_filter_by_tag(self, mock_counts): + fake_patterns = [ + _FakePattern("CP-AUTH-001", tags=["auth", "password"]), + _FakePattern("CP-CRYP-001", tags=["encryption"]), + ] + mock_counts.return_value = {} + + with patch("compliance.services.pattern_matcher.PatternMatcher") as MockPM: + instance = MagicMock() + instance._patterns = fake_patterns + MockPM.return_value = instance + + resp = _client.get("/api/compliance/v1/canonical/patterns?tag=password") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 1 + assert data["patterns"][0]["id"] == "CP-AUTH-001" + + +class TestGetPatternEndpoint: + """Tests for GET /patterns/{pattern_id}.""" + + @patch("compliance.api.crosswalk_routes._get_pattern_control_counts") + def test_get_existing_pattern(self, mock_counts): + fake = 
_FakePattern( + "CP-AUTH-001", + name="password_policy", + rationale_template="Weak passwords are risky.", + requirements_template=["Min 12 chars"], + obligation_match_keywords=["passwort"], + ) + mock_counts.return_value = {"CP-AUTH-001": 10} + + with patch("compliance.services.pattern_matcher.PatternMatcher") as MockPM: + instance = MagicMock() + instance.get_pattern.return_value = fake + MockPM.return_value = instance + + resp = _client.get("/api/compliance/v1/canonical/patterns/CP-AUTH-001") + assert resp.status_code == 200 + data = resp.json() + assert data["id"] == "CP-AUTH-001" + assert data["rationale_template"] == "Weak passwords are risky." + assert data["obligation_match_keywords"] == ["passwort"] + assert data["controls_count"] == 10 + + @patch("compliance.api.crosswalk_routes._get_pattern_control_counts") + def test_get_nonexistent_pattern(self, mock_counts): + mock_counts.return_value = {} + + with patch("compliance.services.pattern_matcher.PatternMatcher") as MockPM: + instance = MagicMock() + instance.get_pattern.return_value = None + MockPM.return_value = instance + + resp = _client.get("/api/compliance/v1/canonical/patterns/CP-FAKE-999") + assert resp.status_code == 404 + assert "not found" in resp.json()["detail"].lower() + + +class TestGetPatternControlsEndpoint: + """Tests for GET /patterns/{pattern_id}/controls.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_returns_controls(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + # Main query result + mock_result = MagicMock() + mock_result.fetchall.return_value = [ + ("uuid-1", "AUTH-001", "MFA", "Require MFA", "high", "draft", "authentication", '["DSGVO-OBL-001"]'), + ("uuid-2", "AUTH-002", "SSO", "Implement SSO", "medium", "draft", "authentication", None), + ] + + # Count query result + mock_count = MagicMock() + mock_count.fetchone.return_value = (2,) + + mock_db.execute.side_effect = [mock_result, mock_count] + + resp = 
_client.get("/api/compliance/v1/canonical/patterns/CP-AUTH-001/controls") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 2 + assert len(data["controls"]) == 2 + assert data["controls"][0]["control_id"] == "AUTH-001" + assert data["controls"][0]["obligation_ids"] == ["DSGVO-OBL-001"] + assert data["controls"][1]["obligation_ids"] == [] + mock_db.close.assert_called_once() + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_pagination(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_result = MagicMock() + mock_result.fetchall.return_value = [] + mock_count = MagicMock() + mock_count.fetchone.return_value = (0,) + mock_db.execute.side_effect = [mock_result, mock_count] + + resp = _client.get("/api/compliance/v1/canonical/patterns/CP-AUTH-001/controls?limit=10&offset=20") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 0 + assert data["controls"] == [] + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_json_parse_error_in_obligation_ids(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_result = MagicMock() + mock_result.fetchall.return_value = [ + ("uuid-1", "AUTH-001", "MFA", "obj", "high", "draft", "auth", "not-valid-json"), + ] + mock_count = MagicMock() + mock_count.fetchone.return_value = (1,) + mock_db.execute.side_effect = [mock_result, mock_count] + + resp = _client.get("/api/compliance/v1/canonical/patterns/CP-AUTH-001/controls") + assert resp.status_code == 200 + data = resp.json() + assert data["controls"][0]["obligation_ids"] == [] + + +# --------------------------------------------------------------------------- +# OBLIGATION EXTRACTION ENDPOINT TESTS +# --------------------------------------------------------------------------- + + +class TestExtractObligationEndpoint: + """Tests for POST /obligations/extract.""" + + 
@patch("compliance.services.pattern_matcher.PatternMatcher") + @patch("compliance.services.obligation_extractor.ObligationExtractor") + def test_extract_with_pattern_match(self, MockExtractor, MockMatcher): + # Mock extractor + mock_ext = AsyncMock() + MockExtractor.return_value = mock_ext + mock_obligation = MagicMock() + mock_obligation.obligation_id = "DSGVO-OBL-001" + mock_obligation.obligation_title = "VVT" + mock_obligation.obligation_text = "Der Verantwortliche muss..." + mock_obligation.method = "exact_match" + mock_obligation.confidence = 1.0 + mock_obligation.regulation_id = "dsgvo" + mock_ext.extract.return_value = mock_obligation + + # Mock matcher + mock_pm = MagicMock() + MockMatcher.return_value = mock_pm + mock_pattern_result = MagicMock() + mock_pattern_result.pattern_id = "CP-GOV-001" + mock_pattern_result.confidence = 0.85 + mock_pm._tier1_keyword.return_value = mock_pattern_result + + resp = _client.post( + "/api/compliance/v1/canonical/obligations/extract", + json={"text": "Art. 
30 DSGVO Verzeichnis", "regulation_code": "eu_2016_679"}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["obligation_id"] == "DSGVO-OBL-001" + assert data["method"] == "exact_match" + assert data["pattern_id"] == "CP-GOV-001" + assert data["pattern_confidence"] == 0.85 + + @patch("compliance.services.pattern_matcher.PatternMatcher") + @patch("compliance.services.obligation_extractor.ObligationExtractor") + def test_extract_no_pattern_match(self, MockExtractor, MockMatcher): + mock_ext = AsyncMock() + MockExtractor.return_value = mock_ext + mock_obligation = MagicMock() + mock_obligation.obligation_id = None + mock_obligation.obligation_title = None + mock_obligation.obligation_text = "Some text" + mock_obligation.method = "llm_extracted" + mock_obligation.confidence = 0.6 + mock_obligation.regulation_id = None + mock_ext.extract.return_value = mock_obligation + + mock_pm = MagicMock() + MockMatcher.return_value = mock_pm + mock_pm._tier1_keyword.return_value = None + + resp = _client.post( + "/api/compliance/v1/canonical/obligations/extract", + json={"text": "Some random regulation text"}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["obligation_id"] is None + assert data["pattern_id"] is None + assert data["pattern_confidence"] == 0.0 + + +# --------------------------------------------------------------------------- +# CROSSWALK ENDPOINT TESTS +# --------------------------------------------------------------------------- + + +class TestQueryCrosswalkEndpoint: + """Tests for GET /crosswalk.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_query_no_filters(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_result = MagicMock() + mock_result.fetchall.return_value = [ + ("eu_2016_679", "Art. 
32", "DSGVO-OBL-002", "CP-CRYP-001", "CRYP-001", 0.95, "auto"), + ] + mock_count = MagicMock() + mock_count.fetchone.return_value = (1,) + mock_db.execute.side_effect = [mock_result, mock_count] + + resp = _client.get("/api/compliance/v1/canonical/crosswalk") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 1 + assert data["rows"][0]["regulation_code"] == "eu_2016_679" + assert data["rows"][0]["confidence"] == 0.95 + mock_db.close.assert_called_once() + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_query_with_regulation_filter(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_result = MagicMock() + mock_result.fetchall.return_value = [] + mock_count = MagicMock() + mock_count.fetchone.return_value = (0,) + mock_db.execute.side_effect = [mock_result, mock_count] + + resp = _client.get("/api/compliance/v1/canonical/crosswalk?regulation_code=eu_2016_679") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 0 + + # Verify SQL contained regulation filter + call_args = mock_db.execute.call_args_list[0] + sql_text = call_args[0][0].text + assert "regulation_code = :reg" in sql_text + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_query_with_all_filters(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_result = MagicMock() + mock_result.fetchall.return_value = [] + mock_count = MagicMock() + mock_count.fetchone.return_value = (0,) + mock_db.execute.side_effect = [mock_result, mock_count] + + resp = _client.get( + "/api/compliance/v1/canonical/crosswalk" + "?regulation_code=eu_2016_679" + "&article=Art.%2032" + "&obligation_id=DSGVO-OBL-002" + "&pattern_id=CP-CRYP-001" + ) + assert resp.status_code == 200 + + # Verify params were passed + call_args = mock_db.execute.call_args_list[0] + params = call_args[0][1] + assert params["reg"] == "eu_2016_679" + assert 
params["art"] == "Art. 32" + assert params["obl"] == "DSGVO-OBL-002" + assert params["pat"] == "CP-CRYP-001" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_query_null_values_handled(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_result = MagicMock() + mock_result.fetchall.return_value = [ + ("eu_2016_679", None, None, None, None, None, None), + ] + mock_count = MagicMock() + mock_count.fetchone.return_value = (1,) + mock_db.execute.side_effect = [mock_result, mock_count] + + resp = _client.get("/api/compliance/v1/canonical/crosswalk") + assert resp.status_code == 200 + data = resp.json() + row = data["rows"][0] + assert row["confidence"] == 0.0 + assert row["source"] == "auto" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_query_pagination(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_result = MagicMock() + mock_result.fetchall.return_value = [] + mock_count = MagicMock() + mock_count.fetchone.return_value = (500,) + mock_db.execute.side_effect = [mock_result, mock_count] + + resp = _client.get("/api/compliance/v1/canonical/crosswalk?limit=50&offset=100") + assert resp.status_code == 200 + data = resp.json() + assert data["total"] == 500 + + call_args = mock_db.execute.call_args_list[0] + params = call_args[0][1] + assert params["limit"] == 50 + assert params["offset"] == 100 + + +class TestCrosswalkStatsEndpoint: + """Tests for GET /crosswalk/stats.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_returns_stats(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + # Main stats query + mock_main = MagicMock() + mock_main.fetchone.return_value = (500, 9, 200, 45, 350) + + # Coverage by regulation query + mock_reg = MagicMock() + mock_reg.fetchall.return_value = [ + ("eu_2016_679", 150), + ("eu_2022_2554", 80), + ] + + 
mock_db.execute.side_effect = [mock_main, mock_reg] + + resp = _client.get("/api/compliance/v1/canonical/crosswalk/stats") + assert resp.status_code == 200 + data = resp.json() + assert data["total_rows"] == 500 + assert data["regulations_covered"] == 9 + assert data["obligations_linked"] == 200 + assert data["patterns_used"] == 45 + assert data["controls_linked"] == 350 + assert data["coverage_by_regulation"]["eu_2016_679"] == 150 + mock_db.close.assert_called_once() + + @patch("compliance.api.crosswalk_routes.SessionLocal") + def test_empty_stats(self, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_main = MagicMock() + mock_main.fetchone.return_value = (0, 0, 0, 0, 0) + mock_reg = MagicMock() + mock_reg.fetchall.return_value = [] + + mock_db.execute.side_effect = [mock_main, mock_reg] + + resp = _client.get("/api/compliance/v1/canonical/crosswalk/stats") + assert resp.status_code == 200 + data = resp.json() + assert data["total_rows"] == 0 + assert data["coverage_by_regulation"] == {} + + +# --------------------------------------------------------------------------- +# MIGRATION ENDPOINT TESTS +# --------------------------------------------------------------------------- + + +class TestMigratePass1Endpoint: + """Tests for POST /migrate/link-obligations.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_pass1_success(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = AsyncMock() + MockMigration.return_value = mock_mig + mock_mig.run_pass1_obligation_linkage.return_value = { + "processed": 100, "linked": 60, "skipped": 40, + } + + resp = _client.post( + "/api/compliance/v1/canonical/migrate/link-obligations", + json={"limit": 100}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["status"] == "completed" + assert 
data["stats"]["linked"] == 60 + mock_db.close.assert_called_once() + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_pass1_failure(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = AsyncMock() + MockMigration.return_value = mock_mig + mock_mig.run_pass1_obligation_linkage.side_effect = RuntimeError("DB connection lost") + + resp = _client.post( + "/api/compliance/v1/canonical/migrate/link-obligations", + json={}, + ) + assert resp.status_code == 500 + assert "DB connection lost" in resp.json()["detail"] + mock_db.close.assert_called_once() + + +class TestMigratePass2Endpoint: + """Tests for POST /migrate/classify-patterns.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_pass2_success(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = AsyncMock() + MockMigration.return_value = mock_mig + mock_mig.run_pass2_pattern_classification.return_value = { + "processed": 200, "classified": 140, "candidates": 30, "unmatched": 30, + } + + resp = _client.post( + "/api/compliance/v1/canonical/migrate/classify-patterns", + json={"limit": 200}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["stats"]["classified"] == 140 + + +class TestMigratePass3Endpoint: + """Tests for POST /migrate/triage.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_pass3_success(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = MagicMock() + MockMigration.return_value = mock_mig + mock_mig.run_pass3_quality_triage.return_value = { + "review": 60, "needs_obligation": 20, "needs_pattern": 15, 
"legacy_unlinked": 5, + } + + resp = _client.post("/api/compliance/v1/canonical/migrate/triage") + assert resp.status_code == 200 + data = resp.json() + assert data["stats"]["review"] == 60 + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_pass3_failure(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = MagicMock() + MockMigration.return_value = mock_mig + mock_mig.run_pass3_quality_triage.side_effect = Exception("triage error") + + resp = _client.post("/api/compliance/v1/canonical/migrate/triage") + assert resp.status_code == 500 + + +class TestMigratePass4Endpoint: + """Tests for POST /migrate/backfill-crosswalk.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_pass4_success(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = MagicMock() + MockMigration.return_value = mock_mig + mock_mig.run_pass4_crosswalk_backfill.return_value = { + "rows_created": 250, + } + + resp = _client.post("/api/compliance/v1/canonical/migrate/backfill-crosswalk") + assert resp.status_code == 200 + data = resp.json() + assert data["stats"]["rows_created"] == 250 + + +class TestMigratePass5Endpoint: + """Tests for POST /migrate/deduplicate.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_pass5_success(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = MagicMock() + MockMigration.return_value = mock_mig + mock_mig.run_pass5_deduplication.return_value = { + "groups_found": 80, "deprecated": 120, "kept": 80, + } + + resp = _client.post("/api/compliance/v1/canonical/migrate/deduplicate") + assert resp.status_code == 
200 + data = resp.json() + assert data["stats"]["deprecated"] == 120 + + +class TestMigrationStatusEndpoint: + """Tests for GET /migrate/status.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_status_success(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = MagicMock() + MockMigration.return_value = mock_mig + mock_mig.migration_status.return_value = { + "total_controls": 4800, + "has_obligation": 2880, + "has_pattern": 3360, + "fully_linked": 2400, + "deprecated": 1200, + "coverage_obligation_pct": 60.0, + "coverage_pattern_pct": 70.0, + "coverage_full_pct": 50.0, + } + + resp = _client.get("/api/compliance/v1/canonical/migrate/status") + assert resp.status_code == 200 + data = resp.json() + assert data["total_controls"] == 4800 + assert data["coverage_full_pct"] == 50.0 + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.pipeline_adapter.MigrationPasses") + def test_status_failure(self, MockMigration, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_mig = MagicMock() + MockMigration.return_value = mock_mig + mock_mig.migration_status.side_effect = Exception("DB error") + + resp = _client.get("/api/compliance/v1/canonical/migrate/status") + assert resp.status_code == 500 + + +# --------------------------------------------------------------------------- +# DECOMPOSITION ENDPOINT TESTS (Pass 0a / 0b) +# --------------------------------------------------------------------------- + + +class TestDecompositionStatusModel: + """Tests for DecompositionStatusResponse model.""" + + def test_defaults(self): + s = DecompositionStatusResponse() + assert s.rich_controls == 0 + assert s.decomposition_pct == 0.0 + assert s.composition_pct == 0.0 + + def test_full(self): + s = DecompositionStatusResponse( + rich_controls=5000, + 
decomposed_controls=1000, + total_candidates=3000, + validated=2500, + rejected=200, + composed=2000, + atomic_controls=1800, + decomposition_pct=20.0, + composition_pct=80.0, + ) + assert s.rich_controls == 5000 + assert s.atomic_controls == 1800 + + +class TestMigrateDecomposeEndpoint: + """Tests for POST /migrate/decompose (Pass 0a).""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.decomposition_pass.DecompositionPass") + def test_pass0a_success(self, MockDecomp, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_decomp = AsyncMock() + MockDecomp.return_value = mock_decomp + mock_decomp.run_pass0a.return_value = { + "controls_processed": 50, + "obligations_extracted": 180, + "obligations_validated": 160, + "obligations_rejected": 20, + "controls_skipped_empty": 5, + "errors": 0, + } + + resp = _client.post( + "/api/compliance/v1/canonical/migrate/decompose", + json={"limit": 50}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["status"] == "completed" + assert data["stats"]["obligations_extracted"] == 180 + mock_db.close.assert_called_once() + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.decomposition_pass.DecompositionPass") + def test_pass0a_failure(self, MockDecomp, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_decomp = AsyncMock() + MockDecomp.return_value = mock_decomp + mock_decomp.run_pass0a.side_effect = RuntimeError("LLM timeout") + + resp = _client.post( + "/api/compliance/v1/canonical/migrate/decompose", + json={}, + ) + assert resp.status_code == 500 + assert "LLM timeout" in resp.json()["detail"] + + +class TestMigrateComposeAtomicEndpoint: + """Tests for POST /migrate/compose-atomic (Pass 0b).""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.decomposition_pass.DecompositionPass") + def 
test_pass0b_success(self, MockDecomp, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_decomp = AsyncMock() + MockDecomp.return_value = mock_decomp + mock_decomp.run_pass0b.return_value = { + "candidates_processed": 160, + "controls_created": 155, + "llm_failures": 5, + "errors": 0, + } + + resp = _client.post( + "/api/compliance/v1/canonical/migrate/compose-atomic", + json={"limit": 200}, + ) + assert resp.status_code == 200 + data = resp.json() + assert data["stats"]["controls_created"] == 155 + + +class TestDecompositionStatusEndpoint: + """Tests for GET /migrate/decomposition-status.""" + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.decomposition_pass.DecompositionPass") + def test_status_success(self, MockDecomp, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_decomp = MagicMock() + MockDecomp.return_value = mock_decomp + mock_decomp.decomposition_status.return_value = { + "rich_controls": 5000, + "decomposed_controls": 1000, + "total_candidates": 3000, + "validated": 2500, + "rejected": 200, + "composed": 2000, + "atomic_controls": 1800, + "decomposition_pct": 20.0, + "composition_pct": 80.0, + } + + resp = _client.get("/api/compliance/v1/canonical/migrate/decomposition-status") + assert resp.status_code == 200 + data = resp.json() + assert data["rich_controls"] == 5000 + assert data["atomic_controls"] == 1800 + assert data["decomposition_pct"] == 20.0 + + @patch("compliance.api.crosswalk_routes.SessionLocal") + @patch("compliance.services.decomposition_pass.DecompositionPass") + def test_status_failure(self, MockDecomp, mock_session_class): + mock_db = MagicMock() + mock_session_class.return_value = mock_db + + mock_decomp = MagicMock() + MockDecomp.return_value = mock_decomp + mock_decomp.decomposition_status.side_effect = Exception("DB error") + + resp = 
_client.get("/api/compliance/v1/canonical/migrate/decomposition-status") + assert resp.status_code == 500 diff --git a/backend-compliance/tests/test_decomposition_pass.py b/backend-compliance/tests/test_decomposition_pass.py new file mode 100644 index 0000000..2aee63f --- /dev/null +++ b/backend-compliance/tests/test_decomposition_pass.py @@ -0,0 +1,816 @@ +"""Tests for Decomposition Pass (Pass 0a + 0b). + +Covers: +- ObligationCandidate / AtomicControlCandidate dataclasses +- Normative signal detection (regex patterns) +- Quality Gate (all 6 checks) +- passes_quality_gate logic +- _compute_extraction_confidence +- _parse_json_array / _parse_json_object +- _format_field / _format_citation +- _normalize_severity +- _template_fallback +- _build_pass0a_prompt / _build_pass0b_prompt +- DecompositionPass.run_pass0a (mocked LLM + DB) +- DecompositionPass.run_pass0b (mocked LLM + DB) +- DecompositionPass.decomposition_status (mocked DB) +""" + +import json +import pytest +from unittest.mock import MagicMock, patch, AsyncMock + +from compliance.services.decomposition_pass import ( + ObligationCandidate, + AtomicControlCandidate, + quality_gate, + passes_quality_gate, + _NORMATIVE_RE, + _RATIONALE_RE, + _TEST_RE, + _REPORTING_RE, + _parse_json_array, + _parse_json_object, + _ensure_list, + _format_field, + _format_citation, + _compute_extraction_confidence, + _normalize_severity, + _template_fallback, + _build_pass0a_prompt, + _build_pass0b_prompt, + _PASS0A_SYSTEM_PROMPT, + _PASS0B_SYSTEM_PROMPT, + DecompositionPass, +) + + +# --------------------------------------------------------------------------- +# DATACLASS TESTS +# --------------------------------------------------------------------------- + + +class TestObligationCandidate: + """Tests for ObligationCandidate dataclass.""" + + def test_defaults(self): + oc = ObligationCandidate() + assert oc.candidate_id == "" + assert oc.normative_strength == "must" + assert oc.is_test_obligation is False + assert oc.release_state 
== "extracted" + assert oc.quality_flags == {} + + def test_to_dict(self): + oc = ObligationCandidate( + candidate_id="OC-001-01", + parent_control_uuid="uuid-1", + obligation_text="Betreiber müssen MFA implementieren", + action="implementieren", + object_="MFA", + ) + d = oc.to_dict() + assert d["candidate_id"] == "OC-001-01" + assert d["object"] == "MFA" + assert "object_" not in d # should be "object" in dict + + def test_full_creation(self): + oc = ObligationCandidate( + candidate_id="OC-MICA-0001-01", + parent_control_uuid="uuid-abc", + obligation_text="Betreiber müssen Kontinuität sicherstellen", + action="sicherstellen", + object_="Dienstleistungskontinuität", + condition="bei Ausfall des Handelssystems", + normative_strength="must", + is_test_obligation=False, + is_reporting_obligation=False, + extraction_confidence=0.90, + ) + assert oc.condition == "bei Ausfall des Handelssystems" + assert oc.extraction_confidence == 0.90 + + +class TestAtomicControlCandidate: + """Tests for AtomicControlCandidate dataclass.""" + + def test_defaults(self): + ac = AtomicControlCandidate() + assert ac.severity == "medium" + assert ac.requirements == [] + assert ac.test_procedure == [] + + def test_to_dict(self): + ac = AtomicControlCandidate( + candidate_id="AC-FIN-001", + title="Service Continuity Mechanism", + objective="Ensure continuity upon failure.", + requirements=["Failover mechanism"], + ) + d = ac.to_dict() + assert d["title"] == "Service Continuity Mechanism" + assert len(d["requirements"]) == 1 + + +# --------------------------------------------------------------------------- +# NORMATIVE SIGNAL DETECTION TESTS +# --------------------------------------------------------------------------- + + +class TestNormativeSignals: + """Tests for normative regex patterns.""" + + def test_muessen_detected(self): + assert _NORMATIVE_RE.search("Betreiber müssen sicherstellen") + + def test_muss_detected(self): + assert _NORMATIVE_RE.search("Das System muss implementiert 
sein") + + def test_hat_sicherzustellen(self): + assert _NORMATIVE_RE.search("Der Verantwortliche hat sicherzustellen") + + def test_sind_verpflichtet(self): + assert _NORMATIVE_RE.search("Anbieter sind verpflichtet zu melden") + + def test_ist_zu_dokumentieren(self): + assert _NORMATIVE_RE.search("Der Vorfall ist zu dokumentieren") + + def test_shall(self): + assert _NORMATIVE_RE.search("The operator shall implement MFA") + + def test_no_signal(self): + assert not _NORMATIVE_RE.search("Die Sonne scheint heute") + + def test_rationale_detected(self): + assert _RATIONALE_RE.search("da schwache Passwörter Risiken bergen") + + def test_test_signal_detected(self): + assert _TEST_RE.search("regelmäßige Tests der Wirksamkeit") + + def test_reporting_signal_detected(self): + assert _REPORTING_RE.search("Behörden sind zu unterrichten") + + +# --------------------------------------------------------------------------- +# QUALITY GATE TESTS +# --------------------------------------------------------------------------- + + +class TestQualityGate: + """Tests for quality_gate function.""" + + def test_valid_normative_obligation(self): + oc = ObligationCandidate( + parent_control_uuid="uuid-1", + obligation_text="Betreiber müssen Verschlüsselung implementieren", + ) + flags = quality_gate(oc) + assert flags["has_normative_signal"] is True + assert flags["not_evidence_only"] is True + assert flags["min_length"] is True + assert flags["has_parent_link"] is True + + def test_rationale_detected(self): + oc = ObligationCandidate( + parent_control_uuid="uuid-1", + obligation_text="Schwache Passwörter können zu Risiken führen, weil sie leicht zu erraten sind", + ) + flags = quality_gate(oc) + assert flags["not_rationale"] is False + + def test_evidence_only_rejected(self): + oc = ObligationCandidate( + parent_control_uuid="uuid-1", + obligation_text="Screenshot der Konfiguration", + ) + flags = quality_gate(oc) + assert flags["not_evidence_only"] is False + + def 
test_too_short_rejected(self): + oc = ObligationCandidate( + parent_control_uuid="uuid-1", + obligation_text="MFA", + ) + flags = quality_gate(oc) + assert flags["min_length"] is False + + def test_no_parent_link(self): + oc = ObligationCandidate( + parent_control_uuid="", + obligation_text="Betreiber müssen MFA implementieren", + ) + flags = quality_gate(oc) + assert flags["has_parent_link"] is False + + def test_multi_verb_detected(self): + oc = ObligationCandidate( + parent_control_uuid="uuid-1", + obligation_text="Betreiber müssen implementieren und dokumentieren sowie regelmäßig testen", + ) + flags = quality_gate(oc) + assert flags["single_action"] is False + + def test_single_verb_passes(self): + oc = ObligationCandidate( + parent_control_uuid="uuid-1", + obligation_text="Betreiber müssen MFA für alle privilegierten Konten implementieren", + ) + flags = quality_gate(oc) + assert flags["single_action"] is True + + def test_no_normative_signal(self): + oc = ObligationCandidate( + parent_control_uuid="uuid-1", + obligation_text="Ein DR-Plan beschreibt die Wiederherstellungsprozeduren im Detail", + ) + flags = quality_gate(oc) + assert flags["has_normative_signal"] is False + + +class TestPassesQualityGate: + """Tests for passes_quality_gate function.""" + + def test_all_critical_pass(self): + flags = { + "has_normative_signal": True, + "single_action": True, + "not_rationale": True, + "not_evidence_only": True, + "min_length": True, + "has_parent_link": True, + } + assert passes_quality_gate(flags) is True + + def test_no_normative_signal_fails(self): + flags = { + "has_normative_signal": False, + "single_action": True, + "not_rationale": True, + "not_evidence_only": True, + "min_length": True, + "has_parent_link": True, + } + assert passes_quality_gate(flags) is False + + def test_evidence_only_fails(self): + flags = { + "has_normative_signal": True, + "single_action": True, + "not_rationale": True, + "not_evidence_only": False, + "min_length": True, + 
"has_parent_link": True, + } + assert passes_quality_gate(flags) is False + + def test_non_critical_dont_block(self): + """single_action and not_rationale are NOT critical — should still pass.""" + flags = { + "has_normative_signal": True, + "single_action": False, # Not critical + "not_rationale": False, # Not critical + "not_evidence_only": True, + "min_length": True, + "has_parent_link": True, + } + assert passes_quality_gate(flags) is True + + +# --------------------------------------------------------------------------- +# HELPER TESTS +# --------------------------------------------------------------------------- + + +class TestComputeExtractionConfidence: + """Tests for _compute_extraction_confidence.""" + + def test_all_flags_pass(self): + flags = { + "has_normative_signal": True, + "single_action": True, + "not_rationale": True, + "not_evidence_only": True, + "min_length": True, + "has_parent_link": True, + } + assert _compute_extraction_confidence(flags) == 1.0 + + def test_no_flags_pass(self): + flags = { + "has_normative_signal": False, + "single_action": False, + "not_rationale": False, + "not_evidence_only": False, + "min_length": False, + "has_parent_link": False, + } + assert _compute_extraction_confidence(flags) == 0.0 + + def test_partial_flags(self): + flags = { + "has_normative_signal": True, # 0.30 + "single_action": False, + "not_rationale": True, # 0.20 + "not_evidence_only": True, # 0.15 + "min_length": True, # 0.10 + "has_parent_link": True, # 0.05 + } + assert _compute_extraction_confidence(flags) == 0.80 + + +class TestParseJsonArray: + """Tests for _parse_json_array.""" + + def test_valid_array(self): + result = _parse_json_array('[{"a": 1}, {"a": 2}]') + assert len(result) == 2 + assert result[0]["a"] == 1 + + def test_single_object_wrapped(self): + result = _parse_json_array('{"a": 1}') + assert len(result) == 1 + + def test_embedded_in_text(self): + result = _parse_json_array('Here is the result:\n[{"a": 1}]\nDone.') + assert 
len(result) == 1 + + def test_invalid_returns_empty(self): + result = _parse_json_array("not json at all") + assert result == [] + + def test_empty_array(self): + result = _parse_json_array("[]") + assert result == [] + + +class TestParseJsonObject: + """Tests for _parse_json_object.""" + + def test_valid_object(self): + result = _parse_json_object('{"title": "MFA"}') + assert result["title"] == "MFA" + + def test_embedded_in_text(self): + result = _parse_json_object('```json\n{"title": "MFA"}\n```') + assert result["title"] == "MFA" + + def test_invalid_returns_empty(self): + result = _parse_json_object("not json") + assert result == {} + + +class TestEnsureList: + """Tests for _ensure_list.""" + + def test_list_passthrough(self): + assert _ensure_list(["a", "b"]) == ["a", "b"] + + def test_string_wrapped(self): + assert _ensure_list("hello") == ["hello"] + + def test_empty_string(self): + assert _ensure_list("") == [] + + def test_none(self): + assert _ensure_list(None) == [] + + def test_int(self): + assert _ensure_list(42) == [] + + +class TestFormatField: + """Tests for _format_field.""" + + def test_string_passthrough(self): + assert _format_field("hello") == "hello" + + def test_json_list_string(self): + result = _format_field('["Req 1", "Req 2"]') + assert "- Req 1" in result + assert "- Req 2" in result + + def test_list_input(self): + result = _format_field(["A", "B"]) + assert "- A" in result + assert "- B" in result + + def test_empty(self): + assert _format_field("") == "" + assert _format_field(None) == "" + + +class TestFormatCitation: + """Tests for _format_citation.""" + + def test_json_dict(self): + result = _format_citation('{"source": "MiCA", "article": "Art. 8"}') + assert "MiCA" in result + assert "Art. 8" in result + + def test_plain_string(self): + assert _format_citation("MiCA Art. 8") == "MiCA Art. 
8" + + def test_empty(self): + assert _format_citation("") == "" + assert _format_citation(None) == "" + + +class TestNormalizeSeverity: + """Tests for _normalize_severity.""" + + def test_valid_values(self): + assert _normalize_severity("critical") == "critical" + assert _normalize_severity("HIGH") == "high" + assert _normalize_severity(" Medium ") == "medium" + assert _normalize_severity("low") == "low" + + def test_invalid_defaults_to_medium(self): + assert _normalize_severity("unknown") == "medium" + assert _normalize_severity("") == "medium" + assert _normalize_severity(None) == "medium" + + +class TestTemplateFallback: + """Tests for _template_fallback.""" + + def test_normal_obligation(self): + ac = _template_fallback( + obligation_text="Betreiber müssen MFA implementieren", + action="implementieren", + object_="MFA", + parent_title="Authentication Controls", + parent_severity="high", + parent_category="authentication", + is_test=False, + is_reporting=False, + ) + assert "Implementieren" in ac.title + assert ac.severity == "high" + assert len(ac.requirements) == 1 + + def test_test_obligation(self): + ac = _template_fallback( + obligation_text="MFA muss regelmäßig getestet werden", + action="testen", + object_="MFA-Wirksamkeit", + parent_title="MFA Control", + parent_severity="medium", + parent_category="auth", + is_test=True, + is_reporting=False, + ) + assert "Test:" in ac.title + assert "Testprotokoll" in ac.evidence + + def test_reporting_obligation(self): + ac = _template_fallback( + obligation_text="Behörden sind über Vorfälle zu informieren", + action="informieren", + object_="zuständige Behörden", + parent_title="Incident Reporting", + parent_severity="high", + parent_category="governance", + is_test=False, + is_reporting=True, + ) + assert "Meldepflicht:" in ac.title + assert "Meldeprozess-Dokumentation" in ac.evidence + + +# --------------------------------------------------------------------------- +# PROMPT BUILDER TESTS +# 
--------------------------------------------------------------------------- + + +class TestPromptBuilders: + """Tests for LLM prompt builders.""" + + def test_pass0a_prompt_contains_all_fields(self): + prompt = _build_pass0a_prompt( + title="MFA Control", + objective="Implement MFA", + requirements="- Require TOTP\n- Hardware key", + test_procedure="- Test login", + source_ref="DSGVO Art. 32", + ) + assert "MFA Control" in prompt + assert "Implement MFA" in prompt + assert "Require TOTP" in prompt + assert "DSGVO Art. 32" in prompt + assert "JSON-Array" in prompt + + def test_pass0b_prompt_contains_all_fields(self): + prompt = _build_pass0b_prompt( + obligation_text="MFA implementieren", + action="implementieren", + object_="MFA", + parent_title="Auth Controls", + parent_category="authentication", + source_ref="DSGVO Art. 32", + ) + assert "MFA implementieren" in prompt + assert "implementieren" in prompt + assert "Auth Controls" in prompt + assert "JSON" in prompt + + def test_system_prompts_exist(self): + assert "REGELN" in _PASS0A_SYSTEM_PROMPT + assert "atomares" in _PASS0B_SYSTEM_PROMPT + + +# --------------------------------------------------------------------------- +# DECOMPOSITION PASS INTEGRATION TESTS +# --------------------------------------------------------------------------- + + +class TestDecompositionPassRun0a: + """Tests for DecompositionPass.run_pass0a.""" + + @pytest.mark.asyncio + async def test_pass0a_extracts_obligations(self): + mock_db = MagicMock() + + # Rich controls to decompose + mock_rows = MagicMock() + mock_rows.fetchall.return_value = [ + ( + "uuid-1", "CTRL-001", + "Service Continuity", + "Sicherstellen der Dienstleistungskontinuität", + '["Mechanismen implementieren", "Systeme testen"]', + '["Prüfung der Mechanismen"]', + '{"source": "MiCA", "article": "Art. 
8"}', + "finance", + ), + ] + mock_db.execute.return_value = mock_rows + + llm_response = json.dumps([ + { + "obligation_text": "Betreiber müssen Mechanismen zur Dienstleistungskontinuität implementieren", + "action": "implementieren", + "object": "Kontinuitätsmechanismen", + "condition": "bei Ausfall des Handelssystems", + "normative_strength": "must", + "is_test_obligation": False, + "is_reporting_obligation": False, + }, + { + "obligation_text": "Kontinuitätsmechanismen müssen regelmäßig getestet werden", + "action": "testen", + "object": "Kontinuitätsmechanismen", + "condition": None, + "normative_strength": "must", + "is_test_obligation": True, + "is_reporting_obligation": False, + }, + ]) + + with patch("compliance.services.obligation_extractor._llm_ollama", new_callable=AsyncMock) as mock_llm: + mock_llm.return_value = llm_response + + decomp = DecompositionPass(db=mock_db) + stats = await decomp.run_pass0a(limit=10) + + assert stats["controls_processed"] == 1 + assert stats["obligations_extracted"] == 2 + assert stats["obligations_validated"] == 2 + assert stats["errors"] == 0 + + # Verify DB writes: 1 SELECT + 2 INSERTs + 1 COMMIT + assert mock_db.execute.call_count >= 3 + mock_db.commit.assert_called_once() + + @pytest.mark.asyncio + async def test_pass0a_fallback_on_empty_llm(self): + mock_db = MagicMock() + + mock_rows = MagicMock() + mock_rows.fetchall.return_value = [ + ( + "uuid-1", "CTRL-001", + "MFA Control", + "Betreiber müssen MFA implementieren", + "", "", "", "auth", + ), + ] + mock_db.execute.return_value = mock_rows + + with patch("compliance.services.obligation_extractor._llm_ollama", new_callable=AsyncMock) as mock_llm: + mock_llm.return_value = "I cannot help with that." 
# Invalid JSON + + decomp = DecompositionPass(db=mock_db) + stats = await decomp.run_pass0a(limit=10) + + assert stats["controls_processed"] == 1 + # Fallback should create 1 obligation from the objective + assert stats["obligations_extracted"] == 1 + + @pytest.mark.asyncio + async def test_pass0a_skips_empty_controls(self): + mock_db = MagicMock() + + mock_rows = MagicMock() + mock_rows.fetchall.return_value = [ + ("uuid-1", "CTRL-001", "", "", "", "", "", ""), + ] + mock_db.execute.return_value = mock_rows + + # No LLM call needed — empty controls are skipped before LLM + decomp = DecompositionPass(db=mock_db) + stats = await decomp.run_pass0a(limit=10) + + assert stats["controls_skipped_empty"] == 1 + assert stats["controls_processed"] == 0 + + @pytest.mark.asyncio + async def test_pass0a_rejects_evidence_only(self): + mock_db = MagicMock() + + mock_rows = MagicMock() + mock_rows.fetchall.return_value = [ + ( + "uuid-1", "CTRL-001", + "Evidence List", + "Betreiber müssen Nachweise erbringen", + "", "", "", "governance", + ), + ] + mock_db.execute.return_value = mock_rows + + llm_response = json.dumps([ + { + "obligation_text": "Dokumentation der Konfiguration", + "action": "dokumentieren", + "object": "Konfiguration", + "condition": None, + "normative_strength": "must", + "is_test_obligation": False, + "is_reporting_obligation": False, + }, + ]) + + with patch("compliance.services.obligation_extractor._llm_ollama", new_callable=AsyncMock) as mock_llm: + mock_llm.return_value = llm_response + + decomp = DecompositionPass(db=mock_db) + stats = await decomp.run_pass0a(limit=10) + + assert stats["obligations_extracted"] == 1 + assert stats["obligations_rejected"] == 1 + + +class TestDecompositionPassRun0b: + """Tests for DecompositionPass.run_pass0b.""" + + @pytest.mark.asyncio + async def test_pass0b_creates_atomic_controls(self): + mock_db = MagicMock() + + # Validated obligation candidates + mock_rows = MagicMock() + mock_rows.fetchall.return_value = [ + ( + 
"oc-uuid-1", "OC-CTRL-001-01", "parent-uuid-1", + "Betreiber müssen Kontinuität sicherstellen", + "sicherstellen", "Dienstleistungskontinuität", + False, False, # is_test, is_reporting + "Service Continuity", "finance", + '{"source": "MiCA", "article": "Art. 8"}', + "high", "FIN-001", + ), + ] + + # Mock _next_atomic_seq result + mock_seq = MagicMock() + mock_seq.fetchone.return_value = (0,) + + # Call sequence: 1=SELECT, 2=_next_atomic_seq, 3=INSERT control, 4=UPDATE oc + call_count = [0] + def side_effect(*args, **kwargs): + call_count[0] += 1 + if call_count[0] == 1: + return mock_rows # SELECT candidates + if call_count[0] == 2: + return mock_seq # _next_atomic_seq + return MagicMock() # INSERT/UPDATE + + mock_db.execute.side_effect = side_effect + + llm_response = json.dumps({ + "title": "Dienstleistungskontinuität bei Systemausfall", + "objective": "Sicherstellen, dass Dienstleistungen fortgeführt werden.", + "requirements": ["Failover-Mechanismus implementieren"], + "test_procedure": ["Failover-Test durchführen"], + "evidence": ["Systemarchitektur", "DR-Plan"], + "severity": "high", + "category": "operations", + }) + + with patch("compliance.services.obligation_extractor._llm_ollama", new_callable=AsyncMock) as mock_llm: + mock_llm.return_value = llm_response + + decomp = DecompositionPass(db=mock_db) + stats = await decomp.run_pass0b(limit=10) + + assert stats["candidates_processed"] == 1 + assert stats["controls_created"] == 1 + assert stats["llm_failures"] == 0 + + @pytest.mark.asyncio + async def test_pass0b_template_fallback(self): + mock_db = MagicMock() + + mock_rows = MagicMock() + mock_rows.fetchall.return_value = [ + ( + "oc-uuid-1", "OC-CTRL-001-01", "parent-uuid-1", + "Betreiber müssen MFA implementieren", + "implementieren", "MFA", + False, False, + "Auth Controls", "authentication", + "", "high", "AUTH-001", + ), + ] + + mock_seq = MagicMock() + mock_seq.fetchone.return_value = (0,) + + call_count = [0] + def side_effect(*args, **kwargs): + 
call_count[0] += 1 + if call_count[0] == 1: + return mock_rows + if call_count[0] == 2: + return mock_seq + return MagicMock() + + mock_db.execute.side_effect = side_effect + + with patch("compliance.services.obligation_extractor._llm_ollama", new_callable=AsyncMock) as mock_llm: + mock_llm.return_value = "Sorry, invalid response" # LLM fails + + decomp = DecompositionPass(db=mock_db) + stats = await decomp.run_pass0b(limit=10) + + assert stats["controls_created"] == 1 + assert stats["llm_failures"] == 1 + + +class TestDecompositionStatus: + """Tests for DecompositionPass.decomposition_status.""" + + def test_returns_status(self): + mock_db = MagicMock() + mock_result = MagicMock() + mock_result.fetchone.return_value = (5000, 1000, 3000, 2500, 200, 2000, 1800) + mock_db.execute.return_value = mock_result + + decomp = DecompositionPass(db=mock_db) + status = decomp.decomposition_status() + + assert status["rich_controls"] == 5000 + assert status["decomposed_controls"] == 1000 + assert status["total_candidates"] == 3000 + assert status["validated"] == 2500 + assert status["rejected"] == 200 + assert status["composed"] == 2000 + assert status["atomic_controls"] == 1800 + assert status["decomposition_pct"] == 20.0 + assert status["composition_pct"] == 80.0 + + def test_handles_zero_division(self): + mock_db = MagicMock() + mock_result = MagicMock() + mock_result.fetchone.return_value = (0, 0, 0, 0, 0, 0, 0) + mock_db.execute.return_value = mock_result + + decomp = DecompositionPass(db=mock_db) + status = decomp.decomposition_status() + + assert status["decomposition_pct"] == 0.0 + assert status["composition_pct"] == 0.0 + + +# --------------------------------------------------------------------------- +# MIGRATION 061 SCHEMA TESTS +# --------------------------------------------------------------------------- + + +class TestMigration061: + """Tests for migration 061 SQL file.""" + + def test_migration_file_exists(self): + from pathlib import Path + migration = 
Path(__file__).parent.parent / "migrations" / "061_obligation_candidates.sql" + assert migration.exists(), "Migration 061 file missing" + + def test_migration_contains_required_tables(self): + from pathlib import Path + migration = Path(__file__).parent.parent / "migrations" / "061_obligation_candidates.sql" + content = migration.read_text() + assert "obligation_candidates" in content + assert "parent_control_uuid" in content + assert "decomposition_method" in content + assert "candidate_id" in content + assert "quality_flags" in content diff --git a/backend-compliance/tests/test_migration_060.py b/backend-compliance/tests/test_migration_060.py new file mode 100644 index 0000000..10c990e --- /dev/null +++ b/backend-compliance/tests/test_migration_060.py @@ -0,0 +1,428 @@ +"""Tests for Migration 060: Multi-Layer Control Architecture DB Schema. + +Validates SQL syntax, table definitions, constraints, and indexes +defined in 060_crosswalk_matrix.sql. + +Uses an in-memory SQLite-compatible approach: we parse the SQL and validate +the structure, then run it against a real PostgreSQL test database if available. 
+""" + +import re +from pathlib import Path + +import pytest + +MIGRATION_FILE = ( + Path(__file__).resolve().parent.parent / "migrations" / "060_crosswalk_matrix.sql" +) + + +@pytest.fixture +def migration_sql(): + """Load the migration SQL file.""" + assert MIGRATION_FILE.exists(), f"Migration file not found: {MIGRATION_FILE}" + return MIGRATION_FILE.read_text(encoding="utf-8") + + +# ============================================================================= +# SQL File Structure Tests +# ============================================================================= + + +class TestMigrationFileStructure: + """Validate the migration file exists and has correct structure.""" + + def test_file_exists(self): + assert MIGRATION_FILE.exists() + + def test_file_not_empty(self, migration_sql): + assert len(migration_sql.strip()) > 100 + + def test_has_migration_header_comment(self, migration_sql): + assert "Migration 060" in migration_sql + assert "Multi-Layer Control Architecture" in migration_sql + + def test_no_explicit_transaction_control(self, migration_sql): + """Migration runner strips BEGIN/COMMIT — file should not contain them.""" + lines = migration_sql.split("\n") + for line in lines: + stripped = line.strip().upper() + if stripped.startswith("--"): + continue + assert stripped != "BEGIN;", "Migration should not contain explicit BEGIN" + assert stripped != "COMMIT;", "Migration should not contain explicit COMMIT" + + +# ============================================================================= +# Table Definition Tests +# ============================================================================= + + +class TestObligationExtractionsTable: + """Validate obligation_extractions table definition.""" + + def test_create_table_present(self, migration_sql): + assert "CREATE TABLE IF NOT EXISTS obligation_extractions" in migration_sql + + def test_has_primary_key(self, migration_sql): + # Extract the CREATE TABLE block + block = 
_extract_create_table(migration_sql, "obligation_extractions") + assert "id UUID PRIMARY KEY" in block + + def test_has_chunk_hash_column(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "chunk_hash VARCHAR(64) NOT NULL" in block + + def test_has_collection_column(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "collection VARCHAR(100) NOT NULL" in block + + def test_has_regulation_code_column(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "regulation_code VARCHAR(100) NOT NULL" in block + + def test_has_obligation_id_column(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "obligation_id VARCHAR(50)" in block + + def test_has_confidence_column_with_check(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "confidence NUMERIC(3,2)" in block + assert "confidence >= 0" in block + assert "confidence <= 1" in block + + def test_extraction_method_check_constraint(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "extraction_method VARCHAR(30) NOT NULL" in block + for method in ("exact_match", "embedding_match", "llm_extracted", "inferred"): + assert method in block, f"Missing extraction_method: {method}" + + def test_has_pattern_id_column(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "pattern_id VARCHAR(50)" in block + + def test_has_pattern_match_score_with_check(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "pattern_match_score NUMERIC(3,2)" in block + + def test_has_control_uuid_fk(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "control_uuid UUID REFERENCES 
canonical_controls(id)" in block + + def test_has_job_id_fk(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "job_id UUID REFERENCES canonical_generation_jobs(id)" in block + + def test_has_created_at(self, migration_sql): + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "created_at TIMESTAMPTZ" in block + + def test_indexes_created(self, migration_sql): + expected_indexes = [ + "idx_oe_obligation", + "idx_oe_pattern", + "idx_oe_control", + "idx_oe_regulation", + "idx_oe_chunk", + "idx_oe_method", + ] + for idx in expected_indexes: + assert idx in migration_sql, f"Missing index: {idx}" + + +class TestControlPatternsTable: + """Validate control_patterns table definition.""" + + def test_create_table_present(self, migration_sql): + assert "CREATE TABLE IF NOT EXISTS control_patterns" in migration_sql + + def test_has_primary_key(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "id UUID PRIMARY KEY" in block + + def test_pattern_id_unique(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "pattern_id VARCHAR(50) UNIQUE NOT NULL" in block + + def test_has_name_column(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "name VARCHAR(255) NOT NULL" in block + + def test_has_name_de_column(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "name_de VARCHAR(255)" in block + + def test_has_domain_column(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "domain VARCHAR(10) NOT NULL" in block + + def test_has_category_column(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "category VARCHAR(50)" in block + + def test_has_template_fields(self, migration_sql): + block = _extract_create_table(migration_sql, 
"control_patterns") + assert "template_objective TEXT" in block + assert "template_rationale TEXT" in block + assert "template_requirements JSONB" in block + assert "template_test_procedure JSONB" in block + assert "template_evidence JSONB" in block + + def test_severity_check_constraint(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + for severity in ("low", "medium", "high", "critical"): + assert severity in block, f"Missing severity: {severity}" + + def test_effort_check_constraint(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "implementation_effort_default" in block + + def test_has_keyword_and_tag_fields(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "obligation_match_keywords JSONB" in block + assert "tags JSONB" in block + + def test_has_anchor_refs(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "open_anchor_refs JSONB" in block + + def test_has_composable_with(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "composable_with JSONB" in block + + def test_has_version(self, migration_sql): + block = _extract_create_table(migration_sql, "control_patterns") + assert "version VARCHAR(10)" in block + + def test_indexes_created(self, migration_sql): + expected_indexes = ["idx_cp_domain", "idx_cp_category", "idx_cp_pattern_id"] + for idx in expected_indexes: + assert idx in migration_sql, f"Missing index: {idx}" + + +class TestCrosswalkMatrixTable: + """Validate crosswalk_matrix table definition.""" + + def test_create_table_present(self, migration_sql): + assert "CREATE TABLE IF NOT EXISTS crosswalk_matrix" in migration_sql + + def test_has_primary_key(self, migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "id UUID PRIMARY KEY" in block + + def test_has_regulation_code(self, 
migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "regulation_code VARCHAR(100) NOT NULL" in block + + def test_has_article_paragraph(self, migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "article VARCHAR(100)" in block + assert "paragraph VARCHAR(100)" in block + + def test_has_obligation_id(self, migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "obligation_id VARCHAR(50)" in block + + def test_has_pattern_id(self, migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "pattern_id VARCHAR(50)" in block + + def test_has_master_control_fields(self, migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "master_control_id VARCHAR(20)" in block + assert "master_control_uuid UUID REFERENCES canonical_controls(id)" in block + + def test_has_tom_control_id(self, migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "tom_control_id VARCHAR(30)" in block + + def test_confidence_check(self, migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "confidence NUMERIC(3,2)" in block + + def test_source_check_constraint(self, migration_sql): + block = _extract_create_table(migration_sql, "crosswalk_matrix") + for source_val in ("manual", "auto", "migrated"): + assert source_val in block, f"Missing source value: {source_val}" + + def test_indexes_created(self, migration_sql): + expected_indexes = [ + "idx_cw_regulation", + "idx_cw_obligation", + "idx_cw_pattern", + "idx_cw_control", + "idx_cw_tom", + ] + for idx in expected_indexes: + assert idx in migration_sql, f"Missing index: {idx}" + + +# ============================================================================= +# ALTER TABLE Tests (canonical_controls extensions) +# ============================================================================= + + 
+class TestCanonicalControlsExtension: + """Validate ALTER TABLE additions to canonical_controls.""" + + def test_adds_pattern_id_column(self, migration_sql): + assert "ALTER TABLE canonical_controls" in migration_sql + assert "pattern_id VARCHAR(50)" in migration_sql + + def test_adds_obligation_ids_column(self, migration_sql): + assert "obligation_ids JSONB" in migration_sql + + def test_uses_if_not_exists(self, migration_sql): + alter_lines = [ + line.strip() + for line in migration_sql.split("\n") + if "ALTER TABLE canonical_controls" in line + and "ADD COLUMN" in line + ] + for line in alter_lines: + assert "IF NOT EXISTS" in line, ( + f"ALTER TABLE missing IF NOT EXISTS: {line}" + ) + + def test_pattern_id_index(self, migration_sql): + assert "idx_cc_pattern" in migration_sql + + +# ============================================================================= +# Cross-Cutting Concerns +# ============================================================================= + + +class TestSQLSafety: + """Validate SQL safety and idempotency.""" + + def test_all_tables_use_if_not_exists(self, migration_sql): + create_statements = re.findall( + r"CREATE TABLE\s+(?:IF NOT EXISTS\s+)?(\w+)", migration_sql + ) + for match in re.finditer(r"CREATE TABLE\s+(\w+)", migration_sql): + table_name = match.group(1) + if table_name == "IF": + continue # This is part of "IF NOT EXISTS" + full_match = migration_sql[match.start() : match.start() + 60] + assert "IF NOT EXISTS" in full_match, ( + f"CREATE TABLE {table_name} missing IF NOT EXISTS" + ) + + def test_all_indexes_use_if_not_exists(self, migration_sql): + for match in re.finditer(r"CREATE INDEX\s+(\w+)", migration_sql): + idx_name = match.group(1) + if idx_name == "IF": + continue + full_match = migration_sql[match.start() : match.start() + 80] + assert "IF NOT EXISTS" in full_match, ( + f"CREATE INDEX {idx_name} missing IF NOT EXISTS" + ) + + def test_no_drop_statements(self, migration_sql): + """Migration should only add, 
never drop.""" + lines = [ + l.strip() + for l in migration_sql.split("\n") + if not l.strip().startswith("--") + ] + sql_content = "\n".join(lines) + assert "DROP TABLE" not in sql_content + assert "DROP INDEX" not in sql_content + assert "DROP COLUMN" not in sql_content + + def test_no_truncate(self, migration_sql): + lines = [ + l.strip() + for l in migration_sql.split("\n") + if not l.strip().startswith("--") + ] + sql_content = "\n".join(lines) + assert "TRUNCATE" not in sql_content + + def test_fk_references_existing_tables(self, migration_sql): + """All REFERENCES must point to canonical_controls or canonical_generation_jobs.""" + refs = re.findall(r"REFERENCES\s+(\w+)\(", migration_sql) + allowed_tables = {"canonical_controls", "canonical_generation_jobs"} + for ref in refs: + assert ref in allowed_tables, ( + f"FK reference to unknown table: {ref}" + ) + + def test_consistent_varchar_sizes(self, migration_sql): + """Key fields should use consistent sizes across tables.""" + # obligation_id should be VARCHAR(50) everywhere + obligation_id_matches = re.findall( + r"obligation_id\s+VARCHAR\((\d+)\)", migration_sql + ) + for size in obligation_id_matches: + assert size == "50", f"obligation_id should be VARCHAR(50), got {size}" + + # pattern_id should be VARCHAR(50) everywhere + pattern_id_matches = re.findall( + r"pattern_id\s+VARCHAR\((\d+)\)", migration_sql + ) + for size in pattern_id_matches: + assert size == "50", f"pattern_id should be VARCHAR(50), got {size}" + + # regulation_code should be VARCHAR(100) everywhere + reg_code_matches = re.findall( + r"regulation_code\s+VARCHAR\((\d+)\)", migration_sql + ) + for size in reg_code_matches: + assert size == "100", f"regulation_code should be VARCHAR(100), got {size}" + + +class TestTableComments: + """Validate that all new tables have COMMENT ON TABLE.""" + + def test_obligation_extractions_comment(self, migration_sql): + assert "COMMENT ON TABLE obligation_extractions" in migration_sql + + def 
test_control_patterns_comment(self, migration_sql): + assert "COMMENT ON TABLE control_patterns" in migration_sql + + def test_crosswalk_matrix_comment(self, migration_sql): + assert "COMMENT ON TABLE crosswalk_matrix" in migration_sql + + +# ============================================================================= +# Data Type Compatibility Tests +# ============================================================================= + + +class TestDataTypeCompatibility: + """Ensure data types are compatible with existing schema.""" + + def test_chunk_hash_matches_processed_chunks(self, migration_sql): + """chunk_hash in obligation_extractions should match canonical_processed_chunks.""" + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "chunk_hash VARCHAR(64)" in block + + def test_collection_matches_processed_chunks(self, migration_sql): + """collection size should match canonical_processed_chunks.""" + block = _extract_create_table(migration_sql, "obligation_extractions") + assert "collection VARCHAR(100)" in block + + def test_control_id_size_matches_canonical_controls(self, migration_sql): + """master_control_id VARCHAR(20) should match canonical_controls.control_id VARCHAR(20).""" + block = _extract_create_table(migration_sql, "crosswalk_matrix") + assert "master_control_id VARCHAR(20)" in block + + def test_pattern_id_format_documented(self, migration_sql): + """Pattern ID format CP-{DOMAIN}-{NNN} should be documented.""" + assert "CP-{DOMAIN}-{NNN}" in migration_sql or "CP-" in migration_sql + + +# ============================================================================= +# Helpers +# ============================================================================= + + +def _extract_create_table(sql: str, table_name: str) -> str: + """Extract a CREATE TABLE block from SQL.""" + pattern = rf"CREATE TABLE IF NOT EXISTS {table_name}\s*\((.*?)\);" + match = re.search(pattern, sql, re.DOTALL) + if not match: + pytest.fail(f"Could not 
find CREATE TABLE for {table_name}") + return match.group(1) diff --git a/backend-compliance/tests/test_obligation_extractor.py b/backend-compliance/tests/test_obligation_extractor.py new file mode 100644 index 0000000..27ca585 --- /dev/null +++ b/backend-compliance/tests/test_obligation_extractor.py @@ -0,0 +1,939 @@ +"""Tests for Obligation Extractor — Phase 4 of Multi-Layer Control Architecture. + +Validates: +- Regulation code normalization (_normalize_regulation) +- Article reference normalization (_normalize_article) +- Cosine similarity (_cosine_sim) +- JSON parsing from LLM responses (_parse_json) +- Obligation loading from v2 framework +- 3-Tier extraction: exact_match → embedding_match → llm_extracted +- ObligationMatch serialization +- Edge cases: empty inputs, missing data, fallback behavior +""" + +import json +import math +from pathlib import Path +from unittest.mock import AsyncMock, MagicMock, patch + +import pytest + +from compliance.services.obligation_extractor import ( + EMBEDDING_CANDIDATE_THRESHOLD, + EMBEDDING_MATCH_THRESHOLD, + ObligationExtractor, + ObligationMatch, + _ObligationEntry, + _cosine_sim, + _find_obligations_dir, + _normalize_article, + _normalize_regulation, + _parse_json, +) + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +V2_DIR = REPO_ROOT / "ai-compliance-sdk" / "policies" / "obligations" / "v2" + + +# ============================================================================= +# Tests: _normalize_regulation +# ============================================================================= + + +class TestNormalizeRegulation: + """Tests for regulation code normalization.""" + + def test_dsgvo_eu_code(self): + assert _normalize_regulation("eu_2016_679") == "dsgvo" + + def test_dsgvo_short(self): + assert _normalize_regulation("dsgvo") == "dsgvo" + + def test_gdpr_alias(self): + assert _normalize_regulation("gdpr") == "dsgvo" + + def test_ai_act_eu_code(self): + assert _normalize_regulation("eu_2024_1689") == 
"ai_act" + + def test_ai_act_short(self): + assert _normalize_regulation("ai_act") == "ai_act" + + def test_nis2_eu_code(self): + assert _normalize_regulation("eu_2022_2555") == "nis2" + + def test_nis2_short(self): + assert _normalize_regulation("nis2") == "nis2" + + def test_bsig_alias(self): + assert _normalize_regulation("bsig") == "nis2" + + def test_bdsg(self): + assert _normalize_regulation("bdsg") == "bdsg" + + def test_ttdsg(self): + assert _normalize_regulation("ttdsg") == "ttdsg" + + def test_dsa_eu_code(self): + assert _normalize_regulation("eu_2022_2065") == "dsa" + + def test_data_act_eu_code(self): + assert _normalize_regulation("eu_2023_2854") == "data_act" + + def test_eu_machinery_eu_code(self): + assert _normalize_regulation("eu_2023_1230") == "eu_machinery" + + def test_dora_eu_code(self): + assert _normalize_regulation("eu_2022_2554") == "dora" + + def test_case_insensitive(self): + assert _normalize_regulation("DSGVO") == "dsgvo" + assert _normalize_regulation("AI_ACT") == "ai_act" + assert _normalize_regulation("NIS2") == "nis2" + + def test_whitespace_stripped(self): + assert _normalize_regulation(" dsgvo ") == "dsgvo" + + def test_empty_string(self): + assert _normalize_regulation("") is None + + def test_none(self): + assert _normalize_regulation(None) is None + + def test_unknown_code(self): + assert _normalize_regulation("mica") is None + + def test_prefix_matching(self): + """EU codes with suffixes should still match via prefix.""" + assert _normalize_regulation("eu_2016_679_consolidated") == "dsgvo" + + def test_all_nine_regulations_covered(self): + """Every regulation in the manifest should be normalizable.""" + regulation_ids = ["dsgvo", "ai_act", "nis2", "bdsg", "ttdsg", "dsa", + "data_act", "eu_machinery", "dora"] + for reg_id in regulation_ids: + result = _normalize_regulation(reg_id) + assert result == reg_id, f"Regulation {reg_id} not found" + + +# ============================================================================= +# 
Tests: _normalize_article +# ============================================================================= + + +class TestNormalizeArticle: + """Tests for article reference normalization.""" + + def test_art_with_dot(self): + assert _normalize_article("Art. 30") == "art. 30" + + def test_article_english(self): + assert _normalize_article("Article 10") == "art. 10" + + def test_artikel_german(self): + assert _normalize_article("Artikel 35") == "art. 35" + + def test_paragraph_symbol(self): + assert _normalize_article("§ 38") == "§ 38" + + def test_paragraph_with_law_suffix(self): + """§ 38 BDSG → § 38 (law name stripped).""" + assert _normalize_article("§ 38 BDSG") == "§ 38" + + def test_paragraph_with_dsgvo_suffix(self): + assert _normalize_article("Art. 6 DSGVO") == "art. 6" + + def test_removes_absatz(self): + """Art. 30 Abs. 1 → art. 30""" + assert _normalize_article("Art. 30 Abs. 1") == "art. 30" + + def test_removes_paragraph(self): + assert _normalize_article("Art. 5 paragraph 2") == "art. 5" + + def test_removes_lit(self): + assert _normalize_article("Art. 6 lit. a") == "art. 6" + + def test_removes_satz(self): + assert _normalize_article("Art. 12 Satz 3") == "art. 12" + + def test_lowercase_output(self): + assert _normalize_article("ART. 30") == "art. 30" + assert _normalize_article("ARTICLE 10") == "art. 10" + + def test_whitespace_stripped(self): + assert _normalize_article(" Art. 30 ") == "art. 30" + + def test_empty_string(self): + assert _normalize_article("") == "" + + def test_none(self): + assert _normalize_article(None) == "" + + def test_complex_reference(self): + """Art. 30 Abs. 1 Satz 2 lit. c DSGVO → art. 30""" + result = _normalize_article("Art. 30 Abs. 1 Satz 2 lit. c DSGVO") + # Should at minimum remove DSGVO and Abs references + assert result.startswith("art. 30") + + def test_nis2_article(self): + assert _normalize_article("Art. 21 NIS2") == "art. 21" + + def test_dora_article(self): + assert _normalize_article("Art. 5 DORA") == "art. 
5" + + def test_ai_act_article(self): + result = _normalize_article("Article 6 AI Act") + assert result == "art. 6" + + +# ============================================================================= +# Tests: _cosine_sim +# ============================================================================= + + +class TestCosineSim: + """Tests for cosine similarity calculation.""" + + def test_identical_vectors(self): + v = [1.0, 2.0, 3.0] + assert abs(_cosine_sim(v, v) - 1.0) < 1e-6 + + def test_orthogonal_vectors(self): + a = [1.0, 0.0] + b = [0.0, 1.0] + assert abs(_cosine_sim(a, b)) < 1e-6 + + def test_opposite_vectors(self): + a = [1.0, 2.0, 3.0] + b = [-1.0, -2.0, -3.0] + assert abs(_cosine_sim(a, b) - (-1.0)) < 1e-6 + + def test_known_value(self): + a = [1.0, 0.0] + b = [1.0, 1.0] + expected = 1.0 / math.sqrt(2) + assert abs(_cosine_sim(a, b) - expected) < 1e-6 + + def test_empty_vectors(self): + assert _cosine_sim([], []) == 0.0 + + def test_one_empty(self): + assert _cosine_sim([1.0, 2.0], []) == 0.0 + assert _cosine_sim([], [1.0, 2.0]) == 0.0 + + def test_different_lengths(self): + assert _cosine_sim([1.0, 2.0], [1.0]) == 0.0 + + def test_zero_vector(self): + assert _cosine_sim([0.0, 0.0], [1.0, 2.0]) == 0.0 + + def test_both_zero(self): + assert _cosine_sim([0.0, 0.0], [0.0, 0.0]) == 0.0 + + def test_high_dimensional(self): + """Test with realistic embedding dimensions (1024).""" + import random + random.seed(42) + a = [random.gauss(0, 1) for _ in range(1024)] + b = [random.gauss(0, 1) for _ in range(1024)] + score = _cosine_sim(a, b) + assert -1.0 <= score <= 1.0 + + +# ============================================================================= +# Tests: _parse_json +# ============================================================================= + + +class TestParseJson: + """Tests for JSON extraction from LLM responses.""" + + def test_direct_json(self): + text = '{"obligation_text": "Test", "actor": "Controller"}' + result = _parse_json(text) + assert 
result["obligation_text"] == "Test" + assert result["actor"] == "Controller" + + def test_json_in_markdown_block(self): + """LLMs often wrap JSON in markdown code blocks.""" + text = '''Some explanation text +```json +{"obligation_text": "Test"} +``` +More text''' + result = _parse_json(text) + assert result.get("obligation_text") == "Test" + + def test_json_with_prefix_text(self): + text = 'Here is the result: {"obligation_text": "Pflicht", "actor": "Verantwortlicher"}' + result = _parse_json(text) + assert result["obligation_text"] == "Pflicht" + + def test_invalid_json(self): + result = _parse_json("not json at all") + assert result == {} + + def test_empty_string(self): + result = _parse_json("") + assert result == {} + + def test_nested_braces_picks_first(self): + """With nested objects, the regex picks the inner simple object.""" + text = '{"outer": {"inner": "value"}}' + result = _parse_json(text) + # Direct parse should work for valid nested JSON + assert "outer" in result + + def test_json_with_german_umlauts(self): + text = '{"obligation_text": "Pflicht zur Datenschutz-Folgenabschaetzung"}' + result = _parse_json(text) + assert "Datenschutz" in result["obligation_text"] + + +# ============================================================================= +# Tests: ObligationMatch +# ============================================================================= + + +class TestObligationMatch: + """Tests for the ObligationMatch dataclass.""" + + def test_defaults(self): + match = ObligationMatch() + assert match.obligation_id is None + assert match.obligation_title is None + assert match.obligation_text is None + assert match.method == "none" + assert match.confidence == 0.0 + assert match.regulation_id is None + + def test_to_dict(self): + match = ObligationMatch( + obligation_id="DSGVO-OBL-001", + obligation_title="Verarbeitungsverzeichnis", + obligation_text="Fuehrung eines Verzeichnisses...", + method="exact_match", + confidence=1.0, + 
regulation_id="dsgvo", + ) + d = match.to_dict() + assert d["obligation_id"] == "DSGVO-OBL-001" + assert d["method"] == "exact_match" + assert d["confidence"] == 1.0 + assert d["regulation_id"] == "dsgvo" + + def test_to_dict_keys(self): + match = ObligationMatch() + d = match.to_dict() + expected_keys = { + "obligation_id", "obligation_title", "obligation_text", + "method", "confidence", "regulation_id", + } + assert set(d.keys()) == expected_keys + + def test_to_dict_none_values(self): + match = ObligationMatch() + d = match.to_dict() + assert d["obligation_id"] is None + assert d["obligation_title"] is None + + +# ============================================================================= +# Tests: _find_obligations_dir +# ============================================================================= + + +class TestFindObligationsDir: + """Tests for finding the v2 obligations directory.""" + + def test_finds_v2_directory(self): + """Should find the v2 dir relative to the source file.""" + result = _find_obligations_dir() + # May be None in CI without the SDK, but if found, verify it's valid + if result is not None: + assert result.is_dir() + assert (result / "_manifest.json").exists() + + def test_v2_dir_exists_in_repo(self): + """The v2 dir should exist in the repo for local tests.""" + assert V2_DIR.exists(), f"v2 dir not found at {V2_DIR}" + assert (V2_DIR / "_manifest.json").exists() + + +# ============================================================================= +# Tests: ObligationExtractor — _load_obligations +# ============================================================================= + + +class TestObligationExtractorLoad: + """Tests for obligation loading from v2 JSON files.""" + + def test_load_obligations_populates_lookup(self): + extractor = ObligationExtractor() + extractor._load_obligations() + assert len(extractor._obligations) > 0 + + def test_load_obligations_count(self): + """Should load all 325 obligations from 9 regulations.""" + 
extractor = ObligationExtractor() + extractor._load_obligations() + assert len(extractor._obligations) == 325 + + def test_article_lookup_populated(self): + """Article lookup should have entries for obligations with legal_basis.""" + extractor = ObligationExtractor() + extractor._load_obligations() + assert len(extractor._article_lookup) > 0 + + def test_article_lookup_dsgvo_art30(self): + """DSGVO Art. 30 should resolve to DSGVO-OBL-001.""" + extractor = ObligationExtractor() + extractor._load_obligations() + key = "dsgvo/art. 30" + assert key in extractor._article_lookup + assert "DSGVO-OBL-001" in extractor._article_lookup[key] + + def test_obligations_have_required_fields(self): + """Every loaded obligation should have id, title, description, regulation_id.""" + extractor = ObligationExtractor() + extractor._load_obligations() + for obl_id, entry in extractor._obligations.items(): + assert entry.id == obl_id + assert entry.title, f"{obl_id}: empty title" + assert entry.description, f"{obl_id}: empty description" + assert entry.regulation_id, f"{obl_id}: empty regulation_id" + + def test_all_nine_regulations_loaded(self): + """All 9 regulations from the manifest should be loaded.""" + extractor = ObligationExtractor() + extractor._load_obligations() + regulation_ids = {e.regulation_id for e in extractor._obligations.values()} + expected = {"dsgvo", "ai_act", "nis2", "bdsg", "ttdsg", "dsa", + "data_act", "eu_machinery", "dora"} + assert regulation_ids == expected + + def test_obligation_id_format(self): + """All obligation IDs should follow the pattern {REG}-OBL-{NNN}.""" + extractor = ObligationExtractor() + extractor._load_obligations() + import re + # Allow letters, digits, underscores in prefix (e.g. 
NIS2-OBL-001, EU_MACHINERY-OBL-001) + pattern = re.compile(r"^[A-Z0-9_]+-OBL-\d{3}$") + for obl_id in extractor._obligations: + assert pattern.match(obl_id), f"Invalid obligation ID format: {obl_id}" + + def test_no_duplicate_obligation_ids(self): + """All obligation IDs should be unique.""" + extractor = ObligationExtractor() + extractor._load_obligations() + ids = list(extractor._obligations.keys()) + assert len(ids) == len(set(ids)) + + +# ============================================================================= +# Tests: ObligationExtractor — Tier 1 (Exact Match) +# ============================================================================= + + +class TestTier1ExactMatch: + """Tests for Tier 1 exact article lookup.""" + + def setup_method(self): + self.extractor = ObligationExtractor() + self.extractor._load_obligations() + + def test_exact_match_dsgvo_art30(self): + match = self.extractor._tier1_exact("dsgvo", "Art. 30") + assert match is not None + assert match.obligation_id == "DSGVO-OBL-001" + assert match.method == "exact_match" + assert match.confidence == 1.0 + assert match.regulation_id == "dsgvo" + + def test_exact_match_case_insensitive_article(self): + match = self.extractor._tier1_exact("dsgvo", "ART. 30") + assert match is not None + assert match.obligation_id == "DSGVO-OBL-001" + + def test_exact_match_article_variant(self): + """'Article 30' should normalize to 'art. 30' and match.""" + match = self.extractor._tier1_exact("dsgvo", "Article 30") + assert match is not None + assert match.obligation_id == "DSGVO-OBL-001" + + def test_exact_match_artikel_variant(self): + match = self.extractor._tier1_exact("dsgvo", "Artikel 30") + assert match is not None + assert match.obligation_id == "DSGVO-OBL-001" + + def test_exact_match_strips_absatz(self): + """Art. 30 Abs. 1 → art. 30 → should match.""" + match = self.extractor._tier1_exact("dsgvo", "Art. 30 Abs. 
1") + assert match is not None + assert match.obligation_id == "DSGVO-OBL-001" + + def test_no_match_wrong_article(self): + match = self.extractor._tier1_exact("dsgvo", "Art. 999") + assert match is None + + def test_no_match_unknown_regulation(self): + match = self.extractor._tier1_exact("unknown_reg", "Art. 30") + assert match is None + + def test_no_match_none_regulation(self): + match = self.extractor._tier1_exact(None, "Art. 30") + assert match is None + + def test_match_has_title(self): + match = self.extractor._tier1_exact("dsgvo", "Art. 30") + assert match is not None + assert match.obligation_title is not None + assert len(match.obligation_title) > 0 + + def test_match_has_text(self): + match = self.extractor._tier1_exact("dsgvo", "Art. 30") + assert match is not None + assert match.obligation_text is not None + assert len(match.obligation_text) > 20 + + +# ============================================================================= +# Tests: ObligationExtractor — Tier 2 (Embedding Match) +# ============================================================================= + + +class TestTier2EmbeddingMatch: + """Tests for Tier 2 embedding-based matching.""" + + def setup_method(self): + self.extractor = ObligationExtractor() + self.extractor._load_obligations() + # Prepare fake embeddings for testing (no real embedding service) + self.extractor._obligation_ids = list(self.extractor._obligations.keys()) + # Create simple 3D embeddings per obligation — avoid zero vectors + self.extractor._obligation_embeddings = [] + for i in range(len(self.extractor._obligation_ids)): + # Each obligation gets a unique-ish non-zero vector + self.extractor._obligation_embeddings.append( + [float(i % 10 + 1), float((i * 3) % 10 + 1), float((i * 7) % 10 + 1)] + ) + + @pytest.mark.asyncio + async def test_embedding_match_above_threshold(self): + """When cosine > 0.80, should return embedding_match.""" + # Mock the embedding service to return a vector very similar to obligation 0 + 
target_embedding = self.extractor._obligation_embeddings[0] + + with patch( + "compliance.services.obligation_extractor._get_embedding", + new_callable=AsyncMock, + return_value=target_embedding, + ): + match = await self.extractor._tier2_embedding("test text", "dsgvo") + + # Should find a match (cosine = 1.0 for identical vector) + assert match is not None + assert match.method == "embedding_match" + assert match.confidence >= EMBEDDING_MATCH_THRESHOLD + + @pytest.mark.asyncio + async def test_embedding_match_returns_none_below_threshold(self): + """When cosine < 0.80, should return None.""" + # Return a vector orthogonal to all obligations + orthogonal = [100.0, -100.0, 0.0] + + with patch( + "compliance.services.obligation_extractor._get_embedding", + new_callable=AsyncMock, + return_value=orthogonal, + ): + match = await self.extractor._tier2_embedding("unrelated text", None) + + # May or may not match depending on vector distribution + # But we can verify it's either None or has correct method + if match is not None: + assert match.method == "embedding_match" + + @pytest.mark.asyncio + async def test_embedding_match_empty_embeddings(self): + """When no embeddings loaded, should return None.""" + self.extractor._obligation_embeddings = [] + match = await self.extractor._tier2_embedding("any text", "dsgvo") + assert match is None + + @pytest.mark.asyncio + async def test_embedding_match_failed_embedding(self): + """When embedding service returns empty, should return None.""" + with patch( + "compliance.services.obligation_extractor._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + match = await self.extractor._tier2_embedding("some text", "dsgvo") + assert match is None + + @pytest.mark.asyncio + async def test_domain_bonus_same_regulation(self): + """Matching regulation should add +0.05 bonus.""" + # Set up two obligations with same embeddings but different regulations + self.extractor._obligation_ids = ["DSGVO-OBL-001", "NIS2-OBL-001"] + 
self.extractor._obligation_embeddings = [ + [1.0, 0.0, 0.0], + [1.0, 0.0, 0.0], + ] + + with patch( + "compliance.services.obligation_extractor._get_embedding", + new_callable=AsyncMock, + return_value=[1.0, 0.0, 0.0], + ): + match = await self.extractor._tier2_embedding("test", "dsgvo") + + # Should match (cosine = 1.0 ≥ 0.80) + assert match is not None + assert match.method == "embedding_match" + # With domain bonus, DSGVO should be preferred + assert match.regulation_id == "dsgvo" + + @pytest.mark.asyncio + async def test_confidence_capped_at_1(self): + """Confidence should not exceed 1.0 even with domain bonus.""" + self.extractor._obligation_ids = ["DSGVO-OBL-001"] + self.extractor._obligation_embeddings = [[1.0, 0.0, 0.0]] + + with patch( + "compliance.services.obligation_extractor._get_embedding", + new_callable=AsyncMock, + return_value=[1.0, 0.0, 0.0], + ): + match = await self.extractor._tier2_embedding("test", "dsgvo") + + assert match is not None + assert match.confidence <= 1.0 + + +# ============================================================================= +# Tests: ObligationExtractor — Tier 3 (LLM Extraction) +# ============================================================================= + + +class TestTier3LLMExtraction: + """Tests for Tier 3 LLM-based obligation extraction.""" + + def setup_method(self): + self.extractor = ObligationExtractor() + + @pytest.mark.asyncio + async def test_llm_extraction_success(self): + """Successful LLM extraction returns obligation_text with confidence 0.60.""" + llm_response = json.dumps({ + "obligation_text": "Pflicht zur Fuehrung eines Verarbeitungsverzeichnisses", + "actor": "Verantwortlicher", + "action": "Verarbeitungsverzeichnis fuehren", + "normative_strength": "muss", + }) + + with patch( + "compliance.services.obligation_extractor._llm_ollama", + new_callable=AsyncMock, + return_value=llm_response, + ): + match = await self.extractor._tier3_llm( + "Der Verantwortliche fuehrt ein Verzeichnis...", + 
"eu_2016_679", + "Art. 30", + ) + + assert match.method == "llm_extracted" + assert match.confidence == 0.60 + assert "Verarbeitungsverzeichnis" in match.obligation_text + assert match.obligation_id is None # LLM doesn't assign IDs + assert match.regulation_id == "dsgvo" + + @pytest.mark.asyncio + async def test_llm_extraction_failure(self): + """When LLM returns empty, should return match with confidence 0.""" + with patch( + "compliance.services.obligation_extractor._llm_ollama", + new_callable=AsyncMock, + return_value="", + ): + match = await self.extractor._tier3_llm("some text", "dsgvo", "Art. 1") + + assert match.method == "llm_extracted" + assert match.confidence == 0.0 + assert match.obligation_text is None + + @pytest.mark.asyncio + async def test_llm_extraction_malformed_json(self): + """When LLM returns non-JSON, should use raw text as fallback.""" + with patch( + "compliance.services.obligation_extractor._llm_ollama", + new_callable=AsyncMock, + return_value="Dies ist die Pflicht: Daten schuetzen", + ): + match = await self.extractor._tier3_llm("some text", "dsgvo", None) + + assert match.method == "llm_extracted" + assert match.confidence == 0.60 + # Fallback: uses first 500 chars of response as obligation_text + assert "Pflicht" in match.obligation_text or "Daten" in match.obligation_text + + @pytest.mark.asyncio + async def test_llm_regulation_normalization(self): + """Regulation code should be normalized in result.""" + with patch( + "compliance.services.obligation_extractor._llm_ollama", + new_callable=AsyncMock, + return_value='{"obligation_text": "Test"}', + ): + match = await self.extractor._tier3_llm( + "text", "eu_2024_1689", "Art. 
6" + ) + + assert match.regulation_id == "ai_act" + + +# ============================================================================= +# Tests: ObligationExtractor — Full 3-Tier extract() +# ============================================================================= + + +class TestExtractFullFlow: + """Tests for the full 3-tier extraction flow.""" + + def setup_method(self): + self.extractor = ObligationExtractor() + self.extractor._load_obligations() + # Mark as initialized to skip async initialize + self.extractor._initialized = True + # Empty embeddings — Tier 2 will return None + self.extractor._obligation_embeddings = [] + self.extractor._obligation_ids = [] + + @pytest.mark.asyncio + async def test_tier1_takes_priority(self): + """When Tier 1 matches, Tier 2 and 3 should not be called.""" + with patch.object( + self.extractor, "_tier2_embedding", new_callable=AsyncMock + ) as mock_t2, patch.object( + self.extractor, "_tier3_llm", new_callable=AsyncMock + ) as mock_t3: + match = await self.extractor.extract( + chunk_text="irrelevant", + regulation_code="eu_2016_679", + article="Art. 30", + ) + + assert match.method == "exact_match" + mock_t2.assert_not_called() + mock_t3.assert_not_called() + + @pytest.mark.asyncio + async def test_tier2_when_tier1_misses(self): + """When Tier 1 misses, Tier 2 should be tried.""" + tier2_result = ObligationMatch( + obligation_id="DSGVO-OBL-050", + method="embedding_match", + confidence=0.85, + regulation_id="dsgvo", + ) + + with patch.object( + self.extractor, "_tier2_embedding", + new_callable=AsyncMock, + return_value=tier2_result, + ) as mock_t2, patch.object( + self.extractor, "_tier3_llm", new_callable=AsyncMock + ) as mock_t3: + match = await self.extractor.extract( + chunk_text="some compliance text", + regulation_code="eu_2016_679", + article="Art. 
999", # Non-matching article + ) + + assert match.method == "embedding_match" + mock_t2.assert_called_once() + mock_t3.assert_not_called() + + @pytest.mark.asyncio + async def test_tier3_when_tier1_and_2_miss(self): + """When Tier 1 and 2 miss, Tier 3 should be called.""" + tier3_result = ObligationMatch( + obligation_text="LLM extracted obligation", + method="llm_extracted", + confidence=0.60, + ) + + with patch.object( + self.extractor, "_tier2_embedding", + new_callable=AsyncMock, + return_value=None, + ), patch.object( + self.extractor, "_tier3_llm", + new_callable=AsyncMock, + return_value=tier3_result, + ): + match = await self.extractor.extract( + chunk_text="unrelated text", + regulation_code="unknown_reg", + article="Art. 999", + ) + + assert match.method == "llm_extracted" + + @pytest.mark.asyncio + async def test_no_article_skips_tier1(self): + """When no article is provided, Tier 1 should be skipped.""" + with patch.object( + self.extractor, "_tier2_embedding", + new_callable=AsyncMock, + return_value=None, + ) as mock_t2, patch.object( + self.extractor, "_tier3_llm", + new_callable=AsyncMock, + return_value=ObligationMatch(method="llm_extracted", confidence=0.60), + ): + match = await self.extractor.extract( + chunk_text="some text", + regulation_code="dsgvo", + article=None, + ) + + # Tier 2 should be called (Tier 1 skipped due to no article) + mock_t2.assert_called_once() + + @pytest.mark.asyncio + async def test_auto_initialize(self): + """If not initialized, extract should call initialize().""" + extractor = ObligationExtractor() + assert not extractor._initialized + + with patch.object( + extractor, "initialize", new_callable=AsyncMock + ) as mock_init: + # After mock init, set initialized to True + async def side_effect(): + extractor._initialized = True + extractor._load_obligations() + extractor._obligation_embeddings = [] + extractor._obligation_ids = [] + + mock_init.side_effect = side_effect + + with patch.object( + extractor, 
"_tier2_embedding", + new_callable=AsyncMock, + return_value=None, + ), patch.object( + extractor, "_tier3_llm", + new_callable=AsyncMock, + return_value=ObligationMatch(method="llm_extracted", confidence=0.60), + ): + await extractor.extract( + chunk_text="test", + regulation_code="dsgvo", + article=None, + ) + + mock_init.assert_called_once() + + +# ============================================================================= +# Tests: ObligationExtractor — stats() +# ============================================================================= + + +class TestExtractorStats: + """Tests for the stats() method.""" + + def test_stats_before_init(self): + extractor = ObligationExtractor() + stats = extractor.stats() + assert stats["total_obligations"] == 0 + assert stats["article_lookups"] == 0 + assert stats["initialized"] is False + + def test_stats_after_load(self): + extractor = ObligationExtractor() + extractor._load_obligations() + stats = extractor.stats() + assert stats["total_obligations"] == 325 + assert stats["article_lookups"] > 0 + assert "dsgvo" in stats["regulations"] + assert stats["initialized"] is False # not fully initialized (no embeddings) + + def test_stats_regulations_complete(self): + extractor = ObligationExtractor() + extractor._load_obligations() + stats = extractor.stats() + expected_regs = {"dsgvo", "ai_act", "nis2", "bdsg", "ttdsg", + "dsa", "data_act", "eu_machinery", "dora"} + assert set(stats["regulations"]) == expected_regs + + +# ============================================================================= +# Tests: Integration — Regulation-to-Obligation mapping coverage +# ============================================================================= + + +class TestRegulationObligationCoverage: + """Verify that the article lookup provides reasonable coverage.""" + + def setup_method(self): + self.extractor = ObligationExtractor() + self.extractor._load_obligations() + + def test_dsgvo_has_article_lookups(self): + """DSGVO (80 
obligations) should have many article lookups.""" + dsgvo_keys = [k for k in self.extractor._article_lookup if k.startswith("dsgvo/")] + assert len(dsgvo_keys) >= 20, f"Only {len(dsgvo_keys)} DSGVO article lookups" + + def test_ai_act_has_article_lookups(self): + ai_keys = [k for k in self.extractor._article_lookup if k.startswith("ai_act/")] + assert len(ai_keys) >= 10, f"Only {len(ai_keys)} AI Act article lookups" + + def test_nis2_has_article_lookups(self): + nis2_keys = [k for k in self.extractor._article_lookup if k.startswith("nis2/")] + assert len(nis2_keys) >= 5, f"Only {len(nis2_keys)} NIS2 article lookups" + + def test_all_article_lookup_values_are_valid(self): + """Every obligation ID in article_lookup should exist in _obligations.""" + for key, obl_ids in self.extractor._article_lookup.items(): + for obl_id in obl_ids: + assert obl_id in self.extractor._obligations, ( + f"Article lookup {key} references missing obligation {obl_id}" + ) + + def test_article_lookup_key_format(self): + """All keys should be in format 'regulation_id/normalized_article'.""" + for key in self.extractor._article_lookup: + parts = key.split("/", 1) + assert len(parts) == 2, f"Invalid key format: {key}" + reg_id, article = parts + assert reg_id, f"Empty regulation ID in key: {key}" + assert article, f"Empty article in key: {key}" + assert article == article.lower(), f"Article not lowercase: {key}" + + +# ============================================================================= +# Tests: Constants and thresholds +# ============================================================================= + + +class TestConstants: + """Tests for module-level constants.""" + + def test_embedding_thresholds_ordering(self): + """Match threshold should be higher than candidate threshold.""" + assert EMBEDDING_MATCH_THRESHOLD > EMBEDDING_CANDIDATE_THRESHOLD + + def test_embedding_thresholds_range(self): + """Thresholds should be between 0 and 1.""" + assert 0 < EMBEDDING_MATCH_THRESHOLD <= 1.0 
+ assert 0 < EMBEDDING_CANDIDATE_THRESHOLD <= 1.0 + + def test_match_threshold_is_80(self): + assert EMBEDDING_MATCH_THRESHOLD == 0.80 + + def test_candidate_threshold_is_60(self): + assert EMBEDDING_CANDIDATE_THRESHOLD == 0.60 diff --git a/backend-compliance/tests/test_pattern_matcher.py b/backend-compliance/tests/test_pattern_matcher.py new file mode 100644 index 0000000..ed7e892 --- /dev/null +++ b/backend-compliance/tests/test_pattern_matcher.py @@ -0,0 +1,901 @@ +"""Tests for Pattern Matcher — Phase 5 of Multi-Layer Control Architecture. + +Validates: +- Pattern loading from YAML files +- Keyword index construction +- Keyword matching (Tier 1) +- Embedding matching (Tier 2) with domain bonus +- Score combination logic +- Domain affinity mapping +- Top-N matching +- PatternMatchResult serialization +- Edge cases: empty inputs, no matches, missing data +""" + +from pathlib import Path +from unittest.mock import AsyncMock, patch + +import pytest + +from compliance.services.pattern_matcher import ( + DOMAIN_BONUS, + EMBEDDING_PATTERN_THRESHOLD, + KEYWORD_MATCH_MIN_HITS, + ControlPattern, + PatternMatchResult, + PatternMatcher, + _REGULATION_DOMAIN_AFFINITY, + _find_patterns_dir, +) + +REPO_ROOT = Path(__file__).resolve().parent.parent.parent +PATTERNS_DIR = REPO_ROOT / "ai-compliance-sdk" / "policies" / "control_patterns" + + +# ============================================================================= +# Tests: _find_patterns_dir +# ============================================================================= + + +class TestFindPatternsDir: + """Tests for locating the control_patterns directory.""" + + def test_finds_patterns_dir(self): + result = _find_patterns_dir() + if result is not None: + assert result.is_dir() + + def test_patterns_dir_exists_in_repo(self): + assert PATTERNS_DIR.exists(), f"Patterns dir not found at {PATTERNS_DIR}" + + +# ============================================================================= +# Tests: ControlPattern +# 
============================================================================= + + +class TestControlPattern: + """Tests for the ControlPattern dataclass.""" + + def test_defaults(self): + p = ControlPattern( + id="CP-TEST-001", + name="test_pattern", + name_de="Test-Muster", + domain="SEC", + category="testing", + description="A test pattern", + objective_template="Test objective", + rationale_template="Test rationale", + ) + assert p.id == "CP-TEST-001" + assert p.severity_default == "medium" + assert p.implementation_effort_default == "m" + assert p.obligation_match_keywords == [] + assert p.tags == [] + assert p.composable_with == [] + + def test_full_pattern(self): + p = ControlPattern( + id="CP-AUTH-001", + name="password_policy", + name_de="Passwortrichtlinie", + domain="AUTH", + category="authentication", + description="Password requirements", + objective_template="Ensure strong passwords", + rationale_template="Weak passwords are risky", + obligation_match_keywords=["passwort", "password", "credential"], + tags=["authentication", "password"], + composable_with=["CP-AUTH-002"], + ) + assert len(p.obligation_match_keywords) == 3 + assert "CP-AUTH-002" in p.composable_with + + +# ============================================================================= +# Tests: PatternMatchResult +# ============================================================================= + + +class TestPatternMatchResult: + """Tests for the PatternMatchResult dataclass.""" + + def test_defaults(self): + result = PatternMatchResult() + assert result.pattern is None + assert result.pattern_id is None + assert result.method == "none" + assert result.confidence == 0.0 + assert result.keyword_hits == 0 + assert result.embedding_score == 0.0 + assert result.composable_patterns == [] + + def test_to_dict(self): + result = PatternMatchResult( + pattern_id="CP-AUTH-001", + method="keyword", + confidence=0.857, + keyword_hits=6, + total_keywords=7, + embedding_score=0.823, + 
domain_bonus_applied=True, + composable_patterns=["CP-AUTH-002"], + ) + d = result.to_dict() + assert d["pattern_id"] == "CP-AUTH-001" + assert d["method"] == "keyword" + assert d["confidence"] == 0.857 + assert d["keyword_hits"] == 6 + assert d["total_keywords"] == 7 + assert d["embedding_score"] == 0.823 + assert d["domain_bonus_applied"] is True + assert d["composable_patterns"] == ["CP-AUTH-002"] + + def test_to_dict_keys(self): + result = PatternMatchResult() + d = result.to_dict() + expected_keys = { + "pattern_id", "method", "confidence", "keyword_hits", + "total_keywords", "embedding_score", "domain_bonus_applied", + "composable_patterns", + } + assert set(d.keys()) == expected_keys + + +# ============================================================================= +# Tests: PatternMatcher — Loading +# ============================================================================= + + +class TestPatternMatcherLoad: + """Tests for loading patterns from YAML.""" + + def test_load_patterns(self): + matcher = PatternMatcher() + matcher._load_patterns() + assert len(matcher._patterns) == 50 + + def test_by_id_populated(self): + matcher = PatternMatcher() + matcher._load_patterns() + assert "CP-AUTH-001" in matcher._by_id + assert "CP-CRYP-001" in matcher._by_id + + def test_by_domain_populated(self): + matcher = PatternMatcher() + matcher._load_patterns() + assert "AUTH" in matcher._by_domain + assert "DATA" in matcher._by_domain + assert len(matcher._by_domain["AUTH"]) >= 3 + + def test_pattern_fields_valid(self): + """Every loaded pattern should have all required fields.""" + matcher = PatternMatcher() + matcher._load_patterns() + for p in matcher._patterns: + assert p.id, "Empty pattern ID" + assert p.name, f"{p.id}: empty name" + assert p.name_de, f"{p.id}: empty name_de" + assert p.domain, f"{p.id}: empty domain" + assert p.category, f"{p.id}: empty category" + assert p.description, f"{p.id}: empty description" + assert p.objective_template, f"{p.id}: empty 
objective_template" + assert len(p.obligation_match_keywords) >= 3, ( + f"{p.id}: only {len(p.obligation_match_keywords)} keywords" + ) + + def test_no_duplicate_ids(self): + matcher = PatternMatcher() + matcher._load_patterns() + ids = [p.id for p in matcher._patterns] + assert len(ids) == len(set(ids)) + + +# ============================================================================= +# Tests: PatternMatcher — Keyword Index +# ============================================================================= + + +class TestKeywordIndex: + """Tests for the reverse keyword index.""" + + def setup_method(self): + self.matcher = PatternMatcher() + self.matcher._load_patterns() + self.matcher._build_keyword_index() + + def test_keyword_index_populated(self): + assert len(self.matcher._keyword_index) > 50 + + def test_keyword_maps_to_patterns(self): + """'passwort' should map to CP-AUTH-001.""" + assert "passwort" in self.matcher._keyword_index + assert "CP-AUTH-001" in self.matcher._keyword_index["passwort"] + + def test_keyword_lowercase(self): + """All keywords in the index should be lowercase.""" + for kw in self.matcher._keyword_index: + assert kw == kw.lower(), f"Keyword not lowercase: {kw}" + + def test_keyword_shared_across_patterns(self): + """Some keywords like 'verschluesselung' may appear in multiple patterns.""" + # This just verifies the structure allows multi-pattern keywords + for kw, pattern_ids in self.matcher._keyword_index.items(): + assert len(pattern_ids) >= 1 + + +# ============================================================================= +# Tests: PatternMatcher — Tier 1 (Keyword Match) +# ============================================================================= + + +class TestTier1KeywordMatch: + """Tests for keyword-based pattern matching.""" + + def setup_method(self): + self.matcher = PatternMatcher() + self.matcher._load_patterns() + self.matcher._build_keyword_index() + + def test_password_text_matches_auth(self): + """Text about 
passwords should match CP-AUTH-001.""" + result = self.matcher._tier1_keyword( + "Die Passwortrichtlinie muss sicherstellen dass Anmeldedaten " + "und Credentials geschuetzt sind und authentifizierung robust ist", + None, + ) + assert result is not None + assert result.pattern_id == "CP-AUTH-001" + assert result.method == "keyword" + assert result.keyword_hits >= KEYWORD_MATCH_MIN_HITS + + def test_encryption_text_matches_cryp(self): + """Text about encryption should match CP-CRYP-001.""" + result = self.matcher._tier1_keyword( + "Verschluesselung ruhender Daten muss mit AES-256 encryption erfolgen", + None, + ) + assert result is not None + assert result.pattern_id == "CP-CRYP-001" + assert result.keyword_hits >= KEYWORD_MATCH_MIN_HITS + + def test_incident_text_matches_inc(self): + result = self.matcher._tier1_keyword( + "Ein Vorfall-Reaktionsplan muss fuer Sicherheitsvorfaelle " + "und incident response bereitstehen", + None, + ) + assert result is not None + assert "INC" in result.pattern_id + + def test_no_match_for_unrelated_text(self): + result = self.matcher._tier1_keyword( + "xyzzy foobar completely unrelated text with no keywords", + None, + ) + assert result is None + + def test_single_keyword_below_threshold(self): + """A single keyword hit should not be enough.""" + result = self.matcher._tier1_keyword("passwort", None) + assert result is None # Only 1 hit < KEYWORD_MATCH_MIN_HITS (2) + + def test_domain_bonus_applied(self): + """Domain bonus should be added when regulation matches.""" + result_without = self.matcher._tier1_keyword( + "Personenbezogene Daten muessen durch Datenschutz Massnahmen " + "und datensicherheit geschuetzt werden mit datenminimierung", + None, + ) + result_with = self.matcher._tier1_keyword( + "Personenbezogene Daten muessen durch Datenschutz Massnahmen " + "und datensicherheit geschuetzt werden mit datenminimierung", + "dsgvo", + ) + if result_without and result_with: + # With DSGVO regulation, DATA domain patterns should get a 
bonus + if result_with.domain_bonus_applied: + assert result_with.confidence >= result_without.confidence + + def test_keyword_scores_returns_dict(self): + scores = self.matcher._keyword_scores( + "Passwort authentifizierung credential zugang", + None, + ) + assert isinstance(scores, dict) + assert "CP-AUTH-001" in scores + hits, total, confidence = scores["CP-AUTH-001"] + assert hits >= 3 + assert total > 0 + assert 0 < confidence <= 1.0 + + +# ============================================================================= +# Tests: PatternMatcher — Tier 2 (Embedding Match) +# ============================================================================= + + +class TestTier2EmbeddingMatch: + """Tests for embedding-based pattern matching.""" + + def setup_method(self): + self.matcher = PatternMatcher() + self.matcher._load_patterns() + self.matcher._build_keyword_index() + # Set up fake embeddings + self.matcher._pattern_ids = [p.id for p in self.matcher._patterns] + self.matcher._pattern_embeddings = [] + for i in range(len(self.matcher._patterns)): + self.matcher._pattern_embeddings.append( + [float(i % 10 + 1), float((i * 3) % 10 + 1), float((i * 7) % 10 + 1)] + ) + + @pytest.mark.asyncio + async def test_embedding_match_identical_vector(self): + """Identical vector should produce cosine = 1.0 > threshold.""" + target = self.matcher._pattern_embeddings[0] + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=target, + ): + result = await self.matcher._tier2_embedding("test text", None) + + assert result is not None + assert result.method == "embedding" + assert result.confidence >= EMBEDDING_PATTERN_THRESHOLD + + @pytest.mark.asyncio + async def test_embedding_match_empty(self): + """Empty embeddings should return None.""" + self.matcher._pattern_embeddings = [] + result = await self.matcher._tier2_embedding("test text", None) + assert result is None + + @pytest.mark.asyncio + async def 
test_embedding_match_failed_service(self): + """Failed embedding service should return None.""" + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + result = await self.matcher._tier2_embedding("test", None) + assert result is None + + @pytest.mark.asyncio + async def test_embedding_domain_bonus(self): + """Domain bonus should increase score for affine regulation.""" + # Set all patterns to same embedding + for i in range(len(self.matcher._pattern_embeddings)): + self.matcher._pattern_embeddings[i] = [1.0, 0.0, 0.0] + + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[1.0, 0.0, 0.0], + ): + scores = await self.matcher._embedding_scores("test", "dsgvo") + + # DATA domain patterns should have bonus applied + data_patterns = [p.id for p in self.matcher._patterns if p.domain == "DATA"] + if data_patterns: + pid = data_patterns[0] + score, bonus = scores.get(pid, (0, False)) + assert bonus is True + assert score > 1.0 # 1.0 cosine + 0.10 bonus + + +# ============================================================================= +# Tests: PatternMatcher — Score Combination +# ============================================================================= + + +class TestScoreCombination: + """Tests for combining keyword and embedding results.""" + + def setup_method(self): + self.matcher = PatternMatcher() + self.pattern = ControlPattern( + id="CP-TEST-001", name="test", name_de="Test", + domain="SEC", category="test", description="d", + objective_template="o", rationale_template="r", + ) + + def test_both_none(self): + result = self.matcher._combine_results(None, None) + assert result.method == "none" + assert result.confidence == 0.0 + + def test_only_keyword(self): + kw = PatternMatchResult( + pattern=self.pattern, pattern_id="CP-TEST-001", + method="keyword", confidence=0.7, keyword_hits=5, + ) + result = self.matcher._combine_results(kw, 
None) + assert result.method == "keyword" + assert result.confidence == 0.7 + + def test_only_embedding(self): + emb = PatternMatchResult( + pattern=self.pattern, pattern_id="CP-TEST-001", + method="embedding", confidence=0.85, embedding_score=0.85, + ) + result = self.matcher._combine_results(None, emb) + assert result.method == "embedding" + assert result.confidence == 0.85 + + def test_same_pattern_combined(self): + """When both tiers agree, confidence gets +0.05 boost.""" + kw = PatternMatchResult( + pattern=self.pattern, pattern_id="CP-TEST-001", + method="keyword", confidence=0.7, keyword_hits=5, total_keywords=7, + ) + emb = PatternMatchResult( + pattern=self.pattern, pattern_id="CP-TEST-001", + method="embedding", confidence=0.8, embedding_score=0.8, + ) + result = self.matcher._combine_results(kw, emb) + assert result.method == "combined" + assert abs(result.confidence - 0.85) < 1e-9 # max(0.7, 0.8) + 0.05 + assert result.keyword_hits == 5 + assert result.embedding_score == 0.8 + + def test_same_pattern_combined_capped(self): + """Combined confidence should not exceed 1.0.""" + kw = PatternMatchResult( + pattern=self.pattern, pattern_id="CP-TEST-001", + method="keyword", confidence=0.95, + ) + emb = PatternMatchResult( + pattern=self.pattern, pattern_id="CP-TEST-001", + method="embedding", confidence=0.98, embedding_score=0.98, + ) + result = self.matcher._combine_results(kw, emb) + assert result.confidence <= 1.0 + + def test_different_patterns_picks_higher(self): + """When tiers disagree, pick the higher confidence.""" + p2 = ControlPattern( + id="CP-TEST-002", name="test2", name_de="Test2", + domain="SEC", category="test", description="d", + objective_template="o", rationale_template="r", + ) + kw = PatternMatchResult( + pattern=self.pattern, pattern_id="CP-TEST-001", + method="keyword", confidence=0.6, + ) + emb = PatternMatchResult( + pattern=p2, pattern_id="CP-TEST-002", + method="embedding", confidence=0.9, embedding_score=0.9, + ) + result = 
self.matcher._combine_results(kw, emb) + assert result.pattern_id == "CP-TEST-002" + assert result.confidence == 0.9 + + def test_different_patterns_keyword_wins(self): + p2 = ControlPattern( + id="CP-TEST-002", name="test2", name_de="Test2", + domain="SEC", category="test", description="d", + objective_template="o", rationale_template="r", + ) + kw = PatternMatchResult( + pattern=self.pattern, pattern_id="CP-TEST-001", + method="keyword", confidence=0.9, + ) + emb = PatternMatchResult( + pattern=p2, pattern_id="CP-TEST-002", + method="embedding", confidence=0.6, embedding_score=0.6, + ) + result = self.matcher._combine_results(kw, emb) + assert result.pattern_id == "CP-TEST-001" + + +# ============================================================================= +# Tests: PatternMatcher — Domain Affinity +# ============================================================================= + + +class TestDomainAffinity: + """Tests for regulation-to-domain affinity mapping.""" + + def test_dsgvo_affine_with_data(self): + assert PatternMatcher._domain_matches("DATA", "dsgvo") + + def test_dsgvo_affine_with_comp(self): + assert PatternMatcher._domain_matches("COMP", "dsgvo") + + def test_ai_act_affine_with_ai(self): + assert PatternMatcher._domain_matches("AI", "ai_act") + + def test_nis2_affine_with_sec(self): + assert PatternMatcher._domain_matches("SEC", "nis2") + + def test_nis2_affine_with_inc(self): + assert PatternMatcher._domain_matches("INC", "nis2") + + def test_dora_affine_with_fin(self): + assert PatternMatcher._domain_matches("FIN", "dora") + + def test_no_affinity_auth_dsgvo(self): + """AUTH is not in DSGVO's affinity list.""" + assert not PatternMatcher._domain_matches("AUTH", "dsgvo") + + def test_unknown_regulation(self): + assert not PatternMatcher._domain_matches("DATA", "unknown_reg") + + def test_all_regulations_have_affinity(self): + """All 9 regulations should have at least one affine domain.""" + expected_regs = [ + "dsgvo", "bdsg", "ttdsg", 
"ai_act", "nis2", + "dsa", "data_act", "eu_machinery", "dora", + ] + for reg in expected_regs: + assert reg in _REGULATION_DOMAIN_AFFINITY, f"{reg} missing from affinity map" + assert len(_REGULATION_DOMAIN_AFFINITY[reg]) >= 1 + + +# ============================================================================= +# Tests: PatternMatcher — Full match() +# ============================================================================= + + +class TestMatchFull: + """Tests for the full match() method.""" + + def setup_method(self): + self.matcher = PatternMatcher() + self.matcher._load_patterns() + self.matcher._build_keyword_index() + self.matcher._initialized = True + # Empty embeddings — Tier 2 returns None + self.matcher._pattern_embeddings = [] + self.matcher._pattern_ids = [] + + @pytest.mark.asyncio + async def test_match_password_text(self): + """Password text should match CP-AUTH-001 via keywords.""" + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + result = await self.matcher.match( + obligation_text=( + "Passwortrichtlinie muss sicherstellen dass Anmeldedaten " + "und credential geschuetzt sind und authentifizierung robust ist" + ), + regulation_id="nis2", + ) + assert result.pattern_id == "CP-AUTH-001" + assert result.confidence > 0 + + @pytest.mark.asyncio + async def test_match_encryption_text(self): + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + result = await self.matcher.match( + obligation_text=( + "Verschluesselung ruhender Daten muss mit AES-256 encryption " + "und schluesselmanagement kryptographie erfolgen" + ), + ) + assert result.pattern_id == "CP-CRYP-001" + + @pytest.mark.asyncio + async def test_match_empty_text(self): + result = await self.matcher.match(obligation_text="") + assert result.method == "none" + assert result.confidence == 0.0 + + @pytest.mark.asyncio + async def 
test_match_no_patterns(self): + """When no patterns loaded, should return empty result.""" + matcher = PatternMatcher() + matcher._initialized = True + result = await matcher.match(obligation_text="test") + assert result.method == "none" + + @pytest.mark.asyncio + async def test_match_composable_patterns(self): + """Result should include composable_with references.""" + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + result = await self.matcher.match( + obligation_text=( + "Passwortrichtlinie muss sicherstellen dass Anmeldedaten " + "und credential geschuetzt sind und authentifizierung robust ist" + ), + ) + if result.pattern and result.pattern.composable_with: + assert len(result.composable_patterns) >= 1 + + @pytest.mark.asyncio + async def test_match_with_domain_bonus(self): + """DSGVO obligation with DATA keywords should get domain bonus.""" + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + result = await self.matcher.match( + obligation_text=( + "Personenbezogene Daten muessen durch Datenschutz und " + "datensicherheit geschuetzt werden mit datenminimierung " + "und speicherbegrenzung und loeschung" + ), + regulation_id="dsgvo", + ) + # Should match a DATA-domain pattern + if result.pattern and result.pattern.domain == "DATA": + assert result.domain_bonus_applied is True + + +# ============================================================================= +# Tests: PatternMatcher — match_top_n() +# ============================================================================= + + +class TestMatchTopN: + """Tests for top-N matching.""" + + def setup_method(self): + self.matcher = PatternMatcher() + self.matcher._load_patterns() + self.matcher._build_keyword_index() + self.matcher._initialized = True + self.matcher._pattern_embeddings = [] + self.matcher._pattern_ids = [] + + @pytest.mark.asyncio + async def 
test_top_n_returns_list(self): + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + results = await self.matcher.match_top_n( + obligation_text=( + "Passwortrichtlinie muss sicherstellen dass Anmeldedaten " + "und credential geschuetzt sind und authentifizierung robust ist" + ), + n=3, + ) + assert isinstance(results, list) + assert len(results) >= 1 + + @pytest.mark.asyncio + async def test_top_n_sorted_by_confidence(self): + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + results = await self.matcher.match_top_n( + obligation_text=( + "Verschluesselung und kryptographie und schluesselmanagement " + "und authentifizierung und password und zugriffskontrolle" + ), + n=5, + ) + if len(results) >= 2: + for i in range(len(results) - 1): + assert results[i].confidence >= results[i + 1].confidence + + @pytest.mark.asyncio + async def test_top_n_empty_text(self): + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + results = await self.matcher.match_top_n(obligation_text="", n=3) + assert results == [] + + @pytest.mark.asyncio + async def test_top_n_respects_limit(self): + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + results = await self.matcher.match_top_n( + obligation_text=( + "Verschluesselung und kryptographie und schluesselmanagement " + "und authentifizierung und password und zugriffskontrolle" + ), + n=2, + ) + assert len(results) <= 2 + + +# ============================================================================= +# Tests: PatternMatcher — Public Helpers +# ============================================================================= + + +class TestPublicHelpers: + """Tests for get_pattern, get_patterns_by_domain, stats.""" + + def setup_method(self): + self.matcher = 
PatternMatcher() + self.matcher._load_patterns() + self.matcher._build_keyword_index() + + def test_get_pattern_existing(self): + p = self.matcher.get_pattern("CP-AUTH-001") + assert p is not None + assert p.id == "CP-AUTH-001" + + def test_get_pattern_case_insensitive(self): + p = self.matcher.get_pattern("cp-auth-001") + assert p is not None + + def test_get_pattern_nonexistent(self): + p = self.matcher.get_pattern("CP-FAKE-999") + assert p is None + + def test_get_patterns_by_domain(self): + patterns = self.matcher.get_patterns_by_domain("AUTH") + assert len(patterns) >= 3 + + def test_get_patterns_by_domain_case_insensitive(self): + patterns = self.matcher.get_patterns_by_domain("auth") + assert len(patterns) >= 3 + + def test_get_patterns_by_domain_unknown(self): + patterns = self.matcher.get_patterns_by_domain("NOPE") + assert patterns == [] + + def test_stats(self): + stats = self.matcher.stats() + assert stats["total_patterns"] == 50 + assert len(stats["domains"]) >= 5 + assert stats["keywords"] > 50 + assert stats["initialized"] is False + + +# ============================================================================= +# Tests: PatternMatcher — auto initialize +# ============================================================================= + + +class TestAutoInitialize: + """Tests for auto-initialization on first match call.""" + + @pytest.mark.asyncio + async def test_auto_init_on_match(self): + matcher = PatternMatcher() + assert not matcher._initialized + + with patch.object( + matcher, "initialize", new_callable=AsyncMock + ) as mock_init: + async def side_effect(): + matcher._initialized = True + matcher._load_patterns() + matcher._build_keyword_index() + matcher._pattern_embeddings = [] + matcher._pattern_ids = [] + + mock_init.side_effect = side_effect + + with patch( + "compliance.services.pattern_matcher._get_embedding", + new_callable=AsyncMock, + return_value=[], + ): + await matcher.match(obligation_text="test text") + + 
mock_init.assert_called_once() + + @pytest.mark.asyncio + async def test_no_double_init(self): + matcher = PatternMatcher() + matcher._initialized = True + matcher._patterns = [] + + with patch.object( + matcher, "initialize", new_callable=AsyncMock + ) as mock_init: + await matcher.match(obligation_text="test text") + mock_init.assert_not_called() + + +# ============================================================================= +# Tests: Constants +# ============================================================================= + + +class TestConstants: + """Tests for module-level constants.""" + + def test_keyword_min_hits(self): + assert KEYWORD_MATCH_MIN_HITS >= 1 + + def test_embedding_threshold_range(self): + assert 0 < EMBEDDING_PATTERN_THRESHOLD <= 1.0 + + def test_domain_bonus_range(self): + assert 0 < DOMAIN_BONUS <= 0.20 + + def test_domain_bonus_is_010(self): + assert DOMAIN_BONUS == 0.10 + + def test_embedding_threshold_is_075(self): + assert EMBEDDING_PATTERN_THRESHOLD == 0.75 + + +# ============================================================================= +# Tests: Integration — Real keyword matching scenarios +# ============================================================================= + + +class TestRealKeywordScenarios: + """Integration tests with realistic obligation texts.""" + + def setup_method(self): + self.matcher = PatternMatcher() + self.matcher._load_patterns() + self.matcher._build_keyword_index() + + def test_dsgvo_consent_obligation(self): + """DSGVO consent obligation should match data protection patterns.""" + scores = self.matcher._keyword_scores( + "Die Einwilligung der betroffenen Person muss freiwillig und " + "informiert erfolgen. Eine Verarbeitung personenbezogener Daten " + "ist nur mit gültiger Einwilligung zulaessig. 
Datenschutz.", + "dsgvo", + ) + # Should have matches in DATA domain patterns + data_matches = [pid for pid in scores if pid.startswith("CP-DATA")] + assert len(data_matches) >= 1 + + def test_ai_act_risk_assessment(self): + """AI Act risk assessment should match AI patterns.""" + scores = self.matcher._keyword_scores( + "KI-Systeme mit hohem Risiko muessen einer Konformitaetsbewertung " + "unterzogen werden. Transparenz und Erklaerbarkeit sind Pflicht.", + "ai_act", + ) + ai_matches = [pid for pid in scores if pid.startswith("CP-AI")] + assert len(ai_matches) >= 1 + + def test_nis2_incident_response(self): + """NIS2 incident text should match INC patterns.""" + scores = self.matcher._keyword_scores( + "Sicherheitsvorfaelle muessen innerhalb von 24 Stunden gemeldet " + "werden. Ein incident response plan und Eskalationsverfahren " + "sind zu etablieren fuer Vorfall und Wiederherstellung.", + "nis2", + ) + inc_matches = [pid for pid in scores if pid.startswith("CP-INC")] + assert len(inc_matches) >= 1 + + def test_audit_logging_obligation(self): + """Audit logging obligation should match LOG patterns.""" + scores = self.matcher._keyword_scores( + "Alle sicherheitsrelevanten Ereignisse muessen protokolliert werden. " + "Audit-Trail und Monitoring der Zugriffe sind Pflicht. " + "Protokollierung muss manipulationssicher sein.", + None, + ) + log_matches = [pid for pid in scores if pid.startswith("CP-LOG")] + assert len(log_matches) >= 1 + + def test_access_control_obligation(self): + """Access control text should match ACC patterns.""" + scores = self.matcher._keyword_scores( + "Zugriffskontrolle nach dem Least-Privilege-Prinzip. 
" + "Rollenbasierte Autorisierung und Berechtigung fuer alle Systeme.", + None, + ) + acc_matches = [pid for pid in scores if pid.startswith("CP-ACC")] + assert len(acc_matches) >= 1 diff --git a/backend-compliance/tests/test_pipeline_adapter.py b/backend-compliance/tests/test_pipeline_adapter.py new file mode 100644 index 0000000..6c6c713 --- /dev/null +++ b/backend-compliance/tests/test_pipeline_adapter.py @@ -0,0 +1,682 @@ +"""Tests for Pipeline Adapter — Phase 7 of Multi-Layer Control Architecture. + +Validates: +- PipelineChunk and PipelineResult dataclasses +- PipelineAdapter.process_chunk() — full 3-stage flow +- PipelineAdapter.process_batch() — batch processing +- PipelineAdapter.write_crosswalk() — DB write logic (mocked) +- MigrationPasses — all 5 passes (with mocked DB) +- _extract_regulation_article helper +- Edge cases: missing data, LLM failures, initialization +""" + +import json +from unittest.mock import AsyncMock, MagicMock, patch, call + +import pytest + +from compliance.services.pipeline_adapter import ( + MigrationPasses, + PipelineAdapter, + PipelineChunk, + PipelineResult, + _extract_regulation_article, +) +from compliance.services.obligation_extractor import ObligationMatch +from compliance.services.pattern_matcher import ControlPattern, PatternMatchResult +from compliance.services.control_composer import ComposedControl + + +# ============================================================================= +# Tests: PipelineChunk +# ============================================================================= + + +class TestPipelineChunk: + def test_defaults(self): + chunk = PipelineChunk(text="test") + assert chunk.text == "test" + assert chunk.collection == "" + assert chunk.regulation_code == "" + assert chunk.license_rule == 3 + assert chunk.chunk_hash == "" + + def test_compute_hash(self): + chunk = PipelineChunk(text="hello world") + h = chunk.compute_hash() + assert len(h) == 64 # SHA256 hex + assert h == chunk.chunk_hash # cached + + 
def test_compute_hash_deterministic(self): + chunk1 = PipelineChunk(text="same text") + chunk2 = PipelineChunk(text="same text") + assert chunk1.compute_hash() == chunk2.compute_hash() + + def test_compute_hash_idempotent(self): + chunk = PipelineChunk(text="test") + h1 = chunk.compute_hash() + h2 = chunk.compute_hash() + assert h1 == h2 + + +# ============================================================================= +# Tests: PipelineResult +# ============================================================================= + + +class TestPipelineResult: + def test_defaults(self): + chunk = PipelineChunk(text="test") + result = PipelineResult(chunk=chunk) + assert result.control is None + assert result.crosswalk_written is False + assert result.error is None + + def test_to_dict(self): + chunk = PipelineChunk(text="test") + chunk.compute_hash() + result = PipelineResult( + chunk=chunk, + obligation=ObligationMatch( + obligation_id="DSGVO-OBL-001", + method="exact_match", + confidence=1.0, + ), + pattern_result=PatternMatchResult( + pattern_id="CP-AUTH-001", + method="keyword", + confidence=0.85, + ), + control=ComposedControl(title="Test Control"), + ) + d = result.to_dict() + assert d["chunk_hash"] == chunk.chunk_hash + assert d["obligation"]["obligation_id"] == "DSGVO-OBL-001" + assert d["pattern"]["pattern_id"] == "CP-AUTH-001" + assert d["control"]["title"] == "Test Control" + assert d["error"] is None + + +# ============================================================================= +# Tests: _extract_regulation_article +# ============================================================================= + + +class TestExtractRegulationArticle: + def test_from_citation_json(self): + citation = json.dumps({ + "source": "eu_2016_679", + "article": "Art. 30", + }) + reg, art = _extract_regulation_article(citation, None) + assert reg == "dsgvo" + assert art == "Art. 
30" + + def test_from_metadata(self): + metadata = json.dumps({ + "source_regulation": "eu_2024_1689", + "source_article": "Art. 6", + }) + reg, art = _extract_regulation_article(None, metadata) + assert reg == "ai_act" + assert art == "Art. 6" + + def test_citation_takes_priority(self): + citation = json.dumps({"source": "dsgvo", "article": "Art. 30"}) + metadata = json.dumps({"source_regulation": "nis2", "source_article": "Art. 21"}) + reg, art = _extract_regulation_article(citation, metadata) + assert reg == "dsgvo" + assert art == "Art. 30" + + def test_empty_inputs(self): + reg, art = _extract_regulation_article(None, None) + assert reg is None + assert art is None + + def test_invalid_json(self): + reg, art = _extract_regulation_article("not json", "also not json") + assert reg is None + assert art is None + + def test_citation_as_dict(self): + citation = {"source": "bdsg", "article": "§ 38"} + reg, art = _extract_regulation_article(citation, None) + assert reg == "bdsg" + assert art == "§ 38" + + def test_source_article_key(self): + citation = json.dumps({"source": "dsgvo", "source_article": "Art. 32"}) + reg, art = _extract_regulation_article(citation, None) + assert reg == "dsgvo" + assert art == "Art. 32" + + def test_unknown_source(self): + citation = json.dumps({"source": "unknown_law", "article": "Art. 1"}) + reg, art = _extract_regulation_article(citation, None) + assert reg is None # _normalize_regulation returns None + assert art == "Art. 
1" + + +# ============================================================================= +# Tests: PipelineAdapter — process_chunk +# ============================================================================= + + +class TestPipelineAdapterProcessChunk: + """Tests for the full 3-stage chunk processing.""" + + @pytest.mark.asyncio + async def test_process_chunk_full_flow(self): + """Process a chunk through all 3 stages.""" + adapter = PipelineAdapter() + + obligation = ObligationMatch( + obligation_id="DSGVO-OBL-001", + obligation_title="Verarbeitungsverzeichnis", + obligation_text="Fuehrung eines Verzeichnisses", + method="exact_match", + confidence=1.0, + regulation_id="dsgvo", + ) + pattern_result = PatternMatchResult( + pattern_id="CP-COMP-001", + method="keyword", + confidence=0.85, + ) + composed = ComposedControl( + title="Test Control", + objective="Test objective", + pattern_id="CP-COMP-001", + ) + + with patch.object( + adapter._extractor, "initialize", new_callable=AsyncMock + ), patch.object( + adapter._matcher, "initialize", new_callable=AsyncMock + ), patch.object( + adapter._extractor, "extract", + new_callable=AsyncMock, return_value=obligation, + ), patch.object( + adapter._matcher, "match", + new_callable=AsyncMock, return_value=pattern_result, + ), patch.object( + adapter._composer, "compose", + new_callable=AsyncMock, return_value=composed, + ): + adapter._initialized = True + chunk = PipelineChunk( + text="Art. 30 DSGVO Verarbeitungsverzeichnis", + regulation_code="eu_2016_679", + article="Art. 
30", + license_rule=1, + ) + result = await adapter.process_chunk(chunk) + + assert result.obligation.obligation_id == "DSGVO-OBL-001" + assert result.pattern_result.pattern_id == "CP-COMP-001" + assert result.control.title == "Test Control" + assert result.error is None + assert result.chunk.chunk_hash != "" + + @pytest.mark.asyncio + async def test_process_chunk_error_handling(self): + """Errors during processing should be captured, not raised.""" + adapter = PipelineAdapter() + adapter._initialized = True + + with patch.object( + adapter._extractor, "extract", + new_callable=AsyncMock, side_effect=Exception("LLM timeout"), + ): + chunk = PipelineChunk(text="test text") + result = await adapter.process_chunk(chunk) + + assert result.error == "LLM timeout" + assert result.control is None + + @pytest.mark.asyncio + async def test_process_chunk_uses_obligation_text_for_pattern(self): + """Pattern matcher should receive obligation text, not raw chunk.""" + adapter = PipelineAdapter() + adapter._initialized = True + + obligation = ObligationMatch( + obligation_text="Specific obligation text", + regulation_id="dsgvo", + ) + + with patch.object( + adapter._extractor, "extract", + new_callable=AsyncMock, return_value=obligation, + ), patch.object( + adapter._matcher, "match", + new_callable=AsyncMock, return_value=PatternMatchResult(), + ) as mock_match, patch.object( + adapter._composer, "compose", + new_callable=AsyncMock, return_value=ComposedControl(), + ): + await adapter.process_chunk(PipelineChunk(text="raw chunk text")) + + # Pattern matcher should receive the obligation text + mock_match.assert_called_once() + call_args = mock_match.call_args + assert call_args.kwargs["obligation_text"] == "Specific obligation text" + + @pytest.mark.asyncio + async def test_process_chunk_fallback_to_chunk_text(self): + """When obligation has no text, use chunk text for pattern matching.""" + adapter = PipelineAdapter() + adapter._initialized = True + + obligation = 
ObligationMatch() # No text + + with patch.object( + adapter._extractor, "extract", + new_callable=AsyncMock, return_value=obligation, + ), patch.object( + adapter._matcher, "match", + new_callable=AsyncMock, return_value=PatternMatchResult(), + ) as mock_match, patch.object( + adapter._composer, "compose", + new_callable=AsyncMock, return_value=ComposedControl(), + ): + await adapter.process_chunk(PipelineChunk(text="fallback chunk text")) + + call_args = mock_match.call_args + assert "fallback chunk text" in call_args.kwargs["obligation_text"] + + +# ============================================================================= +# Tests: PipelineAdapter — process_batch +# ============================================================================= + + +class TestPipelineAdapterBatch: + @pytest.mark.asyncio + async def test_process_batch(self): + adapter = PipelineAdapter() + adapter._initialized = True + + with patch.object( + adapter, "process_chunk", + new_callable=AsyncMock, + return_value=PipelineResult(chunk=PipelineChunk(text="x")), + ): + chunks = [PipelineChunk(text="a"), PipelineChunk(text="b")] + results = await adapter.process_batch(chunks) + + assert len(results) == 2 + + @pytest.mark.asyncio + async def test_process_batch_empty(self): + adapter = PipelineAdapter() + adapter._initialized = True + results = await adapter.process_batch([]) + assert results == [] + + +# ============================================================================= +# Tests: PipelineAdapter — write_crosswalk +# ============================================================================= + + +class TestWriteCrosswalk: + def test_write_crosswalk_success(self): + """write_crosswalk should execute 3 DB statements.""" + mock_db = MagicMock() + mock_db.execute = MagicMock() + mock_db.commit = MagicMock() + + adapter = PipelineAdapter(db=mock_db) + chunk = PipelineChunk( + text="test", regulation_code="eu_2016_679", + article="Art. 
30", collection="bp_compliance_ce", + ) + chunk.compute_hash() + + result = PipelineResult( + chunk=chunk, + obligation=ObligationMatch( + obligation_id="DSGVO-OBL-001", + method="exact_match", + confidence=1.0, + ), + pattern_result=PatternMatchResult( + pattern_id="CP-COMP-001", + confidence=0.85, + ), + control=ComposedControl( + control_id="COMP-001", + pattern_id="CP-COMP-001", + obligation_ids=["DSGVO-OBL-001"], + ), + ) + + success = adapter.write_crosswalk(result, "uuid-123") + assert success is True + assert mock_db.execute.call_count == 3 # insert + insert + update + mock_db.commit.assert_called_once() + + def test_write_crosswalk_no_db(self): + adapter = PipelineAdapter(db=None) + chunk = PipelineChunk(text="test") + result = PipelineResult(chunk=chunk, control=ComposedControl()) + assert adapter.write_crosswalk(result, "uuid") is False + + def test_write_crosswalk_no_control(self): + mock_db = MagicMock() + adapter = PipelineAdapter(db=mock_db) + chunk = PipelineChunk(text="test") + result = PipelineResult(chunk=chunk, control=None) + assert adapter.write_crosswalk(result, "uuid") is False + + def test_write_crosswalk_db_error(self): + mock_db = MagicMock() + mock_db.execute = MagicMock(side_effect=Exception("DB error")) + mock_db.rollback = MagicMock() + + adapter = PipelineAdapter(db=mock_db) + chunk = PipelineChunk(text="test") + chunk.compute_hash() + result = PipelineResult( + chunk=chunk, + obligation=ObligationMatch(), + pattern_result=PatternMatchResult(), + control=ComposedControl(control_id="X-001"), + ) + assert adapter.write_crosswalk(result, "uuid") is False + mock_db.rollback.assert_called_once() + + +# ============================================================================= +# Tests: PipelineAdapter — stats and initialization +# ============================================================================= + + +class TestPipelineAdapterInit: + def test_stats_before_init(self): + adapter = PipelineAdapter() + stats = adapter.stats() + 
assert stats["initialized"] is False + + @pytest.mark.asyncio + async def test_auto_initialize(self): + adapter = PipelineAdapter() + with patch.object( + adapter, "initialize", new_callable=AsyncMock, + ) as mock_init: + async def side_effect(): + adapter._initialized = True + mock_init.side_effect = side_effect + + with patch.object( + adapter._extractor, "extract", + new_callable=AsyncMock, return_value=ObligationMatch(), + ), patch.object( + adapter._matcher, "match", + new_callable=AsyncMock, return_value=PatternMatchResult(), + ), patch.object( + adapter._composer, "compose", + new_callable=AsyncMock, return_value=ComposedControl(), + ): + await adapter.process_chunk(PipelineChunk(text="test")) + + mock_init.assert_called_once() + + +# ============================================================================= +# Tests: MigrationPasses — Pass 1 (Obligation Linkage) +# ============================================================================= + + +class TestPass1ObligationLinkage: + @pytest.mark.asyncio + async def test_pass1_links_controls(self): + """Pass 1 should link controls with matching articles to obligations.""" + mock_db = MagicMock() + + # Simulate 2 controls: one with citation, one without + mock_db.execute.return_value.fetchall.return_value = [ + ( + "uuid-1", "COMP-001", + json.dumps({"source": "eu_2016_679", "article": "Art. 30"}), + json.dumps({"source_regulation": "eu_2016_679"}), + ), + ( + "uuid-2", "SEC-001", + None, # No citation + None, # No metadata + ), + ] + + migration = MigrationPasses(db=mock_db) + await migration.initialize() + + # Reset mock after initialize queries + mock_db.execute.reset_mock() + mock_db.execute.return_value.fetchall.return_value = [ + ( + "uuid-1", "COMP-001", + json.dumps({"source": "eu_2016_679", "article": "Art. 
30"}), + json.dumps({"source_regulation": "eu_2016_679"}), + ), + ( + "uuid-2", "SEC-001", + None, + None, + ), + ] + + stats = await migration.run_pass1_obligation_linkage() + + assert stats["total"] == 2 + assert stats["no_citation"] >= 1 + + @pytest.mark.asyncio + async def test_pass1_with_limit(self): + """Pass 1 should respect limit parameter.""" + mock_db = MagicMock() + mock_db.execute.return_value.fetchall.return_value = [] + + migration = MigrationPasses(db=mock_db) + migration._initialized = True + migration._extractor._load_obligations() + + stats = await migration.run_pass1_obligation_linkage(limit=10) + assert stats["total"] == 0 + + # Check that LIMIT was in the SQL text clause + query_call = mock_db.execute.call_args + sql_text_obj = query_call[0][0] # first positional arg is the text() object + assert "LIMIT" in sql_text_obj.text + + +# ============================================================================= +# Tests: MigrationPasses — Pass 2 (Pattern Classification) +# ============================================================================= + + +class TestPass2PatternClassification: + @pytest.mark.asyncio + async def test_pass2_classifies_controls(self): + """Pass 2 should match controls to patterns via keywords.""" + mock_db = MagicMock() + mock_db.execute.return_value.fetchall.return_value = [ + ( + "uuid-1", "AUTH-001", + "Passwortrichtlinie und Authentifizierung", + "Sicherstellen dass Anmeldedaten credential geschuetzt sind", + ), + ] + + migration = MigrationPasses(db=mock_db) + await migration.initialize() + + mock_db.execute.reset_mock() + mock_db.execute.return_value.fetchall.return_value = [ + ( + "uuid-1", "AUTH-001", + "Passwortrichtlinie und Authentifizierung", + "Sicherstellen dass Anmeldedaten credential geschuetzt sind", + ), + ] + + stats = await migration.run_pass2_pattern_classification() + + assert stats["total"] == 1 + # Should classify because "passwort", "authentifizierung", "anmeldedaten" are keywords + assert 
stats["classified"] == 1 + + @pytest.mark.asyncio + async def test_pass2_no_match(self): + """Controls without keyword matches should be counted as no_match.""" + mock_db = MagicMock() + mock_db.execute.return_value.fetchall.return_value = [ + ( + "uuid-1", "MISC-001", + "Completely unrelated title", + "No keywords match here at all", + ), + ] + + migration = MigrationPasses(db=mock_db) + await migration.initialize() + + mock_db.execute.reset_mock() + mock_db.execute.return_value.fetchall.return_value = [ + ( + "uuid-1", "MISC-001", + "Completely unrelated title", + "No keywords match here at all", + ), + ] + + stats = await migration.run_pass2_pattern_classification() + assert stats["no_match"] == 1 + + +# ============================================================================= +# Tests: MigrationPasses — Pass 3 (Quality Triage) +# ============================================================================= + + +class TestPass3QualityTriage: + def test_pass3_executes_4_updates(self): + """Pass 3 should execute exactly 4 UPDATE statements.""" + mock_db = MagicMock() + mock_result = MagicMock() + mock_result.rowcount = 10 + mock_db.execute.return_value = mock_result + + migration = MigrationPasses(db=mock_db) + stats = migration.run_pass3_quality_triage() + + assert mock_db.execute.call_count == 4 + mock_db.commit.assert_called_once() + assert "review" in stats + assert "needs_obligation" in stats + assert "needs_pattern" in stats + assert "legacy_unlinked" in stats + + +# ============================================================================= +# Tests: MigrationPasses — Pass 4 (Crosswalk Backfill) +# ============================================================================= + + +class TestPass4CrosswalkBackfill: + def test_pass4_inserts_crosswalk_rows(self): + mock_db = MagicMock() + mock_result = MagicMock() + mock_result.rowcount = 42 + mock_db.execute.return_value = mock_result + + migration = MigrationPasses(db=mock_db) + stats = 
migration.run_pass4_crosswalk_backfill() + + assert stats["rows_inserted"] == 42 + mock_db.commit.assert_called_once() + + +# ============================================================================= +# Tests: MigrationPasses — Pass 5 (Deduplication) +# ============================================================================= + + +class TestPass5Deduplication: + def test_pass5_no_duplicates(self): + mock_db = MagicMock() + mock_db.execute.return_value.fetchall.return_value = [] + + migration = MigrationPasses(db=mock_db) + stats = migration.run_pass5_deduplication() + + assert stats["groups_found"] == 0 + assert stats["controls_deprecated"] == 0 + + def test_pass5_deprecates_duplicates(self): + """Pass 5 should keep first (highest confidence) and deprecate rest.""" + mock_db = MagicMock() + + # First call: groups query returns one group with 3 controls + groups_result = MagicMock() + groups_result.fetchall.return_value = [ + ( + "CP-AUTH-001", # pattern_id + "DSGVO-OBL-001", # obligation_id + ["uuid-1", "uuid-2", "uuid-3"], # ids (ordered by confidence) + 3, # count + ), + ] + + # Subsequent calls: UPDATE queries + update_result = MagicMock() + update_result.rowcount = 1 + + mock_db.execute.side_effect = [groups_result, update_result, update_result] + + migration = MigrationPasses(db=mock_db) + stats = migration.run_pass5_deduplication() + + assert stats["groups_found"] == 1 + assert stats["controls_deprecated"] == 2 # uuid-2, uuid-3 + mock_db.commit.assert_called_once() + + +# ============================================================================= +# Tests: MigrationPasses — migration_status +# ============================================================================= + + +class TestMigrationStatus: + def test_migration_status(self): + mock_db = MagicMock() + mock_db.execute.return_value.fetchone.return_value = ( + 4800, # total + 2880, # has_obligation (60%) + 3360, # has_pattern (70%) + 2400, # fully_linked (50%) + 300, # deprecated + ) + + 
migration = MigrationPasses(db=mock_db) + status = migration.migration_status() + + assert status["total_controls"] == 4800 + assert status["has_obligation"] == 2880 + assert status["has_pattern"] == 3360 + assert status["fully_linked"] == 2400 + assert status["deprecated"] == 300 + assert status["coverage_obligation_pct"] == 60.0 + assert status["coverage_pattern_pct"] == 70.0 + assert status["coverage_full_pct"] == 50.0 + + def test_migration_status_empty_db(self): + mock_db = MagicMock() + mock_db.execute.return_value.fetchone.return_value = (0, 0, 0, 0, 0) + + migration = MigrationPasses(db=mock_db) + status = migration.migration_status() + + assert status["total_controls"] == 0 + assert status["coverage_obligation_pct"] == 0.0 diff --git a/docs-src/services/sdk-modules/canonical-control-library.md b/docs-src/services/sdk-modules/canonical-control-library.md index 287364e..3ebd239 100644 --- a/docs-src/services/sdk-modules/canonical-control-library.md +++ b/docs-src/services/sdk-modules/canonical-control-library.md @@ -707,3 +707,258 @@ Die Generator-Tests decken folgende Bereiche ab: - **`TestAnchorFinder`** (2 Tests) — RAG-Suche filtert Rule 3 Quellen aus, Web-Suche erkennt Frameworks - **`TestPipelineMocked`** (5 Tests) — End-to-End mit Mocks: Lizenz-Klassifikation, Rule 3 Blocking, Hash-Deduplizierung, Config-Defaults (`batch_size: 5`), Rule 1 Citation-Generierung + +--- + +## Multi-Layer Control Architecture + +Erweitert die bestehende Pipeline um ein 5-Schichten-Modell: + +``` +Legal Source → Obligation → Control Pattern → Master Control → Customer Instance +``` + +### Architektur-Uebersicht + +| Layer | Asset | Beschreibung | +|-------|-------|-------------| +| 1: Legal Sources | Qdrant 5 Collections, 105K+ Chunks | RAG-Rohdaten | +| 2: Obligations | v2 Framework (325 Pflichten, 9 Verordnungen) | Rechtliche Pflichten | +| 3: Control Patterns | 50 YAML Patterns (30 Core + 20 IT-Security) | Umsetzungsmuster | +| 4: Master Controls | canonical_controls 
(atomare Controls nach Dedup) | Kanonische Controls | +| 5: Customer Instance | TOM Controls + Gap Mapping | Kundenspezifisch | + +### Control-Ebenen + +| Ebene | Beschreibung | Nutzen | +|-------|-------------|--------| +| **Rich Controls** | Narrativ, erklaerend, kontextreich (~25.000) | Schulung, Audit-Fragen, Massnahmenplaene | +| **Atomare Controls** | 1 Pflicht = 1 Control (nach Decomposition + Dedup) | Systemaudits, Code-Checks, Gap-Analyse, Traceability | + +### Pipeline-Erweiterung (10-Stage) + +``` +Stage 1: RAG SCAN (unveraendert) +Stage 2: LICENSE CLASSIFY (unveraendert) +Stage 3: PREFILTER (unveraendert) +Stage 4: OBLIGATION EXTRACT (NEU — 3-Tier: exact → embedding → LLM) +Stage 5: PATTERN MATCH (NEU — Keyword + Embedding + Domain-Bonus) +Stage 6: CONTROL COMPOSE (NEU — Pattern + Obligation → Control) +Stage 7: HARMONIZE (unveraendert) +Stage 8: ANCHOR SEARCH (unveraendert) +Stage 9: STORE + CROSSWALK (erweitert — Crosswalk-Matrix) +Stage 10: MARK PROCESSED (unveraendert) +``` + +--- + +### Obligation Extractor (Stage 4) + +3-Tier Extraktion (schnellste zuerst): + +| Tier | Methode | Latenz | Trefferquote | +|------|---------|--------|--------------| +| 1 | Exact Match (regulation_code + article → obligation_id) | <1ms | ~40% | +| 2 | Embedding Match (Cosine > 0.80 gegen 325 Obligations) | ~50ms | ~30% | +| 3 | LLM Extraction (lokales Ollama, nur Fallback) | ~2s | ~25% | + +**Datei:** `compliance/services/obligation_extractor.py` + +### Pattern Library (Stage 5) + +50 YAML-basierte Control Patterns in 16 Domains: + +| Datei | Patterns | Domains | +|-------|----------|---------| +| `core_patterns.yaml` | 30 | AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, COMP, GOV, RES | +| `domain_it_security.yaml` | 20 | SEC, NET, AUTH, LOG, CRYP | + +**Pattern ID Format:** `CP-{DOMAIN}-{NNN}` (z.B. 
`CP-AUTH-001`) + +**Matching:** 2-Tier (Keyword-Index + Embedding), Domain-Bonus (+0.10) + +**Dateien:** +- `ai-compliance-sdk/policies/control_patterns/core_patterns.yaml` +- `ai-compliance-sdk/policies/control_patterns/domain_it_security.yaml` +- `compliance/services/pattern_matcher.py` + +### Control Composer (Stage 6) + +Drei Kompositions-Modi: + +| Modus | Wann | Qualitaet | +|-------|------|-----------| +| Pattern-guided | Pattern gefunden, LLM antwortet | Hoch | +| Template-only | LLM-Fehler, aber Pattern vorhanden | Mittel | +| Fallback | Kein Pattern-Match | Basis | + +**Datei:** `compliance/services/control_composer.py` + +--- + +### Decomposition Pass (Pass 0) + +Zerlegt Rich Controls in atomare Controls. Laeuft VOR den Migration Passes 1-5. + +#### Pass 0a — Obligation Extraction + +Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM. + +**6 Guardrails:** + +1. Nur normative Aussagen (müssen, sicherzustellen, verpflichtet, ...) +2. Ein Hauptverb pro Pflicht +3. Testpflichten separat +4. Meldepflichten separat +5. Nicht auf Evidence-Ebene zerlegen +6. Parent-Link immer erhalten + +**Quality Gate:** Jeder Kandidat wird gegen 6 Kriterien geprueft: + +- `has_normative_signal` — Normatives Sprachsignal erkannt +- `single_action` — Nur eine Handlung +- `not_rationale` — Keine blosse Begruendung +- `not_evidence_only` — Kein reines Evidence-Fragment +- `min_length` — Mindestlaenge erreicht +- `has_parent_link` — Referenz zum Rich Control + +Kritische Checks: `has_normative_signal`, `not_evidence_only`, `min_length`, `has_parent_link` + +#### Pass 0b — Atomic Control Composition + +Erstellt aus jedem validierten Obligation Candidate ein atomares Control +(LLM-gestuetzt mit Template-Fallback). 
+ +**Datei:** `compliance/services/decomposition_pass.py` + +--- + +### Migration Passes (1-5) + +Nicht-destruktive Passes fuer bestehende Controls: + +| Pass | Beschreibung | Methode | +|------|-------------|---------| +| 1 | Obligation Linkage | source_citation → article → obligation_id (deterministisch) | +| 2 | Pattern Classification | Keyword-Matching gegen Pattern Library | +| 3 | Quality Triage | Kategorisierung: review / needs_obligation / needs_pattern / legacy_unlinked | +| 4 | Crosswalk Backfill | crosswalk_matrix Zeilen fuer verlinkte Controls | +| 5 | Deduplication | Gleiche obligation_id + pattern_id → Duplikat markieren | + +**Datei:** `compliance/services/pipeline_adapter.py` + +--- + +### Crosswalk Matrix + +Der "goldene Faden" von Gesetz bis Umsetzung: + +``` +Regulation → Article → Obligation → Pattern → Master Control → TOM +``` + +Ein atomares Control kann von **mehreren Gesetzen** gleichzeitig gefordert sein. +Die Crosswalk-Matrix bildet diese N:M-Beziehung ab. + +--- + +### DB-Schema (Migrations 060 + 061) + +**Migration 060:** Multi-Layer Basistabellen + +| Tabelle | Beschreibung | +|---------|-------------| +| `obligation_extractions` | Chunk→Obligation Verknuepfungen (3-Tier Tracking) | +| `control_patterns` | DB-Spiegel der YAML-Patterns fuer SQL-Queries | +| `crosswalk_matrix` | Goldener Faden: Regulation→Obligation→Pattern→Control | +| `canonical_controls.pattern_id` | Pattern-Zuordnung (neues Feld) | +| `canonical_controls.obligation_ids` | Obligation-IDs als JSONB-Array (neues Feld) | + +**Migration 061:** Decomposition-Tabellen + +| Tabelle | Beschreibung | +|---------|-------------| +| `obligation_candidates` | Extrahierte atomare Pflichten aus Rich Controls | +| `canonical_controls.parent_control_uuid` | Self-Referenz zum Rich Control (neues Feld) | +| `canonical_controls.decomposition_method` | Zerlegungsmethode (neues Feld) | + +--- + +### API Endpoints (Crosswalk Routes) + +Alle Endpoints unter `/api/compliance/v1/canonical/`: + 
+#### Pattern Library + +| Methode | Pfad | Beschreibung | +|---------|------|-------------| +| GET | `/patterns` | Alle Patterns (Filter: domain, category, tag) | +| GET | `/patterns/{pattern_id}` | Einzelnes Pattern mit Details | +| GET | `/patterns/{pattern_id}/controls` | Controls aus einem Pattern | + +#### Obligation Extraction + +| Methode | Pfad | Beschreibung | +|---------|------|-------------| +| POST | `/obligations/extract` | Obligation aus Text extrahieren + Pattern matchen | + +#### Crosswalk Matrix + +| Methode | Pfad | Beschreibung | +|---------|------|-------------| +| GET | `/crosswalk` | Query (Filter: regulation, article, obligation, pattern) | +| GET | `/crosswalk/stats` | Abdeckungs-Statistiken | + +#### Migration + Decomposition + +| Methode | Pfad | Beschreibung | +|---------|------|-------------| +| POST | `/migrate/decompose` | Pass 0a: Obligation Extraction aus Rich Controls | +| POST | `/migrate/compose-atomic` | Pass 0b: Atomare Control-Komposition | +| POST | `/migrate/link-obligations` | Pass 1: Obligation-Linkage | +| POST | `/migrate/classify-patterns` | Pass 2: Pattern-Klassifikation | +| POST | `/migrate/triage` | Pass 3: Quality Triage | +| POST | `/migrate/backfill-crosswalk` | Pass 4: Crosswalk-Backfill | +| POST | `/migrate/deduplicate` | Pass 5: Deduplizierung | +| GET | `/migrate/status` | Migrations-Fortschritt | +| GET | `/migrate/decomposition-status` | Decomposition-Fortschritt | + +**Route-Datei:** `compliance/api/crosswalk_routes.py` + +--- + +### Multi-Layer Tests + +| Datei | Tests | Schwerpunkt | +|-------|-------|-------------| +| `tests/test_obligation_extractor.py` | 107 | 3-Tier Extraktion, Helpers, Regex | +| `tests/test_pattern_matcher.py` | 72 | Keyword-Index, Embedding, Domain-Affinity | +| `tests/test_control_composer.py` | 54 | Composition, Templates, License-Rules | +| `tests/test_pipeline_adapter.py` | 36 | Pipeline Integration, 5 Migration Passes | +| `tests/test_crosswalk_routes.py` | 57 | 15 API 
Endpoints, Pydantic Models | +| `tests/test_decomposition_pass.py` | 68 | Pass 0a/0b, Quality Gate, 6 Guardrails | +| `tests/test_migration_060.py` | 12 | Schema-Validierung | +| `tests/test_control_patterns.py` | 18 | YAML-Validierung, Pattern-Schema | +| **Gesamt Multi-Layer** | | **424 Tests** | + +### Geplanter Migrationsflow + +``` +Rich Controls (~25.000, release_state=raw) + ↓ +Pass 0a: Obligation Extraction (LLM + Quality Gate) + ↓ +Pass 0b: Atomic Control Composition (LLM + Template Fallback) + ↓ +Pass 1: Obligation Linking (deterministisch) + ↓ +Pass 2: Pattern Classification (Keyword + Embedding) + ↓ +Pass 3: Quality Triage + ↓ +Pass 4: Crosswalk Backfill + ↓ +Pass 5: Dedup / Merge + ↓ +Master Controls (~15.000-20.000 mit voller Traceability) +```