Adds a routing layer between Pass 0a and Pass 0b that classifies obligations into atomic/compound/framework_container. Framework-container obligations (e.g. "CCM-Praktiken fuer AIS") are decomposed into concrete sub-obligations via an internal framework registry before Pass 0b composition. - New: framework_decomposition.py with routing, matching, decomposition - New: Framework registry (NIST SP 800-53, OWASP ASVS, CSA CCM) as JSON - New: Composite detection flags on atomic controls (is_composite, atomicity) - New: gen_meta fields: framework_ref, framework_domain, decomposition_source - Integration: _route_and_compose() in run_pass0b() deterministic path - 248 tests (198 decomposition + 50 framework), all passing Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
3525 lines
135 KiB
Python
3525 lines
135 KiB
Python
"""Decomposition Pass — Split Rich Controls into Atomic Controls.
|
|
|
|
Pass 0 of the Multi-Layer Control Architecture migration. Runs BEFORE
|
|
Passes 1-5 (obligation linkage, pattern classification, etc.).
|
|
|
|
Two sub-passes:
|
|
Pass 0a: Obligation Extraction — extract individual normative obligations
|
|
from a Rich Control using LLM with strict guardrails.
|
|
Pass 0b: Atomic Control Composition — turn each obligation candidate
|
|
into a standalone atomic control record.
|
|
|
|
Plus a Quality Gate that validates extraction results.
|
|
|
|
Guardrails (the 6 rules):
|
|
1. Only normative statements (müssen, sicherzustellen, verpflichtet, ...)
|
|
2. One main verb per obligation
|
|
3. Test obligations separate from operational obligations
|
|
4. Reporting obligations separate
|
|
5. Don't split at evidence level
|
|
6. Parent link always preserved
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
# LLM Provider Config
# ---------------------------------------------------------------------------
# All values are overridable via environment variables so deployments can
# swap model, batch size and timeout without code changes.

# Anthropic API key; defaults to "" when unset — TODO confirm how callers
# behave without a key (presumably LLM calls fail fast or are skipped).
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
# Model used by the decomposition passes (overridable for cost/quality tuning).
ANTHROPIC_MODEL = os.getenv("DECOMPOSITION_LLM_MODEL", "claude-haiku-4-5-20251001")
# Batch size for decomposition work — presumably controls per LLM batch;
# verify against the pass runner outside this chunk.
DECOMPOSITION_BATCH_SIZE = int(os.getenv("DECOMPOSITION_BATCH_SIZE", "5"))
# HTTP timeout in seconds for a single LLM request.
LLM_TIMEOUT = float(os.getenv("DECOMPOSITION_LLM_TIMEOUT", "120"))
# Base URL of the Anthropic REST API.
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Normative signal detection — 3-Tier Classification
# ---------------------------------------------------------------------------
# Tier 1: Pflicht (mandatory) — strong normative signals
# Tier 2: Empfehlung (recommendation) — weaker normative signals
# Tier 3: Kann (optional/permissive) — permissive signals
# Nothing is rejected — everything is classified.

_PFLICHT_SIGNALS = [
    # German modal obligation phrasings ("must", "has to ensure", "is obliged")
    r"\bmüssen\b", r"\bmuss\b", r"\bhat\s+sicherzustellen\b",
    r"\bhaben\s+sicherzustellen\b", r"\bsind\s+verpflichtet\b",
    r"\bist\s+verpflichtet\b",
    # "ist zu prüfen", "sind zu dokumentieren" (direct "zu" + infinitive)
    r"\bist\s+zu\s+\w+en\b", r"\bsind\s+zu\s+\w+en\b",
    r"\bhat\s+zu\s+\w+en\b", r"\bhaben\s+zu\s+\w+en\b",
    # "ist festzustellen", "sind vorzunehmen" (compound verbs, embedded "zu")
    r"\bist\s+\w+zu\w+en\b", r"\bsind\s+\w+zu\w+en\b",
    # "ist zusätzlich zu prüfen", "sind regelmäßig zu überwachen"
    # (a single adverb between auxiliary and infinitive)
    r"\bist\s+\w+\s+zu\s+\w+en\b", r"\bsind\s+\w+\s+zu\s+\w+en\b",
    r"\bhat\s+\w+\s+zu\s+\w+en\b", r"\bhaben\s+\w+\s+zu\s+\w+en\b",
    # English mandatory signals
    r"\bshall\b", r"\bmust\b", r"\brequired\b",
    # Compound infinitives (gerundive): mitzuteilen, anzuwenden, bereitzustellen
    r"\b\w+zuteilen\b", r"\b\w+zuwenden\b", r"\b\w+zustellen\b", r"\b\w+zulegen\b",
    r"\b\w+zunehmen\b", r"\b\w+zuführen\b", r"\b\w+zuhalten\b", r"\b\w+zusetzen\b",
    r"\b\w+zuweisen\b", r"\b\w+zuordnen\b", r"\b\w+zufügen\b", r"\b\w+zugeben\b",
    # Broad pattern: "ist ... [up to 80 chars] ... zu + infinitive"
    r"\bist\b.{1,80}\bzu\s+\w+en\b", r"\bsind\b.{1,80}\bzu\s+\w+en\b",
]
# Compiled once at import time; case-insensitive so sentence-initial forms match.
_PFLICHT_RE = re.compile("|".join(_PFLICHT_SIGNALS), re.IGNORECASE)

_EMPFEHLUNG_SIGNALS = [
    # Modal verbs (weaker than "muss")
    r"\bsoll\b", r"\bsollen\b", r"\bsollte\b", r"\bsollten\b",
    r"\bgewährleisten\b", r"\bsicherstellen\b",
    # English recommendation signals
    r"\bshould\b", r"\bensure\b", r"\brecommend\w*\b",
    # Common normative infinitives (no auxiliary verb; read as recommendation)
    r"\bnachweisen\b", r"\beinhalten\b", r"\bunterlassen\b", r"\bwahren\b",
    r"\bdokumentieren\b", r"\bimplementieren\b", r"\büberprüfen\b", r"\büberwachen\b",
    # Check instructions phrased as normative statements
    r"\bprüfen,\s+ob\b", r"\bkontrollieren,\s+ob\b",
]
_EMPFEHLUNG_RE = re.compile("|".join(_EMPFEHLUNG_SIGNALS), re.IGNORECASE)

_KANN_SIGNALS = [
    # Permissive modals, German and English
    r"\bkann\b", r"\bkönnen\b", r"\bdarf\b", r"\bdürfen\b",
    r"\bmay\b", r"\boptional\b",
]
_KANN_RE = re.compile("|".join(_KANN_SIGNALS), re.IGNORECASE)

# Union of all normative signals (for backward-compatible has_normative_signal flag)
_NORMATIVE_RE = re.compile(
    "|".join(_PFLICHT_SIGNALS + _EMPFEHLUNG_SIGNALS + _KANN_SIGNALS),
    re.IGNORECASE,
)
|
|
|
|
# Wording that marks justification/reasoning rather than an obligation;
# used by quality_gate()'s not_rationale check.
_RATIONALE_SIGNALS = [
    r"\bda\s+", r"\bweil\b", r"\bgrund\b", r"\berwägung",
    r"\bbecause\b", r"\breason\b", r"\brationale\b",
    r"\bkönnen\s+.*\s+verursachen\b", r"\bführt\s+zu\b",
]
_RATIONALE_RE = re.compile("|".join(_RATIONALE_SIGNALS), re.IGNORECASE)

# Wording that marks test/audit obligations (guardrail rule 3: keep test
# obligations separate) — presumably consumed downstream of this chunk.
_TEST_SIGNALS = [
    r"\btesten\b", r"\btest\b", r"\bprüfung\b", r"\bprüfen\b",
    r"\bgetestet\b", r"\bwirksamkeit\b", r"\baudit\b",
    r"\bregelmäßig\b.*\b(prüf|test|kontroll)",
    r"\beffectiveness\b", r"\bverif",
]
_TEST_RE = re.compile("|".join(_TEST_SIGNALS), re.IGNORECASE)

# Wording that marks reporting/notification obligations (guardrail rule 4:
# keep reporting obligations separate) — presumably consumed downstream.
_REPORTING_SIGNALS = [
    r"\bmelden\b", r"\bmeldung\b", r"\bunterricht",
    r"\binformieren\b", r"\bbenachricht", r"\bnotif",
    r"\breport\b", r"\bbehörd",
]
_REPORTING_RE = re.compile("|".join(_REPORTING_SIGNALS), re.IGNORECASE)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Merge & Enrichment helpers
# ---------------------------------------------------------------------------

# Trigger-type detection patterns

# Event-driven wording (incident/breach/"sobald"/"upon"): the obligation
# fires on a discrete event — see _classify_trigger_type().
_EVENT_TRIGGERS = re.compile(
    r"\b(vorfall|incident|breach|verletzung|sicherheitsvorfall|meldung|entdeckung"
    r"|feststellung|erkennung|ereignis|eintritt|bei\s+auftreten|im\s+falle"
    r"|wenn\s+ein|sobald|unverzüglich|upon|in\s+case\s+of|when\s+a)\b",
    re.IGNORECASE,
)
# Recurring-schedule wording (yearly/monthly/"regelmäßig"): the obligation
# recurs on a schedule.
_PERIODIC_TRIGGERS = re.compile(
    r"\b(jährlich|monatlich|quartalsweise|regelmäßig|periodisch|annually"
    r"|monthly|quarterly|periodic|mindestens\s+(einmal|alle)|turnusmäßig"
    r"|wiederkehrend|in\s+regelmäßigen\s+abständen)\b",
    re.IGNORECASE,
)
|
|
|
|
# Implementation-specific keywords (concrete tools/protocols/formats).
# A hit means the text names a concrete technology rather than an abstract
# requirement; used by _is_implementation_specific_text() and
# _is_more_implementation_specific().
_IMPL_SPECIFIC_PATTERNS = re.compile(
    r"\b(TLS|SSL|AES|RSA|SHA-\d|HTTPS|LDAP|SAML|OAuth|OIDC|MFA|2FA"
    r"|SIEM|IDS|IPS|WAF|VPN|VLAN|DMZ|HSM|PKI|RBAC|ABAC"
    r"|ISO\s*27\d{3}|SOC\s*2|PCI[\s-]DSS|NIST"
    r"|Firewall|Antivirus|EDR|XDR|SOAR|DLP"
    r"|SMS|E-Mail|Fax|Telefon"
    r"|JSON|XML|CSV|PDF|YAML"
    r"|PostgreSQL|MySQL|MongoDB|Redis|Kafka"
    r"|Docker|Kubernetes|AWS|Azure|GCP"
    r"|Active\s*Directory|RADIUS|Kerberos"
    r"|RSyslog|Splunk|ELK|Grafana|Prometheus"
    r"|Git|Jenkins|Terraform|Ansible)\b",
    re.IGNORECASE,
)
|
|
|
|
|
|
def _classify_trigger_type(obligation_text: str, condition: str) -> str:
    """Classify when an obligation is triggered.

    Scans obligation text and condition together and returns "event" if
    event wording is present, else "periodic" for schedule wording, else
    "continuous" as the default.
    """
    haystack = "{} {}".format(obligation_text, condition)
    if _EVENT_TRIGGERS.search(haystack):
        return "event"
    return "periodic" if _PERIODIC_TRIGGERS.search(haystack) else "continuous"
|
|
|
|
|
|
def _is_implementation_specific_text(
    obligation_text: str, action: str, obj: str
) -> bool:
    """True if the obligation names concrete implementation details.

    Text, action and object are scanned together against the
    technology-keyword pattern; any single hit counts.
    """
    haystack = "{} {} {}".format(obligation_text, action, obj)
    return bool(_IMPL_SPECIFIC_PATTERNS.search(haystack))
|
|
|
|
|
|
def _text_similar(a: str, b: str, threshold: float = 0.75) -> bool:
|
|
"""Quick token-overlap similarity check (Jaccard on words)."""
|
|
if not a or not b:
|
|
return False
|
|
tokens_a = set(a.split())
|
|
tokens_b = set(b.split())
|
|
if not tokens_a or not tokens_b:
|
|
return False
|
|
intersection = tokens_a & tokens_b
|
|
union = tokens_a | tokens_b
|
|
return len(intersection) / len(union) >= threshold
|
|
|
|
|
|
def _is_more_implementation_specific(text_a: str, text_b: str) -> bool:
    """Return True if text_a is more implementation-specific than text_b.

    Specificity is the count of technology-keyword hits; equal counts are
    tie-broken by text length (longer text is usually more specific).
    """
    count_a = len(_IMPL_SPECIFIC_PATTERNS.findall(text_a))
    count_b = len(_IMPL_SPECIFIC_PATTERNS.findall(text_b))
    if count_a == count_b:
        return len(text_a) > len(text_b)
    return count_a > count_b
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class ObligationCandidate:
    """A single normative obligation extracted from a Rich Control.

    Produced by Pass 0a; validated by quality_gate(); serialized via
    to_dict() for persistence/transport.
    """

    candidate_id: str = ""
    parent_control_uuid: str = ""  # guardrail rule 6: parent link always preserved
    obligation_text: str = ""
    action: str = ""  # main verb of the obligation
    object_: str = ""  # trailing underscore avoids shadowing builtin `object`
    condition: Optional[str] = None  # trigger/precondition, if any
    normative_strength: str = "must"
    obligation_type: str = "pflicht"  # pflicht | empfehlung | kann
    is_test_obligation: bool = False
    is_reporting_obligation: bool = False
    extraction_confidence: float = 0.0
    quality_flags: dict = field(default_factory=dict)
    release_state: str = "extracted"

    def to_dict(self) -> dict:
        """Serialize to a plain dict (field `object_` is exported as "object")."""
        return dict(
            candidate_id=self.candidate_id,
            parent_control_uuid=self.parent_control_uuid,
            obligation_text=self.obligation_text,
            action=self.action,
            object=self.object_,
            condition=self.condition,
            normative_strength=self.normative_strength,
            obligation_type=self.obligation_type,
            is_test_obligation=self.is_test_obligation,
            is_reporting_obligation=self.is_reporting_obligation,
            extraction_confidence=self.extraction_confidence,
            quality_flags=self.quality_flags,
            release_state=self.release_state,
        )
|
|
|
|
|
|
@dataclass
class AtomicControlCandidate:
    """An atomic control composed from a single ObligationCandidate.

    Produced by Pass 0b. NOTE(review): to_dict() does not export
    source_regulation/source_article — presumably intentional; confirm
    against the persistence layer.
    """

    candidate_id: str = ""
    parent_control_uuid: str = ""  # guardrail rule 6: parent link always preserved
    obligation_candidate_id: str = ""  # back-reference to the Pass 0a candidate
    title: str = ""
    objective: str = ""
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    severity: str = "medium"
    category: str = ""
    domain: str = ""
    source_regulation: str = ""
    source_article: str = ""

    def to_dict(self) -> dict:
        """Serialize to a plain dict (source_* fields are not included)."""
        return dict(
            candidate_id=self.candidate_id,
            parent_control_uuid=self.parent_control_uuid,
            obligation_candidate_id=self.obligation_candidate_id,
            title=self.title,
            objective=self.objective,
            requirements=self.requirements,
            test_procedure=self.test_procedure,
            evidence=self.evidence,
            severity=self.severity,
            category=self.category,
            domain=self.domain,
        )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Quality Gate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def classify_obligation_type(txt: str) -> str:
    """Classify obligation text into pflicht/empfehlung/kann.

    Priority: pflicht > empfehlung > kann. Nothing is rejected — text with
    no normative signal at all (the LLM still extracted it) falls back to
    'empfehlung' so the user can still work with it.
    """
    tiers = (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    )
    for pattern, label in tiers:
        if pattern.search(txt):
            return label
    return "empfehlung"
|
|
|
|
|
|
# Quality-gate patterns, hoisted to module level so they are compiled once
# at import time instead of on every quality_gate() call (consistent with
# every other pattern in this module).
#
# Heuristic for guardrail rule 2 ("one main verb"): a conjunction followed
# later by a second normative verb suggests the text still bundles actions.
_MULTI_VERB_RE = re.compile(
    r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
    r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
    re.IGNORECASE,
)

# Evidence fragments are typically short noun phrases starting with these words.
_EVIDENCE_ONLY_RE = re.compile(
    r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
    re.IGNORECASE,
)


def quality_gate(candidate: ObligationCandidate) -> dict:
    """Validate an obligation candidate. Returns quality flags dict.

    Checks:
      has_normative_signal: text contains normative language (informational)
      obligation_type: pflicht | empfehlung | kann (classified, never rejected)
      single_action: only one main action (heuristic)
      not_rationale: not just a justification/reasoning
      not_evidence_only: not just an evidence requirement
      min_length: text is long enough to be meaningful
      has_parent_link: references back to parent control
    """
    txt = candidate.obligation_text
    flags = {}

    # 1. Normative signal (informational — no longer used for rejection)
    flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(txt))

    # 1b. Obligation type classification (pflicht/empfehlung/kann)
    flags["obligation_type"] = classify_obligation_type(txt)

    # 2. Single action heuristic — "und"/"sowie"/"als auch" connecting a
    # second normative verb (imperfect but useful)
    flags["single_action"] = not bool(_MULTI_VERB_RE.search(txt))

    # 3. Not rationale: normative signals must at least balance rationale signals
    normative_count = len(_NORMATIVE_RE.findall(txt))
    rationale_count = len(_RATIONALE_RE.findall(txt))
    flags["not_rationale"] = normative_count >= rationale_count

    # 4. Not evidence-only (evidence fragments are typically short noun phrases)
    flags["not_evidence_only"] = not bool(_EVIDENCE_ONLY_RE.match(txt.strip()))

    # 5. Min length — anything under 20 chars is too short to be meaningful
    flags["min_length"] = len(txt.strip()) >= 20

    # 6. Parent link always preserved (guardrail rule 6)
    flags["has_parent_link"] = bool(candidate.parent_control_uuid)

    return flags
|
|
|
|
|
|
def passes_quality_gate(flags: dict) -> bool:
    """Check if critical quality flags pass.

    Only not_evidence_only, min_length and has_parent_link are critical;
    has_normative_signal is NO LONGER critical — obligations without a
    normative signal are classified as 'empfehlung' instead of rejected.
    Missing flags count as failing.
    """
    for key in ("not_evidence_only", "min_length", "has_parent_link"):
        if not flags.get(key, False):
            return False
    return True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# LLM Prompts
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
_PASS0A_SYSTEM_PROMPT = """\
|
|
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
|
|
in einzelne atomare Pflichten.
|
|
|
|
ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
|
|
1. Identifiziere den Adressaten (Wer muss handeln?)
|
|
2. Identifiziere die Handlung (Was muss getan werden?)
|
|
3. Bestimme die normative Staerke (muss/soll/kann)
|
|
4. Pruefe ob Test- oder Meldepflicht vorliegt (separat erfassen!)
|
|
5. Formuliere jede Pflicht als eigenstaendiges JSON-Objekt
|
|
|
|
REGELN (STRIKT EINHALTEN):
|
|
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
|
|
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
|
|
ist zu testen, shall, must, required.
|
|
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
|
|
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
|
|
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
|
|
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
|
|
eigenes Control, sondern Evidence).
|
|
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
|
|
— NICHT extrahieren.
|
|
|
|
Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
|
|
|
|
|
|
def _build_pass0a_prompt(
|
|
title: str, objective: str, requirements: str,
|
|
test_procedure: str, source_ref: str
|
|
) -> str:
|
|
return f"""\
|
|
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
|
|
Pflichten als JSON-Array.
|
|
|
|
CONTROL:
|
|
Titel: {title}
|
|
Ziel: {objective}
|
|
Anforderungen: {requirements}
|
|
Prüfverfahren: {test_procedure}
|
|
Quellreferenz: {source_ref}
|
|
|
|
Antworte als JSON-Array:
|
|
[
|
|
{{
|
|
"obligation_text": "Kurze, präzise Formulierung der Pflicht",
|
|
"action": "Hauptverb/Handlung",
|
|
"object": "Gegenstand der Pflicht",
|
|
"condition": "Auslöser/Bedingung oder null",
|
|
"normative_strength": "must",
|
|
"is_test_obligation": false,
|
|
"is_reporting_obligation": false
|
|
}}
|
|
]"""
|
|
|
|
|
|
_PASS0B_SYSTEM_PROMPT = """\
|
|
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
|
|
normativen Pflicht ein praxisorientiertes, atomares Security Control.
|
|
|
|
ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
|
|
1. Identifiziere die konkrete Anforderung aus der Pflicht
|
|
2. Leite eine umsetzbare technische/organisatorische Massnahme ab
|
|
3. Definiere ein Pruefverfahren (wie wird Umsetzung verifiziert?)
|
|
4. Bestimme den Nachweis (welches Dokument/Artefakt belegt Compliance?)
|
|
|
|
Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
|
|
Antworte NUR als JSON. Keine Erklärungen."""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Deterministic Atomic Control Composition Engine v2
|
|
# ---------------------------------------------------------------------------
|
|
# Transforms obligation candidates into atomic controls WITHOUT LLM.
|
|
#
|
|
# Pipeline:
|
|
# 1. split_compound_action() — split "erstellen und implementieren" → 2
|
|
# 2. classify_action() — 24 fine-grained action types (+ "default")
|
|
# 3. classify_object() — policy / technical / process / register / …
|
|
# 4. trigger_qualifier() — periodic / event / continuous → timing text
|
|
# 5. template lookup — (action_type, object_class) → test + evidence
|
|
# 6. compose — assemble AtomicControlCandidate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# ── 1. Compound Action Splitter ──────────────────────────────────────────

# Splits on German coordinating conjunctions ("und", "sowie", "als auch",
# optionally preceded by a comma) surrounded by whitespace.
_COMPOUND_SPLIT_RE = re.compile(
    r"\s+(?:und|sowie|als\s+auch|,\s*(?:und|sowie))\s+", re.IGNORECASE
)

# Phrases that should never be split (stylistic variants, not separate actions).
# Compared against the lowercased, stripped action in _split_compound_action(),
# so every entry here must be lowercase.
_NO_SPLIT_PHRASES: set[str] = {
    "pflegen und aufrechterhalten",
    "aufrechterhalten und pflegen",
    "erkennen und verhindern",
    "verhindern und erkennen",
    "sichern und schützen",
    "schützen und sichern",
    "schützen und absichern",
    "absichern und schützen",
    "ermitteln und bewerten",
    "bewerten und ermitteln",
    "prüfen und überwachen",
    "überwachen und prüfen",
}
|
|
|
|
|
|
def _split_compound_action(action: str) -> list[str]:
    """Split a compound action into individual sub-actions.

    Splits only when the conjunction-separated parts classify to
    *different* action types and the phrase is not on the no-split list;
    phrases like 'aufrechterhalten und pflegen' stay together because both
    halves map to 'maintain'. Returns the original action as a one-element
    list whenever no split applies.
    """
    if not action:
        return [action]

    # Whitelisted stylistic pairs are never split.
    if action.lower().strip() in _NO_SPLIT_PHRASES:
        return [action]

    pieces = _COMPOUND_SPLIT_RE.split(action.strip())
    if len(pieces) <= 1:
        return [action]

    # Split only if the parts represent genuinely different action types.
    distinct_types = {_classify_action(piece.strip()) for piece in pieces}
    if len(distinct_types) > 1:
        return [piece.strip() for piece in pieces if piece.strip()]

    return [action]
|
|
|
|
|
|
# ── 2. Action Type Classification ───────────────────────────────────────
# NOTE(review): older comments mention "18 types"; the priority list below
# actually contains 24 distinct types plus the "default" fallback.

# Tie-break order: when a compound action matches several types,
# _classify_action() returns the first type from this list.
_ACTION_PRIORITY = [
    "implement", "configure", "encrypt", "restrict_access",
    "monitor", "review", "assess", "audit",
    "test", "verify", "validate",
    "report", "notify", "train",
    "delete", "retain", "ensure",
    "define", "document", "maintain",
    "approve", "remediate",
    "perform", "obtain",
]

# (keyword substring, action type) pairs, roughly longest/most specific
# first. _classify_action() collects ALL substring hits into a set and
# resolves via _ACTION_PRIORITY, so the ordering here documents intent
# rather than affecting the result.
_ACTION_KEYWORDS: list[tuple[str, str]] = [
    ("aktuell halten", "maintain"),
    ("aufrechterhalten", "maintain"),
    ("sicherstellen", "ensure"),
    ("gewährleisten", "ensure"),
    ("benachrichtigen", "notify"),
    ("sensibilisieren", "train"),
    ("authentifizieren", "restrict_access"),
    ("verschlüsseln", "encrypt"),
    ("implementieren", "implement"),
    ("konfigurieren", "configure"),
    ("bereitstellen", "implement"),
    ("protokollieren", "document"),
    ("dokumentieren", "document"),
    ("kontrollieren", "monitor"),
    ("installieren", "implement"),
    ("autorisieren", "restrict_access"),
    ("beschränken", "restrict_access"),
    ("berechtigen", "restrict_access"),
    ("aufbewahren", "retain"),
    ("archivieren", "retain"),
    ("überwachen", "monitor"),
    ("überprüfen", "review"),
    ("auditieren", "audit"),
    ("informieren", "notify"),
    ("analysieren", "assess"),
    ("verifizieren", "verify"),
    ("validieren", "validate"),
    ("evaluieren", "assess"),
    ("integrieren", "implement"),
    ("aktivieren", "configure"),
    ("einrichten", "configure"),
    ("einführen", "implement"),
    ("unterweisen", "train"),
    ("durchführen", "perform"),
    ("verarbeiten", "perform"),
    ("vernichten", "delete"),
    ("entfernen", "delete"),
    ("absichern", "implement"),
    ("schützen", "implement"),
    ("bewerten", "assess"),
    ("umsetzen", "implement"),
    ("aufbauen", "implement"),
    ("erstellen", "document"),
    ("definieren", "define"),
    ("festlegen", "define"),
    ("vorgeben", "define"),
    ("verfassen", "document"),
    ("einholen", "obtain"),
    ("genehmigen", "approve"),
    ("freigeben", "approve"),
    ("zulassen", "approve"),
    ("beheben", "remediate"),
    ("korrigieren", "remediate"),
    ("beseitigen", "remediate"),
    ("nachbessern", "remediate"),
    ("speichern", "retain"),
    ("mitteilen", "notify"),
    ("berichten", "report"),
    ("schulen", "train"),
    ("melden", "report"),
    ("prüfen", "review"),
    ("testen", "test"),
    ("führen", "document"),
    ("pflegen", "maintain"),
    ("wahren", "maintain"),
    ("löschen", "delete"),
    ("angeben", "document"),
    ("beifügen", "document"),
    # English fallbacks
    ("implement", "implement"),
    ("configure", "configure"),
    ("establish", "define"),
    ("document", "document"),
    ("maintain", "maintain"),
    ("monitor", "monitor"),
    ("review", "review"),
    ("assess", "assess"),
    ("audit", "audit"),
    ("encrypt", "encrypt"),
    ("restrict", "restrict_access"),
    ("authorize", "restrict_access"),
    ("verify", "verify"),
    ("validate", "validate"),
    ("report", "report"),
    ("notify", "notify"),
    ("train", "train"),
    ("test", "test"),
    ("delete", "delete"),
    ("retain", "retain"),
    ("ensure", "ensure"),
    ("approve", "approve"),
    ("remediate", "remediate"),
    ("perform", "perform"),
    ("obtain", "obtain"),
]
|
|
|
|
|
|
def _classify_action(action: str) -> str:
    """Classify an obligation action string into a fine-grained action type.

    Collects every keyword substring hit, then resolves compound matches
    by _ACTION_PRIORITY order. Returns "default" when nothing matches.
    """
    if not action:
        return "default"
    needle = action.lower().strip()

    hits = {atype for keyword, atype in _ACTION_KEYWORDS if keyword in needle}
    if not hits:
        return "default"

    for candidate_type in _ACTION_PRIORITY:
        if candidate_type in hits:
            return candidate_type

    # Fallback for a hit type missing from the priority list — currently
    # unreachable, since every mapped type appears in _ACTION_PRIORITY.
    return next(iter(hits))
|
|
|
|
|
|
# ── 3. Object Class Classification ──────────────────────────────────────

# Keyword substrings per object class. _classify_object() returns the FIRST
# class whose keyword list matches, so dict insertion order defines
# precedence (Python 3.7+ preserves it).
_OBJECT_CLASS_KEYWORDS: dict[str, list[str]] = {
    # ── Governance / Documentation ────────────────────────────
    "policy": [
        "richtlinie", "policy", "konzept", "strategie", "leitlinie",
        "vorgabe", "regelung", "ordnung", "anweisung", "standard",
        "rahmenwerk", "sicherheitskonzept", "datenschutzkonzept",
    ],
    "procedure": [
        "verfahren", "workflow", "ablauf",
        "vorgehensweise", "methodik", "prozedur", "handlungsanweisung",
    ],
    "register": [
        "verzeichnis", "register", "inventar", "liste", "katalog",
        "übersicht", "bestandsaufnahme",
    ],
    "record": [
        "protokoll", "log", "aufzeichnung", "nachweis",
        "evidenz", "artefakt", "dokumentation",
    ],
    "report": [
        "meldung", "bericht", "report", "benachrichtigung",
        "mitteilung", "anzeige", "statusbericht",
    ],
    # ── Technical / Security ──────────────────────────────────
    "technical_control": [
        "mfa", "firewall", "verschlüsselung", "backup", "antivirus",
        "ids", "ips", "waf", "vpn", "tls", "ssl",
        "patch", "update", "härtung", "segmentierung",
        "alarmierung", "monitoring",
    ],
    "access_control": [
        "authentifizierung", "autorisierung", "zugriff",
        "berechtigung", "passwort", "kennwort", "anmeldung",
        "sso", "rbac", "session",
    ],
    "cryptographic_control": [
        "schlüssel", "zertifikat", "signatur", "kryptographi",
        "cipher", "hash", "token",
    ],
    "configuration": [
        "konfiguration", "einstellung", "parameter",
        "baseline", "hardening", "härtungsprofil",
    ],
    "account": [
        "account", "konto", "benutzer", "privilegiert",
        "admin", "root", "dienstkonto", "servicekonto",
    ],
    # ── Data / Systems ────────────────────────────────────────
    "system": [
        "system", "plattform", "dienst", "service", "anwendung",
        "software", "komponente", "infrastruktur", "netzwerk",
        "server", "datenbank", "produkt", "gerät", "endgerät",
    ],
    "data": [
        "daten", "information", "personenbezogen", "datensatz",
        "datei", "inhalt", "verarbeitungstätigkeit",
    ],
    "interface": [
        "schnittstelle", "interface", "api", "integration",
        "datenfluss", "datenübermittlung",
    ],
    # ── People / Org ──────────────────────────────────────────
    "role": [
        "mitarbeiter", "personal", "rolle", "beauftragter",
        "verantwortlicher", "team", "abteilung", "beschäftigte",
    ],
    "training": [
        "schulung", "training", "sensibilisierung", "awareness",
        "unterweisung", "fortbildung", "qualifikation",
    ],
    # ── Incident / Risk ───────────────────────────────────────
    "incident": [
        "vorfall", "incident", "sicherheitsvorfall", "störung",
        "notfall", "krise", "bedrohung",
    ],
    "risk_artifact": [
        "risiko", "schwachstelle", "vulnerability", "gefährdung",
        "risikoanalyse", "risikobewertung", "schutzbedarfsfeststellung",
    ],
    # ── Process / Consent ────────────────────────────────────
    "process": [
        "prozess", "geschäftsprozess", "betriebsprozess",
        "managementprozess", "steuerungsprozess",
    ],
    "consent": [
        "einwilligung", "consent", "einverständnis",
        "zustimmung", "opt-in", "opt-out",
    ],
}
|
|
|
|
|
|
def _classify_object(object_: str) -> str:
    """Classify the obligation object into a domain class.

    Case-insensitive substring match against _OBJECT_CLASS_KEYWORDS;
    the first matching class (dict order) wins, "general" if none match.
    """
    if not object_:
        return "general"
    needle = object_.lower()
    return next(
        (
            obj_class
            for obj_class, keywords in _OBJECT_CLASS_KEYWORDS.items()
            if any(keyword in needle for keyword in keywords)
        ),
        "general",
    )
|
|
|
|
|
|
# ── 4. Trigger / Timing Qualifier ───────────────────────────────────────

# (search pattern, human-readable qualifier); first match wins in
# _extract_trigger_qualifier(), so specific terms precede broad ones.
# NOTE(review): the "regelmäßig" pattern maps to the "ss" spelling
# "regelmässig" in the output text — confirm this is the intended style.
_FREQUENCY_PATTERNS: list[tuple[str, str]] = [
    (r"jährl", "jährlich"),
    (r"quartal", "quartalsweise"),
    (r"halbjähr", "halbjährlich"),
    (r"monatl", "monatlich"),
    (r"wöchentl", "wöchentlich"),
    (r"regelmäßig", "regelmässig"),
    (r"72\s*Stunden", "innerhalb von 72 Stunden"),
    (r"unverzüglich", "unverzüglich"),
    (r"ohne\s+Verzögerung", "ohne unangemessene Verzögerung"),
    (r"vor\s+Inbetriebnahme", "vor Inbetriebnahme"),
    (r"vor\s+Markteinführung", "vor Markteinführung"),
    (r"vor\s+Freigabe", "vor Freigabe"),
]
|
|
|
|
|
|
def _extract_trigger_qualifier(
    trigger_type: Optional[str], obligation_text: str,
) -> str:
    """Extract timing/trigger context for test procedures.

    A concrete frequency found in the obligation text takes precedence;
    otherwise the trigger type yields a generic phrase ("continuous" and
    unknown types produce an empty string).
    """
    # Explicit frequency wording in the text wins over the trigger type.
    for pattern, qualifier in _FREQUENCY_PATTERNS:
        if re.search(pattern, obligation_text, re.IGNORECASE):
            return qualifier

    generic = {
        "event": "bei Eintreten des auslösenden Ereignisses",
        "periodic": "periodisch",
    }
    return generic.get(trigger_type, "")
|
|
|
|
|
|
# ── 4b. Structured Timing Extraction ────────────────────────────────────

# (regex_pattern, deadline_hours, frequency) — first match wins in
# _extract_structured_timing(), so hour-based deadlines precede the broad
# periodic terms. deadline_hours == 0 encodes "immediately".
_STRUCTURED_FREQUENCY_MAP: list[tuple[str, Optional[int], Optional[str]]] = [
    # (regex_pattern, deadline_hours, frequency)
    (r"72\s*Stunden", 72, None),
    (r"48\s*Stunden", 48, None),
    (r"24\s*Stunden", 24, None),
    (r"unverzüglich", 0, None),
    (r"ohne\s+Verzögerung", 0, None),
    (r"sofort", 0, None),
    (r"jährl", None, "yearly"),
    (r"halbjähr", None, "semi_annually"),
    (r"quartal", None, "quarterly"),
    (r"monatl", None, "monthly"),
    (r"wöchentl", None, "weekly"),
    (r"täglich", None, "daily"),
    (r"regelmäßig", None, "periodic"),
    (r"periodisch", None, "periodic"),
    (r"vor\s+Inbetriebnahme", None, "before_deployment"),
    (r"vor\s+Markteinführung", None, "before_launch"),
    (r"vor\s+Freigabe", None, "before_release"),
]
|
|
|
|
|
|
def _extract_structured_timing(
    obligation_text: str,
) -> tuple[Optional[int], Optional[str]]:
    """Extract (deadline_hours, frequency) from obligation text.

    Returns the values of the first matching _STRUCTURED_FREQUENCY_MAP
    entry; (None, None) when nothing matches.
    """
    matches = (
        (deadline, freq)
        for pattern, deadline, freq in _STRUCTURED_FREQUENCY_MAP
        if re.search(pattern, obligation_text, re.IGNORECASE)
    )
    return next(matches, (None, None))
|
|
|
|
|
|
# ── 5. Template Matrix: (action_type, object_class) → templates ─────────
|
|
#
|
|
# Specific combos override base templates. Lookup order:
|
|
# 1. _SPECIFIC_TEMPLATES[(action_type, object_class)]
|
|
# 2. _ACTION_TEMPLATES[action_type]
|
|
# 3. _DEFAULT_ACTION_TEMPLATE
|
|
|
|
# Base test-procedure / evidence templates keyed by action_type.
# "{object}" placeholders are substituted with the obligation's object text
# at composition time. These are the second step of the lookup chain:
# specific (action_type, object_class) combos override them, and
# _DEFAULT_ACTION_TEMPLATE catches everything else.
_ACTION_TEMPLATES: dict[str, dict[str, list[str]]] = {
    # ── Create / Define / Document ─────────────────────────────
    "define": {
        "test_procedure": [
            "Prüfung, ob {object} definiert und formal freigegeben ist",
            "Review der Inhalte auf Vollständigkeit und Angemessenheit",
            "Verifizierung, dass {object} den Betroffenen kommuniziert wurde",
        ],
        "evidence": [
            "Freigegebenes Dokument mit Geltungsbereich",
            "Kommunikationsnachweis (E-Mail, Intranet, Schulung)",
        ],
    },
    "document": {
        "test_procedure": [
            "Prüfung, ob {object} dokumentiert und aktuell ist",
            "Sichtung der Dokumentation auf Vollständigkeit",
            "Verifizierung der Versionierung und des Review-Zyklus",
        ],
        "evidence": [
            "Dokument mit Versionshistorie",
            "Freigabenachweis (Unterschrift/Approval)",
        ],
    },
    "maintain": {
        "test_procedure": [
            "Prüfung, ob {object} aktuell gehalten wird",
            "Vergleich der letzten Aktualisierung mit dem Review-Zyklus",
            "Stichprobe: Änderungen nach relevanten Ereignissen nachvollzogen",
        ],
        "evidence": [
            "Änderungshistorie mit Datum und Verantwortlichem",
            "Nachweis des letzten Reviews",
        ],
    },
    # ── Implement / Configure ──────────────────────────────────
    "implement": {
        "test_procedure": [
            "Prüfung der technischen Umsetzung von {object}",
            "Funktionstest der implementierten Massnahme",
            "Review der Konfiguration gegen die Anforderungsspezifikation",
        ],
        "evidence": [
            "Konfigurationsnachweis (Screenshot/Export)",
            "Implementierungsdokumentation",
        ],
    },
    "configure": {
        "test_procedure": [
            "Prüfung der Konfiguration von {object} gegen Soll-Vorgaben",
            "Vergleich mit Hardening-Baseline oder Best Practice",
            "Automatisierter Konfigurationsscan (falls verfügbar)",
        ],
        "evidence": [
            "Konfigurationsexport mit Soll-/Ist-Vergleich",
            "Scan-Bericht oder Compliance-Check-Ergebnis",
        ],
    },
    # ── Monitor / Review / Assess / Audit ──────────────────────
    "monitor": {
        "test_procedure": [
            "Prüfung der laufenden Überwachung von {object}",
            "Stichprobe der Protokolle/Logs der letzten 3 Monate",
            "Verifizierung der Alarmierungs- und Eskalationsprozesse",
        ],
        "evidence": [
            "Monitoring-Dashboard-Export oder Log-Auszüge",
            "Alarmierungsregeln und Eskalationsmatrix",
        ],
    },
    "review": {
        "test_procedure": [
            "Prüfung, ob {object} im vorgesehenen Zyklus überprüft wurde",
            "Sichtung des Review-Protokolls auf Massnahmenableitung",
            "Verifizierung der Umsetzung identifizierter Massnahmen",
        ],
        "evidence": [
            "Review-Protokoll mit Datum und Teilnehmern",
            "Massnahmenplan mit Umsetzungsstatus",
        ],
    },
    "assess": {
        "test_procedure": [
            "Prüfung der Bewertungsmethodik für {object}",
            "Sichtung der letzten Bewertungsergebnisse",
            "Verifizierung, dass Ergebnisse in Massnahmen überführt wurden",
        ],
        "evidence": [
            "Bewertungsbericht mit Methodik und Ergebnissen",
            "Abgeleiteter Massnahmenplan",
        ],
    },
    "audit": {
        "test_procedure": [
            "Prüfung des Audit-Plans und der Audit-Durchführung für {object}",
            "Sichtung der Audit-Berichte und Findings",
            "Verifizierung der Nachverfolgung offener Findings",
        ],
        "evidence": [
            "Audit-Bericht mit Findings und Empfehlungen",
            "Finding-Tracker mit Umsetzungsstatus",
        ],
    },
    # ── Test / Verify / Validate ───────────────────────────────
    "test": {
        "test_procedure": [
            "Review der Testpläne und -methodik für {object}",
            "Stichprobe der Testergebnisse und Massnahmenableitung",
            "Prüfung der Testabdeckung und -häufigkeit",
        ],
        "evidence": [
            "Testprotokoll mit Ergebnissen",
            "Testplan und Abdeckungsbericht",
        ],
    },
    "verify": {
        "test_procedure": [
            "Prüfung der Verifikationsmethodik für {object}",
            "Nachvollzug der Verifikationsergebnisse gegen Spezifikation",
            "Prüfung, ob alle Anforderungen abgedeckt sind",
        ],
        "evidence": [
            "Verifikationsbericht mit Soll-/Ist-Abgleich",
            "Anforderungs-Traceability-Matrix",
        ],
    },
    "validate": {
        "test_procedure": [
            "Prüfung der Validierungsmethodik für {object}",
            "Bewertung, ob die Massnahme den Zweck im Praxiseinsatz erfüllt",
            "Auswertung von Nutzerfeedback oder Betriebsdaten",
        ],
        "evidence": [
            "Validierungsbericht",
            "Praxisnachweis (Betriebsdaten, Nutzerfeedback)",
        ],
    },
    # ── Report / Notify ────────────────────────────────────────
    "report": {
        "test_procedure": [
            "Prüfung des Meldeprozesses für {object}",
            "Stichprobe gemeldeter Vorfälle auf Vollständigkeit und Fristeneinhaltung",
            "Verifizierung der Meldekanäle und Zuständigkeiten",
        ],
        "evidence": [
            "Meldeprozess-Dokumentation",
            "Nachweise über erfolgte Meldungen mit Zeitstempeln",
        ],
    },
    "notify": {
        "test_procedure": [
            "Prüfung des Benachrichtigungsprozesses für {object}",
            "Verifizierung der Empfänger und Kommunikationskanäle",
            "Stichprobe: Benachrichtigungen fristgerecht versendet",
        ],
        "evidence": [
            "Benachrichtigungsvorlagen und Verteiler",
            "Versandnachweise mit Zeitstempeln",
        ],
    },
    # ── Train ──────────────────────────────────────────────────
    "train": {
        "test_procedure": [
            "Prüfung der Schulungsinhalte und -unterlagen zu {object}",
            "Verifizierung der Teilnehmerlisten und Schulungsfrequenz",
            "Stichprobe: Wissensstand durch Befragung oder Kurztest",
        ],
        "evidence": [
            "Schulungsunterlagen und Schulungsplan",
            "Teilnehmerlisten mit Datum und Unterschrift",
            "Ergebnisse von Wissenstests (falls durchgeführt)",
        ],
    },
    # ── Access / Encrypt ───────────────────────────────────────
    "restrict_access": {
        "test_procedure": [
            "Review der Zugriffsberechtigungen für {object}",
            "Prüfung der Berechtigungsmatrix auf Aktualität und Least-Privilege",
            "Stichprobe: Entzug von Rechten bei Rollenwechsel/Austritt",
        ],
        "evidence": [
            "Aktuelle Berechtigungsmatrix",
            "Zugriffsprotokolle der letzten 3 Monate",
            "Nachweis des letzten Berechtigungs-Reviews",
        ],
    },
    "encrypt": {
        "test_procedure": [
            "Prüfung der Verschlüsselungskonfiguration für {object}",
            "Verifizierung der Algorithmen und Schlüssellängen gegen BSI-Empfehlungen",
            "Prüfung des Schlüsselmanagement-Prozesses (Rotation, Speicherung)",
        ],
        "evidence": [
            "Kryptographie-Konzept",
            "Zertifikats- und Schlüsselinventar",
            "Schlüsselrotations-Nachweis",
        ],
    },
    # ── Delete / Retain ────────────────────────────────────────
    "delete": {
        "test_procedure": [
            "Prüfung des Löschkonzepts für {object}",
            "Verifizierung der Löschfristen und automatisierten Löschmechanismen",
            "Stichprobe: Löschung nach Fristablauf tatsächlich durchgeführt",
        ],
        "evidence": [
            "Löschkonzept mit definierten Fristen",
            "Löschprotokolle oder -nachweise",
        ],
    },
    "retain": {
        "test_procedure": [
            "Prüfung der Aufbewahrungsfristen und -orte für {object}",
            "Verifizierung der Zugriffskontrollen auf archivierte Daten",
            "Prüfung der automatischen Löschung nach Ablauf der Aufbewahrungsfrist",
        ],
        "evidence": [
            "Aufbewahrungsrichtlinie mit Fristen",
            "Speicherort-Dokumentation mit Zugriffskonzept",
        ],
    },
    # ── Ensure (catch-all for sicherstellen/gewährleisten) ─────
    "ensure": {
        "test_procedure": [
            "Prüfung, ob Massnahmen für {object} wirksam umgesetzt sind",
            "Stichprobenprüfung der Einhaltung im operativen Betrieb",
            "Review der zugehörigen Prozessdokumentation",
        ],
        "evidence": [
            "Nachweis der Umsetzung (Konfiguration/Prozess)",
            "Prüfprotokoll der letzten Überprüfung",
        ],
    },
    # ── Perform / Obtain ───────────────────────────────────────
    "perform": {
        "test_procedure": [
            "Prüfung der Durchführung von {object}",
            "Verifizierung der Zuständigkeiten und Freigabeschritte",
            "Stichprobe der Durchführung anhand aktueller Fälle",
        ],
        "evidence": [
            "Durchführungsnachweise (Tickets, Protokolle)",
            "Prozessdokumentation mit Verantwortlichkeiten",
        ],
    },
    "obtain": {
        "test_procedure": [
            "Prüfung des Einholungsprozesses für {object}",
            "Verifizierung der Vollständigkeit und Gültigkeit",
            "Stichprobe: Einholung vor Beginn der Verarbeitung nachgewiesen",
        ],
        "evidence": [
            "Nachweise der Einholung (Einwilligungen, Freigaben)",
            "Gültigkeitsprüfung mit Zeitstempeln",
        ],
    },
    # ── Approve / Remediate ───────────────────────────────────
    "approve": {
        "test_procedure": [
            "Prüfung des Genehmigungsprozesses für {object}",
            "Verifizierung der Freigabeberechtigungen und Eskalationswege",
            "Stichprobe: Genehmigung vor Umsetzung/Nutzung nachgewiesen",
        ],
        "evidence": [
            "Freigabenachweis (Signatur, Ticket, Workflow)",
            "Genehmigungsmatrix mit Zuständigkeiten",
        ],
    },
    "remediate": {
        "test_procedure": [
            "Prüfung des Behebungsprozesses für {object}",
            "Verifizierung der Korrekturmassnahmen und Wirksamkeit",
            "Stichprobe: Abweichungen fristgerecht behoben",
        ],
        "evidence": [
            "Korrekturmassnahmen-Dokumentation",
            "Nachprüfprotokoll der Wirksamkeit",
        ],
    },
}
|
|
|
|
# ── Specific (action_type, object_class) overrides ───────────────────────
|
|
|
|
# Overrides for specific (action_type, object_class) combinations. These
# take precedence over the per-action base templates in _ACTION_TEMPLATES;
# "{object}" placeholders are substituted at composition time.
_SPECIFIC_TEMPLATES: dict[tuple[str, str], dict[str, list[str]]] = {
    ("implement", "policy"): {
        "test_procedure": [
            "Prüfung, ob {object} dokumentiert, freigegeben und in relevanten Prozessen umgesetzt ist",
            "Interview mit Prozessverantwortlichen zur tatsächlichen Umsetzung",
            "Stichprobe: Nachweis der Umsetzung in der Praxis",
        ],
        "evidence": [
            "Freigegebenes Richtliniendokument",
            "Nachweis der Kommunikation an Betroffene",
            "Stichproben der operativen Umsetzung",
        ],
    },
    ("implement", "technical_control"): {
        "test_procedure": [
            "Prüfung der technischen Konfiguration von {object}",
            "Funktionstest: Wirksamkeit der Massnahme verifizieren",
            "Vulnerability-Scan oder Penetrationstest (falls anwendbar)",
        ],
        "evidence": [
            "Konfigurationsnachweis (Screenshot/Export)",
            "Testprotokoll mit Ergebnissen",
            "Scan-Bericht (falls durchgeführt)",
        ],
    },
    ("implement", "process"): {
        "test_procedure": [
            "Prüfung der Prozessdokumentation für {object}",
            "Verifizierung, dass der Prozess operativ gelebt wird",
            "Stichprobe: Prozessdurchführung anhand aktueller Fälle",
        ],
        "evidence": [
            "Prozessdokumentation mit RACI-Matrix",
            "Durchführungsnachweise der letzten 3 Monate",
        ],
    },
    ("define", "policy"): {
        "test_procedure": [
            "Prüfung, ob {object} formal definiert und durch Management freigegeben ist",
            "Review des Geltungsbereichs und der Adressaten",
            "Verifizierung der regelmässigen Aktualisierung",
        ],
        "evidence": [
            "Freigegebene Policy mit Unterschrift der Geschäftsleitung",
            "Geltungsbereich und Verteiler",
            "Letzte Aktualisierung mit Änderungshistorie",
        ],
    },
    ("monitor", "system"): {
        "test_procedure": [
            "Prüfung der Monitoring-Konfiguration für {object}",
            "Stichprobe der System-Logs und Alerts der letzten 3 Monate",
            "Verifizierung: Alerts führen zu dokumentierten Reaktionen",
        ],
        "evidence": [
            "Monitoring-Dashboard-Export",
            "Alert-Konfiguration und Eskalationsregeln",
            "Incident-Tickets aus Alert-Eskalation",
        ],
    },
    ("monitor", "incident"): {
        "test_procedure": [
            "Prüfung des Incident-Monitoring-Prozesses für {object}",
            "Stichprobe erkannter Vorfälle auf Reaktionszeit",
            "Verifizierung der Eskalationswege",
        ],
        "evidence": [
            "Incident-Log mit Erkennungs- und Reaktionszeiten",
            "Eskalationsmatrix",
        ],
    },
    ("review", "policy"): {
        "test_procedure": [
            "Prüfung, ob {object} im vorgesehenen Zyklus durch Management reviewed wurde",
            "Sichtung des Review-Protokolls auf Aktualisierungsbedarf",
            "Verifizierung, dass Änderungen umgesetzt wurden",
        ],
        "evidence": [
            "Review-Protokoll mit Datum und Teilnehmern",
            "Aktualisierte Version der Richtlinie (falls geändert)",
        ],
    },
    ("assess", "incident"): {
        "test_procedure": [
            "Prüfung der Risikobewertung für {object}",
            "Nachvollzug der Bewertungskriterien (Schwere, Auswirkung, Wahrscheinlichkeit)",
            "Verifizierung der abgeleiteten Massnahmen",
        ],
        "evidence": [
            "Risikobewertungs-Matrix",
            "Massnahmenplan mit Priorisierung",
        ],
    },
    ("report", "incident"): {
        "test_procedure": [
            "Prüfung des Meldeprozesses für {object} an zuständige Behörden",
            "Verifizierung der Meldefristen (z.B. 72h DSGVO, 24h NIS2)",
            "Stichprobe: Meldungen fristgerecht und vollständig",
        ],
        "evidence": [
            "Meldeprozess mit Fristen und Zuständigkeiten",
            "Kopien erfolgter Behördenmeldungen",
            "Zeitstempel-Nachweis der Fristwahrung",
        ],
    },
    ("train", "role"): {
        "test_procedure": [
            "Prüfung der Schulungspflicht für {object}",
            "Verifizierung der Teilnahme aller betroffenen Personen",
            "Stichprobe: Wissensstand durch Kurztest oder Befragung",
        ],
        "evidence": [
            "Schulungsplan mit Zielgruppen und Frequenz",
            "Teilnehmerlisten mit Datum und Unterschrift",
            "Testergebnisse oder Teilnahmebestätigungen",
        ],
    },
    ("restrict_access", "data"): {
        "test_procedure": [
            "Prüfung der Zugriffskontrollen für {object}",
            "Review der Berechtigungen nach Need-to-Know-Prinzip",
            "Stichprobe: Keine überprivilegierten Zugänge",
        ],
        "evidence": [
            "Berechtigungsmatrix mit Rollen und Datenklassen",
            "Zugriffsprotokolle",
            "Ergebnis des letzten Access Reviews",
        ],
    },
    ("restrict_access", "system"): {
        "test_procedure": [
            "Prüfung der Zugriffskontrollen für {object}",
            "Review der Admin-/Privileged-Zugänge",
            "Stichprobe: MFA aktiv, Passwort-Policy eingehalten",
        ],
        "evidence": [
            "Berechtigungsmatrix",
            "Audit-Log privilegierter Zugriffe",
            "MFA-Konfigurationsnachweis",
        ],
    },
    ("encrypt", "data"): {
        "test_procedure": [
            "Prüfung der Verschlüsselung von {object} at Rest und in Transit",
            "Verifizierung der Algorithmen gegen BSI TR-02102",
            "Prüfung des Key-Management-Prozesses",
        ],
        "evidence": [
            "Kryptographie-Konzept mit Algorithmen und Schlüssellängen",
            "TLS-Konfigurationsnachweis",
            "Key-Rotation-Protokoll",
        ],
    },
    ("delete", "data"): {
        "test_procedure": [
            "Prüfung des Löschkonzepts für {object}",
            "Verifizierung der automatisierten Löschmechanismen",
            "Stichprobe: Löschung personenbezogener Daten nach Fristablauf",
        ],
        "evidence": [
            "Löschkonzept mit Datenklassen und Fristen",
            "Löschprotokolle",
            "Nachweis der Vernichtung (bei physischen Medien)",
        ],
    },
    ("retain", "data"): {
        "test_procedure": [
            "Prüfung der Aufbewahrungsfristen für {object}",
            "Verifizierung der Speicherorte und Zugriffskontrollen",
            "Prüfung: Keine Aufbewahrung über gesetzliche Frist hinaus",
        ],
        "evidence": [
            "Aufbewahrungsrichtlinie mit gesetzlichen Grundlagen",
            "Speicherort-Inventar mit Zugriffskonzept",
        ],
    },
    ("obtain", "data"): {
        "test_procedure": [
            "Prüfung des Einwilligungsprozesses für {object}",
            "Verifizierung der Einwilligungstexte auf Rechtskonformität",
            "Stichprobe: Einwilligung vor Verarbeitungsbeginn eingeholt",
        ],
        "evidence": [
            "Einwilligungsformulare/-dialoge",
            "Consent-Log mit Zeitstempeln",
            "Widerrufsprozess-Dokumentation",
        ],
    },
}
|
|
|
|
# Last-resort template used when neither a specific (action_type, object_class)
# combo nor a per-action base template matches.
_DEFAULT_ACTION_TEMPLATE: dict[str, list[str]] = {
    "test_procedure": [
        "Prüfung der Umsetzung von {object}",
        "Verifizierung der zugehörigen Dokumentation und Nachweisführung",
    ],
    "evidence": [
        "Umsetzungsnachweis",
        "Zugehörige Dokumentation",
    ],
}
|
|
|
|
|
|
# ── 6. Title Suffix (action_type → past participle / state) ─────────────
|
|
|
|
# action_type → German past participle / state phrase. Used to build titles
# ("{Object} {state}") and the structured statement sentence; unknown action
# types fall back to "umgesetzt" at the lookup site.
_ACTION_STATE_SUFFIX: dict[str, str] = {
    "define": "definiert und freigegeben",
    "document": "dokumentiert",
    "maintain": "aktuell gehalten",
    "implement": "umgesetzt",
    "configure": "konfiguriert",
    "monitor": "überwacht",
    "review": "überprüft",
    "assess": "bewertet",
    "audit": "auditiert",
    "test": "getestet",
    "verify": "verifiziert",
    "validate": "validiert",
    "report": "gemeldet",
    "notify": "benachrichtigt",
    "train": "geschult",
    "restrict_access": "zugriffsbeschränkt",
    "encrypt": "verschlüsselt",
    "delete": "gelöscht",
    "retain": "aufbewahrt",
    "ensure": "sichergestellt",
    "approve": "genehmigt",
    "remediate": "behoben",
    "perform": "durchgeführt",
    "obtain": "eingeholt",
}
|
|
|
|
|
|
# ── 6b. Pattern Candidates ──────────────────────────────────────────────
|
|
|
|
# Pattern candidates for specific (action_type, object_class) combinations,
# attached to composed controls for downstream categorization. When no combo
# entry exists, the per-action fallback _PATTERN_CANDIDATES_BY_ACTION is used.
_PATTERN_CANDIDATES_MAP: dict[tuple[str, str], list[str]] = {
    ("define", "policy"): ["policy_documented", "policy_approved"],
    ("document", "policy"): ["policy_documented"],
    ("implement", "technical_control"): ["technical_safeguard_enabled", "security_control_tested"],
    ("implement", "policy"): ["policy_implemented", "policy_communicated"],
    ("implement", "process"): ["process_established", "process_operational"],
    ("monitor", "system"): ["continuous_monitoring_active"],
    ("monitor", "incident"): ["incident_detection_active"],
    ("review", "policy"): ["policy_review_completed"],
    ("review", "risk_artifact"): ["risk_review_completed"],
    ("assess", "risk_artifact"): ["risk_assessment_completed"],
    ("restrict_access", "data"): ["access_control_enforced"],
    ("restrict_access", "system"): ["access_control_enforced", "privilege_management_active"],
    ("restrict_access", "access_control"): ["access_control_enforced"],
    ("encrypt", "data"): ["encryption_at_rest", "encryption_in_transit"],
    ("encrypt", "cryptographic_control"): ["encryption_at_rest", "key_management_active"],
    ("train", "role"): ["awareness_training_completed"],
    ("train", "training"): ["awareness_training_completed"],
    ("report", "incident"): ["incident_reported_timely"],
    ("notify", "incident"): ["notification_sent_timely"],
    ("delete", "data"): ["data_deletion_completed"],
    ("retain", "data"): ["data_retention_enforced"],
    ("audit", "system"): ["audit_completed"],
    ("test", "technical_control"): ["security_control_tested"],
    ("obtain", "consent"): ["consent_obtained"],
    ("obtain", "data"): ["consent_obtained"],
    ("approve", "policy"): ["policy_approved"],
}
|
|
|
|
# Fallback pattern candidates keyed by action_type only; consulted when no
# (action_type, object_class) entry exists in _PATTERN_CANDIDATES_MAP.
_PATTERN_CANDIDATES_BY_ACTION: dict[str, list[str]] = {
    "define": ["policy_documented"],
    "document": ["policy_documented"],
    "implement": ["control_implemented"],
    "monitor": ["continuous_monitoring_active"],
    "review": ["review_completed"],
    "assess": ["assessment_completed"],
    "audit": ["audit_completed"],
    "test": ["security_control_tested"],
    "report": ["incident_reported_timely"],
    "notify": ["notification_sent_timely"],
    "train": ["awareness_training_completed"],
    "restrict_access": ["access_control_enforced"],
    "encrypt": ["encryption_at_rest"],
    "delete": ["data_deletion_completed"],
    "retain": ["data_retention_enforced"],
    "ensure": ["control_implemented"],
    "approve": ["policy_approved"],
    "remediate": ["remediation_completed"],
    "perform": ["activity_performed"],
    "obtain": ["consent_obtained"],
    "configure": ["technical_safeguard_enabled"],
    "verify": ["verification_completed"],
    "validate": ["validation_completed"],
    "maintain": ["control_maintained"],
}
|
|
|
|
|
|
# ── 6c. Raw Infinitives (for validator Negativregeln) ────────────────────
|
|
|
|
# German raw infinitives of common obligation verbs. Used by the output
# validator's negative rules to flag leaked action verbs in generated test
# procedures (garbage pattern "Prüfung der/des/die {infinitive}").
_RAW_INFINITIVES: set[str] = {
    "implementieren", "dokumentieren", "definieren", "konfigurieren",
    "überwachen", "überprüfen", "auditieren", "testen", "verifizieren",
    "validieren", "melden", "benachrichtigen", "schulen", "verschlüsseln",
    "löschen", "aufbewahren", "sicherstellen", "gewährleisten",
    "genehmigen", "beheben", "durchführen", "einholen", "erstellen",
    "festlegen", "bereitstellen", "installieren", "einrichten",
    "bewerten", "analysieren", "kontrollieren", "protokollieren",
}
|
|
|
|
|
|
# ── 7. Object Normalization (with synonym mapping) ──────────────────────
|
|
|
|
# German object term → canonical object key. _normalize_object substitutes the
# longest matching synonym found in the object text, so longer compounds like
# "sicherheitskonzept" win over their substrings ("konzept").
_OBJECT_SYNONYMS: dict[str, str] = {
    "verzeichnis": "register",
    "inventar": "register",
    "katalog": "register",
    "bestandsaufnahme": "register",
    "richtlinie": "policy",
    "konzept": "policy",
    "strategie": "policy",
    "leitlinie": "policy",
    "vorgabe": "policy",
    "regelung": "policy",
    "anweisung": "policy",
    "rahmenwerk": "policy",
    "sicherheitskonzept": "policy",
    "datenschutzkonzept": "policy",
    "verfahren": "procedure",
    "ablauf": "procedure",
    "vorgehensweise": "procedure",
    "methodik": "procedure",
    "prozedur": "procedure",
    "handlungsanweisung": "procedure",
    "protokoll": "record",
    "aufzeichnung": "record",
    "nachweis": "record",
    "evidenz": "record",
    "vorfall": "incident",
    "störung": "incident",
    "sicherheitsvorfall": "incident",
    "notfall": "incident",
    "krise": "incident",
    "schwachstelle": "risk_artifact",
    "gefährdung": "risk_artifact",
    "risikoanalyse": "risk_artifact",
    "risikobewertung": "risk_artifact",
    "mitarbeiter": "role",
    "personal": "role",
    "beauftragter": "role",
    "verantwortlicher": "role",
    "schulung": "training",
    "sensibilisierung": "training",
    "unterweisung": "training",
    "verschlüsselung": "technical_control",
    "firewall": "technical_control",
    "backup": "technical_control",
    "meldung": "report",
    "bericht": "report",
    "benachrichtigung": "report",
    "berechtigung": "access_control",
    "authentifizierung": "access_control",
    "zugriff": "access_control",
    "einwilligung": "consent",
    "zustimmung": "consent",
}
|
|
|
|
|
|
def _normalize_object(object_raw: str) -> str:
    """Normalize object text to a snake_case key for merge hints.

    Applies synonym mapping to collapse German terms to canonical forms
    (e.g., 'Richtlinie' -> 'policy', 'Verzeichnis' -> 'register'),
    transliterates umlauts, and strips everything but [a-z0-9_].
    """
    if not object_raw:
        return "unknown"

    lowered = object_raw.strip().lower()

    # Substitute the longest synonym contained in the text (most specific);
    # ties keep declaration order, matching a first-of-max scan.
    contained = [syn for syn in _OBJECT_SYNONYMS if syn in lowered]
    if contained:
        longest = max(contained, key=len)
        lowered = lowered.replace(longest, _OBJECT_SYNONYMS[longest], 1)

    key = re.sub(r"\s+", "_", lowered.strip())
    for umlaut, ascii_form in (("ä", "ae"), ("ö", "oe"), ("ü", "ue"), ("ß", "ss")):
        key = key.replace(umlaut, ascii_form)
    key = re.sub(r"[^a-z0-9_]", "", key)
    return key[:80] or "unknown"
|
|
|
|
|
|
# ── 7b. Framework / Composite Detection ──────────────────────────────────
|
|
|
|
_FRAMEWORK_KEYWORDS: list[str] = [
|
|
"praktiken", "kontrollen gemäß", "maßnahmen gemäß", "anforderungen aus",
|
|
"anforderungen gemäß", "gemäß .+ umzusetzen", "framework", "standard",
|
|
"controls for", "practices for", "requirements from",
|
|
]
|
|
|
|
_COMPOSITE_OBJECT_KEYWORDS: list[str] = [
|
|
"ccm", "nist", "iso 27001", "iso 27002", "owasp", "bsi",
|
|
"cis controls", "cobit", "sox", "pci dss", "hitrust",
|
|
"soc 2", "soc2", "enisa", "kritis",
|
|
]
|
|
|
|
_COMPOSITE_RE = re.compile(
|
|
"|".join(_FRAMEWORK_KEYWORDS + _COMPOSITE_OBJECT_KEYWORDS),
|
|
re.IGNORECASE,
|
|
)
|
|
|
|
|
|
def _is_composite_obligation(obligation_text: str, object_: str) -> bool:
    """Detect framework-level / composite obligations that are NOT atomic.

    Returns True if the obligation references a framework domain, standard,
    or set of practices rather than a single auditable requirement.
    """
    haystack = " ".join((obligation_text, object_))
    return _COMPOSITE_RE.search(haystack) is not None
|
|
|
|
|
|
# ── 7c. Output Validator (Negativregeln) ─────────────────────────────────
|
|
|
|
def _validate_atomic_control(
    atomic: "AtomicControlCandidate",
    action_type: str,
    object_class: str,
) -> list[str]:
    """Validate an atomic control against required fields + negative rules.

    Returns a list of issue strings (ERROR: / WARN:).
    Logs warnings but never rejects the control.
    """
    issues: list[str] = []

    # ── Required fields ────────────────────────────────────
    required_checks = [
        (atomic.title.strip(), "ERROR: title is empty"),
        (atomic.objective.strip(), "ERROR: objective is empty"),
        (atomic.test_procedure, "ERROR: test_procedure is empty"),
        (atomic.evidence, "ERROR: evidence is empty"),
    ]
    issues.extend(message for value, message in required_checks if not value)

    # ── Negative rules ─────────────────────────────────────
    if len(atomic.title) > 80:
        issues.append(f"ERROR: title exceeds 80 chars ({len(atomic.title)})")

    # Garbage pattern: "Prüfung der {raw_infinitive}" (leaked action verb);
    # at most one finding per test-procedure step.
    for idx, step in enumerate(atomic.test_procedure):
        for verb in _RAW_INFINITIVES:
            if re.search(
                rf"\b(?:der|des|die)\s+{re.escape(verb)}\b", step, re.IGNORECASE,
            ):
                issues.append(
                    f"ERROR: test_procedure[{idx}] contains raw infinitive '{verb}'"
                )
                break

    issues.extend(
        f"ERROR: evidence[{idx}] is empty string"
        for idx, item in enumerate(atomic.evidence)
        if not item.strip()
    )

    # ── Warning rules ──────────────────────────────────────
    confidence = getattr(atomic, "_decomposition_confidence", None)
    if confidence is not None and confidence < 0.5:
        issues.append(f"WARN: low confidence ({confidence})")

    if object_class == "general":
        issues.append("WARN: object_class is 'general' (unclassified)")

    if getattr(atomic, "_is_composite", False):
        issues.append("WARN: composite/framework obligation — requires further decomposition")

    # ERRORs are logged at warning level, WARNs only at debug level.
    for issue in issues:
        emit = logger.warning if issue.startswith("ERROR:") else logger.debug
        emit("Validation: %s — title=%s", issue, atomic.title[:60])

    return issues
|
|
|
|
|
|
# ── 8. Confidence Scoring ───────────────────────────────────────────────
|
|
|
|
def _score_pass0b_confidence(
|
|
action_type: str,
|
|
object_class: str,
|
|
trigger_q: str,
|
|
has_specific_template: bool,
|
|
) -> float:
|
|
"""Score decomposition confidence for a Pass 0b candidate."""
|
|
score = 0.3 # base
|
|
if action_type != "default":
|
|
score += 0.25
|
|
if object_class != "general":
|
|
score += 0.20
|
|
if trigger_q:
|
|
score += 0.10
|
|
if has_specific_template:
|
|
score += 0.15
|
|
return round(min(score, 1.0), 2)
|
|
|
|
|
|
# ── 9. Compose Function ─────────────────────────────────────────────────
|
|
|
|
|
|
def _compose_deterministic(
    obligation_text: str,
    action: str,
    object_: str,
    parent_title: str,
    parent_severity: str,
    parent_category: str,
    is_test: bool,
    is_reporting: bool,
    trigger_type: Optional[str] = None,
    condition: Optional[str] = None,
) -> "AtomicControlCandidate":
    """Compose an atomic control deterministically from obligation data.

    No LLM required. Uses action-type classification, object-class
    matching, and trigger-aware templates. Generates:
    - Title as '{Object} {state suffix}'
    - Statement as '{condition_prefix} {object} ist {trigger} {action}'
    - Evidence/test bundles from (action_type, object_class) matrix
    - Pattern candidates for downstream categorization
    - Merge hint for downstream dedup
    - Structured timing (deadline_hours, frequency)
    - Confidence score
    - Validation issues (Negativregeln)

    Args:
        obligation_text: Normative obligation sentence extracted in Pass 0a.
        action: Raw action phrase; classified via _classify_action
            (defined elsewhere in this module).
        object_: Raw object phrase; classified via _classify_object and
            used for title/statement; falls back to parent_title when empty.
        parent_title: Title of the originating Rich Control.
        parent_severity: Parent severity; normalized via _normalize_severity.
        parent_category: Parent category; defaults to "governance" if falsy.
        is_test: Forces action_type "test" (test obligations are kept
            separate per guardrail 3).
        is_reporting: Forces action_type "report" (guardrail 4);
            is_test takes precedence when both are set.
        trigger_type: Optional trigger kind ("event"/"periodic"/...);
            feeds the trigger qualifier and the merge hint.
        condition: Optional conditional clause prefixed to the statement.

    Returns:
        An AtomicControlCandidate with extra private metadata attributes
        attached (confidence, statement, pattern candidates, timing,
        composite flags, validation issues).
    """
    # Override action type for flagged obligations
    if is_test:
        action_type = "test"
    elif is_reporting:
        action_type = "report"
    else:
        action_type = _classify_action(action)

    object_class = _classify_object(object_)

    # Template lookup: specific combo → action base → default
    has_specific = (action_type, object_class) in _SPECIFIC_TEMPLATES
    template = (
        _SPECIFIC_TEMPLATES.get((action_type, object_class))
        or _ACTION_TEMPLATES.get(action_type)
        or _DEFAULT_ACTION_TEMPLATE
    )

    # Object for template substitution (fallback to parent title)
    obj_display = object_.strip() if object_ else parent_title

    # ── Title: "{Object} {Zustand}" ───────────────────────────
    state = _ACTION_STATE_SUFFIX.get(action_type, "umgesetzt")
    if object_:
        title = f"{object_.strip()} {state}"[:80]
    elif action:
        title = f"{action.strip().capitalize()} {state}"[:80]
    else:
        title = f"{parent_title} {state}"[:80]

    # ── Objective = obligation text (the normative statement) ─
    objective = obligation_text.strip()[:2000]

    # ── Requirements = obligation as concrete requirement ─────
    requirements = [obligation_text.strip()] if obligation_text else []

    # ── Test procedure from templates with object substitution
    test_procedure = [
        tp.replace("{object}", obj_display)
        for tp in template["test_procedure"]
    ]

    # ── Trigger qualifier → add timing test step ──────────────
    trigger_q = _extract_trigger_qualifier(trigger_type, obligation_text)
    if trigger_q and test_procedure:
        test_procedure.append(
            f"Prüfung der Frist-/Trigger-Einhaltung: {trigger_q}"
        )

    # ── Evidence from templates ───────────────────────────────
    # copy so later mutation cannot alter the shared template
    evidence = list(template["evidence"])

    # ── Merge hint for downstream dedup ───────────────────────
    norm_obj = _normalize_object(object_)
    trigger_key = trigger_type or "none"
    merge_hint = f"{action_type}:{norm_obj}:{trigger_key}"

    # ── Statement: structured normative sentence ──────────────
    condition_prefix = ""
    if condition and condition.strip():
        condition_prefix = condition.strip().rstrip(",") + ","
    trigger_clause = trigger_q if trigger_q else ""
    obj_for_stmt = object_.strip() if object_ else parent_title
    if obj_for_stmt:
        # drop empty fragments so the sentence has no doubled spaces
        parts = [p for p in [condition_prefix, obj_for_stmt, "ist", trigger_clause, state] if p]
        statement = " ".join(parts)
    else:
        statement = ""

    # ── Pattern candidates ────────────────────────────────────
    pattern_candidates = _PATTERN_CANDIDATES_MAP.get(
        (action_type, object_class),
        _PATTERN_CANDIDATES_BY_ACTION.get(action_type, []),
    )

    # ── Structured timing ─────────────────────────────────────
    deadline_hours, frequency = _extract_structured_timing(obligation_text)

    # ── Confidence score ──────────────────────────────────────
    confidence = _score_pass0b_confidence(
        action_type, object_class, trigger_q, has_specific,
    )

    atomic = AtomicControlCandidate(
        title=title,
        objective=objective,
        requirements=requirements,
        test_procedure=test_procedure,
        evidence=evidence,
        severity=_normalize_severity(parent_severity),
        category=parent_category or "governance",
    )
    # Attach extra metadata (stored in generation_metadata)
    # NOTE(review): domain/source_regulation are repurposed here to carry
    # "{action_type}:{object_class}" and the merge hint respectively.
    atomic.domain = f"{action_type}:{object_class}"
    atomic.source_regulation = merge_hint
    atomic._decomposition_confidence = confidence  # type: ignore[attr-defined]
    atomic._statement = statement  # type: ignore[attr-defined]
    atomic._pattern_candidates = list(pattern_candidates)  # type: ignore[attr-defined]
    atomic._deadline_hours = deadline_hours  # type: ignore[attr-defined]
    atomic._frequency = frequency  # type: ignore[attr-defined]

    # ── Composite / Framework detection ───────────────────────
    is_composite = _is_composite_obligation(obligation_text, object_)
    atomic._is_composite = is_composite  # type: ignore[attr-defined]
    atomic._atomicity = "composite" if is_composite else "atomic"  # type: ignore[attr-defined]
    atomic._requires_decomposition = is_composite  # type: ignore[attr-defined]

    # ── Validate (log issues, never reject) ───────────────────
    validation_issues = _validate_atomic_control(atomic, action_type, object_class)
    atomic._validation_issues = validation_issues  # type: ignore[attr-defined]

    return atomic
|
|
|
|
|
|
def _build_pass0b_prompt(
|
|
obligation_text: str, action: str, object_: str,
|
|
parent_title: str, parent_category: str, source_ref: str,
|
|
) -> str:
|
|
return f"""\
|
|
Erstelle aus der folgenden Pflicht ein atomares Control.
|
|
|
|
PFLICHT: {obligation_text}
|
|
HANDLUNG: {action}
|
|
GEGENSTAND: {object_}
|
|
|
|
KONTEXT (Ursprungs-Control):
|
|
Titel: {parent_title}
|
|
Kategorie: {parent_category}
|
|
Quellreferenz: {source_ref}
|
|
|
|
Antworte als JSON:
|
|
{{
|
|
"title": "Kurzer Titel (max 80 Zeichen, deutsch)",
|
|
"objective": "Was muss erreicht werden? (1-2 Sätze)",
|
|
"requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
|
|
"test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
|
|
"evidence": ["Nachweis 1", "Nachweis 2"],
|
|
"severity": "critical|high|medium|low",
|
|
"category": "security|privacy|governance|operations|finance|reporting"
|
|
}}"""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Batch Prompts (multiple controls/obligations per API call)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _build_pass0a_batch_prompt(controls: list[dict]) -> str:
|
|
"""Build a prompt for extracting obligations from multiple controls.
|
|
|
|
Each control dict needs: control_id, title, objective, requirements,
|
|
test_procedure, source_ref.
|
|
"""
|
|
parts = []
|
|
for i, ctrl in enumerate(controls, 1):
|
|
parts.append(
|
|
f"--- CONTROL {i} (ID: {ctrl['control_id']}) ---\n"
|
|
f"Titel: {ctrl['title']}\n"
|
|
f"Ziel: {ctrl['objective']}\n"
|
|
f"Anforderungen: {ctrl['requirements']}\n"
|
|
f"Prüfverfahren: {ctrl['test_procedure']}\n"
|
|
f"Quellreferenz: {ctrl['source_ref']}"
|
|
)
|
|
|
|
controls_text = "\n\n".join(parts)
|
|
ids_example = ", ".join(f'"{c["control_id"]}": [...]' for c in controls[:2])
|
|
|
|
return f"""\
|
|
Analysiere die folgenden {len(controls)} Controls und extrahiere aus JEDEM \
|
|
alle einzelnen normativen Pflichten.
|
|
|
|
{controls_text}
|
|
|
|
Antworte als JSON-Objekt. Fuer JEDES Control ein Key (die Control-ID) mit \
|
|
einem Array von Pflichten:
|
|
{{
|
|
{ids_example}
|
|
}}
|
|
|
|
Jede Pflicht hat dieses Format:
|
|
{{
|
|
"obligation_text": "Kurze, präzise Formulierung der Pflicht",
|
|
"action": "Hauptverb/Handlung",
|
|
"object": "Gegenstand der Pflicht",
|
|
"condition": null,
|
|
"normative_strength": "must",
|
|
"is_test_obligation": false,
|
|
"is_reporting_obligation": false
|
|
}}"""
|
|
|
|
|
|
def _build_pass0b_batch_prompt(obligations: list[dict]) -> str:
|
|
"""Build a prompt for composing multiple atomic controls.
|
|
|
|
Each obligation dict needs: candidate_id, obligation_text, action,
|
|
object, parent_title, parent_category, source_ref.
|
|
"""
|
|
parts = []
|
|
for i, obl in enumerate(obligations, 1):
|
|
parts.append(
|
|
f"--- PFLICHT {i} (ID: {obl['candidate_id']}) ---\n"
|
|
f"PFLICHT: {obl['obligation_text']}\n"
|
|
f"HANDLUNG: {obl['action']}\n"
|
|
f"GEGENSTAND: {obl['object']}\n"
|
|
f"KONTEXT: {obl['parent_title']} | {obl['parent_category']}\n"
|
|
f"Quellreferenz: {obl['source_ref']}"
|
|
)
|
|
|
|
obligations_text = "\n\n".join(parts)
|
|
ids_example = ", ".join(f'"{o["candidate_id"]}": {{...}}' for o in obligations[:2])
|
|
|
|
return f"""\
|
|
Erstelle aus den folgenden {len(obligations)} Pflichten je ein atomares Control.
|
|
|
|
{obligations_text}
|
|
|
|
Antworte als JSON-Objekt. Fuer JEDE Pflicht ein Key (die Pflicht-ID):
|
|
{{
|
|
{ids_example}
|
|
}}
|
|
|
|
Jedes Control hat dieses Format:
|
|
{{
|
|
"title": "Kurzer Titel (max 80 Zeichen, deutsch)",
|
|
"objective": "Was muss erreicht werden? (1-2 Sätze)",
|
|
"requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
|
|
"test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
|
|
"evidence": ["Nachweis 1", "Nachweis 2"],
|
|
"severity": "critical|high|medium|low",
|
|
"category": "security|privacy|governance|operations|finance|reporting"
|
|
}}"""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Anthropic API (with prompt caching)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def _llm_anthropic(
    prompt: str,
    system_prompt: str,
    max_tokens: int = 8192,
) -> str:
    """Call Anthropic Messages API with prompt caching for system prompt.

    Args:
        prompt: User message content.
        system_prompt: System text; tagged ``cache_control: ephemeral`` so
            repeated calls with the same system prompt can hit the cache.
        max_tokens: Completion token budget.

    Returns:
        The concatenated text of all text-type content blocks in the
        response, or "" on any HTTP/network error (errors are logged,
        never raised to the caller).

    Raises:
        RuntimeError: If ANTHROPIC_API_KEY is not configured.
    """
    if not ANTHROPIC_API_KEY:
        raise RuntimeError("ANTHROPIC_API_KEY not set")

    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": max_tokens,
        "system": [
            {
                "type": "text",
                "text": system_prompt,
                "cache_control": {"type": "ephemeral"},
            }
        ],
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(
                f"{ANTHROPIC_API_URL}/messages",
                headers=headers,
                json=payload,
            )
            if resp.status_code != 200:
                logger.error(
                    "Anthropic API %d: %s", resp.status_code, resp.text[:300]
                )
                return ""
            data = resp.json()
            # Log cache performance
            usage = data.get("usage", {})
            cached = usage.get("cache_read_input_tokens", 0)
            if cached > 0:
                logger.debug(
                    "Prompt cache hit: %d cached tokens", cached
                )
            content = data.get("content", [])
            if not isinstance(content, list):
                return ""
            # Fix: previously only content[0] was returned and was assumed
            # to be a dict — a non-dict first block crashed into the broad
            # except below, and any additional text blocks were dropped.
            # Join the text of ALL text-type blocks instead.
            return "".join(
                block.get("text", "")
                for block in content
                if isinstance(block, dict)
                and block.get("type", "text") == "text"
            )
    except Exception as e:
        logger.error("Anthropic request failed: %s", e)
        return ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Anthropic Batch API (50% cost reduction, async processing)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
async def create_anthropic_batch(
    requests: list[dict],
) -> dict:
    """Submit a batch of requests to Anthropic Batch API.

    Each request: {"custom_id": "...", "params": {model, max_tokens, system, messages}}
    Returns batch metadata including batch_id.
    """
    if not ANTHROPIC_API_KEY:
        raise RuntimeError("ANTHROPIC_API_KEY not set")

    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }

    async with httpx.AsyncClient(timeout=60) as client:
        response = await client.post(
            f"{ANTHROPIC_API_URL}/messages/batches",
            headers=request_headers,
            json={"requests": requests},
        )
        # Both 200 and 201 count as a successful submission.
        if response.status_code in (200, 201):
            return response.json()
        raise RuntimeError(
            f"Batch API failed {response.status_code}: {response.text[:500]}"
        )
|
|
|
|
|
|
async def check_batch_status(batch_id: str) -> dict:
    """Check the processing status of a batch."""
    auth_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
    }
    status_url = f"{ANTHROPIC_API_URL}/messages/batches/{batch_id}"

    async with httpx.AsyncClient(timeout=30) as client:
        response = await client.get(status_url, headers=auth_headers)
        response.raise_for_status()
        return response.json()
|
|
|
|
|
|
async def fetch_batch_results(batch_id: str) -> list[dict]:
    """Fetch results of a completed batch. Returns list of result objects."""
    auth_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
    }
    results_url = f"{ANTHROPIC_API_URL}/messages/batches/{batch_id}/results"

    async with httpx.AsyncClient(timeout=120) as client:
        response = await client.get(results_url, headers=auth_headers)
        response.raise_for_status()
        # The endpoint streams JSONL: one JSON object per non-empty line.
        return [
            json.loads(line)
            for line in response.text.strip().split("\n")
            if line.strip()
        ]
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Parse helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _parse_json_array(text: str) -> list[dict]:
|
|
"""Extract a JSON array from LLM response text."""
|
|
# Try direct parse
|
|
try:
|
|
result = json.loads(text)
|
|
if isinstance(result, list):
|
|
return result
|
|
if isinstance(result, dict):
|
|
return [result]
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Try extracting JSON array block
|
|
match = re.search(r"\[[\s\S]*\]", text)
|
|
if match:
|
|
try:
|
|
result = json.loads(match.group())
|
|
if isinstance(result, list):
|
|
return result
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
return []
|
|
|
|
|
|
def _parse_json_object(text: str) -> dict:
|
|
"""Extract a JSON object from LLM response text."""
|
|
try:
|
|
result = json.loads(text)
|
|
if isinstance(result, dict):
|
|
return result
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
match = re.search(r"\{[\s\S]*\}", text)
|
|
if match:
|
|
try:
|
|
result = json.loads(match.group())
|
|
if isinstance(result, dict):
|
|
return result
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
return {}
|
|
|
|
|
|
def _ensure_list(val) -> list:
|
|
"""Ensure value is a list."""
|
|
if isinstance(val, list):
|
|
return val
|
|
if isinstance(val, str):
|
|
return [val] if val else []
|
|
return []
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Decomposition Pass
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class DecompositionPass:
|
|
"""Pass 0: Decompose Rich Controls into atomic candidates.
|
|
|
|
Usage::
|
|
|
|
decomp = DecompositionPass(db=session)
|
|
stats_0a = await decomp.run_pass0a(limit=100)
|
|
stats_0b = await decomp.run_pass0b(limit=100)
|
|
"""
|
|
|
|
def __init__(self, db: Session, dedup_enabled: bool = False):
|
|
self.db = db
|
|
self._dedup = None
|
|
if dedup_enabled:
|
|
from compliance.services.control_dedup import (
|
|
ControlDedupChecker, DEDUP_ENABLED,
|
|
)
|
|
if DEDUP_ENABLED:
|
|
self._dedup = ControlDedupChecker(db)
|
|
|
|
# -------------------------------------------------------------------
|
|
# Pass 0a: Obligation Extraction
|
|
# -------------------------------------------------------------------
|
|
|
|
    async def run_pass0a(
        self,
        limit: int = 0,
        batch_size: int = 0,
        use_anthropic: bool = False,
        category_filter: Optional[str] = None,
        source_filter: Optional[str] = None,
    ) -> dict:
        """Extract obligation candidates from rich controls (Pass 0a).

        Selects root rich controls (no parent, not deprecated) that have no
        obligation candidates yet, sends them to the configured LLM, and
        writes validated/rejected candidates via
        _process_pass0a_obligations. Commits after every sub-batch; rolls
        back and counts an error on failure, so partial work survives.

        Args:
            limit: Max controls to process (0 = no limit).
            batch_size: Controls per LLM call (0 = use DECOMPOSITION_BATCH_SIZE
                env var, or 1 for single mode). Only >1 with Anthropic.
            use_anthropic: Use Anthropic API (True) or Ollama (False).
            category_filter: Only process controls matching this category
                (comma-separated, e.g. "security,privacy").
            source_filter: Only process controls from these source regulations
                (comma-separated, e.g. "Maschinenverordnung,Cyber Resilience Act").
                Matched against source_citation::text using ILIKE.

        Returns:
            Stats dict (controls_processed, obligations_extracted, ...).
        """
        if batch_size <= 0:
            batch_size = DECOMPOSITION_BATCH_SIZE if use_anthropic else 1

        # Find rich controls not yet decomposed: root controls only, with
        # no existing obligation candidates.
        query = """
            SELECT cc.id, cc.control_id, cc.title, cc.objective,
                   cc.requirements, cc.test_procedure,
                   cc.source_citation, cc.category
            FROM canonical_controls cc
            WHERE cc.release_state NOT IN ('deprecated')
              AND cc.parent_control_uuid IS NULL
              AND NOT EXISTS (
                  SELECT 1 FROM obligation_candidates oc
                  WHERE oc.parent_control_uuid = cc.id
              )
        """
        params = {}
        if category_filter:
            cats = [c.strip() for c in category_filter.split(",") if c.strip()]
            if cats:
                # NOTE(review): "IN :cats" with a tuple parameter relies on
                # the DB driver rendering the tuple as a SQL row literal
                # (works with psycopg2) — confirm if the driver changes.
                query += " AND cc.category IN :cats"
                params["cats"] = tuple(cats)

        if source_filter:
            sources = [s.strip() for s in source_filter.split(",") if s.strip()]
            if sources:
                # One ILIKE clause per source term, OR-ed together.
                clauses = []
                for idx, src in enumerate(sources):
                    key = f"src_{idx}"
                    clauses.append(f"cc.source_citation::text ILIKE :{key}")
                    params[key] = f"%{src}%"
                query += " AND (" + " OR ".join(clauses) + ")"

        query += " ORDER BY cc.created_at"
        if limit > 0:
            # limit is an int parameter, safe to inline into the SQL text.
            query += f" LIMIT {limit}"

        rows = self.db.execute(text(query), params).fetchall()

        # NOTE(review): the non-Anthropic path below calls Ollama, so the
        # "deterministic" provider label here looks stale — confirm.
        stats = {
            "controls_processed": 0,
            "obligations_extracted": 0,
            "obligations_validated": 0,
            "obligations_rejected": 0,
            "controls_skipped_empty": 0,
            "llm_calls": 0,
            "errors": 0,
            "provider": "anthropic" if use_anthropic else "deterministic",
            "batch_size": batch_size,
        }

        # Prepare control data: normalize fields and skip controls that
        # carry no usable text at all.
        prepared = []
        for row in rows:
            title = row[2] or ""
            objective = row[3] or ""
            req_str = _format_field(row[4] or "")
            test_str = _format_field(row[5] or "")
            source_str = _format_citation(row[6] or "")

            if not title and not objective and not req_str:
                stats["controls_skipped_empty"] += 1
                continue

            prepared.append({
                "uuid": str(row[0]),
                "control_id": row[1] or "",
                "title": title,
                "objective": objective,
                "requirements": req_str,
                "test_procedure": test_str,
                "source_ref": source_str,
                "category": row[7] or "",
            })

        # Process in batches; each iteration commits on success and rolls
        # back (counting an error) on failure.
        for i in range(0, len(prepared), batch_size):
            batch = prepared[i : i + batch_size]
            try:
                if use_anthropic and len(batch) > 1:
                    # Batched Anthropic call: one prompt for the whole
                    # sub-batch, answer keyed by control_id.
                    prompt = _build_pass0a_batch_prompt(batch)
                    llm_response = await _llm_anthropic(
                        prompt=prompt,
                        system_prompt=_PASS0A_SYSTEM_PROMPT,
                        max_tokens=max(8192, len(batch) * 2000),
                    )
                    stats["llm_calls"] += 1
                    results_by_id = _parse_json_object(llm_response)
                    for ctrl in batch:
                        raw_obls = results_by_id.get(ctrl["control_id"], [])
                        if not isinstance(raw_obls, list):
                            raw_obls = [raw_obls] if raw_obls else []
                        if not raw_obls:
                            # LLM returned nothing for this control — use
                            # the deterministic fallback obligation.
                            raw_obls = [_fallback_obligation(ctrl)]
                        self._process_pass0a_obligations(
                            raw_obls, ctrl["control_id"], ctrl["uuid"], stats
                        )
                        stats["controls_processed"] += 1
                elif use_anthropic:
                    # Single Anthropic call
                    ctrl = batch[0]
                    prompt = _build_pass0a_prompt(
                        title=ctrl["title"], objective=ctrl["objective"],
                        requirements=ctrl["requirements"],
                        test_procedure=ctrl["test_procedure"],
                        source_ref=ctrl["source_ref"],
                    )
                    llm_response = await _llm_anthropic(
                        prompt=prompt,
                        system_prompt=_PASS0A_SYSTEM_PROMPT,
                    )
                    stats["llm_calls"] += 1
                    raw_obls = _parse_json_array(llm_response)
                    if not raw_obls:
                        raw_obls = [_fallback_obligation(ctrl)]
                    self._process_pass0a_obligations(
                        raw_obls, ctrl["control_id"], ctrl["uuid"], stats
                    )
                    stats["controls_processed"] += 1
                else:
                    # Ollama (single only)
                    from compliance.services.obligation_extractor import _llm_ollama
                    ctrl = batch[0]
                    prompt = _build_pass0a_prompt(
                        title=ctrl["title"], objective=ctrl["objective"],
                        requirements=ctrl["requirements"],
                        test_procedure=ctrl["test_procedure"],
                        source_ref=ctrl["source_ref"],
                    )
                    llm_response = await _llm_ollama(
                        prompt=prompt,
                        system_prompt=_PASS0A_SYSTEM_PROMPT,
                    )
                    stats["llm_calls"] += 1
                    raw_obls = _parse_json_array(llm_response)
                    if not raw_obls:
                        raw_obls = [_fallback_obligation(ctrl)]
                    self._process_pass0a_obligations(
                        raw_obls, ctrl["control_id"], ctrl["uuid"], stats
                    )
                    stats["controls_processed"] += 1

                # Commit after each successful sub-batch to avoid losing work
                self.db.commit()

            except Exception as e:
                ids = ", ".join(c["control_id"] for c in batch)
                logger.error("Pass 0a failed for [%s]: %s", ids, e)
                stats["errors"] += 1
                try:
                    self.db.rollback()
                except Exception:
                    pass
        logger.info("Pass 0a: %s", stats)
        return stats
|
|
|
|
    # Maps German/English modal keywords to canonical normative strengths.
    # Unknown values fall back to "must" at the lookup site.
    _NORMATIVE_STRENGTH_MAP = {
        "muss": "must", "must": "must",
        "soll": "should", "should": "should",
        "kann": "may", "may": "may",
    }
|
|
|
|
def _process_pass0a_obligations(
|
|
self,
|
|
raw_obligations: list[dict],
|
|
control_id: str,
|
|
control_uuid: str,
|
|
stats: dict,
|
|
) -> None:
|
|
"""Validate and write obligation candidates from LLM output."""
|
|
for idx, raw in enumerate(raw_obligations):
|
|
raw_strength = raw.get("normative_strength", "must").lower().strip()
|
|
normative_strength = self._NORMATIVE_STRENGTH_MAP.get(
|
|
raw_strength, "must"
|
|
)
|
|
cand = ObligationCandidate(
|
|
candidate_id=f"OC-{control_id}-{idx + 1:02d}",
|
|
parent_control_uuid=control_uuid,
|
|
obligation_text=raw.get("obligation_text", ""),
|
|
action=raw.get("action", ""),
|
|
object_=raw.get("object", ""),
|
|
condition=raw.get("condition"),
|
|
normative_strength=normative_strength,
|
|
is_test_obligation=bool(raw.get("is_test_obligation", False)),
|
|
is_reporting_obligation=bool(raw.get("is_reporting_obligation", False)),
|
|
)
|
|
|
|
# Auto-detect test/reporting if LLM missed it
|
|
if not cand.is_test_obligation and _TEST_RE.search(cand.obligation_text):
|
|
cand.is_test_obligation = True
|
|
if not cand.is_reporting_obligation and _REPORTING_RE.search(cand.obligation_text):
|
|
cand.is_reporting_obligation = True
|
|
|
|
# Quality gate + obligation type classification
|
|
flags = quality_gate(cand)
|
|
cand.quality_flags = flags
|
|
cand.extraction_confidence = _compute_extraction_confidence(flags)
|
|
cand.obligation_type = flags.get("obligation_type", "empfehlung")
|
|
|
|
if passes_quality_gate(flags):
|
|
cand.release_state = "validated"
|
|
stats["obligations_validated"] += 1
|
|
else:
|
|
cand.release_state = "rejected"
|
|
stats["obligations_rejected"] += 1
|
|
|
|
self._write_obligation_candidate(cand)
|
|
stats["obligations_extracted"] += 1
|
|
|
|
# -------------------------------------------------------------------
|
|
# Pass 0b: Atomic Control Composition
|
|
# -------------------------------------------------------------------
|
|
|
|
    async def run_pass0b(
        self,
        limit: int = 0,
        batch_size: int = 0,
        use_anthropic: bool = False,
    ) -> dict:
        """Compose atomic controls from validated obligation candidates.

        Reads validated, unmerged candidates joined with their parent
        control, composes one atomic control per candidate (or several,
        via routing/compound splitting on the deterministic path), and
        commits per sub-batch with rollback on failure.

        Args:
            limit: Max candidates to process (0 = no limit).
            batch_size: Commit interval (0 = auto). For LLM: API batch size.
            use_anthropic: Use Anthropic API (True) or deterministic engine (False).

        Returns:
            Stats dict (candidates_processed, controls_created, ...).
        """
        if batch_size <= 0:
            batch_size = DECOMPOSITION_BATCH_SIZE if use_anthropic else 50

        # NOTE(review): the NOT EXISTS guard below is a heuristic re-run
        # guard — it matches the first 20 chars of oc.action inside an
        # existing pass0b atomic title, not an exact dedup.
        query = """
            SELECT oc.id, oc.candidate_id, oc.parent_control_uuid,
                   oc.obligation_text, oc.action, oc.object,
                   oc.condition,
                   oc.is_test_obligation, oc.is_reporting_obligation,
                   cc.title AS parent_title,
                   cc.category AS parent_category,
                   cc.source_citation AS parent_citation,
                   cc.severity AS parent_severity,
                   cc.control_id AS parent_control_id,
                   oc.trigger_type,
                   oc.is_implementation_specific
            FROM obligation_candidates oc
            JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
            WHERE oc.release_state = 'validated'
              AND oc.merged_into_id IS NULL
              AND NOT EXISTS (
                  SELECT 1 FROM canonical_controls ac
                  WHERE ac.parent_control_uuid = oc.parent_control_uuid
                    AND ac.decomposition_method = 'pass0b'
                    AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%'
              )
        """
        if limit > 0:
            # limit is an int parameter, safe to inline into the SQL text.
            query += f" LIMIT {limit}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {
            "candidates_processed": 0,
            "controls_created": 0,
            "llm_failures": 0,
            "llm_calls": 0,
            "errors": 0,
            "provider": "anthropic" if use_anthropic else "deterministic",
            "batch_size": batch_size,
            "dedup_enabled": self._dedup is not None,
            "dedup_linked": 0,
            "dedup_review": 0,
            "skipped_merged": 0,
        }

        # Prepare obligation data; positional indices mirror the SELECT
        # column order above. row[11] (parent_citation) is kept both raw
        # and formatted (source_ref).
        prepared = []
        for row in rows:
            prepared.append({
                "oc_id": str(row[0]),
                "candidate_id": row[1] or "",
                "parent_uuid": str(row[2]),
                "obligation_text": row[3] or "",
                "action": row[4] or "",
                "object": row[5] or "",
                "condition": row[6] or "",
                "is_test": row[7],
                "is_reporting": row[8],
                "parent_title": row[9] or "",
                "parent_category": row[10] or "",
                "parent_citation": row[11] or "",
                "parent_severity": row[12] or "medium",
                "parent_control_id": row[13] or "",
                "source_ref": _format_citation(row[11] or ""),
                "trigger_type": row[14] or "continuous",
                "is_implementation_specific": row[15] or False,
            })

        # Process in batches; commit on success, rollback + error count on
        # failure so earlier sub-batches are preserved.
        for i in range(0, len(prepared), batch_size):
            batch = prepared[i : i + batch_size]
            try:
                if use_anthropic and len(batch) > 1:
                    # Batched Anthropic call
                    prompt = _build_pass0b_batch_prompt(batch)
                    llm_response = await _llm_anthropic(
                        prompt=prompt,
                        system_prompt=_PASS0B_SYSTEM_PROMPT,
                        max_tokens=min(16384, max(4096, len(batch) * 500)),
                    )
                    stats["llm_calls"] += 1
                    results_by_id = _parse_json_object(llm_response)
                    for obl in batch:
                        # Missing keys fall back to {} → deterministic
                        # fallback inside _process_pass0b_control.
                        parsed = results_by_id.get(obl["candidate_id"], {})
                        await self._process_pass0b_control(obl, parsed, stats)
                elif use_anthropic:
                    obl = batch[0]
                    prompt = _build_pass0b_prompt(
                        obligation_text=obl["obligation_text"],
                        action=obl["action"], object_=obl["object"],
                        parent_title=obl["parent_title"],
                        parent_category=obl["parent_category"],
                        source_ref=obl["source_ref"],
                    )
                    llm_response = await _llm_anthropic(
                        prompt=prompt,
                        system_prompt=_PASS0B_SYSTEM_PROMPT,
                    )
                    stats["llm_calls"] += 1
                    parsed = _parse_json_object(llm_response)
                    await self._process_pass0b_control(obl, parsed, stats)
                else:
                    # Deterministic engine — no LLM required; routes each
                    # obligation through framework/compound detection.
                    for obl in batch:
                        await self._route_and_compose(obl, stats)

                # Commit after each successful sub-batch
                self.db.commit()

            except Exception as e:
                ids = ", ".join(o["candidate_id"] for o in batch)
                logger.error("Pass 0b failed for [%s]: %s", ids, e)
                stats["errors"] += 1
                try:
                    self.db.rollback()
                except Exception:
                    pass
        logger.info("Pass 0b: %s", stats)
        return stats
|
|
|
|
    async def _route_and_compose(
        self, obl: dict, stats: dict,
    ) -> None:
        """Route an obligation through the framework detection layer,
        then compose atomic controls.

        Routing types:
        - atomic: compose directly via _compose_deterministic
        - compound: split compound verbs, compose each
        - framework_container: decompose via framework registry,
          then compose each sub-obligation
        """
        # Imported lazily, mirroring the module's pattern for optional
        # service dependencies.
        from compliance.services.framework_decomposition import (
            classify_routing,
            decompose_framework_container,
        )

        routing = classify_routing(
            obligation_text=obl["obligation_text"],
            action_raw=obl["action"],
            object_raw=obl["object"],
            condition_raw=obl.get("condition"),
        )

        if routing.routing_type == "framework_container" and routing.framework_ref:
            # Decompose framework container into sub-obligations
            result = decompose_framework_container(
                obligation_candidate_id=obl["candidate_id"],
                parent_control_id=obl["parent_control_id"],
                obligation_text=obl["obligation_text"],
                framework_ref=routing.framework_ref,
                framework_domain=routing.framework_domain,
            )
            stats.setdefault("framework_decomposed", 0)
            stats.setdefault("framework_sub_obligations", 0)

            if result.release_state == "decomposed" and result.decomposed_obligations:
                stats["framework_decomposed"] += 1
                stats["framework_sub_obligations"] += len(result.decomposed_obligations)
                logger.info(
                    "Framework decomposition: %s → %s/%s → %d sub-obligations",
                    obl["candidate_id"], routing.framework_ref,
                    routing.framework_domain, len(result.decomposed_obligations),
                )
                # Compose each sub-obligation
                for d_obl in result.decomposed_obligations:
                    # Inherit all parent fields, overriding the text/action/
                    # object with the framework sub-obligation's values.
                    sub_obl = {
                        **obl,
                        "obligation_text": d_obl.obligation_text,
                        "action": d_obl.action_raw,
                        "object": d_obl.object_raw,
                    }
                    # A sub-obligation may itself carry a compound verb —
                    # split before composing.
                    sub_actions = _split_compound_action(sub_obl["action"])
                    for sub_action in sub_actions:
                        atomic = _compose_deterministic(
                            obligation_text=sub_obl["obligation_text"],
                            action=sub_action,
                            object_=sub_obl["object"],
                            parent_title=obl["parent_title"],
                            parent_severity=obl["parent_severity"],
                            parent_category=obl["parent_category"],
                            is_test=obl["is_test"],
                            is_reporting=obl["is_reporting"],
                            trigger_type=obl.get("trigger_type"),
                            condition=obl.get("condition"),
                        )
                        # Enrich gen_meta with framework info
                        atomic._framework_ref = routing.framework_ref  # type: ignore[attr-defined]
                        atomic._framework_domain = routing.framework_domain  # type: ignore[attr-defined]
                        atomic._framework_subcontrol_id = d_obl.subcontrol_id  # type: ignore[attr-defined]
                        atomic._decomposition_source = "framework_decomposition"  # type: ignore[attr-defined]
                        await self._process_pass0b_control(
                            obl, {}, stats, atomic=atomic,
                        )
                # All sub-obligations composed — skip normal composition.
                return
            else:
                # Unmatched framework — fall through to normal composition
                logger.warning(
                    "Framework decomposition unmatched: %s — %s",
                    obl["candidate_id"], result.issues,
                )

        # Atomic or compound or unmatched framework: normal composition
        sub_actions = _split_compound_action(obl["action"])
        for sub_action in sub_actions:
            atomic = _compose_deterministic(
                obligation_text=obl["obligation_text"],
                action=sub_action,
                object_=obl["object"],
                parent_title=obl["parent_title"],
                parent_severity=obl["parent_severity"],
                parent_category=obl["parent_category"],
                is_test=obl["is_test"],
                is_reporting=obl["is_reporting"],
                trigger_type=obl.get("trigger_type"),
                condition=obl.get("condition"),
            )
            await self._process_pass0b_control(
                obl, {}, stats, atomic=atomic,
            )
|
|
|
|
    async def _process_pass0b_control(
        self, obl: dict, parsed: dict, stats: dict,
        atomic: Optional[AtomicControlCandidate] = None,
    ) -> None:
        """Create atomic control from deterministic engine, LLM output, or fallback.

        Args:
            obl: Prepared obligation dict (see run_pass0b).
            parsed: LLM JSON for this obligation; {} triggers the
                deterministic fallback.
            stats: Mutable stats dict, updated in place.
            atomic: Pre-composed candidate from the deterministic path;
                when given, `parsed` is ignored.

        If dedup is enabled, checks for duplicates before insertion:
        - LINK: adds parent link to existing control instead of creating new
        - REVIEW: queues for human review, does not create control
        - NEW: creates new control and indexes in Qdrant
        """
        if atomic is not None:
            # Deterministic engine — atomic already composed
            pass
        elif not parsed or not parsed.get("title"):
            # LLM failed → use deterministic engine as fallback
            atomic = _compose_deterministic(
                obligation_text=obl["obligation_text"],
                action=obl["action"], object_=obl["object"],
                parent_title=obl["parent_title"],
                parent_severity=obl["parent_severity"],
                parent_category=obl["parent_category"],
                is_test=obl["is_test"],
                is_reporting=obl["is_reporting"],
                condition=obl.get("condition"),
            )
            stats["llm_failures"] += 1
        else:
            # LLM output — clamp text lengths and normalize list/severity
            # fields before persisting.
            atomic = AtomicControlCandidate(
                title=parsed.get("title", "")[:200],
                objective=parsed.get("objective", "")[:2000],
                requirements=_ensure_list(parsed.get("requirements", [])),
                test_procedure=_ensure_list(parsed.get("test_procedure", [])),
                evidence=_ensure_list(parsed.get("evidence", [])),
                severity=_normalize_severity(
                    parsed.get("severity", obl["parent_severity"])
                ),
                category=parsed.get("category", obl["parent_category"]),
            )

        atomic.parent_control_uuid = obl["parent_uuid"]
        atomic.obligation_candidate_id = obl["candidate_id"]

        # Cap severity for implementation-specific obligations
        if obl.get("is_implementation_specific") and atomic.severity in (
            "critical", "high"
        ):
            atomic.severity = "medium"

        # Override category for test obligations
        if obl.get("is_test"):
            atomic.category = "testing"

        # ── Dedup check (if enabled) ────────────────────────────
        if self._dedup:
            pattern_id = None
            # Try to get pattern_id from parent control
            pid_row = self.db.execute(text(
                "SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
            ), {"uid": obl["parent_uuid"]}).fetchone()
            if pid_row:
                pattern_id = pid_row[0]

            result = await self._dedup.check_duplicate(
                action=obl.get("action", ""),
                obj=obl.get("object", ""),
                title=atomic.title,
                pattern_id=pattern_id,
            )

            if result.verdict == "link":
                # Near-duplicate: link the existing control to this parent
                # instead of creating a new one.
                self._dedup.add_parent_link(
                    control_uuid=result.matched_control_uuid,
                    parent_control_uuid=obl["parent_uuid"],
                    link_type="dedup_merge",
                    confidence=result.similarity_score,
                )
                stats.setdefault("dedup_linked", 0)
                stats["dedup_linked"] += 1
                stats["candidates_processed"] += 1
                logger.info("Dedup LINK: %s → %s (%.3f, %s)",
                            atomic.title[:60], result.matched_control_id,
                            result.similarity_score, result.stage)
                return

            if result.verdict == "review":
                # Ambiguous similarity: queue for human review; the
                # candidate_id is not assigned yet, hence the "" fallback.
                self._dedup.write_review(
                    candidate_control_id=atomic.candidate_id or "",
                    candidate_title=atomic.title,
                    candidate_objective=atomic.objective,
                    result=result,
                    parent_control_uuid=obl["parent_uuid"],
                    obligation_candidate_id=obl.get("oc_id"),
                )
                stats.setdefault("dedup_review", 0)
                stats["dedup_review"] += 1
                stats["candidates_processed"] += 1
                logger.info("Dedup REVIEW: %s ↔ %s (%.3f, %s)",
                            atomic.title[:60], result.matched_control_id,
                            result.similarity_score, result.stage)
                return

        # ── Create new atomic control ───────────────────────────
        seq = self._next_atomic_seq(obl["parent_control_id"])
        atomic.candidate_id = f"{obl['parent_control_id']}-A{seq:02d}"

        new_uuid = self._write_atomic_control(atomic, obl)

        # Mark the source obligation as consumed; idempotent across the
        # multiple calls made for compound/framework sub-obligations.
        self.db.execute(
            text("""
                UPDATE obligation_candidates
                SET release_state = 'composed'
                WHERE id = CAST(:oc_id AS uuid)
            """),
            {"oc_id": obl["oc_id"]},
        )

        # Index in Qdrant for future dedup checks
        if self._dedup and new_uuid:
            pattern_id_val = None
            pid_row2 = self.db.execute(text(
                "SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
            ), {"uid": obl["parent_uuid"]}).fetchone()
            if pid_row2:
                pattern_id_val = pid_row2[0]

            # Only indexed when the parent has a pattern_id to key on.
            if pattern_id_val:
                await self._dedup.index_control(
                    control_uuid=new_uuid,
                    control_id=atomic.candidate_id,
                    title=atomic.title,
                    action=obl.get("action", ""),
                    obj=obl.get("object", ""),
                    pattern_id=pattern_id_val,
                )

        stats["controls_created"] += 1
        stats["candidates_processed"] += 1
|
|
|
# -------------------------------------------------------------------
|
|
# Merge Pass: Deduplicate implementation-level obligations
|
|
# -------------------------------------------------------------------
|
|
|
|
def run_merge_pass(self) -> dict:
|
|
"""Merge implementation-level duplicate obligations within each parent.
|
|
|
|
When the same parent control has multiple obligations with nearly
|
|
identical action+object (e.g. "SMS-Verbot" + "Policy-as-Code" both
|
|
implementing a communication restriction), keep the more abstract one
|
|
and mark the concrete one as merged.
|
|
|
|
No LLM calls — purely rule-based using text similarity.
|
|
"""
|
|
stats = {
|
|
"parents_checked": 0,
|
|
"obligations_merged": 0,
|
|
"obligations_kept": 0,
|
|
}
|
|
|
|
# Get all parents that have >1 validated obligation
|
|
parents = self.db.execute(text("""
|
|
SELECT parent_control_uuid, count(*) AS cnt
|
|
FROM obligation_candidates
|
|
WHERE release_state = 'validated'
|
|
AND merged_into_id IS NULL
|
|
GROUP BY parent_control_uuid
|
|
HAVING count(*) > 1
|
|
""")).fetchall()
|
|
|
|
for parent_uuid, cnt in parents:
|
|
stats["parents_checked"] += 1
|
|
obligs = self.db.execute(text("""
|
|
SELECT id, candidate_id, obligation_text, action, object
|
|
FROM obligation_candidates
|
|
WHERE parent_control_uuid = CAST(:pid AS uuid)
|
|
AND release_state = 'validated'
|
|
AND merged_into_id IS NULL
|
|
ORDER BY created_at
|
|
"""), {"pid": str(parent_uuid)}).fetchall()
|
|
|
|
merged_ids = set()
|
|
oblig_list = list(obligs)
|
|
|
|
for i in range(len(oblig_list)):
|
|
if str(oblig_list[i][0]) in merged_ids:
|
|
continue
|
|
for j in range(i + 1, len(oblig_list)):
|
|
if str(oblig_list[j][0]) in merged_ids:
|
|
continue
|
|
|
|
action_i = (oblig_list[i][3] or "").lower().strip()
|
|
action_j = (oblig_list[j][3] or "").lower().strip()
|
|
obj_i = (oblig_list[i][4] or "").lower().strip()
|
|
obj_j = (oblig_list[j][4] or "").lower().strip()
|
|
|
|
# Check if actions are similar enough to be duplicates
|
|
if not _text_similar(action_i, action_j, threshold=0.75):
|
|
continue
|
|
if not _text_similar(obj_i, obj_j, threshold=0.60):
|
|
continue
|
|
|
|
# Keep the more abstract one (shorter text = less specific)
|
|
text_i = oblig_list[i][2] or ""
|
|
text_j = oblig_list[j][2] or ""
|
|
if _is_more_implementation_specific(text_j, text_i):
|
|
survivor_id = str(oblig_list[i][0])
|
|
merged_id = str(oblig_list[j][0])
|
|
else:
|
|
survivor_id = str(oblig_list[j][0])
|
|
merged_id = str(oblig_list[i][0])
|
|
|
|
self.db.execute(text("""
|
|
UPDATE obligation_candidates
|
|
SET release_state = 'merged',
|
|
merged_into_id = CAST(:survivor AS uuid)
|
|
WHERE id = CAST(:merged AS uuid)
|
|
"""), {"survivor": survivor_id, "merged": merged_id})
|
|
|
|
merged_ids.add(merged_id)
|
|
stats["obligations_merged"] += 1
|
|
|
|
# Commit per parent to avoid large transactions
|
|
self.db.commit()
|
|
|
|
stats["obligations_kept"] = self.db.execute(text("""
|
|
SELECT count(*) FROM obligation_candidates
|
|
WHERE release_state = 'validated' AND merged_into_id IS NULL
|
|
""")).fetchone()[0]
|
|
|
|
logger.info("Merge pass: %s", stats)
|
|
return stats
|
|
|
|
# -------------------------------------------------------------------
|
|
# Enrich Pass: Add metadata to obligations
|
|
# -------------------------------------------------------------------
|
|
|
|
def enrich_obligations(self) -> dict:
|
|
"""Add trigger_type and is_implementation_specific to obligations.
|
|
|
|
Rule-based enrichment — no LLM calls.
|
|
"""
|
|
stats = {
|
|
"enriched": 0,
|
|
"trigger_event": 0,
|
|
"trigger_periodic": 0,
|
|
"trigger_continuous": 0,
|
|
"implementation_specific": 0,
|
|
}
|
|
|
|
obligs = self.db.execute(text("""
|
|
SELECT id, obligation_text, condition, action, object
|
|
FROM obligation_candidates
|
|
WHERE release_state = 'validated'
|
|
AND merged_into_id IS NULL
|
|
AND trigger_type IS NULL
|
|
""")).fetchall()
|
|
|
|
for row in obligs:
|
|
oc_id = str(row[0])
|
|
obl_text = row[1] or ""
|
|
condition = row[2] or ""
|
|
action = row[3] or ""
|
|
obj = row[4] or ""
|
|
|
|
trigger = _classify_trigger_type(obl_text, condition)
|
|
impl = _is_implementation_specific_text(obl_text, action, obj)
|
|
|
|
self.db.execute(text("""
|
|
UPDATE obligation_candidates
|
|
SET trigger_type = :trigger,
|
|
is_implementation_specific = :impl
|
|
WHERE id = CAST(:oid AS uuid)
|
|
"""), {"trigger": trigger, "impl": impl, "oid": oc_id})
|
|
|
|
stats["enriched"] += 1
|
|
stats[f"trigger_{trigger}"] += 1
|
|
if impl:
|
|
stats["implementation_specific"] += 1
|
|
|
|
self.db.commit()
|
|
logger.info("Enrich pass: %s", stats)
|
|
return stats
|
|
|
|
# -------------------------------------------------------------------
|
|
# Decomposition Status
|
|
# -------------------------------------------------------------------
|
|
|
|
def decomposition_status(self) -> dict:
|
|
"""Return decomposition progress."""
|
|
row = self.db.execute(text("""
|
|
SELECT
|
|
(SELECT count(*) FROM canonical_controls
|
|
WHERE parent_control_uuid IS NULL
|
|
AND release_state NOT IN ('deprecated')) AS rich_controls,
|
|
(SELECT count(DISTINCT parent_control_uuid) FROM obligation_candidates) AS decomposed_controls,
|
|
(SELECT count(*) FROM obligation_candidates) AS total_candidates,
|
|
(SELECT count(*) FROM obligation_candidates WHERE release_state = 'validated') AS validated,
|
|
(SELECT count(*) FROM obligation_candidates WHERE release_state = 'rejected') AS rejected,
|
|
(SELECT count(*) FROM obligation_candidates WHERE release_state = 'composed') AS composed,
|
|
(SELECT count(*) FROM canonical_controls WHERE parent_control_uuid IS NOT NULL) AS atomic_controls,
|
|
(SELECT count(*) FROM obligation_candidates WHERE release_state = 'merged') AS merged,
|
|
(SELECT count(*) FROM obligation_candidates WHERE trigger_type IS NOT NULL) AS enriched
|
|
""")).fetchone()
|
|
|
|
validated_for_0b = row[3] - (row[7] or 0) # validated minus merged
|
|
|
|
return {
|
|
"rich_controls": row[0],
|
|
"decomposed_controls": row[1],
|
|
"total_candidates": row[2],
|
|
"validated": row[3],
|
|
"rejected": row[4],
|
|
"composed": row[5],
|
|
"atomic_controls": row[6],
|
|
"merged": row[7] or 0,
|
|
"enriched": row[8] or 0,
|
|
"ready_for_pass0b": validated_for_0b,
|
|
"decomposition_pct": round(row[1] / max(row[0], 1) * 100, 1),
|
|
"composition_pct": round(row[5] / max(validated_for_0b, 1) * 100, 1),
|
|
}
|
|
|
|
# -------------------------------------------------------------------
|
|
# DB Writers
|
|
# -------------------------------------------------------------------
|
|
|
|
def _write_obligation_candidate(self, cand: ObligationCandidate) -> None:
|
|
"""Insert an obligation candidate into the DB."""
|
|
self.db.execute(
|
|
text("""
|
|
INSERT INTO obligation_candidates (
|
|
parent_control_uuid, candidate_id,
|
|
obligation_text, action, object, condition,
|
|
normative_strength, is_test_obligation,
|
|
is_reporting_obligation, extraction_confidence,
|
|
quality_flags, release_state
|
|
) VALUES (
|
|
CAST(:parent_uuid AS uuid), :candidate_id,
|
|
:obligation_text, :action, :object, :condition,
|
|
:normative_strength, :is_test, :is_reporting,
|
|
:confidence, :quality_flags, :release_state
|
|
)
|
|
"""),
|
|
{
|
|
"parent_uuid": cand.parent_control_uuid,
|
|
"candidate_id": cand.candidate_id,
|
|
"obligation_text": cand.obligation_text,
|
|
"action": cand.action,
|
|
"object": cand.object_,
|
|
"condition": cand.condition,
|
|
"normative_strength": cand.normative_strength,
|
|
"is_test": cand.is_test_obligation,
|
|
"is_reporting": cand.is_reporting_obligation,
|
|
"confidence": cand.extraction_confidence,
|
|
"quality_flags": json.dumps(cand.quality_flags),
|
|
"release_state": cand.release_state,
|
|
},
|
|
)
|
|
|
|
def _write_atomic_control(
|
|
self, atomic: AtomicControlCandidate, obl: dict,
|
|
) -> Optional[str]:
|
|
"""Insert an atomic control and create parent link.
|
|
|
|
Returns the UUID of the newly created control, or None on failure.
|
|
"""
|
|
parent_uuid = obl["parent_uuid"]
|
|
candidate_id = obl["candidate_id"]
|
|
|
|
result = self.db.execute(
|
|
text("""
|
|
INSERT INTO canonical_controls (
|
|
control_id, title, objective, rationale,
|
|
scope, requirements,
|
|
test_procedure, evidence, severity,
|
|
open_anchors, category,
|
|
release_state, parent_control_uuid,
|
|
decomposition_method,
|
|
generation_metadata,
|
|
framework_id,
|
|
generation_strategy, pipeline_version
|
|
) VALUES (
|
|
:control_id, :title, :objective, :rationale,
|
|
:scope, :requirements,
|
|
:test_procedure, :evidence,
|
|
:severity, :open_anchors, :category,
|
|
'draft',
|
|
CAST(:parent_uuid AS uuid), 'pass0b',
|
|
:gen_meta,
|
|
CAST(:framework_id AS uuid),
|
|
'pass0b', 2
|
|
)
|
|
RETURNING id::text
|
|
"""),
|
|
{
|
|
"control_id": atomic.candidate_id,
|
|
"title": atomic.title,
|
|
"objective": atomic.objective,
|
|
"rationale": getattr(atomic, "rationale", None) or "Aus Obligation abgeleitet.",
|
|
"scope": json.dumps({}),
|
|
"requirements": json.dumps(atomic.requirements),
|
|
"test_procedure": json.dumps(atomic.test_procedure),
|
|
"evidence": json.dumps(atomic.evidence),
|
|
"severity": atomic.severity,
|
|
"open_anchors": json.dumps([]),
|
|
"category": atomic.category,
|
|
"parent_uuid": parent_uuid,
|
|
"gen_meta": json.dumps({
|
|
"decomposition_source": candidate_id,
|
|
"decomposition_method": "pass0b",
|
|
"engine_version": "v2",
|
|
"action_object_class": getattr(atomic, "domain", ""),
|
|
"merge_group_hint": atomic.source_regulation or "",
|
|
"decomposition_confidence": getattr(
|
|
atomic, "_decomposition_confidence", None
|
|
),
|
|
"statement": getattr(atomic, "_statement", ""),
|
|
"pattern_candidates": getattr(atomic, "_pattern_candidates", []),
|
|
"deadline_hours": getattr(atomic, "_deadline_hours", None),
|
|
"frequency": getattr(atomic, "_frequency", None),
|
|
"validation_issues": getattr(atomic, "_validation_issues", []),
|
|
"is_composite": getattr(atomic, "_is_composite", False),
|
|
"atomicity": getattr(atomic, "_atomicity", "atomic"),
|
|
"requires_decomposition": getattr(atomic, "_requires_decomposition", False),
|
|
"framework_ref": getattr(atomic, "_framework_ref", None),
|
|
"framework_domain": getattr(atomic, "_framework_domain", None),
|
|
"framework_subcontrol_id": getattr(atomic, "_framework_subcontrol_id", None),
|
|
"decomposition_source": getattr(atomic, "_decomposition_source", "direct"),
|
|
}),
|
|
"framework_id": "14b1bdd2-abc7-4a43-adae-14471ee5c7cf",
|
|
},
|
|
)
|
|
|
|
row = result.fetchone()
|
|
new_uuid = row[0] if row else None
|
|
|
|
# Create M:N parent link (control_parent_links)
|
|
if new_uuid:
|
|
citation = _parse_citation(obl.get("parent_citation", ""))
|
|
self.db.execute(
|
|
text("""
|
|
INSERT INTO control_parent_links
|
|
(control_uuid, parent_control_uuid, link_type, confidence,
|
|
source_regulation, source_article, obligation_candidate_id)
|
|
VALUES
|
|
(CAST(:cu AS uuid), CAST(:pu AS uuid), 'decomposition', 1.0,
|
|
:sr, :sa, CAST(:oci AS uuid))
|
|
ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
|
|
"""),
|
|
{
|
|
"cu": new_uuid,
|
|
"pu": parent_uuid,
|
|
"sr": citation.get("source", ""),
|
|
"sa": citation.get("article", ""),
|
|
"oci": obl["oc_id"],
|
|
},
|
|
)
|
|
|
|
return new_uuid
|
|
|
|
def _next_atomic_seq(self, parent_control_id: str) -> int:
|
|
"""Get the next sequence number for atomic controls under a parent."""
|
|
result = self.db.execute(
|
|
text("""
|
|
SELECT count(*) FROM canonical_controls
|
|
WHERE parent_control_uuid = (
|
|
SELECT id FROM canonical_controls
|
|
WHERE control_id = :parent_id
|
|
LIMIT 1
|
|
)
|
|
"""),
|
|
{"parent_id": parent_control_id},
|
|
).fetchone()
|
|
return (result[0] if result else 0) + 1
|
|
|
|
|
|
# -------------------------------------------------------------------
|
|
# Anthropic Batch API: Submit all controls as async batch (50% off)
|
|
# -------------------------------------------------------------------
|
|
|
|
    async def submit_batch_pass0a(
        self,
        limit: int = 0,
        batch_size: int = 5,
        category_filter: Optional[str] = None,
        source_filter: Optional[str] = None,
    ) -> dict:
        """Create an Anthropic Batch API request for Pass 0a.

        Groups controls into content-batches of `batch_size`, then submits
        all batches as one Anthropic Batch (up to 10,000 requests).
        Returns batch metadata for polling.

        Args:
            limit: Max number of controls to include (0 = no limit).
            batch_size: Controls packed into one LLM request/prompt.
            category_filter: Comma-separated list of categories to include.
            source_filter: Comma-separated substrings matched (ILIKE) against
                source_citation.

        Returns:
            Dict with submission status, batch_id and counts; or
            ``{"status": "empty", ...}`` when nothing is pending.
        """
        # Only root ("rich") controls that have not been decomposed yet:
        # no parent link, not deprecated, and no obligation candidates.
        query = """
            SELECT cc.id, cc.control_id, cc.title, cc.objective,
                   cc.requirements, cc.test_procedure,
                   cc.source_citation, cc.category
            FROM canonical_controls cc
            WHERE cc.release_state NOT IN ('deprecated')
              AND cc.parent_control_uuid IS NULL
              AND NOT EXISTS (
                  SELECT 1 FROM obligation_candidates oc
                  WHERE oc.parent_control_uuid = cc.id
              )
        """
        params = {}
        if category_filter:
            cats = [c.strip() for c in category_filter.split(",") if c.strip()]
            if cats:
                # NOTE(review): `IN :cats` with a tuple bind relies on the
                # driver adapting tuples (works with psycopg2); an expanding
                # bindparam would be driver-independent — confirm driver.
                query += " AND cc.category IN :cats"
                params["cats"] = tuple(cats)

        if source_filter:
            sources = [s.strip() for s in source_filter.split(",") if s.strip()]
            if sources:
                # One ILIKE clause per substring, OR-combined; values are
                # bound parameters, only the placeholder names are built.
                clauses = []
                for idx, src in enumerate(sources):
                    key = f"src_{idx}"
                    clauses.append(f"cc.source_citation::text ILIKE :{key}")
                    params[key] = f"%{src}%"
                query += " AND (" + " OR ".join(clauses) + ")"

        query += " ORDER BY cc.created_at"
        if limit > 0:
            # limit is an int parameter, so the f-string interpolation is
            # not an injection vector here.
            query += f" LIMIT {limit}"

        rows = self.db.execute(text(query), params).fetchall()

        # Prepare control data (skip empty)
        prepared = []
        for row in rows:
            title = row[2] or ""
            objective = row[3] or ""
            req_str = _format_field(row[4] or "")
            if not title and not objective and not req_str:
                continue
            prepared.append({
                "uuid": str(row[0]),
                "control_id": row[1] or "",
                "title": title,
                "objective": objective,
                "requirements": req_str,
                "test_procedure": _format_field(row[5] or ""),
                "source_ref": _format_citation(row[6] or ""),
                "category": row[7] or "",
            })

        if not prepared:
            return {"status": "empty", "total_controls": 0}

        # Build batch requests (each request = batch_size controls)
        requests = []
        for i in range(0, len(prepared), batch_size):
            batch = prepared[i : i + batch_size]
            # Multi-control batches use the batch prompt (keyed JSON result);
            # a lone control uses the single-control prompt (JSON array).
            if len(batch) > 1:
                prompt = _build_pass0a_batch_prompt(batch)
            else:
                ctrl = batch[0]
                prompt = _build_pass0a_prompt(
                    title=ctrl["title"], objective=ctrl["objective"],
                    requirements=ctrl["requirements"],
                    test_procedure=ctrl["test_procedure"],
                    source_ref=ctrl["source_ref"],
                )

            # Control IDs in custom_id for result mapping
            ids_str = "+".join(c["control_id"] for c in batch)
            requests.append({
                "custom_id": f"p0a_{ids_str}",
                "params": {
                    "model": ANTHROPIC_MODEL,
                    # Scale output budget with batch size, floor at 8192.
                    "max_tokens": max(8192, len(batch) * 2000),
                    "system": [
                        {
                            "type": "text",
                            "text": _PASS0A_SYSTEM_PROMPT,
                            # Prompt caching: the shared system prompt is
                            # cached across all requests in the batch.
                            "cache_control": {"type": "ephemeral"},
                        }
                    ],
                    "messages": [{"role": "user", "content": prompt}],
                },
            })

        batch_result = await create_anthropic_batch(requests)
        batch_id = batch_result.get("id", "")

        logger.info(
            "Batch API submitted: %s — %d requests (%d controls, batch_size=%d)",
            batch_id, len(requests), len(prepared), batch_size,
        )

        return {
            "status": "submitted",
            "batch_id": batch_id,
            "total_controls": len(prepared),
            "total_requests": len(requests),
            "batch_size": batch_size,
            "category_filter": category_filter,
            "source_filter": source_filter,
        }
|
|
|
|
    async def submit_batch_pass0b(
        self,
        limit: int = 0,
        batch_size: int = 5,
    ) -> dict:
        """Create an Anthropic Batch API request for Pass 0b.

        Selects validated obligations that do not yet have a matching
        pass0b atomic control and submits them to the Batch API, packed
        `batch_size` per request.

        Args:
            limit: Max number of obligations to include (0 = no limit).
            batch_size: Obligations packed into one LLM request/prompt.

        Returns:
            Dict with submission status, batch_id and counts; or
            ``{"status": "empty", ...}`` when nothing is pending.
        """
        # NOTE(review): the NOT EXISTS guard matches already-composed
        # controls by a LIKE on the first 20 chars of the action — a
        # heuristic that can both over- and under-match; confirm acceptable.
        query = """
            SELECT oc.id, oc.candidate_id, oc.parent_control_uuid,
                   oc.obligation_text, oc.action, oc.object,
                   oc.is_test_obligation, oc.is_reporting_obligation,
                   cc.title AS parent_title,
                   cc.category AS parent_category,
                   cc.source_citation AS parent_citation,
                   cc.severity AS parent_severity,
                   cc.control_id AS parent_control_id
            FROM obligation_candidates oc
            JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
            WHERE oc.release_state = 'validated'
              AND NOT EXISTS (
                  SELECT 1 FROM canonical_controls ac
                  WHERE ac.parent_control_uuid = oc.parent_control_uuid
                    AND ac.decomposition_method = 'pass0b'
                    AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%'
              )
        """
        if limit > 0:
            # limit is an int parameter — safe to interpolate.
            query += f" LIMIT {limit}"

        rows = self.db.execute(text(query)).fetchall()

        # Flatten rows into dicts; column order mirrors the SELECT above.
        prepared = []
        for row in rows:
            prepared.append({
                "oc_id": str(row[0]),
                "candidate_id": row[1] or "",
                "parent_uuid": str(row[2]),
                "obligation_text": row[3] or "",
                "action": row[4] or "",
                "object": row[5] or "",
                "is_test": row[6],
                "is_reporting": row[7],
                "parent_title": row[8] or "",
                "parent_category": row[9] or "",
                "parent_citation": row[10] or "",
                "parent_severity": row[11] or "medium",
                "parent_control_id": row[12] or "",
                "source_ref": _format_citation(row[10] or ""),
            })

        if not prepared:
            return {"status": "empty", "total_candidates": 0}

        # Build one Batch API request per group of `batch_size` obligations.
        requests = []
        for i in range(0, len(prepared), batch_size):
            batch = prepared[i : i + batch_size]
            if len(batch) > 1:
                prompt = _build_pass0b_batch_prompt(batch)
            else:
                obl = batch[0]
                prompt = _build_pass0b_prompt(
                    obligation_text=obl["obligation_text"],
                    action=obl["action"], object_=obl["object"],
                    parent_title=obl["parent_title"],
                    parent_category=obl["parent_category"],
                    source_ref=obl["source_ref"],
                )

            # Candidate IDs in custom_id for result mapping on fetch.
            ids_str = "+".join(o["candidate_id"] for o in batch)
            requests.append({
                "custom_id": f"p0b_{ids_str}",
                "params": {
                    "model": ANTHROPIC_MODEL,
                    # Scale output budget with batch size, floor at 8192.
                    "max_tokens": max(8192, len(batch) * 1500),
                    "system": [
                        {
                            "type": "text",
                            "text": _PASS0B_SYSTEM_PROMPT,
                            # Shared system prompt is cached across requests.
                            "cache_control": {"type": "ephemeral"},
                        }
                    ],
                    "messages": [{"role": "user", "content": prompt}],
                },
            })

        batch_result = await create_anthropic_batch(requests)
        batch_id = batch_result.get("id", "")

        logger.info(
            "Batch API Pass 0b submitted: %s — %d requests (%d candidates)",
            batch_id, len(requests), len(prepared),
        )

        return {
            "status": "submitted",
            "batch_id": batch_id,
            "total_candidates": len(prepared),
            "total_requests": len(requests),
            "batch_size": batch_size,
        }
|
|
|
|
async def process_batch_results(
|
|
self, batch_id: str, pass_type: str = "0a",
|
|
) -> dict:
|
|
"""Fetch and process results from a completed Anthropic batch.
|
|
|
|
Args:
|
|
batch_id: Anthropic batch ID.
|
|
pass_type: "0a" or "0b".
|
|
"""
|
|
# Check status first
|
|
status = await check_batch_status(batch_id)
|
|
if status.get("processing_status") != "ended":
|
|
return {
|
|
"status": "not_ready",
|
|
"processing_status": status.get("processing_status"),
|
|
"request_counts": status.get("request_counts", {}),
|
|
}
|
|
|
|
results = await fetch_batch_results(batch_id)
|
|
stats = {
|
|
"results_processed": 0,
|
|
"results_succeeded": 0,
|
|
"results_failed": 0,
|
|
"errors": 0,
|
|
}
|
|
|
|
if pass_type == "0a":
|
|
stats.update({
|
|
"controls_processed": 0,
|
|
"obligations_extracted": 0,
|
|
"obligations_validated": 0,
|
|
"obligations_rejected": 0,
|
|
})
|
|
else:
|
|
stats.update({
|
|
"candidates_processed": 0,
|
|
"controls_created": 0,
|
|
"llm_failures": 0,
|
|
})
|
|
|
|
for result in results:
|
|
custom_id = result.get("custom_id", "")
|
|
result_data = result.get("result", {})
|
|
stats["results_processed"] += 1
|
|
|
|
if result_data.get("type") != "succeeded":
|
|
stats["results_failed"] += 1
|
|
logger.warning("Batch result failed: %s — %s", custom_id, result_data)
|
|
continue
|
|
|
|
stats["results_succeeded"] += 1
|
|
message = result_data.get("message", {})
|
|
content = message.get("content", [])
|
|
text_content = content[0].get("text", "") if content else ""
|
|
|
|
try:
|
|
if pass_type == "0a":
|
|
self._handle_batch_result_0a(custom_id, text_content, stats)
|
|
else:
|
|
await self._handle_batch_result_0b(custom_id, text_content, stats)
|
|
except Exception as e:
|
|
logger.error("Processing batch result %s: %s", custom_id, e)
|
|
stats["errors"] += 1
|
|
|
|
self.db.commit()
|
|
stats["status"] = "completed"
|
|
return stats
|
|
|
|
def _handle_batch_result_0a(
|
|
self, custom_id: str, text_content: str, stats: dict,
|
|
) -> None:
|
|
"""Process a single Pass 0a batch result."""
|
|
# custom_id format: p0a_CTRL-001+CTRL-002+...
|
|
prefix = "p0a_"
|
|
control_ids = custom_id[len(prefix):].split("+") if custom_id.startswith(prefix) else []
|
|
|
|
if len(control_ids) == 1:
|
|
raw_obls = _parse_json_array(text_content)
|
|
control_id = control_ids[0]
|
|
uuid_row = self.db.execute(
|
|
text("SELECT id FROM canonical_controls WHERE control_id = :cid LIMIT 1"),
|
|
{"cid": control_id},
|
|
).fetchone()
|
|
if not uuid_row:
|
|
return
|
|
control_uuid = str(uuid_row[0])
|
|
if not raw_obls:
|
|
raw_obls = [{"obligation_text": control_id, "action": "sicherstellen",
|
|
"object": control_id}]
|
|
self._process_pass0a_obligations(raw_obls, control_id, control_uuid, stats)
|
|
stats["controls_processed"] += 1
|
|
else:
|
|
results_by_id = _parse_json_object(text_content)
|
|
for control_id in control_ids:
|
|
uuid_row = self.db.execute(
|
|
text("SELECT id FROM canonical_controls WHERE control_id = :cid LIMIT 1"),
|
|
{"cid": control_id},
|
|
).fetchone()
|
|
if not uuid_row:
|
|
continue
|
|
control_uuid = str(uuid_row[0])
|
|
raw_obls = results_by_id.get(control_id, [])
|
|
if not isinstance(raw_obls, list):
|
|
raw_obls = [raw_obls] if raw_obls else []
|
|
if not raw_obls:
|
|
raw_obls = [{"obligation_text": control_id, "action": "sicherstellen",
|
|
"object": control_id}]
|
|
self._process_pass0a_obligations(raw_obls, control_id, control_uuid, stats)
|
|
stats["controls_processed"] += 1
|
|
|
|
async def _handle_batch_result_0b(
|
|
self, custom_id: str, text_content: str, stats: dict,
|
|
) -> None:
|
|
"""Process a single Pass 0b batch result."""
|
|
prefix = "p0b_"
|
|
candidate_ids = custom_id[len(prefix):].split("+") if custom_id.startswith(prefix) else []
|
|
|
|
if len(candidate_ids) == 1:
|
|
parsed = _parse_json_object(text_content)
|
|
obl = self._load_obligation_for_0b(candidate_ids[0])
|
|
if obl:
|
|
await self._process_pass0b_control(obl, parsed, stats)
|
|
else:
|
|
results_by_id = _parse_json_object(text_content)
|
|
for cand_id in candidate_ids:
|
|
parsed = results_by_id.get(cand_id, {})
|
|
obl = self._load_obligation_for_0b(cand_id)
|
|
if obl:
|
|
await self._process_pass0b_control(obl, parsed, stats)
|
|
|
|
def _load_obligation_for_0b(self, candidate_id: str) -> Optional[dict]:
|
|
"""Load obligation data needed for Pass 0b processing."""
|
|
row = self.db.execute(
|
|
text("""
|
|
SELECT oc.id, oc.candidate_id, oc.parent_control_uuid,
|
|
oc.obligation_text, oc.action, oc.object,
|
|
oc.is_test_obligation, oc.is_reporting_obligation,
|
|
cc.title, cc.category, cc.source_citation,
|
|
cc.severity, cc.control_id
|
|
FROM obligation_candidates oc
|
|
JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
|
|
WHERE oc.candidate_id = :cid
|
|
"""),
|
|
{"cid": candidate_id},
|
|
).fetchone()
|
|
if not row:
|
|
return None
|
|
return {
|
|
"oc_id": str(row[0]),
|
|
"candidate_id": row[1] or "",
|
|
"parent_uuid": str(row[2]),
|
|
"obligation_text": row[3] or "",
|
|
"action": row[4] or "",
|
|
"object": row[5] or "",
|
|
"is_test": row[6],
|
|
"is_reporting": row[7],
|
|
"parent_title": row[8] or "",
|
|
"parent_category": row[9] or "",
|
|
"parent_citation": row[10] or "",
|
|
"parent_severity": row[11] or "medium",
|
|
"parent_control_id": row[12] or "",
|
|
"source_ref": _format_citation(row[10] or ""),
|
|
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _fallback_obligation(ctrl: dict) -> dict:
|
|
"""Create a single fallback obligation when LLM returns nothing."""
|
|
return {
|
|
"obligation_text": ctrl.get("objective") or ctrl.get("title", ""),
|
|
"action": "sicherstellen",
|
|
"object": ctrl.get("title", ""),
|
|
"condition": None,
|
|
"normative_strength": "must",
|
|
"is_test_obligation": False,
|
|
"is_reporting_obligation": False,
|
|
}
|
|
|
|
|
|
def _format_field(value) -> str:
|
|
"""Format a requirements/test_procedure field for the LLM prompt."""
|
|
if not value:
|
|
return ""
|
|
if isinstance(value, str):
|
|
try:
|
|
parsed = json.loads(value)
|
|
if isinstance(parsed, list):
|
|
return "\n".join(f"- {item}" for item in parsed)
|
|
return value
|
|
except (json.JSONDecodeError, TypeError):
|
|
return value
|
|
if isinstance(value, list):
|
|
return "\n".join(f"- {item}" for item in value)
|
|
return str(value)
|
|
|
|
|
|
def _format_citation(citation) -> str:
|
|
"""Format source_citation for display."""
|
|
if not citation:
|
|
return ""
|
|
if isinstance(citation, str):
|
|
try:
|
|
c = json.loads(citation)
|
|
if isinstance(c, dict):
|
|
parts = []
|
|
if c.get("source"):
|
|
parts.append(c["source"])
|
|
if c.get("article"):
|
|
parts.append(c["article"])
|
|
if c.get("paragraph"):
|
|
parts.append(c["paragraph"])
|
|
return " ".join(parts) if parts else citation
|
|
except (json.JSONDecodeError, TypeError):
|
|
return citation
|
|
return str(citation)
|
|
|
|
|
|
def _parse_citation(citation) -> dict:
|
|
"""Parse source_citation JSONB into a dict with source/article/paragraph."""
|
|
if not citation:
|
|
return {}
|
|
if isinstance(citation, dict):
|
|
return citation
|
|
if isinstance(citation, str):
|
|
try:
|
|
c = json.loads(citation)
|
|
if isinstance(c, dict):
|
|
return c
|
|
except (json.JSONDecodeError, TypeError):
|
|
pass
|
|
return {}
|
|
|
|
|
|
def _compute_extraction_confidence(flags: dict) -> float:
|
|
"""Compute confidence score from quality flags."""
|
|
score = 0.0
|
|
weights = {
|
|
"has_normative_signal": 0.30,
|
|
"single_action": 0.20,
|
|
"not_rationale": 0.20,
|
|
"not_evidence_only": 0.15,
|
|
"min_length": 0.10,
|
|
"has_parent_link": 0.05,
|
|
}
|
|
for flag, weight in weights.items():
|
|
if flags.get(flag, False):
|
|
score += weight
|
|
return round(score, 2)
|
|
|
|
|
|
def _normalize_severity(val: str) -> str:
|
|
"""Normalize severity value."""
|
|
val = (val or "medium").lower().strip()
|
|
if val in ("critical", "high", "medium", "low"):
|
|
return val
|
|
return "medium"
|
|
|
|
|
|
# _template_fallback removed — replaced by _compose_deterministic engine
|