Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 47s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 24s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Implements the full Multi-Layer Control Architecture for migrating ~25,000 Rich Controls into atomic, deduplicated Master Controls with full traceability. Architecture: Legal Source → Obligation → Control Pattern → Master Control → Customer Instance New services: - ObligationExtractor: 3-tier extraction (exact → embedding → LLM) - PatternMatcher: 2-tier matching (keyword + embedding + domain-bonus) - ControlComposer: Pattern + Obligation → Master Control - PipelineAdapter: Pipeline integration + Migration Passes 1-5 - DecompositionPass: Pass 0a/0b — Rich Control → atomic Controls - CrosswalkRoutes: 15 API endpoints under /v1/canonical/ New DB schema: - Migration 060: obligation_extractions, control_patterns, crosswalk_matrix - Migration 061: obligation_candidates, parent_control_uuid tracking Pattern Library: 50 YAML patterns (30 core + 20 IT-security) Go SDK: Pattern loader with YAML validation and indexing Documentation: MkDocs updated with full architecture overview 500 Python tests passing across all components. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
855 lines
31 KiB
Python
855 lines
31 KiB
Python
"""Decomposition Pass — Split Rich Controls into Atomic Controls.

Pass 0 of the Multi-Layer Control Architecture migration. Runs BEFORE
Passes 1-5 (obligation linkage, pattern classification, etc.).

Two sub-passes:
    Pass 0a: Obligation Extraction — extract individual normative obligations
             from a Rich Control using LLM with strict guardrails.
    Pass 0b: Atomic Control Composition — turn each obligation candidate
             into a standalone atomic control record.

Plus a Quality Gate that validates extraction results.

Guardrails (the 6 rules):
    1. Only normative statements (müssen, sicherzustellen, verpflichtet, ...)
    2. One main verb per obligation
    3. Test obligations separate from operational obligations
    4. Reporting obligations separate
    5. Don't split at evidence level
    6. Parent link always preserved
"""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import uuid
|
|
from dataclasses import dataclass, field
|
|
from typing import Optional
|
|
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Normative signal detection (Rule 1)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_NORMATIVE_SIGNALS = [
|
|
r"\bmüssen\b", r"\bmuss\b", r"\bhat\s+sicherzustellen\b",
|
|
r"\bhaben\s+sicherzustellen\b", r"\bsind\s+verpflichtet\b",
|
|
r"\bist\s+verpflichtet\b", r"\bist\s+zu\s+\w+en\b",
|
|
r"\bsind\s+zu\s+\w+en\b", r"\bhat\s+zu\s+\w+en\b",
|
|
r"\bhaben\s+zu\s+\w+en\b", r"\bsoll\b", r"\bsollen\b",
|
|
r"\bgewährleisten\b", r"\bsicherstellen\b",
|
|
r"\bshall\b", r"\bmust\b", r"\brequired\b",
|
|
r"\bshould\b", r"\bensure\b",
|
|
]
|
|
_NORMATIVE_RE = re.compile("|".join(_NORMATIVE_SIGNALS), re.IGNORECASE)
|
|
|
|
_RATIONALE_SIGNALS = [
|
|
r"\bda\s+", r"\bweil\b", r"\bgrund\b", r"\berwägung",
|
|
r"\bbecause\b", r"\breason\b", r"\brationale\b",
|
|
r"\bkönnen\s+.*\s+verursachen\b", r"\bführt\s+zu\b",
|
|
]
|
|
_RATIONALE_RE = re.compile("|".join(_RATIONALE_SIGNALS), re.IGNORECASE)
|
|
|
|
_TEST_SIGNALS = [
|
|
r"\btesten\b", r"\btest\b", r"\bprüfung\b", r"\bprüfen\b",
|
|
r"\bgetestet\b", r"\bwirksamkeit\b", r"\baudit\b",
|
|
r"\bregelmäßig\b.*\b(prüf|test|kontroll)",
|
|
r"\beffectiveness\b", r"\bverif",
|
|
]
|
|
_TEST_RE = re.compile("|".join(_TEST_SIGNALS), re.IGNORECASE)
|
|
|
|
_REPORTING_SIGNALS = [
|
|
r"\bmelden\b", r"\bmeldung\b", r"\bunterricht",
|
|
r"\binformieren\b", r"\bbenachricht", r"\bnotif",
|
|
r"\breport\b", r"\bbehörd",
|
|
]
|
|
_REPORTING_RE = re.compile("|".join(_REPORTING_SIGNALS), re.IGNORECASE)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Data classes
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
@dataclass
class ObligationCandidate:
    """One normative obligation that was extracted from a Rich Control.

    All fields have defaults so an instance can be built incrementally.
    """

    candidate_id: str = ""
    parent_control_uuid: str = ""
    obligation_text: str = ""
    action: str = ""
    object_: str = ""  # trailing underscore avoids shadowing the builtin
    condition: Optional[str] = None
    normative_strength: str = "must"
    is_test_obligation: bool = False
    is_reporting_obligation: bool = False
    extraction_confidence: float = 0.0
    quality_flags: dict = field(default_factory=dict)
    release_state: str = "extracted"

    def to_dict(self) -> dict:
        """Return the candidate as a plain dict.

        Keys equal the attribute names, except ``object_`` which is exported
        under the key ``"object"``. Values are returned by reference.
        """
        attribute_names = (
            "candidate_id",
            "parent_control_uuid",
            "obligation_text",
            "action",
            "object_",
            "condition",
            "normative_strength",
            "is_test_obligation",
            "is_reporting_obligation",
            "extraction_confidence",
            "quality_flags",
            "release_state",
        )
        return {
            name.rstrip("_"): getattr(self, name) for name in attribute_names
        }
|
|
|
|
|
|
@dataclass
class AtomicControlCandidate:
    """An atomic control that was composed from one ObligationCandidate."""

    candidate_id: str = ""
    parent_control_uuid: str = ""
    obligation_candidate_id: str = ""
    title: str = ""
    objective: str = ""
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    severity: str = "medium"
    category: str = ""
    domain: str = ""
    source_regulation: str = ""
    source_article: str = ""

    def to_dict(self) -> dict:
        """Return the candidate as a plain dict.

        ``source_regulation`` and ``source_article`` are not part of the
        exported dict. Values are returned by reference.
        """
        exported_attributes = (
            "candidate_id",
            "parent_control_uuid",
            "obligation_candidate_id",
            "title",
            "objective",
            "requirements",
            "test_procedure",
            "evidence",
            "severity",
            "category",
            "domain",
        )
        return {name: getattr(self, name) for name in exported_attributes}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Quality Gate
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
# Heuristic for "more than one action": a German conjunction followed, later
# on the same line, by one of the listed obligation verbs.
_MULTI_VERB_RE = re.compile(
    r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
    r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
    re.IGNORECASE,
)

# Text that starts with one of these nouns is treated as an evidence fragment.
_EVIDENCE_ONLY_RE = re.compile(
    r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
    re.IGNORECASE,
)


def quality_gate(candidate: ObligationCandidate) -> dict:
    """Validate an obligation candidate and return its quality flags.

    A missing or non-string ``obligation_text`` is treated as empty text, so
    the gate reports failing flags instead of raising ``TypeError``.

    Args:
        candidate: The obligation candidate to check.

    Returns:
        A dict with one boolean per check:
            has_normative_signal: text contains normative language
            single_action: no conjunction followed by a second listed verb
            not_rationale: at least as many normative as rationale signals
            not_evidence_only: text does not start with an evidence noun
            min_length: stripped text has at least 20 characters
            has_parent_link: ``parent_control_uuid`` is non-empty
    """
    raw_text = candidate.obligation_text
    txt = raw_text if isinstance(raw_text, str) else ""
    stripped = txt.strip()

    # Count matches once; the first flag only needs to know if there are any.
    normative_count = len(_NORMATIVE_RE.findall(txt))
    rationale_count = len(_RATIONALE_RE.findall(txt))

    return {
        # 1. Normative signal
        "has_normative_signal": normative_count > 0,
        # 2. Single action heuristic (imperfect but useful)
        "single_action": _MULTI_VERB_RE.search(txt) is None,
        # 3. Not rationale
        "not_rationale": normative_count >= rationale_count,
        # 4. Not evidence-only
        "not_evidence_only": _EVIDENCE_ONLY_RE.match(stripped) is None,
        # 5. Min length
        "min_length": len(stripped) >= 20,
        # 6. Parent link
        "has_parent_link": bool(candidate.parent_control_uuid),
    }
|
|
|
|
|
|
def passes_quality_gate(flags: dict) -> bool:
    """Return True only when every critical quality flag is truthy.

    A flag that is missing from ``flags`` counts as failed. Flags outside the
    critical set are ignored.
    """
    for critical_flag in (
        "has_normative_signal",
        "not_evidence_only",
        "min_length",
        "has_parent_link",
    ):
        if not flags.get(critical_flag, False):
            return False
    return True
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# LLM Prompts
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
_PASS0A_SYSTEM_PROMPT = """\
|
|
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
|
|
in einzelne atomare Pflichten.
|
|
|
|
REGELN (STRIKT EINHALTEN):
|
|
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
|
|
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
|
|
ist zu testen, shall, must, required.
|
|
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
|
|
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
|
|
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
|
|
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
|
|
eigenes Control, sondern Evidence).
|
|
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
|
|
— NICHT extrahieren.
|
|
|
|
Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
|
|
|
|
|
|
def _build_pass0a_prompt(
|
|
title: str, objective: str, requirements: str,
|
|
test_procedure: str, source_ref: str
|
|
) -> str:
|
|
return f"""\
|
|
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
|
|
Pflichten als JSON-Array.
|
|
|
|
CONTROL:
|
|
Titel: {title}
|
|
Ziel: {objective}
|
|
Anforderungen: {requirements}
|
|
Prüfverfahren: {test_procedure}
|
|
Quellreferenz: {source_ref}
|
|
|
|
Antworte als JSON-Array:
|
|
[
|
|
{{
|
|
"obligation_text": "Kurze, präzise Formulierung der Pflicht",
|
|
"action": "Hauptverb/Handlung",
|
|
"object": "Gegenstand der Pflicht",
|
|
"condition": "Auslöser/Bedingung oder null",
|
|
"normative_strength": "must",
|
|
"is_test_obligation": false,
|
|
"is_reporting_obligation": false
|
|
}}
|
|
]"""
|
|
|
|
|
|
_PASS0B_SYSTEM_PROMPT = """\
|
|
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
|
|
normativen Pflicht ein praxisorientiertes, atomares Security Control.
|
|
|
|
Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
|
|
Antworte NUR als JSON. Keine Erklärungen."""
|
|
|
|
|
|
def _build_pass0b_prompt(
|
|
obligation_text: str, action: str, object_: str,
|
|
parent_title: str, parent_category: str, source_ref: str,
|
|
) -> str:
|
|
return f"""\
|
|
Erstelle aus der folgenden Pflicht ein atomares Control.
|
|
|
|
PFLICHT: {obligation_text}
|
|
HANDLUNG: {action}
|
|
GEGENSTAND: {object_}
|
|
|
|
KONTEXT (Ursprungs-Control):
|
|
Titel: {parent_title}
|
|
Kategorie: {parent_category}
|
|
Quellreferenz: {source_ref}
|
|
|
|
Antworte als JSON:
|
|
{{
|
|
"title": "Kurzer Titel (max 80 Zeichen, deutsch)",
|
|
"objective": "Was muss erreicht werden? (1-2 Sätze)",
|
|
"requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
|
|
"test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
|
|
"evidence": ["Nachweis 1", "Nachweis 2"],
|
|
"severity": "critical|high|medium|low",
|
|
"category": "security|privacy|governance|operations|finance|reporting"
|
|
}}"""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Parse helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _parse_json_array(text: str) -> list[dict]:
|
|
"""Extract a JSON array from LLM response text."""
|
|
# Try direct parse
|
|
try:
|
|
result = json.loads(text)
|
|
if isinstance(result, list):
|
|
return result
|
|
if isinstance(result, dict):
|
|
return [result]
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Try extracting JSON array block
|
|
match = re.search(r"\[[\s\S]*\]", text)
|
|
if match:
|
|
try:
|
|
result = json.loads(match.group())
|
|
if isinstance(result, list):
|
|
return result
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
return []
|
|
|
|
|
|
def _parse_json_object(text: str) -> dict:
|
|
"""Extract a JSON object from LLM response text."""
|
|
try:
|
|
result = json.loads(text)
|
|
if isinstance(result, dict):
|
|
return result
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
match = re.search(r"\{[\s\S]*\}", text)
|
|
if match:
|
|
try:
|
|
result = json.loads(match.group())
|
|
if isinstance(result, dict):
|
|
return result
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
return {}
|
|
|
|
|
|
def _ensure_list(val) -> list:
|
|
"""Ensure value is a list."""
|
|
if isinstance(val, list):
|
|
return val
|
|
if isinstance(val, str):
|
|
return [val] if val else []
|
|
return []
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Decomposition Pass
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
class DecompositionPass:
    """Pass 0: Decompose Rich Controls into atomic candidates.

    Pass 0a asks the LLM for the individual obligations of each Rich Control
    and stores them in ``obligation_candidates``. Pass 0b turns each validated
    candidate into a new row in ``canonical_controls``.

    Usage::

        decomp = DecompositionPass(db=session)
        stats_0a = await decomp.run_pass0a(limit=100)
        stats_0b = await decomp.run_pass0b(limit=100)
    """

    def __init__(self, db: Session):
        """Keep the session that all reads and writes of this pass go through."""
        self.db = db

    # -------------------------------------------------------------------
    # Pass 0a: Obligation Extraction
    # -------------------------------------------------------------------

    async def run_pass0a(self, limit: int = 0) -> dict:
        """Extract obligation candidates from rich controls.

        Processes controls that have NOT been decomposed yet
        (no rows in obligation_candidates for that control).

        All candidates of one control are built first and then written inside
        a SAVEPOINT. A failure therefore never leaves a control half
        decomposed, and never invalidates the work done for other controls.

        Args:
            limit: Maximum number of controls to process; ``0`` means no limit.

        Returns:
            Counters: controls_processed, obligations_extracted,
            obligations_validated, obligations_rejected,
            controls_skipped_empty, errors.
        """
        from compliance.services.obligation_extractor import _llm_ollama

        # Find rich controls not yet decomposed
        query = """
            SELECT cc.id, cc.control_id, cc.title, cc.objective,
                   cc.requirements, cc.test_procedure,
                   cc.source_citation, cc.category
            FROM canonical_controls cc
            WHERE cc.release_state NOT IN ('deprecated')
              AND cc.parent_control_uuid IS NULL
              AND NOT EXISTS (
                  SELECT 1 FROM obligation_candidates oc
                  WHERE oc.parent_control_uuid = cc.id
              )
            ORDER BY cc.created_at
        """
        if limit > 0:
            # int() guarantees that only a number is ever spliced into the SQL.
            query += f" LIMIT {int(limit)}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {
            "controls_processed": 0,
            "obligations_extracted": 0,
            "obligations_validated": 0,
            "obligations_rejected": 0,
            "controls_skipped_empty": 0,
            "errors": 0,
        }

        for row in rows:
            control_uuid = str(row[0])
            control_id = row[1] or ""
            title = row[2] or ""
            objective = row[3] or ""

            # Format requirements/test_procedure/citation for the prompt
            req_str = _format_field(row[4] or "")
            test_str = _format_field(row[5] or "")
            source_str = _format_citation(row[6] or "")

            if not title and not objective and not req_str:
                stats["controls_skipped_empty"] += 1
                continue

            try:
                prompt = _build_pass0a_prompt(
                    title=title,
                    objective=objective,
                    requirements=req_str,
                    test_procedure=test_str,
                    source_ref=source_str,
                )
                llm_response = await _llm_ollama(
                    prompt=prompt,
                    system_prompt=_PASS0A_SYSTEM_PROMPT,
                )

                # Keep JSON objects only; anything else cannot be read below.
                raw_obligations = [
                    item
                    for item in _parse_json_array(llm_response)
                    if isinstance(item, dict)
                ]
                if not raw_obligations:
                    # Fallback: treat the whole control as one obligation
                    raw_obligations = [{
                        "obligation_text": objective or title,
                        "action": "sicherstellen",
                        "object": title,
                        "condition": None,
                        "normative_strength": "must",
                        "is_test_obligation": False,
                        "is_reporting_obligation": False,
                    }]

                candidates = [
                    self._build_obligation_candidate(
                        raw, control_id, control_uuid, seq
                    )
                    for seq, raw in enumerate(raw_obligations, start=1)
                ]

                # All-or-nothing per control: roll back to the savepoint on
                # error so the control is picked up again on the next run.
                with self.db.begin_nested():
                    for cand in candidates:
                        self._write_obligation_candidate(cand)

                validated = sum(
                    1 for cand in candidates
                    if cand.release_state == "validated"
                )
                stats["obligations_validated"] += validated
                stats["obligations_rejected"] += len(candidates) - validated
                stats["obligations_extracted"] += len(candidates)
                stats["controls_processed"] += 1

            except Exception as e:
                logger.error(
                    "Pass 0a failed for %s: %s", control_id, e, exc_info=True
                )
                stats["errors"] += 1

        self.db.commit()
        logger.info("Pass 0a: %s", stats)
        return stats

    @staticmethod
    def _coerce_text(value) -> str:
        """Return ``value`` as str; ``None`` becomes the empty string."""
        if value is None:
            return ""
        return value if isinstance(value, str) else str(value)

    def _build_obligation_candidate(
        self, raw: dict, control_id: str, control_uuid: str, seq: int,
    ) -> ObligationCandidate:
        """Turn one raw LLM obligation into a quality-gated candidate.

        Missing or ``null`` text fields become empty strings, so a sloppy LLM
        answer leads to a rejected candidate rather than an exception.
        """
        condition = raw.get("condition")
        if condition is not None and not isinstance(condition, str):
            condition = str(condition)

        cand = ObligationCandidate(
            candidate_id=f"OC-{control_id}-{seq:02d}",
            parent_control_uuid=control_uuid,
            obligation_text=self._coerce_text(raw.get("obligation_text")),
            action=self._coerce_text(raw.get("action")),
            object_=self._coerce_text(raw.get("object")),
            condition=condition,
            normative_strength=self._coerce_text(
                raw.get("normative_strength") or "must"
            ),
            is_test_obligation=bool(raw.get("is_test_obligation", False)),
            is_reporting_obligation=bool(
                raw.get("is_reporting_obligation", False)
            ),
        )

        # Auto-detect test/reporting if LLM missed it
        if not cand.is_test_obligation and _TEST_RE.search(cand.obligation_text):
            cand.is_test_obligation = True
        if (
            not cand.is_reporting_obligation
            and _REPORTING_RE.search(cand.obligation_text)
        ):
            cand.is_reporting_obligation = True

        # Quality gate
        flags = quality_gate(cand)
        cand.quality_flags = flags
        cand.extraction_confidence = _compute_extraction_confidence(flags)
        cand.release_state = (
            "validated" if passes_quality_gate(flags) else "rejected"
        )
        return cand

    # -------------------------------------------------------------------
    # Pass 0b: Atomic Control Composition
    # -------------------------------------------------------------------

    async def run_pass0b(self, limit: int = 0) -> dict:
        """Compose atomic controls from validated obligation candidates.

        Processes obligation_candidates with release_state='validated'
        that don't have a corresponding atomic control yet.

        The insert of the atomic control and the switch of the candidate to
        'composed' happen inside one SAVEPOINT, so they succeed or fail
        together and a failure does not affect other candidates.

        Args:
            limit: Maximum number of candidates to process; ``0`` means no
                limit.

        Returns:
            Counters: candidates_processed, controls_created, llm_failures
            (template fallback was used), errors.
        """
        from compliance.services.obligation_extractor import _llm_ollama

        # NOTE(review): the NOT EXISTS clause matches on the atomic title
        # containing the first 20 characters of the action. With an empty
        # action the LIKE pattern is '%%', which looks like it would match any
        # existing pass0b sibling and skip the candidate — confirm intent.
        query = """
            SELECT oc.id, oc.candidate_id, oc.parent_control_uuid,
                   oc.obligation_text, oc.action, oc.object,
                   oc.is_test_obligation, oc.is_reporting_obligation,
                   cc.title AS parent_title,
                   cc.category AS parent_category,
                   cc.source_citation AS parent_citation,
                   cc.severity AS parent_severity,
                   cc.control_id AS parent_control_id
            FROM obligation_candidates oc
            JOIN canonical_controls cc ON cc.id = oc.parent_control_uuid
            WHERE oc.release_state = 'validated'
              AND NOT EXISTS (
                  SELECT 1 FROM canonical_controls ac
                  WHERE ac.parent_control_uuid = oc.parent_control_uuid
                    AND ac.decomposition_method = 'pass0b'
                    AND ac.title LIKE '%' || LEFT(oc.action, 20) || '%'
              )
        """
        if limit > 0:
            # int() guarantees that only a number is ever spliced into the SQL.
            query += f" LIMIT {int(limit)}"

        rows = self.db.execute(text(query)).fetchall()

        stats = {
            "candidates_processed": 0,
            "controls_created": 0,
            "llm_failures": 0,
            "errors": 0,
        }

        for row in rows:
            oc_id = str(row[0])
            candidate_id = row[1] or ""
            parent_uuid = str(row[2])
            obligation_text = row[3] or ""
            action = row[4] or ""
            object_ = row[5] or ""
            is_test = row[6]
            is_reporting = row[7]
            parent_title = row[8] or ""
            parent_category = row[9] or ""
            parent_citation = row[10] or ""
            parent_severity = row[11] or "medium"
            parent_control_id = row[12] or ""

            source_str = _format_citation(parent_citation)

            try:
                prompt = _build_pass0b_prompt(
                    obligation_text=obligation_text,
                    action=action,
                    object_=object_,
                    parent_title=parent_title,
                    parent_category=parent_category,
                    source_ref=source_str,
                )
                llm_response = await _llm_ollama(
                    prompt=prompt,
                    system_prompt=_PASS0B_SYSTEM_PROMPT,
                )
                parsed = _parse_json_object(llm_response)

                llm_title = parsed.get("title") if parsed else None
                if not isinstance(llm_title, str) or not llm_title:
                    # Template fallback — no usable LLM answer
                    atomic = _template_fallback(
                        obligation_text=obligation_text,
                        action=action,
                        object_=object_,
                        parent_title=parent_title,
                        parent_severity=parent_severity,
                        parent_category=parent_category,
                        is_test=is_test,
                        is_reporting=is_reporting,
                    )
                    stats["llm_failures"] += 1
                else:
                    # null / non-string values fall back to parent data or
                    # empty text instead of raising.
                    atomic = AtomicControlCandidate(
                        title=llm_title[:200],
                        objective=self._coerce_text(
                            parsed.get("objective")
                        )[:2000],
                        requirements=_ensure_list(
                            parsed.get("requirements", [])
                        ),
                        test_procedure=_ensure_list(
                            parsed.get("test_procedure", [])
                        ),
                        evidence=_ensure_list(parsed.get("evidence", [])),
                        severity=_normalize_severity(
                            self._coerce_text(
                                parsed.get("severity") or parent_severity
                            )
                        ),
                        category=self._coerce_text(
                            parsed.get("category") or parent_category
                        ),
                    )

                atomic.parent_control_uuid = parent_uuid
                atomic.obligation_candidate_id = candidate_id

                with self.db.begin_nested():
                    # Generate control_id from parent
                    seq = self._next_atomic_seq(parent_control_id)
                    atomic.candidate_id = f"{parent_control_id}-A{seq:02d}"

                    # Write to canonical_controls
                    self._write_atomic_control(atomic, parent_uuid, candidate_id)

                    # Mark obligation candidate as composed
                    self.db.execute(
                        text("""
                            UPDATE obligation_candidates
                            SET release_state = 'composed'
                            WHERE id = CAST(:oc_id AS uuid)
                        """),
                        {"oc_id": oc_id},
                    )

                stats["controls_created"] += 1
                stats["candidates_processed"] += 1

            except Exception as e:
                logger.error(
                    "Pass 0b failed for %s: %s", candidate_id, e, exc_info=True
                )
                stats["errors"] += 1

        self.db.commit()
        logger.info("Pass 0b: %s", stats)
        return stats

    # -------------------------------------------------------------------
    # Decomposition Status
    # -------------------------------------------------------------------

    def decomposition_status(self) -> dict:
        """Return decomposition progress.

        Returns:
            Row counts for rich controls, decomposed controls, candidates per
            state and atomic controls, plus ``decomposition_pct`` (decomposed
            / rich controls) and ``composition_pct`` (composed / validated),
            both rounded to one decimal. A zero denominator is treated as 1.
        """
        row = self.db.execute(text("""
            SELECT
                (SELECT count(*) FROM canonical_controls
                 WHERE parent_control_uuid IS NULL
                   AND release_state NOT IN ('deprecated')) AS rich_controls,
                (SELECT count(DISTINCT parent_control_uuid) FROM obligation_candidates) AS decomposed_controls,
                (SELECT count(*) FROM obligation_candidates) AS total_candidates,
                (SELECT count(*) FROM obligation_candidates WHERE release_state = 'validated') AS validated,
                (SELECT count(*) FROM obligation_candidates WHERE release_state = 'rejected') AS rejected,
                (SELECT count(*) FROM obligation_candidates WHERE release_state = 'composed') AS composed,
                (SELECT count(*) FROM canonical_controls WHERE parent_control_uuid IS NOT NULL) AS atomic_controls
        """)).fetchone()

        return {
            "rich_controls": row[0],
            "decomposed_controls": row[1],
            "total_candidates": row[2],
            "validated": row[3],
            "rejected": row[4],
            "composed": row[5],
            "atomic_controls": row[6],
            "decomposition_pct": round(row[1] / max(row[0], 1) * 100, 1),
            "composition_pct": round(row[5] / max(row[3], 1) * 100, 1),
        }

    # -------------------------------------------------------------------
    # DB Writers
    # -------------------------------------------------------------------

    def _write_obligation_candidate(self, cand: ObligationCandidate) -> None:
        """Insert an obligation candidate into the DB.

        ``quality_flags`` is stored as a JSON string. The caller commits.
        """
        self.db.execute(
            text("""
                INSERT INTO obligation_candidates (
                    parent_control_uuid, candidate_id,
                    obligation_text, action, object, condition,
                    normative_strength, is_test_obligation,
                    is_reporting_obligation, extraction_confidence,
                    quality_flags, release_state
                ) VALUES (
                    CAST(:parent_uuid AS uuid), :candidate_id,
                    :obligation_text, :action, :object, :condition,
                    :normative_strength, :is_test, :is_reporting,
                    :confidence, :quality_flags, :release_state
                )
            """),
            {
                "parent_uuid": cand.parent_control_uuid,
                "candidate_id": cand.candidate_id,
                "obligation_text": cand.obligation_text,
                "action": cand.action,
                "object": cand.object_,
                "condition": cand.condition,
                "normative_strength": cand.normative_strength,
                "is_test": cand.is_test_obligation,
                "is_reporting": cand.is_reporting_obligation,
                "confidence": cand.extraction_confidence,
                "quality_flags": json.dumps(cand.quality_flags),
                "release_state": cand.release_state,
            },
        )

    def _write_atomic_control(
        self, atomic: AtomicControlCandidate,
        parent_uuid: str, candidate_id: str,
    ) -> None:
        """Insert an atomic control into canonical_controls.

        The row is created with release_state 'draft' and decomposition_method
        'pass0b'. List fields and the generation metadata are stored as JSON
        strings. The caller commits.

        Args:
            atomic: The control to insert; ``atomic.candidate_id`` becomes the
                row's ``control_id``.
            parent_uuid: UUID of the Rich Control the atomic control belongs to.
            candidate_id: Id of the obligation candidate, recorded in
                ``generation_metadata`` as ``decomposition_source``.
        """
        self.db.execute(
            text("""
                INSERT INTO canonical_controls (
                    control_id, title, objective, requirements,
                    test_procedure, evidence, severity, category,
                    release_state, parent_control_uuid,
                    decomposition_method,
                    generation_metadata
                ) VALUES (
                    :control_id, :title, :objective,
                    :requirements, :test_procedure, :evidence,
                    :severity, :category, 'draft',
                    CAST(:parent_uuid AS uuid), 'pass0b',
                    :gen_meta
                )
            """),
            {
                "control_id": atomic.candidate_id,
                "title": atomic.title,
                "objective": atomic.objective,
                "requirements": json.dumps(atomic.requirements),
                "test_procedure": json.dumps(atomic.test_procedure),
                "evidence": json.dumps(atomic.evidence),
                "severity": atomic.severity,
                "category": atomic.category,
                "parent_uuid": parent_uuid,
                "gen_meta": json.dumps({
                    "decomposition_source": candidate_id,
                    "decomposition_method": "pass0b",
                }),
            },
        )

    def _next_atomic_seq(self, parent_control_id: str) -> int:
        """Get the next sequence number for atomic controls under a parent.

        The number is the current count of child rows of the parent plus one.
        """
        result = self.db.execute(
            text("""
                SELECT count(*) FROM canonical_controls
                WHERE parent_control_uuid = (
                    SELECT id FROM canonical_controls
                    WHERE control_id = :parent_id
                    LIMIT 1
                )
            """),
            {"parent_id": parent_control_id},
        ).fetchone()
        return (result[0] if result else 0) + 1
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _format_field(value) -> str:
|
|
"""Format a requirements/test_procedure field for the LLM prompt."""
|
|
if not value:
|
|
return ""
|
|
if isinstance(value, str):
|
|
try:
|
|
parsed = json.loads(value)
|
|
if isinstance(parsed, list):
|
|
return "\n".join(f"- {item}" for item in parsed)
|
|
return value
|
|
except (json.JSONDecodeError, TypeError):
|
|
return value
|
|
if isinstance(value, list):
|
|
return "\n".join(f"- {item}" for item in value)
|
|
return str(value)
|
|
|
|
|
|
def _format_citation(citation) -> str:
|
|
"""Format source_citation for display."""
|
|
if not citation:
|
|
return ""
|
|
if isinstance(citation, str):
|
|
try:
|
|
c = json.loads(citation)
|
|
if isinstance(c, dict):
|
|
parts = []
|
|
if c.get("source"):
|
|
parts.append(c["source"])
|
|
if c.get("article"):
|
|
parts.append(c["article"])
|
|
if c.get("paragraph"):
|
|
parts.append(c["paragraph"])
|
|
return " ".join(parts) if parts else citation
|
|
except (json.JSONDecodeError, TypeError):
|
|
return citation
|
|
return str(citation)
|
|
|
|
|
|
def _compute_extraction_confidence(flags: dict) -> float:
|
|
"""Compute confidence score from quality flags."""
|
|
score = 0.0
|
|
weights = {
|
|
"has_normative_signal": 0.30,
|
|
"single_action": 0.20,
|
|
"not_rationale": 0.20,
|
|
"not_evidence_only": 0.15,
|
|
"min_length": 0.10,
|
|
"has_parent_link": 0.05,
|
|
}
|
|
for flag, weight in weights.items():
|
|
if flags.get(flag, False):
|
|
score += weight
|
|
return round(score, 2)
|
|
|
|
|
|
def _normalize_severity(val: str) -> str:
|
|
"""Normalize severity value."""
|
|
val = (val or "medium").lower().strip()
|
|
if val in ("critical", "high", "medium", "low"):
|
|
return val
|
|
return "medium"
|
|
|
|
|
|
def _template_fallback(
    obligation_text: str, action: str, object_: str,
    parent_title: str, parent_severity: str, parent_category: str,
    is_test: bool, is_reporting: bool,
) -> AtomicControlCandidate:
    """Build an atomic control candidate from fixed templates.

    Used when the LLM answer is unusable. The title, test procedure and
    evidence depend on the obligation type; a test obligation takes
    precedence over a reporting obligation. The obligation text becomes both
    the objective and the only requirement; severity and category come from
    the parent control.
    """
    object_short = object_[:60]
    action_short = action[:60]

    if is_test:
        title = f"Test: {object_short}" if object_ else f"Test: {action_short}"
        test_proc = [f"Prüfung der {object_ or action}"]
        evidence = ["Testprotokoll", "Prüfbericht"]
    elif is_reporting:
        if object_:
            title = f"Meldepflicht: {object_short}"
        else:
            title = f"Meldung: {action_short}"
        test_proc = ["Prüfung des Meldeprozesses", "Stichprobe gemeldeter Vorfälle"]
        evidence = ["Meldeprozess-Dokumentation", "Meldeformulare"]
    else:
        if object_:
            title = f"{action.capitalize()}: {object_short}"
        else:
            # Without an object the parent's title is the best label available.
            title = parent_title[:80]
        test_proc = [f"Prüfung der {action}"]
        evidence = ["Dokumentation", "Konfigurationsnachweis"]

    requirements = [obligation_text] if obligation_text else []

    return AtomicControlCandidate(
        title=title[:200],
        objective=obligation_text[:2000],
        requirements=requirements,
        test_procedure=test_proc,
        evidence=evidence,
        severity=_normalize_severity(parent_severity),
        category=parent_category,
    )
|