Files
breakpilot-compliance/backend-compliance/compliance/services/obligation_extractor.py
Benjamin Admin 825e070ed9
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 47s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 24s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
feat(multi-layer): complete Multi-Layer Control Architecture (Phases 1-8 + Pass 0)
Implements the full Multi-Layer Control Architecture for migrating ~25,000
Rich Controls into atomic, deduplicated Master Controls with full traceability.

Architecture: Legal Source → Obligation → Control Pattern → Master Control → Customer Instance

New services:
- ObligationExtractor: 3-tier extraction (exact → embedding → LLM)
- PatternMatcher: 2-tier matching (keyword + embedding + domain-bonus)
- ControlComposer: Pattern + Obligation → Master Control
- PipelineAdapter: Pipeline integration + Migration Passes 1-5
- DecompositionPass: Pass 0a/0b — Rich Control → atomic Controls
- CrosswalkRoutes: 15 API endpoints under /v1/canonical/

New DB schema:
- Migration 060: obligation_extractions, control_patterns, crosswalk_matrix
- Migration 061: obligation_candidates, parent_control_uuid tracking

Pattern Library: 50 YAML patterns (30 core + 20 IT-security)
Go SDK: Pattern loader with YAML validation and indexing
Documentation: MkDocs updated with full architecture overview

500 Python tests passing across all components.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-17 09:00:37 +01:00

563 lines
18 KiB
Python

"""Obligation Extractor — 3-Tier Chunk-to-Obligation Linking.
Maps RAG chunks to obligations from the v2 obligation framework using
three tiers (fastest first):
Tier 1: EXACT MATCH — regulation_code + article → obligation_id (~40%)
Tier 2: EMBEDDING — chunk text vs. obligation descriptions (~30%)
Tier 3: LLM EXTRACT — local Ollama extracts obligation text (~25%)
Part of the Multi-Layer Control Architecture (Phase 4 of 8).
"""
import json
import logging
import os
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Optional
import httpx
logger = logging.getLogger(__name__)
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
# Embedding similarity thresholds for Tier 2
EMBEDDING_MATCH_THRESHOLD = 0.80
EMBEDDING_CANDIDATE_THRESHOLD = 0.60
# ---------------------------------------------------------------------------
# Regulation code mapping: RAG chunk codes → obligation file regulation IDs
# ---------------------------------------------------------------------------
_REGULATION_CODE_TO_ID = {
# DSGVO
"eu_2016_679": "dsgvo",
"dsgvo": "dsgvo",
"gdpr": "dsgvo",
# AI Act
"eu_2024_1689": "ai_act",
"ai_act": "ai_act",
"aiact": "ai_act",
# NIS2
"eu_2022_2555": "nis2",
"nis2": "nis2",
"bsig": "nis2",
# BDSG
"bdsg": "bdsg",
# TTDSG
"ttdsg": "ttdsg",
# DSA
"eu_2022_2065": "dsa",
"dsa": "dsa",
# Data Act
"eu_2023_2854": "data_act",
"data_act": "data_act",
# EU Machinery
"eu_2023_1230": "eu_machinery",
"eu_machinery": "eu_machinery",
# DORA
"eu_2022_2554": "dora",
"dora": "dora",
}
@dataclass
class ObligationMatch:
"""Result of obligation extraction."""
obligation_id: Optional[str] = None
obligation_title: Optional[str] = None
obligation_text: Optional[str] = None
method: str = "none" # exact_match | embedding_match | llm_extracted | inferred
confidence: float = 0.0
regulation_id: Optional[str] = None # e.g. "dsgvo"
def to_dict(self) -> dict:
return {
"obligation_id": self.obligation_id,
"obligation_title": self.obligation_title,
"obligation_text": self.obligation_text,
"method": self.method,
"confidence": self.confidence,
"regulation_id": self.regulation_id,
}
@dataclass
class _ObligationEntry:
"""Internal representation of a loaded obligation."""
id: str
title: str
description: str
regulation_id: str
articles: list[str] = field(default_factory=list) # normalized: ["art. 30", "§ 38"]
embedding: list[float] = field(default_factory=list)
class ObligationExtractor:
"""3-Tier obligation extraction from RAG chunks.
Usage::
extractor = ObligationExtractor()
await extractor.initialize() # loads obligations + embeddings
match = await extractor.extract(
chunk_text="...",
regulation_code="eu_2016_679",
article="Art. 30",
paragraph="Abs. 1",
)
"""
def __init__(self):
self._article_lookup: dict[str, list[str]] = {} # "dsgvo/art. 30" → ["DSGVO-OBL-001"]
self._obligations: dict[str, _ObligationEntry] = {} # id → entry
self._obligation_embeddings: list[list[float]] = []
self._obligation_ids: list[str] = []
self._initialized = False
async def initialize(self) -> None:
"""Load all obligations from v2 JSON files and compute embeddings."""
if self._initialized:
return
self._load_obligations()
await self._compute_embeddings()
self._initialized = True
logger.info(
"ObligationExtractor initialized: %d obligations, %d article lookups, %d embeddings",
len(self._obligations),
len(self._article_lookup),
sum(1 for e in self._obligation_embeddings if e),
)
async def extract(
self,
chunk_text: str,
regulation_code: str,
article: Optional[str] = None,
paragraph: Optional[str] = None,
) -> ObligationMatch:
"""Extract obligation from a chunk using 3-tier strategy."""
if not self._initialized:
await self.initialize()
reg_id = _normalize_regulation(regulation_code)
# Tier 1: Exact match via article lookup
if article:
match = self._tier1_exact(reg_id, article)
if match:
return match
# Tier 2: Embedding similarity
match = await self._tier2_embedding(chunk_text, reg_id)
if match:
return match
# Tier 3: LLM extraction
match = await self._tier3_llm(chunk_text, regulation_code, article)
return match
# -----------------------------------------------------------------------
# Tier 1: Exact Match
# -----------------------------------------------------------------------
def _tier1_exact(self, reg_id: Optional[str], article: str) -> Optional[ObligationMatch]:
"""Look up obligation by regulation + article."""
if not reg_id:
return None
norm_article = _normalize_article(article)
key = f"{reg_id}/{norm_article}"
obl_ids = self._article_lookup.get(key)
if not obl_ids:
return None
# Take the first match (highest priority)
obl_id = obl_ids[0]
entry = self._obligations.get(obl_id)
if not entry:
return None
return ObligationMatch(
obligation_id=entry.id,
obligation_title=entry.title,
obligation_text=entry.description,
method="exact_match",
confidence=1.0,
regulation_id=reg_id,
)
# -----------------------------------------------------------------------
# Tier 2: Embedding Match
# -----------------------------------------------------------------------
async def _tier2_embedding(
self, chunk_text: str, reg_id: Optional[str]
) -> Optional[ObligationMatch]:
"""Find nearest obligation by embedding similarity."""
if not self._obligation_embeddings:
return None
chunk_embedding = await _get_embedding(chunk_text[:2000])
if not chunk_embedding:
return None
best_idx = -1
best_score = 0.0
for i, obl_emb in enumerate(self._obligation_embeddings):
if not obl_emb:
continue
# Prefer same-regulation matches
obl_id = self._obligation_ids[i]
entry = self._obligations.get(obl_id)
score = _cosine_sim(chunk_embedding, obl_emb)
# Domain bonus: +0.05 if same regulation
if entry and reg_id and entry.regulation_id == reg_id:
score += 0.05
if score > best_score:
best_score = score
best_idx = i
if best_idx < 0:
return None
# Remove domain bonus for threshold comparison
raw_score = best_score
obl_id = self._obligation_ids[best_idx]
entry = self._obligations.get(obl_id)
if entry and reg_id and entry.regulation_id == reg_id:
raw_score -= 0.05
if raw_score >= EMBEDDING_MATCH_THRESHOLD:
return ObligationMatch(
obligation_id=entry.id if entry else obl_id,
obligation_title=entry.title if entry else None,
obligation_text=entry.description if entry else None,
method="embedding_match",
confidence=round(min(raw_score, 1.0), 3),
regulation_id=entry.regulation_id if entry else reg_id,
)
return None
# -----------------------------------------------------------------------
# Tier 3: LLM Extraction
# -----------------------------------------------------------------------
async def _tier3_llm(
self, chunk_text: str, regulation_code: str, article: Optional[str]
) -> ObligationMatch:
"""Use local LLM to extract the obligation from the chunk."""
prompt = f"""Analysiere den folgenden Gesetzestext und extrahiere die zentrale rechtliche Pflicht.
Text:
{chunk_text[:3000]}
Quelle: {regulation_code} {article or ''}
Antworte NUR als JSON:
{{
"obligation_text": "Die zentrale Pflicht in einem Satz",
"actor": "Wer muss handeln (z.B. Verantwortlicher, Auftragsverarbeiter)",
"action": "Was muss getan werden",
"normative_strength": "muss|soll|kann"
}}"""
system_prompt = (
"Du bist ein Rechtsexperte fuer EU-Datenschutz- und Digitalrecht. "
"Extrahiere die zentrale rechtliche Pflicht aus Gesetzestexten. "
"Antworte ausschliesslich als JSON."
)
result_text = await _llm_ollama(prompt, system_prompt)
if not result_text:
return ObligationMatch(
method="llm_extracted",
confidence=0.0,
regulation_id=_normalize_regulation(regulation_code),
)
parsed = _parse_json(result_text)
obligation_text = parsed.get("obligation_text", result_text[:500])
return ObligationMatch(
obligation_id=None,
obligation_title=None,
obligation_text=obligation_text,
method="llm_extracted",
confidence=0.60,
regulation_id=_normalize_regulation(regulation_code),
)
# -----------------------------------------------------------------------
# Initialization helpers
# -----------------------------------------------------------------------
def _load_obligations(self) -> None:
"""Load all obligation files from v2 framework."""
v2_dir = _find_obligations_dir()
if not v2_dir:
logger.warning("Obligations v2 directory not found — Tier 1 disabled")
return
manifest_path = v2_dir / "_manifest.json"
if not manifest_path.exists():
logger.warning("Manifest not found at %s", manifest_path)
return
with open(manifest_path) as f:
manifest = json.load(f)
for reg_info in manifest.get("regulations", []):
reg_id = reg_info["id"]
reg_file = v2_dir / reg_info["file"]
if not reg_file.exists():
logger.warning("Regulation file not found: %s", reg_file)
continue
with open(reg_file) as f:
data = json.load(f)
for obl in data.get("obligations", []):
obl_id = obl["id"]
entry = _ObligationEntry(
id=obl_id,
title=obl.get("title", ""),
description=obl.get("description", ""),
regulation_id=reg_id,
)
# Build article lookup from legal_basis
for basis in obl.get("legal_basis", []):
article_raw = basis.get("article", "")
if article_raw:
norm_art = _normalize_article(article_raw)
key = f"{reg_id}/{norm_art}"
if key not in self._article_lookup:
self._article_lookup[key] = []
self._article_lookup[key].append(obl_id)
entry.articles.append(norm_art)
self._obligations[obl_id] = entry
logger.info(
"Loaded %d obligations from %d regulations",
len(self._obligations),
len(manifest.get("regulations", [])),
)
async def _compute_embeddings(self) -> None:
"""Compute embeddings for all obligation descriptions."""
if not self._obligations:
return
self._obligation_ids = list(self._obligations.keys())
texts = [
f"{self._obligations[oid].title}: {self._obligations[oid].description}"
for oid in self._obligation_ids
]
logger.info("Computing embeddings for %d obligations...", len(texts))
self._obligation_embeddings = await _get_embeddings_batch(texts)
valid = sum(1 for e in self._obligation_embeddings if e)
logger.info("Got %d/%d valid embeddings", valid, len(texts))
# -----------------------------------------------------------------------
# Stats
# -----------------------------------------------------------------------
def stats(self) -> dict:
"""Return initialization statistics."""
return {
"total_obligations": len(self._obligations),
"article_lookups": len(self._article_lookup),
"embeddings_valid": sum(1 for e in self._obligation_embeddings if e),
"regulations": list(
{e.regulation_id for e in self._obligations.values()}
),
"initialized": self._initialized,
}
# ---------------------------------------------------------------------------
# Module-level helpers (reusable by other modules)
# ---------------------------------------------------------------------------
def _normalize_regulation(regulation_code: str) -> Optional[str]:
"""Map a RAG regulation_code to obligation framework regulation ID."""
if not regulation_code:
return None
code = regulation_code.lower().strip()
# Direct lookup
if code in _REGULATION_CODE_TO_ID:
return _REGULATION_CODE_TO_ID[code]
# Prefix matching for families
for prefix, reg_id in [
("eu_2016_679", "dsgvo"),
("eu_2024_1689", "ai_act"),
("eu_2022_2555", "nis2"),
("eu_2022_2065", "dsa"),
("eu_2023_2854", "data_act"),
("eu_2023_1230", "eu_machinery"),
("eu_2022_2554", "dora"),
]:
if code.startswith(prefix):
return reg_id
return None
def _normalize_article(article: str) -> str:
"""Normalize article references for consistent lookup.
Examples:
"Art. 30""art. 30"
"§ 38 BDSG""§ 38"
"Article 10""art. 10"
"Art. 30 Abs. 1""art. 30"
"Artikel 35""art. 35"
"""
if not article:
return ""
s = article.strip()
# Remove trailing law name: "§ 38 BDSG" → "§ 38"
s = re.sub(r"\s+(DSGVO|BDSG|TTDSG|DSA|NIS2|DORA|AI.?Act)\s*$", "", s, flags=re.IGNORECASE)
# Remove paragraph references: "Art. 30 Abs. 1" → "Art. 30"
s = re.sub(r"\s+(Abs|Absatz|para|paragraph|lit|Satz)\.?\s+.*$", "", s, flags=re.IGNORECASE)
# Normalize "Article" / "Artikel" → "Art."
s = re.sub(r"^(Article|Artikel)\s+", "Art. ", s, flags=re.IGNORECASE)
return s.lower().strip()
def _cosine_sim(a: list[float], b: list[float]) -> float:
"""Compute cosine similarity between two vectors."""
if not a or not b or len(a) != len(b):
return 0.0
dot = sum(x * y for x, y in zip(a, b))
norm_a = sum(x * x for x in a) ** 0.5
norm_b = sum(x * x for x in b) ** 0.5
if norm_a == 0 or norm_b == 0:
return 0.0
return dot / (norm_a * norm_b)
def _find_obligations_dir() -> Optional[Path]:
"""Locate the obligations v2 directory."""
candidates = [
Path(__file__).resolve().parent.parent.parent.parent
/ "ai-compliance-sdk" / "policies" / "obligations" / "v2",
Path("/app/ai-compliance-sdk/policies/obligations/v2"),
Path("ai-compliance-sdk/policies/obligations/v2"),
]
for p in candidates:
if p.is_dir() and (p / "_manifest.json").exists():
return p
return None
async def _get_embedding(text: str) -> list[float]:
"""Get embedding vector for a single text."""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.post(
f"{EMBEDDING_URL}/embed",
json={"texts": [text]},
)
resp.raise_for_status()
embeddings = resp.json().get("embeddings", [])
return embeddings[0] if embeddings else []
except Exception:
return []
async def _get_embeddings_batch(
texts: list[str], batch_size: int = 32
) -> list[list[float]]:
"""Get embeddings for multiple texts in batches."""
all_embeddings: list[list[float]] = []
for i in range(0, len(texts), batch_size):
batch = texts[i : i + batch_size]
try:
async with httpx.AsyncClient(timeout=30.0) as client:
resp = await client.post(
f"{EMBEDDING_URL}/embed",
json={"texts": batch},
)
resp.raise_for_status()
embeddings = resp.json().get("embeddings", [])
all_embeddings.extend(embeddings)
except Exception as e:
logger.warning("Batch embedding failed for %d texts: %s", len(batch), e)
all_embeddings.extend([[] for _ in batch])
return all_embeddings
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"""Call local Ollama for LLM extraction."""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
payload = {
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"options": {"num_predict": 512},
"think": False,
}
try:
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
if resp.status_code != 200:
logger.error(
"Ollama chat failed %d: %s", resp.status_code, resp.text[:300]
)
return ""
data = resp.json()
return data.get("message", {}).get("content", "")
except Exception as e:
logger.warning("Ollama call failed: %s", e)
return ""
def _parse_json(text: str) -> dict:
"""Extract JSON from LLM response text."""
# Try direct parse
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Try extracting JSON block
match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
if match:
try:
return json.loads(match.group())
except json.JSONDecodeError:
pass
return {}