Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 47s
CI/CD / test-python-backend-compliance (push) Successful in 33s
CI/CD / test-python-document-crawler (push) Successful in 24s
CI/CD / test-python-dsms-gateway (push) Successful in 18s
CI/CD / validate-canonical-controls (push) Successful in 11s
CI/CD / Deploy (push) Has been skipped
Implements the full Multi-Layer Control Architecture for migrating ~25,000 Rich Controls into atomic, deduplicated Master Controls with full traceability. Architecture: Legal Source → Obligation → Control Pattern → Master Control → Customer Instance New services: - ObligationExtractor: 3-tier extraction (exact → embedding → LLM) - PatternMatcher: 2-tier matching (keyword + embedding + domain-bonus) - ControlComposer: Pattern + Obligation → Master Control - PipelineAdapter: Pipeline integration + Migration Passes 1-5 - DecompositionPass: Pass 0a/0b — Rich Control → atomic Controls - CrosswalkRoutes: 15 API endpoints under /v1/canonical/ New DB schema: - Migration 060: obligation_extractions, control_patterns, crosswalk_matrix - Migration 061: obligation_candidates, parent_control_uuid tracking Pattern Library: 50 YAML patterns (30 core + 20 IT-security) Go SDK: Pattern loader with YAML validation and indexing Documentation: MkDocs updated with full architecture overview 500 Python tests passing across all components. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
563 lines
18 KiB
Python
563 lines
18 KiB
Python
"""Obligation Extractor — 3-Tier Chunk-to-Obligation Linking.
|
|
|
|
Maps RAG chunks to obligations from the v2 obligation framework using
|
|
three tiers (fastest first):
|
|
|
|
Tier 1: EXACT MATCH — regulation_code + article → obligation_id (~40%)
|
|
Tier 2: EMBEDDING — chunk text vs. obligation descriptions (~30%)
|
|
Tier 3: LLM EXTRACT — local Ollama extracts obligation text (~25%)
|
|
|
|
Part of the Multi-Layer Control Architecture (Phase 4 of 8).
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from pathlib import Path
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Service endpoints and model configuration; all overridable via environment.
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
# Timeout in seconds for a single Tier-3 LLM call.
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

# Embedding similarity thresholds for Tier 2
EMBEDDING_MATCH_THRESHOLD = 0.80  # accept as confident embedding match at/above this
# NOTE(review): EMBEDDING_CANDIDATE_THRESHOLD is not referenced anywhere in this
# module — presumably consumed by a sibling component (candidate shortlisting);
# confirm before removing.
EMBEDDING_CANDIDATE_THRESHOLD = 0.60
|
|
|
|
# ---------------------------------------------------------------------------
# Regulation code mapping: RAG chunk codes → obligation file regulation IDs
# ---------------------------------------------------------------------------

# Keys are lowercase RAG regulation codes (CELEX-style IDs and common aliases);
# values are the regulation IDs used by the v2 obligation framework files.
# Codes not listed here fall back to prefix matching in _normalize_regulation().
_REGULATION_CODE_TO_ID = {
    # DSGVO (GDPR)
    "eu_2016_679": "dsgvo",
    "dsgvo": "dsgvo",
    "gdpr": "dsgvo",
    # AI Act
    "eu_2024_1689": "ai_act",
    "ai_act": "ai_act",
    "aiact": "ai_act",
    # NIS2 (German transposition via BSIG)
    "eu_2022_2555": "nis2",
    "nis2": "nis2",
    "bsig": "nis2",
    # BDSG
    "bdsg": "bdsg",
    # TTDSG
    "ttdsg": "ttdsg",
    # DSA
    "eu_2022_2065": "dsa",
    "dsa": "dsa",
    # Data Act
    "eu_2023_2854": "data_act",
    "data_act": "data_act",
    # EU Machinery
    "eu_2023_1230": "eu_machinery",
    "eu_machinery": "eu_machinery",
    # DORA
    "eu_2022_2554": "dora",
    "dora": "dora",
}
|
|
|
|
|
|
@dataclass
class ObligationMatch:
    """Outcome of a single obligation-extraction attempt.

    A miss is represented by the defaults: ``method`` stays ``"none"`` and
    ``confidence`` stays ``0.0``.
    """

    obligation_id: Optional[str] = None
    obligation_title: Optional[str] = None
    obligation_text: Optional[str] = None
    method: str = "none"  # exact_match | embedding_match | llm_extracted | inferred
    confidence: float = 0.0
    regulation_id: Optional[str] = None  # e.g. "dsgvo"

    def to_dict(self) -> dict:
        """Serialize the match to a plain dict (e.g. for JSON responses)."""
        return dict(
            obligation_id=self.obligation_id,
            obligation_title=self.obligation_title,
            obligation_text=self.obligation_text,
            method=self.method,
            confidence=self.confidence,
            regulation_id=self.regulation_id,
        )
|
|
|
|
|
|
@dataclass
class _ObligationEntry:
    """Internal representation of a loaded obligation."""

    id: str
    title: str
    description: str
    regulation_id: str
    # Normalized legal-basis article references, e.g. ["art. 30", "§ 38"].
    articles: list[str] = field(default_factory=list)
    # NOTE(review): this field appears unused in this module — embeddings are
    # stored in ObligationExtractor._obligation_embeddings instead; confirm
    # whether another module populates it before removing.
    embedding: list[float] = field(default_factory=list)
|
|
|
|
|
|
class ObligationExtractor:
    """3-Tier obligation extraction from RAG chunks.

    Tiers are tried fastest-first: exact article lookup, then embedding
    similarity against loaded obligation descriptions, then LLM extraction
    via local Ollama.

    Usage::

        extractor = ObligationExtractor()
        await extractor.initialize()  # loads obligations + embeddings

        match = await extractor.extract(
            chunk_text="...",
            regulation_code="eu_2016_679",
            article="Art. 30",
            paragraph="Abs. 1",
        )
    """

    def __init__(self) -> None:
        # "reg_id/normalized article" → obligation IDs,
        # e.g. "dsgvo/art. 30" → ["DSGVO-OBL-001"]; built by _load_obligations().
        self._article_lookup: dict[str, list[str]] = {}
        # All loaded obligations keyed by obligation ID.
        self._obligations: dict[str, _ObligationEntry] = {}
        # Parallel lists: _obligation_embeddings[i] belongs to _obligation_ids[i].
        # An empty inner list marks an obligation whose embedding failed.
        self._obligation_embeddings: list[list[float]] = []
        self._obligation_ids: list[str] = []
        self._initialized = False

    async def initialize(self) -> None:
        """Load all obligations from v2 JSON files and compute embeddings.

        Idempotent: subsequent calls are no-ops once initialization succeeded.
        NOTE(review): concurrent first calls are not guarded by a lock —
        assumes single-task startup; confirm against callers.
        """
        if self._initialized:
            return

        self._load_obligations()
        await self._compute_embeddings()
        self._initialized = True
        logger.info(
            "ObligationExtractor initialized: %d obligations, %d article lookups, %d embeddings",
            len(self._obligations),
            len(self._article_lookup),
            sum(1 for e in self._obligation_embeddings if e),
        )

    async def extract(
        self,
        chunk_text: str,
        regulation_code: str,
        article: Optional[str] = None,
        paragraph: Optional[str] = None,
    ) -> ObligationMatch:
        """Extract obligation from a chunk using the 3-tier strategy.

        Always returns an ObligationMatch; Tier 3 is the unconditional
        fallback (its match may carry confidence 0.0 on LLM failure).
        NOTE(review): ``paragraph`` is accepted but currently unused —
        presumably kept for interface stability; confirm with callers.
        """
        # Lazy initialization for callers that skipped initialize().
        if not self._initialized:
            await self.initialize()

        reg_id = _normalize_regulation(regulation_code)

        # Tier 1: Exact match via article lookup (only possible with an article ref)
        if article:
            match = self._tier1_exact(reg_id, article)
            if match:
                return match

        # Tier 2: Embedding similarity
        match = await self._tier2_embedding(chunk_text, reg_id)
        if match:
            return match

        # Tier 3: LLM extraction (always produces a result object)
        match = await self._tier3_llm(chunk_text, regulation_code, article)
        return match

    # -----------------------------------------------------------------------
    # Tier 1: Exact Match
    # -----------------------------------------------------------------------

    def _tier1_exact(self, reg_id: Optional[str], article: str) -> Optional[ObligationMatch]:
        """Look up obligation by regulation + article; None if no exact hit."""
        if not reg_id:
            return None

        norm_article = _normalize_article(article)
        key = f"{reg_id}/{norm_article}"

        obl_ids = self._article_lookup.get(key)
        if not obl_ids:
            return None

        # Take the first match (highest priority — insertion order from the
        # obligation files).
        obl_id = obl_ids[0]
        entry = self._obligations.get(obl_id)
        if not entry:
            return None

        # Exact lookup hits get full confidence.
        return ObligationMatch(
            obligation_id=entry.id,
            obligation_title=entry.title,
            obligation_text=entry.description,
            method="exact_match",
            confidence=1.0,
            regulation_id=reg_id,
        )

    # -----------------------------------------------------------------------
    # Tier 2: Embedding Match
    # -----------------------------------------------------------------------

    async def _tier2_embedding(
        self, chunk_text: str, reg_id: Optional[str]
    ) -> Optional[ObligationMatch]:
        """Find the nearest obligation by embedding similarity.

        Same-regulation obligations get a +0.05 bonus when *selecting* the
        best candidate, but the bonus is removed again before comparing
        against EMBEDDING_MATCH_THRESHOLD, so the acceptance decision is
        based on raw cosine similarity only.
        """
        if not self._obligation_embeddings:
            return None

        # Truncate the chunk to keep the embedding request bounded.
        chunk_embedding = await _get_embedding(chunk_text[:2000])
        if not chunk_embedding:
            return None

        best_idx = -1
        best_score = 0.0

        for i, obl_emb in enumerate(self._obligation_embeddings):
            if not obl_emb:
                # Obligation whose embedding computation failed — skip.
                continue
            # Prefer same-regulation matches
            obl_id = self._obligation_ids[i]
            entry = self._obligations.get(obl_id)
            score = _cosine_sim(chunk_embedding, obl_emb)

            # Domain bonus: +0.05 if same regulation
            if entry and reg_id and entry.regulation_id == reg_id:
                score += 0.05

            if score > best_score:
                best_score = score
                best_idx = i

        if best_idx < 0:
            return None

        # Remove domain bonus for threshold comparison (mirror of the
        # condition that added it above).
        raw_score = best_score
        obl_id = self._obligation_ids[best_idx]
        entry = self._obligations.get(obl_id)
        if entry and reg_id and entry.regulation_id == reg_id:
            raw_score -= 0.05

        if raw_score >= EMBEDDING_MATCH_THRESHOLD:
            return ObligationMatch(
                obligation_id=entry.id if entry else obl_id,
                obligation_title=entry.title if entry else None,
                obligation_text=entry.description if entry else None,
                method="embedding_match",
                # Cap at 1.0 defensively; round for stable serialization.
                confidence=round(min(raw_score, 1.0), 3),
                regulation_id=entry.regulation_id if entry else reg_id,
            )

        return None

    # -----------------------------------------------------------------------
    # Tier 3: LLM Extraction
    # -----------------------------------------------------------------------

    async def _tier3_llm(
        self, chunk_text: str, regulation_code: str, article: Optional[str]
    ) -> ObligationMatch:
        """Use the local LLM to extract the obligation from the chunk.

        Always returns a match object: confidence 0.0 when the LLM call
        fails, fixed confidence 0.60 otherwise. The extracted obligation
        has no obligation_id (it is not linked to a known obligation).
        """
        # German-language prompt — the obligation corpus is German/EU law.
        prompt = f"""Analysiere den folgenden Gesetzestext und extrahiere die zentrale rechtliche Pflicht.

Text:
{chunk_text[:3000]}

Quelle: {regulation_code} {article or ''}

Antworte NUR als JSON:
{{
  "obligation_text": "Die zentrale Pflicht in einem Satz",
  "actor": "Wer muss handeln (z.B. Verantwortlicher, Auftragsverarbeiter)",
  "action": "Was muss getan werden",
  "normative_strength": "muss|soll|kann"
}}"""

        system_prompt = (
            "Du bist ein Rechtsexperte fuer EU-Datenschutz- und Digitalrecht. "
            "Extrahiere die zentrale rechtliche Pflicht aus Gesetzestexten. "
            "Antworte ausschliesslich als JSON."
        )

        result_text = await _llm_ollama(prompt, system_prompt)
        if not result_text:
            # LLM unavailable or returned nothing — signal zero confidence.
            return ObligationMatch(
                method="llm_extracted",
                confidence=0.0,
                regulation_id=_normalize_regulation(regulation_code),
            )

        parsed = _parse_json(result_text)
        # Fall back to the raw (truncated) LLM text if JSON parsing failed.
        obligation_text = parsed.get("obligation_text", result_text[:500])

        return ObligationMatch(
            obligation_id=None,
            obligation_title=None,
            obligation_text=obligation_text,
            method="llm_extracted",
            confidence=0.60,
            regulation_id=_normalize_regulation(regulation_code),
        )

    # -----------------------------------------------------------------------
    # Initialization helpers
    # -----------------------------------------------------------------------

    def _load_obligations(self) -> None:
        """Load all obligation files from the v2 framework into memory.

        Populates self._obligations and self._article_lookup. Missing
        directory/manifest/files degrade gracefully (warning + partial load),
        which effectively disables Tier 1 for the missing parts.
        """
        v2_dir = _find_obligations_dir()
        if not v2_dir:
            logger.warning("Obligations v2 directory not found — Tier 1 disabled")
            return

        manifest_path = v2_dir / "_manifest.json"
        if not manifest_path.exists():
            logger.warning("Manifest not found at %s", manifest_path)
            return

        with open(manifest_path) as f:
            manifest = json.load(f)

        for reg_info in manifest.get("regulations", []):
            reg_id = reg_info["id"]
            reg_file = v2_dir / reg_info["file"]
            if not reg_file.exists():
                logger.warning("Regulation file not found: %s", reg_file)
                continue

            with open(reg_file) as f:
                data = json.load(f)

            for obl in data.get("obligations", []):
                obl_id = obl["id"]
                entry = _ObligationEntry(
                    id=obl_id,
                    title=obl.get("title", ""),
                    description=obl.get("description", ""),
                    regulation_id=reg_id,
                )

                # Build article lookup from legal_basis
                for basis in obl.get("legal_basis", []):
                    article_raw = basis.get("article", "")
                    if article_raw:
                        norm_art = _normalize_article(article_raw)
                        key = f"{reg_id}/{norm_art}"
                        if key not in self._article_lookup:
                            self._article_lookup[key] = []
                        self._article_lookup[key].append(obl_id)
                        entry.articles.append(norm_art)

                self._obligations[obl_id] = entry

        logger.info(
            "Loaded %d obligations from %d regulations",
            len(self._obligations),
            len(manifest.get("regulations", [])),
        )

    async def _compute_embeddings(self) -> None:
        """Compute embeddings for all obligation descriptions.

        Fills the parallel lists self._obligation_ids and
        self._obligation_embeddings (failed embeddings are empty lists).
        """
        if not self._obligations:
            return

        self._obligation_ids = list(self._obligations.keys())
        # Embed "title: description" so short titles still contribute signal.
        texts = [
            f"{self._obligations[oid].title}: {self._obligations[oid].description}"
            for oid in self._obligation_ids
        ]

        logger.info("Computing embeddings for %d obligations...", len(texts))
        self._obligation_embeddings = await _get_embeddings_batch(texts)
        valid = sum(1 for e in self._obligation_embeddings if e)
        logger.info("Got %d/%d valid embeddings", valid, len(texts))

    # -----------------------------------------------------------------------
    # Stats
    # -----------------------------------------------------------------------

    def stats(self) -> dict:
        """Return initialization statistics (for health/diagnostics endpoints)."""
        return {
            "total_obligations": len(self._obligations),
            "article_lookups": len(self._article_lookup),
            "embeddings_valid": sum(1 for e in self._obligation_embeddings if e),
            "regulations": list(
                {e.regulation_id for e in self._obligations.values()}
            ),
            "initialized": self._initialized,
        }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Module-level helpers (reusable by other modules)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _normalize_regulation(regulation_code: str) -> Optional[str]:
    """Map a RAG regulation_code to an obligation-framework regulation ID.

    Returns None for empty or unrecognized codes.
    """
    if not regulation_code:
        return None

    code = regulation_code.lower().strip()

    # Direct alias lookup first.
    direct = _REGULATION_CODE_TO_ID.get(code)
    if direct is not None:
        return direct

    # CELEX-style prefix matching so suffixed codes (e.g. chapter/article
    # variants of a regulation family) still resolve.
    prefix_families = (
        ("eu_2016_679", "dsgvo"),
        ("eu_2024_1689", "ai_act"),
        ("eu_2022_2555", "nis2"),
        ("eu_2022_2065", "dsa"),
        ("eu_2023_2854", "data_act"),
        ("eu_2023_1230", "eu_machinery"),
        ("eu_2022_2554", "dora"),
    )
    return next(
        (reg_id for prefix, reg_id in prefix_families if code.startswith(prefix)),
        None,
    )
|
|
|
|
|
|
def _normalize_article(article: str) -> str:
|
|
"""Normalize article references for consistent lookup.
|
|
|
|
Examples:
|
|
"Art. 30" → "art. 30"
|
|
"§ 38 BDSG" → "§ 38"
|
|
"Article 10" → "art. 10"
|
|
"Art. 30 Abs. 1" → "art. 30"
|
|
"Artikel 35" → "art. 35"
|
|
"""
|
|
if not article:
|
|
return ""
|
|
s = article.strip()
|
|
|
|
# Remove trailing law name: "§ 38 BDSG" → "§ 38"
|
|
s = re.sub(r"\s+(DSGVO|BDSG|TTDSG|DSA|NIS2|DORA|AI.?Act)\s*$", "", s, flags=re.IGNORECASE)
|
|
|
|
# Remove paragraph references: "Art. 30 Abs. 1" → "Art. 30"
|
|
s = re.sub(r"\s+(Abs|Absatz|para|paragraph|lit|Satz)\.?\s+.*$", "", s, flags=re.IGNORECASE)
|
|
|
|
# Normalize "Article" / "Artikel" → "Art."
|
|
s = re.sub(r"^(Article|Artikel)\s+", "Art. ", s, flags=re.IGNORECASE)
|
|
|
|
return s.lower().strip()
|
|
|
|
|
|
def _cosine_sim(a: list[float], b: list[float]) -> float:
|
|
"""Compute cosine similarity between two vectors."""
|
|
if not a or not b or len(a) != len(b):
|
|
return 0.0
|
|
dot = sum(x * y for x, y in zip(a, b))
|
|
norm_a = sum(x * x for x in a) ** 0.5
|
|
norm_b = sum(x * x for x in b) ** 0.5
|
|
if norm_a == 0 or norm_b == 0:
|
|
return 0.0
|
|
return dot / (norm_a * norm_b)
|
|
|
|
|
|
def _find_obligations_dir() -> Optional[Path]:
    """Locate the obligations v2 directory.

    Probes repo-relative, container, and CWD-relative locations; a candidate
    counts only if it contains a _manifest.json. Returns None if none match.
    """
    repo_root = Path(__file__).resolve().parent.parent.parent.parent
    for candidate in (
        repo_root / "ai-compliance-sdk" / "policies" / "obligations" / "v2",
        Path("/app/ai-compliance-sdk/policies/obligations/v2"),
        Path("ai-compliance-sdk/policies/obligations/v2"),
    ):
        if candidate.is_dir() and (candidate / "_manifest.json").exists():
            return candidate
    return None
|
|
|
|
|
|
async def _get_embedding(text: str) -> list[float]:
    """Get the embedding vector for a single text.

    Returns an empty list on any failure (service down, bad response) so
    callers can degrade gracefully. Unlike before, failures are now logged
    instead of silently swallowed — consistent with _get_embeddings_batch.
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{EMBEDDING_URL}/embed",
                json={"texts": [text]},
            )
            resp.raise_for_status()
            embeddings = resp.json().get("embeddings", [])
            return embeddings[0] if embeddings else []
    except Exception as e:
        # Best effort: Tier 2 simply yields no match, but leave a trace for ops.
        logger.warning("Embedding request failed: %s", e)
        return []
|
|
|
|
|
|
async def _get_embeddings_batch(
|
|
texts: list[str], batch_size: int = 32
|
|
) -> list[list[float]]:
|
|
"""Get embeddings for multiple texts in batches."""
|
|
all_embeddings: list[list[float]] = []
|
|
for i in range(0, len(texts), batch_size):
|
|
batch = texts[i : i + batch_size]
|
|
try:
|
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
|
resp = await client.post(
|
|
f"{EMBEDDING_URL}/embed",
|
|
json={"texts": batch},
|
|
)
|
|
resp.raise_for_status()
|
|
embeddings = resp.json().get("embeddings", [])
|
|
all_embeddings.extend(embeddings)
|
|
except Exception as e:
|
|
logger.warning("Batch embedding failed for %d texts: %s", len(batch), e)
|
|
all_embeddings.extend([[] for _ in batch])
|
|
return all_embeddings
|
|
|
|
|
|
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call the local Ollama chat API and return the reply text.

    Returns "" on any failure (non-200 status, network error), logging the
    cause so callers can treat the result as best-effort.
    """
    chat_messages = [{"role": "user", "content": prompt}]
    if system_prompt:
        chat_messages.insert(0, {"role": "system", "content": system_prompt})

    request_body = {
        "model": OLLAMA_MODEL,
        "messages": chat_messages,
        "stream": False,
        "options": {"num_predict": 512},
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if resp.status_code != 200:
                logger.error(
                    "Ollama chat failed %d: %s", resp.status_code, resp.text[:300]
                )
                return ""
            return resp.json().get("message", {}).get("content", "")
    except Exception as e:
        logger.warning("Ollama call failed: %s", e)
        return ""
|
|
|
|
|
|
def _parse_json(text: str) -> dict:
|
|
"""Extract JSON from LLM response text."""
|
|
# Try direct parse
|
|
try:
|
|
return json.loads(text)
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
# Try extracting JSON block
|
|
match = re.search(r"\{[^{}]*\}", text, re.DOTALL)
|
|
if match:
|
|
try:
|
|
return json.loads(match.group())
|
|
except json.JSONDecodeError:
|
|
pass
|
|
|
|
return {}
|