feat(rag): optimize RAG pipeline — JSON-Mode, CoT, Hybrid Search, Re-Ranking, Cross-Reg Dedup, chunk 1024
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped

Phase 1 (LLM Quality):
- Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill)
- Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts

Phase 2 (Retrieval Quality):
- Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go)
- Fallback to dense-only search if Query API unavailable
- Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default)
- CPU-only PyTorch dependency to keep Docker image small

Phase 3 (Data Layer):
- Cross-regulation dedup pass (threshold 0.95) links controls across regulations
- DedupResult.link_type field distinguishes dedup_merge vs cross_regulation
- Chunk size defaults updated 512/50 → 1024/128 for new ingestions only
- Existing collections and controls are NOT affected

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 11:49:43 +01:00
parent c3a53fe5d2
commit c52dbdb8f1
24 changed files with 2620 additions and 139 deletions

View File

@@ -69,7 +69,7 @@ class AnchorFinder:
tags_str = " ".join(control.tags[:3]) if control.tags else ""
query = f"{control.title} {tags_str}".strip()
results = await self.rag.search(
results = await self.rag.search_with_rerank(
query=query,
collection="bp_compliance_ce",
top_k=15,

View File

@@ -391,6 +391,7 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"format": "json",
"options": {"num_predict": 256},
"think": False,
}

View File

@@ -0,0 +1,733 @@
"""Control Deduplication Engine — 4-Stage Matching Pipeline.
Prevents duplicate atomic controls during Pass 0b by checking candidates
against existing controls before insertion.
Stages:
1. Pattern-Gate: pattern_id must match (hard gate)
2. Action-Check: normalized action verb must match (hard gate)
3. Object-Norm: normalized object must match (soft gate with high threshold)
4. Embedding: cosine similarity with tiered thresholds (Qdrant)
Verdicts:
- NEW: create a new atomic control
- LINK: add parent link to existing control (similarity > LINK_THRESHOLD)
- REVIEW: queue for human review (REVIEW_THRESHOLD < sim < LINK_THRESHOLD)
"""
import logging
import os
import re
from dataclasses import dataclass, field
from typing import Optional, Callable, Awaitable
import httpx
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
# ── Configuration ────────────────────────────────────────────────────
# Every setting below is read from the environment once, at import time.
# The float(...) conversions raise ValueError during import if a variable
# is set to a non-numeric string.

# Feature flag: true only when the variable equals "true" (case-insensitive).
# NOTE(review): not consulted by any code visible in this module —
# presumably checked by the caller; confirm.
DEDUP_ENABLED = os.getenv("DEDUP_ENABLED", "true").lower() == "true"
# Score above which a same-action / same-object match yields verdict "link".
LINK_THRESHOLD = float(os.getenv("DEDUP_LINK_THRESHOLD", "0.92"))
# Score above which (but not above LINK_THRESHOLD) a match yields "review".
REVIEW_THRESHOLD = float(os.getenv("DEDUP_REVIEW_THRESHOLD", "0.85"))
# Stricter link threshold applied when the normalized objects differ.
LINK_THRESHOLD_DIFF_OBJECT = float(os.getenv("DEDUP_LINK_THRESHOLD_DIFF_OBJ", "0.95"))
# Link threshold for the unfiltered cross-regulation search.
CROSS_REG_LINK_THRESHOLD = float(os.getenv("DEDUP_CROSS_REG_THRESHOLD", "0.95"))
# Name of the Qdrant collection used by all helpers below.
QDRANT_COLLECTION = os.getenv("DEDUP_QDRANT_COLLECTION", "atomic_controls")
# Base URL of the Qdrant REST API.
QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
# Base URL of the embedding service; "/embed" is appended by get_embedding.
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
# ── Result Dataclass ─────────────────────────────────────────────────
@dataclass
class DedupResult:
    """Outcome of the dedup check.

    Only ``verdict`` is required. The ``matched_*`` fields describe the
    existing control the candidate was compared against, when there is one;
    they keep their ``None`` defaults otherwise.
    """

    # Decision of the pipeline.
    verdict: str  # "new" | "link" | "review"
    # Identity of the matched existing control (taken from the Qdrant payload).
    matched_control_uuid: Optional[str] = None
    matched_control_id: Optional[str] = None
    matched_title: Optional[str] = None
    # Label of the pipeline step that produced the verdict,
    # e.g. "pattern_gate", "action_mismatch", "cross_regulation".
    stage: str = ""  # which stage decided
    # Similarity score reported by the vector search for the best hit.
    similarity_score: float = 0.0
    # Kind of parent link to create when verdict is "link".
    link_type: str = "dedup_merge"  # "dedup_merge" | "cross_regulation"
    # Free-form diagnostics; a fresh dict per instance via default_factory.
    details: dict = field(default_factory=dict)
# ── Action Normalization ─────────────────────────────────────────────
_ACTION_SYNONYMS: dict[str, str] = {
    # German → canonical English
    "implementieren": "implement",
    "umsetzen": "implement",
    "einrichten": "implement",
    "einführen": "implement",
    "aufbauen": "implement",
    "bereitstellen": "implement",
    "aktivieren": "implement",
    "konfigurieren": "configure",
    "einstellen": "configure",
    "parametrieren": "configure",
    "testen": "test",
    "prüfen": "test",
    "überprüfen": "test",
    "verifizieren": "test",
    "validieren": "test",
    "kontrollieren": "test",
    "auditieren": "audit",
    "dokumentieren": "document",
    "protokollieren": "log",
    "aufzeichnen": "log",
    "loggen": "log",
    "überwachen": "monitor",
    "monitoring": "monitor",
    "beobachten": "monitor",
    "schulen": "train",
    "trainieren": "train",
    "sensibilisieren": "train",
    "löschen": "delete",
    "entfernen": "delete",
    "verschlüsseln": "encrypt",
    "sperren": "block",
    "beschränken": "restrict",
    "einschränken": "restrict",
    "begrenzen": "restrict",
    "autorisieren": "authorize",
    "genehmigen": "authorize",
    "freigeben": "authorize",
    "authentifizieren": "authenticate",
    "identifizieren": "identify",
    "melden": "report",
    "benachrichtigen": "notify",
    "informieren": "notify",
    "aktualisieren": "update",
    "erneuern": "update",
    "sichern": "backup",
    "wiederherstellen": "restore",
    # English passthrough
    "implement": "implement",
    "configure": "configure",
    "test": "test",
    "verify": "test",
    "validate": "test",
    "audit": "audit",
    "document": "document",
    "log": "log",
    "monitor": "monitor",
    "train": "train",
    "delete": "delete",
    "encrypt": "encrypt",
    "restrict": "restrict",
    "authorize": "authorize",
    "authenticate": "authenticate",
    "report": "report",
    "update": "update",
    "backup": "backup",
    "restore": "restore",
}

# One trailing German infinitive/conjugation ending, removed for the stem lookup.
_ACTION_SUFFIX_RE = re.compile(r"(en|t|st|e|te|tet|end)$")

# Shortest input allowed to match as a *prefix of* a known verb. Without a
# floor, "" and single letters are a prefix of almost every verb and would
# silently resolve to whichever synonym happens to come first in the table.
_MIN_ACTION_PREFIX_LEN = 4


def normalize_action(action: str) -> str:
    """Normalize an action verb to a canonical English form.

    Lookup order:
      1. exact match of the stripped, lower-cased input in ``_ACTION_SYNONYMS``;
      2. match of the input with one trailing German ending removed;
      3. prefix match in table order — the input starts with a known verb, or
         (only for inputs of at least ``_MIN_ACTION_PREFIX_LEN`` characters)
         a known verb starts with the input.

    Args:
        action: Raw action verb, German or English, any case.

    Returns:
        The canonical verb, ``""`` for empty or whitespace-only input, or the
        stripped lower-cased input unchanged when nothing matches.
    """
    if not action:
        return ""
    action = action.strip().lower()
    if not action:
        # Whitespace-only input: nothing to normalize.
        return ""

    if action in _ACTION_SYNONYMS:
        return _ACTION_SYNONYMS[action]

    action_base = _ACTION_SUFFIX_RE.sub("", action)
    if action_base in _ACTION_SYNONYMS:
        return _ACTION_SYNONYMS[action_base]

    # Fuzzy: first table entry that shares a prefix relationship wins.
    allow_reverse_prefix = len(action) >= _MIN_ACTION_PREFIX_LEN
    for verb, canonical in _ACTION_SYNONYMS.items():
        if action.startswith(verb):
            return canonical
        if allow_reverse_prefix and verb.startswith(action):
            return canonical
    return action  # fallback: return as-is
# ── Object Normalization ─────────────────────────────────────────────
_OBJECT_SYNONYMS: dict[str, str] = {
# Authentication / Access
"mfa": "multi_factor_auth",
"multi-faktor-authentifizierung": "multi_factor_auth",
"mehrfaktorauthentifizierung": "multi_factor_auth",
"multi-factor authentication": "multi_factor_auth",
"two-factor": "multi_factor_auth",
"2fa": "multi_factor_auth",
"passwort": "password_policy",
"kennwort": "password_policy",
"password": "password_policy",
"zugangsdaten": "credentials",
"credentials": "credentials",
"admin-konten": "privileged_access",
"admin accounts": "privileged_access",
"administratorkonten": "privileged_access",
"privilegierte zugriffe": "privileged_access",
"privileged accounts": "privileged_access",
"remote-zugriff": "remote_access",
"fernzugriff": "remote_access",
"remote access": "remote_access",
"session": "session_management",
"sitzung": "session_management",
"sitzungsverwaltung": "session_management",
# Encryption
"verschlüsselung": "encryption",
"encryption": "encryption",
"kryptografie": "encryption",
"kryptografische verfahren": "encryption",
"schlüssel": "key_management",
"key management": "key_management",
"schlüsselverwaltung": "key_management",
"zertifikat": "certificate_management",
"certificate": "certificate_management",
"tls": "transport_encryption",
"ssl": "transport_encryption",
"https": "transport_encryption",
# Network
"firewall": "firewall",
"netzwerk": "network_security",
"network": "network_security",
"vpn": "vpn",
"segmentierung": "network_segmentation",
"segmentation": "network_segmentation",
# Logging / Monitoring
"audit-log": "audit_logging",
"audit log": "audit_logging",
"protokoll": "audit_logging",
"logging": "audit_logging",
"monitoring": "monitoring",
"überwachung": "monitoring",
"alerting": "alerting",
"alarmierung": "alerting",
"siem": "siem",
# Data
"personenbezogene daten": "personal_data",
"personal data": "personal_data",
"sensible daten": "sensitive_data",
"sensitive data": "sensitive_data",
"datensicherung": "backup",
"backup": "backup",
"wiederherstellung": "disaster_recovery",
"disaster recovery": "disaster_recovery",
# Policy / Process
"richtlinie": "policy",
"policy": "policy",
"verfahrensanweisung": "procedure",
"procedure": "procedure",
"prozess": "process",
"schulung": "training",
"training": "training",
"awareness": "awareness",
"sensibilisierung": "awareness",
# Incident
"vorfall": "incident",
"incident": "incident",
"sicherheitsvorfall": "security_incident",
"security incident": "security_incident",
# Vulnerability
"schwachstelle": "vulnerability",
"vulnerability": "vulnerability",
"patch": "patch_management",
"update": "patch_management",
"patching": "patch_management",
}
# Precompile for substring matching (longest first)
_OBJECT_KEYS_SORTED = sorted(_OBJECT_SYNONYMS.keys(), key=len, reverse=True)
def normalize_object(obj: str) -> str:
"""Normalize a compliance object to a canonical token."""
if not obj:
return ""
obj_lower = obj.strip().lower()
# Exact match
if obj_lower in _OBJECT_SYNONYMS:
return _OBJECT_SYNONYMS[obj_lower]
# Substring match (longest first)
for phrase in _OBJECT_KEYS_SORTED:
if phrase in obj_lower:
return _OBJECT_SYNONYMS[phrase]
# Fallback: strip articles/prepositions, join with underscore
cleaned = re.sub(r"\b(der|die|das|den|dem|des|ein|eine|eines|einem|einen"
r"|für|von|zu|auf|in|an|bei|mit|nach|über|unter|the|a|an"
r"|for|of|to|on|in|at|by|with)\b", "", obj_lower)
tokens = [t for t in cleaned.split() if len(t) > 2]
return "_".join(tokens[:4]) if tokens else obj_lower.replace(" ", "_")
# ── Canonicalization ─────────────────────────────────────────────────
def canonicalize_text(action: str, obj: str, title: str = "") -> str:
    """Build a canonical text for embedding.

    The text is ``"<normalized action> <normalized object>"``, optionally
    followed by ``"for"`` and up to five title keywords. Title keywords are
    the lower-cased title words longer than three characters that remain
    after German filler words are removed.

    Args:
        action: Raw action verb; passed through ``normalize_action``.
        obj: Raw object phrase; passed through ``normalize_object``.
        title: Optional control title used as additional context.

    Returns:
        The space-joined canonical text. Empty components are omitted, so the
        result never carries leading or doubled spaces.
    """
    norm_action = normalize_action(action)
    norm_object = normalize_object(obj)

    # Skip empty components so a missing action/object does not leave
    # stray whitespace in the text that gets embedded.
    parts = [part for part in (norm_action, norm_object) if part]

    if title:
        # "gem." is matched outside the \b(...)\b group: a trailing \b can
        # never match between "." and a following space or end of string,
        # so inside the group the abbreviation was effectively never removed.
        title_clean = re.sub(
            r"\b(und|oder|für|von|zu|der|die|das|den|dem|des|ein|eine"
            r"|bei|mit|nach|gemäß|laut|entsprechend)\b|\bgem\.",
            "",
            title.lower(),
        )
        title_tokens = [t for t in title_clean.split() if len(t) > 3][:5]
        if title_tokens:
            parts.append("for")
            parts.extend(title_tokens)

    return " ".join(parts)
# ── Embedding Helper ─────────────────────────────────────────────────
async def get_embedding(text: str) -> list[float]:
    """Get the embedding vector for a single text via the embedding service.

    POSTs ``{"texts": [text]}`` to ``{EMBEDDING_URL}/embed`` and returns the
    first entry of the response's ``"embeddings"`` list.

    Args:
        text: The text to embed.

    Returns:
        The embedding vector, or ``[]`` when the service answers with an HTTP
        error status, returns no embeddings, or the request fails. Failures
        are logged as warnings and never raised (best effort).
    """
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(
                f"{EMBEDDING_URL}/embed",
                json={"texts": [text]},
            )
        if resp.status_code >= 400:
            # Do not try to interpret an error body as a result; log the
            # status like the Qdrant helpers do.
            logger.warning("Embedding request failed: %d", resp.status_code)
            return []
        embeddings = resp.json().get("embeddings", [])
        return embeddings[0] if embeddings else []
    except Exception as e:
        # Deliberately broad: callers treat [] as "embedding unavailable".
        logger.warning("Embedding failed: %s", e)
        return []
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Return the cosine of the angle between vectors ``a`` and ``b``.

    Yields ``0.0`` when either vector is empty, when the lengths differ, or
    when either vector has zero magnitude.
    """
    if not (a and b) or len(a) != len(b):
        return 0.0

    inner = sum(p * q for p, q in zip(a, b))
    magnitude_a = sum(p * p for p in a) ** 0.5
    magnitude_b = sum(q * q for q in b) ** 0.5

    # A zero vector has no direction; avoid dividing by zero.
    if 0 in (magnitude_a, magnitude_b):
        return 0.0
    return inner / (magnitude_a * magnitude_b)
# ── Qdrant Helpers ───────────────────────────────────────────────────
async def qdrant_search(
    embedding: list[float],
    pattern_id: str,
    top_k: int = 10,
) -> list[dict]:
    """Query Qdrant for the nearest atomic controls sharing ``pattern_id``.

    Args:
        embedding: Query vector; an empty vector short-circuits to ``[]``.
        pattern_id: Value the ``pattern_id`` payload field must equal.
        top_k: Maximum number of hits requested.

    Returns:
        The ``"result"`` list of the search response, or ``[]`` on a non-200
        status or any request error (both are logged as warnings).
    """
    if not embedding:
        return []

    pattern_filter = {
        "must": [
            {"key": "pattern_id", "match": {"value": pattern_id}}
        ]
    }
    request_body: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
        "filter": pattern_filter,
    }
    search_url = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search"

    try:
        async with httpx.AsyncClient(timeout=10.0) as http:
            response = await http.post(search_url, json=request_body)
            if response.status_code == 200:
                return response.json().get("result", [])
            logger.warning("Qdrant search failed: %d", response.status_code)
            return []
    except Exception as exc:
        # Best effort: a failed search is treated as "no matches".
        logger.warning("Qdrant search error: %s", exc)
        return []
async def qdrant_search_cross_regulation(
    embedding: list[float],
    top_k: int = 5,
) -> list[dict]:
    """Query Qdrant for the nearest controls with no ``pattern_id`` filter.

    Used for cross-regulation linking (e.g. DSGVO Art. 25 ↔ NIS2 Art. 21).

    Args:
        embedding: Query vector; an empty vector short-circuits to ``[]``.
        top_k: Maximum number of hits requested.

    Returns:
        The ``"result"`` list of the search response, or ``[]`` on a non-200
        status or any request error (both are logged as warnings).
    """
    if not embedding:
        return []

    request_body: dict = {
        "vector": embedding,
        "limit": top_k,
        "with_payload": True,
    }
    search_url = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points/search"

    try:
        async with httpx.AsyncClient(timeout=10.0) as http:
            response = await http.post(search_url, json=request_body)
            if response.status_code == 200:
                return response.json().get("result", [])
            logger.warning("Qdrant cross-reg search failed: %d", response.status_code)
            return []
    except Exception as exc:
        # Best effort: a failed search is treated as "no matches".
        logger.warning("Qdrant cross-reg search error: %s", exc)
        return []
async def qdrant_upsert(
    point_id: str,
    embedding: list[float],
    payload: dict,
) -> bool:
    """Write one point (id, vector, payload) into the Qdrant collection.

    Args:
        point_id: Identifier of the point to insert or overwrite.
        embedding: Vector to store; an empty vector short-circuits to ``False``.
        payload: Metadata stored alongside the vector.

    Returns:
        ``True`` only when Qdrant answers with HTTP 200; ``False`` for any
        other status or when the request raises (the error is logged).
    """
    if not embedding:
        return False

    single_point = {
        "id": point_id,
        "vector": embedding,
        "payload": payload,
    }
    points_url = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}/points"

    try:
        async with httpx.AsyncClient(timeout=10.0) as http:
            response = await http.put(points_url, json={"points": [single_point]})
            return response.status_code == 200
    except Exception as exc:
        logger.warning("Qdrant upsert error: %s", exc)
        return False
async def ensure_qdrant_collection(vector_size: int = 1024) -> bool:
    """Make sure the Qdrant collection exists, creating it when missing.

    When the collection is created, keyword payload indexes are requested
    for ``pattern_id``, ``action_normalized``, ``object_normalized`` and
    ``control_id``; the responses of those index requests are not inspected.

    Args:
        vector_size: Vector dimension used when creating the collection.

    Returns:
        ``True`` if the collection already existed or was created, ``False``
        if creation failed or any request raised.
    """
    collection_url = f"{QDRANT_URL}/collections/{QDRANT_COLLECTION}"
    indexed_fields = ["pattern_id", "action_normalized", "object_normalized", "control_id"]

    try:
        async with httpx.AsyncClient(timeout=10.0) as http:
            # A 200 on the collection endpoint means it is already there.
            probe = await http.get(collection_url)
            if probe.status_code == 200:
                return True

            created = await http.put(
                collection_url,
                json={
                    "vectors": {"size": vector_size, "distance": "Cosine"},
                },
            )
            if created.status_code != 200:
                logger.error("Failed to create Qdrant collection: %d", created.status_code)
                return False

            logger.info("Created Qdrant collection: %s", QDRANT_COLLECTION)
            for field_name in indexed_fields:
                await http.put(
                    f"{collection_url}/index",
                    json={"field_name": field_name, "field_schema": "keyword"},
                )
            return True
    except Exception as exc:
        logger.warning("Qdrant collection check error: %s", exc)
        return False
# ── Main Dedup Checker ───────────────────────────────────────────────
class ControlDedupChecker:
    """4-stage dedup checker for atomic controls.

    Usage:
        checker = ControlDedupChecker(db_session)
        result = await checker.check_duplicate(action, obj, title, pattern_id)
        if result.verdict == "link":
            checker.add_parent_link(
                result.matched_control_uuid, parent_uuid,
                link_type=result.link_type,
                confidence=result.similarity_score,
            )
        elif result.verdict == "review":
            checker.write_review(control_id, title, objective, result)
        else:
            # Insert the new control, then make it findable:
            await checker.index_control(uuid, control_id, title, action, obj, pattern_id)
    """

    def __init__(
        self,
        db,
        embed_fn: Optional[Callable[[str], Awaitable[list[float]]]] = None,
        search_fn: Optional[Callable] = None,
        cross_search_fn: Optional[Callable] = None,
    ):
        """Create a checker bound to one database session.

        Args:
            db: Session object providing ``execute`` and ``commit``.
            embed_fn: Async ``text -> vector`` function; defaults to
                ``get_embedding``.
            search_fn: Async ``(embedding, pattern_id, top_k=...)`` search;
                defaults to ``qdrant_search``.
            cross_search_fn: Async ``(embedding, top_k=...)`` search without a
                pattern filter; when omitted, the module-level
                ``qdrant_search_cross_regulation`` is looked up at call time.
        """
        self.db = db
        self._embed = embed_fn or get_embedding
        self._search = search_fn or qdrant_search
        self._cross_search = cross_search_fn
        self._cache: dict[str, list[dict]] = {}  # pattern_id → existing controls

    def _load_existing(self, pattern_id: str) -> list[dict]:
        """Load existing atomic controls with the same pattern_id from the DB.

        Rows are non-deprecated ``canonical_controls`` that have a parent.
        The result is cached per ``pattern_id`` for the lifetime of this
        checker, so controls inserted later are not seen by this method.
        """
        if pattern_id in self._cache:
            return self._cache[pattern_id]

        from sqlalchemy import text

        rows = self.db.execute(text("""
            SELECT id::text, control_id, title, objective,
                   pattern_id,
                   generation_metadata->>'obligation_type' as obligation_type
            FROM canonical_controls
            WHERE parent_control_uuid IS NOT NULL
              AND release_state != 'deprecated'
              AND pattern_id = :pid
        """), {"pid": pattern_id}).fetchall()

        result = [
            {
                "uuid": r[0], "control_id": r[1], "title": r[2],
                "objective": r[3], "pattern_id": r[4],
                "obligation_type": r[5],
            }
            for r in rows
        ]
        self._cache[pattern_id] = result
        return result

    async def check_duplicate(
        self,
        action: str,
        obj: str,
        title: str,
        pattern_id: Optional[str],
    ) -> "DedupResult":
        """Run the 4-stage dedup pipeline + cross-regulation linking.

        Args:
            action: Raw action verb of the candidate control.
            obj: Raw object phrase of the candidate control.
            title: Candidate title, used as embedding context.
            pattern_id: Pattern the candidate belongs to; without it no
                dedup is attempted.

        Returns:
            DedupResult with verdict "new", "link" or "review". Every
            intra-pattern "new" verdict reached after the embedding step is
            passed through ``_check_cross_regulation``, which may upgrade it
            to a cross-regulation "link".
        """
        # No pattern_id → can't dedup meaningfully
        if not pattern_id:
            return DedupResult(verdict="new", stage="no_pattern")

        # Stage 1: Pattern-Gate — nothing to compare against means "new".
        existing = self._load_existing(pattern_id)
        if not existing:
            return DedupResult(
                verdict="new", stage="pattern_gate",
                details={"reason": "no existing controls with this pattern_id"},
            )

        # Stages 2 + 3: the DB rows carry no normalized action/object, so the
        # gates are applied further down against the Qdrant payload of the
        # best embedding hit.
        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)

        # Stage 4: Embedding Similarity
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)
        if not embedding:
            # Can't compute embedding → default to new
            return DedupResult(
                verdict="new", stage="embedding_unavailable",
                details={"canonical_text": canonical},
            )

        results = await self._search(embedding, pattern_id, top_k=5)
        if not results:
            # No intra-pattern matches → try cross-regulation
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="no_qdrant_matches",
                details={"canonical_text": canonical, "action": norm_action, "object": norm_object},
            ))

        # Only the top hit is evaluated.
        best = results[0]
        best_score = best.get("score", 0.0)
        best_payload = best.get("payload", {})
        best_action = best_payload.get("action_normalized", "")
        best_object = best_payload.get("object_normalized", "")

        # Action differs → NEW (even if embedding is high)
        if best_action and norm_action and best_action != norm_action:
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="action_mismatch",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_action": norm_action,
                    "existing_action": best_action,
                    "similarity": best_score,
                },
            ))

        # Object differs → use higher threshold
        if best_object and norm_object and best_object != norm_object:
            if best_score > LINK_THRESHOLD_DIFF_OBJECT:
                return DedupResult(
                    verdict="link", stage="embedding_diff_object",
                    matched_control_uuid=best_payload.get("control_uuid"),
                    matched_control_id=best_payload.get("control_id"),
                    matched_title=best_payload.get("title"),
                    similarity_score=best_score,
                    details={"candidate_object": norm_object, "existing_object": best_object},
                )
            return await self._check_cross_regulation(embedding, DedupResult(
                verdict="new", stage="object_mismatch_below_threshold",
                similarity_score=best_score,
                matched_control_id=best_payload.get("control_id"),
                details={
                    "candidate_object": norm_object,
                    "existing_object": best_object,
                    "threshold": LINK_THRESHOLD_DIFF_OBJECT,
                },
            ))

        # Same action + same object → tiered thresholds
        if best_score > LINK_THRESHOLD:
            return DedupResult(
                verdict="link", stage="embedding_match",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        if best_score > REVIEW_THRESHOLD:
            return DedupResult(
                verdict="review", stage="embedding_review",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
            )
        return await self._check_cross_regulation(embedding, DedupResult(
            verdict="new", stage="embedding_below_threshold",
            similarity_score=best_score,
            details={"threshold": REVIEW_THRESHOLD},
        ))

    async def _check_cross_regulation(
        self,
        embedding: list[float],
        intra_result: "DedupResult",
    ) -> "DedupResult":
        """Second pass: cross-regulation linking for controls deemed 'new'.

        Searches WITHOUT a pattern_id filter and links only above
        ``CROSS_REG_LINK_THRESHOLD``. Because the search is unfiltered it can
        return the very control the intra-pattern gates just rejected (its id
        is carried in ``intra_result.matched_control_id``); that control is
        excluded here so an action/object mismatch cannot be turned back
        into a link.

        Args:
            embedding: Embedding of the candidate's canonical text.
            intra_result: Verdict of the intra-pattern pass.

        Returns:
            A "link" result with ``link_type="cross_regulation"`` when a
            sufficiently similar other control exists, else ``intra_result``.
        """
        if intra_result.verdict != "new" or not embedding:
            return intra_result

        # Resolved at call time so the module-level default stays replaceable.
        search = self._cross_search or qdrant_search_cross_regulation
        cross_results = await search(embedding, top_k=5) or []

        rejected_id = intra_result.matched_control_id
        if rejected_id:
            cross_results = [
                hit for hit in cross_results
                if (hit.get("payload") or {}).get("control_id") != rejected_id
            ]
        if not cross_results:
            return intra_result

        best = cross_results[0]
        best_score = best.get("score", 0.0)
        if best_score > CROSS_REG_LINK_THRESHOLD:
            best_payload = best.get("payload", {})
            return DedupResult(
                verdict="link",
                stage="cross_regulation",
                matched_control_uuid=best_payload.get("control_uuid"),
                matched_control_id=best_payload.get("control_id"),
                matched_title=best_payload.get("title"),
                similarity_score=best_score,
                link_type="cross_regulation",
                details={
                    "cross_reg_score": best_score,
                    "cross_reg_threshold": CROSS_REG_LINK_THRESHOLD,
                },
            )
        return intra_result

    def add_parent_link(
        self,
        control_uuid: str,
        parent_control_uuid: str,
        link_type: str = "dedup_merge",
        confidence: float = 0.0,
        source_regulation: Optional[str] = None,
        source_article: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Add a parent link to an existing atomic control and commit.

        An already existing (control_uuid, parent_control_uuid) pair is left
        untouched (``ON CONFLICT ... DO NOTHING``).
        """
        from sqlalchemy import text

        # CAST(... AS uuid) instead of the "::uuid" shorthand: a bind
        # parameter written as ":name::type" is not reliably recognised by
        # text()'s parameter parsing, while CAST is equivalent in PostgreSQL.
        self.db.execute(text("""
            INSERT INTO control_parent_links
                (control_uuid, parent_control_uuid, link_type, confidence,
                 source_regulation, source_article, obligation_candidate_id)
            VALUES (:cu, :pu, :lt, :conf, :sr, :sa, CAST(:oci AS uuid))
            ON CONFLICT (control_uuid, parent_control_uuid) DO NOTHING
        """), {
            "cu": control_uuid,
            "pu": parent_control_uuid,
            "lt": link_type,
            "conf": confidence,
            "sr": source_regulation,
            "sa": source_article,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    def write_review(
        self,
        candidate_control_id: str,
        candidate_title: str,
        candidate_objective: str,
        result: "DedupResult",
        parent_control_uuid: Optional[str] = None,
        obligation_candidate_id: Optional[str] = None,
    ) -> None:
        """Write a dedup review queue entry and commit.

        ``result.details`` is stored JSON-encoded in ``dedup_details``.
        """
        import json

        from sqlalchemy import text

        # See add_parent_link for why CAST is used instead of "::type".
        self.db.execute(text("""
            INSERT INTO control_dedup_reviews
                (candidate_control_id, candidate_title, candidate_objective,
                 matched_control_uuid, matched_control_id,
                 similarity_score, dedup_stage, dedup_details,
                 parent_control_uuid, obligation_candidate_id)
            VALUES (:ccid, :ct, :co, CAST(:mcu AS uuid), :mci, :ss, :ds,
                    CAST(:dd AS jsonb), CAST(:pcu AS uuid), :oci)
        """), {
            "ccid": candidate_control_id,
            "ct": candidate_title,
            "co": candidate_objective,
            "mcu": result.matched_control_uuid,
            "mci": result.matched_control_id,
            "ss": result.similarity_score,
            "ds": result.stage,
            "dd": json.dumps(result.details),
            "pcu": parent_control_uuid,
            "oci": obligation_candidate_id,
        })
        self.db.commit()

    async def index_control(
        self,
        control_uuid: str,
        control_id: str,
        title: str,
        action: str,
        obj: str,
        pattern_id: str,
    ) -> bool:
        """Index a new atomic control in Qdrant for future dedup checks.

        The point is stored under ``control_uuid`` with the same canonical
        text and normalized action/object that ``check_duplicate`` compares
        against.

        Returns:
            ``False`` when no embedding could be computed, otherwise the
            result of ``qdrant_upsert``.
        """
        norm_action = normalize_action(action)
        norm_object = normalize_object(obj)
        canonical = canonicalize_text(action, obj, title)
        embedding = await self._embed(canonical)
        if not embedding:
            return False
        return await qdrant_upsert(
            point_id=control_uuid,
            embedding=embedding,
            payload={
                "control_uuid": control_uuid,
                "control_id": control_id,
                "title": title,
                "pattern_id": pattern_id,
                "action_normalized": norm_action,
                "object_normalized": norm_object,
                "canonical_text": canonical,
            },
        )

View File

@@ -75,12 +75,12 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
# RULE 1: FREE USE — Laws, Public Domain
# source_type: "law" = binding legislation, "guideline" = authority guidance (soft law),
# "standard" = voluntary framework/best practice, "restricted" = protected norm
# EU Regulations
"eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSGVO"},
"eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "AI Act (KI-Verordnung)"},
"eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "NIS2"},
# EU Regulations — names MUST match canonical DB source names
"eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSGVO (EU) 2016/679"},
"eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "KI-Verordnung (EU) 2024/1689"},
"eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "NIS2-Richtlinie (EU) 2022/2555"},
"eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Cyber Resilience Act (CRA)"},
"eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Maschinenverordnung"},
"eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Maschinenverordnung (EU) 2023/1230"},
"eu_2022_2065": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Services Act (DSA)"},
"eu_2022_1925": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Markets Act (DMA)"},
"eu_2022_868": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Data Governance Act (DGA)"},
@@ -88,52 +88,52 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
"eu_2021_914": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Standardvertragsklauseln (SCC)"},
"eu_2002_58": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "ePrivacy-Richtlinie"},
"eu_2000_31": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "E-Commerce-Richtlinie"},
"eu_2023_1803": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "IFRS-Uebernahmeverordnung"},
"eu_2023_1803": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "IFRS-Übernahmeverordnung"},
"eucsa": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "EU Cybersecurity Act"},
"dataact": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Data Act"},
"dora": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Digital Operational Resilience Act"},
"ehds": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "European Health Data Space"},
"gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung"},
"eu_2023_988": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Allgemeine Produktsicherheitsverordnung (GPSR)"},
"eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Batterieverordnung"},
"mica": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Markets in Crypto-Assets"},
"eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Batterieverordnung (EU) 2023/1542"},
"mica": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Markets in Crypto-Assets (MiCA)"},
"psd2": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "Zahlungsdiensterichtlinie 2"},
"dpf": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "EU-US Data Privacy Framework"},
"dsm": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "DSM-Urheberrechtsrichtlinie"},
"amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law", "name": "AML-Verordnung"},
"eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "Blue Guide 2022"},
"eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "EU Blue Guide 2022"},
# NIST (Public Domain — NOT laws, voluntary standards)
"nist_sp_800_53": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53"},
"nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev.5"},
"nist_sp_800_63b": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63B"},
"nist_sp_800_53": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev. 5"},
"nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-53 Rev. 5"},
"nist_sp_800_63b": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63-3"},
"nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-63-3"},
"nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST CSF 2.0"},
"nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SSDF"},
"nist_sp800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SSDF"},
"nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-207 Zero Trust"},
"nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST Cybersecurity Framework 2.0"},
"nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-218 (SSDF)"},
"nist_sp800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-218 (SSDF)"},
"nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST SP 800-207 (Zero Trust)"},
"nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST AI Risk Management Framework"},
"nist_privacy_1_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NIST Privacy Framework 1.0"},
"nistir_8259a": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard", "name": "NISTIR 8259A IoT Security"},
"cisa_secure_by_design": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "standard", "name": "CISA Secure by Design"},
# German Laws
"bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BDSG"},
"bdsg_2018_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BDSG 2018"},
"bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Bundesdatenschutzgesetz (BDSG)"},
"bdsg_2018_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Bundesdatenschutzgesetz (BDSG)"},
"ttdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TTDSG"},
"tdddg_25": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TDDDG"},
"tkg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TKG"},
"de_tkg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TKG"},
"bgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "BGB"},
"hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "HGB"},
"hgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "HGB"},
"hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Handelsgesetzbuch (HGB)"},
"hgb_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Handelsgesetzbuch (HGB)"},
"urhg_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "UrhG"},
"uwg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "UWG"},
"tmg_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "TMG"},
"gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "GewO"},
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung"},
"ao_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung"},
"gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Gewerbeordnung (GewO)"},
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
"ao_komplett": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Abgabenordnung (AO)"},
"battdg": {"license": "DE_LAW", "rule": 1, "source_type": "law", "name": "Batteriegesetz"},
# Austrian Laws
"at_dsg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT DSG"},
"at_dsg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "Österreichisches Datenschutzgesetz (DSG)"},
"at_abgb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT ABGB"},
"at_abgb_agb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT ABGB AGB-Recht"},
"at_bao": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT BAO"},
@@ -141,7 +141,7 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
"at_ecg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT E-Commerce-Gesetz"},
"at_kschg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT Konsumentenschutzgesetz"},
"at_medieng": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT Mediengesetz"},
"at_tkg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT TKG"},
"at_tkg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "Telekommunikationsgesetz Oesterreich"},
"at_ugb": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UGB"},
"at_ugb_ret": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UGB Retention"},
"at_uwg": {"license": "AT_LAW", "rule": 1, "source_type": "law", "name": "AT UWG"},
@@ -179,21 +179,21 @@ REGULATION_LICENSE_MAP: dict[str, dict] = {
"wp260_transparency": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline", "name": "WP29 Transparency"},
# RULE 2: CITATION REQUIRED — CC-BY, CC-BY-SA (voluntary standards)
"owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP ASVS",
"owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP ASVS 4.0",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP MASVS",
"owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP MASVS 2.0",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10",
"owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 (2021)",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_top10_2021": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 2021",
"owasp_top10_2021": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Top 10 (2021)",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_api_top10_2023": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP API Top 10 2023",
"owasp_api_top10_2023": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP API Security Top 10 (2023)",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP SAMM",
"owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP SAMM 2.0",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"owasp_mobile_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard", "name": "OWASP Mobile Top 10",
"attribution": "OWASP Foundation, CC BY-SA 4.0"},
"oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard", "name": "OECD AI Principles",
"oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard", "name": "OECD KI-Empfehlung",
"attribution": "OECD"},
# RULE 3: RESTRICTED — Full reformulation required
@@ -626,6 +626,7 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"format": "json",
"options": {"num_predict": 512}, # Limit response length for speed
"think": False, # Disable thinking for faster responses
}
@@ -1040,8 +1041,10 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
effective_paragraph = llm_paragraph or chunk.paragraph or ""
control.license_rule = 1
control.source_original_text = chunk.text
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
canonical_source = license_info.get("name", chunk.regulation_name)
control.source_citation = {
"source": chunk.regulation_name,
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"license": license_info.get("license", ""),
@@ -1105,8 +1108,10 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""
effective_paragraph = llm_paragraph or chunk.paragraph or ""
control.license_rule = 2
control.source_original_text = chunk.text
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
canonical_source = license_info.get("name", chunk.regulation_name)
control.source_citation = {
"source": chunk.regulation_name,
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"license": license_info.get("license", ""),
@@ -1277,8 +1282,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Elementen. Fuer Chunks ohne A
effective_paragraph = llm_paragraph or chunk.paragraph or ""
if lic["rule"] in (1, 2):
control.source_original_text = chunk.text
# Use canonical name from REGULATION_LICENSE_MAP, not Qdrant's regulation_name
canonical_source = lic.get("name", chunk.regulation_name)
control.source_citation = {
"source": chunk.regulation_name,
"source": canonical_source,
"article": effective_article,
"paragraph": effective_paragraph,
"license": lic.get("license", ""),

View File

@@ -46,20 +46,62 @@ ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
# ---------------------------------------------------------------------------
# Normative signal detection (Rule 1)
# Normative signal detection — 3-Tier Classification
# ---------------------------------------------------------------------------
# Tier 1: Pflicht (mandatory) — strong normative signals
# Tier 2: Empfehlung (recommendation) — weaker normative signals
# Tier 3: Kann (optional/permissive) — permissive signals
# Nothing is rejected — everything is classified.
_NORMATIVE_SIGNALS = [
_PFLICHT_SIGNALS = [
# Deutsche modale Pflichtformulierungen
r"\bmüssen\b", r"\bmuss\b", r"\bhat\s+sicherzustellen\b",
r"\bhaben\s+sicherzustellen\b", r"\bsind\s+verpflichtet\b",
r"\bist\s+verpflichtet\b", r"\bist\s+zu\s+\w+en\b",
r"\bsind\s+zu\s+\w+en\b", r"\bhat\s+zu\s+\w+en\b",
r"\bhaben\s+zu\s+\w+en\b", r"\bsoll\b", r"\bsollen\b",
r"\bgewährleisten\b", r"\bsicherstellen\b",
r"\bist\s+verpflichtet\b",
# "ist zu prüfen", "sind zu dokumentieren" (direkt)
r"\bist\s+zu\s+\w+en\b", r"\bsind\s+zu\s+\w+en\b",
r"\bhat\s+zu\s+\w+en\b", r"\bhaben\s+zu\s+\w+en\b",
# "ist festzustellen", "sind vorzunehmen" (Compound-Verben, eingebettetes zu)
r"\bist\s+\w+zu\w+en\b", r"\bsind\s+\w+zu\w+en\b",
# "ist zusätzlich zu prüfen", "sind regelmäßig zu überwachen" (Adverb dazwischen)
r"\bist\s+\w+\s+zu\s+\w+en\b", r"\bsind\s+\w+\s+zu\s+\w+en\b",
r"\bhat\s+\w+\s+zu\s+\w+en\b", r"\bhaben\s+\w+\s+zu\s+\w+en\b",
# Englische Pflicht-Signale
r"\bshall\b", r"\bmust\b", r"\brequired\b",
r"\bshould\b", r"\bensure\b",
# Compound-Infinitive (Gerundivum): mitzuteilen, anzuwenden, bereitzustellen
r"\b\w+zuteilen\b", r"\b\w+zuwenden\b", r"\b\w+zustellen\b", r"\b\w+zulegen\b",
r"\b\w+zunehmen\b", r"\b\w+zuführen\b", r"\b\w+zuhalten\b", r"\b\w+zusetzen\b",
r"\b\w+zuweisen\b", r"\b\w+zuordnen\b", r"\b\w+zufügen\b", r"\b\w+zugeben\b",
# Breites Pattern: "ist ... [bis 80 Zeichen] ... zu + Infinitiv"
r"\bist\b.{1,80}\bzu\s+\w+en\b", r"\bsind\b.{1,80}\bzu\s+\w+en\b",
]
_NORMATIVE_RE = re.compile("|".join(_NORMATIVE_SIGNALS), re.IGNORECASE)
_PFLICHT_RE = re.compile("|".join(_PFLICHT_SIGNALS), re.IGNORECASE)
_EMPFEHLUNG_SIGNALS = [
# Modale Verben (schwaecher als "muss")
r"\bsoll\b", r"\bsollen\b", r"\bsollte\b", r"\bsollten\b",
r"\bgewährleisten\b", r"\bsicherstellen\b",
# Englische Empfehlungs-Signale
r"\bshould\b", r"\bensure\b", r"\brecommend\w*\b",
# Haeufige normative Infinitive (ohne Hilfsverb, als Empfehlung)
r"\bnachweisen\b", r"\beinhalten\b", r"\bunterlassen\b", r"\bwahren\b",
r"\bdokumentieren\b", r"\bimplementieren\b", r"\büberprüfen\b", r"\büberwachen\b",
# Pruefanweisungen als normative Aussage
r"\bprüfen,\s+ob\b", r"\bkontrollieren,\s+ob\b",
]
_EMPFEHLUNG_RE = re.compile("|".join(_EMPFEHLUNG_SIGNALS), re.IGNORECASE)
_KANN_SIGNALS = [
r"\bkann\b", r"\bkönnen\b", r"\bdarf\b", r"\bdürfen\b",
r"\bmay\b", r"\boptional\b",
]
_KANN_RE = re.compile("|".join(_KANN_SIGNALS), re.IGNORECASE)
# Union of all normative signals (for backward-compatible has_normative_signal flag)
_NORMATIVE_RE = re.compile(
"|".join(_PFLICHT_SIGNALS + _EMPFEHLUNG_SIGNALS + _KANN_SIGNALS),
re.IGNORECASE,
)
_RATIONALE_SIGNALS = [
r"\bda\s+", r"\bweil\b", r"\bgrund\b", r"\berwägung",
@@ -100,6 +142,7 @@ class ObligationCandidate:
object_: str = ""
condition: Optional[str] = None
normative_strength: str = "must"
obligation_type: str = "pflicht" # pflicht | empfehlung | kann
is_test_obligation: bool = False
is_reporting_obligation: bool = False
extraction_confidence: float = 0.0
@@ -115,6 +158,7 @@ class ObligationCandidate:
"object": self.object_,
"condition": self.condition,
"normative_strength": self.normative_strength,
"obligation_type": self.obligation_type,
"is_test_obligation": self.is_test_obligation,
"is_reporting_obligation": self.is_reporting_obligation,
"extraction_confidence": self.extraction_confidence,
@@ -162,11 +206,30 @@ class AtomicControlCandidate:
# ---------------------------------------------------------------------------
def classify_obligation_type(txt: str) -> str:
    """Map an obligation text onto one of the three obligation tiers.

    The tiers are probed from strongest to weakest and the first one whose
    signal pattern matches wins: ``"pflicht"`` (mandatory), then
    ``"empfehlung"`` (recommendation), then ``"kann"`` (permissive).

    Args:
        txt: The obligation text to classify.

    Returns:
        ``"pflicht"``, ``"empfehlung"`` or ``"kann"``. A text that matches no
        pattern at all is never rejected; it is returned as ``"empfehlung"``.
    """
    # Order matters: a stronger tier must shadow every weaker one.
    tiers = (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    )
    for pattern, label in tiers:
        if pattern.search(txt):
            return label

    # No signal matched: the text was still proposed as an obligation
    # upstream, so keep it as a recommendation instead of dropping it.
    return "empfehlung"
def quality_gate(candidate: ObligationCandidate) -> dict:
"""Validate an obligation candidate. Returns quality flags dict.
Checks:
has_normative_signal: text contains normative language
has_normative_signal: text contains normative language (informational)
obligation_type: pflicht | empfehlung | kann (classified, never rejected)
single_action: only one main action (heuristic)
not_rationale: not just a justification/reasoning
not_evidence_only: not just an evidence requirement
@@ -176,9 +239,12 @@ def quality_gate(candidate: ObligationCandidate) -> dict:
txt = candidate.obligation_text
flags = {}
# 1. Normative signal
# 1. Normative signal (informational — no longer used for rejection)
flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(txt))
# 1b. Obligation type classification
flags["obligation_type"] = classify_obligation_type(txt)
# 2. Single action heuristic — count "und" / "and" / "sowie" splits
# that connect different verbs (imperfect but useful)
multi_verb_re = re.compile(
@@ -210,8 +276,12 @@ def quality_gate(candidate: ObligationCandidate) -> dict:
def passes_quality_gate(flags: dict) -> bool:
    """Check whether all critical quality flags pass.

    ``has_normative_signal`` is deliberately NOT critical: obligations
    without a normative signal are classified as 'empfehlung' instead of
    being rejected, so that flag is informational only.

    Args:
        flags: Quality flags keyed by flag name. A missing flag counts as
            failed.

    Returns:
        True when ``not_evidence_only``, ``min_length`` and
        ``has_parent_link`` are all truthy, otherwise False.
    """
    critical = ("not_evidence_only", "min_length", "has_parent_link")
    return all(flags.get(key, False) for key in critical)
@@ -224,6 +294,13 @@ _PASS0A_SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.
ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
1. Identifiziere den Adressaten (Wer muss handeln?)
2. Identifiziere die Handlung (Was muss getan werden?)
3. Bestimme die normative Staerke (muss/soll/kann)
4. Pruefe ob Test- oder Meldepflicht vorliegt (separat erfassen!)
5. Formuliere jede Pflicht als eigenstaendiges JSON-Objekt
REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
@@ -272,6 +349,12 @@ _PASS0B_SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.
ANALYSE-SCHRITTE (intern durchfuehren, NICHT im Output!):
1. Identifiziere die konkrete Anforderung aus der Pflicht
2. Leite eine umsetzbare technische/organisatorische Massnahme ab
3. Definiere ein Pruefverfahren (wie wird Umsetzung verifiziert?)
4. Bestimme den Nachweis (welches Dokument/Artefakt belegt Compliance?)
Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
Antworte NUR als JSON. Keine Erklärungen."""
@@ -603,8 +686,15 @@ class DecompositionPass:
stats_0b = await decomp.run_pass0b(limit=100)
"""
def __init__(self, db: Session):
def __init__(self, db: Session, dedup_enabled: bool = False):
self.db = db
self._dedup = None
if dedup_enabled:
from compliance.services.control_dedup import (
ControlDedupChecker, DEDUP_ENABLED,
)
if DEDUP_ENABLED:
self._dedup = ControlDedupChecker(db)
# -------------------------------------------------------------------
# Pass 0a: Obligation Extraction
@@ -810,10 +900,11 @@ class DecompositionPass:
if not cand.is_reporting_obligation and _REPORTING_RE.search(cand.obligation_text):
cand.is_reporting_obligation = True
# Quality gate
# Quality gate + obligation type classification
flags = quality_gate(cand)
cand.quality_flags = flags
cand.extraction_confidence = _compute_extraction_confidence(flags)
cand.obligation_type = flags.get("obligation_type", "empfehlung")
if passes_quality_gate(flags):
cand.release_state = "validated"
@@ -877,6 +968,9 @@ class DecompositionPass:
"errors": 0,
"provider": "anthropic" if use_anthropic else "ollama",
"batch_size": batch_size,
"dedup_enabled": self._dedup is not None,
"dedup_linked": 0,
"dedup_review": 0,
}
# Prepare obligation data
@@ -915,7 +1009,7 @@ class DecompositionPass:
results_by_id = _parse_json_object(llm_response)
for obl in batch:
parsed = results_by_id.get(obl["candidate_id"], {})
self._process_pass0b_control(obl, parsed, stats)
await self._process_pass0b_control(obl, parsed, stats)
elif use_anthropic:
obl = batch[0]
prompt = _build_pass0b_prompt(
@@ -931,7 +1025,7 @@ class DecompositionPass:
)
stats["llm_calls"] += 1
parsed = _parse_json_object(llm_response)
self._process_pass0b_control(obl, parsed, stats)
await self._process_pass0b_control(obl, parsed, stats)
else:
from compliance.services.obligation_extractor import _llm_ollama
obl = batch[0]
@@ -948,7 +1042,7 @@ class DecompositionPass:
)
stats["llm_calls"] += 1
parsed = _parse_json_object(llm_response)
self._process_pass0b_control(obl, parsed, stats)
await self._process_pass0b_control(obl, parsed, stats)
except Exception as e:
ids = ", ".join(o["candidate_id"] for o in batch)
@@ -959,10 +1053,16 @@ class DecompositionPass:
logger.info("Pass 0b: %s", stats)
return stats
def _process_pass0b_control(
async def _process_pass0b_control(
self, obl: dict, parsed: dict, stats: dict,
) -> None:
"""Create atomic control from parsed LLM output or template fallback."""
"""Create atomic control from parsed LLM output or template fallback.
If dedup is enabled, checks for duplicates before insertion:
- LINK: adds parent link to existing control instead of creating new
- REVIEW: queues for human review, does not create control
- NEW: creates new control and indexes in Qdrant
"""
if not parsed or not parsed.get("title"):
atomic = _template_fallback(
obligation_text=obl["obligation_text"],
@@ -990,6 +1090,56 @@ class DecompositionPass:
atomic.parent_control_uuid = obl["parent_uuid"]
atomic.obligation_candidate_id = obl["candidate_id"]
# ── Dedup check (if enabled) ────────────────────────────
if self._dedup:
pattern_id = None
# Try to get pattern_id from parent control
pid_row = self.db.execute(text(
"SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
), {"uid": obl["parent_uuid"]}).fetchone()
if pid_row:
pattern_id = pid_row[0]
result = await self._dedup.check_duplicate(
action=obl.get("action", ""),
obj=obl.get("object", ""),
title=atomic.title,
pattern_id=pattern_id,
)
if result.verdict == "link":
self._dedup.add_parent_link(
control_uuid=result.matched_control_uuid,
parent_control_uuid=obl["parent_uuid"],
link_type="dedup_merge",
confidence=result.similarity_score,
)
stats.setdefault("dedup_linked", 0)
stats["dedup_linked"] += 1
stats["candidates_processed"] += 1
logger.info("Dedup LINK: %s%s (%.3f, %s)",
atomic.title[:60], result.matched_control_id,
result.similarity_score, result.stage)
return
if result.verdict == "review":
self._dedup.write_review(
candidate_control_id=atomic.candidate_id or "",
candidate_title=atomic.title,
candidate_objective=atomic.objective,
result=result,
parent_control_uuid=obl["parent_uuid"],
obligation_candidate_id=obl.get("oc_id"),
)
stats.setdefault("dedup_review", 0)
stats["dedup_review"] += 1
stats["candidates_processed"] += 1
logger.info("Dedup REVIEW: %s%s (%.3f, %s)",
atomic.title[:60], result.matched_control_id,
result.similarity_score, result.stage)
return
# ── Create new atomic control ───────────────────────────
seq = self._next_atomic_seq(obl["parent_control_id"])
atomic.candidate_id = f"{obl['parent_control_id']}-A{seq:02d}"
@@ -1006,6 +1156,29 @@ class DecompositionPass:
{"oc_id": obl["oc_id"]},
)
# Index in Qdrant for future dedup checks
if self._dedup:
pattern_id_val = None
pid_row2 = self.db.execute(text(
"SELECT pattern_id FROM canonical_controls WHERE id = CAST(:uid AS uuid)"
), {"uid": obl["parent_uuid"]}).fetchone()
if pid_row2:
pattern_id_val = pid_row2[0]
# Get the UUID of the newly inserted control
new_row = self.db.execute(text(
"SELECT id::text FROM canonical_controls WHERE control_id = :cid ORDER BY created_at DESC LIMIT 1"
), {"cid": atomic.candidate_id}).fetchone()
if new_row and pattern_id_val:
await self._dedup.index_control(
control_uuid=new_row[0],
control_id=atomic.candidate_id,
title=atomic.title,
action=obl.get("action", ""),
obj=obl.get("object", ""),
pattern_id=pattern_id_val,
)
stats["controls_created"] += 1
stats["candidates_processed"] += 1
@@ -1415,7 +1588,7 @@ class DecompositionPass:
if pass_type == "0a":
self._handle_batch_result_0a(custom_id, text_content, stats)
else:
self._handle_batch_result_0b(custom_id, text_content, stats)
await self._handle_batch_result_0b(custom_id, text_content, stats)
except Exception as e:
logger.error("Processing batch result %s: %s", custom_id, e)
stats["errors"] += 1
@@ -1466,7 +1639,7 @@ class DecompositionPass:
self._process_pass0a_obligations(raw_obls, control_id, control_uuid, stats)
stats["controls_processed"] += 1
def _handle_batch_result_0b(
async def _handle_batch_result_0b(
self, custom_id: str, text_content: str, stats: dict,
) -> None:
"""Process a single Pass 0b batch result."""
@@ -1477,14 +1650,14 @@ class DecompositionPass:
parsed = _parse_json_object(text_content)
obl = self._load_obligation_for_0b(candidate_ids[0])
if obl:
self._process_pass0b_control(obl, parsed, stats)
await self._process_pass0b_control(obl, parsed, stats)
else:
results_by_id = _parse_json_object(text_content)
for cand_id in candidate_ids:
parsed = results_by_id.get(cand_id, {})
obl = self._load_obligation_for_0b(cand_id)
if obl:
self._process_pass0b_control(obl, parsed, stats)
await self._process_pass0b_control(obl, parsed, stats)
def _load_obligation_for_0b(self, candidate_id: str) -> Optional[dict]:
"""Load obligation data needed for Pass 0b processing."""

View File

@@ -524,6 +524,7 @@ async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"format": "json",
"options": {"num_predict": 512},
"think": False,
}

View File

@@ -100,6 +100,40 @@ class ComplianceRAGClient:
logger.warning("RAG search failed: %s", e)
return []
async def search_with_rerank(
    self,
    query: str,
    collection: str = "bp_compliance_ce",
    regulations: Optional[List[str]] = None,
    top_k: int = 5,
) -> List[RAGSearchResult]:
    """Search the RAG store and optionally re-rank hits with a cross-encoder.

    When no reranker is available (``get_reranker()`` returns ``None``) this
    is a plain ``search`` call. Otherwise a larger candidate pool of
    ``max(top_k * 4, 20)`` hits is fetched, scored by the reranker, and the
    best ``top_k`` are returned in reranker order.

    Args:
        query: The search query.
        collection: Name of the collection to search.
        regulations: Optional regulation filter, passed through to ``search``.
        top_k: Number of results to return.

    Returns:
        Up to ``top_k`` search results. If re-ranking raises, the first
        ``top_k`` candidates are returned in their original search order.
    """
    from .reranker import get_reranker

    cross_encoder = get_reranker()
    if cross_encoder is None:
        return await self.search(query, collection, regulations, top_k)

    # Over-fetch so the reranker has a meaningful pool to choose from.
    pool_size = max(top_k * 4, 20)
    pool = await self.search(query, collection, regulations, top_k=pool_size)
    if not pool:
        return []

    passages = [hit.text for hit in pool]
    try:
        # NOTE(review): rerank() is invoked synchronously inside this
        # coroutine - confirm the blocking time is acceptable for callers.
        order = cross_encoder.rerank(query, passages, top_k=top_k)
        return [pool[idx] for idx in order]
    except Exception as e:
        # Best effort: a broken reranker must not break retrieval.
        logger.warning("Reranking failed, returning unranked: %s", e)
        return pool[:top_k]
async def scroll(
self,
collection: str,

View File

@@ -0,0 +1,85 @@
"""
Cross-Encoder Re-Ranking for RAG Search Results.
Uses BGE Reranker v2 (BAAI/bge-reranker-v2-m3, MIT license) to re-rank
search results from Qdrant for improved retrieval quality.
Lazy-loads the model on first use. Disabled by default (RERANK_ENABLED=false).
"""
import logging
import os
from typing import Optional
logger = logging.getLogger(__name__)

RERANK_ENABLED = os.getenv("RERANK_ENABLED", "false").lower() == "true"
RERANK_MODEL = os.getenv("RERANK_MODEL", "BAAI/bge-reranker-v2-m3")


class Reranker:
    """Cross-encoder reranker using sentence-transformers.

    The underlying model is not loaded in ``__init__``; it is loaded the
    first time ``rerank`` actually needs to score something.
    """

    def __init__(self, model_name: str = RERANK_MODEL):
        self._model = None  # Lazy init
        self._model_name = model_name

    def _ensure_model(self) -> None:
        """Load the cross-encoder model on first use.

        Raises:
            ImportError: If sentence-transformers is not installed.
            Exception: Any error raised while loading the model is logged
                and re-raised unchanged.
        """
        if self._model is not None:
            return
        try:
            from sentence_transformers import CrossEncoder

            logger.info("Loading reranker model: %s", self._model_name)
            self._model = CrossEncoder(self._model_name)
            logger.info("Reranker model loaded successfully")
        except ImportError:
            logger.error(
                "sentence-transformers not installed. "
                "Install with: pip install sentence-transformers"
            )
            raise
        except Exception as e:
            logger.error("Failed to load reranker model: %s", e)
            raise

    def rerank(
        self, query: str, texts: list[str], top_k: int = 5
    ) -> list[int]:
        """Return indices of the top_k texts sorted by relevance (highest first).

        Args:
            query: The search query.
            texts: List of candidate texts to re-rank.
            top_k: Number of top results to return. Values <= 0 yield an
                empty result.

        Returns:
            List of indices into the original texts list, sorted by relevance.
            Empty when ``texts`` is empty or ``top_k`` is not positive.
        """
        # Guard non-positive top_k: a negative value would otherwise slice
        # from the end (ranked[:-1]) and silently return a wrong subset, and
        # zero would load the model and score every pair for no output.
        if not texts or top_k <= 0:
            return []

        self._ensure_model()
        pairs = [[query, text] for text in texts]
        scores = self._model.predict(pairs)

        # Sort by score descending, return indices
        ranked = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
        return ranked[:top_k]
# Module-level singleton
_reranker: Optional[Reranker] = None
def get_reranker() -> Optional[Reranker]:
    """Return the process-wide reranker, creating it on first request.

    Returns:
        The shared ``Reranker`` instance, or ``None`` when re-ranking is
        switched off via ``RERANK_ENABLED``.
    """
    global _reranker
    if RERANK_ENABLED:
        if _reranker is None:
            # First enabled call: build the singleton. Constructing a
            # Reranker does not load the model yet.
            _reranker = Reranker()
        return _reranker
    return None