feat: Control Library check via SQL (canonical_controls) instead of Qdrant

Complete rewrite of rag_document_checker.py:
- Queries canonical_controls table (294K controls, 10K data_protection)
- Filters by category + title keywords per document type
- Uses test_procedure field as actual check instructions
- Regex pre-check extracts key terms from procedure → fast match
- LLM fallback only for regex misses (saves tokens)
- /no_think prefix for direct JSON output

SQL approach advantages:
- Structured data with test_procedure, pass_criteria, fail_criteria
- Category filtering (data_protection, compliance, governance)
- No Qdrant API key issues
- Controls are actual check criteria, not general legal texts

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-06 20:26:56 +02:00
parent 7e7f31c344
commit fa45b5793c
2 changed files with 185 additions and 200 deletions
@@ -199,17 +199,28 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
# Main document check (full text against primary type) # Main document check (full text against primary type)
main_result = _run_checklist(doc_text, entry.doc_type, entry.label, entry.url, word_count) main_result = _run_checklist(doc_text, entry.doc_type, entry.label, entry.url, word_count)
# RAG-based deep check — DISABLED until Master Controls (G1 Decision Trace) are ready. # Control Library deep check — verifies against canonical_controls (SQL)
# The current 144K controls are general legal texts, not specific check criteria. try:
# Enable via rag_check=true in request when Master Controls are available. from compliance.services.rag_document_checker import check_document_with_controls
# try: from classroom_engine.database import SessionLocal
# from compliance.services.rag_document_checker import check_document_with_rag db = SessionLocal()
# rag_checks = await check_document_with_rag(doc_text, entry.doc_type, entry.label, entry.url) try:
# if rag_checks: ctrl_checks = await check_document_with_controls(
# for rc in rag_checks: doc_text, entry.doc_type, entry.label, db,
# main_result.checks.append(CheckItem(...)) )
# except Exception as e: logger.info("Control check: %d results for '%s'", len(ctrl_checks) if ctrl_checks else 0, entry.label)
# logger.warning("RAG check failed: %s", e) if ctrl_checks:
for rc in ctrl_checks:
main_result.checks.append(CheckItem(
id=rc["id"], label=rc["label"], passed=rc["passed"],
severity=rc["severity"], matched_text=rc.get("matched_text", ""),
))
if not rc["passed"]:
main_result.findings_count += 1
finally:
db.close()
except Exception as e:
logger.warning("Control check failed for %s: %s %s", entry.label, type(e).__name__, e)
all_results.append(main_result) all_results.append(main_result)
@@ -1,22 +1,22 @@
""" """
RAG-based Document Checker — semantic verification against Control Library. Document Checker with Canonical Controls — SQL-based verification.
Instead of fixed regex patterns, this uses: Uses canonical_controls from PostgreSQL (not Qdrant) with:
1. RAG search to find relevant controls for a document type - test_procedure: specific check instructions
2. LLM (Qwen 3.5:35b) to verify if each control is fulfilled - pass_criteria / evidence: what to look for
3. Template Generator for corrections when controls are not met - Regex pre-check (fast) + LLM verification (semantic, for regex misses)
Flow: Flow:
Document text + type Document text + type
Filter controls by regulation (144K → ~500) SQL query for relevant controls by category + title keywords
Semantic search for relevant controls (500 → 10-15) For each control: check test_procedure against document text
→ LLM checks each control against text → LLM verifies if control requirements are met
→ Returns fulfilled/missing + evidence + correction
""" """
import logging import logging
import os import os
import re import re
import json as _json
from typing import Optional from typing import Optional
import httpx import httpx
@@ -25,179 +25,174 @@ logger = logging.getLogger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:35b-a3b") OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:35b-a3b")
SDK_URL = os.getenv("SDK_URL", "http://ai-compliance-sdk:8090")
QDRANT_URL = os.getenv("QDRANT_INTERNAL_URL", "http://bp-core-qdrant:6333")
# Document type → Regulation keywords for RAG filtering # Document type → SQL filter keywords for canonical_controls
DOC_TYPE_REGULATIONS = { DOC_TYPE_FILTERS = {
"dse": ["DSGVO Art. 13", "DSGVO Art. 14", "Datenschutzinformation", "Informationspflicht"], "dse": {
"cookie": ["TDDDG §25", "ePrivacy", "Cookie", "Einwilligung Cookie"], "category": "data_protection",
"impressum": ["TMG §5", "MStV §18", "Impressum", "Anbieterkennzeichnung"], "keywords": ["informationspflicht", "datenschutzerkl", "art. 13", "art. 14",
"widerruf": ["BGB §355", "BGB §312g", "Widerrufsrecht", "Widerrufsbelehrung"], "betroffenenrecht", "verantwortlich", "datenschutzbeauftrag"],
"agb": ["BGB §305", "BGB §307", "BGB §309", "AGB", "Allgemeine Geschaeftsbedingungen"], },
"dsfa": ["DSGVO Art. 35", "Datenschutz-Folgenabschaetzung", "DSFA", "Risikoanalyse"], "cookie": {
"avv": ["DSGVO Art. 28", "Auftragsverarbeitung", "AVV"], "category": "data_protection",
"loeschkonzept": ["DSGVO Art. 5", "DIN 66398", "Loeschkonzept", "Aufbewahrungsfrist"], "keywords": ["cookie", "einwilligung", "tracking", "consent"],
},
"impressum": {
"category": "compliance",
"keywords": ["impressum", "anbieterkennzeichnung", "telemedien", "tmg"],
},
"widerruf": {
"category": "compliance",
"keywords": ["widerruf", "verbraucher", "fernabsatz"],
},
"agb": {
"category": "compliance",
"keywords": ["geschäftsbedingung", "agb", "vertragsklausel"],
},
} }
async def check_document_with_controls(
    text: str,
    doc_type: str,
    doc_title: str,
    db_session,
    max_controls: int = 10,
) -> list[dict]:
    """Check a document against canonical controls selected for its type.

    Args:
        text: Full document text. Texts shorter than 100 characters are not
            checked at all.
        doc_type: Key into ``DOC_TYPE_FILTERS``; unknown types fall back to
            the ``"dse"`` filter set.
        doc_title: Used in log messages only.
        db_session: Passed through unchanged to ``_query_controls``.
        max_controls: Upper bound on the number of controls queried.

    Returns:
        One result dict per control for which ``_verify_control`` produced a
        result. Empty when the text is too short or no controls were found.
    """
    if not text or len(text) < 100:
        return []

    filters = DOC_TYPE_FILTERS.get(doc_type, DOC_TYPE_FILTERS.get("dse", {}))
    category = filters.get("category", "data_protection")
    keywords = filters.get("keywords", [])

    # Query relevant controls from DB
    controls = _query_controls(db_session, category, keywords, max_controls)
    if not controls:
        logger.info("No canonical controls found for '%s' (%s)", doc_title, doc_type)
        return []

    logger.info("Found %d canonical controls for '%s' (%s)", len(controls), doc_title, doc_type)

    # Verify each control against the document text. Controls are independent
    # of each other, so a failure on one row must not throw away the results
    # already collected for the others.
    results = []
    for control in controls:
        try:
            check_result = await _verify_control(text, control)
        except Exception as e:
            logger.warning(
                "Control verification failed for %s: %s %s",
                control.get("id"), type(e).__name__, e,
            )
            continue
        if check_result:
            results.append(check_result)

    return results
def _query_controls(db_session, category: str, keywords: list[str], limit: int) -> list[dict]:
    """Query canonical_controls by category + title keywords.

    Args:
        db_session: Object whose ``execute(query, params)`` runs the statement
            and yields rows indexable by position.
        category: Exact value matched against the ``category`` column.
        keywords: Substrings matched case-insensitively (``ILIKE``) against
            ``title``; a row qualifies when any keyword matches.
        limit: Maximum number of rows, highest ``risk_score`` first.

    Returns:
        A list of dicts with the keys ``id`` (as str), ``title``,
        ``objective``, ``test_procedure``, ``severity`` and ``category``.
        Empty when there are no keywords or the query fails.
    """
    # With no keywords the filter below would render as "AND ()", which is
    # invalid SQL. Answer directly instead of sending a doomed statement.
    if not keywords:
        return []

    from sqlalchemy import text

    # Only the placeholder names are interpolated into the SQL string; the
    # keyword values themselves always travel as bound parameters.
    keyword_clauses = " OR ".join(f"title ILIKE :kw{i}" for i, _ in enumerate(keywords))
    params = {f"kw{i}": f"%{kw}%" for i, kw in enumerate(keywords)}
    params["cat"] = category
    params["limit"] = limit

    query = text(f"""
        SELECT id, title, objective, test_procedure, severity, category
        FROM compliance.canonical_controls
        WHERE category = :cat
          AND release_state != 'deleted'
          AND ({keyword_clauses})
        ORDER BY risk_score DESC NULLS LAST
        LIMIT :limit
    """)

    try:
        rows = db_session.execute(query, params)
        return [
            {
                "id": str(row[0]),
                "title": row[1],
                "objective": row[2],
                "test_procedure": row[3],
                "severity": row[4],
                "category": row[5],
            }
            for row in rows
        ]
    except Exception as e:
        # Best effort: the caller treats "no controls" as "nothing to check".
        logger.warning("Control query failed: %s", e)
        return []
async def _verify_control(text: str, control: dict) -> Optional[dict]:
    """Verify if a control's test_procedure is fulfilled by the document text.

    A cheap regex pre-check runs first: phrases following a checking verb in
    the test procedure are looked up in the document. On a hit the control is
    reported as passed without calling the LLM; otherwise the decision is
    delegated to ``_llm_verify``.

    Args:
        text: Document text to check.
        control: Control dict; ``title`` and ``id`` are required,
            ``test_procedure``, ``severity`` and ``category`` are optional.

    Returns:
        A result dict (``id``, ``label``, ``passed``, ``severity``,
        ``matched_text``, ``control_text``, ``regulation``), whatever
        ``_llm_verify`` returns on a regex miss, or ``None`` when the control
        has no test procedure.
    """
    title = control["title"]
    test_proc = control.get("test_procedure", "[]")

    # test_procedure may be a JSON string, plain text, or already decoded.
    try:
        procedures = _json.loads(test_proc) if isinstance(test_proc, str) else test_proc
    except ValueError:
        procedures = [test_proc] if test_proc else []

    # A scalar or object would otherwise be iterated character-wise or
    # key-wise below and could not be sliced later on; treat it as one step.
    if procedures and not isinstance(procedures, (list, tuple)):
        procedures = [procedures]
    if not procedures:
        return None

    # Quick regex pre-check — extract keywords from test procedure
    proc_text = " ".join(str(p) for p in procedures).lower()
    doc_lower = text.lower()
    key_terms = re.findall(r'\b(?:prüf|überprüf|kontroll|verifiz|feststell|validier)\w*\s+(?:ob|dass|der|die|das)\s+(\w+(?:\s+\w+){0,3})', proc_text)

    # Quote evidence from the original text so its casing is preserved. If
    # lowercasing changed the length, offsets no longer line up, so fall back
    # to the lowercased copy.
    evidence_source = text if len(doc_lower) == len(text) else doc_lower

    for term in key_terms:
        idx = doc_lower.find(term)
        if idx == -1:
            continue
        evidence = evidence_source[max(0, idx - 20):idx + len(term) + 20]
        return {
            "id": f"ctrl-{control['id'][:8]}",
            "label": title[:80],
            "passed": True,
            # The key may be present with a None value, so .get's default
            # alone is not enough.
            "severity": str(control.get("severity") or "medium").upper(),
            "matched_text": evidence[:100],
            "control_text": title,
            "regulation": control.get("category", ""),
        }

    # LLM verification for cases regex can't handle
    return await _llm_verify(text, title, procedures, control)
async def _llm_verify(text: str, title: str, procedures: list, control: dict) -> Optional[dict]:
"""Ask LLM if control requirements are met."""
proc_str = "\n".join(f"- {p}" for p in procedures[:5])
# Truncate document
if len(text) > 6000:
doc_excerpt = text[:4000] + "\n...\n" + text[-2000:]
else: else:
doc_excerpt = document_text doc_excerpt = text
prompt = ( prompt = (
f"Pruefe ob der folgende Dokumenttext die Anforderung erfuellt.\n\n" f"/no_think\n"
f"ANFORDERUNG ({regulation}):\n{control_text[:500]}\n\n" f"Pruefe ob das Dokument die folgenden Anforderungen erfuellt.\n\n"
f"DOKUMENTTEXT:\n{doc_excerpt}\n\n" f"CONTROL: {title}\n"
f"Antworte NUR mit JSON (kein anderer Text):\n" f"PRUEFSCHRITTE:\n{proc_str}\n\n"
f'{{"fulfilled": true/false, "evidence": "gefundene Textstelle (max 100 Zeichen)", ' f"DOKUMENT (Auszug):\n{doc_excerpt[:3000]}\n\n"
f'"issue": "was fehlt oder falsch ist (leer wenn fulfilled)", ' f'Antworte NUR mit JSON: {{"fulfilled": true/false, "evidence": "textstelle max 80 zeichen"}}'
f'"severity": "HIGH/MEDIUM/LOW"}}'
) )
try: try:
async with httpx.AsyncClient(timeout=120.0) as client: async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post(f"{OLLAMA_URL}/api/generate", json={ resp = await client.post(f"{OLLAMA_URL}/api/generate", json={
"model": OLLAMA_MODEL, "model": OLLAMA_MODEL,
"prompt": "/no_think\n" + prompt, # Disable thinking mode "prompt": prompt,
"stream": False, "stream": False,
"options": {"num_predict": 300}, "options": {"num_predict": 300},
}) })
@@ -206,62 +201,41 @@ async def _verify_control_with_llm(
return None return None
data = resp.json() data = resp.json()
# Qwen 3.5 may return content in 'response' or 'thinking' field raw = data.get("response", "") or data.get("thinking", "")
raw = data.get("response", "").strip()
if not raw:
raw = data.get("thinking", "").strip()
# Strip think tags if present
raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip() raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
# Parse JSON response — handle LLM quirks # Parse JSON
import json
# Find JSON in response
json_match = re.search(r"\{[^{}]+\}", raw) json_match = re.search(r"\{[^{}]+\}", raw)
if not json_match: if json_match:
# Fallback: try to extract fulfilled/evidence from raw text
fulfilled = "true" in raw.lower()[:100] or "yes" in raw.lower()[:100] or "erfüllt" in raw.lower()[:100]
return {
"id": f"rag-{hash(control_text) % 10000}",
"label": f"{regulation}: {control_text[:80]}...",
"passed": fulfilled,
"severity": "LOW" if fulfilled else "MEDIUM",
"matched_text": raw[:100] if fulfilled else "",
"issue": "" if fulfilled else raw[:100],
"control_text": control_text[:200],
"regulation": regulation,
}
json_str = json_match.group() json_str = json_match.group()
# Fix common LLM JSON issues json_str = re.sub(r'(?<=[{,])\s*(\w+)\s*:', r' "\1":', json_str)
json_str = re.sub(r'(?<=[{,])\s*(\w+)\s*:', r' "\1":', json_str) # Unquoted keys
json_str = json_str.replace("True", "true").replace("False", "false") json_str = json_str.replace("True", "true").replace("False", "false")
try: try:
result = json.loads(json_str) result = _json.loads(json_str)
except json.JSONDecodeError:
# Last resort: extract boolean from raw text
fulfilled = "true" in json_str.lower() or "fulfilled" in raw.lower()[:200]
return { return {
"id": f"rag-{hash(control_text) % 10000}", "id": f"ctrl-{control['id'][:8]}",
"label": f"{regulation}: {control_text[:80]}...", "label": title[:80],
"passed": fulfilled,
"severity": "LOW" if fulfilled else "MEDIUM",
"matched_text": "",
"issue": "",
"control_text": control_text[:200],
"regulation": regulation,
}
return {
"id": f"rag-{hash(control_text) % 10000}",
"label": f"{regulation}: {control_text[:80]}...",
"passed": result.get("fulfilled", False), "passed": result.get("fulfilled", False),
"severity": result.get("severity", "MEDIUM"), "severity": control.get("severity", "medium").upper(),
"matched_text": result.get("evidence", ""), "matched_text": result.get("evidence", "")[:100],
"issue": result.get("issue", ""), "control_text": title,
"control_text": control_text[:200], "regulation": control.get("category", ""),
"regulation": regulation, }
except _json.JSONDecodeError:
pass
# Fallback
fulfilled = "true" in raw.lower()[:200] or "fulfilled" in raw.lower()[:200]
return {
"id": f"ctrl-{control['id'][:8]}",
"label": title[:80],
"passed": fulfilled,
"severity": control.get("severity", "medium").upper(),
"matched_text": "",
"control_text": title,
"regulation": control.get("category", ""),
} }
except Exception as e: except Exception as e:
logger.warning("LLM verification failed: %s %s", type(e).__name__, e) logger.warning("LLM control verify failed: %s %s", type(e).__name__, e)
return None return None