fa45b5793c
Complete rewrite of rag_document_checker.py: - Queries canonical_controls table (294K controls, 10K data_protection) - Filters by category + title keywords per document type - Uses test_procedure field as actual check instructions - Regex pre-check extracts key terms from procedure → fast match - LLM fallback only for regex misses (saves tokens) - /no_think prefix for direct JSON output SQL approach advantages: - Structured data with test_procedure, pass_criteria, fail_criteria - Category filtering (data_protection, compliance, governance) - No Qdrant API key issues - Controls are actual check criteria, not general legal texts Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
242 lines
8.0 KiB
Python
242 lines
8.0 KiB
Python
"""
|
|
Document Checker with Canonical Controls — SQL-based verification.

Uses canonical_controls from PostgreSQL (not Qdrant) with:
- test_procedure: specific check instructions
- pass_criteria / evidence: what to look for
  (note: the query in this module currently selects only test_procedure)
- Regex pre-check (fast) + LLM verification (semantic, for regex misses)

Flow:
    Document text + type
    → SQL query for relevant controls by category + title keywords
    → For each control: regex pre-check of key terms from test_procedure
      against the document text
    → Only if the regex pre-check finds no match: LLM verifies if the
      control requirements are met
|
|
"""
|
|
|
|
import logging
|
|
import os
|
|
import re
|
|
import json as _json
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
|
|
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Ollama endpoint and model name; each can be overridden through the
# environment variable of the same name.
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.environ.get("OLLAMA_MODEL", "qwen3.5:35b-a3b")
|
|
|
|
# Maps a document-type key to the filter used when selecting rows from
# canonical_controls:
#   "category" - exact value compared against the control's category
#   "keywords" - search terms matched against the control title
#                (case-insensitive substring match in the control query)
DOC_TYPE_FILTERS = {
    "dse": {
        "category": "data_protection",
        "keywords": [
            "informationspflicht",
            "datenschutzerkl",
            "art. 13",
            "art. 14",
            "betroffenenrecht",
            "verantwortlich",
            "datenschutzbeauftrag",
        ],
    },
    "cookie": {
        "category": "data_protection",
        "keywords": ["cookie", "einwilligung", "tracking", "consent"],
    },
    "impressum": {
        "category": "compliance",
        "keywords": ["impressum", "anbieterkennzeichnung", "telemedien", "tmg"],
    },
    "widerruf": {
        "category": "compliance",
        "keywords": ["widerruf", "verbraucher", "fernabsatz"],
    },
    "agb": {
        "category": "compliance",
        "keywords": ["geschäftsbedingung", "agb", "vertragsklausel"],
    },
}
|
|
|
|
|
|
async def check_document_with_controls(
    text: str,
    doc_type: str,
    doc_title: str,
    db_session,
    max_controls: int = 10,
) -> list[dict]:
    """Run the canonical controls that apply to a document against its text.

    Args:
        text: Full document text. Empty text or text shorter than 100
            characters is not checked.
        doc_type: Key into DOC_TYPE_FILTERS; unknown types use the "dse"
            filter set.
        doc_title: Document title, used only in log messages.
        db_session: Database session handed through to the control query.
        max_controls: Upper bound on the number of controls fetched.

    Returns:
        One result dict per control for which verification produced a
        result, in query order; an empty list when the text is too short
        or no controls were found.
    """
    # Guard: nothing meaningful can be verified on very short input.
    if not text or len(text) < 100:
        return []

    # Resolve the filter set, falling back to the "dse" entry.
    doc_filters = DOC_TYPE_FILTERS.get(doc_type, DOC_TYPE_FILTERS.get("dse", {}))

    candidates = _query_controls(
        db_session,
        doc_filters.get("category", "data_protection"),
        doc_filters.get("keywords", []),
        max_controls,
    )
    if not candidates:
        logger.info("No canonical controls found for '%s' (%s)", doc_title, doc_type)
        return []

    logger.info("Found %d canonical controls for '%s' (%s)", len(candidates), doc_title, doc_type)

    # Controls are verified one after another; falsy outcomes (no result)
    # are dropped.
    return [
        outcome
        for candidate in candidates
        if (outcome := await _verify_control(text, candidate))
    ]
|
|
|
|
|
|
def _query_controls(db_session, category: str, keywords: list[str], limit: int) -> list[dict]:
    """Query canonical_controls by category + title keywords.

    Selects non-deleted controls of the given category whose title contains
    at least one keyword (case-insensitive), ordered by risk_score
    descending with NULLs last.

    Args:
        db_session: Session object exposing ``execute(query, params)``.
        category: Exact category value to match.
        keywords: Title search terms; each is wrapped in ``%...%`` and
            bound as a parameter for ILIKE.
        limit: Maximum number of rows to return.

    Returns:
        A list of dicts with the keys "id" (as str), "title", "objective",
        "test_procedure", "severity" and "category". Returns an empty list
        when no keywords are given or when the query fails (the failure is
        logged as a warning).
    """
    # Without keywords the OR-group would render as "AND ()", which is
    # invalid SQL. The result was always an empty list in that case (via the
    # swallowed DB error); return it directly instead of sending a statement
    # that is guaranteed to fail.
    if not keywords:
        return []

    from sqlalchemy import text

    # Only placeholder names are interpolated into the SQL string; the
    # keyword values themselves are always passed as bound parameters.
    params = {"cat": category, "limit": limit}
    clauses = []
    for i, kw in enumerate(keywords):
        clauses.append(f"title ILIKE :kw{i}")
        params[f"kw{i}"] = f"%{kw}%"
    keyword_clauses = " OR ".join(clauses)

    query = text(f"""
        SELECT id, title, objective, test_procedure, severity, category
        FROM compliance.canonical_controls
        WHERE category = :cat
          AND release_state != 'deleted'
          AND ({keyword_clauses})
        ORDER BY risk_score DESC NULLS LAST
        LIMIT :limit
    """)

    try:
        rows = db_session.execute(query, params)
        return [
            {
                "id": str(row[0]),
                "title": row[1],
                "objective": row[2],
                "test_procedure": row[3],
                "severity": row[4],
                "category": row[5],
            }
            for row in rows
        ]
    except Exception as e:
        # Best-effort lookup: a failing query must not abort the document
        # check, so the error is logged and "no controls" is returned.
        # NOTE(review): the session is not rolled back here — confirm whether
        # the caller resets the transaction after a failed statement.
        logger.warning("Control query failed: %s", e)
        return []
|
|
|
|
|
|
# Captures up to four words that follow a "check/verify ... whether/that/the"
# phrase (German verb stems) in a lower-cased test procedure.
_KEY_TERM_RE = re.compile(
    r'\b(?:prüf|überprüf|kontroll|verifiz|feststell|validier)\w*\s+(?:ob|dass|der|die|das)\s+(\w+(?:\s+\w+){0,3})'
)


async def _verify_control(text: str, control: dict) -> Optional[dict]:
    """Verify if a control's test_procedure is fulfilled by the document text.

    Key terms are extracted from the test procedure with a regex. If one of
    them occurs in the lower-cased document, the control is reported as
    passed without calling the LLM; otherwise the decision is delegated to
    ``_llm_verify``.

    Args:
        text: Document text to check.
        control: Control dict; reads "id", "title" and, optionally,
            "test_procedure" (a JSON string or an already decoded value),
            "severity" and "category".

    Returns:
        A result dict with the keys "id", "label", "passed", "severity",
        "matched_text", "control_text" and "regulation"; ``None`` when the
        control has no test procedure or the LLM step yields no result.
    """
    title = control["title"]
    raw_procedure = control.get("test_procedure", "[]")

    # Decode the procedure; a string that is not valid JSON is treated as a
    # single plain-text step.
    if isinstance(raw_procedure, str):
        try:
            procedures = _json.loads(raw_procedure)
        except ValueError:
            procedures = [raw_procedure] if raw_procedure else []
    else:
        procedures = raw_procedure

    # A decoded scalar (e.g. a JSON string) or object is one step, not a
    # sequence: iterating a str would split it into single characters and
    # the key-term regex could never match.
    if not isinstance(procedures, (list, tuple)):
        procedures = [procedures] if procedures else []

    if not procedures:
        return None

    # Quick regex pre-check — extract key terms from the test procedure.
    proc_text = " ".join(str(p) for p in procedures).lower()
    doc_lower = text.lower()

    for term in _KEY_TERM_RE.findall(proc_text):
        idx = doc_lower.find(term)
        if idx == -1:
            continue
        # First matching term wins; keep 20 characters of context on each
        # side as evidence (taken from the lower-cased document).
        evidence = doc_lower[max(0, idx - 20):idx + len(term) + 20]
        return {
            "id": f"ctrl-{control['id'][:8]}",
            "label": title[:80],
            "passed": True,
            # "severity" may be present but None; `or` covers both the
            # missing key and the None value.
            "severity": str(control.get("severity") or "medium").upper(),
            "matched_text": evidence[:100],
            "control_text": title,
            "regulation": control.get("category", ""),
        }

    # LLM verification for cases regex can't handle.
    return await _llm_verify(text, title, procedures, control)
|
|
|
|
|
|
def _llm_doc_excerpt(text: str, budget: int = 3000) -> str:
    """Return text unchanged if it fits the budget, else its head and tail joined by an ellipsis line."""
    if len(text) <= budget:
        return text
    separator = "\n...\n"
    # Roughly two thirds head, one third tail.
    tail_len = (budget - len(separator)) // 3
    if tail_len <= 0:
        return text[:budget]
    head_len = budget - len(separator) - tail_len
    return text[:head_len] + separator + text[-tail_len:]


def _coerce_fulfilled(value) -> bool:
    """Map the model's "fulfilled" value to a bool, accepting textual true/false answers."""
    if isinstance(value, str):
        return value.strip().lower() in ("true", "yes", "ja", "1")
    return bool(value)


def _parse_llm_verdict(raw: str) -> tuple[bool, str]:
    """Extract (fulfilled, evidence) from the model's raw answer text."""
    json_match = re.search(r"\{[^{}]+\}", raw)
    if json_match:
        candidate = json_match.group()
        # Repair variant: quote bare keys and lower-case Python-style
        # booleans. It is tried only after the untouched text, because the
        # repair can corrupt valid JSON whose strings contain ", word:".
        repaired = re.sub(r'(?<=[{,])\s*(\w+)\s*:', r' "\1":', candidate)
        repaired = repaired.replace("True", "true").replace("False", "false")
        for attempt in (candidate, repaired):
            try:
                parsed = _json.loads(attempt)
            except ValueError:
                continue
            if isinstance(parsed, dict):
                evidence = parsed.get("evidence")
                return (
                    _coerce_fulfilled(parsed.get("fulfilled", False)),
                    "" if evidence is None else str(evidence),
                )

    # Fallback for unparseable answers: only the first 200 characters are
    # inspected. An explicit "fulfilled ... true/false" pair decides; the
    # bare word "fulfilled" is not treated as a pass because it is also
    # present in a negative answer.
    head = raw[:200].lower()
    explicit = re.search(r"fulfilled\W{0,5}(true|false)", head)
    if explicit:
        return explicit.group(1) == "true", ""
    has_true = re.search(r"\btrue\b", head) is not None
    has_false = re.search(r"\bfalse\b", head) is not None
    return has_true and not has_false, ""


async def _llm_verify(text: str, title: str, procedures: list, control: dict) -> Optional[dict]:
    """Ask LLM if control requirements are met.

    Sends the control title, up to five procedure steps and a document
    excerpt of at most 3000 characters (head and tail for longer documents)
    to the Ollama ``/api/generate`` endpoint and interprets the answer.

    Args:
        text: Full document text.
        title: Control title; also used as label and control_text.
        procedures: Procedure steps; only the first five are sent.
        control: Control dict; reads "id" and, optionally, "severity" and
            "category".

    Returns:
        A result dict with the keys "id", "label", "passed", "severity",
        "matched_text", "control_text" and "regulation"; ``None`` when the
        request does not return HTTP 200 or any error occurs (errors are
        logged as warnings, never raised).
    """
    proc_str = "\n".join(f"- {p}" for p in procedures[:5])

    # Keep the prompt at the 3000-character document budget, but include the
    # end of long documents as well instead of cutting the tail off.
    doc_excerpt = _llm_doc_excerpt(text)

    prompt = (
        f"/no_think\n"
        f"Pruefe ob das Dokument die folgenden Anforderungen erfuellt.\n\n"
        f"CONTROL: {title}\n"
        f"PRUEFSCHRITTE:\n{proc_str}\n\n"
        f"DOKUMENT (Auszug):\n{doc_excerpt}\n\n"
        f'Antworte NUR mit JSON: {{"fulfilled": true/false, "evidence": "textstelle max 80 zeichen"}}'
    )

    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/generate", json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "stream": False,
                "options": {"num_predict": 300},
            })

        if resp.status_code != 200:
            logger.warning("LLM control verify returned HTTP %s", resp.status_code)
            return None

        data = resp.json()
        # Use "thinking" when "response" is empty; tolerate both being None.
        raw = data.get("response") or data.get("thinking") or ""
        raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()

        passed, evidence = _parse_llm_verdict(raw)
        return {
            "id": f"ctrl-{control['id'][:8]}",
            "label": title[:80],
            "passed": passed,
            # "severity" may be present but None.
            "severity": str(control.get("severity") or "medium").upper(),
            "matched_text": evidence[:100],
            "control_text": title,
            "regulation": control.get("category", ""),
        }

    except Exception as e:
        # Best-effort: a failing LLM call must not abort the document check.
        logger.warning("LLM control verify failed: %s %s", type(e).__name__, e)
        return None
|