feat: Integrate 1.874 Master Controls into document checking
Rewrote rag_document_checker.py to use the doc_check_controls table instead of the generic canonical_controls.

Each Master Control (MC) has:
- check_question: binary YES/NO question for the LLM
- pass_criteria: JSONB list of concrete requirements
- fail_criteria: JSONB list of common mistakes

Flow: Regex checks (fast) → LLM verifies FAILs → MC deep check (15 per doc).
MC results appear as additional L2 checks in the report.

Coverage: 571 DSE, 381 Cookie, 309 Loeschkonzept, 153 Widerruf, 147 DSFA, 125 AVV, 113 AGB, 75 Impressum = 1.874 total.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -283,10 +283,23 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
|
|||||||
# Main document check (full text against primary type)
|
# Main document check (full text against primary type)
|
||||||
main_result = await _run_checklist(doc_text, entry.doc_type, entry.label, entry.url, word_count)
|
main_result = await _run_checklist(doc_text, entry.doc_type, entry.label, entry.url, word_count)
|
||||||
|
|
||||||
# Control Library deep check — DISABLED until doc-check-specific
|
# Master Control deep check — 1.874 doc_check_controls with
|
||||||
# Master Controls with binary pass/fail criteria are available.
|
# binary pass/fail criteria verified by LLM (Qwen)
|
||||||
# See: zeroclaw/INSTRUCTION-master-controls-for-doc-check.md
|
try:
|
||||||
# Code: compliance/services/rag_document_checker.py (ready to re-enable)
|
from compliance.services.rag_document_checker import check_document_with_controls
|
||||||
|
mc_results = await check_document_with_controls(
|
||||||
|
doc_text, entry.doc_type, entry.label, max_controls=15,
|
||||||
|
)
|
||||||
|
if mc_results:
|
||||||
|
# Add MC results as additional checks to the main result
|
||||||
|
for mc in mc_results:
|
||||||
|
main_result.checks.append(CheckItem(**mc))
|
||||||
|
# Recompute correctness with MC results
|
||||||
|
l2 = [c for c in main_result.checks if c.level == 2 and not c.skipped]
|
||||||
|
l2_passed = sum(1 for c in l2 if c.passed)
|
||||||
|
main_result.correctness_pct = round(l2_passed / len(l2) * 100) if l2 else 0
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("MC check skipped: %s", e)
|
||||||
|
|
||||||
all_results.append(main_result)
|
all_results.append(main_result)
|
||||||
|
|
||||||
|
|||||||
@@ -1,16 +1,17 @@
|
|||||||
"""
|
"""
|
||||||
Document Checker with Canonical Controls — SQL-based verification.
|
Document Checker with Master Controls — SQL-based deep verification.
|
||||||
|
|
||||||
Uses canonical_controls from PostgreSQL (not Qdrant) with:
|
Uses doc_check_controls from PostgreSQL with:
|
||||||
- test_procedure: specific check instructions
|
- check_question: binary YES/NO question
|
||||||
- pass_criteria / evidence: what to look for
|
- pass_criteria: JSONB list of concrete must-haves
|
||||||
- Regex pre-check (fast) + LLM verification (semantic, for regex misses)
|
- fail_criteria: JSONB list of common mistakes
|
||||||
|
- LLM (Qwen) verifies each control against document text
|
||||||
|
|
||||||
Flow:
|
Flow:
|
||||||
Document text + type
|
Document text + doc_type
|
||||||
→ SQL query for relevant controls by category + title keywords
|
→ SQL query: SELECT <columns> FROM compliance.doc_check_controls WHERE doc_type = $1 ORDER BY severity DESC, title LIMIT $2 (only the top max_controls rows are checked)
|
||||||
→ For each control: check test_procedure against document text
|
→ For each control: LLM answers check_question with pass/fail criteria
|
||||||
→ LLM verifies if control requirements are met
|
→ Returns structured results compatible with CheckItem format
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
@@ -26,33 +27,22 @@ logger = logging.getLogger(__name__)
|
|||||||
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||||
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:35b-a3b")
|
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:35b-a3b")
|
||||||
|
|
||||||
# Document type → SQL filter keywords for canonical_controls
|
# Map our doc_types to the DB doc_type values
|
||||||
DOC_TYPE_FILTERS = {
|
DOC_TYPE_MAP = {
|
||||||
"dse": {
|
"dse": "dse",
|
||||||
"category": "data_protection",
|
"datenschutz": "dse",
|
||||||
"keywords": ["informationspflicht"],
|
"privacy": "dse",
|
||||||
"test_proc_must_contain": ["datenschutzerkl", "informationspflicht", "art. 13", "art. 14"],
|
"cookie": "cookie",
|
||||||
},
|
"impressum": "impressum",
|
||||||
"cookie": {
|
"imprint": "impressum",
|
||||||
"category": "data_protection",
|
"widerruf": "widerruf",
|
||||||
"keywords": ["cookie", "einwilligung"],
|
"withdrawal": "widerruf",
|
||||||
"test_proc_must_contain": ["cookie", "einwilligung", "consent", "banner"],
|
"agb": "agb",
|
||||||
},
|
"terms": "agb",
|
||||||
"impressum": {
|
"dsfa": "dsfa",
|
||||||
"category": "compliance",
|
"social_media": "dse",
|
||||||
"keywords": ["impressum", "anbieterkennzeichnung"],
|
"avv": "avv",
|
||||||
"test_proc_must_contain": ["impressum", "anbieterkennzeichnung"],
|
"loeschkonzept": "loeschkonzept",
|
||||||
},
|
|
||||||
"widerruf": {
|
|
||||||
"category": "compliance",
|
|
||||||
"keywords": ["widerruf", "verbraucher"],
|
|
||||||
"test_proc_must_contain": ["widerruf", "fernabsatz"],
|
|
||||||
},
|
|
||||||
"agb": {
|
|
||||||
"category": "compliance",
|
|
||||||
"keywords": ["geschäftsbedingung", "agb"],
|
|
||||||
"test_proc_must_contain": ["geschäftsbedingung", "agb", "vertragsbedingung"],
|
|
||||||
},
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@@ -60,199 +50,152 @@ async def check_document_with_controls(
|
|||||||
text: str,
|
text: str,
|
||||||
doc_type: str,
|
doc_type: str,
|
||||||
doc_title: str,
|
doc_title: str,
|
||||||
db_session,
|
db_url: str = "",
|
||||||
max_controls: int = 10,
|
max_controls: int = 20,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Check document against relevant canonical controls from DB."""
|
"""Check document against doc_check_controls from DB.
|
||||||
|
|
||||||
|
Returns list of CheckItem-compatible dicts.
|
||||||
|
"""
|
||||||
if not text or len(text) < 100:
|
if not text or len(text) < 100:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
filters = DOC_TYPE_FILTERS.get(doc_type, DOC_TYPE_FILTERS.get("dse", {}))
|
mapped_type = DOC_TYPE_MAP.get(doc_type, doc_type)
|
||||||
category = filters.get("category", "data_protection")
|
|
||||||
keywords = filters.get("keywords", [])
|
|
||||||
|
|
||||||
# Query relevant controls from DB
|
try:
|
||||||
test_proc_kw = filters.get("test_proc_must_contain")
|
import asyncpg
|
||||||
controls = _query_controls(db_session, category, keywords, max_controls, test_proc_kw)
|
db = db_url or os.getenv(
|
||||||
if not controls:
|
"DATABASE_URL",
|
||||||
logger.info("No canonical controls found for '%s' (%s)", doc_title, doc_type)
|
"postgresql://breakpilot:breakpilot@bp-core-postgres:5432/breakpilot",
|
||||||
|
)
|
||||||
|
conn = await asyncpg.connect(db)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("DB connection failed: %s", e)
|
||||||
return []
|
return []
|
||||||
|
|
||||||
logger.info("Found %d canonical controls for '%s' (%s)", len(controls), doc_title, doc_type)
|
try:
|
||||||
|
rows = await conn.fetch(
|
||||||
|
"""SELECT id, control_id, title, regulation, check_question,
|
||||||
|
pass_criteria, fail_criteria, severity
|
||||||
|
FROM compliance.doc_check_controls
|
||||||
|
WHERE doc_type = $1
|
||||||
|
ORDER BY severity DESC, title
|
||||||
|
LIMIT $2""",
|
||||||
|
mapped_type, max_controls,
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("MC query failed: %s", e)
|
||||||
|
await conn.close()
|
||||||
|
return []
|
||||||
|
|
||||||
|
await conn.close()
|
||||||
|
|
||||||
|
if not rows:
|
||||||
|
logger.info("No MCs for doc_type '%s' (%s)", mapped_type, doc_title)
|
||||||
|
return []
|
||||||
|
|
||||||
|
logger.info("Checking %d MCs for '%s' (%s)", len(rows), doc_title, mapped_type)
|
||||||
|
|
||||||
# Verify each control against document text
|
|
||||||
results = []
|
results = []
|
||||||
for control in controls:
|
for row in rows:
|
||||||
check_result = await _verify_control(text, control)
|
result = await _verify_mc(text, dict(row))
|
||||||
if check_result:
|
if result:
|
||||||
results.append(check_result)
|
results.append(result)
|
||||||
|
|
||||||
|
passed = sum(1 for r in results if r["passed"])
|
||||||
|
logger.info("MC results: %d/%d passed for '%s'", passed, len(results), doc_title)
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
def _query_controls(db_session, category: str, keywords: list[str], limit: int,
|
async def _verify_mc(text: str, mc: dict) -> Optional[dict]:
|
||||||
test_proc_keywords: list[str] | None = None) -> list[dict]:
|
"""Verify one master control against document text via LLM."""
|
||||||
"""Query canonical_controls by category + title + test_procedure keywords."""
|
question = mc.get("check_question", "")
|
||||||
from sqlalchemy import text
|
if not question:
|
||||||
|
|
||||||
# Build keyword filter for title
|
|
||||||
keyword_clauses = " OR ".join([f"title ILIKE :kw{i}" for i in range(len(keywords))])
|
|
||||||
params = {f"kw{i}": f"%{kw}%" for i, kw in enumerate(keywords)}
|
|
||||||
|
|
||||||
# Build test_procedure filter (ensures controls are relevant to document type)
|
|
||||||
proc_filter = ""
|
|
||||||
if test_proc_keywords:
|
|
||||||
proc_clauses = " OR ".join([f"test_procedure::text ILIKE :tp{i}" for i in range(len(test_proc_keywords))])
|
|
||||||
for i, tp in enumerate(test_proc_keywords):
|
|
||||||
params[f"tp{i}"] = f"%{tp}%"
|
|
||||||
proc_filter = f"AND ({proc_clauses})"
|
|
||||||
|
|
||||||
params["cat"] = category
|
|
||||||
params["limit"] = limit
|
|
||||||
|
|
||||||
query = text(f"""
|
|
||||||
SELECT id, title, objective, test_procedure, severity, category
|
|
||||||
FROM compliance.canonical_controls
|
|
||||||
WHERE category = :cat
|
|
||||||
AND release_state != 'deleted'
|
|
||||||
AND ({keyword_clauses})
|
|
||||||
{proc_filter}
|
|
||||||
AND test_procedure::text != '[]'
|
|
||||||
ORDER BY risk_score DESC NULLS LAST
|
|
||||||
LIMIT :limit
|
|
||||||
""")
|
|
||||||
|
|
||||||
try:
|
|
||||||
result = db_session.execute(query, params)
|
|
||||||
controls = []
|
|
||||||
for row in result:
|
|
||||||
controls.append({
|
|
||||||
"id": str(row[0]),
|
|
||||||
"title": row[1],
|
|
||||||
"objective": row[2],
|
|
||||||
"test_procedure": row[3],
|
|
||||||
"severity": row[4],
|
|
||||||
"category": row[5],
|
|
||||||
})
|
|
||||||
return controls
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning("Control query failed: %s", e)
|
|
||||||
return []
|
|
||||||
|
|
||||||
|
|
||||||
async def _verify_control(text: str, control: dict) -> Optional[dict]:
|
|
||||||
"""Verify if a control's test_procedure is fulfilled by the document text."""
|
|
||||||
title = control["title"]
|
|
||||||
test_proc = control.get("test_procedure", "[]")
|
|
||||||
|
|
||||||
# Parse test_procedure JSON
|
|
||||||
try:
|
|
||||||
procedures = _json.loads(test_proc) if isinstance(test_proc, str) else test_proc
|
|
||||||
except Exception:
|
|
||||||
procedures = [test_proc] if test_proc else []
|
|
||||||
|
|
||||||
if not procedures:
|
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# Quick regex pre-check — extract keywords from test procedure
|
pass_crit = mc.get("pass_criteria", [])
|
||||||
proc_text = " ".join(str(p) for p in procedures).lower()
|
fail_crit = mc.get("fail_criteria", [])
|
||||||
doc_lower = text.lower()
|
|
||||||
|
|
||||||
# Extract key terms from procedure
|
# Parse JSON if string
|
||||||
key_terms = re.findall(r'\b(?:prüf|überprüf|kontroll|verifiz|feststell|validier)\w*\s+(?:ob|dass|der|die|das)\s+(\w+(?:\s+\w+){0,3})', proc_text)
|
if isinstance(pass_crit, str):
|
||||||
|
try:
|
||||||
|
pass_crit = _json.loads(pass_crit)
|
||||||
|
except Exception:
|
||||||
|
pass_crit = [pass_crit]
|
||||||
|
if isinstance(fail_crit, str):
|
||||||
|
try:
|
||||||
|
fail_crit = _json.loads(fail_crit)
|
||||||
|
except Exception:
|
||||||
|
fail_crit = [fail_crit]
|
||||||
|
|
||||||
# If we can find key terms via regex, skip LLM
|
pass_str = "\n".join(f" - {p}" for p in pass_crit[:5])
|
||||||
regex_found = False
|
fail_str = "\n".join(f" - {f}" for f in fail_crit[:5])
|
||||||
evidence = ""
|
|
||||||
for term in key_terms:
|
|
||||||
if term in doc_lower:
|
|
||||||
idx = doc_lower.find(term)
|
|
||||||
evidence = doc_lower[max(0, idx-20):idx+len(term)+20]
|
|
||||||
regex_found = True
|
|
||||||
break
|
|
||||||
|
|
||||||
if regex_found:
|
# Truncate text
|
||||||
return {
|
doc_excerpt = text[:6000] if len(text) <= 8000 else text[:4000] + "\n...\n" + text[-2000:]
|
||||||
"id": f"ctrl-{control['id'][:8]}",
|
|
||||||
"label": title[:80],
|
|
||||||
"passed": True,
|
|
||||||
"severity": control.get("severity", "medium").upper(),
|
|
||||||
"matched_text": evidence[:100],
|
|
||||||
"control_text": title,
|
|
||||||
"regulation": control.get("category", ""),
|
|
||||||
}
|
|
||||||
|
|
||||||
# LLM verification for cases regex can't handle
|
|
||||||
return await _llm_verify(text, title, procedures, control)
|
|
||||||
|
|
||||||
|
|
||||||
async def _llm_verify(text: str, title: str, procedures: list, control: dict) -> Optional[dict]:
|
|
||||||
"""Ask LLM if control requirements are met."""
|
|
||||||
proc_str = "\n".join(f"- {p}" for p in procedures[:5])
|
|
||||||
|
|
||||||
# Truncate document
|
|
||||||
if len(text) > 6000:
|
|
||||||
doc_excerpt = text[:4000] + "\n...\n" + text[-2000:]
|
|
||||||
else:
|
|
||||||
doc_excerpt = text
|
|
||||||
|
|
||||||
prompt = (
|
prompt = (
|
||||||
f"/no_think\n"
|
f"/no_think\n"
|
||||||
f"Pruefe ob das Dokument die folgenden Anforderungen erfuellt.\n\n"
|
f"FRAGE: {question}\n\n"
|
||||||
f"CONTROL: {title}\n"
|
f"PASS wenn ALLE zutreffen:\n{pass_str}\n\n"
|
||||||
f"PRUEFSCHRITTE:\n{proc_str}\n\n"
|
f"FAIL wenn EINES zutrifft:\n{fail_str}\n\n"
|
||||||
f"DOKUMENT (Auszug):\n{doc_excerpt[:3000]}\n\n"
|
f"DOKUMENT:\n{doc_excerpt[:5000]}\n\n"
|
||||||
f'Antworte NUR mit JSON: {{"fulfilled": true/false, "evidence": "textstelle max 80 zeichen"}}'
|
f'Antworte NUR mit JSON: {{"passed": true/false, "evidence": "Textstelle max 80 Zeichen oder leer"}}'
|
||||||
)
|
)
|
||||||
|
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=90.0) as client:
|
async with httpx.AsyncClient(timeout=30.0) as client:
|
||||||
resp = await client.post(f"{OLLAMA_URL}/api/generate", json={
|
resp = await client.post(f"{OLLAMA_URL}/api/generate", json={
|
||||||
"model": OLLAMA_MODEL,
|
"model": OLLAMA_MODEL,
|
||||||
"prompt": prompt,
|
"prompt": prompt,
|
||||||
"stream": False,
|
"stream": False,
|
||||||
"options": {"num_predict": 300},
|
"options": {"temperature": 0.0, "num_predict": 200},
|
||||||
})
|
})
|
||||||
|
|
||||||
if resp.status_code != 200:
|
if resp.status_code != 200:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
data = resp.json()
|
raw = resp.json().get("response", "")
|
||||||
raw = data.get("response", "") or data.get("thinking", "")
|
|
||||||
raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
|
raw = re.sub(r"<think>.*?</think>", "", raw, flags=re.DOTALL).strip()
|
||||||
|
|
||||||
# Parse JSON
|
# Parse JSON
|
||||||
json_match = re.search(r"\{[^{}]+\}", raw)
|
json_match = re.search(r"\{[^{}]+\}", raw)
|
||||||
if json_match:
|
if json_match:
|
||||||
json_str = json_match.group()
|
json_str = json_match.group()
|
||||||
json_str = re.sub(r'(?<=[{,])\s*(\w+)\s*:', r' "\1":', json_str)
|
json_str = re.sub(r"(?<=[{,])\s*(\w+)\s*:", r' "\1":', json_str)
|
||||||
json_str = json_str.replace("True", "true").replace("False", "false")
|
json_str = json_str.replace("True", "true").replace("False", "false")
|
||||||
try:
|
try:
|
||||||
result = _json.loads(json_str)
|
result = _json.loads(json_str)
|
||||||
return {
|
return {
|
||||||
"id": f"ctrl-{control['id'][:8]}",
|
"id": f"mc-{mc.get('control_id', mc['id'][:8])}",
|
||||||
"label": title[:80],
|
"label": mc["title"][:80],
|
||||||
"passed": result.get("fulfilled", False),
|
"passed": bool(result.get("passed", False)),
|
||||||
"severity": control.get("severity", "medium").upper(),
|
"severity": (mc.get("severity") or "MEDIUM").upper(),
|
||||||
"matched_text": result.get("evidence", "")[:100],
|
"matched_text": str(result.get("evidence", ""))[:100],
|
||||||
"control_text": title,
|
"level": 2,
|
||||||
"regulation": control.get("category", ""),
|
"parent": None,
|
||||||
|
"skipped": False,
|
||||||
|
"hint": question,
|
||||||
|
"source": "master_control",
|
||||||
}
|
}
|
||||||
except _json.JSONDecodeError:
|
except _json.JSONDecodeError:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
# Fallback
|
# Fallback
|
||||||
fulfilled = "true" in raw.lower()[:200] or "fulfilled" in raw.lower()[:200]
|
passed = '"passed": true' in raw.lower() or '"passed":true' in raw.lower()
|
||||||
return {
|
return {
|
||||||
"id": f"ctrl-{control['id'][:8]}",
|
"id": f"mc-{mc.get('control_id', mc['id'][:8])}",
|
||||||
"label": title[:80],
|
"label": mc["title"][:80],
|
||||||
"passed": fulfilled,
|
"passed": passed,
|
||||||
"severity": control.get("severity", "medium").upper(),
|
"severity": (mc.get("severity") or "MEDIUM").upper(),
|
||||||
"matched_text": "",
|
"matched_text": "",
|
||||||
"control_text": title,
|
"level": 2,
|
||||||
"regulation": control.get("category", ""),
|
"parent": None,
|
||||||
|
"skipped": False,
|
||||||
|
"hint": question,
|
||||||
|
"source": "master_control",
|
||||||
}
|
}
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("LLM control verify failed: %s %s", type(e).__name__, e)
|
logger.warning("MC verify failed for '%s': %s", mc["title"][:40], e)
|
||||||
return None
|
return None
|
||||||
|
|||||||
Reference in New Issue
Block a user