"""
Document Checker with Canonical Controls — SQL-based verification.

Uses canonical_controls from PostgreSQL (not Qdrant) with:
- test_procedure: specific check instructions
- pass_criteria / evidence: what to look for
- Regex pre-check (fast) + LLM verification (semantic, for regex misses)

Flow:
  Document text + type
  → SQL query for relevant controls by category + title keywords
  → For each control: check test_procedure against document text
  → LLM verifies if control requirements are met
"""

import logging
import os
import re
import json as _json
from typing import Optional

import httpx

logger = logging.getLogger(__name__)

OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("OLLAMA_MODEL", "qwen3.5:35b-a3b")

# Document type → SQL filter keywords for canonical_controls
DOC_TYPE_FILTERS = {
    "dse": {
        "category": "data_protection",
        "keywords": [
            "informationspflicht",
            "datenschutzerkl",
            "art. 13",
            "art. 14",
            "betroffenenrecht",
            "verantwortlich",
            "datenschutzbeauftrag",
        ],
    },
    "cookie": {
        "category": "data_protection",
        "keywords": ["cookie", "einwilligung", "tracking", "consent"],
    },
    "impressum": {
        "category": "compliance",
        "keywords": ["impressum", "anbieterkennzeichnung", "telemedien", "tmg"],
    },
    "widerruf": {
        "category": "compliance",
        "keywords": ["widerruf", "verbraucher", "fernabsatz"],
    },
    "agb": {
        "category": "compliance",
        "keywords": ["geschäftsbedingung", "agb", "vertragsklausel"],
    },
}

# Pulls the phrase (1-4 words) that follows a German "check whether/that ..."
# verb in a test procedure; these phrases drive the cheap regex pre-check.
_KEY_TERM_RE = re.compile(
    r'\b(?:prüf|überprüf|kontroll|verifiz|feststell|validier)\w*\s+(?:ob|dass|der|die|das)\s+(\w+(?:\s+\w+){0,3})'
)
# Reasoning block that some models emit in front of the actual answer.
_THINK_BLOCK_RE = re.compile(r"<think>.*?</think>", re.DOTALL)
# First flat (non-nested) JSON object in the model reply.
_JSON_OBJECT_RE = re.compile(r"\{[^{}]+\}")
# Explicit verdict inside a reply whose JSON could not be parsed.
_FULFILLED_VALUE_RE = re.compile(r'"?fulfilled"?\s*[:=]\s*"?(true|false)')

# Size budget of the document excerpt embedded in the LLM prompt.
_MAX_DOC_CHARS = 3000
_EXCERPT_HEAD_CHARS = 2000
_EXCERPT_SEPARATOR = "\n...\n"


async def check_document_with_controls(
    text: str,
    doc_type: str,
    doc_title: str,
    db_session,
    max_controls: int = 10,
) -> list[dict]:
    """Check document against relevant canonical controls from DB.

    Args:
        text: Full document text. Documents shorter than 100 characters are
            not checked.
        doc_type: Key into ``DOC_TYPE_FILTERS``; unknown types fall back to
            the ``"dse"`` filters.
        doc_title: Only used in log messages.
        db_session: Session object exposing ``execute(query, params)``.
        max_controls: Upper bound on the number of controls fetched.

    Returns:
        One result dict per control that could be evaluated (keys ``id``,
        ``label``, ``passed``, ``severity``, ``matched_text``,
        ``control_text``, ``regulation``). Controls whose check yields no
        result are omitted.
    """
    if not text or len(text) < 100:
        return []

    filters = DOC_TYPE_FILTERS.get(doc_type, DOC_TYPE_FILTERS.get("dse", {}))
    category = filters.get("category", "data_protection")
    keywords = filters.get("keywords", [])

    # Query relevant controls from DB.
    # NOTE: _query_controls is a plain synchronous call made from this coroutine.
    controls = _query_controls(db_session, category, keywords, max_controls)
    if not controls:
        logger.info("No canonical controls found for '%s' (%s)", doc_title, doc_type)
        return []

    logger.info(
        "Found %d canonical controls for '%s' (%s)", len(controls), doc_title, doc_type
    )

    # Verify each control against document text, one after another.
    results = []
    for control in controls:
        check_result = await _verify_control(text, control)
        if check_result:
            results.append(check_result)
    return results


def _query_controls(db_session, category: str, keywords: list[str], limit: int) -> list[dict]:
    """Query canonical_controls by category + title keywords.

    Returns a list of dicts with the keys ``id``, ``title``, ``objective``,
    ``test_procedure``, ``severity`` and ``category``. Any failure while
    querying is logged and yields an empty list (best effort).
    """
    if not keywords:
        # Without keywords the filter would render as "AND ()", which is
        # invalid SQL. That statement used to fail and end up as [] anyway;
        # return the same result without sending a broken query.
        logger.debug("No title keywords given for category '%s'", category)
        return []

    from sqlalchemy import text

    # Only generated placeholder names (:kw0, :kw1, ...) are interpolated into
    # the SQL string; every value is passed as a bound parameter.
    params = {"cat": category, "limit": limit}
    clauses = []
    for i, kw in enumerate(keywords):
        clauses.append(f"title ILIKE :kw{i}")
        params[f"kw{i}"] = f"%{kw}%"
    keyword_clauses = " OR ".join(clauses)

    query = text(f"""
        SELECT id, title, objective, test_procedure, severity, category
        FROM compliance.canonical_controls
        WHERE category = :cat
          AND release_state != 'deleted'
          AND ({keyword_clauses})
        ORDER BY risk_score DESC NULLS LAST
        LIMIT :limit
    """)

    try:
        result = db_session.execute(query, params)
        return [
            {
                "id": str(row[0]),
                "title": row[1],
                "objective": row[2],
                "test_procedure": row[3],
                "severity": row[4],
                "category": row[5],
            }
            for row in result
        ]
    except Exception as e:  # best-effort boundary: a failed lookup means "no controls"
        logger.warning("Control query failed: %s", e)
        return []


def _parse_procedures(raw) -> list:
    """Normalise a ``test_procedure`` value into a list of steps.

    Accepts a JSON string, an already decoded value, or plain text. A scalar
    or mapping becomes a one-element list, so callers can always iterate and
    slice the result; empty or missing values give ``[]``.
    """
    parsed = raw
    if isinstance(raw, str):
        try:
            parsed = _json.loads(raw)
        except (ValueError, RecursionError):
            parsed = raw  # not JSON: treat the text itself as the single step
    if not parsed:
        return []
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return [parsed]


def _build_result(control: dict, title: str, passed: bool, matched_text: str) -> dict:
    """Assemble the result dict shared by the regex and the LLM path."""
    return {
        "id": f"ctrl-{str(control.get('id', ''))[:8]}",
        "label": title[:80],
        "passed": passed,
        # The severity value may be None, so a .get() default is not enough.
        "severity": str(control.get("severity") or "medium").upper(),
        "matched_text": matched_text[:100],
        "control_text": title,
        "regulation": control.get("category", ""),
    }


async def _verify_control(text: str, control: dict) -> Optional[dict]:
    """Verify if a control's test_procedure is fulfilled by the document text.

    Returns ``None`` when the control has no usable test procedure or the LLM
    check produced no result; otherwise a result dict.
    """
    title = str(control.get("title") or "")
    procedures = _parse_procedures(control.get("test_procedure", "[]"))
    if not procedures:
        return None

    # Quick regex pre-check: if a key phrase of the procedure occurs verbatim
    # in the document, the control counts as met and the LLM call is skipped.
    proc_text = " ".join(str(p) for p in procedures).lower()
    doc_lower = text.lower()
    for term in _KEY_TERM_RE.findall(proc_text):
        idx = doc_lower.find(term)
        if idx != -1:
            evidence = doc_lower[max(0, idx - 20):idx + len(term) + 20]
            return _build_result(control, title, True, evidence)

    # LLM verification for cases regex can't handle
    return await _llm_verify(text, title, procedures, control)


def _excerpt(text: str) -> str:
    """Return at most ``_MAX_DOC_CHARS`` characters: document head plus tail."""
    if len(text) <= _MAX_DOC_CHARS:
        return text
    tail_chars = _MAX_DOC_CHARS - _EXCERPT_HEAD_CHARS - len(_EXCERPT_SEPARATOR)
    return text[:_EXCERPT_HEAD_CHARS] + _EXCERPT_SEPARATOR + text[-tail_chars:]


def _as_bool(value) -> bool:
    """Coerce a model-supplied verdict to bool; the string "false" is False."""
    if isinstance(value, str):
        return value.strip().lower() in {"true", "yes", "ja", "1"}
    return bool(value)


def _interpret_llm_reply(raw: str) -> tuple[bool, str]:
    """Extract ``(fulfilled, evidence)`` from the raw model reply.

    Tries the first flat JSON object as-is, then a repaired variant (bare
    keys quoted, Python-style booleans lowered). If neither parses, falls
    back to scanning the first 200 characters for an explicit verdict.
    """
    cleaned = _THINK_BLOCK_RE.sub("", raw).strip()

    json_match = _JSON_OBJECT_RE.search(cleaned)
    if json_match:
        candidate = json_match.group()
        repaired = re.sub(r'(?<=[{,])\s*(\w+)\s*:', r' "\1":', candidate)
        repaired = repaired.replace("True", "true").replace("False", "false")
        # Strict parse first: the repair step can damage JSON that is already
        # valid (e.g. evidence text containing ", word:").
        for attempt in (candidate, repaired):
            try:
                parsed = _json.loads(attempt)
            except ValueError:
                continue
            return _as_bool(parsed.get("fulfilled", False)), str(parsed.get("evidence") or "")

    # Fallback: the mere presence of the key "fulfilled" says nothing, so
    # read the value that follows it.
    head = cleaned.lower()[:200]
    explicit = _FULFILLED_VALUE_RE.search(head)
    if explicit:
        return explicit.group(1) == "true", ""
    return ("true" in head and "false" not in head), ""


async def _llm_verify(text: str, title: str, procedures: list, control: dict) -> Optional[dict]:
    """Ask LLM if control requirements are met.

    Sends the control title, up to five test steps and a document excerpt to
    the Ollama ``/api/generate`` endpoint. Returns a result dict, or ``None``
    when the request fails or answers with a non-200 status.
    """
    proc_str = "\n".join(f"- {p}" for p in procedures[:5])
    doc_excerpt = _excerpt(text)

    prompt = (
        f"/no_think\n"
        f"Pruefe ob das Dokument die folgenden Anforderungen erfuellt.\n\n"
        f"CONTROL: {title}\n"
        f"PRUEFSCHRITTE:\n{proc_str}\n\n"
        f"DOKUMENT (Auszug):\n{doc_excerpt}\n\n"
        f'Antworte NUR mit JSON: {{"fulfilled": true/false, "evidence": "textstelle max 80 zeichen"}}'
    )

    try:
        async with httpx.AsyncClient(timeout=90.0) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/generate", json={
                "model": OLLAMA_MODEL,
                "prompt": prompt,
                "stream": False,
                "options": {"num_predict": 300},
            })

        if resp.status_code != 200:
            logger.warning("LLM control verify failed: HTTP %s", resp.status_code)
            return None

        data = resp.json()
        raw = data.get("response", "") or data.get("thinking", "")
        fulfilled, evidence = _interpret_llm_reply(str(raw or ""))
        return _build_result(control, title, fulfilled, evidence)
    except Exception as e:  # best-effort boundary: an LLM failure must not abort the check
        logger.warning("LLM control verify failed: %s %s", type(e).__name__, e)
        return None