diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py
index 1e9db57..51975e1 100644
--- a/backend-compliance/compliance/api/agent_doc_check_routes.py
+++ b/backend-compliance/compliance/api/agent_doc_check_routes.py
@@ -202,7 +202,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
     all_results: list[DocCheckResult] = []
 
     # Main document check (full text against primary type)
-    main_result = _run_checklist(doc_text, entry.doc_type, entry.label, entry.url, word_count)
+    main_result = await _run_checklist(doc_text, entry.doc_type, entry.label, entry.url, word_count)
 
     # Control Library deep check — DISABLED until doc-check-specific
     # Master Controls with binary pass/fail criteria are available.
@@ -215,7 +215,7 @@ async def _check_single_document(entry: DocCheckEntry) -> list[DocCheckResult]:
 
         for section in sections:
             if section["word_count"] < 100:
                 continue
-            sub_result = _run_checklist(
+            sub_result = await _run_checklist(
                 section["text"], section["doc_type"],
                 section["title"], entry.url, section["word_count"],
@@ -232,8 +232,8 @@
     )]
 
 
-def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: int = 0) -> DocCheckResult:
-    """Run checklist against text and return structured result."""
+async def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: int = 0) -> DocCheckResult:
+    """Run checklist against text, then LLM-verify failed checks."""
     findings = check_document_completeness(text, doc_type, label, url)
 
     all_checks: list[CheckItem] = []
@@ -253,6 +253,29 @@ def _run_checklist(text: str, doc_type: str, label: str, url: str, word_count: i
 
     completeness = f.get("completeness_pct", 0)
     correctness = f.get("correctness_pct", 0)
+    # LLM verification: re-check regex FAILs to eliminate false positives
+    failed = [c for c in all_checks if not c.passed and not c.skipped and c.hint]
+    if failed:
+        try:
+            from compliance.services.doc_checks.llm_verify import verify_failed_checks
+            overturns = await verify_failed_checks(
+                text,
+                [{"id": c.id, "label": c.label, "hint": c.hint} for c in failed],
+                label,
+            )
+            for c in all_checks:
+                if c.id in overturns and overturns[c.id]["overturned"]:
+                    c.passed = True
+                    c.matched_text = f"[LLM] {overturns[c.id]['evidence']}"
+                    logger.info("LLM overturned: %s in %s", c.label, label)
+            # Recompute correctness after overturns
+            l2_active = [c for c in all_checks if c.level == 2 and not c.skipped]
+            l2_passed = sum(1 for c in l2_active if c.passed)
+            if l2_active:
+                correctness = round(l2_passed / len(l2_active) * 100)
+        except Exception as e:
+            logger.warning("LLM verification skipped: %s", e)
+
     non_score = [f for f in findings if "SCORE" not in f.get("code", "")]
     return DocCheckResult(
         label=label, url=url, doc_type=doc_type,
@@ -315,6 +338,7 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
             "word_count": len(sec_text.split()),
         })
 
+    prev_blank = False
    for line in lines:
         stripped = line.strip()
         is_heading = (
@@ -322,6 +346,10 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
             and not stripped.endswith(".")
             and not stripped.endswith(",")
             and stripped[0].isupper()
+            # Require preceding blank line OR line > 15 chars to avoid
+            # table column headers ("Funktion", "Speicherdauer") being
+            # treated as section headings
+            and (prev_blank or len(stripped) > 15)
         )
         is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS
 
@@ -334,6 +362,8 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
         else:
             current_text.append(line)
 
+        prev_blank = len(stripped) == 0
+
     # Last section
     if current_heading:
         _save_section(current_heading, current_text)
diff --git a/backend-compliance/compliance/services/doc_checks/dsfa_checks.py b/backend-compliance/compliance/services/doc_checks/dsfa_checks.py
index c736bb7..6415c14 100644
--- a/backend-compliance/compliance/services/doc_checks/dsfa_checks.py
+++ b/backend-compliance/compliance/services/doc_checks/dsfa_checks.py
@@ -233,6 +233,9 @@ DSFA_CHECKLIST = [
             r"(?:dokument|ergebnis|bericht).*(?:dsfa|folgenabsch(?:ae|ä)tzung)",
             r"(?:ergebnis|schlussfolgerung|bewertung).*(?:risiko|verarbeitung)",
             r"vorliegend.*(?:dsfa|analyse|bewertung|absch(?:ae|ä)tzung)",
+            r"\d\.\)\s*ergebnis",
+            r"(?:risiko|gefahr).*(?:gering|mittel|hoch).*(?:einstufen|bewerten|einsch(?:ae|ä)tz)",
+            r"(?:gering|mittel|hoch).*(?:einzustufen|zu\s+bewerten)",
         ],
         "severity": "MEDIUM",
         "hint": "Die Ergebnisse der DSFA sind nicht zusammenfassend dokumentiert. Erstellen Sie einen Ergebnisabschnitt, der die Schlussfolgerungen der Folgenabschaetzung und die Gesamtbewertung des Restrisikos festhält.",
diff --git a/backend-compliance/compliance/services/doc_checks/llm_verify.py b/backend-compliance/compliance/services/doc_checks/llm_verify.py
new file mode 100644
index 0000000..58b4325
--- /dev/null
+++ b/backend-compliance/compliance/services/doc_checks/llm_verify.py
@@ -0,0 +1,128 @@
+"""
+LLM verification for regex check results.
+
+When a regex check FAILs, the LLM re-checks the original text
+to confirm or overturn the finding. This eliminates false positives
+caused by regex limitations (unusual formatting, synonyms, etc.).
+
+Uses the self-hosted Ollama endpoint (Qwen) for fast local inference.
+"""
+
+import logging
+import os
+import httpx
+
+logger = logging.getLogger(__name__)
+
+OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
+OLLAMA_MODEL = os.getenv("OLLAMA_VERIFY_MODEL", "qwen3:32b")
+TIMEOUT = 30.0
+
+
+async def verify_failed_checks(
+    text: str,
+    failed_checks: list[dict],
+    doc_title: str,
+) -> dict[str, dict]:
+    """Verify regex FAIL results using the LLM.
+
+    For each failed check, asks the LLM a binary YES/NO question.
+    Returns a dict mapping check_id -> {"overturned": bool, "evidence": str}.
+
+    Only checks with a "hint" field are verified (hints contain the
+    natural-language question the LLM can answer).
+    """
+    results: dict[str, dict] = {}
+
+    if not failed_checks:
+        return results
+
+    # Truncate text to fit the context window
+    text_excerpt = text[:8000]
+
+    for check in failed_checks:
+        check_id = check.get("id", "")
+        label = check.get("label", "")
+        hint = check.get("hint", "")
+
+        if not hint:
+            continue
+
+        try:
+            answer = await _ask_llm(text_excerpt, label, hint, doc_title)
+            overturned = answer.get("found", False)
+            results[check_id] = {
+                "overturned": overturned,
+                "evidence": answer.get("evidence", ""),
+            }
+            if overturned:
+                logger.info(
+                    "LLM overturned regex FAIL for '%s' in '%s': %s",
+                    label, doc_title, answer.get("evidence", "")[:80],
+                )
+        except Exception as e:
+            logger.warning("LLM verify failed for '%s': %s", label, e)
+
+    return results
+
+
+async def _ask_llm(
+    text: str, check_label: str, hint: str, doc_title: str,
+) -> dict:
+    """Ask the LLM a binary verification question."""
+    prompt = f"""/no_think
+Pruefe ob der folgende Dokumenttext die Anforderung erfuellt.
+
+ANFORDERUNG: {check_label}
+DETAILS: {hint}
+DOKUMENT: "{doc_title}"
+
+TEXT:
+{text}
+
+Antworte NUR mit einem JSON-Objekt (keine Erklaerung):
+{{"found": true/false, "evidence": "Zitat aus dem Text das die Anforderung belegt (max 100 Zeichen), oder leer wenn nicht gefunden"}}
+"""
+
+    async with httpx.AsyncClient(timeout=TIMEOUT) as client:
+        resp = await client.post(
+            f"{OLLAMA_URL}/api/generate",
+            json={
+                "model": OLLAMA_MODEL,
+                "prompt": prompt,
+                "stream": False,
+                "options": {"temperature": 0.0, "num_predict": 200},
+            },
+        )
+        resp.raise_for_status()
+        raw = resp.json().get("response", "")
+
+    return _parse_llm_response(raw)
+
+
+def _parse_llm_response(raw: str) -> dict:
+    """Parse LLM JSON response with fallback extraction."""
+    import json
+    import re
+
+    # Normalize, then isolate the JSON object before parsing
+    raw = raw.strip()
+    # Extract JSON from markdown code blocks, if the model wrapped it
+    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
+    if m:
+        raw = m.group(1)
+    # Or just find the JSON object
+    m = re.search(r"\{[^}]*\"found\"[^}]*\}", raw, re.DOTALL)
+    if m:
+        raw = m.group(0)
+
+    try:
+        data = json.loads(raw)
+        return {
+            "found": bool(data.get("found", False)),
+            "evidence": str(data.get("evidence", ""))[:150],
+        }
+    except (json.JSONDecodeError, ValueError):
+        # Fallback: look for "found": true/false in the raw text
+        found = '"found": true' in raw.lower() or '"found":true' in raw.lower()
+        return {"found": found, "evidence": ""}
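
Reviewer note, not part of the patch: below is a minimal smoke-test sketch for the new `verify_failed_checks` entry point. It assumes the `compliance` package is importable and that a local Ollama instance is serving the model named in `OLLAMA_VERIFY_MODEL`; the sample document, check id, label, and hint are hypothetical, but the dict shape mirrors what `_run_checklist` sends.

```python
# Hypothetical smoke test -- assumes a reachable Ollama endpoint and that
# `compliance` is importable; the sample document and check are made up.
import asyncio

from compliance.services.doc_checks.llm_verify import verify_failed_checks

SAMPLE_TEXT = (
    "1.) Ergebnis\n"
    "Das Restrisiko der Verarbeitung ist als gering einzustufen."
)

# One dict per failed CheckItem, mirroring what _run_checklist builds.
FAILED_CHECKS = [
    {
        "id": "dsfa_ergebnis",  # hypothetical check id
        "label": "Ergebnis der DSFA dokumentiert",
        "hint": (
            "Gibt es einen Abschnitt, der das Ergebnis der "
            "Folgenabschaetzung und das Restrisiko zusammenfasst?"
        ),
    },
]


async def main() -> None:
    overturns = await verify_failed_checks(SAMPLE_TEXT, FAILED_CHECKS, "Test-DSFA")
    for check_id, verdict in overturns.items():
        status = "overturned" if verdict["overturned"] else "FAIL confirmed"
        print(f"{check_id}: {status} (evidence: {verdict['evidence']!r})")


if __name__ == "__main__":
    asyncio.run(main())
```

Because `verify_failed_checks` catches exceptions per check and `_run_checklist` wraps the whole verification step in its own try/except, an unreachable Ollama endpoint degrades gracefully: the regex FAILs simply stand and the request still succeeds.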