Files
breakpilot-compliance/backend-compliance/compliance/services/doc_checks/llm_verify.py
T
Benjamin Admin 8a44e67293 feat(compliance-check): unlock all 1874 MCs + close gap-table items
User: 'wir haben 1800 MCs erstellt um sie zu 10% zu nutzen — das ist
Schwachsinn'. Fixed all 6 gaps from the audit.

#1 max_controls=0 (was 20):
- agent_compliance_check_routes _check_single: passes max_controls=0 to
  check_document_with_controls -> ALL MCs evaluated per doc_type.
- 8 doc_types now use 1874 MCs instead of 160 (10x coverage).
- Regex matching is cheap (<1s per doc); LLM-enrich cap of 10 stays.

#2 LLM-verify fixed:
- llm_verify.py was getting 0/N parsed. Causes: qwen3 thinking-mode
  wrapped output in <think>...</think>, /api/generate doesn't enforce
  JSON, prompt didn't handle code-fence wrappers.
- Now uses /api/chat with format='json' (forces valid JSON).
- _parse_batch_response strips <think> tags, accepts {results:[...]}
  AND bare [...], adds richer regex-fallback parse, logs raw head on
  total parse failure for diagnosis.

#3 Loeschkonzept checklist (new):
- doc_checks/loeschkonzept_checks.py — 9 L1 + 7 L2 checks per DIN 66398
  + Art. 5(1)(e)/17/32 DSGVO: scope+responsibility, data categories,
  retention periods, legal basis refs (HGB/AO/BGB), deletion trigger,
  deletion process+technical+systems, deletion proof, exceptions +
  Art. 18 lock, review cycle, DSGVO references.
- runner.py registered for loeschkonzept/loeschung/loeschfristen.

#4 regulation backfill script:
- backend-compliance/scripts/backfill_mc_regulation.py — regex-detects
  DSGVO/TDDDG/TMG/BGB/HGB/AO/MStV/UWG/VSBG/PAngV/GwG/BDSG/EU-VO
  references in MC title+question+pass_criteria, UPDATEs regulation +
  article fields.
- Idempotent (only NULL rows), --dry-run flag, batched 200/UPDATE.
- Run inside container: docker exec bp-compliance-backend python3 \
    /app/scripts/backfill_mc_regulation.py

#5 MC alias-fallback:
- rag_document_checker._MC_ALIAS_FALLBACK maps doc_types without own
  MCs to a related set: nutzungsbedingungen->agb, social_media->dse,
  sub_processor/scc/tom_annex->avv, loeschfristen->loeschkonzept,
  eu_institution/dsb->dse.
- _load_controls retries with the alias when the primary query
  returns 0 rows.
- 14 additional doc_types now get MC coverage transparently.

#6 cross-domain auto-discovery:
- _autodiscover_missing builds a crawl plan: primary submitted base
  + up to 2 related domains sharing the owner SLD (e.g. BMW Group:
  bmw.de + bmwgroup.com + bmwgroup.jobs).
- Detection: regex over submitted texts for https?://...<owner>...
  hostnames distinct from the primary base.
- Each crawled base contributes documents + cmp_payloads to the
  discovery pool.

Net effect for BMW: 1874 MCs evaluated (90 from cookie alone, was
20), Loeschkonzept mandatory disclosures (Pflichtangaben) can now be
graded, LLM overturns false regex FAILs, and Joint-Controller policies
on bmwgroup.jobs (Social Media) are now discoverable. The same wins
will apply to the CRA-Compliance check.
2026-05-17 13:07:50 +02:00

189 lines
6.0 KiB
Python

"""
LLM verification for regex check results.
When a regex check FAILs, the LLM re-checks the original text
to confirm or overturn the finding. This eliminates false positives
caused by regex limitations (unusual formatting, synonyms, etc.).
Uses the self-hosted Ollama endpoint (Qwen) for fast local inference.
"""
import logging
import os
import httpx
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Base URL of the Ollama server; overridable via the OLLAMA_URL env var.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Model tag sent with each verification request; overridable via the
# OLLAMA_VERIFY_MODEL env var.
OLLAMA_MODEL = os.getenv("OLLAMA_VERIFY_MODEL", "qwen3.5:35b-a3b")
# NOTE(review): presumably seconds. Not referenced by the functions visible in
# this module -- the chat request passes its own 120.0 timeout to httpx.
# Confirm whether this constant is still needed.
TIMEOUT = 30.0
async def verify_failed_checks(
    text: str,
    failed_checks: list[dict],
    doc_title: str,
) -> dict[str, dict]:
    """Re-check regex FAIL results with the LLM in one batched request.

    Only checks that carry a non-empty ``hint`` are forwarded, and the
    document text is cut to its first 8000 characters before sending.

    Args:
        text: Full document text the regex checks ran against.
        failed_checks: Check dicts that failed; each is read via ``hint``.
        doc_title: Title of the document, used in the prompt and in logs.

    Returns:
        Mapping of check id to ``{"overturned": ..., "evidence": ...}``.
        Empty when no check has a hint; possibly partial when the LLM call
        or the result handling raised (the error is logged, not re-raised).
    """
    verdicts: dict[str, dict] = {}

    # Checks without a hint cannot be described to the model.
    hinted = list(filter(lambda chk: chk.get("hint"), failed_checks))
    if not hinted:
        return verdicts

    # Truncate text to fit context window
    excerpt = text[:8000]

    try:
        answers = await _ask_llm_batch(excerpt, hinted, doc_title)
        for cid, reply in answers.items():
            was_found = reply.get("found", False)
            verdicts[cid] = {
                "overturned": was_found,
                "evidence": reply.get("evidence", ""),
            }
            if not was_found:
                continue
            logger.info(
                "LLM overturned regex FAIL for '%s' in '%s': %s",
                cid, doc_title, reply.get("evidence", "")[:80],
            )
    except Exception as exc:
        # Best effort: a failed verification must not break the check run.
        logger.warning("LLM batch verify failed for '%s': %s", doc_title, exc)

    return verdicts
async def _ask_llm_batch(
    text: str, checks: list[dict], doc_title: str,
) -> dict[str, dict]:
    """Send every failed check to the LLM in one chat request.

    Posts to Ollama's ``/api/chat`` with ``format='json'`` and
    ``stream=False``, then hands the returned message content to
    ``_parse_batch_response``.

    Args:
        text: Document text (already truncated by the caller).
        checks: Check dicts; ``id`` and ``label`` are required, ``hint`` is
            optional and cut to 120 characters.
        doc_title: Title shown to the model.

    Returns:
        Whatever ``_parse_batch_response`` extracts from the model output.

    Raises:
        KeyError: If a check lacks ``id`` or ``label``.
        httpx.HTTPStatusError: If the server answers with an error status.
    """
    # One numbered line per requirement, starting at 1.
    requirements = "\n".join(
        f'{pos}. ID="{chk["id"]}" | {chk["label"]} | {chk.get("hint", "")[:120]}'
        for pos, chk in enumerate(checks, 1)
    )

    instructions = "".join((
        "Du pruefst ob ein Dokument bestimmte Pflichtangaben enthaelt. ",
        "Antworte AUSSCHLIESSLICH mit einem JSON-Objekt: ",
        '{"results": [{"id": "<check-id>", "found": true|false, ',
        '"evidence": "<kurzes Zitat oder leer>"}]}. ',
        "Keine Erklaerungen, keine Reasoning-Tags, kein Markdown.",
    ))
    question = "\n\n".join((
        f'DOKUMENT: "{doc_title}"',
        f"ANFORDERUNGEN:\n{requirements}",
        f"TEXT:\n{text}",
    ))

    request_body = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": instructions},
            {"role": "user", "content": question},
        ],
        "stream": False,
        "format": "json",  # forces valid JSON output
        "options": {"temperature": 0.0, "num_predict": 3000},
    }

    async with httpx.AsyncClient(timeout=120.0) as client:
        response = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
        response.raise_for_status()
        body = response.json()

    # "message" may be missing or null; treat both as an empty answer.
    message = body.get("message") or {}
    return _parse_batch_response(message.get("content", ""), checks)
def _parse_batch_response(raw: str, checks: list[dict]) -> dict[str, dict]:
    """Parse the batched LLM answer into per-check results.

    Tolerates ``<think>…</think>`` wrappers, markdown code fences, a
    ``{"results": [...]}`` envelope (``checks``, ``items`` and
    ``verifications`` are accepted as well), a bare top-level list, and a
    single bare result object. When no result list can be decoded -- including
    when the JSON is invalid or truncated -- individual id/found pairs are
    salvaged with a regex (their evidence is then empty).

    Args:
        raw: The model's message content.
        checks: The checks that were sent; only used for the "n/m parsed"
            log lines.

    Returns:
        Mapping of check id to ``{"found": bool, "evidence": str}`` with the
        evidence cut to 150 characters. Empty when nothing could be parsed.
    """
    import json
    import re

    def _as_bool(value: object) -> bool:
        # bool("false") is True, so a quoted verdict needs an explicit mapping;
        # otherwise "found": "false" would wrongly overturn a regex FAIL.
        if isinstance(value, str):
            return value.strip().lower() in ("true", "yes", "ja", "1")
        return bool(value)

    results: dict[str, dict] = {}
    if not raw:
        logger.info("LLM batch: empty response from model")
        return results

    # Strip qwen3 thinking tags
    cleaned = re.sub(r"<think>.*?</think>", "", raw.strip(), flags=re.DOTALL).strip()
    # Strip markdown code fences
    text = cleaned
    fenced = re.search(r"```(?:json)?\s*(.+?)\s*```", text, re.DOTALL)
    if fenced:
        text = fenced.group(1).strip()

    # Try parse as-is, then the first JSON object or array embedded in the text
    parsed = None
    try:
        parsed = json.loads(text)
    except (json.JSONDecodeError, ValueError):
        for pattern in (r"\{.*\}", r"\[.*\]"):
            candidate = re.search(pattern, text, re.DOTALL)
            if not candidate:
                continue
            try:
                parsed = json.loads(candidate.group(0))
                break
            except (json.JSONDecodeError, ValueError):
                continue

    # Accept {"results": [...]} (preferred), a bare list, or one bare object
    items = None
    if isinstance(parsed, dict):
        for key in ("results", "checks", "items", "verifications"):
            if isinstance(parsed.get(key), list):
                items = parsed[key]
                break
        else:
            if "id" in parsed and "found" in parsed:
                items = [parsed]
    elif isinstance(parsed, list):
        items = parsed

    if not items:
        # Final fallback: regex over individual id/found pairs. This also runs
        # when JSON decoding failed outright (e.g. output truncated by the
        # token limit), and it scans the think-stripped text so pairs inside
        # the model's reasoning are not mistaken for answers.
        for pair in re.finditer(
            r'\{[^}]*"id"\s*:\s*"([^"]+)"[^}]*"found"\s*:\s*(true|false)[^}]*\}',
            cleaned, re.DOTALL,
        ):
            results[pair.group(1)] = {
                "found": pair.group(2) == "true", "evidence": "",
            }
        if parsed is None and not results:
            # Total parse failure: log the raw head for diagnosis.
            logger.info(
                "LLM batch: 0/%d checks parsed (raw head: %r)",
                len(checks), raw[:120],
            )
        else:
            logger.info("LLM batch: %d/%d checks parsed (regex fallback)",
                        len(results), len(checks))
        return results

    for item in items:
        if not isinstance(item, dict):
            continue
        cid = item.get("id", "")
        # A list/object id cannot be a dict key; skip it instead of letting a
        # TypeError discard the whole batch.
        if not cid or not isinstance(cid, (str, int)):
            continue
        evidence = item.get("evidence")
        results[cid] = {
            "found": _as_bool(item.get("found", False)),
            # JSON null must become "", not the literal string "None".
            "evidence": "" if evidence is None else str(evidence)[:150],
        }
    logger.info("LLM batch: %d/%d checks parsed", len(results), len(checks))
    return results