b87c27d104
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / loc-budget (push) Successful in 21s
CI / go-lint (push) Has been skipped
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 18s
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 40s
Bug: qwen3.5:35b-a3b liefert mit format='json' + Batch-Prompt leere
Strings zurueck ('LLM batch: empty response from model'). Im echten
Compliance-Check lief der LLM-Verifier deshalb wirkungslos —
False-Positive-Findings wie 'Vorstand nicht erkannt' (BMW: Klammer-
Liste) wurden nicht overturned.
Fix: Default auf qwen3:30b-a3b umgestellt. Verifiziert mit BMW-
Impressum-Text: representative_person wird mit Evidence 'Milan
Nedeljkovic, Vorsitzender' overturned=True markiert.
OLLAMA_VERIFY_MODEL Env-Var bleibt als Override-Moeglichkeit.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
191 lines
6.2 KiB
Python
191 lines
6.2 KiB
Python
"""
LLM verification for regex check results.

When a regex check FAILs, the LLM re-checks the original text
to confirm or overturn the finding. This eliminates false positives
caused by regex limitations (unusual formatting, synonyms, etc.).

Uses the self-hosted Ollama endpoint (Qwen) for fast local inference.
"""
|
|
|
|
import logging
|
|
import os
|
|
import httpx
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Base URL of the Ollama server; override with the OLLAMA_URL env var.
OLLAMA_URL = os.environ.get("OLLAMA_URL", "http://host.docker.internal:11434")

# P13: qwen3:30b-a3b returns reliable JSON answers in batch mode, whereas
# qwen3.5:35b-a3b returned empty strings with format='json' + a long prompt.
# Override with the OLLAMA_VERIFY_MODEL env var.
OLLAMA_MODEL = os.environ.get("OLLAMA_VERIFY_MODEL", "qwen3:30b-a3b")

# NOTE(review): presumably a request timeout in seconds; it is not referenced
# by the code visible in this file (the HTTP call passes its own timeout).
TIMEOUT = 30.0
|
|
|
|
|
|
async def verify_failed_checks(
    text: str,
    failed_checks: list[dict],
    doc_title: str,
) -> dict[str, dict]:
    """Verify regex FAIL results using the LLM — single batched call.

    Sends ALL failed checks that carry a ``hint`` in one LLM prompt instead
    of one call per check.

    Args:
        text: Document text to verify against. Only the first 8000
            characters are sent to the model.
        failed_checks: Failed check dicts. Entries without a truthy
            ``"hint"`` are skipped.
        doc_title: Document title, used in the prompt and in log messages.

    Returns:
        A dict mapping check_id -> {"overturned": bool, "evidence": str}.
        The dict is empty when there is nothing to verify (no checks with
        hints, or no document text) and may be empty or partial when the
        LLM call or the processing of its answer fails — this function is
        best-effort and logs a warning instead of raising in that case.
    """
    results: dict[str, dict] = {}

    checks_with_hints = [c for c in failed_checks or [] if c.get("hint")]
    if not checks_with_hints:
        return results

    # Without document text the model has nothing to confirm a finding
    # against, so skip the (slow) LLM round trip. This also keeps a None
    # text from raising outside the best-effort try block below.
    if not text or not text.strip():
        return results

    # Truncate text to fit context window
    text_excerpt = text[:8000]

    # Only ids that were actually submitted may appear in the result; the
    # model occasionally invents ids, which must not be reported as overturned.
    known_ids = {str(c.get("id")) for c in checks_with_hints}

    try:
        batch_results = await _ask_llm_batch(
            text_excerpt, checks_with_hints, doc_title,
        )
        for check_id, answer in batch_results.items():
            if str(check_id) not in known_ids:
                continue
            overturned = answer.get("found", False)
            evidence = answer.get("evidence", "")
            results[check_id] = {
                "overturned": overturned,
                "evidence": evidence,
            }
            if overturned:
                logger.info(
                    "LLM overturned regex FAIL for '%s' in '%s': %s",
                    check_id, doc_title, evidence[:80],
                )
    except Exception as e:
        # Deliberate best-effort boundary: a failing verifier must not break
        # the surrounding check run; the regex verdicts simply stay as-is.
        logger.warning("LLM batch verify failed for '%s': %s", doc_title, e)

    return results
|
|
|
|
|
|
async def _ask_llm_batch(
    text: str, checks: list[dict], doc_title: str,
) -> dict[str, dict]:
    """Ask the LLM to verify ALL failed checks in a single call.

    Builds one numbered requirement line per check (id, label and the first
    120 characters of the hint), posts a system + user message pair to
    Ollama's /api/chat with format='json', and hands the returned message
    content to ``_parse_batch_response``.

    The /api/chat + format='json' combination is used because the earlier
    /api/generate + free-text approach was often wrapped by qwen3 in
    <think>...</think> reasoning tokens.

    Raises:
        KeyError: if a check has no ``"id"`` or ``"label"`` entry.
        httpx.HTTPStatusError: if Ollama answers with an error status
            (via ``raise_for_status``).
    """
    requirement_lines = [
        f'{pos}. ID="{check["id"]}" | {check["label"]} | {check.get("hint", "")[:120]}'
        for pos, check in enumerate(checks, 1)
    ]
    checklist_str = "\n".join(requirement_lines)

    # The prompts are German on purpose; they are runtime text, not comments.
    system_prompt = "".join((
        "Du pruefst ob ein Dokument bestimmte Pflichtangaben enthaelt. ",
        "Antworte AUSSCHLIESSLICH mit einem JSON-Objekt: ",
        '{"results": [{"id": "<check-id>", "found": true|false, ',
        '"evidence": "<kurzes Zitat oder leer>"}]}. ',
        "Keine Erklaerungen, keine Reasoning-Tags, kein Markdown.",
    ))
    user_prompt = "\n\n".join((
        f'DOKUMENT: "{doc_title}"',
        f"ANFORDERUNGEN:\n{checklist_str}",
        f"TEXT:\n{text}",
    ))

    request_body = {
        "model": OLLAMA_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        "stream": False,
        # Ask Ollama for JSON-formatted output.
        "format": "json",
        "options": {"temperature": 0.0, "num_predict": 3000},
    }
    endpoint = f"{OLLAMA_URL}/api/chat"

    async with httpx.AsyncClient(timeout=120.0) as http:
        response = await http.post(endpoint, json=request_body)
        response.raise_for_status()
        # "message" may be absent or null; fall back to an empty dict.
        message = response.json().get("message") or {}
        raw = message.get("content", "")

    return _parse_batch_response(raw, checks)
|
|
|
|
|
|
def _parse_batch_response(raw: str, checks: list[dict]) -> dict[str, dict]:
|
|
"""Parse batch LLM response. Tolerates <think>…</think> wrappers,
|
|
code-fences, and either {results: [...]} or top-level [...]."""
|
|
import json
|
|
import re
|
|
|
|
results: dict[str, dict] = {}
|
|
if not raw:
|
|
logger.info("LLM batch: empty response from model")
|
|
return results
|
|
|
|
text = raw.strip()
|
|
# Strip qwen3 thinking tags
|
|
text = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
|
|
# Strip markdown code fences
|
|
m = re.search(r"```(?:json)?\s*(.+?)\s*```", text, re.DOTALL)
|
|
if m:
|
|
text = m.group(1).strip()
|
|
|
|
# Try parse as-is
|
|
parsed = None
|
|
try:
|
|
parsed = json.loads(text)
|
|
except (json.JSONDecodeError, ValueError):
|
|
# Try finding the first JSON object or array in the text
|
|
for pattern in (r"\{.*\}", r"\[.*\]"):
|
|
mm = re.search(pattern, text, re.DOTALL)
|
|
if mm:
|
|
try:
|
|
parsed = json.loads(mm.group(0))
|
|
break
|
|
except (json.JSONDecodeError, ValueError):
|
|
continue
|
|
|
|
if parsed is None:
|
|
logger.info(
|
|
"LLM batch: 0/%d checks parsed (raw head: %r)",
|
|
len(checks), raw[:120],
|
|
)
|
|
return results
|
|
|
|
# Accept both {"results": [...]} (preferred) and bare list
|
|
items = None
|
|
if isinstance(parsed, dict):
|
|
for key in ("results", "checks", "items", "verifications"):
|
|
if isinstance(parsed.get(key), list):
|
|
items = parsed[key]
|
|
break
|
|
elif isinstance(parsed, list):
|
|
items = parsed
|
|
|
|
if not items:
|
|
# Final fallback: regex over individual id/found pairs
|
|
for mm in re.finditer(
|
|
r'\{[^}]*"id"\s*:\s*"([^"]+)"[^}]*"found"\s*:\s*(true|false)[^}]*\}',
|
|
raw, re.DOTALL,
|
|
):
|
|
results[mm.group(1)] = {
|
|
"found": mm.group(2) == "true", "evidence": "",
|
|
}
|
|
logger.info("LLM batch: %d/%d checks parsed (regex fallback)",
|
|
len(results), len(checks))
|
|
return results
|
|
|
|
for item in items:
|
|
if not isinstance(item, dict):
|
|
continue
|
|
cid = item.get("id", "")
|
|
if not cid:
|
|
continue
|
|
results[cid] = {
|
|
"found": bool(item.get("found", False)),
|
|
"evidence": str(item.get("evidence", ""))[:150],
|
|
}
|
|
|
|
logger.info("LLM batch: %d/%d checks parsed", len(results), len(checks))
|
|
return results
|