Files
Benjamin Admin 6ed30dae5b feat(agent): MC scorecard + audit drill-down + tenant trend (A1-A6)
Now that all 1874 MCs run per check (Task #30 cap removal), the report
was about to drown in noise. This commit adds the full aggregation /
persistence / drill-down stack so each MC is actionable, not just
counted.

A1 mc_scorecard.py (new):
  build_scorecard(checks)    -> per-regulation PASS/FAIL/SKIP + severity
  top_fails(checks, n)       -> N most severe failed MCs
  full_audit_records(...)    -> flat rows ready for sidecar SQLite

A2 Email rendering:
  agent_doc_check_scorecard.py (new) builds an HTML scorecard table
  (regulation × passed/failed/HIGH/MEDIUM/score) shown at the top of
  the email. agent_doc_check_report._render_document now collapses
  the 500-MC L2 forest into 'X/Y bestanden (Z Fail)' summary plus
  a top-10 fails block per doc — old verbose render is gone.

A3 compliance_audit_log.py (new) — sidecar SQLite at
  /data/compliance_audits.db (separate from compliance Postgres
  schema to comply with the no-new-migrations rule in CLAUDE.md):
    check_runs(check_id, ts, tenant_id, site_name, base_domain,
               doc_count, scorecard json, vvt_summary json)
    mc_results(check_id, doc_type, mc_id, label, passed, skipped,
               severity, regulation, matched_text, hint)
  Route persists every run after the email is sent.
  docker-compose.yml adds compliance-audit volume + env.

A4 backfill_mc_regulation_llm.py (new) — Qwen-tagged backfill for
  the 1636 MCs the regex pass couldn't classify. Batches of 25,
  format=json, output constrained to the canonical regulation list.
  Run manually: docker exec bp-compliance-backend python3 \
                 /app/scripts/backfill_mc_regulation_llm.py [--dry-run]

A5 Admin audit tab — GET /api/compliance/agent/audit/<check_id>
  proxied via /api/sdk/v1/agent/audit/<id>. New page
  /sdk/agent/audit/[checkId] renders scorecard + filterable MC table
  (status / doc_type / regulation, expandable rows with matched_text
  + hint). ComplianceCheckTab now shows 'Voll-Audit oeffnen' link.

A6 Trend per tenant — GET /api/compliance/agent/audit/tenant/<id>
  returns recent runs. Email scorecard shows per-regulation delta
  badges ('(+12%)', '(-3%)') compared with the previous run for the
  same tenant + base_domain. Lookup is one SQLite query.

Plumbing:
  rag_document_checker.py — SELECT now includes 'article'; MC results
    carry 'regulation' + 'article' through to CheckItem.
  agent_doc_check_routes.CheckItem schema gains regulation + article
    fields (defaults '') so old clients still parse.
  agent_compliance_check_routes — response gains 'check_id' so the
    frontend can build the audit link.
2026-05-17 13:45:58 +02:00

182 lines
6.0 KiB
Python

"""
LLM-tagged regulation backfill v2.
The regex pass (backfill_mc_regulation.py) covers ~13% of MCs that
quote a norm inline ('Art. X DSGVO', '§Y TDDDG'). The remaining 1636
are abstract security/process controls that don't cite a norm by name
but DO map to one (e.g. 'Pseudonymisierung' -> DSGVO Art. 32).
This script asks Qwen (Ollama) in batches of 25 to classify each MC.
Output is constrained: one of DSGVO / TDDDG / BGB / HGB / AO / TMG /
BDSG / MStV / UWG / VSBG / NIS2 / ISO27001 / BSI-GS / NIST / OTHER /
NONE — plus an optional article hint.
Run inside the backend container:
docker exec bp-compliance-backend python3 \\
/app/scripts/backfill_mc_regulation_llm.py [--dry-run] [--limit N]
"""
from __future__ import annotations
import argparse
import asyncio
import json
import logging
import os
import re
import sys
import asyncpg
import httpx
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
MODEL = os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b")
BATCH = 25
_ALLOWED = {
"DSGVO", "TDDDG", "BGB", "HGB", "AO", "TMG", "BDSG", "MStV", "UWG",
"VSBG", "PAngV", "GwG", "NIS2", "ISO27001", "BSI-GS", "NIST", "TKG",
"EU-VO", "OTHER", "NONE",
}
_SYSTEM = (
"Du klassifizierst Compliance Master-Controls nach der wichtigsten "
"rechtlichen oder normativen Grundlage. Antworte AUSSCHLIESSLICH "
'mit JSON: {"items": [{"id":"<id>", "regulation":"<one of '
'DSGVO|TDDDG|BGB|HGB|AO|TMG|BDSG|MStV|UWG|VSBG|PAngV|GwG|NIS2|'
'ISO27001|BSI-GS|NIST|TKG|EU-VO|OTHER|NONE>", "article":"<optional '
'kurze Norm-Referenz oder leer>"}]}. '
"Eine MC -> EIN Eintrag. Wenn keine Norm passt: NONE. "
"Keine Erklaerung, kein Markdown."
)
def _build_user_prompt(batch: list[dict]) -> str:
lines = ["Klassifiziere diese Master-Controls:"]
for r in batch:
title = (r["title"] or "")[:120]
q = (r["check_question"] or "")[:200]
lines.append(f'- id={r["id_short"]} | {title}')
if q:
lines.append(f" -> {q}")
return "\n".join(lines)
async def _ask_llm(batch: list[dict]) -> dict[str, dict]:
payload = {
"model": MODEL,
"messages": [
{"role": "system", "content": _SYSTEM},
{"role": "user", "content": _build_user_prompt(batch)},
],
"stream": False, "format": "json",
"options": {"temperature": 0.0, "num_predict": 2500},
}
try:
async with httpx.AsyncClient(timeout=120.0) as client:
resp = await client.post(
f"{OLLAMA_URL.rstrip('/')}/api/chat", json=payload,
)
resp.raise_for_status()
content = (resp.json().get("message") or {}).get("content", "")
except Exception as e:
logger.warning("LLM call failed: %s", e)
return {}
if not content:
return {}
# Strip qwen thinking
content = re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()
try:
obj = json.loads(content)
except Exception:
return {}
items = obj.get("items") if isinstance(obj, dict) else None
if not isinstance(items, list):
return {}
out: dict[str, dict] = {}
for it in items:
if not isinstance(it, dict):
continue
sid = str(it.get("id", "")).strip()
reg = str(it.get("regulation", "")).strip().upper()
art = str(it.get("article", "")).strip()
if not sid or reg not in _ALLOWED:
continue
if reg == "NONE":
continue
out[sid] = {"regulation": reg, "article": art[:120]}
return out
async def main(dry_run: bool, limit: int) -> None:
db = os.getenv("DATABASE_URL")
if not db:
print("DATABASE_URL not set", file=sys.stderr)
sys.exit(1)
conn = await asyncpg.connect(db)
sql = ("SELECT id, title, check_question FROM compliance.doc_check_controls "
"WHERE regulation IS NULL ORDER BY id")
if limit > 0:
sql += f" LIMIT {limit}"
rows = await conn.fetch(sql)
print(f"{len(rows)} MCs without regulation — calling LLM in batches of {BATCH}")
by_short: dict[str, str] = {}
batches: list[list[dict]] = []
cur: list[dict] = []
for r in rows:
sid = str(r["id"])[:8]
by_short[sid] = str(r["id"])
cur.append({"id_short": sid, "title": r["title"],
"check_question": r["check_question"]})
if len(cur) >= BATCH:
batches.append(cur); cur = []
if cur:
batches.append(cur)
updates: list[tuple[str, str, str]] = [] # (regulation, article, uuid)
hits: dict[str, int] = {}
for i, batch in enumerate(batches, 1):
logger.info("batch %d/%d (%d items)", i, len(batches), len(batch))
res = await _ask_llm(batch)
for short, m in res.items():
uuid = by_short.get(short)
if not uuid:
continue
updates.append((m["regulation"], m["article"], uuid))
hits[m["regulation"]] = hits.get(m["regulation"], 0) + 1
print(f"\nLLM classified: {sum(hits.values())} / {len(rows)}")
for k, v in sorted(hits.items(), key=lambda x: -x[1]):
print(f" {k:10s} {v:>5}")
if dry_run:
print("\nDRY RUN — no UPDATE issued.")
await conn.close()
return
for i in range(0, len(updates), 200):
chunk = updates[i:i + 200]
await conn.executemany(
"UPDATE compliance.doc_check_controls "
"SET regulation = $1, article = $2 WHERE id = $3::uuid",
chunk,
)
print(f"\nApplied {len(updates)} updates.")
await conn.close()
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("--dry-run", action="store_true")
ap.add_argument("--limit", type=int, default=0,
help="process only N MCs (0 = all)")
args = ap.parse_args()
asyncio.run(main(args.dry_run, args.limit))