""" P68 — Reverse-Audit: eigene Templates gegen alle MCs pruefen. Statt 'gegeben einen Kunden-Text → welche MCs fail' machen wir den umgekehrten Test: 'gegeben unseren BreakPilot-Standard-Template-Pool (95 Templates) → welche MCs werden NICHT abgedeckt? Wo sind Luecken?' Liefert einen Coverage-Report: - Total MCs in DB: ~1800 - MCs abgedeckt durch min. 1 unserer Templates: X - MCs ohne Coverage: Y (Liste) - Templates ohne MC-Wirkung: Z (Liste) Zweck: Audit unserer eigenen Code-Base. Wenn ein Customer einen Lauf macht und 50 Findings produziert sind, sollten 90%+ davon durch unsere Template-Bibliothek korrigierbar sein. Wenn nicht → Templates fehlen. """ from __future__ import annotations import logging import re from sqlalchemy import text as sa_text from sqlalchemy.orm import Session logger = logging.getLogger(__name__) def run_reverse_audit(db: Session) -> dict: """Hauptfunktion. Returns coverage-report dict.""" # 1) Alle MCs aus doc_check_controls laden mc_rows = db.execute(sa_text( """ SELECT id::text, control_id, doc_type, title, check_question, pass_criteria, severity FROM compliance.doc_check_controls ORDER BY doc_type, severity DESC """ )).fetchall() # 2) Templates aus DB (doc_templates oder legal_templates oder analog) try: tpl_rows = db.execute(sa_text( """ SELECT id::text, doc_type, title, body FROM compliance.doc_templates WHERE active = TRUE """ )).fetchall() except Exception: # Fallback auf evtl. andere Template-Tabelle try: tpl_rows = db.execute(sa_text( """ SELECT id::text, doc_type, name AS title, content AS body FROM compliance.legal_templates """ )).fetchall() except Exception as e: logger.warning("template table not found: %s", e) tpl_rows = [] # 3) Coverage-Matrix: pro MC, ob ein Template sie abdeckt templates_by_doctype: dict[str, list[dict]] = {} for tid, dt, title, body in tpl_rows: templates_by_doctype.setdefault(dt or "other", []).append({ "id": tid, "title": title, "body": (body or "")[:50000], }) covered_mc_ids: set[str] = set() uncovered: list[dict] = [] for mc_id, ctrl_id, dt, title, q, pc, sev in mc_rows: tpls = templates_by_doctype.get(dt or "other") or [] if not tpls: uncovered.append({ "mc_id": ctrl_id, "doc_type": dt, "title": title, "severity": sev, "reason": "no_template_for_doctype", }) continue # Heuristik: pass_criteria sind Pattern. Wenn IRGENDEIN Template # die Pattern enthaelt → covered. criteria = _extract_patterns_from_pc(pc) if not criteria: # ohne klare Pattern: per Title-Keywords pruefen criteria = _title_keywords(title or "") ok = False for tpl in tpls: body = tpl["body"].lower() hits = sum(1 for p in criteria if p and p.lower() in body) if hits >= max(1, len(criteria) // 2): ok = True break if ok: covered_mc_ids.add(mc_id) else: uncovered.append({ "mc_id": ctrl_id, "doc_type": dt, "title": title, "severity": sev, "reason": "no_template_match", "criteria_sample": criteria[:5], }) # 4) Templates ohne MC-Wirkung used_template_ids: set[str] = set() for mc_id, ctrl_id, dt, title, q, pc, sev in mc_rows: if mc_id not in covered_mc_ids: continue tpls = templates_by_doctype.get(dt or "other") or [] criteria = _extract_patterns_from_pc(pc) or _title_keywords(title or "") for tpl in tpls: body = tpl["body"].lower() hits = sum(1 for p in criteria if p and p.lower() in body) if hits >= max(1, len(criteria) // 2): used_template_ids.add(tpl["id"]) break all_template_ids = {t["id"] for tpls in templates_by_doctype.values() for t in tpls} unused_templates = all_template_ids - used_template_ids return { "total_mcs": len(mc_rows), "total_templates": len(all_template_ids), "covered_mcs": len(covered_mc_ids), "uncovered_mcs": len(uncovered), "coverage_pct": round(len(covered_mc_ids) / max(1, len(mc_rows)) * 100, 1), "unused_templates": sorted(unused_templates), "top_uncovered_high": [u for u in uncovered if u.get("severity") == "HIGH"][:30], "by_doctype": _summarize_by_doctype(mc_rows, covered_mc_ids), } def _extract_patterns_from_pc(pc) -> list[str]: """pc ist jsonb mit z.B. {required_phrases: [...]}, {keywords: [...]}""" if not pc: return [] if isinstance(pc, str): try: import json as _j pc = _j.loads(pc) except Exception: return [pc[:50]] if isinstance(pc, dict): out: list[str] = [] for k in ("required_phrases", "keywords", "must_contain", "patterns", "phrases"): v = pc.get(k) if isinstance(v, list): out.extend([str(x)[:80] for x in v if x]) return out if isinstance(pc, list): return [str(x)[:80] for x in pc if x] return [] def _title_keywords(title: str) -> list[str]: """Fallback wenn pass_criteria leer: extrahiere Substantive aus Title.""" if not title: return [] # primitive: alle Worte > 4 Buchstaben return [w for w in re.findall(r"\b\w{5,}\b", title)][:5] def _summarize_by_doctype(mc_rows, covered_mc_ids: set[str]) -> dict: out: dict[str, dict] = {} for mc_id, ctrl_id, dt, title, q, pc, sev in mc_rows: dt = dt or "other" d = out.setdefault(dt, {"total": 0, "covered": 0}) d["total"] += 1 if mc_id in covered_mc_ids: d["covered"] += 1 for dt, d in out.items(): d["pct"] = round(d["covered"] / max(1, d["total"]) * 100, 1) return out