"""
P68 — Reverse-Audit: eigene Templates gegen alle MCs pruefen.

Statt 'gegeben einen Kunden-Text → welche MCs fail' machen wir den
umgekehrten Test: 'gegeben unseren BreakPilot-Standard-Template-Pool
(95 Templates) → welche MCs werden NICHT abgedeckt? Wo sind Luecken?'

Liefert einen Coverage-Report:
   - Total MCs in DB: ~1800
   - MCs abgedeckt durch min. 1 unserer Templates: X
   - MCs ohne Coverage: Y (Liste)
   - Templates ohne MC-Wirkung: Z (Liste)

Zweck: Audit unserer eigenen Code-Base. Wenn ein Customer einen Lauf
macht und 50 Findings produziert sind, sollten 90%+ davon durch unsere
Template-Bibliothek korrigierbar sein. Wenn nicht → Templates fehlen.
"""

from __future__ import annotations

import logging
import re

from sqlalchemy import text as sa_text
from sqlalchemy.orm import Session

logger = logging.getLogger(__name__)


def _load_template_rows(db: Session) -> list:
    """Fetch (id, doc_type, title, body) rows from the template tables.

    Tries ``compliance.doc_templates`` first and falls back to
    ``compliance.legal_templates``. Each attempt runs inside a SAVEPOINT so
    that a failing query does not leave the surrounding transaction in an
    aborted state (on PostgreSQL every later statement would otherwise fail
    too, including the fallback query itself).

    Returns an empty list when neither table can be queried.
    """
    try:
        with db.begin_nested():
            return db.execute(sa_text(
                """
                SELECT id::text, doc_type, title, body
                FROM compliance.doc_templates
                WHERE active = TRUE
                """
            )).fetchall()
    except Exception as e:
        logger.info("doc_templates query failed, trying fallback: %s", e)

    # Fallback to a possible alternative template table.
    try:
        with db.begin_nested():
            return db.execute(sa_text(
                """
                SELECT id::text, doc_type, name AS title, content AS body
                FROM compliance.legal_templates
                """
            )).fetchall()
    except Exception as e:
        logger.warning("template table not found: %s", e)
        return []


def run_reverse_audit(db: Session) -> dict:
    """Build the coverage report of templates versus check controls (MCs).

    An MC counts as covered when at least one template of the same doc_type
    contains at least half (minimum one) of the MC's criteria as
    case-insensitive substrings. Criteria come from ``pass_criteria``; when
    those yield nothing, keywords taken from the MC title are used instead.

    Args:
        db: Session used to read the controls and the templates.

    Returns:
        dict with the keys ``total_mcs``, ``total_templates``,
        ``covered_mcs``, ``uncovered_mcs``, ``coverage_pct``,
        ``unused_templates`` (sorted ids of templates that satisfy no MC),
        ``top_uncovered_high`` (at most 30 uncovered entries whose severity
        equals "HIGH") and ``by_doctype``.
    """
    # 1) Load all MCs from doc_check_controls.
    mc_rows = db.execute(sa_text(
        """
        SELECT id::text, control_id, doc_type, title, check_question,
               pass_criteria, severity
        FROM compliance.doc_check_controls
        ORDER BY doc_type, severity DESC
        """
    )).fetchall()

    # 2) Load the templates (primary table, then fallback table).
    tpl_rows = _load_template_rows(db)

    # 3) Group templates by doc_type. The body is capped at 50000 characters
    #    and lowercased once here instead of once per MC comparison.
    templates_by_doctype: dict[str, list[dict]] = {}
    for tid, dt, title, body in tpl_rows:
        templates_by_doctype.setdefault(dt or "other", []).append({
            "id": tid, "title": title,
            "body": (body or "")[:50000].lower(),
        })

    # 4) Single pass over the MCs: determine coverage and, at the same time,
    #    collect EVERY template that satisfies an MC. Stopping at the first
    #    match would report later matching templates as unused.
    covered_mc_ids: set[str] = set()
    used_template_ids: set[str] = set()
    uncovered: list[dict] = []
    for mc_id, ctrl_id, dt, title, _q, pc, sev in mc_rows:
        tpls = templates_by_doctype.get(dt or "other") or []
        if not tpls:
            uncovered.append({
                "mc_id": ctrl_id, "doc_type": dt, "title": title,
                "severity": sev, "reason": "no_template_for_doctype",
            })
            continue

        # Heuristic: pass_criteria are patterns; without usable patterns
        # fall back to keywords from the title.
        criteria = _extract_patterns_from_pc(pc) or _title_keywords(title or "")
        needles = [p.lower() for p in criteria if p]
        threshold = max(1, len(criteria) // 2)

        matching_ids = [
            tpl["id"] for tpl in tpls
            if sum(1 for n in needles if n in tpl["body"]) >= threshold
        ]
        if matching_ids:
            covered_mc_ids.add(mc_id)
            used_template_ids.update(matching_ids)
        else:
            uncovered.append({
                "mc_id": ctrl_id, "doc_type": dt, "title": title,
                "severity": sev, "reason": "no_template_match",
                "criteria_sample": criteria[:5],
            })

    # 5) Templates that satisfy no MC at all.
    all_template_ids = {t["id"] for tpls in templates_by_doctype.values()
                        for t in tpls}
    unused_templates = all_template_ids - used_template_ids

    return {
        "total_mcs":           len(mc_rows),
        "total_templates":     len(all_template_ids),
        "covered_mcs":         len(covered_mc_ids),
        "uncovered_mcs":       len(uncovered),
        "coverage_pct":        round(len(covered_mc_ids) / max(1, len(mc_rows)) * 100, 1),
        "unused_templates":    sorted(unused_templates),
        "top_uncovered_high":  [u for u in uncovered if u.get("severity") == "HIGH"][:30],
        "by_doctype":          _summarize_by_doctype(mc_rows, covered_mc_ids),
    }


def _extract_patterns_from_pc(pc) -> list[str]:
    """pc ist jsonb mit z.B. {required_phrases: [...]}, {keywords: [...]}"""
    if not pc:
        return []
    if isinstance(pc, str):
        try:
            import json as _j
            pc = _j.loads(pc)
        except Exception:
            return [pc[:50]]
    if isinstance(pc, dict):
        out: list[str] = []
        for k in ("required_phrases", "keywords", "must_contain",
                  "patterns", "phrases"):
            v = pc.get(k)
            if isinstance(v, list):
                out.extend([str(x)[:80] for x in v if x])
        return out
    if isinstance(pc, list):
        return [str(x)[:80] for x in pc if x]
    return []


def _title_keywords(title: str) -> list[str]:
    """Fallback wenn pass_criteria leer: extrahiere Substantive aus Title."""
    if not title:
        return []
    # primitive: alle Worte > 4 Buchstaben
    return [w for w in re.findall(r"\b\w{5,}\b", title)][:5]


def _summarize_by_doctype(mc_rows, covered_mc_ids: set[str]) -> dict:
    out: dict[str, dict] = {}
    for mc_id, ctrl_id, dt, title, q, pc, sev in mc_rows:
        dt = dt or "other"
        d = out.setdefault(dt, {"total": 0, "covered": 0})
        d["total"] += 1
        if mc_id in covered_mc_ids:
            d["covered"] += 1
    for dt, d in out.items():
        d["pct"] = round(d["covered"] / max(1, d["total"]) * 100, 1)
    return out