breakpilot-compliance/backend-compliance/compliance/services/mc_scorecard.py

"""
Master-Control Scorecard — group + summarise MC results.

With max_controls=0 (#30 fix) every doc-check now evaluates 75-571 MCs
per document. Rendering all of them verbatim makes the email + frontend
unreadable. This module produces three structured artefacts:

1. `build_scorecard(check_results)` — per-regulation aggregate (PASS /
   FAIL / SKIP counts + severity histogram + compliance %)

2. `top_fails(check_results, n=10)` — top-N failed MCs ranked by
   severity, then label (near-duplicates collapsed to one entry)

3. `full_audit_records(check_results, check_id, tenant_id)` — flat
   list ready for SQLite persistence + JSON export

The functions are pure — no DB / network — so they're cheap to call
from inside the route and unit-testable.
"""

from __future__ import annotations

import logging
from collections import defaultdict
from datetime import datetime, timezone

logger = logging.getLogger(__name__)

# Severities listed from most to least urgent. A severity's rank is its
# position in this sequence (CRITICAL = 0 ... INFO = 4), so sorting by
# rank in ascending order puts the most urgent findings first.
_SEV_RANK = {
    name: rank
    for rank, name in enumerate(("CRITICAL", "HIGH", "MEDIUM", "LOW", "INFO"))
}


def build_scorecard(check_results: list[dict]) -> dict:
    """Aggregate per-regulation pass/fail/skip counts and severity buckets.

    Every record lands in exactly one outcome, tested in this order:

    1. ``skipped`` truthy            -> skipped
    2. ``passed`` truthy             -> passed
    3. ``audit_status == "check"``   -> internal check: counted as skipped
       (so it does not lower the score) and also tallied separately in
       ``internal_checks``
    4. anything else                 -> failed, bucketed by upper-cased
       ``severity`` (``MEDIUM`` when the record carries none)

    ``pct`` is ``passed`` as a rounded percentage of the non-skipped
    records, or 0 when there are no non-skipped records. Records with a
    missing or blank ``regulation`` are grouped under ``"—"``.

    Args:
      check_results: list of dicts, each typically a CheckItem-like
        record with keys: id, label, passed, severity, skipped,
        regulation, doc_type, audit_status. ``None`` is treated as an
        empty list.

    Returns:
      {
        "by_regulation": [
          {"regulation": "DSGVO", "total": 193, "passed": 167,
           "failed": 24, "skipped": 2, "internal_checks": 1, "pct": 87,
           "severity": {"HIGH": 22, "MEDIUM": 2}}
        ],
        "totals": {"total": 1874, "passed": 1300, "failed": 540,
                   "skipped": 34, "internal_checks": 12, "pct": 70},
      }
      ``by_regulation`` is ordered by failed count (descending), then by
      regulation name. ``internal_checks`` is a subset of ``skipped``.
    """
    count_keys = ("total", "passed", "failed", "skipped", "internal_checks")

    def _pct(passed: int, active: int) -> int:
        # Guard against division by zero when everything was skipped.
        return round(passed / active * 100) if active else 0

    buckets: dict[str, dict] = defaultdict(
        lambda: {**dict.fromkeys(count_keys, 0),
                 "severity": defaultdict(int)},
    )
    for r in check_results or []:
        reg = (r.get("regulation") or "—").strip() or "—"
        b = buckets[reg]
        b["total"] += 1
        if r.get("skipped"):
            b["skipped"] += 1
        elif r.get("passed"):
            b["passed"] += 1
        # P106 — an internal check is NOT a failure: it counts as skipped
        # for the score calculation so the score stays realistic.
        elif r.get("audit_status") == "check":
            b["skipped"] += 1
            b["internal_checks"] += 1
        else:
            b["failed"] += 1
            sev = (r.get("severity") or "MEDIUM").upper()
            b["severity"][sev] += 1

    rows = [
        {
            "regulation": reg,
            "total": b["total"],
            "passed": b["passed"],
            "failed": b["failed"],
            "skipped": b["skipped"],
            "internal_checks": b["internal_checks"],
            "pct": _pct(b["passed"], b["total"] - b["skipped"]),
            # Plain dict instead of defaultdict so the result serialises
            # cleanly.
            "severity": dict(b["severity"]),
        }
        for reg, b in buckets.items()
    ]
    rows.sort(key=lambda row: (-row["failed"], row["regulation"]))

    totals = {key: sum(row[key] for row in rows) for key in count_keys}
    totals["pct"] = _pct(totals["passed"],
                         totals["total"] - totals["skipped"])
    return {"by_regulation": rows, "totals": totals}


# Lower-case phrases that many master controls repeat almost verbatim
# (plain-language and consent-wording requirements). The order matters:
# the first phrase found in a label decides its dedup key.
_DEDUP_KEYWORDS = [
    "einfache sprache",
    "verstaendliche sprache",
    "verständliche sprache",
    "klare sprache",
    "einwilligungstexte",
    "einwilligungsaufforderung",
    "einwilligungserklaerung",
    "einwilligungserklärung",
    "mehrdeutige",
    "verstaendliche form",
    "verständliche form",
    "fachbegriffe erklaeren",
    "fachbegriffe erklären",
]


def _dedup_key(label: str) -> str:
    """Map a label to a stable key used to collapse near-duplicates.

    The label is lower-cased and searched for the phrases in
    ``_DEDUP_KEYWORDS``. If one occurs, the key is ``"_dup:<phrase>"`` for
    the first matching phrase in list order, so all labels about the same
    concept share one key. If none occurs, the label is returned unchanged.
    """
    lowered = (label or "").lower()
    first_hit = next((kw for kw in _DEDUP_KEYWORDS if kw in lowered), None)
    if first_hit is None:
        return label
    return f"_dup:{first_hit}"


_CONDITIONAL_MARKERS = ("falls ", "sofern ", "wenn ", "soweit ",
                        "bei bedarf", "ggf.", "gegebenenfalls")


def _has_conditional_marker(text: str) -> bool:
    """Return True if lower-cased *text* has a conditional marker at a word start."""
    for marker in _CONDITIONAL_MARKERS:
        pos = text.find(marker)
        while pos != -1:
            # Only accept the marker at the beginning of a word. A plain
            # substring test would read "ebenfalls " or "jedenfalls " as
            # the conditional "falls " and wrongly soften the finding.
            if pos == 0 or not text[pos - 1].isalnum():
                return True
            pos = text.find(marker, pos + 1)
    return False


def _is_hard_finding(r: dict) -> bool:
    """Decide whether a failed record is a hard finding.

    A hard finding is one where the text contains a positive hit that
    evidences the violation. Silence in the text alone is not enough for a
    conditional requirement: per this module's convention such records stay
    out of the top-fails list.

    Heuristic:
      - ``matched_text`` non-empty (after stripping) = textual evidence
        present -> hard
      - conditional label ("falls / sofern / wenn / ...") AND empty
        ``matched_text`` -> soft (pre-condition not evidenced)
      - otherwise -> hard (classic missing mandatory information)

    Args:
      r: a check record; only ``matched_text`` and ``label`` are read, and
        either may be missing or None.

    Returns:
      True for a hard finding, False for a soft one.
    """
    if (r.get("matched_text") or "").strip():
        return True
    label_low = (r.get("label") or "").lower()
    return not _has_conditional_marker(label_low)


def top_fails(check_results: list[dict], n: int = 10) -> list[dict]:
    """Return the top-N failing MCs sorted by severity, then label.

    Excluded from the result:
      - passed and skipped records
      - records with ``audit_status == "check"`` (internal checks are not
        failures; ``build_scorecard`` counts them as skipped as well)
      - INFO severity, since those are guidance, not findings
      - conditional MCs without negative evidence (P8), as decided by
        ``_is_hard_finding``

    Near-duplicates (several MCs that all complain about "einfache
    Sprache" / "Einwilligungsaufforderung" / ...) are collapsed to ONE
    representative entry, the highest-ranked one. Otherwise wording hints
    would dominate the list and bury the real gaps.

    Args:
      check_results: list of check records (dicts); ``None`` is treated
        as an empty list.
      n: maximum number of records to return. Values <= 0 yield an empty
        list.

    Returns:
      At most ``n`` of the original record dicts, most severe first.
      Records with a missing or unknown severity sort as MEDIUM or after
      INFO respectively.
    """
    # The loop below checks the limit only after appending, so without
    # this guard n <= 0 would still return one record.
    if n <= 0:
        return []

    fails = [
        r for r in (check_results or [])
        if not r.get("passed")
        and not r.get("skipped")
        and r.get("audit_status") != "check"
        and (r.get("severity") or "").upper() != "INFO"
        and _is_hard_finding(r)
    ]
    # `or ""` rather than a .get() default: a label that is present but
    # None would otherwise be compared against strings and raise TypeError.
    fails.sort(key=lambda r: (
        _SEV_RANK.get((r.get("severity") or "MEDIUM").upper(),
                      len(_SEV_RANK)),
        r.get("label") or "",
    ))

    seen_keys: set[str] = set()
    deduped: list[dict] = []
    for r in fails:
        k = _dedup_key(r.get("label") or "")
        if k in seen_keys:
            continue
        seen_keys.add(k)
        deduped.append(r)
        if len(deduped) >= n:
            break
    return deduped


def full_audit_records(
    check_results: list[dict],
    check_id: str,
    tenant_id: str = "",
    doc_type: str = "",
) -> list[dict]:
    """Turn check results into flat rows, one per MC, for persistence.

    Each row carries the caller-supplied ``check_id``, ``tenant_id`` and
    ``doc_type``, one shared UTC ISO-8601 timestamp ``ts`` taken once per
    call, and normalised fields copied from the record:

      - ``label`` is cut to 300 characters, ``matched_text`` and ``hint``
        to 500 each
      - ``passed`` / ``skipped`` are coerced to bool
      - ``severity`` is upper-cased ("" when absent)
      - ``level`` is coerced to int, defaulting to 1 when absent or falsy

    ``None`` for ``check_results`` yields an empty list.
    """
    stamp = datetime.now(timezone.utc).isoformat()

    def _text(item: dict, key: str, limit: int) -> str:
        # Missing and None values both become "", then get truncated.
        return (item.get(key) or "")[:limit]

    def _row(item: dict) -> dict:
        return {
            "check_id": check_id,
            "tenant_id": tenant_id,
            "doc_type": doc_type,
            "ts": stamp,
            "mc_id": item.get("id", ""),
            "label": _text(item, "label", 300),
            "passed": bool(item.get("passed")),
            "skipped": bool(item.get("skipped")),
            "severity": (item.get("severity") or "").upper(),
            "regulation": item.get("regulation") or "",
            "matched_text": _text(item, "matched_text", 500),
            "hint": _text(item, "hint", 500),
            "level": int(item.get("level") or 1),
        }

    return [_row(item) for item in (check_results or [])]