# breakpilot-compliance/backend-compliance/compliance/services/unified_findings_store.py

"""
Unified-Findings sidecar store.

A compliance check produces findings from 4 sources today:
  - Master-Controls (mc_results table — already persisted)
  - Pflichtangaben (L1/L2 doc checks, e.g. Impressum-Vollstaendigkeit)
  - Vendor scans (per cmp_vendor: missing privacy url, no opt-out, ...)
  - Redundancies (multi-vendor in same category)

Previously the DSB had to look in 4 different blocks of the email to
find everything. This store flattens all of them into ONE searchable
table so the /audit/<check_id> frontend can show a unified list with
source / severity / status / doc_type filters.

Sidecar SQLite (same DB as compliance_audit_log) — no Postgres
migration needed.
"""

from __future__ import annotations

import json
import logging
import os
import sqlite3
from pathlib import Path

logger = logging.getLogger(__name__)

DB_PATH = os.getenv("COMPLIANCE_AUDIT_DB", "/data/compliance_audits.db")


def _ensure_table() -> None:
    """Create the ``unified_findings`` table and its indexes if missing.

    Also creates the parent directory of ``DB_PATH`` so the database file
    can be opened on a fresh deployment. Safe to call repeatedly: every
    statement in the script uses ``IF NOT EXISTS``.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        # ``with conn`` only commits / rolls back the transaction; it does
        # NOT close the handle, hence the explicit close() in ``finally``.
        with conn:
            conn.executescript("""
                CREATE TABLE IF NOT EXISTS unified_findings (
                    id              INTEGER PRIMARY KEY AUTOINCREMENT,
                    check_id        TEXT NOT NULL,
                    source_type     TEXT NOT NULL,     -- mc|pflichtangabe|vendor|redundanz
                    doc_type        TEXT,              -- impressum|dse|cookie|... or '-' for vendor/redundanz
                    severity        TEXT,              -- CRITICAL|HIGH|MEDIUM|LOW|INFO
                    status          TEXT,              -- failed|passed|skipped|na|info
                    regulation      TEXT,
                    label           TEXT,
                    hint            TEXT,
                    action_recipe   TEXT,              -- JSON {what,why,fix_text,where,example}
                    anchor_excerpt  TEXT,
                    anchor_conf     REAL,
                    vendor_name     TEXT,
                    category        TEXT,
                    payload         TEXT               -- JSON extras (matched_text, cookies count, ...)
                );
                CREATE INDEX IF NOT EXISTS idx_uf_check    ON unified_findings(check_id);
                CREATE INDEX IF NOT EXISTS idx_uf_source   ON unified_findings(check_id, source_type);
                CREATE INDEX IF NOT EXISTS idx_uf_status   ON unified_findings(check_id, status);
                CREATE INDEX IF NOT EXISTS idx_uf_severity ON unified_findings(check_id, severity);
            """)
    finally:
        conn.close()


def _uf_text(value: object, limit: int, default: str = "") -> str:
    """Return *value* (or *default* when falsy) as text cut to *limit* chars."""
    return str(value or default)[:limit]


def _uf_confidence(value: object) -> float:
    """Coerce an anchor confidence to float; missing/invalid becomes 0.0."""
    try:
        return float(value or 0.0)
    except (TypeError, ValueError):
        return 0.0


def record_findings(check_id: str, findings: list[dict]) -> int:
    """Replace all stored findings of ``check_id`` with ``findings``.

    Existing rows for the check are deleted and the new ones inserted in a
    single transaction, so calling this twice with the same input leaves
    the same rows behind (idempotent on ``check_id``).

    Args:
        check_id: Identifier of the compliance check. Falsy -> no-op.
        findings: One dict per finding. Missing keys fall back to defaults
            (``source_type='mc'``, ``severity='MEDIUM'``,
            ``status='failed'``, everything else empty); text fields are
            truncated to fixed maximum lengths.

    Returns:
        Number of rows written; 0 for an empty input or on any error
        (errors are logged, never raised — the store is best-effort).
    """
    if not check_id:
        return 0
    try:
        _ensure_table()
        rows = [
            (
                check_id,
                _uf_text(f.get("source_type"), 24, "mc"),
                _uf_text(f.get("doc_type"), 32),
                str(f.get("severity") or "MEDIUM").upper()[:16],
                _uf_text(f.get("status"), 16, "failed"),
                _uf_text(f.get("regulation"), 64),
                _uf_text(f.get("label"), 400),
                _uf_text(f.get("hint"), 1200),
                # default=str is deliberate: one exotic value (datetime,
                # set, ...) must not discard the whole batch.
                json.dumps(f.get("action_recipe") or {},
                           ensure_ascii=False, default=str),
                _uf_text(f.get("anchor_excerpt"), 800),
                _uf_confidence(f.get("anchor_conf")),
                _uf_text(f.get("vendor_name"), 160),
                _uf_text(f.get("category"), 64),
                json.dumps(f.get("payload") or {},
                           ensure_ascii=False, default=str),
            )
            for f in findings or []
        ]
        conn = sqlite3.connect(DB_PATH)
        try:
            # ``with conn`` commits on success and rolls back on error, so
            # DELETE + INSERT are atomic; it does not close the handle.
            with conn:
                conn.execute(
                    "DELETE FROM unified_findings WHERE check_id=?",
                    (check_id,),
                )
                if rows:
                    conn.executemany(
                        "INSERT INTO unified_findings "
                        "(check_id, source_type, doc_type, severity, status, regulation, "
                        " label, hint, action_recipe, anchor_excerpt, anchor_conf, "
                        " vendor_name, category, payload) "
                        "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
                        rows,
                    )
        finally:
            conn.close()
        if not rows:
            return 0
        # Log the normalised source types actually stored. Reading them
        # from the raw dicts could mix None with str and make sorted()
        # raise after the data was already committed.
        logger.info(
            "unified_findings: %s rows=%d sources=%s",
            check_id, len(rows),
            sorted({row[1] for row in rows}),
        )
        return len(rows)
    except Exception as e:
        logger.warning("record_findings failed for %s: %s", check_id, e)
        return 0


def _uf_load_json(raw: object) -> object:
    """Decode a stored JSON cell; empty or corrupt content becomes ``{}``."""
    try:
        return json.loads(raw or "{}")
    except (TypeError, ValueError):
        return {}


def list_findings(
    check_id: str,
    source_type: str | None = None,
    severity: str | None = None,
    doc_type: str | None = None,
    status: str | None = None,
    q: str | None = None,
    limit: int = 1000,
) -> list[dict]:
    """Return the findings of ``check_id`` matching the given filters.

    Args:
        check_id: Check whose findings are listed.
        source_type, severity, doc_type, status: Exact-match filters.
            ``None``, an empty string or ``"all"`` disables the filter;
            ``severity`` is upper-cased before comparison.
        q: Free-text needle, matched as a literal, case-insensitive
            substring of ``label`` OR ``vendor_name``.
        limit: Maximum number of rows (passed to SQL ``LIMIT``).

    Returns:
        Row dicts ordered by severity (CRITICAL, HIGH, MEDIUM, LOW, rest),
        then ``source_type`` and ``label``; ``action_recipe`` and
        ``payload`` are JSON-decoded. Returns ``[]`` on any error (logged).
    """
    try:
        _ensure_table()
        where = ["check_id = ?"]
        params: list = [check_id]
        exact_filters = {
            "source_type": source_type,
            "severity": severity,
            "doc_type": doc_type,
            "status": status,
        }
        for column, value in exact_filters.items():
            if not value or value == "all":
                continue
            where.append(f"{column} = ?")
            params.append(value.upper() if column == "severity" else value)
        if q:
            # SQLite's LOWER()/LIKE only fold ASCII, so "über" would never
            # match "Über ...". Fold in Python instead and use instr() so
            # that '%' and '_' in the needle are literals, not wildcards.
            where.append(
                "(instr(uf_fold(label), ?) > 0 "
                "OR instr(uf_fold(vendor_name), ?) > 0)"
            )
            needle = q.casefold()
            params.extend([needle, needle])
        sql = ("SELECT * FROM unified_findings WHERE " + " AND ".join(where) +
               " ORDER BY CASE severity "
               "  WHEN 'CRITICAL' THEN 0 WHEN 'HIGH' THEN 1 "
               "  WHEN 'MEDIUM' THEN 2 WHEN 'LOW' THEN 3 "
               "  ELSE 4 END, source_type, label LIMIT ?")
        params.append(int(limit))
        conn = sqlite3.connect(DB_PATH)
        try:
            conn.row_factory = sqlite3.Row
            # NULL cells are passed through unchanged; instr(NULL, x) is
            # NULL and therefore never matches.
            conn.create_function(
                "uf_fold", 1,
                lambda v: v.casefold() if isinstance(v, str) else v,
            )
            records = conn.execute(sql, params).fetchall()
        finally:
            # The sqlite3 context manager would not close the handle.
            conn.close()
        out = []
        for record in records:
            item = dict(record)
            # A single corrupt JSON cell must not hide every other finding.
            item["action_recipe"] = _uf_load_json(item.get("action_recipe"))
            item["payload"] = _uf_load_json(item.get("payload"))
            out.append(item)
        return out
    except Exception as e:
        logger.warning("list_findings failed: %s", e)
        return []


def findings_summary(check_id: str) -> dict:
    """Return aggregate counts of a check's findings for the filter UI.

    Returns:
        ``{"total": int, "by_source": {...}, "by_severity": {...},
        "by_status": {...}, "by_doc_type": {...}}`` where each ``by_*``
        dict maps a column value to its row count. NULL and empty values
        are reported under the key ``"-"``. On error the partially filled
        result is returned and the error is logged.
    """
    out = {
        "total": 0,
        "by_source": {},
        "by_severity": {},
        "by_status": {},
        "by_doc_type": {},
    }
    # Column names come from this fixed mapping only, so interpolating them
    # into the SQL text below is safe.
    buckets = {
        "source_type": "by_source",
        "severity": "by_severity",
        "status": "by_status",
        "doc_type": "by_doc_type",
    }
    try:
        _ensure_table()
        conn = sqlite3.connect(DB_PATH)
        try:
            for col, bucket in buckets.items():
                groups = conn.execute(
                    f"SELECT {col} AS k, COUNT(*) AS n FROM unified_findings "
                    f"WHERE check_id=? GROUP BY {col}",
                    (check_id,),
                ).fetchall()
                counts: dict = {}
                for key, n in groups:
                    # NULL, '' and a literal '-' all collapse onto "-":
                    # add the counts up instead of letting the last group
                    # overwrite the others.
                    name = key or "-"
                    counts[name] = counts.get(name, 0) + n
                out[bucket] = counts
                out["total"] = max(out["total"], sum(counts.values()))
        finally:
            # The sqlite3 context manager would not close the handle.
            conn.close()
        return out
    except Exception as e:
        logger.warning("findings_summary failed: %s", e)
        return out