# breakpilot-compliance/backend-compliance/compliance/services/rag_provenance.py

"""
P70 — RAG provenance marker.

When a finding is backed by the RAG corpus (e.g. an article match on a
concrete statutory provision from the ingested DSGVO/TDDDG/TMG corpus),
it gets a ✓ marker. When it carries a norm reference but no source is
attached, it is marked "NORM". When it only comes from our own heuristic
(pattern match without RAG backing), it gets a ⚠ "Heuristik" marker.

This lets the user see at a glance which statements are legally backed
and which are our own interpretation.

Generic: a plain function that classifies a finding dict.
"""

from __future__ import annotations

import logging
import re

logger = logging.getLogger(__name__)


# Pattern for "backed by the corpus": the finding text carries an explicit
# norm reference (article/paragraph number plus the law it belongs to).

# Optional sub-references between the article/paragraph number and the law
# abbreviation, in any combination: "Abs. 1", "S. 1", "Satz 2", "Nr. 2",
# "lit. a".
_NORM_QUALIFIERS = r"(?:\s*(?:Abs|Satz|S|Nr|lit)\.?\s*(?:\d+|[a-z]))*"

# Capture groups are kept as before: 1 = whole reference, 2 = "EU"/"VO",
# 3 = law abbreviation of a "§" reference.
_NORM_RE = re.compile(
    "("
    # "Art. 6 Abs. 1 lit. a DSGVO", "Artikel 5 GDPR". The leading \b keeps
    # words that merely contain "art" (e.g. "Start 5 TMG") from matching.
    + r"\bArt(?:\.|ikel|icle)?\s*\d+" + _NORM_QUALIFIERS
    + r"\s*(?:DSGVO|GDPR|TDDDG|TMG|BDSG|UWG|TKG|EuGH|EDPB)"
    + "|"
    # "(EU) 2016/679", "EU 2016/679", "(VO 2016/679)". Requiring "(" or a
    # word boundary stops "neu 2020/21" from being read as "EU 2020/21".
    + r"(?:\(|\b)(EU|VO)\)?\s*\d{4}/\d+\)?"
    + "|"
    # "§ 25 TDDDG", "§ 25 Abs. 1 TDDDG", "§ 7 Abs. 2 Nr. 2 UWG".
    + r"§\s*\d+[a-z]?" + _NORM_QUALIFIERS
    + r"\s*(TMG|UWG|BDSG|TKG|TDDDG)"
    + ")",
    re.I,
)


def _field_text(value: object) -> str:
    """Return a finding field as stripped text, tolerating non-string values."""
    if not value:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (list, tuple)):
        # e.g. legal_basis given as a list of references
        return " ".join(_field_text(item) for item in value).strip()
    return str(value).strip()


def classify_finding_provenance(finding: dict) -> str:
    """Classify how well a finding is backed: 'rag', 'heuristic' or 'mixed'.

    rag       -- norm reference plus a source (binding)
    heuristic -- no norm reference found (our own interpretation)
    mixed     -- norm reference but no source (partially verifiable)

    Args:
        finding: Finding dict. The keys read are ``legal_basis`` and
            ``detail`` (searched for a norm reference and for an
            ``https://`` link) and ``rag_chunk_id`` / ``rag_source_url``
            (any truthy value counts as a source). Missing keys are fine;
            non-string ``legal_basis`` / ``detail`` values are converted
            to text instead of raising.

    Returns:
        One of ``"rag"``, ``"mixed"``, ``"heuristic"``. Anything that is
        not a dict is classified as ``"heuristic"``.
    """
    if not isinstance(finding, dict):
        return "heuristic"

    legal = _field_text(finding.get("legal_basis"))
    detail = _field_text(finding.get("detail"))

    has_norm = _NORM_RE.search(f"{legal} {detail}") is not None
    if not has_norm:
        return "heuristic"

    has_source = bool(
        finding.get("rag_chunk_id")
        or finding.get("rag_source_url")
        or "https://" in legal
        or "https://" in detail
    )
    return "rag" if has_source else "mixed"


def provenance_badge_html(provenance: str) -> str:
    """Return the inline-styled ``<span>`` badge for a provenance class.

    ``"rag"`` gives the green "✓ RAG" badge and ``"mixed"`` the blue
    "NORM" badge. Every other value gives the grey "⚠ HEURISTIK" badge.
    """
    # Only colours, tooltip and label differ between badges; the rest of
    # the markup is shared.
    if provenance == "rag":
        background, colour = "#dcfce7", "#166534"
        tooltip = "Aussage durch RAG-Korpus belegt (Gesetzestext + Quelle)"
        label = "✓ RAG"
    elif provenance == "mixed":
        background, colour = "#dbeafe", "#1e40af"
        tooltip = "Norm-Bezug ohne direkte Quellen-URL"
        label = "NORM"
    else:
        background, colour = "#f1f5f9", "#475569"
        tooltip = "Heuristik / Eigeninterpretation ohne Korpus-Beleg"
        label = "⚠ HEURISTIK"

    return (
        f'<span style="background:{background};color:{colour};'
        'padding:1px 5px;border-radius:8px;font-size:9px;'
        'font-weight:600;margin-left:4px" '
        f'title="{tooltip}">{label}</span>'
    )


def annotate_findings(findings: list[dict]) -> list[dict]:
    """Set ``finding['provenance']`` in place on every entry that lacks it.

    Entries that are not dicts, or that already have a ``provenance`` key,
    are left untouched. The same ``findings`` object is returned; a falsy
    argument (``None``, empty list) is handed back as-is.
    """
    if not findings:
        return findings

    # Lazy filter: each entry is checked right before it is annotated.
    unlabelled = (
        entry
        for entry in findings
        if isinstance(entry, dict) and "provenance" not in entry
    )
    for entry in unlabelled:
        entry["provenance"] = classify_finding_provenance(entry)
    return findings