"""P70 — RAG provenance marker.

If a finding is backed by the RAG corpus (e.g. an article match against a
concrete statutory provision from the ingested DSGVO/TDDDG/TMG corpus), it
gets a ✓ marker. If it only comes from our own heuristic (pattern match
without RAG backing), it gets a ⚠ "Heuristik" marker.

This lets the user see at a glance which statements are legally grounded and
which ones are our own interpretation.

Generic: plain functions that classify a finding dict.
"""

from __future__ import annotations

import logging
import re

logger = logging.getLogger(__name__)

# Pattern for "backed by the corpus": the finding contains an explicit norm
# reference (article/section number plus the law it belongs to).
#
# Capture groups are kept stable: 1 = whole citation, 2 = "EU"/"VO" of a
# regulation number, 3 = law abbreviation of a "§" citation.
_NORM_RE = re.compile(
    # "Art. 6 Abs. 1 S. 1 lit. f DSGVO" — every qualifier is optional.
    r"(Art\.?\s*\d+"
    r"(?:\s*Abs\.?\s*\d+)?"
    r"(?:\s*(?:S\.|Satz)\s*\d+)?"
    r"(?:\s*lit\.?\s*[a-z])?\s*"
    r"(?:DSGVO|GDPR|TDDDG|TMG|BDSG|UWG|TKG|EuGH|EDPB)|"
    # "EU 2016/679", "(EU 2016/679)" and the canonical "(EU) 2016/679".
    r"\(?(EU|VO)\)?\s*\d{4}/\d+\)?|"
    # "§ 25 TDDDG" as well as "§ 25 Abs. 1 TDDDG" / "§ 7 Abs. 2 Nr. 3 UWG".
    r"§\s*\d+[a-z]?"
    r"(?:\s*Abs\.?\s*\d+)?"
    r"(?:\s*(?:S\.|Satz)\s*\d+)?"
    r"(?:\s*Nr\.?\s*\d+)?\s*"
    r"(TMG|UWG|BDSG|TKG|TDDDG))",
    re.I,
)

# An inline link counts as a source reference regardless of scheme or case.
_URL_RE = re.compile(r"https?://", re.I)


def _as_text(value: object) -> str:
    """Coerce a finding field to stripped text without raising."""
    if not value:
        return ""
    if isinstance(value, str):
        return value.strip()
    if isinstance(value, (list, tuple)):
        # e.g. several legal bases given as a list: search all of them.
        return " ".join(part for part in map(_as_text, value) if part)
    return str(value).strip()


def classify_finding_provenance(finding: dict) -> str:
    """Return ``'rag'``, ``'heuristic'`` or ``'mixed'`` for one finding.

    rag       -- norm reference plus a source reference (binding)
    heuristic -- pattern match without a norm reference (own interpretation)
    mixed     -- norm reference but no source reference (partly verifiable)

    The norm reference is searched in the ``legal_basis`` and ``detail``
    fields. A source reference is a truthy ``rag_chunk_id`` or
    ``rag_source_url``, or an ``http(s)://`` link inside either text field.

    Args:
        finding: Finding dict. Anything that is not a dict is classified as
            ``'heuristic'``. Non-string ``legal_basis``/``detail`` values
            (for example a list of bases) are converted to text instead of
            raising.

    Returns:
        One of ``'rag'``, ``'mixed'``, ``'heuristic'``.
    """
    if not isinstance(finding, dict):
        return "heuristic"

    legal = _as_text(finding.get("legal_basis"))
    detail = _as_text(finding.get("detail"))
    blob = f"{legal} {detail}"

    if not _NORM_RE.search(blob):
        return "heuristic"

    has_source = bool(
        finding.get("rag_chunk_id")
        or finding.get("rag_source_url")
        or _URL_RE.search(blob)
    )
    return "rag" if has_source else "mixed"


def provenance_badge_html(provenance: str) -> str:
    """Return the badge text for a provenance value.

    ``'rag'`` and ``'mixed'`` have dedicated labels; every other value falls
    back to the heuristic label.
    """
    # NOTE(review): the leading literals are empty in this file, so the
    # result carries no markup here — confirm whether tags were lost upstream.
    if provenance == "rag":
        return (
            ''
            '✓ RAG'
        )
    if provenance == "mixed":
        return (
            ''
            'NORM'
        )
    return (
        ''
        '⚠ HEURISTIK'
    )


def annotate_findings(findings: list[dict]) -> list[dict]:
    """Set ``finding['provenance']`` on every entry, in place.

    Entries that are not dicts are skipped, and an existing ``'provenance'``
    key is never overwritten. The argument itself is returned (including a
    falsy value such as ``None`` or ``[]``) so calls can be chained.
    """
    for entry in findings or []:
        if isinstance(entry, dict) and "provenance" not in entry:
            entry["provenance"] = classify_finding_provenance(entry)
    return findings