"""Reine Helfer der Obligation Discovery Pipeline (keine schweren Imports → unit-testbar). Die Pipeline leitet aus großen Compliance-Korpora eine regulatorische Ontologie ab: Controls → Mikro-Cluster → Meta-Cluster/Review-Units → LLM-Synthese → Obligation Registry. Architekturregel: RUNTIME bleibt deterministisch; DISCOVERY (dieses Tooling) darf LLM-gestützt sein und läuft EINMALIG/offline. Siehe docs-src/development/obligation_discovery_pipeline_v1.md. """ from __future__ import annotations import ast import json import math from typing import Optional SEMANTIC_EDGE_TYPES = ("depends_on", "supports", "produces_evidence_for", "implements", "derived_from") def parse_req(req) -> list: """requirements-Spalte (JSON ODER Python-Repr ODER String) robust zu Liste.""" if isinstance(req, list): return req if isinstance(req, str): for fn in (json.loads, ast.literal_eval): try: v = fn(req) return v if isinstance(v, list) else [str(v)] except Exception: pass return [req] return [] def cosine(a, b) -> float: if not a or not b: return 0.0 dot = sum(x * y for x, y in zip(a, b)) na = math.sqrt(sum(x * x for x in a)) nb = math.sqrt(sum(y * y for y in b)) return dot / (na * nb) if na and nb else 0.0 def greedy_cluster(vecs: list, thr: float) -> list[dict]: """Single-Pass-Greedy-Clustering: jeder Vektor joint den ersten Cluster, dessen Seed cosine ≥ thr ist, sonst neuer Cluster. Deterministisch (stabile Reihenfolge).""" clusters: list[dict] = [] for i, v in enumerate(vecs): if not v: clusters.append({"seed": None, "members": [i]}) continue best, best_sim = None, thr for c in clusters: if c["seed"] is None: continue s = cosine(v, c["seed"]) if s >= best_sim: best_sim, best = s, c if best: best["members"].append(i) else: clusters.append({"seed": v, "members": [i]}) return clusters def centroid(idxs: list[int], vecs: list) -> Optional[list]: vs = [vecs[i] for i in idxs if vecs[i]] if not vs: return None n = len(vs) return [sum(col) / n for col in zip(*vs)] def validate_registry(reg: dict) -> dict: """Belastbarkeits-Checks (User-Regeln): LEGAL_MINIMUM braucht legal_basis, member_controls vollständig, out_of_scope separat, >8-Obligations/Review-Unit-Warnung.""" obls = reg.get("obligations", []) lm = [o for o in obls if o.get("tier") == "LEGAL_MINIMUM"] lm_without_basis = [o["id"] for o in lm if not o.get("legal_basis")] empty_members = [o["id"] for o in obls if not o.get("member_controls")] per_unit: dict[str, int] = {} for o in obls: ru = (o.get("provenance") or {}).get("source_meta_cluster") if ru: per_unit[ru] = per_unit.get(ru, 0) + 1 over8 = {ru: n for ru, n in per_unit.items() if n > 8} rels = reg.get("relationships", []) return { "obligations": len(obls), "legal_minimum": len(lm), "lm_without_legal_basis": lm_without_basis, "empty_member_controls": empty_members, "over8_per_review_unit": over8, "out_of_scope": sum(1 for r in rels if r.get("type") == "out_of_scope"), "semantic_edges": sum(1 for r in rels if r.get("type") in SEMANTIC_EDGE_TYPES), "passed": not lm_without_basis and not empty_members and not over8, } def merge_edges(relationships: list[dict], proposed: list[dict]) -> tuple[list[dict], int]: """Proposed semantische Kanten dedupliziert in relationships mergen. Gibt (merged, added).""" existing = {(r.get("type"), r.get("from"), r.get("to")) for r in relationships if r.get("from")} added = 0 out = list(relationships) for e in proposed: if e.get("type") not in SEMANTIC_EDGE_TYPES: continue key = (e["type"], e.get("from"), e.get("to")) if key in existing or not e.get("from") or not e.get("to"): continue out.append(e) existing.add(key) added += 1 return out, added