"""Reine Helfer der Obligation Discovery Pipeline (keine schweren Imports → unit-testbar).

Die Pipeline leitet aus großen Compliance-Korpora eine regulatorische Ontologie ab:
  Controls → Mikro-Cluster → Meta-Cluster/Review-Units → LLM-Synthese → Obligation Registry.
Architekturregel: RUNTIME bleibt deterministisch; DISCOVERY (dieses Tooling) darf LLM-gestützt
sein und läuft EINMALIG/offline. Siehe docs-src/development/obligation_discovery_pipeline_v1.md.
"""
from __future__ import annotations

import ast
import json
import math
from typing import Optional

# Relationship types treated as semantic edges: validate_registry counts
# relationships of these types, and merge_edges only accepts proposed edges
# whose type is listed here.
SEMANTIC_EDGE_TYPES = (
    "depends_on",
    "supports",
    "produces_evidence_for",
    "implements",
    "derived_from",
)


def parse_req(req) -> list:
    """Normalize a ``requirements`` value into a list.

    * A list is returned unchanged (the same object).
    * A string is decoded first as JSON and, if that fails, as a Python
      literal. The first decoder that succeeds wins: a decoded list is
      returned as is, any other decoded value is wrapped as ``[str(value)]``.
      If neither decoder accepts the string, the result is ``[req]``.
    * Every other input type yields an empty list.
    """
    if isinstance(req, list):
        return req
    if not isinstance(req, str):
        return []

    for decode in (json.loads, ast.literal_eval):
        try:
            value = decode(req)
            if isinstance(value, list):
                return value
            return [str(value)]
        except Exception:
            # Best effort by design: fall through to the next decoder and,
            # finally, to treating the raw string as a single requirement.
            continue
    return [req]


def cosine(a, b) -> float:
    """Return the cosine similarity of two numeric sequences.

    Returns ``0.0`` when either argument is falsy (e.g. empty or ``None``) or
    when either vector has zero norm. The dot product pairs elements with
    ``zip`` and therefore only covers the common prefix if the lengths
    differ, whereas each norm is computed over the full sequence.
    """
    if not (a and b):
        return 0.0

    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x * x for x in a))
    norm_b = math.sqrt(sum(y * y for y in b))

    # A zero-length vector has no direction; avoid dividing by zero.
    if not norm_a or not norm_b:
        return 0.0
    return dot / (norm_a * norm_b)


def greedy_cluster(vecs: list, thr: float) -> list[dict]:
    """Cluster vectors in a single greedy pass over ``vecs``.

    Each non-empty vector is compared (via ``cosine``) against the seed of
    every cluster created so far and joins the cluster with the highest
    similarity, provided that similarity is at least ``thr``. Because the
    comparison is ``>=``, equal scores resolve in favour of the cluster created
    later. If no cluster qualifies, the vector starts a new cluster and becomes
    its seed; seeds are never updated afterwards.

    Falsy vectors (empty or ``None``) each form their own cluster with
    ``seed`` set to ``None`` and are never used as a comparison target.

    Returns a list of ``{"seed": ..., "members": [...]}`` dicts in creation
    order, where ``members`` holds indices into ``vecs``. The outcome depends
    only on the input order.
    """
    result: list[dict] = []
    for idx, vec in enumerate(vecs):
        if not vec:
            result.append({"seed": None, "members": [idx]})
            continue

        target = None
        top_score = thr
        for candidate in result:
            seed = candidate["seed"]
            if seed is not None:
                score = cosine(vec, seed)
                if score >= top_score:
                    target, top_score = candidate, score

        if target is None:
            result.append({"seed": vec, "members": [idx]})
        else:
            target["members"].append(idx)
    return result


def centroid(idxs: list[int], vecs: list) -> Optional[list]:
    """Return the component-wise mean of the vectors selected by ``idxs``.

    Falsy entries of ``vecs`` (empty or ``None``) are ignored. Returns ``None``
    when no usable vector remains. Components are paired with ``zip``, so the
    result is as long as the shortest selected vector.
    """
    selected = []
    for i in idxs:
        vec = vecs[i]
        if vec:
            selected.append(vec)
    if not selected:
        return None

    count = len(selected)
    return [sum(column) / count for column in zip(*selected)]


def validate_registry(reg: dict) -> dict:
    """Run the robustness checks on an obligation registry and report them.

    Checks performed on ``reg["obligations"]``:

    * every obligation with tier ``LEGAL_MINIMUM`` must have a non-empty
      ``legal_basis``;
    * every obligation must have non-empty ``member_controls``;
    * no ``provenance.source_meta_cluster`` (review unit) may be referenced by
      more than 8 obligations.

    The report also counts ``reg["relationships"]`` entries of type
    ``out_of_scope`` and of any type in ``SEMANTIC_EDGE_TYPES``; these counts
    are informational and do not influence ``passed``.

    Returns a dict with the keys ``obligations``, ``legal_minimum``,
    ``lm_without_legal_basis`` (ids), ``empty_member_controls`` (ids),
    ``over8_per_review_unit`` (unit -> count), ``out_of_scope``,
    ``semantic_edges`` and ``passed``.

    Raises ``KeyError`` if an obligation that has to be reported lacks ``id``.
    """
    obligations = reg.get("obligations", [])
    legal_minimum = [ob for ob in obligations if ob.get("tier") == "LEGAL_MINIMUM"]

    missing_basis = []
    for ob in legal_minimum:
        if not ob.get("legal_basis"):
            missing_basis.append(ob["id"])

    missing_members = []
    for ob in obligations:
        if not ob.get("member_controls"):
            missing_members.append(ob["id"])

    # Count obligations per review unit; provenance may be absent or None.
    unit_counts: dict[str, int] = {}
    for ob in obligations:
        provenance = ob.get("provenance") or {}
        unit = provenance.get("source_meta_cluster")
        if unit:
            unit_counts[unit] = unit_counts.get(unit, 0) + 1
    crowded_units = {unit: count for unit, count in unit_counts.items() if count > 8}

    out_of_scope_count = 0
    semantic_count = 0
    for rel in reg.get("relationships", []):
        rel_type = rel.get("type")
        if rel_type == "out_of_scope":
            out_of_scope_count += 1
        if rel_type in SEMANTIC_EDGE_TYPES:
            semantic_count += 1

    return {
        "obligations": len(obligations),
        "legal_minimum": len(legal_minimum),
        "lm_without_legal_basis": missing_basis,
        "empty_member_controls": missing_members,
        "over8_per_review_unit": crowded_units,
        "out_of_scope": out_of_scope_count,
        "semantic_edges": semantic_count,
        "passed": not (missing_basis or missing_members or crowded_units),
    }


def merge_edges(relationships: list[dict], proposed: list[dict]) -> tuple[list[dict], int]:
    """Merge proposed semantic edges into ``relationships`` without duplicates.

    A proposed edge is appended only if its ``type`` is one of
    ``SEMANTIC_EDGE_TYPES``, both ``from`` and ``to`` are truthy, and no edge
    with the same ``(type, from, to)`` triple is already present — either in
    ``relationships`` or among the edges accepted earlier in this call.

    ``relationships`` itself is not modified; the result is a new list that
    shares the edge dicts with the inputs.

    Returns ``(merged, added)``, where ``added`` is the number of edges
    appended.
    """
    seen = set()
    for rel in relationships:
        if rel.get("from"):
            seen.add((rel.get("type"), rel.get("from"), rel.get("to")))

    merged = list(relationships)
    added = 0
    for edge in proposed:
        edge_type = edge.get("type")
        if edge_type not in SEMANTIC_EDGE_TYPES:
            continue
        source = edge.get("from")
        target = edge.get("to")
        triple = (edge_type, source, target)
        # Membership is tested first, exactly like the endpoint checks follow it.
        if triple in seen or not source or not target:
            continue
        merged.append(edge)
        seen.add(triple)
        added += 1
    return merged, added