Add obligation discovery pipeline tooling
Sichert die validierte Obligation Discovery Pipeline aus /tmp als dauerhaftes, committetes Tooling (scripts/obligation_discovery/) — der eigentliche Vermögenswert. Stufen: precluster (Embedding-Cache + Mikro-Cluster) → meta_cluster (Review Units, Skalierungs-Fix) → synthesize_obligations (Opus, Key aus ENV, Streaming, harte Tier-Regel, Provenance) → validate_registry → merge_review_diff. Reine Helfer in _core.py, 16 Unit-Tests. Doku docs-src/development/obligation_discovery_pipeline_v1.md mit Meilensteinen (SBOM/Vuln reproduziert, Auth 4408→170 Review Units→54→kuriert 29) und der Architekturregel: Runtime deterministisch, Discovery LLM-gestützt. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,114 @@
|
||||
"""Reine Helfer der Obligation Discovery Pipeline (keine schweren Imports → unit-testbar).
|
||||
|
||||
Die Pipeline leitet aus großen Compliance-Korpora eine regulatorische Ontologie ab:
|
||||
Controls → Mikro-Cluster → Meta-Cluster/Review-Units → LLM-Synthese → Obligation Registry.
|
||||
Architekturregel: RUNTIME bleibt deterministisch; DISCOVERY (dieses Tooling) darf LLM-gestützt
|
||||
sein und läuft EINMALIG/offline. Siehe docs-src/development/obligation_discovery_pipeline_v1.md.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import ast
|
||||
import json
|
||||
import math
|
||||
from typing import Optional
|
||||
|
||||
# Relationship types treated as semantic edges in this module:
# validate_registry() counts relationships of these types, and
# merge_edges() only accepts proposed edges of these types.
SEMANTIC_EDGE_TYPES = (
    "depends_on",
    "supports",
    "produces_evidence_for",
    "implements",
    "derived_from",
)
|
||||
|
||||
|
||||
def parse_req(req) -> list:
    """Coerce a ``requirements`` cell into a list, tolerating several encodings.

    * A list is returned unchanged.
    * A string is decoded as JSON first and, if that fails, as a Python
      literal. The first decoder that succeeds decides the result: a decoded
      list is returned as is, any other decoded value is wrapped as
      ``[str(value)]``.
    * A string that neither decoder accepts is returned as ``[req]``.
    * Any other input type yields an empty list.
    """
    if isinstance(req, list):
        return req
    if not isinstance(req, str):
        return []

    for decode in (json.loads, ast.literal_eval):
        try:
            decoded = decode(req)
            if isinstance(decoded, list):
                return decoded
            return [str(decoded)]
        except Exception:
            # Best effort: a failing decoder simply hands over to the next one.
            continue

    # Neither decoder understood the text; keep it as a single raw entry.
    return [req]
|
||||
|
||||
|
||||
def cosine(a, b) -> float:
    """Return the cosine similarity of two numeric sequences.

    The result is ``0.0`` when either input is empty/falsy or when either
    vector has zero length (norm), so the function never divides by zero.
    The dot product pairs elements with ``zip`` and therefore stops at the
    shorter input, while each norm is taken over its full sequence.
    """
    if a and b:
        norm_a = math.sqrt(sum(x * x for x in a))
        norm_b = math.sqrt(sum(y * y for y in b))
        if norm_a and norm_b:
            dot = sum(x * y for x, y in zip(a, b))
            return dot / (norm_a * norm_b)
    return 0.0
|
||||
|
||||
|
||||
def greedy_cluster(vecs: list, thr: float) -> list[dict]:
    """Cluster vectors in a single pass against fixed cluster seeds.

    Vectors are visited in input order. Each non-empty vector is compared
    with the seed of every existing cluster and joins the cluster whose seed
    has the highest cosine similarity of at least ``thr``; when several seeds
    score equally, the most recently created of them wins. If no seed reaches
    ``thr``, the vector opens a new cluster and becomes its seed. Seeds are
    never updated afterwards, so the result depends only on the input order.

    An empty/falsy vector always gets a cluster of its own with seed ``None``;
    such clusters are skipped during matching and never gain members.

    Returns the clusters in creation order, each as
    ``{"seed": <vector or None>, "members": [<input indices>]}``.
    """
    result: list[dict] = []
    for idx, vec in enumerate(vecs):
        if not vec:
            result.append({"seed": None, "members": [idx]})
            continue

        target = None
        bar = thr  # similarity a seed must reach (or match) to be selected
        for cluster in result:
            seed = cluster["seed"]
            if seed is None:
                continue
            sim = cosine(vec, seed)
            if sim >= bar:
                target, bar = cluster, sim

        if target is None:
            result.append({"seed": vec, "members": [idx]})
        else:
            target["members"].append(idx)
    return result
|
||||
|
||||
|
||||
def centroid(idxs: list[int], vecs: list) -> Optional[list]:
    """Return the component-wise mean of the vectors selected by ``idxs``.

    Selected entries of ``vecs`` that are empty/falsy are ignored. If nothing
    remains, ``None`` is returned. Components are paired with ``zip``, so the
    result is as long as the shortest contributing vector.
    """
    picked = []
    for i in idxs:
        vec = vecs[i]
        if vec:
            picked.append(vec)
    if not picked:
        return None
    count = len(picked)
    return [sum(column) / count for column in zip(*picked)]
|
||||
|
||||
|
||||
def validate_registry(reg: dict) -> dict:
    """Run the consistency checks on an obligation registry and summarize them.

    Reads ``reg["obligations"]`` and ``reg["relationships"]`` (both default to
    empty) and returns a report dict with these keys:

    * ``obligations`` - number of obligations.
    * ``legal_minimum`` - number of obligations with tier ``LEGAL_MINIMUM``.
    * ``lm_without_legal_basis`` - ids of ``LEGAL_MINIMUM`` obligations whose
      ``legal_basis`` is missing or empty.
    * ``empty_member_controls`` - ids of obligations whose ``member_controls``
      is missing or empty.
    * ``over8_per_review_unit`` - mapping of ``provenance.source_meta_cluster``
      to its obligation count, for units with more than 8 obligations.
    * ``out_of_scope`` - number of relationships of type ``out_of_scope``.
    * ``semantic_edges`` - number of relationships whose type is listed in
      ``SEMANTIC_EDGE_TYPES``.
    * ``passed`` - ``True`` only if the three finding collections are empty.

    An obligation that has to be reported but has no ``"id"`` key raises
    ``KeyError``.
    """
    obligations = reg.get("obligations", [])
    relationships = reg.get("relationships", [])

    legal_minimum_count = 0
    missing_basis = []
    no_members = []
    unit_counts: dict[str, int] = {}

    for obligation in obligations:
        if obligation.get("tier") == "LEGAL_MINIMUM":
            legal_minimum_count += 1
            if not obligation.get("legal_basis"):
                missing_basis.append(obligation["id"])
        if not obligation.get("member_controls"):
            no_members.append(obligation["id"])
        # provenance may be absent or None; treat both like an empty mapping.
        provenance = obligation.get("provenance") or {}
        unit = provenance.get("source_meta_cluster")
        if unit:
            unit_counts[unit] = unit_counts.get(unit, 0) + 1

    crowded_units = {unit: count for unit, count in unit_counts.items() if count > 8}
    out_of_scope_count = sum(
        1 for rel in relationships if rel.get("type") == "out_of_scope"
    )
    semantic_edge_count = sum(
        1 for rel in relationships if rel.get("type") in SEMANTIC_EDGE_TYPES
    )

    return {
        "obligations": len(obligations),
        "legal_minimum": legal_minimum_count,
        "lm_without_legal_basis": missing_basis,
        "empty_member_controls": no_members,
        "over8_per_review_unit": crowded_units,
        "out_of_scope": out_of_scope_count,
        "semantic_edges": semantic_edge_count,
        "passed": not (missing_basis or no_members or crowded_units),
    }
|
||||
|
||||
|
||||
def merge_edges(relationships: list[dict], proposed: list[dict]) -> tuple[list[dict], int]:
    """Merge proposed semantic edges into ``relationships`` without duplicates.

    An edge is identified by its ``(type, from, to)`` triple. A proposed edge
    is appended only if its type is listed in ``SEMANTIC_EDGE_TYPES``, both
    ``from`` and ``to`` are set, and the triple is not already present among
    the existing relationships or the edges added earlier in this call.

    The input list is not modified; the edge dicts themselves are shared with
    the result rather than copied.

    Returns ``(merged, added)``: the new relationship list and the number of
    edges that were appended.
    """
    seen = set()
    for rel in relationships:
        if rel.get("from"):
            seen.add((rel.get("type"), rel.get("from"), rel.get("to")))

    merged = list(relationships)
    added = 0
    for edge in proposed:
        edge_type = edge.get("type")
        if edge_type not in SEMANTIC_EDGE_TYPES:
            continue
        source, target = edge.get("from"), edge.get("to")
        key = (edge_type, source, target)
        if key in seen or not (source and target):
            continue
        merged.append(edge)
        seen.add(key)
        added += 1
    return merged, added
|
||||
Reference in New Issue
Block a user