Add obligation discovery pipeline tooling

Sichert die validierte Obligation Discovery Pipeline aus /tmp als dauerhaftes,
committetes Tooling (scripts/obligation_discovery/) — der eigentliche Vermögenswert.

Stufen: precluster (Embedding-Cache + Mikro-Cluster) → meta_cluster (Review Units,
Skalierungs-Fix) → synthesize_obligations (Opus, Key aus ENV, Streaming, harte Tier-Regel,
Provenance) → validate_registry → merge_review_diff. Reine Helfer in _core.py, 16 Unit-Tests.

Doku docs-src/development/obligation_discovery_pipeline_v1.md mit Meilensteinen
(SBOM/Vuln reproduziert, Auth 4408→170 Review Units→54→kuriert 29) und der Architekturregel:
Runtime deterministisch, Discovery LLM-gestützt.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-25 07:41:45 +02:00
parent 48e39423e6
commit e1b270c36e
8 changed files with 595 additions and 0 deletions
@@ -0,0 +1,73 @@
"""Stufe 1 — Pre-Cluster: Controls (scope) → BGE-M3-Embedding (gecacht) → Mikro-Cluster.
Deterministic. Run inside the bp-compliance-backend container (PYTHONPATH=/app):

    python3 scripts/obligation_discovery/precluster.py --scope sbom
    python3 scripts/obligation_discovery/precluster.py --patterns '%sbom%,%software bill%' --micro-thr 0.78
"""
from __future__ import annotations
import argparse
import asyncio
import json
import os
import pickle
from _core import greedy_cluster, parse_req
# Predefined scopes: scope name -> SQL ILIKE patterns that are matched against
# the control title. A control belongs to the scope when any pattern matches.
SCOPES = {
    # Software bill of materials (English and German wording).
    "sbom": [
        "%SBOM%",
        "%software bill%",
        "%stückliste%",
        "%komponentenliste%",
    ],
    # Vulnerability handling / coordinated disclosure.
    "vuln": [
        "%schwachstellenbehandl%",
        "%schwachstellenmanagement%",
        "%vulnerability handling%",
        "%coordinated vulnerab%",
        "%vulnerability disclosure%",
        "%cvd-konzept%",
    ],
    # Authentication.
    "auth": [
        "%authentisierung%",
        "%authentifizierung%",
        "%authentication%",
    ],
}
async def run(scope: str, patterns: list[str], micro_thr: float, outdir: str) -> None:
    """Select controls by title pattern, embed them (disk-cached) and write micro-clusters.

    Steps:
      1. Fetch ``control_id``, ``title`` and ``requirements`` of every control whose
         title matches at least one of ``patterns`` (SQL ``ILIKE``), ordered by id.
      2. Build one embedding text per control: the title plus the first two entries
         returned by ``parse_req`` for its requirements.
      3. Load the embeddings from ``{outdir}/{scope}_vecs.pkl`` or compute them via
         ``_embed_texts`` and store them there.
      4. Cluster the vectors with ``greedy_cluster`` and write the result to
         ``{outdir}/{scope}_micro_clusters.json``.

    Args:
        scope: Label used in the log output and as file-name prefix for the
            embedding cache and the result JSON.
        patterns: SQL ILIKE patterns applied to the control title.
        micro_thr: Threshold handed through to ``greedy_cluster``.
        outdir: Directory that holds the cache file and receives the result JSON.

    Raises:
        ValueError: If ``patterns`` is empty (the WHERE clause would be invalid SQL).
    """
    # Imported lazily: both are only available inside the backend container.
    import asyncpg
    from compliance.services.mc_embedding_matcher import _embed_texts

    if not patterns:
        raise ValueError("at least one ILIKE pattern is required")

    dsn = os.getenv("DATABASE_URL") or os.getenv("COMPLIANCE_DATABASE_URL")
    conn = await asyncpg.connect(dsn)
    try:
        # One positional placeholder ($1, $2, ...) per pattern; the values are
        # passed as query arguments, never interpolated into the SQL text.
        where = " or ".join(f"title ilike ${n}" for n in range(1, len(patterns) + 1))
        rows = await conn.fetch(
            f"select control_id, title, requirements from compliance.canonical_controls "
            f"where {where} order by control_id", *patterns)
    finally:
        # Release the connection even when the query fails.
        await conn.close()

    items = [{"control_id": r["control_id"], "title": r["title"] or "",
              "embed_text": (r["title"] or "") + ". " + " ".join(parse_req(r["requirements"])[:2])}
             for r in rows]
    print(f"scope={scope}: {len(items)} controls")

    cache = os.path.join(outdir, f"{scope}_vecs.pkl")
    vecs = None
    if os.path.exists(cache):
        # NOTE(review): pickle.load executes arbitrary code from the file. Only point
        # ``outdir`` at a location that untrusted users cannot write to.
        with open(cache, "rb") as fh:
            vecs = pickle.load(fh)
        if len(vecs) == len(items):
            print(f"embeddings from cache ({len(vecs)})")
        else:
            # The cache is keyed by scope name only. A different pattern set or new
            # controls make the vectors misalign with ``items``, so recompute.
            # NOTE(review): equal length does not prove the cache matches the
            # current controls; delete the cache file after changing patterns.
            print(f"embedding cache stale ({len(vecs)} vectors for {len(items)} controls), re-embedding")
            vecs = None
    if vecs is None:
        vecs = await _embed_texts([it["embed_text"] for it in items])
        with open(cache, "wb") as fh:
            pickle.dump(vecs, fh)
        print(f"embeddings fresh+cached ({len(vecs)})")

    micro = greedy_cluster(vecs, micro_thr)
    print(f"pass-1 (micro-thr={micro_thr}): {len(items)} → {len(micro)} micro-clusters")

    # ``members`` holds indices into ``items``; only the first six titles are
    # kept per cluster as a preview.
    out = [{"micro_id": i, "size": len(c["members"]), "member_indices": c["members"],
            "control_ids": [items[j]["control_id"] for j in c["members"]],
            "titles": [items[j]["title"] for j in c["members"][:6]]}
           for i, c in enumerate(micro)]
    path = os.path.join(outdir, f"{scope}_micro_clusters.json")
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, indent=1)
    print(f"written: {path}")
def main() -> None:
    """Parse the command line and run the pre-cluster stage.

    ``--patterns`` (comma-separated ILIKE patterns) takes precedence; when it is
    empty the pattern list of the named ``--scope`` from ``SCOPES`` is used. An
    unknown scope without explicit patterns ends with a regular CLI usage error.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--scope", default="sbom")
    ap.add_argument("--patterns", default="", help="comma-separated SQL ILIKE patterns (overrides --scope)")
    ap.add_argument("--micro-thr", type=float, default=0.78)
    ap.add_argument("--outdir", default="/tmp")
    a = ap.parse_args()

    patterns = [p for p in a.patterns.split(",") if p]
    if not patterns:
        if a.scope not in SCOPES:
            # Report a usage error (exit status 2) instead of a KeyError traceback.
            ap.error(f"unknown scope {a.scope!r}; choose one of "
                     f"{', '.join(sorted(SCOPES))} or pass --patterns")
        patterns = SCOPES[a.scope]
    asyncio.run(run(a.scope, patterns, a.micro_thr, a.outdir))


if __name__ == "__main__":
    main()