Add obligation discovery pipeline tooling
Sichert die validierte Obligation Discovery Pipeline aus /tmp als dauerhaftes, committetes Tooling (scripts/obligation_discovery/) — der eigentliche Vermögenswert. Stufen: precluster (Embedding-Cache + Mikro-Cluster) → meta_cluster (Review Units, Skalierungs-Fix) → synthesize_obligations (Opus, Key aus ENV, Streaming, harte Tier-Regel, Provenance) → validate_registry → merge_review_diff. Reine Helfer in _core.py, 16 Unit-Tests. Doku docs-src/development/obligation_discovery_pipeline_v1.md mit Meilensteinen (SBOM/Vuln reproduziert, Auth 4408→170 Review Units→54→kuriert 29) und der Architekturregel: Runtime deterministisch, Discovery LLM-gestützt. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,73 @@
|
||||
"""Stufe 1 — Pre-Cluster: Controls (scope) → BGE-M3-Embedding (gecacht) → Mikro-Cluster.
|
||||
Deterministisch. Im bp-compliance-backend-Container ausführen (PYTHONPATH=/app).
|
||||
|
||||
python3 scripts/obligation_discovery/precluster.py --scope sbom
|
||||
python3 scripts/obligation_discovery/precluster.py --patterns '%sbom%,%software bill%' --micro-thr 0.78
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import pickle
|
||||
|
||||
from _core import greedy_cluster, parse_req
|
||||
|
||||
# Predefined scopes: scope name -> SQL ILIKE patterns. main() falls back to
# these when no explicit --patterns value is given.
SCOPES: dict[str, list[str]] = {
    "sbom": [
        "%SBOM%",
        "%software bill%",
        "%stückliste%",
        "%komponentenliste%",
    ],
    "vuln": [
        "%schwachstellenbehandl%",
        "%schwachstellenmanagement%",
        "%vulnerability handling%",
        "%coordinated vulnerab%",
        "%vulnerability disclosure%",
        "%cvd-konzept%",
    ],
    "auth": [
        "%authentisierung%",
        "%authentifizierung%",
        "%authentication%",
    ],
}
|
||||
|
||||
|
||||
async def run(scope: str, patterns: list[str], micro_thr: float, outdir: str) -> None:
    """Fetch the controls of a scope, embed them (with a file cache) and micro-cluster them.

    Steps:
      1. Select ``control_id, title, requirements`` from
         ``compliance.canonical_controls`` where the title matches any of
         ``patterns`` (case-insensitive ``ILIKE``), ordered by ``control_id``.
      2. Build one embedding text per control: the title plus the first two
         entries returned by ``parse_req`` for its requirements.
      3. Load the vectors from ``<outdir>/<scope>_vecs.pkl`` if that file exists
         and holds exactly one vector per control; otherwise embed via
         ``_embed_texts`` and (re)write the cache.
      4. Cluster with ``greedy_cluster(vecs, micro_thr)`` and write the result to
         ``<outdir>/<scope>_micro_clusters.json``.

    Args:
        scope: Label used in log output and in the cache/output file names.
        patterns: SQL ILIKE patterns matched against the control title; must
            contain at least one entry.
        micro_thr: Threshold passed through to ``greedy_cluster``.
        outdir: Directory for the cache and the JSON output; created if missing.

    Raises:
        ValueError: If ``patterns`` is empty (the WHERE clause would be malformed).
    """
    if not patterns:
        raise ValueError("at least one ILIKE pattern is required")

    # Imported inside the function so that importing this module does not
    # require asyncpg or the backend package.
    import asyncpg
    from compliance.services.mc_embedding_matcher import _embed_texts

    # NOTE(review): if neither variable is set, dsn is None and is passed to
    # asyncpg.connect unchanged — confirm that fallback is intended.
    dsn = os.getenv("DATABASE_URL") or os.getenv("COMPLIANCE_DATABASE_URL")

    # Only the positional "$n" placeholders are interpolated into the SQL text;
    # the pattern values themselves are sent as bound parameters.
    where = " or ".join(f"title ilike ${n}" for n, _ in enumerate(patterns, start=1))
    conn = await asyncpg.connect(dsn)
    try:
        rows = await conn.fetch(
            "select control_id, title, requirements from compliance.canonical_controls "
            f"where {where} order by control_id", *patterns)
    finally:
        # Release the connection even when the query fails.
        await conn.close()

    items = []
    for r in rows:
        title = r["title"] or ""
        items.append({
            "control_id": r["control_id"],
            "title": title,
            "embed_text": title + ". " + " ".join(parse_req(r["requirements"])[:2]),
        })
    print(f"scope={scope}: {len(items)} controls")

    # Make sure the target directory exists before the (potentially expensive)
    # embedding call, so the run cannot fail only when writing the cache.
    os.makedirs(outdir, exist_ok=True)

    cache = os.path.join(outdir, f"{scope}_vecs.pkl")
    vecs = None
    if os.path.exists(cache):
        # NOTE(security): pickle.load can execute arbitrary code. The default
        # outdir is /tmp; only point this at a directory you trust.
        with open(cache, "rb") as fh:
            vecs = pickle.load(fh)
        if len(vecs) == len(items):
            print(f"embeddings from cache ({len(vecs)})")
        else:
            # The cache is keyed by scope name only. A different vector count
            # means the selected controls changed, and cluster member indices
            # would no longer line up with `items`.
            print(f"cache stale ({len(vecs)} vectors for {len(items)} controls), re-embedding")
            vecs = None
    if vecs is None:
        vecs = await _embed_texts([it["embed_text"] for it in items])
        with open(cache, "wb") as fh:
            pickle.dump(vecs, fh)
        print(f"embeddings fresh+cached ({len(vecs)})")

    micro = greedy_cluster(vecs, micro_thr)
    print(f"pass-1 (micro-thr={micro_thr}): {len(items)} → {len(micro)} micro-clusters")

    out = [
        {
            "micro_id": i,
            "size": len(c["members"]),
            "member_indices": c["members"],
            "control_ids": [items[j]["control_id"] for j in c["members"]],
            # Only the first six titles are kept as a preview of the cluster.
            "titles": [items[j]["title"] for j in c["members"][:6]],
        }
        for i, c in enumerate(micro)
    ]
    path = os.path.join(outdir, f"{scope}_micro_clusters.json")
    with open(path, "w", encoding="utf-8") as fh:
        json.dump(out, fh, ensure_ascii=False, indent=1)
    print(f"written: {path}")
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line and run the pre-cluster stage.

    ``--patterns`` (comma-separated SQL ILIKE patterns) takes precedence. When
    it is empty, the patterns of the predefined ``--scope`` from ``SCOPES`` are
    used. With explicit patterns, ``--scope`` is only a free-form label for the
    log output and file names, so it is validated only when it is needed for
    the lookup.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--scope", default="sbom")
    ap.add_argument("--patterns", default="", help="comma-separated SQL ILIKE patterns (overrides --scope)")
    ap.add_argument("--micro-thr", type=float, default=0.78)
    ap.add_argument("--outdir", default="/tmp")
    a = ap.parse_args()

    # Strip whitespace around each entry so "a, b" behaves like "a,b"; entries
    # that are empty after stripping are dropped.
    patterns = [p.strip() for p in a.patterns.split(",") if p.strip()]
    if not patterns:
        if a.scope not in SCOPES:
            # Report a usage error instead of an unhandled KeyError traceback.
            ap.error(
                f"unknown scope {a.scope!r}; choose one of {', '.join(sorted(SCOPES))} "
                f"or pass --patterns"
            )
        patterns = SCOPES[a.scope]

    asyncio.run(run(a.scope, patterns, a.micro_thr, a.outdir))


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user