#!/usr/bin/env python3
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.

Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
Copies prompts and quality gate from decomposition_pass.py.

Usage:
    python3 test_pass0a.py                    # 10 controls, Anthropic
    python3 test_pass0a.py --limit 5          # 5 controls
    python3 test_pass0a.py --source "DSGVO"   # filter by source
    python3 test_pass0a.py --dry-run          # show controls, no LLM call
"""

import argparse
import json
import os
import re
import sys
import time
import urllib.parse

# NOTE: psycopg2 and requests are imported inside the functions that use
# them (_connect_db / call_anthropic) so that the pure helpers below
# (quality gate, JSON parsing, formatting) stay importable without the
# database and HTTP drivers installed.

# ── Config ────────────────────────────────────────────────────────────

ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"

# Cost model used by the summary, in USD per one million tokens.
# NOTE(review): presumably the list prices of the default model — confirm
# before relying on the extrapolated cost.
_USD_PER_MTOK_IN = 3
_USD_PER_MTOK_OUT = 15

# Size of the full control set the summary extrapolates to.
_EXTRAPOLATION_TARGET = 6000

# ── Prompts (from decomposition_pass.py) ──────────────────────────────

SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.

REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.

Antworte NUR mit einem JSON-Array. Keine Erklärungen."""


def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Return the user prompt asking the LLM to decompose one control.

    All arguments are interpolated verbatim into the prompt text, so the
    caller is expected to pass already-formatted strings.
    """
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.

CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}

Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""


# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──

# Tier 1: Pflicht (mandatory)
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)

# Tier 2: Empfehlung (recommendation)
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)

# Tier 3: Kann (optional/permissive)
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)

# Union (backward compat)
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)

_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)

_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)

_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)

# A conjunction followed later by a second normative verb: the text most
# likely bundles more than one action. Compiled once instead of per call.
_MULTI_VERB_RE = re.compile(
    r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
    r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
    re.IGNORECASE,
)

# Text that starts with an artefact noun is evidence, not an obligation.
_EVIDENCE_ONLY_RE = re.compile(
    r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
    re.IGNORECASE,
)

# Contribution of each boolean flag to the confidence score.
_CONFIDENCE_WEIGHTS = {
    "has_normative_signal": 0.25,
    "single_action": 0.20,
    "not_rationale": 0.20,
    "not_evidence_only": 0.15,
    "min_length": 0.10,
    "has_parent_link": 0.05,
}

# Flags that must be true for an obligation to pass the gate.
# has_normative_signal is NO LONGER critical.
_CRITICAL_FLAGS = ("not_evidence_only", "min_length", "has_parent_link")

# Display labels for the three obligation tiers.
_TYPE_LABELS = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}


def classify_obligation_type(txt):
    """Classify: pflicht > empfehlung > kann > empfehlung (default)."""
    if _PFLICHT_RE.search(txt):
        return "pflicht"
    if _EMPFEHLUNG_RE.search(txt):
        return "empfehlung"
    if _KANN_RE.search(txt):
        return "kann"
    return "empfehlung"


def quality_gate(obl_text, parent_uuid):
    """Validate + classify obligation.

    Args:
        obl_text: Obligation text produced by the LLM. ``None`` is treated
            as an empty string.
        parent_uuid: Identifier of the parent control; any falsy value
            counts as a missing parent link.

    Returns:
        (flags_dict, passed_bool, confidence, obligation_type).
        ``flags_dict`` holds the boolean checks plus the string entry
        ``"obligation_type"``; ``passed_bool`` is true when all critical
        flags hold; ``confidence`` is the weighted sum of satisfied flags
        (capped at 1.0).
    """
    obl_text = obl_text or ""
    stripped = obl_text.strip()
    flags = {}

    # 1. Normative signal (informational)
    flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(obl_text))

    # 1b. Obligation type classification
    obl_type = classify_obligation_type(obl_text)
    flags["obligation_type"] = obl_type

    # 2. Single action
    flags["single_action"] = not _MULTI_VERB_RE.search(obl_text)

    # 3. Not rationale: at least as many normative as rationale markers
    normative_count = len(_NORMATIVE_RE.findall(obl_text))
    rationale_count = len(_RATIONALE_RE.findall(obl_text))
    flags["not_rationale"] = normative_count >= rationale_count

    # 4. Not evidence-only
    flags["not_evidence_only"] = not _EVIDENCE_ONLY_RE.match(stripped)

    # 5. Min length
    flags["min_length"] = len(stripped) >= 20

    # 6. Parent link
    flags["has_parent_link"] = bool(parent_uuid)

    # Confidence: only weighted flags count; the string-valued
    # "obligation_type" entry has no weight and is skipped.
    confidence = sum(
        _CONFIDENCE_WEIGHTS[k]
        for k, v in flags.items()
        if v and k in _CONFIDENCE_WEIGHTS
    )
    # Bonus for pflicht classification
    if obl_type == "pflicht":
        confidence = min(confidence + 0.05, 1.0)

    passed = all(flags[k] for k in _CRITICAL_FLAGS)
    return flags, passed, confidence, obl_type


# ── JSON parsing ──────────────────────────────────────────────────────

def parse_json_array(text):
    """Parse an LLM reply into a list.

    Tries the whole text as JSON first (a single object is wrapped in a
    list), then falls back to the outermost ``[...]`` span so replies
    wrapped in prose or code fences still parse. Returns ``[]`` when
    nothing usable is found or ``text`` is empty / not a string.
    """
    if not isinstance(text, str) or not text:
        return []
    try:
        result = json.loads(text)
    except json.JSONDecodeError:
        pass
    else:
        if isinstance(result, list):
            return result
        if isinstance(result, dict):
            return [result]

    match = re.search(r"\[[\s\S]*\]", text)
    if match:
        try:
            result = json.loads(match.group())
        except json.JSONDecodeError:
            return []
        if isinstance(result, list):
            return result
    return []


# ── API call ──────────────────────────────────────────────────────────

def call_anthropic(prompt):
    """Send ``prompt`` to the Anthropic Messages API.

    Returns:
        ``(text, usage, error)``. On success ``error`` is ``None``; on any
        failure (transport error, non-200 status, non-JSON body) ``text``
        is ``None``, ``usage`` is ``{}`` and ``error`` describes the
        problem, so one failed call does not abort the whole run.
    """
    import requests  # local import, see note at the top of the file

    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "system": [
            {
                "type": "text",
                "text": SYSTEM_PROMPT,
                "cache_control": {"type": "ephemeral"},
            }
        ],
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        resp = requests.post(
            f"{ANTHROPIC_API_URL}/messages",
            headers=headers,
            json=payload,
            timeout=120,
        )
    except requests.RequestException as exc:
        return None, {}, f"Request failed: {exc}"

    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"

    try:
        data = resp.json()
    except ValueError as exc:  # body was not valid JSON
        return None, {}, f"Invalid JSON response: {exc}"

    usage = data.get("usage", {})
    content = data.get("content", [])
    first = content[0] if content else None
    text = first.get("text", "") if isinstance(first, dict) else ""
    return text, usage, None


# ── Format helpers ────────────────────────────────────────────────────

def fmt_json(val):
    """Render a JSON-ish DB value as prompt text.

    ``None`` becomes ``""``; a string is JSON-decoded when possible and
    returned unchanged otherwise; a list becomes one `` - item`` bullet
    per line; anything else is passed through ``str``.
    """
    if val is None:
        return ""
    if isinstance(val, str):
        try:
            val = json.loads(val)
        except (json.JSONDecodeError, TypeError):
            return val
    if isinstance(val, list):
        return "\n".join(f" - {item}" for item in val)
    return str(val)


# ── Database helpers ──────────────────────────────────────────────────

def _connect_db(db_url):
    """Open a psycopg2 connection from a ``scheme://user:pw@host/db`` URL."""
    import psycopg2  # local import, see note at the top of the file

    parts = urllib.parse.urlparse(db_url)
    # urlparse leaves percent-escapes in place, so credentials containing
    # reserved characters (e.g. "@" written as "%40") must be decoded.
    user = urllib.parse.unquote(parts.username) if parts.username else None
    password = urllib.parse.unquote(parts.password) if parts.password else None
    return psycopg2.connect(
        host=parts.hostname,
        port=parts.port or 5432,
        user=user,
        password=password,
        dbname=parts.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )


def _fetch_controls(conn, source, limit):
    """Return up to ``limit`` top-level controls, optionally filtered by source."""
    query = """
        SELECT id, control_id, title, objective, requirements,
               test_procedure, source_citation, category
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
          AND parent_control_uuid IS NULL
          AND title IS NOT NULL
          AND objective IS NOT NULL
          AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
    """
    params = []
    if source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{source}%")
    # NOTE(review): ordering by source first means LIMIT takes the
    # alphabetically first sources; random() only shuffles within one
    # source. Confirm this is the intended "diverse sample".
    query += " ORDER BY source_citation->>'source', random()"
    query += " LIMIT %s"
    params.append(limit)

    with conn.cursor() as cur:
        cur.execute(query, params)
        return cur.fetchall()


# ── Main ──────────────────────────────────────────────────────────────

def main():
    """Run Pass 0a on a small DB sample, print results and a summary."""
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    db_url = os.environ.get("DATABASE_URL")
    if not db_url:
        print("ERROR: Set DATABASE_URL")
        sys.exit(1)

    # The connection is only needed for the sample query; close it before
    # the (slow) LLM loop and on every exit path.
    conn = _connect_db(db_url)
    try:
        controls = _fetch_controls(conn, args.source, args.limit)
    finally:
        conn.close()

    if not controls:
        print("No controls found.")
        return

    print("=" * 70)
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print("=" * 70)

    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    processed = 0  # controls with a successful API call
    failed_calls = 0
    all_results = []
    t_start = time.time()

    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row

        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"

        print(f"\n{'─' * 70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f" Source: {source_str} | Category: {category or 'N/A'}")
        print(f" Objective: {(objective or '')[:200]}")

        if args.dry_run:
            print(" [DRY RUN]")
            continue

        prompt = build_prompt(
            title or "", objective or "", req_str, test_str, source_str
        )

        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0

        if error:
            failed_calls += 1
            print(f" ERROR: {error}")
            continue
        processed += 1

        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok

        # The model may emit stray non-object items; only dicts can be
        # treated as obligations.
        obligations = [
            o for o in parse_json_array(response_text) if isinstance(o, dict)
        ]
        total_obls += len(obligations)

        cached_note = f" ({cached} cached)" if cached else ""
        print(
            f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
            f"{cached_note}"
            f" | {len(obligations)} obligation(s)"
        )

        for j, obl in enumerate(obligations, 1):
            # A JSON null would otherwise reach the regexes as None.
            obl_text = str(obl.get("obligation_text") or "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))

            # Auto-detect
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True

            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1

            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"

            # Show type instead of PASS/REJECT
            if passed:
                status = _TYPE_LABELS.get(obl_type, "EMPFEHLUNG")
            else:
                status = "REJECT"

            failed_flags = [
                k for k, v in flags.items() if isinstance(v, bool) and not v
            ]

            print(f"\n {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f" {obl_text}")
            print(f" Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f" Bedingung: {condition}")
            if not passed:
                print(f" Abgelehnt: {', '.join(failed_flags)}")

            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": dict(flags),
            })

        # Small pause between API calls.
        time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    cost = (
        total_in * _USD_PER_MTOK_IN + total_out * _USD_PER_MTOK_OUT
    ) / 1_000_000
    obl_denominator = max(total_obls, 1)

    print(f"\n\n{'=' * 70}")
    print("ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print("=" * 70)
    print(f" Controls: {len(controls)}")
    if failed_calls:
        print(f" Fehlgeschlagen: {failed_calls}")
    # Averages use only controls whose API call succeeded; failed calls
    # contribute no tokens or obligations and would skew the result.
    print(
        f" Obligations: {total_obls}"
        f" ({total_obls / max(processed, 1):.1f} pro Control)"
    )
    print(" ── Klassifizierung ──")
    for label, key in (
        ("Pflicht", "pflicht"),
        ("Empfehlung", "empfehlung"),
        ("Kann", "kann"),
    ):
        print(
            f" {label}: {type_counts[key]}"
            f" ({type_counts[key] * 100 / obl_denominator:.0f}%)"
        )
    print(
        f" Rejected: {total_rejected}"
        f" ({total_rejected * 100 / obl_denominator:.0f}%)"
        f" (nur evidence-only/zu kurz/kein parent)"
    )
    print(" ── Kosten ──")
    print(f" Laufzeit: {elapsed_total:.1f}s")
    print(f" Tokens: {total_in:,} in / {total_out:,} out")
    print(f" Kosten: ${cost:.4f}")

    # processed is 0 on a dry run, so this also skips the extrapolation there.
    if processed > 0 and total_obls > 0:
        n = _EXTRAPOLATION_TARGET
        factor = n / processed
        print(f"\n --- Hochrechnung auf {n:,} Controls ---")
        print(
            f" Tokens: {int(total_in * factor):,} in"
            f" / {int(total_out * factor):,} out"
        )
        print(f" Kosten: ${cost * factor:.2f}")
        print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
        print(f" Obligations: ~{int(total_obls / processed * n):,}")
        pf = int(type_counts["pflicht"] * factor)
        ef = int(type_counts["empfehlung"] * factor)
        kf = int(type_counts["kann"] * factor)
        print(f" Pflicht: ~{pf:,}")
        print(f" Empfehlung: ~{ef:,}")
        print(f" Kann: ~{kf:,}")

    # Save results JSON for later analysis
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        # ensure_ascii=False writes umlauts verbatim, so the encoding must
        # not depend on the locale.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n Ergebnisse gespeichert: {out_path}")


if __name__ == "__main__":
    main()