Files
breakpilot-compliance/scripts/qa/test_pass0a.py
Benjamin Admin 643b26618f
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat: Control Library UI, dedup migration, QA tooling, docs
- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:56:08 +01:00

471 lines
17 KiB
Python

#!/usr/bin/env python3
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.

Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
Copies prompts and quality gate from decomposition_pass.py.

Usage:
    python3 test_pass0a.py                   # 10 controls, Anthropic
    python3 test_pass0a.py --limit 5         # 5 controls
    python3 test_pass0a.py --source "DSGVO"  # filter by source
    python3 test_pass0a.py --dry-run         # show controls, no LLM call

Environment:
    DATABASE_URL        PostgreSQL connection URL (always required)
    ANTHROPIC_API_KEY   API key (required unless --dry-run is given)
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.parse
import psycopg2
import requests
# ── Config ────────────────────────────────────────────────────────────
# API key sent in the "x-api-key" header; empty string when the variable is unset.
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
# Model name; can be overridden through DECOMPOSITION_LLM_MODEL.
ANTHROPIC_MODEL = os.getenv("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
# Base URL; call_anthropic appends "/messages".
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
# ── Prompts (from decomposition_pass.py) ──────────────────────────────
# System prompt (German) sent with every request. It casts the model as a
# legal-compliance expert that splits a control into atomic obligations,
# lists six extraction rules (normative statements only, one main verb per
# obligation, test and reporting obligations kept separate, no evidence-level
# decomposition, no rationale text) and demands a bare JSON array as output.
# The text is part of the request payload: do not reformat or translate it.
SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.
REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.
Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Assemble the user prompt for one control.

    The prompt consists of a one-sentence instruction, the five control
    fields (each rendered with its German label) and a JSON example showing
    the expected shape of a single obligation object.

    Args:
        title: Control title.
        objective: Control objective.
        requirements: Requirements, already rendered as text.
        test_procedure: Test procedure, already rendered as text.
        source_ref: Source reference text.

    Returns:
        The prompt as a single newline-separated string.
    """
    prompt_lines = [
        # Instruction sentence (one logical line in the prompt).
        "Analysiere das folgende Control und extrahiere alle einzelnen normativen "
        "Pflichten als JSON-Array.",
        # Control fields.
        "CONTROL:",
        f"Titel: {title}",
        f"Ziel: {objective}",
        f"Anforderungen: {requirements}",
        f"Prüfverfahren: {test_procedure}",
        f"Quellreferenz: {source_ref}",
        # Expected answer format.
        "Antworte als JSON-Array:",
        "[",
        "{",
        '"obligation_text": "Kurze, präzise Formulierung der Pflicht",',
        '"action": "Hauptverb/Handlung",',
        '"object": "Gegenstand der Pflicht",',
        '"condition": "Auslöser/Bedingung oder null",',
        '"normative_strength": "must",',
        '"is_test_obligation": false,',
        '"is_reporting_obligation": false',
        "}",
        "]",
    ]
    return "\n".join(prompt_lines)
# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
# All patterns below are compiled case-insensitively and are applied with
# .search()/.findall() to the obligation text returned by the LLM.

# Tier 1: Pflicht (mandatory)
# Matches mandatory wording: "müssen"/"muss", "hat/haben sicherzustellen",
# "ist/sind verpflichtet", "ist/sind/hat/haben [one word] zu <verb>en",
# fused zu-infinitives after "ist/sind" (\w+zu\w+en), the English
# "shall"/"must"/"required", a fixed list of "...zu<verb>" infinitives
# (…zuteilen, …zustellen, …zuweisen, …), and "ist/sind" followed within
# 1-80 characters by "zu <verb>en".
# NOTE(review): the alternative \b\w+zustellen\b matches "sicherzustellen" on
# its own, so that word classifies as Pflicht even without "hat/haben".
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)
# Tier 2: Empfehlung (recommendation)
# Matches "soll"/"sollen"/"sollte"/"sollten", "should"/"ensure"/"recommend…",
# a set of bare action verbs (gewährleisten, sicherstellen, nachweisen,
# dokumentieren, implementieren, überprüfen, überwachen, …) and the phrases
# "prüfen, ob" / "kontrollieren, ob".
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)
# Tier 3: Kann (optional/permissive)
# Matches "kann"/"können"/"darf"/"dürfen"/"may"/"optional".
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)
# Union (backward compat)
# Built by joining the three tier pattern sources with "|"; a match means the
# text carries a normative signal of any tier.
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)
# Rationale markers: "da" followed by whitespace, "weil", "grund",
# words starting with "erwägung", and "because"/"reason"/"rationale".
_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)
# Test-obligation wording (testen, test, prüfung, prüfen, getestet,
# wirksamkeit, audit, effectiveness, words starting with "verif", or
# "regelmäßig" followed later by a prüf/test/kontroll stem).
# NOTE(review): this pattern contains a capturing group, so .findall() would
# return group contents rather than whole matches; use .search() with it.
_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)
# Reporting-obligation wording. Several alternatives are word stems without a
# closing \b (unterricht…, benachricht…, notif…, behörd…) and therefore match
# any word beginning with that stem.
_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)
def classify_obligation_type(txt):
    """Return the obligation tier of *txt*.

    The tiers are checked in priority order pflicht, empfehlung, kann; the
    first pattern that matches decides. Text without any match is treated
    as "empfehlung".
    """
    tiers_by_priority = (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    )
    for tier_pattern, tier_name in tiers_by_priority:
        if tier_pattern.search(txt):
            return tier_name
    # No normative wording found: fall back to the middle tier.
    return "empfehlung"
def quality_gate(obl_text, parent_uuid):
    """Validate and classify one obligation text.

    Args:
        obl_text: Obligation text to check.
        parent_uuid: Identifier of the parent control; any truthy value
            counts as a parent link.

    Returns:
        A tuple ``(flags, passed, confidence, obligation_type)`` where
        ``flags`` maps check names to booleans (plus ``"obligation_type"``
        as a string), ``passed`` is True when the three critical checks
        hold, ``confidence`` is the weighted sum of satisfied checks
        (+0.05 for "pflicht", capped at 1.0), and ``obligation_type`` is
        the tier from ``classify_obligation_type``.
    """
    # Normative signal is informational only; it feeds the confidence score.
    normative_signal = bool(_NORMATIVE_RE.search(obl_text))
    obl_type = classify_obligation_type(obl_text)

    # A conjunction followed later by one of the listed verbs suggests that
    # the text bundles more than one action.
    conjunction_then_verb = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    # Texts that start with an evidence noun describe evidence, not a duty.
    evidence_prefix = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )

    normative_hits = _NORMATIVE_RE.findall(obl_text)
    rationale_hits = _RATIONALE_RE.findall(obl_text)
    trimmed = obl_text.strip()

    # Key order matters to callers that list or serialize the flags.
    flags = {
        "has_normative_signal": normative_signal,
        "obligation_type": obl_type,
        "single_action": conjunction_then_verb.search(obl_text) is None,
        "not_rationale": len(normative_hits) >= len(rationale_hits),
        "not_evidence_only": evidence_prefix.match(trimmed) is None,
        "min_length": len(trimmed) >= 20,
        "has_parent_link": bool(parent_uuid),
    }

    check_weights = {
        "has_normative_signal": 0.25, "single_action": 0.20,
        "not_rationale": 0.20, "not_evidence_only": 0.15,
        "min_length": 0.10, "has_parent_link": 0.05,
    }
    confidence = sum(
        weight for check, weight in check_weights.items() if flags[check]
    )
    # Bonus for pflicht classification.
    if obl_type == "pflicht":
        confidence = min(confidence + 0.05, 1.0)

    # has_normative_signal is deliberately not part of the pass decision.
    passed = (
        flags["not_evidence_only"]
        and flags["min_length"]
        and flags["has_parent_link"]
    )
    return flags, passed, confidence, obl_type
# ── JSON parsing ──────────────────────────────────────────────────────
def parse_json_array(text):
    """Extract a JSON array from an LLM response.

    Three strategies are tried in order:

    1. Parse the whole text. A list is returned as is, a single object is
       wrapped in a one-element list.
    2. Parse the span from the first "[" to the last "]" (handles code
       fences or prose around a single array).
    3. Scan every "[" and decode one JSON value starting there. This
       recovers an array when trailing prose contains further brackets,
       which makes step 2 fail. Only arrays consisting entirely of objects
       are accepted here, so bracketed footnotes such as "[1]" are skipped.

    Args:
        text: Raw response text; ``None`` or an empty string is allowed.

    Returns:
        The decoded list, or ``[]`` when no array could be recovered.
    """
    if not text:
        return []

    # 1. The whole response is valid JSON.
    try:
        parsed = json.loads(text)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        return [parsed]

    # 2. Outermost bracket span.
    match = re.search(r"\[[\s\S]*\]", text)
    if match:
        try:
            parsed = json.loads(match.group())
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, list):
            return parsed

    # 3. First decodable array of objects, ignoring whatever follows it.
    decoder = json.JSONDecoder()
    start = text.find("[")
    while start != -1:
        try:
            candidate, _end = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, list) and all(
            isinstance(item, dict) for item in candidate
        ):
            return candidate
        start = text.find("[", start + 1)
    return []
# ── API call ──────────────────────────────────────────────────────────
def call_anthropic(prompt):
    """Send *prompt* to the Anthropic Messages endpoint.

    The system prompt is sent as a cacheable block
    (``cache_control: ephemeral``); the request times out after 120 s.

    Args:
        prompt: User message content.

    Returns:
        A tuple ``(text, usage, error)``. On success ``text`` is the text of
        the first content block ("" when the response has no content),
        ``usage`` is the response's usage dict and ``error`` is ``None``.
        On any failure — transport error, non-200 status or a body that is
        not JSON — the tuple is ``(None, {}, <message>)`` so the caller can
        skip this control and continue.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "system": [
            {
                "type": "text",
                "text": SYSTEM_PROMPT,
                "cache_control": {"type": "ephemeral"},
            }
        ],
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        resp = requests.post(
            f"{ANTHROPIC_API_URL}/messages",
            headers=headers,
            json=payload,
            timeout=120,
        )
    except requests.RequestException as exc:
        # Timeouts and connection failures follow the same error contract as
        # HTTP errors instead of aborting the whole run.
        return None, {}, f"Request failed: {exc}"

    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"

    try:
        data = resp.json()
    except ValueError as exc:
        # requests raises a ValueError subclass when the body is not JSON.
        return None, {}, f"Invalid JSON response: {exc}"

    usage = data.get("usage", {})
    content = data.get("content", [])
    text = content[0].get("text", "") if content else ""
    return text, usage, None
# ── Format helpers ────────────────────────────────────────────────────
def fmt_json(val):
    """Render a JSON-ish database value as prompt text.

    ``None`` becomes an empty string. A string is JSON-decoded when
    possible and returned unchanged otherwise. A list (given directly or
    decoded from a string) is rendered as one " - item" line per element;
    every other value is rendered with ``str()``.
    """
    if val is None:
        return ""

    decoded = val
    if isinstance(val, str):
        try:
            decoded = json.loads(val)
        except (json.JSONDecodeError, TypeError):
            # Plain text, not JSON: pass it through untouched.
            return val

    if not isinstance(decoded, list):
        return str(decoded)

    bullet_lines = [f" - {item}" for item in decoded]
    return "\n".join(bullet_lines)
# ── Main ──────────────────────────────────────────────────────────────
# Labels printed for accepted obligations, keyed by obligation type.
_TYPE_LABELS = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}

# Base sample query. The optional source filter, the ORDER BY clause and the
# LIMIT are appended by _load_controls.
_CONTROL_QUERY = """
SELECT id, control_id, title, objective, requirements,
test_procedure, source_citation, category
FROM compliance.canonical_controls
WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
AND parent_control_uuid IS NULL
AND title IS NOT NULL AND objective IS NOT NULL
AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
"""


def _connect(db_url):
    """Open a psycopg2 connection for *db_url* with search_path compliance,public."""
    parts = urllib.parse.urlparse(db_url)
    # urlparse leaves percent-escapes in place; decode them so credentials
    # containing reserved characters (e.g. "@" written as "%40") work.
    user = urllib.parse.unquote(parts.username) if parts.username else parts.username
    password = urllib.parse.unquote(parts.password) if parts.password else parts.password
    return psycopg2.connect(
        host=parts.hostname,
        port=parts.port or 5432,
        user=user,
        password=password,
        dbname=parts.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )


def _load_controls(db_url, source, limit):
    """Fetch the sample rows and close the connection before returning."""
    query = _CONTROL_QUERY
    params = []
    if source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{source}%")
    # NOTE(review): ordering by source first means LIMIT takes rows from the
    # alphabetically first sources; random() only shuffles within a source.
    query += " ORDER BY source_citation->>'source', random()"
    query += " LIMIT %s"
    params.append(limit)

    conn = _connect(db_url)
    try:
        with conn.cursor() as cur:
            cur.execute(query, params)
            return cur.fetchall()
    finally:
        # The database is only needed for this one query; do not keep a
        # connection (and its open transaction) around during the LLM calls.
        conn.close()


def _describe_source(src_cit):
    """Return "<source> <article>" for a source_citation value, or ""."""
    if not src_cit:
        return ""
    citation = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
    return f"{citation.get('source', '')} {citation.get('article', '')}"


def _report_obligation(index, obl, ctrl_id, ctrl_uuid):
    """Run the quality gate on one obligation, print it, return its record."""
    # The model may emit null or a non-string here; normalise to text.
    obl_text = str(obl.get("obligation_text") or "")
    action = obl.get("action", "")
    obj = obl.get("object", "")
    condition = obl.get("condition")
    strength = obl.get("normative_strength", "must")
    # Model flags, with regex auto-detection as a fallback.
    is_test = bool(obl.get("is_test_obligation", False)) or bool(
        _TEST_RE.search(obl_text)
    )
    is_report = bool(obl.get("is_reporting_obligation", False)) or bool(
        _REPORTING_RE.search(obl_text)
    )

    flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))

    if is_test:
        tag = " [TEST]"
    elif is_report:
        tag = " [MELDEPFLICHT]"
    else:
        tag = ""
    # Show the type instead of a plain PASS.
    status = _TYPE_LABELS.get(obl_type, "EMPFEHLUNG") if passed else "REJECT"

    print(f"\n {index}. [{status}] conf={conf:.0%}{tag} strength={strength}")
    print(f" {obl_text}")
    print(f" Handlung: {action} | Gegenstand: {obj}")
    if condition:
        print(f" Bedingung: {condition}")
    if not passed:
        failed = [k for k, v in flags.items() if isinstance(v, bool) and not v]
        print(f" Abgelehnt: {', '.join(failed)}")

    return {
        "control_id": ctrl_id,
        "obligation_text": obl_text,
        "obligation_type": obl_type if passed else "rejected",
        "action": action,
        "object": obj,
        "condition": condition,
        "confidence": round(conf, 2),
        "is_test": is_test,
        "is_reporting": is_report,
        "passed": passed,
        "flags": dict(flags),
    }


def _print_summary(n_controls, total_obls, type_counts, total_rejected,
                   total_in, total_out, elapsed_total, extrapolate):
    """Print totals, cost estimate and (optionally) a 6,000-control projection."""
    # Hard-coded rates: 3 per million input tokens, 15 per million output
    # tokens. NOTE(review): cached-token pricing is not modelled — confirm.
    cost = (total_in * 3 + total_out * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print("ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f" Controls: {n_controls}")
    print(f" Obligations: {total_obls} ({total_obls/max(n_controls,1):.1f} pro Control)")
    print(" ── Klassifizierung ──")
    print(f" Pflicht: {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
    print(f" Empfehlung: {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
    print(f" Kann: {type_counts['kann']}"
          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
    print(f" Rejected: {total_rejected}"
          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
          f" (nur evidence-only/zu kurz/kein parent)")
    print(" ── Kosten ──")
    print(f" Laufzeit: {elapsed_total:.1f}s")
    print(f" Tokens: {total_in:,} in / {total_out:,} out")
    print(f" Kosten: ${cost:.4f}")

    if not extrapolate:
        return
    n = 6000
    factor = n / n_controls
    print(f"\n --- Hochrechnung auf {n:,} Controls ---")
    print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
    print(f" Kosten: ${cost * factor:.2f}")
    print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
    print(f" Obligations: ~{int(total_obls / n_controls * n):,}")
    print(f" Pflicht: ~{int(type_counts['pflicht'] * factor):,}")
    print(f" Empfehlung: ~{int(type_counts['empfehlung'] * factor):,}")
    print(f" Kann: ~{int(type_counts['kann'] * factor):,}")


def main():
    """Run Pass 0a on a small sample of controls and print a report.

    Reads ``--limit``, ``--source`` and ``--dry-run`` from the command
    line, loads the sample from the database named by ``DATABASE_URL``,
    sends each control to the LLM (unless ``--dry-run``), runs the quality
    gate on every returned obligation, prints per-obligation details and a
    summary, and writes all obligation records to a JSON file under /tmp.

    Exits with status 1 when ``ANTHROPIC_API_KEY`` (non-dry-run only) or
    ``DATABASE_URL`` is not set.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    db_url = os.environ.get("DATABASE_URL")
    if not db_url:
        print("ERROR: Set DATABASE_URL")
        sys.exit(1)

    # Select diverse sample
    controls = _load_controls(db_url, args.source, args.limit)
    if not controls:
        print("No controls found.")
        return

    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")

    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    all_results = []
    t_start = time.time()

    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row
        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = _describe_source(src_cit)

        print(f"\n{'─'*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f" Source: {source_str} | Category: {category or 'N/A'}")
        print(f" Objective: {(objective or '')[:200]}")
        if args.dry_run:
            print(" [DRY RUN]")
            continue

        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)
        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue

        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok

        # Ignore array items that are not JSON objects; they carry no fields.
        obligations = [
            item for item in parse_json_array(response_text)
            if isinstance(item, dict)
        ]
        total_obls += len(obligations)
        print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{f' ({cached} cached)' if cached else ''}"
              f" | {len(obligations)} obligation(s)")

        for j, obl in enumerate(obligations, 1):
            record = _report_obligation(j, obl, ctrl_id, ctrl_uuid)
            if record["passed"]:
                obl_type = record["obligation_type"]
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1
            all_results.append(record)

        # Short pause between API calls.
        time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    _print_summary(
        len(controls), total_obls, type_counts, total_rejected,
        total_in, total_out, elapsed_total,
        extrapolate=not args.dry_run and total_obls > 0,
    )

    # Save results JSON for later analysis
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        # ensure_ascii=False writes umlauts verbatim, so fix the encoding
        # instead of relying on the locale default.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n Ergebnisse gespeichert: {out_path}")
# Run the sample test only when the file is executed as a script.
if __name__ == "__main__":
    main()