Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
471 lines
17 KiB
Python
471 lines
17 KiB
Python
#!/usr/bin/env python3
|
|
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.
|
|
|
|
Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
|
|
Copies prompts and quality gate from decomposition_pass.py.
|
|
|
|
Usage:
|
|
python3 test_pass0a.py # 10 controls, Anthropic
|
|
python3 test_pass0a.py --limit 5 # 5 controls
|
|
python3 test_pass0a.py --source "DSGVO" # filter by source
|
|
python3 test_pass0a.py --dry-run # show controls, no LLM call
|
|
"""
|
|
import argparse
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
import urllib.parse
|
|
|
|
import psycopg2
|
|
import requests
|
|
|
|
# ── Config ────────────────────────────────────────────────────────────
# Endpoint is fixed; key and model are read from the environment once at
# import time. A missing key yields "" (checked in main), a missing model
# override falls back to the default model name.
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
|
|
|
|
# ── Prompts (from decomposition_pass.py) ──────────────────────────────

# German system prompt sent with every request. It is assembled from adjacent
# string literals: a literal ending in "\n" closes a prompt line, a literal
# ending in a space continues the same prompt line.
SYSTEM_PROMPT = (
    "Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls "
    "in einzelne atomare Pflichten.\n"
    "\n"
    "REGELN (STRIKT EINHALTEN):\n"
    "1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben "
    "sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, "
    "ist zu testen, shall, must, required.\n"
    "2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.\n"
    "3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).\n"
    "4. Meldepflichten SEPARAT (is_reporting_obligation=true).\n"
    '5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN '
    "eigenes Control, sondern Evidence).\n"
    "6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten "
    "— NICHT extrahieren.\n"
    "\n"
    "Antworte NUR mit einem JSON-Array. Keine Erklärungen."
)
|
|
|
|
|
|
def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Render the German user prompt for one control.

    The five arguments are interpolated verbatim into the "CONTROL:" section;
    the prompt ends with a one-element JSON array that shows the model the
    expected output fields.
    """
    prompt_lines = [
        "Analysiere das folgende Control und extrahiere alle einzelnen normativen "
        "Pflichten als JSON-Array.",
        "",
        "CONTROL:",
        f"Titel: {title}",
        f"Ziel: {objective}",
        f"Anforderungen: {requirements}",
        f"Prüfverfahren: {test_procedure}",
        f"Quellreferenz: {source_ref}",
        "",
        "Antworte als JSON-Array:",
        "[",
        "{",
        '"obligation_text": "Kurze, präzise Formulierung der Pflicht",',
        '"action": "Hauptverb/Handlung",',
        '"object": "Gegenstand der Pflicht",',
        '"condition": "Auslöser/Bedingung oder null",',
        '"normative_strength": "must",',
        '"is_test_obligation": false,',
        '"is_reporting_obligation": false',
        "}",
        "]",
    ]
    return "\n".join(prompt_lines)
|
|
|
|
|
|
# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
# Every pattern in this section is compiled with re.IGNORECASE.

# Tier 1: Pflicht (mandatory)
# Alternatives, in order: "müssen/muss", "hat/haben sicherzustellen",
# "ist/sind verpflichtet", "ist/sind/hat/haben zu <word>en" (optionally with
# one word in between or with "zu" embedded in the verb), English
# shall/must/required, a fixed list of verbs containing "zu" before the stem
# (e.g. "...zuteilen", "...zuweisen"), and finally "ist"/"sind" followed
# within 1-80 characters by "zu <word>en".
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)
# Tier 2: Empfehlung (recommendation)
# "soll/sollen/sollte/sollten", should/ensure/recommend*, plus a list of bare
# infinitives (e.g. "dokumentieren", "überwachen") and "prüfen, ob" /
# "kontrollieren, ob".
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)
# Tier 3: Kann (optional/permissive)
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)
# Union (backward compat)
# Built by joining the three tier patterns with "|". None of them contains a
# capturing group, so findall() on this pattern yields whole-match strings.
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)
# Rationale markers (German/English). "\bda\s+" matches the word "da"
# followed by whitespace; "\berwägung" has no closing \b and therefore also
# matches longer words that start with it.
_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)
# Test/audit vocabulary. NOTE: this pattern contains a capturing group, so
# findall() would return group contents rather than whole matches; the rest
# of this file only calls search() on it.
_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)
# Reporting/notification vocabulary; several alternatives are open-ended
# prefixes (no closing \b), e.g. "unterricht", "benachricht", "notif".
_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)
|
|
|
|
|
|
def classify_obligation_type(txt):
    """Return the obligation tier of *txt*: "pflicht", "empfehlung" or "kann".

    The tier patterns are tried in priority order (mandatory, recommendation,
    permissive) and the first one that matches decides. Text that matches
    none of them is treated as a recommendation.
    """
    tiers_by_priority = (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    )
    for tier_pattern, tier_name in tiers_by_priority:
        if tier_pattern.search(txt):
            return tier_name
    # No signal at all: default tier.
    return "empfehlung"
|
|
|
|
|
|
def quality_gate(obl_text, parent_uuid):
    """Check one extracted obligation and score it.

    Returns ``(flags, passed, confidence, obligation_type)``:

    * ``flags`` – dict of the individual check results (booleans) plus the
      string entry ``"obligation_type"``.
    * ``passed`` – True when the three critical checks hold: the text does
      not start with an evidence noun, is at least 20 characters long after
      stripping, and ``parent_uuid`` is truthy.
    * ``confidence`` – sum of the weights of all satisfied checks, plus a
      0.05 bonus (capped at 1.0) when the text is classified as "pflicht".
    * ``obligation_type`` – result of ``classify_obligation_type``.
    """
    # Normative signal is informational only; it feeds the confidence score
    # but is not part of the pass/fail decision.
    has_signal = _NORMATIVE_RE.search(obl_text) is not None
    tier = classify_obligation_type(obl_text)

    # A conjunction followed later by one of the listed verbs hints at more
    # than one action in the same sentence.
    conjunction_then_verb = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    one_action = conjunction_then_verb.search(obl_text) is None

    # Rationale check: normative hits must not be outnumbered by rationale hits.
    normative_hits = _NORMATIVE_RE.findall(obl_text)
    rationale_hits = _RATIONALE_RE.findall(obl_text)
    is_not_rationale = len(normative_hits) >= len(rationale_hits)

    trimmed = obl_text.strip()
    evidence_noun_prefix = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )

    # Key order matters to callers that iterate over the flags.
    flags = {
        "has_normative_signal": has_signal,
        "obligation_type": tier,
        "single_action": one_action,
        "not_rationale": is_not_rationale,
        "not_evidence_only": evidence_noun_prefix.match(trimmed) is None,
        "min_length": len(trimmed) >= 20,
        "has_parent_link": bool(parent_uuid),
    }

    weights = {
        "has_normative_signal": 0.25,
        "single_action": 0.20,
        "not_rationale": 0.20,
        "not_evidence_only": 0.15,
        "min_length": 0.10,
        "has_parent_link": 0.05,
    }
    confidence = sum(weight for check, weight in weights.items() if flags[check])
    if tier == "pflicht":
        # Bonus for pflicht classification
        confidence = min(confidence + 0.05, 1.0)

    # has_normative_signal is deliberately not among the critical checks.
    passed = (
        flags["not_evidence_only"]
        and flags["min_length"]
        and flags["has_parent_link"]
    )

    return flags, passed, confidence, tier
|
|
|
|
|
|
# ── JSON parsing ──────────────────────────────────────────────────────
|
|
|
|
def parse_json_array(text):
    """Extract a list of JSON values from an LLM reply.

    The reply is first parsed as-is. A top-level list is returned unchanged
    and a top-level object is wrapped in a one-element list. If that fails
    (for example because the JSON is surrounded by prose or a Markdown code
    fence), the outermost ``[...]`` and ``{...}`` spans are tried, whichever
    starts first in the text being tried first.

    Args:
        text: Reply text. ``bytes`` are decoded as UTF-8; ``None``, other
            non-string values and blank strings yield an empty list.

    Returns:
        A list (possibly empty). Never raises for malformed input.
    """
    if isinstance(text, (bytes, bytearray)):
        text = bytes(text).decode("utf-8", errors="replace")
    if not isinstance(text, str) or not text.strip():
        return []

    def _as_list(candidate):
        """Parse *candidate*; return a list, or None if it is not list/object JSON."""
        try:
            value = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        if isinstance(value, list):
            return value
        if isinstance(value, dict):
            return [value]
        return None

    direct = _as_list(text)
    if direct is not None:
        return direct

    # Fallback: greedy outermost bracket/brace spans. Trying the span that
    # starts first keeps a fenced single object from being reduced to an
    # array nested inside it.
    spans = [
        found
        for found in (
            re.search(r"\[[\s\S]*\]", text),
            re.search(r"\{[\s\S]*\}", text),
        )
        if found
    ]
    for found in sorted(spans, key=lambda m: m.start()):
        extracted = _as_list(found.group())
        if extracted is not None:
            return extracted
    return []
|
|
|
|
|
|
# ── API call ──────────────────────────────────────────────────────────
|
|
|
|
def call_anthropic(prompt):
    """Send *prompt* to the Anthropic Messages API and return the reply.

    Returns a 3-tuple ``(text, usage, error)``:

    * on success ``(text, usage_dict, None)`` where ``text`` is the ``text``
      field of the first content block ("" if the reply has no content);
    * on any failure ``(None, {}, message)``. This covers non-200 status
      codes as well as transport errors (timeout, connection failure) and
      response bodies that are not valid JSON, so one failed call does not
      abort the caller's loop.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        # The system prompt is marked cacheable so repeated calls can reuse it.
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        resp = requests.post(
            f"{ANTHROPIC_API_URL}/messages", headers=headers, json=payload, timeout=120
        )
    except requests.RequestException as exc:
        return None, {}, f"Request failed: {exc}"

    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"

    try:
        data = resp.json()
    except ValueError as exc:
        # requests raises a ValueError subclass when the body is not JSON.
        return None, {}, f"Invalid JSON in response: {exc}"
    if not isinstance(data, dict):
        return None, {}, f"Unexpected response shape: {type(data).__name__}"

    # "or" also covers fields that are present but null.
    usage = data.get("usage") or {}
    content = data.get("content") or []
    first_block = content[0] if content else {}
    text = first_block.get("text", "") if isinstance(first_block, dict) else ""
    return text, usage, None
|
|
|
|
|
|
# ── Format helpers ────────────────────────────────────────────────────
|
|
|
|
def fmt_json(val):
    """Turn a JSON-ish column value into prompt text.

    ``None`` becomes "". A string is parsed as JSON when possible; a string
    that is not valid JSON is returned unchanged. A list (given directly or
    obtained by parsing) is rendered as one " - item" line per element.
    Anything else is converted with ``str()``.
    """
    if val is None:
        return ""

    decoded = val
    if isinstance(val, str):
        try:
            decoded = json.loads(val)
        except (json.JSONDecodeError, TypeError):
            # Plain text, not JSON: pass it through untouched.
            return val

    if not isinstance(decoded, list):
        return str(decoded)

    bullet_lines = [f" - {entry}" for entry in decoded]
    return "\n".join(bullet_lines)
|
|
|
|
|
|
# ── Main ──────────────────────────────────────────────────────────────
|
|
|
|
def _connect_from_url(db_url):
    """Open a psycopg2 connection from a ``scheme://user:pass@host:port/db`` URL."""
    parts = urllib.parse.urlparse(db_url)
    # urlparse leaves percent-escapes in place, so credentials that contain
    # reserved characters (written e.g. as "%40" for "@") are decoded here.
    user = urllib.parse.unquote(parts.username) if parts.username else parts.username
    password = urllib.parse.unquote(parts.password) if parts.password else parts.password
    return psycopg2.connect(
        host=parts.hostname,
        port=parts.port or 5432,
        user=user,
        password=password,
        dbname=urllib.parse.unquote(parts.path.lstrip("/")),
        options="-c search_path=compliance,public",
    )


def _fetch_sample_controls(cur, source, limit):
    """Return up to *limit* random top-level controls, spread across sources."""
    params = []
    source_filter = ""
    if source:
        source_filter = " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{source}%")

    # Rows are ranked randomly *within* each source and then picked rank by
    # rank, so a small LIMIT draws from many sources. Ordering by source
    # first would fill the whole sample from the alphabetically first source.
    query = (
        """
        SELECT id, control_id, title, objective, requirements,
               test_procedure, source_citation, category
        FROM (
            SELECT id, control_id, title, objective, requirements,
                   test_procedure, source_citation, category,
                   row_number() OVER (
                       PARTITION BY source_citation->>'source'
                       ORDER BY random()
                   ) AS rn
            FROM compliance.canonical_controls
            WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
              AND parent_control_uuid IS NULL
              AND title IS NOT NULL AND objective IS NOT NULL
              AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
        """
        + source_filter
        + """
        ) ranked
        ORDER BY rn, random()
        LIMIT %s
        """
    )
    params.append(limit)
    cur.execute(query, params)
    return cur.fetchall()


def _print_summary(n_controls, total_obls, type_counts, total_rejected,
                   total_in, total_out, elapsed_total, dry_run):
    """Print the classification/cost summary and, for real runs, a projection."""
    # Hard-coded price assumption: 3 USD per million input tokens and
    # 15 USD per million output tokens.
    cost = (total_in * 3 + total_out * 15) / 1_000_000
    obl_denominator = max(total_obls, 1)

    print(f"\n\n{'='*70}")
    print("ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f" Controls: {n_controls}")
    print(f" Obligations: {total_obls} ({total_obls/max(n_controls,1):.1f} pro Control)")
    print(" ── Klassifizierung ──")
    print(f" Pflicht: {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/obl_denominator:.0f}%)")
    print(f" Empfehlung: {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/obl_denominator:.0f}%)")
    print(f" Kann: {type_counts['kann']}"
          f" ({type_counts['kann']*100/obl_denominator:.0f}%)")
    print(f" Rejected: {total_rejected}"
          f" ({total_rejected*100/obl_denominator:.0f}%)"
          f" (nur evidence-only/zu kurz/kein parent)")
    print(" ── Kosten ──")
    print(f" Laufzeit: {elapsed_total:.1f}s")
    print(f" Tokens: {total_in:,} in / {total_out:,} out")
    print(f" Kosten: ${cost:.4f}")

    if n_controls > 0 and not dry_run and total_obls > 0:
        # Linear extrapolation of this sample to the full control set.
        n = 6000
        factor = n / n_controls
        print(f"\n --- Hochrechnung auf {n:,} Controls ---")
        print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
        print(f" Kosten: ${cost * factor:.2f}")
        print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
        print(f" Obligations: ~{int(total_obls / n_controls * n):,}")
        print(f" Pflicht: ~{int(type_counts['pflicht'] * factor):,}")
        print(f" Empfehlung: ~{int(type_counts['empfehlung'] * factor):,}")
        print(f" Kann: ~{int(type_counts['kann'] * factor):,}")


def main():
    """Run Pass 0a (obligation extraction) on a small sample of controls.

    Reads ``--limit``, ``--source`` and ``--dry-run`` from the command line,
    selects a random sample of controls from the database named by the
    ``DATABASE_URL`` environment variable, sends each control to the LLM
    (unless ``--dry-run``), runs every returned obligation through
    ``quality_gate``, prints per-obligation results and a summary, and
    writes all results to ``/tmp/pass0a_results_<n>controls.json``.

    Exits with status 1 when a required environment variable is missing.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    if args.limit < 1:
        parser.error("--limit must be a positive integer")

    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    db_url = os.environ.get("DATABASE_URL")
    if not db_url:
        print("ERROR: Set DATABASE_URL")
        sys.exit(1)

    # The database is only needed for the initial SELECT, so the connection
    # is released before the (slow) LLM calls and on every exit path.
    conn = _connect_from_url(db_url)
    try:
        with conn.cursor() as cur:
            controls = _fetch_sample_controls(cur, args.source, args.limit)
    finally:
        conn.close()

    if not controls:
        print("No controls found.")
        return

    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")

    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
    all_results = []
    t_start = time.time()

    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row

        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            # The driver may hand back the citation as a dict or as JSON text.
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"

        print(f"\n{'─'*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f" Source: {source_str} | Category: {category or 'N/A'}")
        print(f" Objective: {(objective or '')[:200]}")

        if args.dry_run:
            print(" [DRY RUN]")
            continue

        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)

        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0

        if error:
            print(f" ERROR: {error}")
            continue

        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok

        # The model may return array elements that are not objects; those
        # cannot be inspected with .get() and are dropped.
        obligations = [
            item for item in parse_json_array(response_text) if isinstance(item, dict)
        ]
        total_obls += len(obligations)

        cache_note = f" ({cached} cached)" if cached else ""
        print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{cache_note}"
              f" | {len(obligations)} obligation(s)")

        for j, obl in enumerate(obligations, 1):
            # A null or non-string value must not reach the regex checks.
            obl_text = str(obl.get("obligation_text") or "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))

            # Auto-detect test/reporting obligations the model did not flag.
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True

            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1

            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"

            # Show the tier instead of a plain PASS; rejected items say REJECT.
            status = type_label.get(obl_type, "EMPFEHLUNG") if passed else "REJECT"

            failed = [k for k, v in flags.items() if isinstance(v, bool) and not v]

            print(f"\n {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f" {obl_text}")
            print(f" Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f" Bedingung: {condition}")
            if not passed:
                print(f" Abgelehnt: {', '.join(failed)}")

            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": dict(flags),
            })

        # Small pause between API calls.
        time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    _print_summary(
        len(controls), total_obls, type_counts, total_rejected,
        total_in, total_out, elapsed_total, args.dry_run,
    )

    # Save results JSON for later analysis
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        # ensure_ascii=False writes umlauts and dashes verbatim, so the file
        # encoding must be fixed rather than taken from the locale.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n Ergebnisse gespeichert: {out_path}")
|
|
|
|
|
|
# Script entry point: run the sample test only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|