# breakpilot-compliance/scripts/qa/test_pass0a.py

#!/usr/bin/env python3
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.

Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
Copies prompts and quality gate from decomposition_pass.py.

Usage:
    python3 test_pass0a.py                          # 10 controls, Anthropic
    python3 test_pass0a.py --limit 5                # 5 controls
    python3 test_pass0a.py --source "DSGVO"         # filter by source
    python3 test_pass0a.py --dry-run                # show controls, no LLM call
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.parse

import psycopg2
import requests

# ── Config ────────────────────────────────────────────────────────────
# API key sent in the "x-api-key" header; empty string when the variable is unset
# (main() refuses to run without it unless --dry-run is given).
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Model name placed in the request payload; overridable via DECOMPOSITION_LLM_MODEL.
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
# Base URL of the API; call_anthropic() appends "/messages" to it.
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"

# ── Prompts (from decomposition_pass.py) ──────────────────────────────

# System prompt (German) sent with every request by call_anthropic(). It tells the
# model to split a compliance control into atomic obligations, to keep test and
# reporting obligations separate, to skip evidence items and rationale text, and
# to answer with a JSON array only.
SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.

REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.

Antworte NUR mit einem JSON-Array. Keine Erklärungen."""


def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Assemble the German user prompt for one control.

    The prompt has three paragraphs separated by a blank line: the task
    instruction, the control's fields, and the JSON array schema the model
    is asked to answer with. Argument values are interpolated as-is.
    """
    instruction = (
        "Analysiere das folgende Control und extrahiere alle einzelnen normativen "
        "Pflichten als JSON-Array."
    )
    control_section = [
        "CONTROL:",
        f"Titel: {title}",
        f"Ziel: {objective}",
        f"Anforderungen: {requirements}",
        f"Prüfverfahren: {test_procedure}",
        f"Quellreferenz: {source_ref}",
    ]
    # Static answer template — plain strings, so braces need no escaping here.
    schema_section = [
        "Antworte als JSON-Array:",
        "[",
        "  {",
        '    "obligation_text": "Kurze, präzise Formulierung der Pflicht",',
        '    "action": "Hauptverb/Handlung",',
        '    "object": "Gegenstand der Pflicht",',
        '    "condition": "Auslöser/Bedingung oder null",',
        '    "normative_strength": "must",',
        '    "is_test_obligation": false,',
        '    "is_reporting_obligation": false',
        "  }",
        "]",
    ]
    paragraphs = [
        instruction,
        "\n".join(control_section),
        "\n".join(schema_section),
    ]
    return "\n\n".join(paragraphs)


# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──

# Tier 1: Pflicht (mandatory). Matches binding wording: German "müssen"/"muss",
# "hat/haben sicherzustellen", "ist/sind verpflichtet", "ist/sind/hat/haben
# (… ) zu <verb>en" constructions, separable verbs with an infixed "zu"
# (e.g. "…zuweisen"), and English shall/must/required. The final alternative
# accepts "ist"/"sind" followed within 1-80 characters by "zu <verb>en".
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)
# Tier 2: Empfehlung (recommendation). Matches "soll(en)/sollte(n)", English
# should/ensure/recommend*, and a list of bare action verbs such as
# "dokumentieren" or "überwachen".
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)
# Tier 3: Kann (optional/permissive). Matches kann/können/darf/dürfen/may/optional.
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)
# Union of the three tier patterns (backward compat): any normative signal at all.
# Built by joining the pattern sources with "|", so it contains no capture groups
# and findall() returns one entry per match.
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)
# Markers of explanatory/rationale wording ("da ", "weil", "grund", "erwägung…",
# because/reason/rationale); quality_gate() counts these against normative hits.
_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)
# Wording that indicates a test/audit obligation; main() uses it to set the
# test flag when the model did not.
_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)
# Wording that indicates a reporting/notification obligation; main() uses it to
# set the reporting flag when the model did not.
_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)


def classify_obligation_type(txt):
    """Return the obligation tier for *txt*.

    The tiers are tried in priority order — "pflicht", then "empfehlung",
    then "kann" — and the first pattern that matches wins. Text without any
    match falls back to "empfehlung".
    """
    tiers_by_priority = (
        ("pflicht", _PFLICHT_RE),
        ("empfehlung", _EMPFEHLUNG_RE),
        ("kann", _KANN_RE),
    )
    matching_labels = (
        label for label, pattern in tiers_by_priority if pattern.search(txt)
    )
    return next(matching_labels, "empfehlung")


# A conjunction followed later in the text by another obligation verb — a hint
# that the text bundles more than one action.
_MULTI_VERB_RE = re.compile(
    r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
    r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
    re.IGNORECASE,
)
# Text that starts with an evidence artefact noun instead of an action.
_EVIDENCE_ONLY_RE = re.compile(
    r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
    re.IGNORECASE,
)
# Contribution of each boolean flag to the confidence score (sums to 0.95;
# a "pflicht" classification adds a further 0.05).
_QUALITY_WEIGHTS = {
    "has_normative_signal": 0.25, "single_action": 0.20,
    "not_rationale": 0.20, "not_evidence_only": 0.15,
    "min_length": 0.10, "has_parent_link": 0.05,
}
# Flags that must all be true for an obligation to pass the gate.
_CRITICAL_FLAGS = ("not_evidence_only", "min_length", "has_parent_link")


def quality_gate(obl_text, parent_uuid):
    """Validate and classify one extracted obligation.

    Args:
        obl_text: Obligation text as returned by the model. ``None`` (a JSON
            ``null``) is treated as an empty string, so it is rejected by the
            length check instead of raising ``TypeError``.
        parent_uuid: Identifier of the control the obligation came from; any
            falsy value fails the parent-link check.

    Returns:
        Tuple ``(flags, passed, confidence, obligation_type)``:
        ``flags`` maps check names to booleans plus ``"obligation_type"`` to
        the tier string; ``passed`` is true when all critical checks hold;
        ``confidence`` is the weighted sum of the true flags (capped at 1.0);
        ``obligation_type`` is the result of ``classify_obligation_type``.
    """
    obl_text = obl_text or ""
    stripped = obl_text.strip()
    flags = {}

    # 1. Normative signal (informational only — not part of the pass check)
    flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(obl_text))

    # 1b. Obligation type classification
    obl_type = classify_obligation_type(obl_text)
    flags["obligation_type"] = obl_type

    # 2. Single action
    flags["single_action"] = not _MULTI_VERB_RE.search(obl_text)

    # 3. Not rationale: at least as many normative hits as rationale markers
    normative_count = len(_NORMATIVE_RE.findall(obl_text))
    rationale_count = len(_RATIONALE_RE.findall(obl_text))
    flags["not_rationale"] = normative_count >= rationale_count

    # 4. Not evidence-only
    flags["not_evidence_only"] = not _EVIDENCE_ONLY_RE.match(stripped)

    # 5. Min length
    flags["min_length"] = len(stripped) >= 20

    # 6. Parent link
    flags["has_parent_link"] = bool(parent_uuid)

    # Confidence: weighted sum over the true flags. "obligation_type" is a
    # (truthy) string but has no weight, hence the membership test.
    confidence = sum(
        _QUALITY_WEIGHTS[k] for k, v in flags.items() if v and k in _QUALITY_WEIGHTS
    )
    # Bonus for pflicht classification
    if obl_type == "pflicht":
        confidence = min(confidence + 0.05, 1.0)

    # Pass check — has_normative_signal is NO LONGER critical
    passed = all(flags.get(k, False) for k in _CRITICAL_FLAGS)

    return flags, passed, confidence, obl_type


# ── JSON parsing ──────────────────────────────────────────────────────

def parse_json_array(text):
    """Extract a JSON array from a model response.

    The whole text is decoded first: a list is returned unchanged and a
    single object is wrapped in a one-element list. Otherwise the span from
    the first "[" to the last "]" is decoded and returned if it is a list.
    Anything else yields an empty list.
    """
    undecodable = object()

    def _decode(candidate):
        # Only malformed JSON is tolerated; other errors propagate.
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return undecodable

    whole = _decode(text)
    if isinstance(whole, list):
        return whole
    if isinstance(whole, dict):
        return [whole]

    bracketed = re.search(r"\[[\s\S]*\]", text)
    if bracketed is None:
        return []

    embedded = _decode(bracketed.group())
    return embedded if isinstance(embedded, list) else []


# ── API call ──────────────────────────────────────────────────────────

def call_anthropic(prompt):
    """Send *prompt* to the Anthropic Messages API and return the reply text.

    The system prompt is sent as a cacheable block ("cache_control": ephemeral).

    Args:
        prompt: User message content.

    Returns:
        Tuple ``(text, usage, error)``. On success ``text`` is the "text" of
        the first content block (empty string if there is none), ``usage`` is
        the response's usage dict and ``error`` is ``None``. On any failure —
        transport error, timeout, non-200 status or a body that is not JSON —
        the tuple is ``(None, {}, message)`` so the caller can skip this
        control instead of aborting the whole run.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        resp = requests.post(
            f"{ANTHROPIC_API_URL}/messages", headers=headers, json=payload, timeout=120
        )
    except requests.RequestException as exc:
        # Connection errors and timeouts are reported like HTTP errors.
        return None, {}, f"Request failed: {exc}"

    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"

    try:
        data = resp.json()
    except ValueError as exc:
        # Response.json() raises a ValueError subclass on a non-JSON body.
        return None, {}, f"Invalid JSON response: {exc}"

    usage = data.get("usage") or {}
    content = data.get("content") or []
    text = content[0].get("text", "") if content else ""
    return text, usage, None


# ── Format helpers ────────────────────────────────────────────────────

def fmt_json(val):
    """Render a database value as prompt text.

    ``None`` becomes an empty string. A string is decoded as JSON when
    possible; if it is not valid JSON it is returned unchanged. A list
    (given directly or decoded) is rendered as one "  - item" line per
    element; every other value is passed through ``str()``.
    """
    if val is None:
        return ""

    decoded = val
    if isinstance(val, str):
        try:
            decoded = json.loads(val)
        except (json.JSONDecodeError, TypeError):
            return val

    if not isinstance(decoded, list):
        return str(decoded)

    bullet_lines = [f"  - {entry}" for entry in decoded]
    return "\n".join(bullet_lines)


# ── Main ──────────────────────────────────────────────────────────────

def _fetch_controls(conn, source, limit):
    """Fetch the sample of top-level controls to run through Pass 0a.

    Args:
        conn: Open psycopg2 connection.
        source: Optional substring matched case-insensitively against
            ``source_citation->>'source'``; a falsy value disables the filter.
        limit: Maximum number of rows to return.

    Returns:
        List of row tuples ``(id, control_id, title, objective, requirements,
        test_procedure, source_citation, category)``.
    """
    query = """
        SELECT id, control_id, title, objective, requirements,
               test_procedure, source_citation, category
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
          AND parent_control_uuid IS NULL
          AND title IS NOT NULL AND objective IS NOT NULL
          AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
    """
    params = []
    if source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{source}%")

    # NOTE(review): ordering by source first means LIMIT takes rows from the
    # alphabetically first source(s); random() only shuffles within a source.
    # Confirm this is the intended "diverse sample".
    query += " ORDER BY source_citation->>'source', random()"
    # Bound as a parameter rather than formatted into the SQL text.
    query += " LIMIT %s"
    params.append(limit)

    with conn.cursor() as cur:
        cur.execute(query, params)
        return cur.fetchall()


def main():
    """Run Pass 0a on a small sample of controls and print a report.

    Reads the CLI options (--limit, --source, --dry-run), loads controls from
    the database named by DATABASE_URL, sends each one to the LLM, runs every
    returned obligation through the quality gate, prints per-obligation
    results and a summary with a cost extrapolation, and writes all results
    to a JSON file under /tmp.

    Exits with status 1 when ANTHROPIC_API_KEY (unless --dry-run) or
    DATABASE_URL is not set.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    db_url = os.environ.get("DATABASE_URL")
    if not db_url:
        print("ERROR: Set DATABASE_URL")
        sys.exit(1)

    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    # The database is only needed for the initial SELECT. Closing right away
    # releases the connection on every path and avoids holding an idle
    # transaction open while the LLM calls run.
    try:
        controls = _fetch_controls(conn, args.source, args.limit)
    finally:
        conn.close()

    if not controls:
        print("No controls found.")
        return

    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")

    total_in = total_out = total_obls = 0
    processed = 0  # controls whose API call succeeded; basis for averages
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
    all_results = []
    t_start = time.time()

    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row

        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"

        print(f"\n{'─'*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f"  Source: {source_str} | Category: {category or 'N/A'}")
        print(f"  Objective: {(objective or '')[:200]}")

        if args.dry_run:
            print("  [DRY RUN]")
            continue

        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)

        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0

        if error:
            print(f"  ERROR: {error}")
            continue

        processed += 1
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok

        # The model may return array items that are not objects; drop them so
        # the field lookups below cannot fail and the totals stay consistent.
        obligations = [
            o for o in parse_json_array(response_text) if isinstance(o, dict)
        ]
        total_obls += len(obligations)

        cache_note = f" ({cached} cached)" if cached else ""
        print(f"  API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{cache_note}"
              f" | {len(obligations)} obligation(s)")

        for j, obl in enumerate(obligations, 1):
            # A JSON null for the text would otherwise reach the regexes as None.
            obl_text = str(obl.get("obligation_text") or "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))

            # Auto-detect test/reporting obligations the model did not flag
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True

            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1

            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"

            # Show type instead of PASS/REJECT
            if not passed:
                status = "REJECT"
            else:
                status = type_label.get(obl_type, "EMPFEHLUNG")

            failed = [k for k, v in flags.items()
                      if isinstance(v, bool) and not v]

            print(f"\n    {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f"       {obl_text}")
            print(f"       Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f"       Bedingung: {condition}")
            if not passed:
                print(f"       Abgelehnt: {', '.join(failed)}")

            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": dict(flags),
            })

        # Brief pause between API calls
        time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    # Hard-coded rates of 3 (input) and 15 (output) per million tokens;
    # presumably USD list prices for the default model — TODO confirm.
    cost = (total_in * 3 + total_out * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print("ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f"  Controls:       {len(controls)}")
    # Averages are per successfully processed control, so failed API calls
    # do not dilute them.
    print(f"  Obligations:    {total_obls} ({total_obls/max(processed,1):.1f} pro Control)")
    print("  ── Klassifizierung ──")
    print(f"  Pflicht:        {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
    print(f"  Empfehlung:     {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
    print(f"  Kann:           {type_counts['kann']}"
          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
    print(f"  Rejected:       {total_rejected}"
          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
          f"  (nur evidence-only/zu kurz/kein parent)")
    print("  ── Kosten ──")
    print(f"  Laufzeit:       {elapsed_total:.1f}s")
    print(f"  Tokens:         {total_in:,} in / {total_out:,} out")
    print(f"  Kosten:         ${cost:.4f}")

    # processed stays 0 in dry-run mode, so no separate dry-run check is needed.
    if processed > 0 and total_obls > 0:
        n = 6000
        factor = n / processed
        print(f"\n  --- Hochrechnung auf {n:,} Controls ---")
        print(f"  Tokens:         {int(total_in * factor):,} in / {int(total_out * factor):,} out")
        print(f"  Kosten:         ${cost * factor:.2f}")
        print(f"  Laufzeit:       {elapsed_total * factor / 3600:.1f}h")
        print(f"  Obligations:    ~{int(total_obls / processed * n):,}")
        pf = int(type_counts['pflicht'] * factor)
        ef = int(type_counts['empfehlung'] * factor)
        kf = int(type_counts['kann'] * factor)
        print(f"  Pflicht:        ~{pf:,}")
        print(f"  Empfehlung:     ~{ef:,}")
        print(f"  Kann:           ~{kf:,}")

    # Save results JSON for later analysis
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        # ensure_ascii=False writes umlauts verbatim, so the file encoding
        # must not depend on the locale.
        with open(out_path, "w", encoding="utf-8") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n  Ergebnisse gespeichert: {out_path}")


# Run the sample only when executed as a script, not when imported.
if __name__ == "__main__":
    main()