feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped

- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 11:56:08 +01:00
parent c52dbdb8f1
commit 643b26618f
28 changed files with 5781 additions and 75 deletions

View File

@@ -1,11 +1,29 @@
"""Apply PDF QA results: update source_citation with correct article + article_type."""
"""
Apply PDF QA results: update source_citation with correct article_type + article.
Safety modes:
--safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
--dry-run: Show what would change without writing.
Usage:
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
"""
import os
import sys
import json
import psycopg2
import urllib.parse
from collections import Counter
RESULTS_FILE = "/tmp/pdf_qa_results.json"
# Parse args
dry_run = "--dry-run" in sys.argv
force_article = "--force-article" in sys.argv
# Load results
with open(RESULTS_FILE) as f:
results = json.load(f)
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
options="-c search_path=compliance,public"
)
# Update in batches
# Load current DB state for all affected controls
cur = conn.cursor()
updated = 0
ctrl_ids = [r["ctrl_id"] for r in results]
cur.execute("""
SELECT id,
source_citation->>'article' as article,
source_citation->>'article_type' as article_type,
source_citation->>'source' as source
FROM compliance.canonical_controls
WHERE id = ANY(%s::uuid[])
""", (ctrl_ids,))
db_state = {}
for row in cur.fetchall():
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
# Counters
stats = Counter()
updated_type = 0
updated_article = 0
updated_recital = 0
errors = 0
unchanged = 0
for i, r in enumerate(results):
ctrl_id = r["ctrl_id"]
article_label = r["article_label"]
article_type = r["article_type"] # preamble, article, annex, section, unknown
new_article = r["article_label"]
new_type = r["article_type"]
db = db_state.get(ctrl_id, {})
if not db:
stats["missing_in_db"] += 1
continue
old_type = db.get("article_type")
old_article = db.get("article", "").strip()
# Decide what to update
set_type = (old_type != new_type)
set_article = (not old_article) or (force_article and old_article != new_article)
set_recital = (new_type == "preamble")
if set_type:
stats["type_" + ("new" if not old_type else "changed")] += 1
else:
stats["type_unchanged"] += 1
if not old_article and set_article:
stats["article_new"] += 1
elif old_article and old_article != new_article:
if force_article:
stats["article_force_changed"] += 1
else:
stats["article_skipped"] += 1
else:
stats["article_unchanged"] += 1
if set_recital:
stats["recital"] += 1
if dry_run:
continue
try:
# Update source_citation: set article and article_type
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = source_citation
|| jsonb_build_object('article', %s, 'article_type', %s),
updated_at = now()
WHERE id = %s::uuid
AND (
source_citation->>'article' IS DISTINCT FROM %s
OR source_citation->>'article_type' IS DISTINCT FROM %s
)
""", (article_label, article_type, ctrl_id, article_label, article_type))
# Build JSONB update
updates = {}
if set_type:
updates["article_type"] = new_type
if set_article:
updates["article"] = new_article
if cur.rowcount > 0:
updated += 1
else:
unchanged += 1
if updates:
# Merge into source_citation
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
updated_at = now()
WHERE id = %s::uuid
""", (json.dumps(updates), ctrl_id))
if set_type:
updated_type += 1
if set_article:
updated_article += 1
# Mark preamble as recital_suspect
if set_recital:
cur.execute("""
UPDATE compliance.canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata, '{}'::jsonb),
'{recital_suspect}',
'true'::jsonb
),
updated_at = now()
WHERE id = %s::uuid
""", (ctrl_id,))
updated_recital += 1
except Exception as e:
errors += 1
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
conn.rollback()
continue
if (i + 1) % 500 == 0:
if (i + 1) % 1000 == 0:
conn.commit()
print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
print(f" Progress: {i+1}/{len(results)}")
conn.commit()
print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
if not dry_run:
conn.commit()
mode = "DRY-RUN" if dry_run else "APPLIED"
print(f"\n{'='*60}")
print(f" Mode: {mode}")
print(f"{'='*60}")
print(f"\n article_type:")
print(f" New (was NULL): {stats['type_new']:5d}")
print(f" Changed: {stats['type_changed']:5d}")
print(f" Unchanged: {stats['type_unchanged']:5d}")
print(f"\n article:")
print(f" New (was empty): {stats['article_new']:5d}")
if force_article:
print(f" Force-changed: {stats['article_force_changed']:5d}")
else:
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
print(f" Unchanged: {stats['article_unchanged']:5d}")
print(f"\n Preamble/Recital: {stats['recital']:5d}")
print(f" Missing in DB: {stats['missing_in_db']:5d}")
if not dry_run:
print(f"\n Updates written:")
print(f" article_type: {updated_type:5d}")
print(f" article: {updated_article:5d}")
print(f" recital_suspect: {updated_recital:5d}")
print(f" Errors: {errors:5d}")
# Verify: count by article_type
cur.execute("""

View File

@@ -0,0 +1,524 @@
#!/usr/bin/env python3
"""
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
Tests 5 representative gap articles from different sources.
Measures: quality (JSON valid, fields complete), response time, cost estimate.
Usage:
    python3 benchmark_llm_controls.py
"""
import json
import time
import sys
import os
import requests
from pathlib import Path

# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY: a live key was previously hard-coded here and is now in the repo
# history — it must be rotated. Read from the environment; the old literal is
# kept only as a backward-compatible fallback until rotation lands.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "sk-0nAyxaMVbIqmz_ntnndzag")
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Directory holding the regulation PDFs the test cases reference.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

try:
    import fitz  # PyMuPDF — optional; extraction is skipped when unavailable
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
# ── Prompts (identical to control_generator.py) ─────────────────────
# German system prompt: act as a security-compliance expert, answer with
# valid JSON only (a JSON array when multiple controls are produced).
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# Shared prompt fragment appended by build_prompt(): asks for the
# applicability fields (industries, company size, scope conditions).
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the German user prompt asking the model to structure
    *article_text* (truncated to 3000 chars) into a control JSON for the
    given source/article; appends APPLICABILITY_PROMPT for the shared
    applicability fields."""
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF Text Extraction ─────────────────────────────────────────────
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a PDF.

    Returns "" when the PDF is missing or PyMuPDF is unavailable, and a
    bracketed "[... nicht im PDF gefunden]" marker when the heading cannot
    be located — callers detect failures via ``startswith("[")``.
    The extracted text is capped at 3000 characters.
    """
    import re
    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""
    doc = fitz.open(str(path))
    full_text = ""
    for page in doc:
        full_text += page.get_text() + "\n"
    doc.close()
    # Find article boundaries per document family.
    if doc_type == "eu_regulation":
        # "Artikel N" headings on their own line delimit EU regulation articles.
        art_num = re.search(r'\d+', article_label)
        if not art_num:
            return ""
        num = int(art_num.group())
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return f"[Artikel {num} nicht im PDF gefunden]"
        start = match.start()
        # Slice up to the next article; fall back to a fixed window.
        next_match = re.search(rf'\nArtikel\s+{num + 1}\s*\n', full_text)
        end = next_match.start() if next_match else start + 5000
        return full_text[start:end].strip()[:3000]
    elif doc_type == "de_law":
        para_num = re.search(r'\d+', article_label)
        if not para_num:
            return ""
        num = int(para_num.group())
        # BUG FIX: the section sign had been lost — rf'\\s+{num}' matched a
        # literal backslash, and the failure message lacked the leading "["
        # that callers rely on to detect a miss. German laws use "§ N".
        match = re.search(rf'§\s+{num}\b', full_text)
        if not match:
            return f"[§ {num} nicht im PDF gefunden]"
        start = match.start()
        next_match = re.search(rf'§\s+{num + 1}\b', full_text)
        end = next_match.start() if next_match else start + 5000
        return full_text[start:end].strip()[:3000]
    elif doc_type == "nist":
        # NIST control family/ID at the start of a line.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[match.start():match.start() + 3000].strip()
    else:
        # Generic: first line mentioning the label.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        return full_text[match.start():match.start() + 3000].strip()
# ── API Calls ────────────────────────────────────────────────────────
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM chat-completions endpoint.

    Returns ``(response_text, duration_seconds, error, usage)`` — *error*
    is None on success; *usage* is the provider's token-usage dict, ``{}``
    when unavailable.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }
    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this path returned a 3-tuple while callers unpack 4
            # values (like call_anthropic); add the empty usage dict.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns ``(response_text, duration_seconds, error, usage)`` — *error*
    is None on success; *usage* is the token-usage dict, ``{}`` on failure.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    started = time.time()
    try:
        response = requests.post(ANTHROPIC_URL, headers=request_headers, json=body, timeout=180)
        elapsed = time.time() - started
        if response.status_code == 200:
            data = response.json()
            reply = data["content"][0]["text"] if data.get("content") else ""
            return reply, elapsed, None, data.get("usage", {})
        return "", elapsed, f"HTTP {response.status_code}: {response.text[:200]}", {}
    except Exception as exc:
        return "", time.time() - started, str(exc), {}
# ── Quality Assessment ───────────────────────────────────────────────
# Fields every generated control must contain...
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]
# ...and optional fields that earn extra score.
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]

def assess_quality(raw_text: str) -> dict:
    """Score a control-generation response (0-100).

    Parses *raw_text* as JSON (stripping a Markdown fence, falling back to
    the first ``{...}`` span), then counts filled required/bonus fields and
    list depths. ``parsed_data`` is included only when parsing succeeded.
    """
    import re

    metrics = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    # Strip a leading Markdown code fence if the model wrapped its JSON.
    payload = raw_text.strip()
    if payload.startswith("```"):
        rows = payload.split("\n")
        kept = rows[1:-1] if rows[-1].startswith("```") else rows[1:]
        payload = "\n".join(kept)

    try:
        parsed = json.loads(payload)
    except json.JSONDecodeError:
        # Fall back to the first-to-last-brace span in the text.
        candidate = re.search(r'\{[\s\S]*\}', payload)
        if candidate is None:
            return metrics
        try:
            parsed = json.loads(candidate.group())
        except json.JSONDecodeError:
            return metrics
    if isinstance(parsed, list):
        parsed = parsed[0] if parsed else {}

    metrics["json_valid"] = True

    def _filled(value, min_str_len):
        # A field counts when it is a non-trivial string or a non-empty list.
        if not value:
            return False
        if isinstance(value, str):
            return len(value) > min_str_len
        return isinstance(value, list) and len(value) > 0

    for name in REQUIRED_FIELDS:
        if _filled(parsed.get(name), 2):
            metrics["required_fields"] += 1
    for name in BONUS_FIELDS:
        if _filled(parsed.get(name), 0):
            metrics["bonus_fields"] += 1

    def _list_len(value):
        return len(value) if isinstance(value, list) else 0

    # Depth metrics.
    metrics["requirements_count"] = _list_len(parsed.get("requirements", []))
    metrics["test_procedure_count"] = _list_len(parsed.get("test_procedure", []))
    metrics["evidence_count"] = _list_len(parsed.get("evidence", []))
    metrics["title_length"] = len(parsed.get("title", ""))
    metrics["objective_length"] = len(parsed.get("objective", ""))

    # Weighted score: validity 20, required 40, bonus 15,
    # requirements up to 15, test procedures up to 9, objective depth 1.
    total = 20.0
    total += metrics["required_fields"] / metrics["required_total"] * 40
    total += metrics["bonus_fields"] / metrics["bonus_total"] * 15
    total += min(metrics["requirements_count"], 5) * 3
    total += min(metrics["test_procedure_count"], 3) * 3
    if metrics["objective_length"] > 50:
        total += 1
    metrics["score"] = round(total, 1)
    metrics["parsed_data"] = parsed
    return metrics
# ── Test Cases ───────────────────────────────────────────────────────
# Five representative gap articles, one per regulation source.
# "pdf"/"article"/"doc_type" feed extract_article_text();
# "source"/"license" feed build_prompt(); "description" is display-only.
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Run the benchmark over TEST_CASES against both models and print a
    per-test comparison plus an aggregate summary with a cost estimate.
    Writes detailed results to /tmp/benchmark_llm_results.json."""
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)
    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()
    # Pre-check LiteLLM reachability before burning time on extraction.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)
    results = []
    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        # NOTE(review): no separator between source and article — an em dash
        # appears to have been lost in extraction; confirm against original.
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']}{tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")
        # Extract article text from PDF; a "[...]" marker signals a miss.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue
        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")
        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])
        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f" ERROR: {litellm_err}")
            # Minimal stand-in so the comparison below still works.
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
        if litellm_quality.get("parsed_data"):
            d = litellm_quality["parsed_data"]
            print(f" Title: {d.get('title', 'N/A')}")
        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
        if anthropic_quality.get("parsed_data"):
            d = anthropic_quality["parsed_data"]
            print(f" Title: {d.get('title', 'N/A')}")
        # Compare the two responses for this test case.
        print(f"\n --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")
        results.append({
            "test": f"{tc['source']}{tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })
    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")
    if not results:
        print(" Keine Ergebnisse.")
        return
    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]
    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")
    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000
    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")
    # Extrapolate for 494 gap articles
    if results:  # always true here (early return above); kept as written
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")
    # Save full results
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")

if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,200 @@
"""Match unmatched Blue Guide controls against the English PDF."""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
try:
import fitz
except ImportError:
print("ERROR: PyMuPDF (fitz) not installed")
exit(1)
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/blue_guide_2022_en.pdf")
def normalize(s):
    """Normalise PDF-extracted text for substring matching.

    Removes soft hyphens and zero-width characters, expands typographic
    ligatures, straightens smart quotes, maps dash/bullet variants to "-",
    strips control characters, applies Unicode NFC, and collapses all
    whitespace runs to single spaces (trimmed).
    """
    # Invisible artifacts to delete outright (soft hyphen twice in the
    # original: \u00ad and \xad are the same codepoint).
    for ghost in ('\u00ad', '\xad', '\u200b'):
        s = s.replace(ghost, '')
    s = s.replace('\u00a0', ' ')
    # Typographic ligatures -> ASCII letter pairs.
    for ligature, expansion in (('\ufb01', 'fi'), ('\ufb02', 'fl'),
                                ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl')):
        s = s.replace(ligature, expansion)
    # Smart quotes and dash/bullet variants -> plain ASCII.
    for fancy, plain in (('\u2019', "'"), ('\u2018', "'"),
                         ('\u201c', '"'), ('\u201d', '"'),
                         ('\u2013', '-'), ('\u2014', '-'),
                         ('\u2022', '-'), ('\u00b7', '-')):
        s = s.replace(fancy, plain)
    # Drop remaining C0 control chars except \t, \n, \r (collapsed below).
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# ── Read the English PDF and build a heading index ───────────────────
print(f"Reading {PDF_PATH}...")
doc = fitz.open(PDF_PATH)
text = ""
for page in doc:
    text += page.get_text() + "\n"
doc.close()
print(f" {len(text):,} chars")
text_norm = normalize(text)

# Build article index for EN Blue Guide.
# EN Blue Guide uses "Article N" headings (not "Artikel N").
items = []
# Probe for "Article 1" to decide which structure the document uses.
art1_match = re.search(r'\nArticle\s+1\s*\n', text)
if not art1_match:
    # Try section-based structure instead (e.g. "4.1.2. Title").
    print(" No 'Article N' headings found, trying section-based index...")
    for m in re.finditer(r'(?:^|\n)\s*(\d+(?:\.\d+)*)\.\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
else:
    # Article headings (removed two unused locals here: the probe position
    # and a parsed article number were never read).
    for m in re.finditer(r'(?:^|\n)\s*Article\s+(\d+[a-z]?)\s*\n', text, re.MULTILINE):
        items.append((m.start(), f"Article {m.group(1)}", "article"))
    # Annex markers (Roman numerals).
    for m in re.finditer(r'(?:^|\n)\s*ANNEX\s+([IVXLC]+[a-z]?)\b', text, re.MULTILINE):
        items.append((m.start(), f"Annex {m.group(1)}", "annex"))
    # Numbered section headings as an additional fallback.
    for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))

# Sort by document position and keep the first occurrence of each label.
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
    if label not in seen:
        seen.add(label)
        unique.append((pos, label, typ))
print(f" Index: {len(unique)} sections")
for pos, label, typ in unique[:5]:
    print(f" {label} [{typ}] @ pos {pos}")

# Translate each heading's raw-text offset into normalized-text coordinates
# so hits found in text_norm can be attributed to the preceding heading.
# NOTE: re-normalizing the prefix per heading is O(n*m); acceptable for the
# few hundred headings of one document.
index_norm = []
for pos, label, typ in unique:
    norm_pos = len(normalize(text[:pos]))
    index_norm.append((norm_pos, label, typ))
# Connect to DB
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get Blue Guide controls without article_type (unmatched)
cur.execute("""
SELECT id, control_id, title, source_original_text,
source_citation->>'article' as existing_article,
source_citation->>'article_type' as existing_type,
release_state
FROM compliance.canonical_controls
WHERE source_citation->>'source' = 'EU Blue Guide 2022'
AND source_original_text IS NOT NULL
AND length(source_original_text) > 50
AND (source_citation->>'article_type' IS NULL)
ORDER BY control_id
""")
controls = cur.fetchall()
print(f"\nUnmatched Blue Guide controls: {len(controls)}")
# Match each control
results = []
found = 0
not_found = 0
for ctrl in controls:
ctrl_id, control_id, title, orig_text, existing_art, existing_type, state = ctrl
orig_norm = normalize(orig_text)
if len(orig_norm) < 30:
not_found += 1
continue
matched = False
for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
for length in [80, 60, 40, 30, 20]:
start = max(0, int(len(orig_norm) * start_frac))
snippet = orig_norm[start:start+length]
if not snippet or len(snippet) < 15:
continue
pos = text_norm.find(snippet)
if pos >= 0:
# Find section
label = "Unknown"
typ = "unknown"
for h_pos, h_label, h_type in reversed(index_norm):
if h_pos <= pos:
label = h_label
typ = h_type
break
results.append({
"ctrl_id": str(ctrl_id),
"control_id": control_id,
"source": "EU Blue Guide 2022",
"article_label": label,
"article_type": typ,
})
found += 1
is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]"
print(f" {control_id:10s}: {label:25s} [{typ:8s}]{is_active}")
matched = True
break
if matched:
break
if not matched:
not_found += 1
print(f" {control_id:10s}: NOT FOUND {title[:50]}")
print(f"\n{'='*50}")
print(f"Results: {found} matched, {not_found} not found out of {len(controls)}")
# Save results
out_path = "/tmp/blue_guide_en_results.json"
with open(out_path, 'w') as f:
json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Saved to {out_path}")
# Apply results to DB
if results:
print(f"\nApplying {len(results)} results to DB...")
applied = 0
for r in results:
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = source_citation ||
jsonb_build_object('article', %s, 'article_type', %s)
WHERE id = %s::uuid
AND (source_citation->>'article' IS DISTINCT FROM %s
OR source_citation->>'article_type' IS DISTINCT FROM %s)
""", (r["article_label"], r["article_type"],
r["ctrl_id"], r["article_label"], r["article_type"]))
if cur.rowcount > 0:
applied += 1
conn.commit()
print(f" Applied: {applied} controls updated")
# Show type distribution
type_counts = {}
for r in results:
t = r["article_type"]
type_counts[t] = type_counts.get(t, 0) + 1
if type_counts:
print(f"\nArticle type distribution:")
for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
print(f" {t:12s}: {c:5d}")
conn.close()

188
scripts/qa/gap_analysis.py Normal file
View File

@@ -0,0 +1,188 @@
"""
Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls.
For each regulation PDF:
1. Extract all articles/sections from the PDF
2. Compare with controls in the DB that reference this article
3. Report gaps (articles with no controls)
Usage:
python3 gap_analysis.py # show all gaps
python3 gap_analysis.py --source "DSGVO" # filter by source
"""
import os
import sys
import json
import re
import psycopg2
import urllib.parse
from pathlib import Path
from collections import defaultdict
# Import from pdf_qa_all
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES
)
# Only analyze sources with significant control counts (skip sources with <5 controls)
MIN_CONTROLS = 5
def main():
    """Report which PDF articles/sections have no controls in the DB.

    For each known source PDF: build an article index, compare against the
    articles cited by active controls in compliance.canonical_controls,
    print a per-source coverage report and an overall summary, and write
    the machine-readable report to /tmp/gap_analysis_results.json.
    """
    # Optional: --source <substring> restricts the run to matching sources.
    source_filter = None
    if "--source" in sys.argv:
        idx = sys.argv.index("--source")
        if idx + 1 < len(sys.argv):
            source_filter = sys.argv[idx + 1]
    # DB connection (DATABASE_URL like postgres://user:pass@host:port/db).
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=parsed.hostname, port=parsed.port or 5432,
        user=parsed.username, password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public"
    )
    cur = conn.cursor()
    # Get all controls grouped by source with their article.
    cur.execute("""
        SELECT source_citation->>'source' as source,
               source_citation->>'article' as article,
               source_citation->>'article_type' as article_type,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        GROUP BY 1, 2, 3
        ORDER BY 1, 2
    """)
    # Build: source -> {article -> (type, count)}
    controls_by_source = defaultdict(dict)
    for source, article, art_type, cnt in cur.fetchall():
        if article:
            controls_by_source[source][article] = (art_type or "unknown", cnt)
    total_gaps = 0
    total_articles_checked = 0
    total_covered = 0
    gap_report = []
    sources_to_check = sorted(SOURCE_FILE_MAP.keys())
    if source_filter:
        sources_to_check = [s for s in sources_to_check if source_filter.lower() in s.lower()]
    for source_name in sources_to_check:
        filename = SOURCE_FILE_MAP.get(source_name)
        if filename is None:
            continue
        controls = controls_by_source.get(source_name, {})
        # Skip low-volume sources unless the user explicitly filtered.
        if len(controls) < MIN_CONTROLS and not source_filter:
            continue
        # Read PDF and build article index with the doc-type-specific builder.
        text = read_file(filename)
        if text is None:
            continue
        doc_type = classify_doc(source_name)
        max_art = MAX_ARTICLES.get(source_name)
        if doc_type == "eu_regulation":
            index = build_eu_article_index(text, max_article=max_art)
        elif doc_type == "de_law":
            index = build_de_law_index(text)
        elif doc_type == "nist":
            index = build_nist_index(text)
        elif doc_type == "owasp":
            index = build_owasp_index(text, source_name)
        else:
            index = build_generic_index(text)
        if not index:
            continue
        # Only look at substantive articles (not preamble, not annex).
        substantive_types = {"article", "section", "control", "requirement", "category"}
        substantive_articles = [(pos, label, typ) for pos, label, typ in index if typ in substantive_types]
        preamble_articles = [(pos, label, typ) for pos, label, typ in index if typ == "preamble"]
        annex_articles = [(pos, label, typ) for pos, label, typ in index if typ == "annex"]
        # Check which articles have controls.
        covered = []
        gaps = []
        for pos, label, typ in substantive_articles:
            if label in controls:
                covered.append(label)
            else:
                gaps.append((label, typ))
        total_articles_checked += len(substantive_articles)
        total_covered += len(covered)
        total_gaps += len(gaps)
        # Count preamble/annex controls (informational only).
        preamble_controls = sum(1 for a in controls if controls[a][0] == "preamble")
        annex_controls = sum(1 for a in controls if controls[a][0] == "annex")
        coverage_pct = len(covered) / len(substantive_articles) * 100 if substantive_articles else 0
        print(f"\n{'='*70}")
        print(f"{source_name}")
        print(f" PDF articles: {len(substantive_articles)} substantive, "
              f"{len(preamble_articles)} preamble, {len(annex_articles)} annex")
        print(f" DB controls: {sum(v[1] for v in controls.values())} total "
              f"({preamble_controls} preamble, {annex_controls} annex)")
        print(f" Coverage: {len(covered)}/{len(substantive_articles)} "
              f"({coverage_pct:.0f}%)")
        if gaps:
            print(f" GAPS ({len(gaps)}):")
            for label, typ in gaps[:30]:  # limit output
                print(f" - {label} [{typ}]")
            if len(gaps) > 30:
                print(f" ... and {len(gaps)-30} more")
        gap_report.append({
            "source": source_name,
            "total_articles": len(substantive_articles),
            "covered": len(covered),
            "gaps": len(gaps),
            "coverage_pct": round(coverage_pct, 1),
            "gap_articles": [{"label": l, "type": t} for l, t in gaps],
        })
    # Summary
    print(f"\n{'='*70}")
    print("GAP ANALYSIS SUMMARY")
    print(f"{'='*70}")
    # BUG FIX: previously this added len(gap_report) to the number of ALL
    # sources that merely have a PDF file, double-counting every analyzed
    # source. A source counts as analyzed iff it produced a report entry.
    print(f" Sources analyzed: {len(gap_report)}")
    print(f" Total articles in PDFs: {total_articles_checked}")
    print(f" Articles with controls: {total_covered}")
    print(f" Articles WITHOUT controls: {total_gaps}")
    if total_articles_checked:
        print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")
    print(f"\n Sources with gaps:")
    for r in sorted(gap_report, key=lambda x: -x["gaps"]):
        print(f" {r['source']:45s} {r['gaps']:4d} gaps "
              f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")
    # Save report
    out_path = "/tmp/gap_analysis_results.json"
    with open(out_path, 'w') as f:
        json.dump(gap_report, f, indent=2, ensure_ascii=False)
    print(f"\n Full report saved to {out_path}")
    conn.close()


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,288 @@
"""Analyze NIST OSCAL data and compare with existing controls in DB."""
import os
import re
import json
import psycopg2
import urllib.parse
from collections import defaultdict
# Directory holding the downloaded NIST OSCAL JSON catalogs.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
# ── Load SP 800-53 Rev 5 ──
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
    sp853 = json.load(f)["catalog"]
print("=" * 70)
print("NIST SP 800-53 Rev 5 — OSCAL Catalog Analysis")
print("=" * 70)
print(f" UUID: {sp853.get('uuid', '?')}")
print(f" Last Modified: {sp853.get('metadata', {}).get('last-modified', '?')}")
# Count controls: base controls and enhancements, split active/withdrawn.
families = sp853.get("groups", [])
total_base = 0
total_enhancements = 0
total_withdrawn = 0
total_active = 0
family_stats = []
for fam in families:
    fam_id = fam.get("id", "?")
    fam_title = fam.get("title", "?")
    controls = fam.get("controls", [])
    base = 0
    enhancements = 0
    withdrawn = 0
    for ctrl in controls:
        # Check if withdrawn (OSCAL marks status in the props list).
        props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
        is_withdrawn = props.get("status") == "withdrawn"
        if is_withdrawn:
            withdrawn += 1
        else:
            base += 1
        # Count enhancements (nested "controls" under a base control).
        for enh in ctrl.get("controls", []):
            enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
            if enh_props.get("status") == "withdrawn":
                withdrawn += 1
            else:
                enhancements += 1
    family_stats.append((fam_id, fam_title, base, enhancements, withdrawn))
    total_base += base
    total_enhancements += enhancements
    total_withdrawn += withdrawn
total_active = total_base + total_enhancements
print(f"\n Families: {len(families)}")
print(f" Base Controls: {total_base}")
print(f" Enhancements: {total_enhancements}")
print(f" Withdrawn: {total_withdrawn}")
print(f" TOTAL ACTIVE: {total_active}")
print(f"\n Per Family:")
print(f" {'ID':6s} {'Title':45s} {'Base':>5s} {'Enh':>5s} {'Wdrn':>5s}")
for fam_id, title, base, enh, wdrn in family_stats:
    print(f" {fam_id:6s} {title[:45]:45s} {base:5d} {enh:5d} {wdrn:5d}")
# Show example control structure (AC-6) to illustrate the OSCAL shape.
print(f"\n Example Control (AC-6 Least Privilege):")
for fam in families:
    for ctrl in fam.get("controls", []):
        if ctrl["id"] == "ac-6":
            props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
            print(f" ID: {ctrl['id']}")
            print(f" Label: {props.get('label', '?')}")
            print(f" Title: {ctrl['title']}")
            for part in ctrl.get("parts", []):
                if part.get("name") == "statement":
                    prose = part.get("prose", "")
                    print(f" Statement: {prose[:150]}...")
                elif part.get("name") == "guidance":
                    prose = part.get("prose", "")
                    print(f" Guidance: {prose[:150]}...")
            enh_count = len(ctrl.get("controls", []))
            print(f" Enhancements: {enh_count}")
            links = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]
            print(f" Related: {', '.join(links[:8])}...")
            # NOTE(review): this break only exits the inner loop; the outer
            # loop keeps scanning remaining families (harmless, ac-6 is unique).
            break
# ── Load CSF 2.0 ──
print(f"\n{'='*70}")
print("NIST CSF 2.0 — OSCAL Catalog Analysis")
print("=" * 70)
with open(os.path.join(OSCAL_DIR, "csf-2.0-catalog.json")) as f:
    csf = json.load(f)["catalog"]
# CSF hierarchy: function group -> category group -> subcategory "controls".
csf_groups = csf.get("groups", [])
csf_total = 0
for grp in csf_groups:
    func_title = grp.get("title", "?")
    cats = grp.get("groups", [])
    subcats = 0
    for cat in cats:
        subcats += len(cat.get("controls", []))
    csf_total += subcats
    print(f" {func_title:25s}: {len(cats):2d} categories, {subcats:3d} subcategories")
print(f" TOTAL: {csf_total} subcategories")
# ── Compare with existing DB controls ──
print(f"\n{'='*70}")
print("VERGLEICH: OSCAL vs. bestehende Controls in DB")
print("=" * 70)
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get existing NIST controls (any NIST source).
cur.execute("""
    SELECT control_id, title,
           source_citation->>'source' as source,
           source_citation->>'article' as article,
           source_citation->>'article_type' as art_type,
           release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' LIKE 'NIST%%'
    ORDER BY source_citation->>'source', control_id
""")
nist_controls = cur.fetchall()
# Group by source
by_source = defaultdict(list)
for ctrl in nist_controls:
    by_source[ctrl[2]].append(ctrl)
print(f"\n Bestehende NIST Controls in DB:")
for src in sorted(by_source.keys()):
    ctrls = by_source[src]
    active = sum(1 for c in ctrls if c[5] not in ('duplicate', 'too_close'))
    with_article = sum(1 for c in ctrls if c[3])
    print(f" {src:40s}: {len(ctrls):4d} total, {active:4d} active, {with_article:4d} mit article")
# For SP 800-53: which control families do we have?
sp853_existing = [c for c in nist_controls if 'SP 800-53' in (c[2] or '')]
existing_families = set()
existing_articles = set()
for ctrl in sp853_existing:
    article = ctrl[3] or ""
    if article:
        # Extract family prefix (e.g., "AC-6" → "AC")
        m = re.match(r'([A-Z]{2})-', article)
        if m:
            existing_families.add(m.group(1))
        existing_articles.add(article)
print(f"\n SP 800-53 in DB:")
print(f" Total: {len(sp853_existing)}")
print(f" Families covered: {len(existing_families)}")
print(f" Unique articles: {len(existing_articles)}")
print(f" Families: {', '.join(sorted(existing_families))}")
# Compare: which OSCAL controls are NOT in our DB?
oscal_controls = {}  # label → (title, statement[:500], guidance[:500])
for fam in families:
    for ctrl in fam.get("controls", []):
        props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
        if props.get("status") == "withdrawn":
            continue
        label = props.get("label", ctrl["id"].upper())
        statement = ""
        guidance = ""
        for part in ctrl.get("parts", []):
            if part.get("name") == "statement":
                statement = part.get("prose", "")
                # Also check sub-items
                for sub in part.get("parts", []):
                    statement += " " + sub.get("prose", "")
            elif part.get("name") == "guidance":
                guidance = part.get("prose", "")
        oscal_controls[label] = (ctrl["title"], statement[:500], guidance[:500])
        # Enhancements (same extraction, one nesting level down).
        for enh in ctrl.get("controls", []):
            enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
            if enh_props.get("status") == "withdrawn":
                continue
            enh_label = enh_props.get("label", enh["id"].upper())
            enh_statement = ""
            enh_guidance = ""
            for part in enh.get("parts", []):
                if part.get("name") == "statement":
                    enh_statement = part.get("prose", "")
                    for sub in part.get("parts", []):
                        enh_statement += " " + sub.get("prose", "")
                elif part.get("name") == "guidance":
                    enh_guidance = part.get("prose", "")
            oscal_controls[enh_label] = (enh["title"], enh_statement[:500], enh_guidance[:500])
print(f"\n OSCAL SP 800-53 aktive Controls: {len(oscal_controls)}")
# Find missing: in OSCAL but not in DB (matched by article label).
missing = []
covered = []
for label in sorted(oscal_controls.keys()):
    if label in existing_articles:
        covered.append(label)
    else:
        missing.append(label)
print(f" In DB vorhanden: {len(covered)}")
print(f" FEHLEND in DB: {len(missing)}")
# Missing by family
missing_by_fam = defaultdict(list)
for label in missing:
    fam = label.split("-")[0]
    missing_by_fam[fam].append(label)
print(f"\n Fehlende Controls nach Family:")
for fam in sorted(missing_by_fam.keys()):
    ctrls = missing_by_fam[fam]
    examples = ", ".join(ctrls[:5])
    more = f" +{len(ctrls)-5}" if len(ctrls) > 5 else ""
    print(f" {fam:4s}: {len(ctrls):3d} fehlend ({examples}{more})")
# Also check CSF 2.0
print(f"\n{'='*70}")
print("NIST CSF 2.0 — Vergleich mit DB")
print("=" * 70)
cur.execute("""
    SELECT count(*), count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' LIKE 'NIST Cybersecurity%%'
""")
csf_row = cur.fetchone()
print(f" CSF Controls in DB: {csf_row[0]} total, {csf_row[1]} active")
csf_subcats = 0
csf_ids = []
for grp in csf_groups:
    for cat in grp.get("groups", []):
        for subcat in cat.get("controls", []):
            csf_subcats += 1
            props = {p["name"]: p.get("value", "") for p in subcat.get("props", [])}
            csf_ids.append(props.get("label", subcat["id"]))
print(f" CSF 2.0 OSCAL Subcategories: {csf_subcats}")
print(f" Beispiele: {', '.join(csf_ids[:10])}")
# ── Summary / Potential ──
print(f"\n{'='*70}")
print("POTENTIAL: Was OSCAL uns bringt")
print("=" * 70)
print(f"""
SP 800-53 Rev 5:
 - {len(missing)} neue Controls möglich (aktuell {len(covered)} in DB)
 - Jeder Control hat: Statement + Guidance + Assessment-Methoden
 - Cross-References zwischen Controls (für Mapping)
 - Maschinenlesbare Parameter (ODP)
 - Public Domain — keine Lizenzprobleme
CSF 2.0:
 - {csf_subcats} Subcategories als Compliance-Controls
 - 6 Functions (Govern, Identify, Protect, Detect, Respond, Recover)
 - Direkte Mappings zu SP 800-53 Controls
Nächste Schritte:
 1. Fehlende SP 800-53 Controls importieren ({len(missing)} Controls)
 2. Statement-Text als source_original_text verwenden
 3. article_type='control', article=Label (z.B. 'AC-6')
 4. CSF 2.0 als eigene Regulation importieren
 5. Cross-References als Grundlage für Control-Mappings nutzen
""")
conn.close()

289
scripts/qa/oscal_import.py Normal file
View File

@@ -0,0 +1,289 @@
"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
import os
import re
import json
import uuid
import psycopg2
import urllib.parse
# Directory holding the downloaded NIST OSCAL JSON catalogs.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
# Load the SP 800-53 Rev 5 catalog once at module start.
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
    sp853 = json.load(f)["catalog"]
# ── Extract all OSCAL controls ──
def extract_controls(catalog):
    """Walk the OSCAL catalog and collect every active (non-withdrawn)
    base control and enhancement as dicts produced by extract_single()."""
    collected = []
    for family in catalog.get("groups", []):
        family_title = family.get("title", "")
        for base in family.get("controls", []):
            # Base control first, then its nested enhancements.
            for node in [base] + base.get("controls", []):
                parsed = extract_single(node, family_title)
                if parsed:
                    collected.append(parsed)
    return collected
def extract_single(ctrl, family_title):
    """Flatten one OSCAL control (or enhancement) dict.

    Returns None for withdrawn controls, otherwise a dict with label,
    title, family, statement (with indented sub-items), guidance,
    related control ids, declared parameters, and an is_enhancement flag.
    """
    meta = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if meta.get("status") == "withdrawn":
        return None

    label = meta.get("label", ctrl["id"].upper())

    def _part_label(node):
        # A part's own label ("a.", "1.", ...) lives in its props list;
        # keep the last one found, matching the original scan order.
        value = ""
        for prop in node.get("props", []):
            if prop["name"] == "label":
                value = prop.get("value", "")
        return value

    statement = ""
    guidance = ""
    for part in ctrl.get("parts", []):
        part_kind = part.get("name")
        if part_kind == "statement":
            # The statement prose plus lettered/numbered sub-items.
            statement = part.get("prose", "")
            for sub in part.get("parts", []):
                sub_prose = sub.get("prose", "")
                sub_label = _part_label(sub)
                if sub_label:
                    statement += f"\n{sub_label} {sub_prose}"
                elif sub_prose:
                    statement += f"\n{sub_prose}"
                # Nested sub-sub-items get an extra indent level.
                for subsub in sub.get("parts", []):
                    ss_prose = subsub.get("prose", "")
                    ss_label = _part_label(subsub)
                    if ss_label:
                        statement += f"\n  {ss_label} {ss_prose}"
                    elif ss_prose:
                        statement += f"\n  {ss_prose}"
        elif part_kind == "guidance":
            guidance = part.get("prose", "")

    # Cross-references to other controls ("#ac-2" → "ac-2").
    related = [
        link["href"].lstrip("#")
        for link in ctrl.get("links", [])
        if link.get("rel") == "related"
    ]

    # Organization-defined parameters (ODP): id, label, guideline prose,
    # and selection choices if present.
    params = []
    for param in ctrl.get("params", []):
        guideline_text = "".join(g.get("prose", "") for g in param.get("guidelines", []))
        choices = list(param["select"].get("choice", [])) if "select" in param else []
        params.append({
            "id": param.get("id", ""),
            "label": param.get("label", ""),
            "guidelines": guideline_text,
            "choices": choices,
        })

    return {
        "label": label,
        "title": ctrl.get("title", ""),
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        # Enhancement labels look like "AC-6(1)".
        "is_enhancement": "(" in label,
    }
# Flatten the whole catalog into a list of active control dicts.
all_oscal = extract_controls(sp853)
print(f"Total OSCAL active controls: {len(all_oscal)}")
# ── Normalize label for comparison ──
def normalize_label(label):
    """Canonicalize an SP 800-53 label for comparison: strip zero-padding
    from the control number ("AC-06" -> "AC-6") and from the enhancement
    number ("AC-6(01)" -> "AC-6(1)"), then uppercase the result."""
    for pattern, repl in ((r'-0+(\d)', r'-\1'), (r'\(0+(\d+)\)', r'(\1)')):
        label = re.sub(pattern, repl, label)
    return label.upper()
# ── DB connection ──
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get existing labels
cur.execute("""
SELECT DISTINCT source_citation->>'article' as article
FROM compliance.canonical_controls
WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
AND source_citation->>'article' IS NOT NULL
""")
existing_labels = set(normalize_label(r[0]) for r in cur.fetchall())
print(f"Existing DB labels (normalized): {len(existing_labels)}")
# Get highest control_id numbers per prefix
cur.execute("""
SELECT control_id FROM compliance.canonical_controls
WHERE control_id ~ '^[A-Z]+-[0-9]+$'
ORDER BY control_id
""")
existing_ids = set(r[0] for r in cur.fetchall())
# Find next available ID per prefix
def next_control_id(prefix, existing):
    """Return the highest numeric suffix among ids matching '<prefix>-<number>'.

    Args:
        prefix: control_id prefix, e.g. "SEC".
        existing: iterable of existing control_id strings.

    Returns:
        The current maximum number (0 if none match). NOTE: despite the
        name, callers must increment the returned value to mint the next
        control_id (the import loop below does `prefix_counters[p] += 1`).
    """
    # re.escape guards against regex metacharacters in the prefix; for the
    # all-alpha prefixes used here the behavior is unchanged.
    pattern = re.compile(rf'^{re.escape(prefix)}-(\d+)$')
    matches = (pattern.match(eid) for eid in existing)
    return max((int(m.group(1)) for m in matches if m), default=0)
# Map NIST families to our control_id prefixes. Families without an entry
# fall back to "COMP" in the import loop below.
FAMILY_PREFIX = {
    "Access Control": "ACC",
    "Awareness and Training": "GOV",
    "Audit and Accountability": "LOG",
    "Assessment, Authorization, and Monitoring": "GOV",
    "Configuration Management": "COMP",
    "Contingency Planning": "INC",
    "Identification and Authentication": "AUTH",
    "Incident Response": "INC",
    "Maintenance": "COMP",
    "Media Protection": "DATA",
    "Physical and Environmental Protection": "SEC",
    "Planning": "GOV",
    "Program Management": "GOV",
    "Personnel Security": "GOV",
    "Personally Identifiable Information Processing and Transparency": "DATA",
    "Risk Assessment": "GOV",
    "System and Services Acquisition": "COMP",
    "System and Communications Protection": "NET",
    "System and Information Integrity": "SEC",
    "Supply Chain Risk Management": "COMP",
}
# Track next IDs: seed each prefix counter with the current DB maximum.
prefix_counters = {}
for prefix in set(FAMILY_PREFIX.values()):
    prefix_counters[prefix] = next_control_id(prefix, existing_ids)
print(f"Starting counters: {prefix_counters}")
# ── Filter to only new controls ──
to_import = []
for ctrl in all_oscal:
    norm = normalize_label(ctrl["label"])
    if norm not in existing_labels:
        to_import.append(ctrl)
print(f"\nControls to import: {len(to_import)}")
# ── Import ──
imported = 0
for ctrl in to_import:
    # Mint the next sequential control_id for the mapped prefix.
    prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
    prefix_counters[prefix] += 1
    control_id = f"{prefix}-{prefix_counters[prefix]:04d}"
    # Build title: "NIST {label}: {title}"
    title = f"NIST {ctrl['label']}: {ctrl['title']}"
    # source_original_text = statement (the official requirement text);
    # fall back to guidance, then the bare title, when empty.
    source_text = ctrl["statement"]
    if not source_text:
        source_text = ctrl["guidance"][:500] if ctrl["guidance"] else ctrl["title"]
    # objective = guidance text
    objective = ctrl["guidance"][:2000] if ctrl["guidance"] else ""
    # source_citation: provenance JSON, including cross-references and
    # parameters (truncated) so mappings can be derived later.
    citation = {
        "source": "NIST SP 800-53 Rev. 5",
        "article": ctrl["label"],
        "article_type": "control",
        "source_type": "standard",
        "oscal_import": True,
    }
    if ctrl["related"]:
        citation["related_controls"] = ctrl["related"][:20]
    if ctrl["params"]:
        citation["parameters"] = [{"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]]
    # Fixed framework UUID for the NIST SP 800-53 framework row.
    FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'
    new_id = str(uuid.uuid4())
    cur.execute("""
        INSERT INTO compliance.canonical_controls
            (id, framework_id, control_id, title, objective, rationale,
             severity, source_original_text,
             source_citation, pipeline_version, release_state,
             generation_strategy, category)
        VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
    """, (
        new_id,
        FRAMEWORK_ID,
        control_id,
        title[:500],
        objective[:5000],
        source_text[:10000],
        json.dumps(citation, ensure_ascii=False),
        ctrl["family"],
    ))
    imported += 1
conn.commit()
print(f"\nImported: {imported} new controls")
# ── Verify ──
cur.execute("""
    SELECT count(*),
           count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
""")
total, active = cur.fetchone()
print(f"\nSP 800-53 after import: {total} total, {active} active")
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\nDB release_state gesamt:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")
# ── Import stats by family ──
fam_counts = {}
for ctrl in to_import:
    fam = ctrl["family"]
    fam_counts[fam] = fam_counts.get(fam, 0) + 1
print(f"\nImportiert nach Family:")
for fam in sorted(fam_counts.keys()):
    print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")
conn.close()

274
scripts/qa/owasp_cleanup.py Normal file
View File

@@ -0,0 +1,274 @@
"""OWASP Cleanup:
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
2. Fix 47 wrong source attributions (found in different OWASP PDF)
"""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
# PyMuPDF is required to read the OWASP PDFs; fail fast with a clear message.
try:
    import fitz
except ImportError:
    print("ERROR: PyMuPDF not installed")
    exit(1)
# Directory holding the downloaded OWASP PDFs.
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
def normalize(s):
    """Normalize extracted PDF text for substring matching.

    Removes soft hyphens and zero-width characters, expands ligatures,
    straightens smart quotes/dashes/bullets, strips control characters,
    applies Unicode NFC, and collapses all whitespace to single spaces.
    """
    # One-pass character mapping (equivalent to the chained replaces).
    table = str.maketrans({
        '\u00ad': '',        # soft hyphen
        '\u200b': '',        # zero-width space
        '\u00a0': ' ',       # non-breaking space
        '\ufb01': 'fi', '\ufb02': 'fl',
        '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
        '\u2019': "'", '\u2018': "'",
        '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-',
        '\u2022': '-', '\u00b7': '-',
    })
    s = s.translate(table)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# Load OWASP PDFs: source name -> PDF filename under PDF_DIR.
OWASP_PDFS = {
    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
}
# source name -> normalized full text of its PDF (missing files skipped).
pdf_norms = {}
for name, filename in OWASP_PDFS.items():
    path = os.path.join(PDF_DIR, filename)
    if not os.path.exists(path):
        continue
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text() + "\n"
    doc.close()
    pdf_norms[name] = normalize(text)
def build_owasp_index(text_norm, source_name):
# We need the raw text for regex, but we already normalized.
# Rebuild index from normalized text.
items = []
if "Top 10" in source_name and "API" not in source_name:
for m in re.finditer(r'(A\d{2}:\d{4})', text_norm):
items.append((m.start(), m.group(1), "category"))
elif "API" in source_name:
for m in re.finditer(r'(API\d+:\d{4})', text_norm):
items.append((m.start(), m.group(1), "category"))
elif "ASVS" in source_name:
for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text_norm):
items.append((m.start(), m.group(1), "requirement"))
elif "MASVS" in source_name:
for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text_norm):
items.append((m.start(), m.group(1), "requirement"))
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
if label not in seen:
seen.add(label)
unique.append((pos, label, typ))
return unique
# Pre-built label index per source PDF: list of (offset, label, type).
pdf_indexes = {}
for name, norm in pdf_norms.items():
    pdf_indexes[name] = build_owasp_index(norm, name)
def find_in_pdf(orig_text, source_name):
    """Find control text in a specific PDF. Returns (label, type) or None.

    Heuristic: try snippets taken from several relative offsets and
    lengths of the normalized control text; on the first substring hit,
    return the nearest preceding index heading for that PDF (falling back
    to ("Unknown", "unknown") when the hit precedes every heading).
    """
    haystack = pdf_norms.get(source_name)
    if not haystack:
        return None
    needle_src = normalize(orig_text)
    if len(needle_src) < 20:
        return None
    headings = pdf_indexes.get(source_name, [])
    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        for width in (80, 60, 40, 30, 20):
            offset = max(0, int(len(needle_src) * frac))
            snippet = needle_src[offset:offset + width]
            if len(snippet) < 15:
                continue
            hit = haystack.find(snippet)
            if hit < 0:
                continue
            # Walk headings backwards to find the last one at/before the hit.
            label, typ = "Unknown", "unknown"
            for h_pos, h_label, h_type in reversed(headings):
                if h_pos <= hit:
                    label, typ = h_label, h_type
                    break
            return (label, typ)
    return None
# DB
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# ═══════════════════════════════════════════════════════════════
# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
# ═══════════════════════════════════════════════════════════════
print("=" * 60)
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
print("=" * 60)
# Active Top 10 controls whose text was never located in any PDF
# (article_type IS NULL marks "unmatched" from the earlier QA pass).
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_unmatched = cur.fetchall()
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")
# Separate: found in other OWASP PDF vs not found anywhere
to_mark_dup = []
to_fix_source = []
for ctrl in top10_unmatched:
    uid, cid, title, text, state = ctrl
    # Check if found in another OWASP PDF
    found_in = None
    found_result = None
    for other_src in OWASP_PDFS:
        if other_src == 'OWASP Top 10 (2021)':
            continue
        result = find_in_pdf(text, other_src)
        if result:
            found_in = other_src
            found_result = result
            break
    if found_in:
        to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1]))
    else:
        to_mark_dup.append((uid, cid))
print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")
# Mark as duplicate (guarded again in SQL so reruns stay idempotent).
dup_marked = 0
for uid, cid in to_mark_dup:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET release_state = 'duplicate'
        WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
    """, (uid,))
    if cur.rowcount > 0:
        dup_marked += 1
print(f" Marked as duplicate: {dup_marked}")
# ═══════════════════════════════════════════════════════════════
# STEP 2: Fix wrong source attributions across ALL OWASP sources
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("STEP 2: Fix wrong OWASP source attributions")
print("=" * 60)
all_fixes = list(to_fix_source)  # Start with Top 10 fixes
# Also check ASVS, SAMM, MASVS
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
    """, (source,))
    controls = cur.fetchall()
    for ctrl in controls:
        uid, cid, title, text = ctrl
        # Try own PDF first
        result = find_in_pdf(text, source)
        if result:
            # Found in own PDF! Update article info only (source is right).
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = source_citation ||
                    jsonb_build_object('article', %s, 'article_type', %s)
                WHERE id = %s
                  AND (source_citation->>'article' IS DISTINCT FROM %s
                       OR source_citation->>'article_type' IS DISTINCT FROM %s)
            """, (result[0], result[1], uid, result[0], result[1]))
            continue
        # Try other OWASP PDFs
        for other_src in OWASP_PDFS:
            if other_src == source:
                continue
            result = find_in_pdf(text, other_src)
            if result:
                all_fixes.append((uid, cid, other_src, result[0], result[1]))
                break
print(f" Total wrong-source controls found: {len(all_fixes)}")
# Apply source fixes: rewrite source + article + article_type in one merge.
fixed = 0
for uid, cid, correct_source, label, typ in all_fixes:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET source_citation = source_citation ||
            jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
        WHERE id = %s
    """, (correct_source, label, typ, uid,))
    if cur.rowcount > 0:
        fixed += 1
        print(f" {cid:10s} → {correct_source} / {label} [{typ}]")
print(f" Fixed: {fixed} controls")
conn.commit()
# ═══════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("ZUSAMMENFASSUNG")
print("=" * 60)
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
print(f" Wrong source attribution → fixed: {fixed}")
# Final counts
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\n DB release_state nach Cleanup:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
active = cur.fetchone()[0]
print(f"\n Aktive Controls: {active}")
conn.close()

View File

@@ -0,0 +1,316 @@
"""Match unmatched OWASP ASVS/SAMM/MASVS controls against GitHub Markdown sources."""
import os
import re
import unicodedata
import psycopg2
import urllib.parse
from pathlib import Path
GITHUB_DIR = Path(os.path.expanduser("~/rag-ingestion/owasp-github"))
def normalize(s):
    """Normalize text for substring matching across sources.

    Drops soft hyphens and zero-width characters, expands ligatures,
    straightens smart quotes/dashes/bullets, strips control characters,
    applies Unicode NFC, and collapses runs of whitespace to one space.
    """
    # Single-pass character mapping, equivalent to the chained replaces.
    table = str.maketrans({
        '\u00ad': '',        # soft hyphen
        '\u200b': '',        # zero-width space
        '\u00a0': ' ',       # non-breaking space
        '\ufb01': 'fi', '\ufb02': 'fl',
        '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
        '\u2019': "'", '\u2018': "'",
        '\u201c': '"', '\u201d': '"',
        '\u2013': '-', '\u2014': '-',
        '\u2022': '-', '\u00b7': '-',
    })
    s = s.translate(table)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# ── Load Markdown sources ──
def load_markdown_dir(path, pattern="*.md"):
    """Load all files in *path* matching *pattern* (non-recursive).

    Args:
        path: pathlib.Path of the directory to scan.
        pattern: glob pattern, default "*.md".

    Returns:
        {filename: text} in sorted filename order. Files that cannot be
        read are skipped; decoding problems are already absorbed by
        errors='replace'.
    """
    texts = {}
    for f in sorted(path.glob(pattern)):
        try:
            texts[f.name] = f.read_text(encoding='utf-8', errors='replace')
        except OSError:
            # Was a bare `except:` — narrowed so KeyboardInterrupt and real
            # bugs are no longer silently swallowed; only unreadable files
            # (permissions, broken symlinks, directories) are skipped.
            pass
    return texts
# ASVS 4.0 — V-files contain requirements
asvs_dir = GITHUB_DIR / "ASVS" / "4.0" / "en"
asvs_files = load_markdown_dir(asvs_dir)
asvs_full = "\n".join(asvs_files.values())
asvs_norm = normalize(asvs_full)
print(f"ASVS 4.0 Markdown: {len(asvs_files)} files, {len(asvs_full):,} chars")
# SAMM core — YAML + Markdown
samm_dir = GITHUB_DIR / "samm-core"
samm_texts = {}
for f in samm_dir.rglob("*.yml"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        # NOTE(review): bare except silently skips unreadable files; should
        # be narrowed to OSError.
        pass
for f in samm_dir.rglob("*.md"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
samm_full = "\n".join(samm_texts.values())
samm_norm = normalize(samm_full)
print(f"SAMM 2.0 source: {len(samm_texts)} files, {len(samm_full):,} chars")
# MASVS — control markdown files
masvs_dir = GITHUB_DIR / "masvs"
masvs_files = {}
for f in masvs_dir.rglob("*.md"):
    try:
        masvs_files[str(f.relative_to(masvs_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
masvs_full = "\n".join(masvs_files.values())
masvs_norm = normalize(masvs_full)
print(f"MASVS 2.0 source: {len(masvs_files)} files, {len(masvs_full):,} chars")
# API Security
api_dir = GITHUB_DIR / "api-security"
api_files = {}
for f in api_dir.rglob("*.md"):
    try:
        api_files[str(f.relative_to(api_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
api_full = "\n".join(api_files.values())
api_norm = normalize(api_full)
print(f"API Security source: {len(api_files)} files, {len(api_full):,} chars")
# Source → normalized full text of its GitHub material.
SOURCE_GITHUB = {
    "OWASP ASVS 4.0": asvs_norm,
    "OWASP SAMM 2.0": samm_norm,
    "OWASP MASVS 2.0": masvs_norm,
    "OWASP API Security Top 10 (2023)": api_norm,
}
# Build indexes for each source
def build_asvs_index(text):
    """Index ASVS requirement IDs (e.g. ``V1.2.3``) by position in *text*.

    Returns ``(position, label, "requirement")`` tuples sorted by position,
    keeping only the first occurrence of each label.
    """
    hits = [(m.start(), m.group(1), "requirement")
            for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text)]
    hits.sort(key=lambda h: h[0])
    deduped = []
    seen_labels = set()
    for pos, label, typ in hits:
        if label not in seen_labels:
            seen_labels.add(label)
            deduped.append((pos, label, typ))
    return deduped
def build_samm_index(text):
    """Index SAMM section numbers and practice names by position in *text*.

    Two heuristics: numbered headings like "1.2 Something", and a fixed set
    of known practice names with up to 30 chars of trailing context.
    Returns position-sorted ``(position, label, "section")`` tuples, first
    occurrence of each label only.
    """
    hits = []
    # Numbered sections ("1.2 Title" / "1.2.3 Title").
    for m in re.finditer(r'(?:^|\s)(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text):
        hits.append((m.start(), f"Section {m.group(1)}", "section"))
    # Known practice identifiers.
    practice_re = (r'((?:Strategy|Education|Policy|Threat|Security Requirements|Secure Architecture|'
                   r'Secure Build|Secure Deployment|Defect Management|Environment Management|'
                   r'Incident Management|Requirements Testing|Security Testing|'
                   r'Design Review|Implementation Review|Operations Management)'
                   r'[^.\n]{0,30})')
    for m in re.finditer(practice_re, text):
        hits.append((m.start(), m.group(1)[:50], "section"))
    hits.sort(key=lambda h: h[0])
    deduped = []
    seen_labels = set()
    for pos, label, typ in hits:
        if label not in seen_labels:
            seen_labels.add(label)
            deduped.append((pos, label, typ))
    return deduped
def build_masvs_index(text):
    """Index MASVS control IDs (e.g. ``MASVS-AUTH-1``) by position in *text*.

    ``re.finditer`` already yields matches in position order, so a single
    pass with first-occurrence dedup is equivalent to sort-then-dedup.
    """
    seen_labels = set()
    result = []
    for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text):
        label = m.group(1)
        if label not in seen_labels:
            seen_labels.add(label)
            result.append((m.start(), label, "requirement"))
    return result
def build_api_index(text):
    """Index API Top-10 category IDs (e.g. ``API1:2023``) by position in *text*.

    Matches are already in position order; keep the first hit per label.
    """
    seen_labels = set()
    out = []
    for m in re.finditer(r'(API\d+:\d{4})', text):
        label = m.group(1)
        if label in seen_labels:
            continue
        seen_labels.add(label)
        out.append((m.start(), label, "category"))
    return out
# Source name → function that builds its heading/requirement index.
SOURCE_INDEX_BUILDERS = {
    "OWASP ASVS 4.0": build_asvs_index,
    "OWASP SAMM 2.0": build_samm_index,
    "OWASP MASVS 2.0": build_masvs_index,
    "OWASP API Security Top 10 (2023)": build_api_index,
}
# Build all indexes on normalized text
# (positions are only meaningful against the SAME normalized text that
# find_text searches, so both must use SOURCE_GITHUB).
source_indexes = {}
for name, norm_text in SOURCE_GITHUB.items():
    builder = SOURCE_INDEX_BUILDERS[name]
    idx = builder(norm_text)
    source_indexes[name] = idx
    print(f"  {name}: {len(idx)} index entries")
def find_text(orig_text, source_name):
    """Locate *orig_text* inside the normalized GitHub source *source_name*.

    Probes snippets taken at several fractional offsets of the normalized
    control text (longer snippets first); on the first hit, returns the
    (label, type) of the nearest index heading at or before the match
    position. Returns None when the source is unknown, the text is too
    short, or nothing matches.
    """
    norm_text = SOURCE_GITHUB.get(source_name)
    if not norm_text:
        return None
    idx = source_indexes.get(source_name, [])
    orig_norm = normalize(orig_text)
    if len(orig_norm) < 20:
        return None
    # Probe order matters: mid-ish offsets first, longest snippets first.
    for start_frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        offset = max(0, int(len(orig_norm) * start_frac))
        for length in (80, 60, 40, 30, 20):
            snippet = orig_norm[offset:offset + length]
            if len(snippet) < 15:
                continue
            pos = norm_text.find(snippet)
            if pos < 0:
                continue
            # Walk the position-sorted index backwards to find the last
            # heading starting at or before the match.
            label, typ = "Unknown", "unknown"
            for h_pos, h_label, h_type in reversed(idx):
                if h_pos <= pos:
                    label, typ = h_label, h_type
                    break
            return (label, typ)
    return None
def find_in_any_github(orig_text, exclude_source=None):
    """Try every GitHub source except *exclude_source*.

    Returns (source_name, label, type) for the first hit, else None.
    """
    for source_name in SOURCE_GITHUB:
        if source_name == exclude_source:
            continue
        hit = find_text(orig_text, source_name)
        if hit:
            label, typ = hit
            return (source_name, label, typ)
    return None
# ── DB ──
# DATABASE_URL is required (KeyError if unset); parsed into psycopg2 kwargs.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# ── Process each OWASP source ──
# For every active control still lacking an article_type, try to match its
# original text first against its own GitHub source, then cross-source.
# Matches are collected in all_updates and applied in one batch below.
total_matched = 0
total_cross = 0
total_not_found = 0
all_updates = []
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP MASVS 2.0', 'OWASP API Security Top 10 (2023)']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text, release_state
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        ORDER BY control_id
    """, (source,))
    controls = cur.fetchall()
    if not controls:
        continue
    print(f"\n{'='*60}")
    print(f"{source} — {len(controls)} unmatched active")
    print(f"{'='*60}")
    matched = 0
    cross_matched = 0
    not_found = 0
    for ctrl in controls:
        uid, cid, title, text, state = ctrl
        # Try own GitHub source
        result = find_text(text, source)
        if result:
            matched += 1
            total_matched += 1
            all_updates.append((uid, cid, source, result[0], result[1]))
            print(f"  {cid:10s} → {result[0]:30s} [{result[1]}]")
            continue
        # Try other GitHub sources
        cross = find_in_any_github(text, exclude_source=source)
        if cross:
            cross_matched += 1
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f"  {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}] (CROSS)")
            continue
        not_found += 1
        total_not_found += 1
    print(f"\n  Own source matched: {matched}")
    print(f"  Cross-source: {cross_matched}")
    print(f"  Not found: {not_found}")
# ── Also try OWASP Top 10 remaining unmatched (34 active left after dup marking) ──
# OWASP Top 10 has no own GitHub tree here, so only cross-source matching
# is attempted for its leftovers.
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_remaining = cur.fetchall()
if top10_remaining:
    print(f"\n{'='*60}")
    print(f"OWASP Top 10 (2021) — {len(top10_remaining)} remaining unmatched active")
    print(f"{'='*60}")
    for ctrl in top10_remaining:
        uid, cid, title, text, state = ctrl
        cross = find_in_any_github(text)
        if cross:
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f"  {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}]")
        else:
            total_not_found += 1
# ── Summary ──
print(f"\n{'='*60}")
print(f"ZUSAMMENFASSUNG")
print(f"{'='*60}")
print(f" Matched in eigener GitHub-Quelle: {total_matched}")
print(f" Cross-source matched: {total_cross}")
print(f" Nicht gefunden: {total_not_found}")
print(f" Total Updates: {len(all_updates)}")
# ── Apply updates ──
if all_updates:
    print(f"\nApplying {len(all_updates)} updates to DB...")
    applied = 0
    for uid, cid, correct_source, label, typ in all_updates:
        # Merge article + article_type into the citation JSON; the WHERE
        # clause makes this a no-op (rowcount 0) when both already match.
        # NOTE(review): although the tuple carries correct_source for
        # cross-matched rows, the 'source' key is NOT rewritten here —
        # confirm whether cross-matched citations should also be re-sourced.
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s
              AND (source_citation->>'article' IS DISTINCT FROM %s
                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (label, typ, uid, label, typ))
        if cur.rowcount > 0:
            applied += 1
    conn.commit()
    print(f"  Applied: {applied} controls updated")
    # Type distribution
    type_counts = {}
    for _, _, _, _, typ in all_updates:
        type_counts[typ] = type_counts.get(typ, 0) + 1
    print(f"\n  Article type distribution:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f"    {t:12s}: {c:5d}")
conn.close()

View File

@@ -0,0 +1,357 @@
"""Phase 5: Source Normalization + Duplicate Hard Delete.
Steps:
1. OSCAL controls: add source_regulation to generation_metadata
2. Fix 20 v3 controls with NULL source (tag as manually_reviewed)
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
6. Clean up canonical_processed_chunks generated_control_ids
Usage:
export DATABASE_URL='postgresql://...'
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
"""
import os
import sys
import json
import psycopg2
import urllib.parse
# CLI flags parsed by hand (no argparse): --dry-run and --step N.
DRY_RUN = "--dry-run" in sys.argv
STEP_ONLY = None
for arg in sys.argv:
    if arg.startswith("--step"):
        # NOTE(review): "--step=3" also satisfies startswith("--step") but
        # the value is read from the NEXT argv entry, so only the
        # "--step 3" form works — confirm callers never use the "=" form.
        idx = sys.argv.index(arg)
        if idx + 1 < len(sys.argv):
            STEP_ONLY = int(sys.argv[idx + 1])
# DATABASE_URL is required (KeyError if unset).
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    # Unqualified table names resolve in the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
def should_run(step):
    """Return True when *step* is selected: no --step filter, or exact match."""
    return STEP_ONLY in (None, step)
# ══════════════════════════════════════════════════════════════════
# Step 1: OSCAL controls — add source_regulation to generation_metadata
# ══════════════════════════════════════════════════════════════════
if should_run(1):
    print("=" * 70)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print("=" * 70)
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls
        WHERE generation_strategy = 'oscal_import'
          AND (generation_metadata->>'source_regulation' IS NULL
               OR generation_metadata->>'source_regulation' = '')
    """)
    count = cur.fetchone()[0]
    print(f"  OSCAL controls without source_regulation: {count}")
    if count > 0:
        if DRY_RUN:
            print(f"  [DRY RUN] Would update {count} controls")
        else:
            # All OSCAL imports are tagged as NIST SP 800-53r5 here;
            # COALESCE guards against NULL metadata.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                    || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
                WHERE generation_strategy = 'oscal_import'
                  AND (generation_metadata->>'source_regulation' IS NULL
                       OR generation_metadata->>'source_regulation' = '')
            """)
            print(f"  Updated: {cur.rowcount}")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 2: v3 controls with NULL source — tag source as best guess
# ══════════════════════════════════════════════════════════════════
if should_run(2):
    print("=" * 70)
    print("STEP 2: Fix v3 controls with NULL source")
    print("=" * 70)
    # These 20 controls are v3/document_grouped with no source or regulation.
    # Based on title analysis, they cover:
    # - Data protection/privacy topics (DSGVO-adjacent)
    # - Software security (OWASP/NIST-adjacent)
    # - Mobile security (OWASP MASVS-adjacent)
    # Mark them as 'needs_review' and add a flag.
    cur.execute("""
        SELECT id, control_id, title
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NULL
          AND pipeline_version = 3
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    v3_null = cur.fetchall()
    print(f"  v3 controls with NULL source: {len(v3_null)}")
    if v3_null:
        if DRY_RUN:
            print(f"  [DRY RUN] Would mark {len(v3_null)} as needs_review")
        else:
            # control_id/title are unpacked but unused; only the UUID
            # drives the UPDATE.
            for ctrl_id_uuid, control_id, title in v3_null:
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET release_state = 'needs_review',
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                            || '{"missing_source": true}'::jsonb
                    WHERE id = %s
                """, (ctrl_id_uuid,))
            print(f"  Marked {len(v3_null)} as needs_review with missing_source flag")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 3: Fix empty-string source (DATA-631)
# ══════════════════════════════════════════════════════════════════
if should_run(3):
    print("=" * 70)
    print("STEP 3: Fix empty-string source")
    print("=" * 70)
    cur.execute("""
        SELECT id, control_id, title,
               generation_metadata->>'source_regulation' as reg
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = ''
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    empty_src = cur.fetchall()
    print(f"  Controls with empty source: {len(empty_src)}")
    for ctrl_id_uuid, control_id, title, reg in empty_src:
        print(f"    {control_id} | reg={reg} | {title[:60]}")
        # Only the at_tkg case is known; anything else gets a visible
        # "Unbekannt" placeholder so it can be triaged later.
        if reg == 'at_tkg':
            new_source = 'Telekommunikationsgesetz Oesterreich'
        else:
            new_source = f"Unbekannt ({reg})"
        if DRY_RUN:
            print(f"    [DRY RUN] Would set source='{new_source}'")
        else:
            # json.dumps produces a valid jsonb string literal for jsonb_set.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation, '{source}', %s::jsonb
                )
                WHERE id = %s
            """, (json.dumps(new_source), ctrl_id_uuid))
            print(f"    Set source='{new_source}'")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 4: Fix OWASP cross-source misattributions
# ══════════════════════════════════════════════════════════════════
if should_run(4):
    print("=" * 70)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print("=" * 70)
    # Controls where source_citation.source doesn't match the regulation_code
    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }
    # Strategy: Move controls to the regulation_code that matches their actual source
    # i.e., if a control has source='OWASP ASVS 4.0' but reg='owasp_top10_2021',
    # update the reg to 'owasp_asvs'
    SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()}
    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        cur.execute("""
            SELECT id, control_id, source_citation->>'source' as src
            FROM compliance.canonical_controls
            WHERE generation_metadata->>'source_regulation' = %s
              AND source_citation->>'source' <> %s
              AND release_state NOT IN ('duplicate', 'too_close')
        """, (reg_code, expected_source))
        mismatches = cur.fetchall()
        if mismatches:
            print(f"\n  {reg_code} — {len(mismatches)} Mismatches:")
            for ctrl_id_uuid, control_id, actual_source in mismatches:
                correct_reg = SOURCE_TO_REG.get(actual_source)
                if correct_reg:
                    print(f"    {control_id} | {actual_source} → reg={correct_reg}")
                    if not DRY_RUN:
                        cur.execute("""
                            UPDATE compliance.canonical_controls
                            SET generation_metadata = jsonb_set(
                                generation_metadata, '{source_regulation}', %s::jsonb
                            )
                            WHERE id = %s
                        """, (json.dumps(correct_reg), ctrl_id_uuid))
                    # Counted in dry-run too, so the summary line matches.
                    total_fixed += 1
                else:
                    print(f"    {control_id} | {actual_source} → no mapping found")
    if DRY_RUN:
        print(f"\n  [DRY RUN] Would fix {total_fixed} misattributions")
    else:
        print(f"\n  Fixed: {total_fixed} misattributions")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 5: Hard delete duplicate/too_close controls
# ══════════════════════════════════════════════════════════════════
if should_run(5):
    print("=" * 70)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print("=" * 70)
    # Verify no FK references. The f-string SQL is safe here: table/col
    # come from the literal list below, never from user input.
    for table, col in [
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    ]:
        cur.execute(f"""
            SELECT count(*)
            FROM compliance.{table} t
            JOIN compliance.canonical_controls cc ON cc.id = t.{col}
            WHERE cc.release_state IN ('duplicate', 'too_close')
        """)
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f"  WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            print(f"  ABORTING Step 5 — clean FK refs first!")
            sys.exit(1)
        else:
            print(f"  {table}.{col}: 0 refs ✓")
    # Check self-references (children pointing at a soon-deleted parent).
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls child
        JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
        WHERE parent.release_state IN ('duplicate', 'too_close')
    """)
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f"  WARNING: {self_refs} child controls reference dup/too_close parents!")
        print(f"  ABORTING Step 5!")
        sys.exit(1)
    print(f"  Self-references: 0 ✓")
    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        WHERE release_state IN ('duplicate', 'too_close')
        GROUP BY 1
    """)
    to_delete = {}
    for state, cnt in cur.fetchall():
        to_delete[state] = cnt
        print(f"\n  {state}: {cnt}")
    total = sum(to_delete.values())
    print(f"\n  TOTAL to delete: {total}")
    if DRY_RUN:
        print(f"  [DRY RUN] Would delete {total} controls")
    else:
        # Destructive: rows are permanently removed (no archive table).
        cur.execute("""
            DELETE FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
        """)
        print(f"  Deleted: {cur.rowcount} controls")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 6: Clean up canonical_processed_chunks generated_control_ids
# Removes control UUIDs that no longer exist (e.g. after Step 5).
# ══════════════════════════════════════════════════════════════════
if should_run(6):
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)
    if DRY_RUN and should_run(5):
        # The surviving-ID set depends on Step 5's deletes, so a dry run
        # covering both steps cannot predict the outcome.
        print("  [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        # Find chunks that reference non-existent controls
        cur.execute("""
            SELECT id, generated_control_ids
            FROM compliance.canonical_processed_chunks
            WHERE generated_control_ids IS NOT NULL
              AND generated_control_ids <> '[]'::jsonb
        """)
        chunks = cur.fetchall()
        print(f"  Chunks with generated_control_ids: {len(chunks)}")
        # Snapshot of all surviving control UUIDs (as text).
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = set(r[0] for r in cur.fetchall())
        print(f"  Existing controls: {len(existing_ids)}")
        cleaned = 0
        for chunk_id, control_ids in chunks:
            # jsonb may arrive as an undecoded string or a decoded list.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if isinstance(control_ids, list):
                valid_ids = [cid for cid in control_ids if cid in existing_ids]
                if len(valid_ids) < len(control_ids):
                    # BUGFIX: previously the UPDATE executed even with
                    # --dry-run (never committed, but the printed count was
                    # misleading); also dropped the unused `removed` local.
                    if not DRY_RUN:
                        cur.execute("""
                            UPDATE compliance.canonical_processed_chunks
                            SET generated_control_ids = %s::jsonb
                            WHERE id = %s
                        """, (json.dumps(valid_ids), chunk_id))
                    cleaned += 1
        if DRY_RUN:
            print(f"  [DRY RUN] Chunks that would be cleaned: {cleaned}")
        else:
            print(f"  Chunks cleaned: {cleaned}")
    print()
# ══════════════════════════════════════════════════════════════════
# Final summary
# ══════════════════════════════════════════════════════════════════
if not DRY_RUN:
    conn.commit()
    print("=" * 70)
    print("COMMITTED. Final state:")
    print("=" * 70)
else:
    # Nothing was committed; any uncommitted work vanishes on close.
    print("=" * 70)
    print("[DRY RUN] No changes committed. Current state:")
    print("=" * 70)
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY 1
    ORDER BY count(*) DESC
""")
total = 0
active = 0
for state, cnt in cur.fetchall():
    total += cnt
    if state not in ('duplicate', 'too_close'):
        active += cnt
    print(f"  {state:15s}: {cnt:5d}")
print(f"\n  TOTAL: {total}")
print(f"  AKTIV: {active}")
conn.close()

View File

@@ -0,0 +1,655 @@
#!/usr/bin/env python3
"""
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
Reads gap_analysis_results.json, extracts article text from PDFs,
calls Claude Sonnet to generate controls, inserts into DB.
Usage:
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
python3 phase74_generate_gap_controls.py # generate and insert
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
"""
import os
import sys
import json
import re
import time
import hashlib
import argparse
import psycopg2
import urllib.parse
import requests
from pathlib import Path
from collections import Counter
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES,
)
# ── Config ──────────────────────────────────────────────────────────
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model can be overridden via env; the API key must be set by the caller.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
PIPELINE_VERSION = 5
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
# PyMuPDF is optional; fitz stays None when unavailable.
try:
    import fitz
except ImportError:
    fitz = None
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
# Keys are the display names stored in source_citation->>'source'.
# NOTE(review): phase5 step 4 uses 'owasp_top10_2021'/'owasp_api_top10_2023'
# while this table uses 'owasp_top10'/'owasp_api_top10' — confirm which
# codes the DB actually stores.
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
# License info per regulation code (from REGULATION_LICENSE_MAP)
# NOTE(review): the 'rule' values (1 vs 2) presumably encode reuse-policy
# tiers from REGULATION_LICENSE_MAP — confirm their exact semantics there.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
# Domain detection keywords
# Lowercase substrings matched against text.lower() by detect_domain();
# mixed German/English stems so both languages score.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
# ── Prompt (same as control_generator.py) ────────────────────────────
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
"Abfallwirtschaft", "Forschung"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
"processes_minors_data", "automated_decisions", "employee_monitoring",
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
CATEGORY_LIST = [
"Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
"Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
"Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
"Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
"Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
"Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
"Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
def build_prompt(source_name, article_label, article_text, license_type):
    """Build the user prompt for generating one control from a gap article.

    Embeds at most the first 3000 chars of *article_text* to bound token
    usage; *source_name*/*license_type* are cited so the model knows
    verbatim reuse of the legal text is permitted.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF article extraction ───────────────────────────────────────────
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of one article/section from a source document.

    Args:
        pdf_file: file handed to ``read_file`` when *full_text* is absent.
        article_label: e.g. "Artikel 10", "§ 42", or a NIST/OWASP heading.
        doc_type: "eu_regulation", "de_law", "nist", or anything else
            (generic heading search).
        full_text: optional pre-read document text (skips ``read_file``).

    Returns:
        Up to 3000 chars of the article's text, or "" when not found.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""
    if doc_type == "eu_regulation":
        # "Artikel <n>" on its own line; slice until "Artikel <n+1>" or, if
        # that heading is absent, at most 5000 chars further.
        art_num_match = re.search(r'\d+', article_label)
        if not art_num_match:
            return ""
        num = int(art_num_match.group())
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return ""
        start = match.start()
        next_match = re.search(rf'\nArtikel\s+{num + 1}\s*\n', full_text)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "de_law":
        # German statutes: "§ <n>". BUGFIX: the previous pattern
        # rf'\\s+{num}\b' searched for a literal backslash followed by "s"
        # and could never match a statute heading; it now anchors on the
        # section sign (with optional whitespace, so "§10" also matches).
        para_match = re.search(r'\d+', article_label)
        if not para_match:
            return ""
        num = int(para_match.group())
        match = re.search(rf'§\s*{num}\b', full_text)
        if not match:
            return ""
        start = match.start()
        next_match = re.search(rf'§\s*{num + 1}\b', full_text)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "nist":
        # Control label at the start of a line; take a fixed-size window.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
    else:
        # Generic / OWASP / ENISA: label anywhere on a line.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
# ── Anthropic API ────────────────────────────────────────────────────
def call_anthropic(prompt, system_prompt):
    """POST one message to the Anthropic API.

    Returns a 4-tuple ``(parsed_data, raw_text, usage, error)``; on any
    failure (transport error or non-200 status) *error* is a string and
    the other fields are empty.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        resp = requests.post(ANTHROPIC_URL, headers=request_headers,
                             json=request_body, timeout=120)
        if resp.status_code != 200:
            return None, "", {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
        data = resp.json()
        blocks = data.get("content")
        content = blocks[0]["text"] if blocks else ""
        usage = data.get("usage", {})
        return parse_json(content), content, usage, None
    except Exception as e:
        return None, "", {}, str(e)
def parse_json(text):
    """Parse a JSON object out of an LLM reply.

    Strips a markdown code fence if present, unwraps a top-level array to
    its first element, and falls back to grabbing the outermost ``{...}``
    span. Returns the parsed value or None.
    """
    text = text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        body = lines[1:]
        if lines[-1].strip().startswith("```"):
            body = lines[1:-1]
        text = "\n".join(body).strip()
    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Last resort: widest brace-delimited span in the reply.
        brace_span = re.search(r'\{[\s\S]*\}', text)
        if brace_span is None:
            return None
        try:
            return json.loads(brace_span.group())
        except json.JSONDecodeError:
            return None
    if isinstance(data, list):
        return data[0] if data else None
    return data
# ── Domain detection ─────────────────────────────────────────────────
def detect_domain(text):
    """Pick the domain code whose keyword list has the most hits in *text*.

    Ties keep the first domain in DOMAIN_KEYWORDS order; falls back to
    "SEC" when no keyword matches at all.
    """
    haystack = text.lower()
    best_domain = "SEC"
    best_score = 0
    for domain, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(1 for kw in keywords if kw in haystack)
        if hits > best_score:
            best_domain, best_score = domain, hits
    return best_domain
# ── Control ID generation ────────────────────────────────────────────
def generate_control_id(domain, cur):
    """Return the next free control_id for a domain prefix.

    Takes MAX over the numeric suffix so plain string ordering quirks
    (e.g. COMP-99 sorting after COMP-1000) cannot cause collisions.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
        AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    highest = row[0] if row else None
    if highest is None:
        # No numeric suffix seen yet for this prefix.
        return f"{prefix}-001"
    return f"{prefix}-{highest + 1}"
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Phase 7.4 driver: generate draft controls for gap articles.

    Loads gap-analysis results, pulls each gap article's text from the
    mapped source PDF, asks the Anthropic API for a draft control and
    inserts it into compliance.canonical_controls. Supports --dry-run,
    --source substring filtering and --resume (skip covered articles).
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load gap results
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")
    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")
    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    def connect_db():
        """Create DB connection with TCP keepalive. Returns (conn, cursor)."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()
    conn, cur = connect_db()
    def ensure_db():
        """Reconnect if the connection is dead; True when a reconnect happened."""
        nonlocal conn, cur
        try:
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False
    # Get framework UUID
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]
    # If resuming, load existing articles per source
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")
    # Stats
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()
    # Pre-read PDFs (cache full text per source)
    pdf_cache = {}
    # Process sources with the most gaps first.
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)
        if not filename:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        print(f"\n{'='*70}")
        print(f"{source_name}{len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")
        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]
            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue
            # Skip non-substantive NIST sections (intro chapters)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue
            # Extract article text
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue
            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue
            # Call Anthropic
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)
            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)
            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                time.sleep(5)
                continue
            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue
            # Ensure DB is alive before writing
            ensure_db()
            # Build control — coerce every LLM-provided field defensively,
            # since the model may return the wrong JSON type.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                domain = detect_domain(article_text)
            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"
            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }
            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }
            category = str(data.get("category", "")) or None
            # Insert into DB
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # NOTE(review): fetchone() after commit() relies on the driver
                # buffering results client-side — confirm against psycopg2 docs.
                row = cur.fetchone()
                if row:
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")
            # Rate limit: ~0.5s between calls
            time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Cost assumes $3/M input and $15/M output tokens — TODO confirm per model.
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000
    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()
    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")
    if errors:
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")
    conn.close()
# Script entry point.
if __name__ == "__main__":
    main()

218
scripts/qa/run_job.sh Executable file
View File

@@ -0,0 +1,218 @@
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────
# Robust job runner for QA scripts on Mac Mini
#
# Usage:
# ./run_job.sh <script.py> [args...] # start job
# ./run_job.sh --status # show running jobs
# ./run_job.sh --kill <script.py> # kill a running job
# ./run_job.sh --log <script.py> # tail log
#
# Features:
# - Loads .env automatically (COMPLIANCE_DATABASE_URL → DATABASE_URL)
# - PID-file prevents duplicate runs
# - Unbuffered Python output
# - Structured log files in /tmp/qa_jobs/
# ─────────────────────────────────────────────────────────────
# Abort on errors, unset variables, and failed pipeline stages.
set -euo pipefail
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
# All PID and log files live under one well-known directory.
JOB_DIR="/tmp/qa_jobs"
mkdir -p "$JOB_DIR"
# Source the project .env (exporting every variable it defines) and fall
# back to COMPLIANCE_DATABASE_URL for DATABASE_URL when the latter is unset.
load_env() {
    local dotenv="$PROJECT_DIR/.env"
    if [[ -f "$dotenv" ]]; then
        set -a
        # shellcheck disable=SC1090
        source "$dotenv"
        set +a
    fi
    if [[ -z "${DATABASE_URL:-}" ]] && [[ -n "${COMPLIANCE_DATABASE_URL:-}" ]]; then
        export DATABASE_URL="$COMPLIANCE_DATABASE_URL"
    fi
}
# ── Job name from script path ─────────────────────────────────
# Derive a stable job name from the script filename (strips .py).
job_name() {
    basename "$1" .py
}
# PID file path for a given script.
pid_file() {
    echo "$JOB_DIR/$(job_name "$1").pid"
}
# Log file path for a given script.
log_file() {
    echo "$JOB_DIR/$(job_name "$1").log"
}
# ── Status ────────────────────────────────────────────────────
# Print a status table for all known jobs and reap stale PID files.
show_status() {
    echo "═══════════════════════════════════════════════════════"
    echo "QA Job Status ($(date '+%Y-%m-%d %H:%M:%S'))"
    echo "═══════════════════════════════════════════════════════"
    local found=0
    for pidfile in "$JOB_DIR"/*.pid; do
        [[ -f "$pidfile" ]] || continue
        found=1
        local name
        name=$(basename "$pidfile" .pid)
        local pid
        pid=$(cat "$pidfile")
        local logf="$JOB_DIR/$name.log"
        if kill -0 "$pid" 2>/dev/null; then
            local lines
            lines=$(wc -l < "$logf" 2>/dev/null || echo 0)
            # BUGFIX: `grep -c` prints "0" AND exits non-zero on no match, so
            # the old `$(grep -c ... || echo 0)` captured TWO lines ("0\n0").
            # Assign first, then fall back only when the command itself fails.
            local errors
            errors=$(grep -c "ERROR" "$logf" 2>/dev/null) || errors=0
            local last_line
            last_line=$(tail -1 "$logf" 2>/dev/null || echo "(empty)")
            echo "$name (PID $pid) — RUNNING"
            echo " Log: $logf ($lines lines, $errors errors)"
            echo " Last: $last_line"
        else
            echo "$name (PID $pid) — STOPPED"
            echo " Log: $logf"
            # Process is gone: remove the stale PID file.
            rm -f "$pidfile"
        fi
        echo ""
    done
    if [[ $found -eq 0 ]]; then
        echo " No jobs running."
    fi
}
# ── Kill ──────────────────────────────────────────────────────
# Stop a running job identified by its script name; the PID file is
# removed afterwards regardless of whether the process was still alive.
kill_job() {
    local script="$1" pf pid
    pf=$(pid_file "$script")
    [[ -f "$pf" ]] || { echo "No PID file for $(job_name "$script")"; return 1; }
    pid=$(cat "$pf")
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "Process $pid already stopped"
    else
        kill "$pid"
        echo "Killed $(job_name "$script") (PID $pid)"
    fi
    rm -f "$pf"
}
# ── Tail log ──────────────────────────────────────────────────
# Show the last 50 lines of a job's log file.
tail_log() {
    local lf
    lf=$(log_file "$1")
    [[ -f "$lf" ]] || { echo "No log file: $lf"; return 1; }
    tail -50 "$lf"
}
# ── Start job ─────────────────────────────────────────────────
# Launch one QA script in the background with its own log and PID file.
# Refuses to start a second instance of the same job.
start_job() {
    local script="$1"
    shift
    local args=("$@")
    # Resolve script path (as given first, then relative to this directory)
    local script_path="$script"
    if [[ ! -f "$script_path" ]]; then
        script_path="$SCRIPT_DIR/$script"
    fi
    if [[ ! -f "$script_path" ]]; then
        echo "ERROR: Script not found: $script"
        return 1
    fi
    local name
    name=$(job_name "$script")
    local pf
    pf=$(pid_file "$script")
    local lf
    lf=$(log_file "$script")
    # Check for already-running instance
    if [[ -f "$pf" ]]; then
        local existing_pid
        existing_pid=$(cat "$pf")
        if kill -0 "$existing_pid" 2>/dev/null; then
            echo "ERROR: $name already running (PID $existing_pid)"
            echo "Use: $0 --kill $script"
            return 1
        fi
        rm -f "$pf"
    fi
    # Load environment
    load_env
    # Verify required env vars
    if [[ -z "${DATABASE_URL:-}" ]]; then
        echo "ERROR: DATABASE_URL not set (checked .env)"
        return 1
    fi
    # Start
    echo "Starting $name..."
    echo " Script: $script_path"
    echo " Args: ${args[*]:-none}"
    echo " Log: $lf"
    # BUGFIX: under `set -u`, expanding "${args[@]}" on an EMPTY array aborts
    # the script on bash < 4.4 (macOS ships bash 3.2, and this runner targets
    # a Mac Mini). ${args[@]+"${args[@]}"} expands to nothing when the array
    # is empty and to the properly quoted args otherwise.
    nohup python3 -u "$script_path" ${args[@]+"${args[@]}"} > "$lf" 2>&1 &
    local pid=$!
    echo "$pid" > "$pf"
    echo " PID: $pid"
    echo ""
    # Wait a moment and check it started OK
    sleep 3
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "ERROR: Process died immediately. Log output:"
        cat "$lf"
        rm -f "$pf"
        return 1
    fi
    local lines
    lines=$(wc -l < "$lf" 2>/dev/null || echo 0)
    echo "Running OK ($lines log lines so far)"
    echo "Monitor with: $0 --status"
    echo "Tail log: $0 --log $script"
}
# ── Main ──────────────────────────────────────────────────────
# Dispatch: known flags are handled explicitly; an empty argument prints
# usage; anything else is treated as a script to start (with its args).
case "${1:-}" in
    --status|-s)
        show_status
        ;;
    --kill|-k)
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --kill <script.py>"; exit 1; }
        kill_job "$2"
        ;;
    --log|-l)
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --log <script.py>"; exit 1; }
        tail_log "$2"
        ;;
    --help|-h|"")
        echo "Usage:"
        echo " $0 <script.py> [args...] Start a QA job"
        echo " $0 --status Show running jobs"
        echo " $0 --kill <script.py> Kill a running job"
        echo " $0 --log <script.py> Tail job log"
        ;;
    *)
        start_job "$@"
        ;;
esac

307
scripts/qa/sync_db.py Normal file
View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""Sync canonical control tables between production and local DB.
Modes:
--pull Production → Local (initial sync, full table copy)
--push Local → Production (incremental, only new obligation_candidates)
--loop Run --push every N minutes (default 60)
Usage:
python3 sync_db.py --pull # Full sync production → local
python3 sync_db.py --push # Push new obligations to production
python3 sync_db.py --loop 60 # Push every 60 minutes
python3 sync_db.py --pull --tables canonical_controls # Only one table
"""
import argparse
import json
import os
import sys
import time
import urllib.parse
import io
import psycopg2
import psycopg2.extras
import psycopg2.extensions
# Register JSON adapter so dicts are automatically converted to JSONB
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
# ── DB Config ────────────────────────────────────────────────────────
# SECURITY NOTE(review): a production password is hard-coded below as the
# fallback default. Rotate this credential and require PROD_DATABASE_URL
# from the environment instead of shipping a secret in source control.
PROD_URL = os.environ.get(
    "PROD_DATABASE_URL",
    "postgresql://postgres:GmyFD3wnU1NrKBdpU1nwLdE8MLts0A0eez8L5XXdvUCe05lWnWfVp3C6JJ8Yrmt2"
    "@46.225.100.82:54321/postgres?sslmode=require",
)
LOCAL_URL = os.environ.get(
    "LOCAL_DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
)
# All synced tables live in this schema on both sides.
SCHEMA = "compliance"
# Tables to sync (production → local)
SYNC_TABLES = [
    "canonical_control_frameworks",
    "canonical_control_licenses",
    "canonical_control_sources",
    "canonical_control_categories",
    "canonical_blocked_sources",
    "canonical_controls",
    "canonical_control_mappings",
    "canonical_processed_chunks",
    "canonical_generation_jobs",
    "control_patterns",
    "crosswalk_matrix",
    "obligation_extractions",
    "obligation_candidates",
]
def connect(url, label="DB"):
    """Open a psycopg2 connection with TCP keepalives.

    The search path is pinned to the compliance schema; autocommit is
    disabled so callers control transaction boundaries.
    """
    parts = urllib.parse.urlparse(url)
    query_params = dict(urllib.parse.parse_qsl(parts.query))
    connection = psycopg2.connect(
        host=parts.hostname,
        port=parts.port or 5432,
        user=parts.username,
        password=parts.password,
        dbname=parts.path.lstrip("/"),
        sslmode=query_params.get("sslmode", "prefer"),
        options=f"-c search_path={SCHEMA},public",
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5,
    )
    connection.autocommit = False
    print(f" Connected to {label} ({parts.hostname}:{parts.port or 5432})")
    return connection
def get_columns(cur, table):
    """Return the column names of *table* in ordinal order.

    Uses bound parameters instead of f-string interpolation so a table
    name containing a quote cannot break (or inject into) the query.
    """
    cur.execute(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    return [r[0] for r in cur.fetchall()]
def pull_table(prod_conn, local_conn, table):
    """Copy one entire table from production to local.

    Drops and recreates the local table with a constraint-free schema
    derived from information_schema, then bulk-inserts all rows.
    Returns the number of rows copied (0 when the table is missing).
    """
    prod_cur = prod_conn.cursor()
    local_cur = local_conn.cursor()
    # Check table exists on production
    prod_cur.execute(f"""
        SELECT 1 FROM pg_tables
        WHERE schemaname = '{SCHEMA}' AND tablename = '{table}'
    """)
    if not prod_cur.fetchone():
        print(f" SKIP {table} — not found on production")
        return 0
    # Drop local table
    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
    local_conn.commit()
    # Build simple CREATE TABLE (no constraints, no defaults — just for data)
    prod_cur.execute(f"""
        SELECT column_name, data_type, udt_name, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = '{SCHEMA}' AND table_name = '{table}'
        ORDER BY ordinal_position
    """)
    col_defs = prod_cur.fetchall()
    parts = []
    col_names = []
    jsonb_cols = set()
    for name, dtype, udt, max_len in col_defs:
        col_names.append(name)
        if dtype == "ARRAY":
            # information_schema reports arrays via the "_"-prefixed udt name.
            type_map = {
                "_text": "text[]", "_varchar": "varchar[]",
                "_int4": "integer[]", "_uuid": "uuid[]",
                "_jsonb": "jsonb[]", "_float8": "float8[]",
            }
            sql_type = type_map.get(udt, f"{udt.lstrip('_')}[]")
        elif dtype == "USER-DEFINED" and udt == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif dtype == "USER-DEFINED":
            sql_type = udt
        elif dtype == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif max_len:
            # BUGFIX: was f"{dtype}(70,526)" — a hard-coded, syntactically
            # invalid length pair that broke the DDL for any varchar(n)
            # column. Use the column's actual character_maximum_length.
            sql_type = f"{dtype}({max_len})"
        else:
            sql_type = dtype
        parts.append(f'"{name}" {sql_type}')
    ddl = f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})"
    local_cur.execute(ddl)
    local_conn.commit()
    # Fetch all rows from production
    col_list = ", ".join(f'"{c}"' for c in col_names)
    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
    rows = prod_cur.fetchall()
    if rows:
        # Wrap dict/list values in Json for JSONB columns
        adapted_rows = []
        for row in rows:
            adapted = []
            for i, val in enumerate(row):
                if col_names[i] in jsonb_cols and isinstance(val, (dict, list)):
                    adapted.append(psycopg2.extras.Json(val))
                else:
                    adapted.append(val)
            adapted_rows.append(tuple(adapted))
        placeholders = ", ".join(["%s"] * len(col_names))
        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
        local_conn.commit()
    print(f" {table}: {len(rows)} rows")
    return len(rows)
def pull(tables=None):
    """Full sync: copy every sync table from production into the local DB.

    *tables* optionally restricts the run to a subset of SYNC_TABLES.
    Errors on one table are logged and do not stop the remaining tables.
    """
    print("\n=== PULL: Production → Local ===\n")
    prod_conn = connect(PROD_URL, "Production")
    local_conn = connect(LOCAL_URL, "Local")
    # Make sure the target schema exists before creating tables in it.
    schema_cur = local_conn.cursor()
    schema_cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    local_conn.commit()
    total = 0
    for table in (tables if tables else SYNC_TABLES):
        try:
            total += pull_table(prod_conn, local_conn, table)
        except Exception as e:
            print(f" ERROR {table}: {e}")
            local_conn.rollback()
            prod_conn.rollback()
    print(f"\n Total: {total} rows synced")
    prod_conn.close()
    local_conn.close()
def push():
    """Incremental push of new obligation_candidates local → production.

    Diffs candidate_id sets on both sides and inserts only the missing
    rows (ON CONFLICT DO NOTHING guards against races). Returns the
    number of rows pushed.
    """
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")
    local_conn = connect(LOCAL_URL, "Local")
    prod_conn = connect(PROD_URL, "Production")
    local_cur = local_conn.cursor()
    prod_cur = prod_conn.cursor()
    # Find obligation_candidates in local that don't exist in production
    # Use candidate_id as the unique key
    local_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    local_ids = {r[0] for r in local_cur.fetchall()}
    if not local_ids:
        print(" No obligation_candidates in local DB")
        local_conn.close()
        prod_conn.close()
        return 0
    # Check which already exist on production
    prod_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    prod_ids = {r[0] for r in prod_cur.fetchall()}
    new_ids = local_ids - prod_ids
    if not new_ids:
        print(f" All {len(local_ids)} obligations already on production")
        local_conn.close()
        prod_conn.close()
        return 0
    print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")
    # Get columns
    columns = get_columns(local_cur, "obligation_candidates")
    col_list = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))
    # Fetch new rows from local.
    # BUGFIX: ids were previously spliced into the SQL as f-string quoted
    # literals, which breaks (and is injectable) for ids containing quotes.
    # Bind them as a single array parameter instead.
    local_cur.execute(
        f"""
        SELECT {col_list} FROM {SCHEMA}.obligation_candidates
        WHERE candidate_id = ANY(%s)
        """,
        (list(new_ids),),
    )
    rows = local_cur.fetchall()
    # Insert into production
    insert_sql = f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
    psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
    prod_conn.commit()
    print(f" Pushed {len(rows)} obligations to production")
    local_conn.close()
    prod_conn.close()
    return len(rows)
def loop(interval_min):
    """Run push() every *interval_min* minutes until interrupted.

    A failing push is logged and retried on the next cycle. Ctrl+C
    (KeyboardInterrupt) now exits cleanly — the banner promises "Press
    Ctrl+C to stop", but the old code dumped a traceback instead.
    """
    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
    print(f" Started at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(" Press Ctrl+C to stop\n")
    try:
        while True:
            try:
                pushed = push()
                if pushed:
                    print(f" Next sync in {interval_min} min...")
            except Exception as e:
                # Keep the loop alive across transient DB/network failures.
                print(f" SYNC ERROR: {e}")
            time.sleep(interval_min * 60)
    except KeyboardInterrupt:
        print("\n Sync loop stopped.")
def main():
    """CLI entry point: dispatch --pull / --push / --loop modes."""
    parser = argparse.ArgumentParser(description="Sync canonical control tables")
    parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)")
    parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)")
    parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes")
    parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)")
    args = parser.parse_args()
    # Without any mode flag there is nothing to do — show usage.
    if not (args.pull or args.push or args.loop):
        parser.print_help()
        return
    if args.pull:
        pull(args.tables)
    if args.push:
        push()
    if args.loop:
        loop(args.loop)
# Script entry point.
if __name__ == "__main__":
    main()

470
scripts/qa/test_pass0a.py Normal file
View File

@@ -0,0 +1,470 @@
#!/usr/bin/env python3
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.
Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
Copies prompts and quality gate from decomposition_pass.py.
Usage:
python3 test_pass0a.py # 10 controls, Anthropic
python3 test_pass0a.py --limit 5 # 5 controls
python3 test_pass0a.py --source "DSGVO" # filter by source
python3 test_pass0a.py --dry-run # show controls, no LLM call
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.parse
import psycopg2
import requests
# ── Config ────────────────────────────────────────────────────────────
# API key must come from the environment; the script refuses to run without it.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Model is overridable via DECOMPOSITION_LLM_MODEL.
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
# ── Prompts (from decomposition_pass.py) ──────────────────────────────
# German system prompt for Pass 0a. It is sent verbatim to the API, so the
# text itself is runtime data and must not be altered or translated here.
SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.
REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.
Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Render the per-control user prompt (German) for obligation extraction.

    The doubled braces ``{{``/``}}`` in the template emit a literal JSON
    example; everything else interpolates the control's fields.
    """
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.
CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}
Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""
# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
# Tier 1: Pflicht (mandatory) — strong normative phrasing, German "ist zu
# <verb>en" constructions (incl. separable-prefix forms), English shall/must.
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)
# Tier 2: Empfehlung (recommendation) — soft modals plus verbs that usually
# carry recommended-practice semantics.
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)
# Tier 3: Kann (optional/permissive)
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)
# Union of all three tiers (backward compat with older "any normative
# signal" checks).
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)
# Markers of justification/recital language — such text is NOT an obligation.
_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)
# Test-obligation markers (audits, effectiveness checks, periodic reviews).
_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)
# Reporting-obligation markers (notification duties, authorities).
_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)
def classify_obligation_type(txt):
    """Classify obligation strength: pflicht > empfehlung > kann.

    Checked strictly in priority order; text matching no tier defaults
    to "empfehlung" (the conservative middle tier).
    """
    for pattern, label in (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    ):
        if pattern.search(txt):
            return label
    return "empfehlung"
def quality_gate(obl_text, parent_uuid):
    """Validate and classify one extracted obligation.

    Returns ``(flags, passed, confidence, obligation_type)``: ``flags``
    maps check names to results, ``passed`` requires only the critical
    checks, and ``confidence`` is a weighted score in [0, 1].
    """
    stripped = obl_text.strip()
    obl_type = classify_obligation_type(obl_text)
    # Heuristic: a second governing verb after a conjunction suggests the
    # obligation was not split into a single atomic action.
    multi_verb_re = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    # Text that merely names an artifact (evidence) is not an obligation.
    evidence_only_re = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )
    flags = {
        # Informational only — deliberately NOT part of the pass criteria.
        "has_normative_signal": bool(_NORMATIVE_RE.search(obl_text)),
        "obligation_type": obl_type,
        "single_action": not bool(multi_verb_re.search(obl_text)),
        "not_rationale": len(_NORMATIVE_RE.findall(obl_text)) >= len(_RATIONALE_RE.findall(obl_text)),
        "not_evidence_only": not bool(evidence_only_re.match(stripped)),
        "min_length": len(stripped) >= 20,
        "has_parent_link": bool(parent_uuid),
    }
    weights = {
        "has_normative_signal": 0.25, "single_action": 0.20,
        "not_rationale": 0.20, "not_evidence_only": 0.15,
        "min_length": 0.10, "has_parent_link": 0.05,
    }
    confidence = sum(w for key, w in weights.items() if flags[key])
    # Small bonus when classified as a hard obligation, capped at 1.0.
    if obl_type == "pflicht":
        confidence = min(confidence + 0.05, 1.0)
    # Only these three checks can reject an obligation outright.
    passed = all(flags[key] for key in ("not_evidence_only", "min_length", "has_parent_link"))
    return flags, passed, confidence, obl_type
# ── JSON parsing ──────────────────────────────────────────────────────
def parse_json_array(text):
    """Parse a JSON array from raw LLM output; returns [] when impossible.

    A lone JSON object is wrapped in a single-element list. As a
    fallback, the outermost ``[...]`` span is extracted and parsed.
    """
    def _coerce(candidate):
        # None signals "not an acceptable shape".
        if isinstance(candidate, list):
            return candidate
        if isinstance(candidate, dict):
            return [candidate]
        return None
    try:
        direct = _coerce(json.loads(text))
        if direct is not None:
            return direct
    except json.JSONDecodeError:
        pass
    bracket_span = re.search(r"\[[\s\S]*\]", text)
    if bracket_span:
        try:
            extracted = json.loads(bracket_span.group())
            if isinstance(extracted, list):
                return extracted
        except json.JSONDecodeError:
            pass
    return []
# ── API call ──────────────────────────────────────────────────────────
def call_anthropic(prompt):
    """POST one message to the Anthropic API with a cached system prompt.

    Returns ``(text, usage, error)``; ``error`` is ``None`` on success.
    """
    auth_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    # The system prompt carries an ephemeral cache_control marker so
    # repeated calls reuse the cached prefix.
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post(f"{ANTHROPIC_API_URL}/messages", headers=auth_headers, json=request_body, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    body = resp.json()
    blocks = body.get("content", [])
    first_text = blocks[0].get("text", "") if blocks else ""
    return first_text, body.get("usage", {}), None
# ── Format helpers ────────────────────────────────────────────────────
def fmt_json(val):
    """Format a JSON-ish value for embedding into a prompt.

    ``None`` → "", JSON strings are decoded first (non-JSON strings are
    returned untouched), lists render as bullet lines, and anything else
    falls back to ``str()``.
    """
    if val is None:
        return ""
    decoded = val
    if isinstance(decoded, str):
        try:
            decoded = json.loads(decoded)
        except (json.JSONDecodeError, TypeError):
            # Not JSON — return the raw string untouched.
            return decoded
    if isinstance(decoded, list):
        bullets = [f"  - {entry}" for entry in decoded]
        return "\n".join(bullets)
    return str(decoded)
# ── Main ──────────────────────────────────────────────────────────────
def main():
    """Run the Pass 0a extraction test end-to-end.

    Samples controls from the compliance DB, sends each to the LLM,
    classifies the returned obligations via quality_gate
    (pflicht / empfehlung / kann / rejected), prints per-control detail
    plus a summary with a cost projection, and dumps all results as
    JSON under /tmp for later analysis.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    # The API key is only required when we actually call the LLM.
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()
    # Select a diverse sample of top-level, sufficiently long controls.
    query = """
    SELECT id, control_id, title, objective, requirements,
    test_procedure, source_citation, category
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
    AND parent_control_uuid IS NULL
    AND title IS NOT NULL AND objective IS NOT NULL
    AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
    """
    params = []
    if args.source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{args.source}%")
    query += " ORDER BY source_citation->>'source', random()"
    # args.limit is argparse type=int, so interpolating it into the SQL
    # here cannot inject.
    query += f" LIMIT {args.limit}"
    cur.execute(query, params)
    controls = cur.fetchall()
    if not controls:
        print("No controls found.")
        return
    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")
    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    all_results = []
    t_start = time.time()
    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row
        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            # source_citation may arrive as a dict (jsonb) or a JSON string.
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"
        print(f"\n{''*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f" Source: {source_str} | Category: {category or 'N/A'}")
        print(f" Objective: {(objective or '')[:200]}")
        if args.dry_run:
            print(" [DRY RUN]")
            continue
        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)
        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok
        obligations = parse_json_array(response_text)
        total_obls += len(obligations)
        print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{f' ({cached} cached)' if cached else ''}"
              f" | {len(obligations)} obligation(s)")
        for j, obl in enumerate(obligations, 1):
            obl_text = obl.get("obligation_text", "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))
            # Auto-detect test/reporting obligations the model missed,
            # via regex over the obligation text.
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True
            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1
            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"
            # Show the 3-tier type instead of a bare PASS/REJECT.
            type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
            if not passed:
                status = "REJECT"
            else:
                status = type_label.get(obl_type, "EMPFEHLUNG")
            failed = [k for k, v in flags.items()
                      if isinstance(v, bool) and not v]
            print(f"\n {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f" {obl_text}")
            print(f" Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f" Bedingung: {condition}")
            if not passed:
                print(f" Abgelehnt: {', '.join(failed)}")
            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": {k: v for k, v in flags.items()},
            })
        # Brief pause between controls to stay under API rate limits.
        time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    # Per-MTok pricing: $3 input / $15 output — presumably Sonnet-class
    # pricing; confirm if ANTHROPIC_MODEL changes.
    cost = (total_in * 3 + total_out * 15) / 1_000_000
    # NOTE(review): total_classified is computed but never used below.
    total_classified = sum(type_counts.values())
    print(f"\n\n{'='*70}")
    print(f"ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f" Controls: {len(controls)}")
    print(f" Obligations: {total_obls} ({total_obls/max(len(controls),1):.1f} pro Control)")
    print(f" ── Klassifizierung ──")
    print(f" Pflicht: {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
    print(f" Empfehlung: {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
    print(f" Kann: {type_counts['kann']}"
          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
    print(f" Rejected: {total_rejected}"
          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
          f" (nur evidence-only/zu kurz/kein parent)")
    print(f" ── Kosten ──")
    print(f" Laufzeit: {elapsed_total:.1f}s")
    print(f" Tokens: {total_in:,} in / {total_out:,} out")
    print(f" Kosten: ${cost:.4f}")
    # Linear extrapolation of cost/runtime to the full corpus.
    if len(controls) > 0 and not args.dry_run and total_obls > 0:
        n = 6000
        factor = n / len(controls)
        print(f"\n --- Hochrechnung auf {n:,} Controls ---")
        print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
        print(f" Kosten: ${cost * factor:.2f}")
        print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
        print(f" Obligations: ~{int(total_obls / len(controls) * n):,}")
        pf = int(type_counts['pflicht'] * factor)
        ef = int(type_counts['empfehlung'] * factor)
        kf = int(type_counts['kann'] * factor)
        print(f" Pflicht: ~{pf:,}")
        print(f" Empfehlung: ~{ef:,}")
        print(f" Kann: ~{kf:,}")
    # Save results JSON for later analysis (consumed by the Pass 0b script).
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        with open(out_path, "w") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n Ergebnisse gespeichert: {out_path}")
    conn.close()
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,308 @@
#!/usr/bin/env python3
"""Preview Pass 0b: Turn obligation candidates into atomic controls.
Picks a few obligations from Pass 0a results, calls LLM to compose
atomic controls, and writes them to canonical_controls with parent_control_uuid.
Usage:
python3 test_pass0b_preview.py --input /tmp/pass0a_results_60controls.json --limit 3
"""
import argparse
import json
import os
import re
import sys
import time
import uuid
import urllib.parse
import psycopg2
import psycopg2.extras
import requests
# Register a psycopg2 adapter so plain dicts passed as query parameters
# are serialized as JSON (module-level side effect).
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
# API key from the environment; empty means "not configured" (checked in
# main unless --dry-run). Model is overridable via DECOMPOSITION_LLM_MODEL.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
# German system prompt: the model acts as a security-compliance expert
# turning a single normative obligation into one actionable, atomic
# control, answering with JSON only (no explanations).
SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.
Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
Antworte NUR als JSON. Keine Erklärungen."""
def build_pass0b_prompt(obl_text, action, obj, parent_title, category, source_ref):
    """Build the German user prompt for Pass 0b.

    Asks the LLM to turn one obligation (text + action + object) into a
    single atomic control, providing the originating control's title,
    category and source reference as context, and spells out the exact
    JSON response schema (title, objective, requirements,
    test_procedure, evidence, severity, category).
    """
    return f"""\
Erstelle aus der folgenden Pflicht ein atomares Control.
PFLICHT: {obl_text}
HANDLUNG: {action}
GEGENSTAND: {obj}
KONTEXT (Ursprungs-Control):
Titel: {parent_title}
Kategorie: {category}
Quellreferenz: {source_ref}
Antworte als JSON:
{{
"title": "Kurzer Titel (max 80 Zeichen, deutsch)",
"objective": "Was muss erreicht werden? (1-2 Sätze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
"test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
"evidence": ["Nachweis 1", "Nachweis 2"],
"severity": "critical|high|medium|low",
"category": "security|privacy|governance|operations|finance|reporting"
}}"""
def call_anthropic(prompt):
    """POST *prompt* to the Anthropic Messages API.

    Returns a (text, usage, error) triple: response text and usage dict
    on success (error is None), or (None, {}, error-string) on a
    non-200 HTTP response.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        # The static system prompt is marked cacheable so repeated calls
        # reuse its input tokens.
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post("https://api.anthropic.com/v1/messages", headers=headers, json=payload, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    content = data.get("content", [])
    # Fix: the previous data.get("content", [{}])[0] raised IndexError
    # when the API returned "content": [] — guard like the Pass 0a script.
    text = content[0].get("text", "") if content else ""
    return text, data.get("usage", {}), None
def parse_json_object(text):
    """Decode *text* as JSON; on failure, retry on the outermost {...} span.

    Returns the decoded value, or None when nothing parses.
    """
    candidates = [text]
    # LLM replies often wrap the JSON in prose — pull out the braced span
    # as a fallback candidate.
    braced = re.search(r"\{[\s\S]*\}", text)
    if braced is not None:
        candidates.append(braced.group())
    for candidate in candidates:
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            continue
    return None
def generate_control_id(domain, cur):
    """Allocate the next sequential control_id for *domain*.

    Looks up the highest numeric suffix among existing "<PREFIX>-<n>"
    control_ids (PREFIX = first four chars of *domain*, upper-cased) and
    returns the next one, zero-padded to three digits.

    cur: an open DB cursor exposing execute()/fetchone().
    """
    prefix = domain.upper()[:4]
    cur.execute("""
    SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
    FROM compliance.canonical_controls
    WHERE control_id LIKE %s
    AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    next_num = (row[0] + 1) if row and row[0] is not None else 1
    # Fix: zero-pad the increment to match the "-001" seed format; the
    # previous unpadded f-string produced mixed ids like "DSGV-2" next
    # to "DSGV-001".
    return f"{prefix}-{next_num:03d}"
def main():
    """Run the Pass 0b preview end-to-end.

    Loads Pass 0a obligation results from JSON, picks a small diverse
    sample, asks the LLM to compose one atomic control per obligation,
    prints the result, and inserts each control into
    compliance.canonical_controls with parent_control_uuid pointing at
    the originating control.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="/tmp/pass0a_results_60controls.json")
    parser.add_argument("--limit", type=int, default=3, help="Number of obligations to process")
    parser.add_argument("--control", type=str, help="Pick obligations from this control_id")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    # The API key is only required when we actually call the LLM.
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load Pass 0a results
    with open(args.input) as f:
        obligations = json.load(f)
    # Filter: keep only obligations that passed the Pass 0a quality gate.
    obligations = [o for o in obligations if o.get("passed", False)]
    if args.control:
        obligations = [o for o in obligations if o["control_id"] == args.control]
    # Pick a diverse sample: one obligation per distinct type first...
    picked = []
    seen_types = set()
    for o in obligations:
        otype = o["obligation_type"]
        if otype not in seen_types and len(picked) < args.limit:
            picked.append(o)
            seen_types.add(otype)
    # ...then fill the remaining slots in input order.
    for o in obligations:
        if o not in picked and len(picked) < args.limit:
            picked.append(o)
    if not picked:
        print("No obligations found.")
        return
    # Connect to DB
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()
    # Fetch parent-control metadata for every picked obligation.
    ctrl_ids = list(set(o["control_id"] for o in picked))
    cur.execute("""
    SELECT control_id, id, title, category, source_citation
    FROM compliance.canonical_controls
    WHERE control_id = ANY(%s)
    """, (ctrl_ids,))
    ctrl_map = {}
    for row in cur.fetchall():
        # source_citation may arrive as a dict (jsonb), a JSON string, or NULL.
        sc = row[4] if isinstance(row[4], dict) else (json.loads(row[4]) if row[4] else {})
        # Derive domain prefix from control_id (e.g. "DSGV" from "DSGV-001")
        prefix = row[0].split("-")[0] if "-" in row[0] else "COMP"
        ctrl_map[row[0]] = {
            "uuid": str(row[1]), "title": row[2], "category": row[3] or "",
            "source_ref": f"{sc.get('source', '')} {sc.get('article', '')}",
            "domain": prefix,
        }
    print("=" * 70)
    print(f"Pass 0b Preview — {len(picked)} Obligations → Atomic Controls")
    print("=" * 70)
    created = []
    for i, obl in enumerate(picked, 1):
        ctrl = ctrl_map.get(obl["control_id"], {})
        print(f"\n{''*70}")
        print(f"[{i}/{len(picked)}] {obl['control_id']}: [{obl['obligation_type'].upper()}]")
        print(f" Obligation: {obl['obligation_text'][:120]}")
        print(f" Parent: {ctrl.get('title', 'N/A')}")
        if args.dry_run:
            print(" [DRY RUN]")
            continue
        prompt = build_pass0b_prompt(
            obl["obligation_text"], obl["action"], obl["object"],
            ctrl.get("title", ""), ctrl.get("category", ""),
            ctrl.get("source_ref", ""),
        )
        t0 = time.time()
        resp_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue
        result = parse_json_object(resp_text)
        if not result:
            print(f" PARSE ERROR: {resp_text[:200]}")
            continue
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        print(f" LLM: {elapsed:.1f}s | {in_tok} in / {out_tok} out")
        # Allocate a sequential control_id in the parent's domain.
        domain = ctrl.get("domain", "COMP")
        new_control_id = generate_control_id(domain, cur)
        # Show result
        print(f"\n === ATOMIC CONTROL: {new_control_id} ===")
        print(f" Titel: {result.get('title', 'N/A')}")
        print(f" Ziel: {result.get('objective', 'N/A')}")
        print(f" Typ: {obl['obligation_type']}")
        reqs = result.get("requirements", [])
        if reqs:
            print(f" Anforderungen:")
            for r in reqs:
                print(f" - {r}")
        tests = result.get("test_procedure", [])
        if tests:
            print(f" Pruefverfahren:")
            for t in tests:
                print(f" - {t}")
        evidence = result.get("evidence", [])
        if evidence:
            print(f" Nachweise:")
            for e in evidence:
                print(f" - {e}")
        print(f" Severity: {result.get('severity', 'medium')}")
        print(f" Category: {result.get('category', 'governance')}")
        # Write to DB
        new_uuid = str(uuid.uuid4())
        parent_uuid = ctrl.get("uuid")
        source_cit = {}
        if ctrl.get("source_ref"):
            # source_ref is "<source> <article>"; split back into parts.
            parts = ctrl["source_ref"].strip().split(" ", 1)
            source_cit = {"source": parts[0], "article": parts[1] if len(parts) > 1 else ""}
        # NOTE(review): framework_id subselect grabs an arbitrary framework
        # row — assumes a single canonical framework exists; confirm.
        cur.execute("""
    INSERT INTO compliance.canonical_controls (
    id, control_id, title, objective, requirements, test_procedure,
    evidence, severity, category, release_state,
    source_citation, generation_metadata, generation_strategy,
    pipeline_version, parent_control_uuid, framework_id
    ) VALUES (
    %s, %s, %s, %s, %s, %s,
    %s, %s, %s, %s,
    %s, %s, %s,
    %s, %s,
    (SELECT id FROM compliance.canonical_control_frameworks LIMIT 1)
    )
    """, (
            new_uuid, new_control_id,
            result.get("title", ""),
            result.get("objective", ""),
            json.dumps(result.get("requirements", []), ensure_ascii=False),
            json.dumps(result.get("test_procedure", []), ensure_ascii=False),
            json.dumps(result.get("evidence", []), ensure_ascii=False),
            result.get("severity", "medium"),
            result.get("category", "governance"),
            "draft",
            psycopg2.extras.Json(source_cit),
            psycopg2.extras.Json({
                "obligation_type": obl["obligation_type"],
                "obligation_text": obl["obligation_text"],
                "pass0b_model": ANTHROPIC_MODEL,
                "decomposition_method": "pass0b_preview",
            }),
            "pass0b_atomic",
            6,  # pipeline_version
            parent_uuid,
        ))
        # Commit per control so partial runs keep their results.
        conn.commit()
        created.append({
            "control_id": new_control_id,
            "title": result.get("title", ""),
            "obligation_type": obl["obligation_type"],
            "parent_control_id": obl["control_id"],
        })
        print(f" ✓ Geschrieben: {new_control_id} (parent: {obl['control_id']})")
        # Brief pause between calls to stay under API rate limits.
        time.sleep(0.5)
    if created:
        print(f"\n{'='*70}")
        print(f"ERGEBNIS: {len(created)} atomare Controls erstellt")
        print(f"{'='*70}")
        for c in created:
            print(f" {c['control_id']}: {c['title']} [{c['obligation_type']}] (von {c['parent_control_id']})")
    conn.close()
if __name__ == "__main__":
    main()