feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped

- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 11:56:08 +01:00
parent c52dbdb8f1
commit 643b26618f
28 changed files with 5781 additions and 75 deletions

View File

@@ -9,6 +9,7 @@ import {
import {
CanonicalControl, EFFORT_LABELS, BACKEND_URL,
SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge,
ObligationTypeBadge, GenerationStrategyBadge,
VERIFICATION_METHODS, CATEGORY_OPTIONS,
} from './helpers'
@@ -125,6 +126,8 @@ export function ControlDetail({
<VerificationMethodBadge method={ctrl.verification_method} />
<CategoryBadge category={ctrl.category} />
<TargetAudienceBadge audience={ctrl.target_audience} />
<GenerationStrategyBadge strategy={ctrl.generation_strategy} />
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
</div>
<h2 className="text-lg font-semibold text-gray-900 mt-1">{ctrl.title}</h2>
</div>
@@ -239,6 +242,32 @@ export function ControlDetail({
</section>
)}
{/* Parent Control (atomare Controls) */}
{ctrl.parent_control_uuid && (
<section className="bg-violet-50 border border-violet-200 rounded-lg p-4">
<div className="flex items-center gap-2 mb-1">
<GitMerge className="w-4 h-4 text-violet-600" />
<h3 className="text-sm font-semibold text-violet-900">Atomares Control</h3>
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
</div>
<p className="text-sm text-violet-800">
Abgeleitet aus Eltern-Control{' '}
<span className="font-mono font-semibold text-purple-700 bg-purple-100 px-1.5 py-0.5 rounded">
{ctrl.parent_control_id || ctrl.parent_control_uuid}
</span>
{ctrl.parent_control_title && (
<span className="text-violet-700 ml-1"> {ctrl.parent_control_title}</span>
)}
</p>
{ctrl.generation_metadata?.obligation_text && (
<p className="text-xs text-violet-600 mt-2 bg-violet-100/50 rounded p-2">
Obligation: {String(ctrl.generation_metadata.obligation_text).slice(0, 300)}
{String(ctrl.generation_metadata.obligation_text).length > 300 ? '...' : ''}
</p>
)}
</section>
)}
{/* Impliziter Gesetzesbezug (Rule 3 — reformuliert, kein Originaltext) */}
{!ctrl.source_citation && ctrl.open_anchors.length > 0 && (
<section className="bg-amber-50 border border-amber-200 rounded-lg p-3">
@@ -297,7 +326,7 @@ export function ControlDetail({
</section>
)}
{/* Evidence */}
{/* Evidence — handles both {type, description} objects and plain strings */}
{ctrl.evidence.length > 0 && (
<section>
<h3 className="text-sm font-semibold text-gray-900 mb-2">Nachweise</h3>
@@ -305,7 +334,11 @@ export function ControlDetail({
{ctrl.evidence.map((ev, i) => (
<div key={i} className="flex items-start gap-2 text-sm text-gray-700">
<FileText className="w-4 h-4 text-gray-400 flex-shrink-0 mt-0.5" />
<div><span className="font-medium">{ev.type}:</span> {ev.description}</div>
{typeof ev === 'string' ? (
<div>{ev}</div>
) : (
<div><span className="font-medium">{ev.type}:</span> {ev.description}</div>
)}
</div>
))}
</div>
@@ -359,7 +392,18 @@ export function ControlDetail({
<h3 className="text-sm font-semibold text-gray-700">Generierungsdetails (intern)</h3>
</div>
<div className="text-xs text-gray-600 space-y-1">
<p>Pfad: {String(ctrl.generation_metadata.processing_path || '-')}</p>
{ctrl.generation_metadata.processing_path && (
<p>Pfad: {String(ctrl.generation_metadata.processing_path)}</p>
)}
{ctrl.generation_metadata.decomposition_method && (
<p>Methode: {String(ctrl.generation_metadata.decomposition_method)}</p>
)}
{ctrl.generation_metadata.pass0b_model && (
<p>LLM: {String(ctrl.generation_metadata.pass0b_model)}</p>
)}
{ctrl.generation_metadata.obligation_type && (
<p>Obligation-Typ: {String(ctrl.generation_metadata.obligation_type)}</p>
)}
{ctrl.generation_metadata.similarity_status && (
<p className="text-red-600">Similarity: {String(ctrl.generation_metadata.similarity_status)}</p>
)}

View File

@@ -30,7 +30,7 @@ export interface CanonicalControl {
}
requirements: string[]
test_procedure: string[]
evidence: EvidenceItem[]
evidence: (EvidenceItem | string)[]
severity: string
risk_score: number | null
implementation_effort: string | null
@@ -47,6 +47,10 @@ export interface CanonicalControl {
target_audience: string | string[] | null
generation_metadata?: Record<string, unknown> | null
generation_strategy?: string | null
parent_control_uuid?: string | null
parent_control_id?: string | null
parent_control_title?: string | null
decomposition_method?: string | null
created_at: string
updated_at: string
}
@@ -275,7 +279,26 @@ export function GenerationStrategyBadge({ strategy }: { strategy: string | null
if (strategy === 'document_grouped') {
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-emerald-100 text-emerald-700">v2</span>
}
return null
if (strategy === 'phase74_gap_fill') {
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">v5 Gap</span>
}
if (strategy === 'pass0b_atomic') {
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-violet-100 text-violet-700">Atomar</span>
}
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-500">{strategy}</span>
}
// Lookup table for obligation-type badges (3-tier classification from Pass 0a).
// Keys match the lowercase `obligation_type` values stored in generation_metadata;
// `bg` carries the Tailwind color classes, `label` the German display text.
export const OBLIGATION_TYPE_CONFIG: Record<string, { bg: string; label: string }> = {
  pflicht:    { label: 'Pflicht',    bg: 'bg-red-100 text-red-700' },
  empfehlung: { label: 'Empfehlung', bg: 'bg-amber-100 text-amber-700' },
  kann:       { label: 'Kann',       bg: 'bg-green-100 text-green-700' },
}
// Renders a small colored pill for an obligation type (Pflicht/Empfehlung/Kann).
// Returns null for missing types and for values not present in
// OBLIGATION_TYPE_CONFIG, so callers can pass generation_metadata fields directly.
export function ObligationTypeBadge({ type }: { type: string | null | undefined }) {
  const cfg = type ? OBLIGATION_TYPE_CONFIG[type] : undefined
  if (!cfg) return null
  return <span className={`inline-flex items-center px-2 py-0.5 rounded text-xs font-medium ${cfg.bg}`}>{cfg.label}</span>
}
export function getDomain(controlId: string): string {

View File

@@ -9,7 +9,7 @@ import {
import {
CanonicalControl, Framework, BACKEND_URL, EMPTY_CONTROL,
SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge,
GenerationStrategyBadge,
GenerationStrategyBadge, ObligationTypeBadge,
VERIFICATION_METHODS, CATEGORY_OPTIONS, TARGET_AUDIENCE_OPTIONS,
} from './components/helpers'
import { ControlForm } from './components/ControlForm'
@@ -762,6 +762,7 @@ export default function ControlLibraryPage() {
<CategoryBadge category={ctrl.category} />
<TargetAudienceBadge audience={ctrl.target_audience} />
<GenerationStrategyBadge strategy={ctrl.generation_strategy} />
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
{ctrl.risk_score !== null && (
<span className="text-xs text-gray-400">Score: {ctrl.risk_score}</span>
)}

View File

@@ -174,6 +174,9 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
customer_visible, verification_method, category,
target_audience, generation_metadata, generation_strategy,
applicable_industries, applicable_company_size, scope_conditions,
parent_control_uuid, decomposition_method, pipeline_version,
(SELECT p.control_id FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_id,
(SELECT p.title FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_title,
created_at, updated_at"""
@@ -798,6 +801,11 @@ def _control_row(r) -> dict:
"applicable_industries": getattr(r, "applicable_industries", None),
"applicable_company_size": getattr(r, "applicable_company_size", None),
"scope_conditions": getattr(r, "scope_conditions", None),
"parent_control_uuid": str(r.parent_control_uuid) if getattr(r, "parent_control_uuid", None) else None,
"parent_control_id": getattr(r, "parent_control_id", None),
"parent_control_title": getattr(r, "parent_control_title", None),
"decomposition_method": getattr(r, "decomposition_method", None),
"pipeline_version": getattr(r, "pipeline_version", None),
"created_at": r.created_at.isoformat() if r.created_at else None,
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
}

View File

@@ -200,6 +200,9 @@ def _get_tenant_id(tenant_id: Optional[str]) -> str:
def _dsfa_to_response(row) -> dict:
"""Convert a DB row to a JSON-serializable dict."""
import json
# SQLAlchemy 2.0: Row objects need ._mapping for string-key access
if hasattr(row, "_mapping"):
row = row._mapping
def _parse_arr(val):
"""Parse a JSONB array field → list."""
@@ -558,8 +561,9 @@ async def create_dsfa(
).fetchone()
db.flush()
row_id = row._mapping["id"] if hasattr(row, "_mapping") else row[0]
_log_audit(
db, tid, row["id"], "CREATE", request.created_by,
db, tid, row_id, "CREATE", request.created_by,
new_values={"title": request.title, "status": request.status},
)
db.commit()

View File

@@ -0,0 +1,73 @@
-- Migration 074: Control Dedup Engine — DB Schema
-- Supports the 4-stage dedup pipeline for atomic controls (Pass 0b).
--
-- Tables:
-- 1. control_parent_links — M:N parent linking (one control → many regulations)
-- 2. control_dedup_reviews — Review queue for borderline matches (0.85-0.92)
BEGIN;
-- =============================================================================
-- 1. Control Parent Links (M:N)
-- Enables "1 Control erfuellt 5 Gesetze" — the biggest USP.
-- An atomic control can have multiple parent controls from different
-- regulations/obligations. This replaces the 1:1 parent_control_uuid FK.
-- =============================================================================
CREATE TABLE IF NOT EXISTS control_parent_links (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Both ends reference canonical_controls; cascade so links vanish with either control.
control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
-- NOTE(review): link_type semantics assumed from the names (Pass 0b split,
-- dedup merge, manual curation, framework crosswalk) — confirm with pipeline docs.
link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition'
CHECK (link_type IN ('decomposition', 'dedup_merge', 'manual', 'crosswalk')),
-- Match confidence in [0, 1]; defaults to certain (1.0) for non-dedup links.
confidence NUMERIC(3,2) DEFAULT 1.0
CHECK (confidence >= 0 AND confidence <= 1),
source_regulation VARCHAR(100),
source_article VARCHAR(100),
-- No ON DELETE action: deleting an obligation candidate with links will fail (NO ACTION).
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
created_at TIMESTAMPTZ DEFAULT NOW(),
-- At most one link per (child, parent) pair, regardless of link_type.
CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
);
CREATE INDEX IF NOT EXISTS idx_cpl_control ON control_parent_links(control_uuid);
CREATE INDEX IF NOT EXISTS idx_cpl_parent ON control_parent_links(parent_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cpl_type ON control_parent_links(link_type);
COMMENT ON TABLE control_parent_links IS
'M:N parent links — one atomic control can fulfill multiple regulations/obligations. USP: "1 Control erfuellt 5 Gesetze"';
-- =============================================================================
-- 2. Control Dedup Reviews
-- Queue for borderline matches (similarity 0.85-0.92) that need human review.
-- Reviewed entries get status updated to accepted/rejected.
-- =============================================================================
CREATE TABLE IF NOT EXISTS control_dedup_reviews (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Candidate is stored denormalized (id/title/objective) — it may not exist
-- in canonical_controls yet, so no FK on the candidate side.
candidate_control_id VARCHAR(30) NOT NULL,
candidate_title TEXT NOT NULL,
candidate_objective TEXT,
matched_control_uuid UUID REFERENCES canonical_controls(id),
matched_control_id VARCHAR(30),
-- NUMERIC(4,3): similarity with 3 decimal places; review band is 0.85-0.92.
similarity_score NUMERIC(4,3) DEFAULT 0.0,
dedup_stage VARCHAR(40) NOT NULL,
dedup_details JSONB DEFAULT '{}',
parent_control_uuid UUID REFERENCES canonical_controls(id),
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
-- Reviewer outcome: accepted_link (link to match), accepted_new (keep as new
-- control), or rejected; entries start as pending.
review_status VARCHAR(20) DEFAULT 'pending'
CHECK (review_status IN ('pending', 'accepted_link', 'accepted_new', 'rejected')),
reviewed_by VARCHAR(100),
reviewed_at TIMESTAMPTZ,
review_notes TEXT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_cdr_status ON control_dedup_reviews(review_status);
CREATE INDEX IF NOT EXISTS idx_cdr_matched ON control_dedup_reviews(matched_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cdr_parent ON control_dedup_reviews(parent_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cdr_stage ON control_dedup_reviews(dedup_stage);
COMMENT ON TABLE control_dedup_reviews IS
'Review queue for borderline dedup matches (similarity 0.85-0.92). Human decides: link or new control.';
COMMIT;

View File

@@ -195,6 +195,11 @@ class TestControlRowConversion:
"release_state": "draft",
"tags": ["mfa"],
"generation_strategy": "ungrouped",
"parent_control_uuid": None,
"parent_control_id": None,
"parent_control_title": None,
"decomposition_method": None,
"pipeline_version": None,
"created_at": now,
"updated_at": now,
}

View File

@@ -2,7 +2,23 @@
## Übersicht
Die Control Quality Pipeline prüft und verbessert die ~9.000 Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert.
Die Control Quality Pipeline prüft und verbessert die Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert.
Alle Scripts liegen in **`scripts/qa/`**. Starten auf dem Mac Mini via Runner-Script:
```bash
# Job starten (laedt .env automatisch, PID-Lock, unbuffered output)
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh <script.py> [args...]"
# Status aller Jobs
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --status"
# Log ansehen
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --log <script.py>"
# Job stoppen
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --kill <script.py>"
```
## Architektur
@@ -55,20 +71,24 @@ Jeder Control hat ein Feld `source_original_text` — der Chunk-Text aus dem Que
| Metrik | Wert |
|---|---|
| Controls mit source_original_text | 7.943 |
| Im PDF lokalisiert | **6.259 (79%)** |
| Nicht gefunden (Sprachmismatch) | 1.651 |
| Kein PDF vorhanden | 33 |
| 100% Match-Rate | 19 Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53) |
| Controls mit source_original_text | 5.751 (86%) |
| Im PDF lokalisiert | **5.063 (88%)** |
| Nicht gefunden | 649 |
| Kein PDF vorhanden | 29 |
| Recital_suspect markiert | 648 |
| 100% Match-Rate | 20+ Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53, Blue Guide) |
**Verlauf:** v1 (4.110, 52%) → v2 (6.091, 77%) → v3 (6.259, 79%) → v4 +Blue Guide EN (6.803, 86%) → v5 nach Cleanup (5.063/5.751, 88%)
### Nicht-matchende Controls
| Ursache | Controls | Erklärung |
| Ursache | Controls | Status |
|---|---|---|
| Blue Guide EN vs. DE PDF | ~562 | Controls aus englischem PDF, wir haben nur deutsches |
| OWASP multilingual | ~632 | Controls aus PT/AR/ID/ES-Übersetzungen |
| ~~Blue Guide EN vs. DE PDF~~ | ~~562~~ | ✅ Gelöst — EN-PDF beschafft, 544/544 gematcht |
| ~~OWASP Top 10 multilingual~~ | ~~324~~ | ✅ Als duplicate markiert — Übersetzungen ohne Mehrwert |
| CRA Encoding | ~76 | PDF-Ligaturen/Sonderzeichen-Differenzen |
| CISA Secure by Design | ~113 | Falsches PDF (ENISA statt CISA) |
| OWASP ASVS | ~173 | PDF-Matching-Problem (meist EN) |
## Brute-Force-Suche
@@ -100,34 +120,276 @@ Controls aus Erwägungsgründen (`article_type = preamble`) sind **kein Nachteil
Die 1.195 v1-Controls **ohne** Originaltext sind manuell erstellt (`strategy=ungrouped`) und haben keine Chunk-Referenz.
## DB-Status (Stand 2026-03-20)
## OWASP Cleanup (2026-03-20)
- **324 OWASP Top 10 multilingual Controls** → `duplicate` markiert (ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien)
- **47 Controls** mit falscher Quellenzuordnung korrigiert (z.B. als "OWASP Top 10" getaggt, aber tatsächlich aus ASVS/SAMM/API/MASVS)
- **~200 OWASP ASVS/SAMM/MASVS EN Controls** behalten — unique Content aus GitHub/Website, nicht im PDF auffindbar
## NIST OSCAL Import (2026-03-20)
**776 neue Controls** aus NIST SP 800-53 Rev 5 OSCAL (Public Domain, maschinenlesbar):
- Quelle: `usnistgov/oscal-content` (JSON Catalog)
- Vor allem **Control Enhancements** (z.B. AC-2(3), SC-7(8)) — die atomaren Unteranforderungen
- Jeder Control enthält: Statement + Guidance + Assessment-Methoden + Cross-References + Parameters
- `pipeline_version = 4`, `generation_strategy = 'oscal_import'`
- Kein Pass 0a/0b nötig — Controls sind **bereits atomar**
| Metrik | Vorher | Nachher |
|---|---|---|
| SP 800-53 Controls (aktiv) | 1.107 | **1.883** |
| OSCAL-Abdeckung | 238/1.014 (23%) | **1.014/1.014 (100%)** |
## Phase 5: RAG-Deduplizierung + Normalisierung (2026-03-20)
### Durchgeführte Schritte
| Schritt | Beschreibung | Controls |
|---|---|---|
| 5.1 | OSCAL Controls: `source_regulation` in generation_metadata gesetzt | 776 |
| 5.2 | v3 Controls ohne Source → `needs_review` mit `missing_source` Flag | 20 |
| 5.3 | Leerer Source-Name korrigiert (AT TKG) | 1 |
| 5.4 | OWASP regulation_code Fehlzuordnungen korrigiert | 47 |
| 5.5 | **duplicate/too_close Controls hart gelöscht** | **3.301** |
| 5.6 | Processed Chunks bereinigt (gelöschte Control-IDs entfernt) | 2.520 |
### Ergebnis
- **Vorher:** 9.936 Controls (6.635 aktiv, 2.998 duplicate, 303 too_close)
- **Nachher:** 6.635 Controls, **alle aktiv** (0 duplicate/too_close)
- Alle regulation_codes haben jetzt einheitliche Source-Namen
- OWASP-Controls sind korrekt ihren Quellen zugeordnet
## DB-Status (Stand 2026-03-20, nach Phase 7.4)
| release_state | Count |
|---|---|
| draft | 5.365 |
| needs_review | 818 |
| duplicate | 2.674 |
| too_close | 303 |
| **Aktiv** | **6.183** |
| draft | ~6.030 |
| needs_review | 838 |
| **Gesamt** | **6.868** |
## Scripts
## Scripts (`scripts/qa/`)
Alle QA-Scripts liegen in `scripts/qa/`:
### Kern-QA (PDF-Matching)
| Script | Beschreibung |
|---|---|
| `pdf_qa_all.py` | Haupt-QA: Controls gegen PDFs matchen |
| `pdf_qa_inventory.py` | Inventar: Regulations, Controls, PDFs |
| `apply_pdf_qa_results.py` | Ergebnisse in DB schreiben |
| `preamble_dedup.py` | Preamble vs. Artikel Duplikat-Erkennung |
| `qa_dedup_controls.py` | Jaccard-basierte Titel-Dedup |
| `qa_normalize_sources.py` | Source-Namen normalisieren |
| `db_status.py` | DB-Status-Übersicht |
| `pdf_qa_all.py` | **Haupt-QA**: Controls gegen PDFs matchen, Artikel-Index aufbauen. Enthaelt `SOURCE_FILE_MAP`, alle Index-Builder (EU, DE, NIST, OWASP, generic). 526 Zeilen. |
| `pdf_qa_inventory.py` | Inventar: Welche Regulations haben Controls, wie viele, welche PDFs existieren |
| `apply_pdf_qa_results.py` | Ergebnisse aus `pdf_qa_all.py` in DB schreiben (`article_type`, `recital_suspect`) |
| `pdf_article_lookup_poc.py` | POC: Control-Text in PDF lokalisieren, Headings von Cross-Refs unterscheiden |
## Nächste Schritte
### Lueckenanalyse + Control-Generierung
1. **Blue Guide EN-PDF** beschaffen → +562 Controls matchen
2. **CISA Secure by Design** echtes PDF finden → +113 Controls
3. **Brute-Force Ergebnisse anwenden** — 44 falsche Source-Zuordnungen korrigieren
4. **Frontend-Anzeige** — `article_type` im Control-Detail anzeigen
5. **Continuous QA** — Bei neuen Controls automatisch PDF-Match prüfen
| Script | Beschreibung |
|---|---|
| `gap_analysis.py` | **Phase 7.3**: Artikel im PDF vs. Controls in DB vergleichen, Luecken identifizieren |
| `phase74_generate_gap_controls.py` | **Phase 7.4**: Neue Controls fuer Luecken via Anthropic API generieren. `pipeline_version=5`. 624 Zeilen. |
| `benchmark_llm_controls.py` | LLM-Vergleich: gpt-oss-120b vs. Claude Sonnet fuer Control-Generierung |
| `test_pass0a.py` | **Pass 0a Test**: Obligation Extraction + 3-Tier-Klassifizierung (Pflicht/Empfehlung/Kann). Standalone, speichert JSON. |
### Deduplizierung + Normalisierung
| Script | Beschreibung |
|---|---|
| `preamble_dedup.py` | Preamble vs. Artikel Duplikat-Erkennung (Jaccard >= 0.40) |
| `qa_dedup_controls.py` | Jaccard-basierte Titel-Deduplizierung |
| `qa_apply_and_dedup.py` | Ergebnisse anwenden + Duplikate in einem Schritt markieren |
| `qa_normalize_sources.py` | Source-Namen normalisieren (kanonische Namen) |
| `phase5_normalize_and_cleanup.py` | **Phase 5**: Normalisierung + 3.301 Duplikate hart loeschen |
| `qa_delete_gpsr_dupe.py` | GPSR-Duplikate loeschen |
| `delete_gpsr_prod.py` | GPSR-Duplikate aus Production-Qdrant entfernen |
### Quellen-spezifische Scripts
| Script | Beschreibung |
|---|---|
| `blue_guide_en_match.py` | Blue Guide EN-PDF matchen (544/544 Erfolg) |
| `owasp_cleanup.py` | OWASP multilingual Cleanup (324 Duplikate) + Source-Fix (47 korrigiert) |
| `owasp_github_match.py` | OWASP ASVS/SAMM/MASVS gegen GitHub-Markdown matchen |
| `oscal_import.py` | NIST OSCAL Import (776 Controls aus JSON Catalog) |
| `oscal_analysis.py` | NIST OSCAL Analyse: Abdeckung, fehlende Controls |
### Diagnose + Utilities
| Script | Beschreibung |
|---|---|
| `db_status.py` | DB-Status: release_state Counts, pipeline_version, source Verteilung |
| `debug_low_match.py` | Debugging: Warum matchen Blue Guide / OWASP / CISA schlecht? |
| `qa_article_map_all_chunks.py` | Alle Chunks Artikel-Nummern zuordnen (Bulk) |
| `backfill_job_66228863.py` | Einmaliger Backfill-Job |
| `sync_controls_to_prod.py` | Controls von Dev nach Production synchronisieren |
### Runner
| Script | Beschreibung |
|---|---|
| `run_job.sh` | **Job-Runner**: Laedt `.env`, PID-Lock, Monitoring (`--status`, `--log`, `--kill`) |
## Phase 7: PDF-Validierung + Enrichment (2026-03-20)
### 7.1 + 7.2: Controls gegen PDFs validiert + Ergebnisse angewendet ✅
- 5.063 Controls erfolgreich im Original-PDF lokalisiert (88%)
- `article_type` fuer alle gematchten Controls gesetzt
- 648 Preamble-Controls als `recital_suspect` in `generation_metadata` markiert
- 332 Controls nicht matchbar (OWASP ASVS 132, CISA 72, ENISA 38, OWASP SAMM 31, CRA 28)
### 7.3: Lueckenanalyse ✅
**494 Artikel-Luecken** in 15 Quellen identifiziert. Geschaetzt ~300 davon actionable.
| Source | Luecken | Coverage | Bemerkung |
|---|---:|---:|---|
| AML-Verordnung | 91 | 5% | Kaum ingestiert |
| MiCA | 71 | 52% | Grosse Verordnung |
| NIST SP 800-53 | 59 | 83% | Meist Section-Header, nur SA-15 fehlt |
| OWASP ASVS 4.0 | 47 | 35% | Requirement-Gruppen fehlen |
| Batterieverordnung | 41 | 58% | |
| DSGVO | 35 | 65% | Einige Governance/Aufsicht-Artikel |
| ENISA ICS/SCADA | 34 | 31% | |
| ENISA Supply Chain | 26 | 7% | |
| CRA | 23 | 68% | |
| NIS2 | 16 | 65% | |
| KI-Verordnung | 15 | 87% | Fast komplett |
| Maschinenverordnung | 5 | 91% | Fast komplett |
### 7.4: Neue Controls fuer Luecken generieren ✅ (2026-03-20)
Script: `phase74_generate_gap_controls.py --resume`
- **494 Artikel-Luecken** in 15 Quellen → Anthropic Claude Sonnet 4.6
- `pipeline_version = 5`, `generation_strategy = 'phase74_gap_fill'`
- Direkt PDF-Text als Input (nicht RAG-Chunks)
- Starten via: `run_job.sh phase74_generate_gap_controls.py --resume`
**Ergebnis:**
| Source | Luecken | Generiert |
|---|---:|---:|
| AML-Verordnung | 91 | 97 |
| MiCA | 71 | 68 |
| NIST SP 800-53 | 59 | 19 |
| KI-Verordnung | 15 | 15 |
| OWASP ASVS 4.0 | 47 | 11 |
| Batterieverordnung | 41 | 9 |
| DSGVO | 35 | 4 |
| OWASP Top 10 | 12 | 3 |
| NIS2 | 16 | 3 |
| CRA | 23 | 3 |
| OECD KI-Empfehlung | 4 | 1 |
| **Gesamt** | **494** | **233** |
Nicht generiert: 75 wegen zu kurzem Text, 29 NIST-Intros, 11 Parse-Errors, 162 ID-Konflikte (COMP-1000 etc.).
API-Kosten: ~$7,55 (109 min Laufzeit).
## Pass 0a: Obligation Extraction — 3-Tier-Klassifizierung
### Konzept
Pass 0a zerlegt Rich Controls (~6.000) in **atomare Obligations** per LLM (Claude Sonnet 4.6).
Jede Obligation wird durch den **Quality Gate** klassifiziert — nicht gefiltert:
| obligation_type | Signal | Beispiel |
|---|---|---|
| **pflicht** | müssen, muss, ist zu, hat zu, shall, must, required | "Der Betreiber muss alle Daten verschluesseln" |
| **empfehlung** | soll, sollen, should, sicherstellen, gewaehrleisten, dokumentieren | "Der Betreiber soll regelmaessige Audits durchfuehren" |
| **kann** | kann, koennen, darf, duerfen, may, optional | "Der Betreiber kann zusaetzliche Massnahmen ergreifen" |
**Wichtig:** Nichts wird mehr rejected wegen fehlendem normativem Signal. Obligations ohne Signal werden als `empfehlung` klassifiziert. Rejected werden nur noch: Evidence-Only, zu kurz (<20 Zeichen), fehlender Parent-Link.
### Warum auch Empfehlungen behalten?
Empfehlungen helfen Firmen, ihre Systeme sicherer zu machen — ueber das Pflichtprogramm hinaus. Im Frontend erhalten Kunden einen Marker, der klar anzeigt:
- **Pflicht** = gesetzlich/regulatorisch vorgeschrieben
- **Empfehlung** = Best Practice, freiwillig, aber wertvoll
- **Kann** = optional, weitergehende Massnahme
### Quality Gate — Kritische Flags
| Flag | Kritisch? | Beschreibung |
|---|---|---|
| `has_normative_signal` | Nein | Informativer Check, kein Ablehnungsgrund |
| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) |
| `not_evidence_only` | **Ja** | Kein reiner Nachweis-Eintrag |
| `min_length` | **Ja** | Mindestens 20 Zeichen |
| `has_parent_link` | **Ja** | Verbindung zum Parent-Control |
| `single_action` | Nein | Nur ein Hauptverb (heuristisch) |
| `not_rationale` | Nein | Keine reine Begruendung |
### Normative Signal Detection — Regex-Tiers
```
Tier 1 (Pflicht): muessen, muss, ist/sind/hat/haben zu + Infinitiv,
Compound-Verben (festzustellen, vorzunehmen),
Gerundivum (mitzuteilen, bereitzustellen),
shall, must, required
Tier 2 (Empfehlung): soll, sollen, sollte, sollten,
gewaehrleisten, sicherstellen,
should, ensure, recommend,
dokumentieren, implementieren, ueberpruefen
Tier 3 (Kann): kann, koennen, darf, duerfen, may, optional
```
### Testergebnisse (3 Iterationen, 2026-03-20)
| Run | Controls | Obligations | Validated | Rejected | Kosten |
|---|---:|---:|---:|---:|---:|
| 1 (v0 Regex) | 10 | ~100 | 68% | 32% | $0,28 |
| 2 (v1 Regex) | 50 | ~530 | 78% | 22% | $1,43 |
| 3 (v2 Regex) | 50 | ~530 | 86% | 14% | $1,44 |
| 4 (3-Tier) | 60 | — | — | — | — |
Run 4 laeuft mit dem neuen Klassifizierer — statt PASS/REJECT wird jetzt PFLICHT/EMPFEHLUNG/KANN ausgegeben.
### Scripts
| Script | Beschreibung |
|---|---|
| `test_pass0a.py` | **Test-Script**: Standalone (kein SQLAlchemy), psycopg2 + Anthropic API. Speichert Ergebnisse als JSON. |
```bash
# Test mit 10 Controls
run_job.sh test_pass0a.py --limit 10
# Test mit bestimmter Quelle
run_job.sh test_pass0a.py --limit 20 --source "DSGVO"
# Ergebnisse: /tmp/pass0a_results_<N>controls.json
```
### Backend-Code
- **Klassifizierung:** `backend-compliance/compliance/services/decomposition_pass.py`
- `classify_obligation_type()` — 3-Tier-Klassifizierung
- `quality_gate()` — gibt `obligation_type` in Flags zurueck
- `passes_quality_gate()` — `has_normative_signal` nicht mehr kritisch
- `ObligationCandidate.obligation_type` — neues Feld
### Hochrechnung (basierend auf 50-Control-Runs)
| Metrik | Wert |
|---|---|
| Kosten pro Control | ~$0,029 |
| Kosten fuer ~6.000 Controls | **~$172** |
| Laufzeit (geschaetzt) | ~25h |
| Obligations pro Control | ~10,5 |
---
## Naechste Schritte
1. ~~**Phase 5 Cleanup** → 3.301 Duplikate geloescht, Source normalisiert~~
2. ~~**Phase 6 Pipeline-Haertung** → Source aus REGULATION_LICENSE_MAP~~
3. ~~**Phase 7.1-7.3** → PDF-Validierung + Enrichment + Lueckenanalyse~~
4. ~~**Phase 7.4** → 233 neue Controls fuer Luecken generiert ($7,55)~~
5. **Pass 0a** → Obligation Extraction mit 3-Tier-Klassifizierung (Tests laufen, ~$172)
6. **Pass 0b** → Atomic Control Composition aus validierten Obligations
7. **Pass 1-5** → Multi-Layer Migration (Code + 500 Tests bereits vorhanden)
8. **Phase 8** → Qdrant Re-Ingestion (Runtime-Betrieb, ZULETZT)
9. **needs_review Triage** — 838 Controls klassifizieren
10. **Frontend** — `obligation_type` (Pflicht/Empfehlung/Kann) + `article_type` anzeigen

View File

@@ -0,0 +1,206 @@
# RAG Pipeline Benchmark & Optimierungen
Stand: 2026-03-21. Vergleich unserer Implementierung mit State of the Art. Priorisierte Empfehlungen nach Impact/Effort.
---
## Aktuelle Pipeline (Ist-Zustand)
```mermaid
flowchart LR
A[Dokumente] -->|Document Crawler| B[Chunks 512/50]
B -->|bge-m3| C[Qdrant Dense]
C -->|Cosine Search| D[Control Generator v2]
D -->|LLM| E[Rich Controls 6.373]
E -->|Pass 0a| F[Obligations]
F -->|Pass 0b| G[Atomare Controls]
G -->|4-Stage Dedup| H[Master Controls ~18K]
```
| Komponente | Implementierung | SOTA-Bewertung |
|-----------|----------------|----------------|
| **Chunking** | Rekursiv, 512 Zeichen, 50 Overlap | Zu klein fuer Rechtstexte |
| **Embedding** | bge-m3 (1024-dim, Ollama) | Gut, aber nur Dense genutzt |
| **Vector DB** | Qdrant mit Payload-Filtering | Hybrid Search nicht aktiviert |
| **Retrieval** | Pure Dense Cosine Similarity | Kein Re-Ranking, kein BM25 |
| **Extraktion** | 3-Tier (Exact → Embedding → LLM) | Solide Architektur |
| **Dedup** | 4-Stage (Pattern → Action → Object → Embedding) | Ueberdurchschnittlich |
| **QA** | 5-Metrik Similarity + PDF-QA Matching | Gut, RAGAS fehlt |
---
## Tier 1: Quick Wins (Tage, nicht Wochen)
### 1. Chunk-Groesse erhoehen: 512 → 1024, Overlap 50 → 128
**Problem:** NAACL 2025 Vectara-Studie zeigt: fuer analytische/juristische Queries sind 512-1024 Token optimal. Unsere 512-Zeichen-Chunks (= ~128 Token) sind deutlich zu klein.
**Unsere Lessons Learned:** "Chunks werden mitten im Absatz abgeschnitten. Artikel- und Paragraphennummern fehlen."
**Aenderung:** Config-Parameter in `ingest-phase-h.sh` anpassen.
| Metrik | Vorher | Nachher |
|--------|--------|---------|
| Chunk Size | 512 chars (~128 Token) | 1024 chars (~256 Token) |
| Overlap | 50 chars (10%) | 128 chars (12.5%) |
**Impact:** HOCH | **Effort:** NIEDRIG
### 2. Ollama JSON-Mode fuer Obligation Extraction
**Problem:** `_parse_json` in `decomposition_pass.py` hat Regex-Fallback — das zeigt, dass LLM-Output nicht zuverlaessig JSON ist.
**Aenderung:** `format: "json"` in Ollama-API-Calls setzen.
**Impact:** MITTEL | **Effort:** NIEDRIG (1 Parameter)
### 3. Chain-of-Thought Prompting fuer Pass 0a/0b
**Problem:** LegalGPT-Framework zeigt: explizite Reasoning-Chains ("Erst Addressat identifizieren, dann Aktion, dann normative Staerke") verbessern Extraktionsqualitaet signifikant.
**Impact:** MITTEL | **Effort:** NIEDRIG (Prompt Engineering)
---
## Tier 2: High Impact, Medium Effort (1-2 Wochen)
### 4. Hybrid Search (Dense + Sparse) via Qdrant
**Problem:** Reine Dense-Suche. Juristische Queries enthalten spezifische Begriffe ("DSGVO Art. 35", "Abs. 3"), die BM25/Sparse besser findet.
**Loesungsansatz:** BGE-M3 generiert bereits Sparse Vectors — wir verwerfen sie aktuell!
```
Qdrant Query API:
- Dense: bge-m3 Cosine (wie bisher)
- Sparse: bge-m3 Sparse Vectors (neu)
- Fusion: Reciprocal Rank Fusion (RRF)
```
**Benchmarks (Anthropic):** 49% weniger fehlgeschlagene Retrievals mit Contextual Retrieval, 67% mit Re-Ranking.
**Impact:** SEHR HOCH | **Effort:** MITTEL
### 5. Cross-Encoder Re-Ranking
**Problem:** Top-5 Ergebnisse direkt an LLM — keine Qualitaetspruefung der Retrieval-Ergebnisse.
**Loesungsansatz:** BGE Reranker v2 (MIT-Lizenz) auf Top-20 Ergebnisse, dann Top-5 an LLM.
| Re-Ranker | Lizenz | Empfehlung |
|-----------|--------|------------|
| BGE Reranker v2 | MIT | Empfohlen |
| Jina Reranker v2 | Apache-2.0 | Alternative |
| ColBERT v2 | MIT | Spaeter |
**Impact:** HOCH | **Effort:** MITTEL
### 6. Cross-Regulation Dedup Pass
**Problem:** Dedup filtert immer nach `pattern_id` — Controls aus DSGVO Art. 25 und NIS2 Art. 21 (beide Security-by-Design) werden nie verglichen.
**Loesungsansatz:** Zweiter Qdrant-Search ohne `pattern_id`-Filter nach dem normalen Dedup-Pass.
**Impact:** HOCH | **Effort:** MITTEL
### 7. Automatische Regressionstests (Golden Set)
**Problem:** Keine systematische Qualitaetsmessung nach Pipeline-Aenderungen.
**Loesungsansatz:** 20-Chunk Golden Set → Control-Generation → Output-Stabilitaet pruefen.
**Impact:** HOCH | **Effort:** NIEDRIG
---
## Tier 3: Strategische Investitionen (Wochen bis Monate)
### 8. Artikel-Boundary Chunking
Eigener Splitter fuer EU-Verordnungen und deutsche Gesetze: Split an "Art.", "Artikel", "Paragraph"-Grenzen statt nach Zeichenzahl.
### 9. RAGAS Evaluation Pipeline
[RAGAS](https://docs.ragas.io/) mit Golden Dataset (50-100 manuell verifizierte Control-to-Source Mappings). Metriken: Faithfulness, Answer Relevancy, Context Precision, Context Recall.
### 10. BGE-M3 Fine-Tuning
Fine-Tuning auf Compliance-Corpus (~6.373 Control-Titel/Objective-Paare). Research zeigt +10-30% Domain-Retrieval-Verbesserung.
### 11. LLM-as-Judge
Claude Sonnet bewertet jeden generierten Control auf Faithfulness zum Quelltext (~$0.01/Control).
### 12. Active Learning aus Review-Queue
Menschliche Entscheidungen der Dedup Review-Queue nutzen, um Schwellenwerte ueber die Zeit zu optimieren.
---
## Nicht empfohlen (niedriger ROI oder Konflikte)
| Ansatz | Grund |
|--------|-------|
| Jina v3 Embeddings | **CC-BY-NC-4.0** — verletzt Open Source Policy |
| Voyage-law-2 | API-only, proprietaer — kein Self-Hosting |
| Semantic Chunking | Benchmarks zeigen keinen Vorteil gegenueber Recursive fuer strukturierte Dokumente |
| HyDE als Primaerstrategie | Latenz (+43-60%) + Halluzinationsrisiko |
| Knowledge Graph RAG | Massiver Aufwand, unklarer Gewinn bei strukturiertem Rechtskorpus |
---
## Embedding-Modell Vergleich
| Modell | MTEB Score | Multilingual | Kontext | Lizenz | Bewertung |
|--------|-----------|-------------|---------|--------|-----------|
| **BGE-M3** (aktuell) | 63.0 | 100+ Sprachen | 8192 Token | MIT | Gut, Dense+Sparse+ColBERT |
| Jina v3 | 65.5 | 89 Sprachen | 8192 Token | CC-BY-NC | Nicht nutzbar (Lizenz!) |
| E5-Mistral-7B | ~65 | Gut | 4096 Token | MIT | Gross, hoher RAM |
| Voyage-law-2 | Best Legal | EN Legal | 16K Token | Proprietaer | Nicht nutzbar (API-only) |
**Fazit:** BGE-M3 bleibt die beste Wahl fuer unseren Stack. Sparse-Vectors aktivieren und Fine-Tuning bringen mehr als ein Modellwechsel.
---
## Test-Coverage Analyse
### Pipeline-Module (567 Tests)
| Modul | Tests | Bewertung | Fehlende Tests |
|-------|-------|-----------|----------------|
| Control Generator | 110 | Exzellent | 10-15 Edge Cases |
| Obligation Extractor | 107 | Exzellent | 8-10 Edge Cases |
| Decomposition Pass | 90 | Exzellent | 5-8 Edge Cases |
| Pattern Matcher | 72 | Gut | 10-15 Edge Cases |
| Control Dedup | 56 | Exzellent | 5-8 Edge Cases |
| Control Composer | 54 | Gut | 8-10 Edge Cases |
| Pipeline Adapter | 36 | Gut | 10-15 Edge Cases |
| Citation Backfill | 20 | Moderat | 5-8 Edge Cases |
| License Gate | 12 | Minimal | 5-8 Edge Cases |
| RAG Client | 10 | Minimal | 5-8 Edge Cases |
### Kritische Luecken (fehlende Tests)
| Service | Datei | Prioritaet |
|---------|-------|------------|
| AI Compliance Assistant | `ai_compliance_assistant.py` | HOCH (25-30 Tests noetig) |
| PDF Extractor | `pdf_extractor.py` | HOCH (20-25 Tests noetig) |
| LLM Provider | `llm_provider.py` | HOCH (15-20 Tests noetig) |
| Similarity Detector | `similarity_detector.py` | MITTEL (20-25 Tests noetig) |
| Anchor Finder | `anchor_finder.py` | MITTEL |
### Test-Infrastruktur
**Fehlend:** Shared `conftest.py` mit gemeinsamen Fixtures (LLM-Mock, DB-Mock, Embedding-Mock). Aktuell sind Fixtures in jedem Test-File dupliziert.
---
## Quellen
- [NAACL 2025 Vectara Chunking Study](https://blog.premai.io/rag-chunking-strategies-the-2026-benchmark-guide/)
- [Anthropic Contextual Retrieval](https://www.anthropic.com/news/contextual-retrieval)
- [Qdrant Hybrid Search Query API](https://qdrant.tech/articles/hybrid-search/)
- [Structure-Aware Chunking for Legal (ACL 2025)](https://aclanthology.org/2025.justnlp-main.19/)
- [RAGAS Evaluation Framework](https://docs.ragas.io/)
- [BGE Reranker v2 (MIT)](https://huggingface.co/BAAI/bge-reranker-v2-m3)
- [LegalGPT / CALLM Framework](https://www.emergentmind.com/topics/compliance-alignment-llm-callm)

View File

@@ -0,0 +1,223 @@
# RAG Pipeline: Lessons Learned & Hardening
## Übersicht
Dieses Dokument beschreibt die Erkenntnisse aus dem Aufbau der RAG-Pipeline und die daraus abgeleiteten Maßnahmen zur Härtung. Es dient als Referenz für zukünftige Ingestion-Runs und Pipeline-Erweiterungen.
## Architektur: Wann brauchen wir RAG vs. Direct PDF?
### RAG ist nötig für:
| Use Case | Warum RAG? |
|---|---|
| **Compliance Advisor (Chat)** | Semantische Suche über 38+ Dokumente in Echtzeit |
| **Cross-Regulation Mapping** | "Zeige alle Anforderungen zu Verschlüsselung" über alle Quellen |
| **Customer Scope-Filtering** | Nur Chunks aus relevanten Regulations für den Kunden |
| **Inkrementelle Updates** | Neues Dokument → nur neue Chunks verarbeiten |
### RAG ist NICHT nötig für:
| Use Case | Besser: Direct PDF |
|---|---|
| **Control-Generierung (Batch)** | PDF → PyMuPDF → Strukturparser → Artikel-Index → API |
| **PDF-QA/Verifizierung** | Substring-Match direkt im PDF (schneller, exakter) |
| **Artikel/§-Extraktion** | Regex-basierte Extraktion aus PDF-Text |
### Hybrid-Ansatz (Empfehlung)
```
Control-Generierung: PDF → Strukturparser → Artikel-Index → Anthropic API
(KEIN RAG nötig, direkt aus PDF)
Runtime-Betrieb: Qdrant-RAG für semantische Suche, Chat, Scope-Analyse
(RAG mit angereicherten Chunks + Struktur-Metadaten)
```
## Fehler und Root Causes
### 1. Doppelte Ingestion = Doppelte Controls
**Problem:** Gleiche PDFs unter verschiedenen Namen ingestiert (z.B. "Maschinenverordnung" und "Verordnung (EU) 2023/1230") → unterschiedliche Chunks (anderes Chunking) → anderer Hash → doppelt verarbeitet → doppelte Controls.
**Root Cause:**
- `regulation_name` aus Chunk-Metadaten statt aus kanonischer Quelle
- UNIQUE-Constraint nur `(chunk_hash, collection, document_version)` — nicht global
- Kein Check ob `regulation_code` bereits in einer Collection existiert
**Fix (implementiert):**
- `REGULATION_LICENSE_MAP` enthält jetzt kanonische `name`-Werte die den DB-Einträgen entsprechen
- `source_citation.source` wird aus `REGULATION_LICENSE_MAP.name` genommen, NICHT aus `chunk.regulation_name`
- Phase 5 Cleanup: 3.301 Duplikate hart gelöscht
**Fix (noch offen):**
- Chunk-Hash UNIQUE Constraint global machen: `(chunk_hash, document_version)` statt `(chunk_hash, collection, document_version)`
- Vor Ingestion: Check ob `regulation_code` bereits in einer Collection existiert
### 2. Chunks verlieren Strukturinformation
**Problem:** Chunks werden mitten im Absatz abgeschnitten. § und Artikelnummern fehlen in den Chunk-Metadaten. Kontext des Kapitels/Abschnitts geht verloren.
**Root Cause:**
- `chunk_strategy=recursive` mit `chunk_size=512, chunk_overlap=50` — zu kleine Chunks
- Chunking beachtet keine Dokumentstruktur (Artikel-/Paragraphengrenzen)
- Keine Einleitung/Kapitelkontext als Prefix
**Empfehlung für Re-Ingestion:**
- **Strukturiertes Chunking:** Chunks an Artikel-/Paragraphengrenzen schneiden
- **Kontext-Prefix:** Kapiteleinleitung und übergeordnete Struktur mitliefern
- **Metadaten anreichern:** `article`, `paragraph`, `article_type`, `section_hierarchy`
- **Größere Chunks:** Mindestens 1024 Tokens, besser volle Artikel/Paragraphen
### 3. Cross-Collection-Duplikate
**Problem:** `nist_csf_2_0` in `bp_compliance_ce` (67 Chunks) UND `bp_compliance_datenschutz` (162 Chunks). EU-Verordnungen sowohl in `bp_compliance_ce` als auch `bp_compliance_gesetze`.
**Root Cause:** Keine Collection-Zuordnungsregeln. Manuelle Zuweisung bei Ingestion.
**Fix:** `cleanup-qdrant-duplicates.py` Script bereinigt Cross-Collection-Duplikate.
**Empfehlung:** Klare Collection-Zuordnungsregeln:
- `bp_compliance_ce` = EU-Verordnungen + internationale Standards
- `bp_compliance_gesetze` = Deutsche + österreichische Gesetze (NUR nationale Gesetze)
- `bp_compliance_datenschutz` = EDPB/WP29 Leitlinien + Privacy Frameworks
### 4. OWASP Multilingual Controls
**Problem:** 324 OWASP Top 10 Controls in ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien. Kein Mehrwert, aber 324 doppelte Controls generiert.
**Root Cause:** Multilingual PDFs/GitHub-Quellen ohne Spracherkennung ingestiert.
**Fix:** 324 als `duplicate` markiert und gelöscht.
**Empfehlung:** Bei Ingestion Spracherkennung + Deduplizierung. Nur DE + EN behalten.
### 5. Fehlende Artikel/Paragraph-Extraktion
**Problem:** Chunks haben `article` und `paragraph` oft leer oder falsch. Die LLM-basierte Extraktion bei der Control-Generierung ist unzuverlässig.
**Root Cause:** Ingestion-Pipeline extrahiert keine Strukturinformation aus dem PDF.
**Fix (implementiert):** PDF-QA-Pipeline (`pdf_qa_all.py`) matched `source_original_text` gegen Original-PDFs und extrahiert korrekte Artikel/Paragraphen — 86% Match-Rate.
**Empfehlung:** Bei Re-Ingestion direkt in den Chunk-Metadaten speichern.
### 6. Job-Tracking nicht persistent
**Problem:** Generation-Jobs laufen als Background-Tasks. Kein Logging, welche Chunks verarbeitet, Status nur über API abfragbar. Bei API-Timeout oder Restart geht der Fortschritt verloren.
**Root Cause:** `asyncio.create_task()` hat keinen Recovery-Mechanismus.
**Fix (teilweise):** `canonical_generation_jobs` Tabelle trackt Jobs. `canonical_processed_chunks` markiert verarbeitete Chunks.
**Empfehlung:**
- Job-Log in DB persistieren (nicht nur stdout)
- Fortschritt in `canonical_generation_jobs.progress` als JSONB speichern
- Chunk-Level-Status: verarbeitet / übersprungen / Fehler
- Recovery-Fähigkeit: Job kann von letztem Checkpoint fortgesetzt werden
## Empfohlene Metadaten für Re-Ingestion
### Chunk-Level Metadaten (Qdrant Payload)
```json
{
"chunk_text": "...",
"regulation_code": "eu_2016_679",
"regulation_name_de": "DSGVO (EU) 2016/679",
"regulation_name_en": "GDPR (EU) 2016/679",
"article": "25",
"article_title": "Datenschutz durch Technikgestaltung und datenschutzfreundliche Voreinstellungen",
"article_type": "article",
"paragraph": "1",
"section_hierarchy": ["Kapitel IV", "Abschnitt 2", "Artikel 25"],
"chapter_context": "Kapitel IV — Verantwortlicher und Auftragsverarbeiter",
"pages": [45, 46],
"effective_date": "2018-05-25",
"publication_date": "2016-04-27",
"document_version": "2016-04-27",
"source_language": "de",
"source_url": "https://eur-lex.europa.eu/...",
"celex": "32016R0679",
"license": "EU_LAW",
"license_rule": 1,
"source_type": "law",
"category": "datenschutz",
"chunk_position": 42,
"total_chunks": 423
}
```
### Dokument-Level Metadaten (Corpus Version)
```json
{
"regulation_code": "eu_2016_679",
"canonical_name_de": "DSGVO (EU) 2016/679",
"canonical_name_en": "GDPR (EU) 2016/679",
"document_type": "eu_regulation",
"effective_date": "2018-05-25",
"publication_date": "2016-04-27",
"supersedes": null,
"superseded_by": null,
"source_pdf": "gdpr_regulation_eu_2016_679.pdf",
"source_pdf_sha256": "abc123...",
"total_articles": 99,
"total_recitals": 173,
"total_annexes": 0,
"ingestion_date": "2026-03-20",
"ingestion_version": "v2"
}
```
## Pipeline-Härtung Checkliste
### Vor Ingestion
- [ ] Prüfen ob `regulation_code` bereits in einer Collection existiert
- [ ] PDF-SHA256 gegen bekannte PDFs prüfen (Duplikat-Erkennung)
- [ ] `regulation_name` aus `REGULATION_LICENSE_MAP` verwenden, NICHT aus Chunk-Metadaten
- [ ] Spracherkennung: Nur DE + EN ingestieren
- [ ] Dokument-Metadaten (effective_date, publication_date) recherchieren
### Während Ingestion
- [ ] Strukturiertes Chunking an Artikel-/Paragraphengrenzen
- [ ] Kontext-Prefix mit Kapiteleinleitung
- [ ] Chunk-Metadaten anreichern (article, paragraph, article_type, section_hierarchy)
- [ ] Fortschritt in DB loggen
### Nach Ingestion
- [ ] Chunk-Count pro `regulation_code` prüfen (Sanity Check)
- [ ] PDF-QA gegen Original-PDF laufen lassen
- [ ] Cross-Collection-Duplikat-Check
- [ ] Corpus-Version in DB eintragen
### Control-Generierung
- [ ] `source_citation.source` aus `REGULATION_LICENSE_MAP.name`, NICHT aus Chunk-Metadaten
- [ ] Harmonisierung: Threshold 0.85 für Duplikate innerhalb gleicher `regulation_code`
- [ ] Cross-Regulation-Harmonisierung bei ähnlichen Themen (z.B. DSGVO Art. 25 ↔ NIS2 Art. 21)
- [ ] Job-Fortschritt persistent in DB speichern
## Workflow: Mac Mini → Production Sync
```
1. Mac Mini: PDF → Qdrant (lokal, http://macmini:6333)
2. Mac Mini: Control-Generierung → PostgreSQL (shared, 46.225.100.82:54321)
3. QA: PDF-Match, Dedup, Source-Normalisierung
4. Qdrant Migration: macmini:6333 → qdrant-dev.breakpilot.ai (scripts/migrate-qdrant.py)
5. Deploy: git push gitea → Coolify Build + Deploy
```
**WICHTIG:** PostgreSQL ist SHARED — Änderungen auf Mac Mini sind sofort in Production sichtbar. Qdrant hat getrennte Instanzen (lokal + production) und muss manuell synchronisiert werden.
## Scripts
| Script | Beschreibung |
|---|---|
| `scripts/ingest-phase-h.sh` | Haupt-Ingestion: 38 Dokumente → Qdrant |
| `scripts/cleanup-qdrant-duplicates.py` | Qdrant Duplikat-Cleanup (8 Schritte) |
| `scripts/migrate-qdrant.py` | Qdrant Migration: lokal → production |
| `scripts/qa/phase5_normalize_and_cleanup.py` | DB Normalisierung + Hard Delete |
| `scripts/qa/pdf_qa_all.py` | PDF-Match QA |

View File

@@ -96,6 +96,7 @@ erDiagram
varchar verification_method
varchar target_audience
varchar generation_strategy
varchar obligation_type
smallint pipeline_version
integer license_rule
jsonb source_citation
@@ -936,9 +937,11 @@ Drei Kompositions-Modi:
Zerlegt Rich Controls in atomare Controls. Laeuft VOR den Migration Passes 1-5.
#### Pass 0a — Obligation Extraction
#### Pass 0a — Obligation Extraction + 3-Tier-Klassifizierung
Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM.
Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM (Claude Sonnet 4.6).
Jede Obligation wird als **pflicht**, **empfehlung** oder **kann** klassifiziert — nichts wird
wegen fehlendem normativem Signal abgelehnt.
**6 Guardrails:**
@@ -949,23 +952,37 @@ Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM.
5. Nicht auf Evidence-Ebene zerlegen
6. Parent-Link immer erhalten
**Quality Gate:** Jeder Kandidat wird gegen 6 Kriterien geprueft:
**3-Tier Obligation Classification:**
- `has_normative_signal` — Normatives Sprachsignal erkannt
- `single_action` — Nur eine Handlung
- `not_rationale` — Keine blosse Begruendung
- `not_evidence_only` — Kein reines Evidence-Fragment
- `min_length` — Mindestlaenge erreicht
- `has_parent_link` — Referenz zum Rich Control
| obligation_type | Signal-Beispiele | Bedeutung |
|---|---|---|
| `pflicht` | müssen, ist zu, shall, must, required | Gesetzliche/regulatorische Pflicht |
| `empfehlung` | soll, should, sicherstellen, dokumentieren | Best Practice, freiwillig |
| `kann` | kann, darf, may, optional | Optionale Massnahme |
Kritische Checks: `has_normative_signal`, `not_evidence_only`, `min_length`, `has_parent_link`
Obligations ohne erkennbares Signal werden als `empfehlung` klassifiziert (nicht rejected).
Empfehlungen helfen Firmen, Systeme ueber das Pflichtprogramm hinaus zu sichern.
**Quality Gate — Kritische Checks:**
| Flag | Kritisch? | Beschreibung |
|---|---|---|
| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) |
| `not_evidence_only` | **Ja** | Kein reines Evidence-Fragment |
| `min_length` | **Ja** | Mindestlaenge (20 Zeichen) |
| `has_parent_link` | **Ja** | Referenz zum Rich Control |
| `has_normative_signal` | Nein | Informativer Check (nicht mehr Ablehnungsgrund) |
| `single_action` | Nein | Nur eine Handlung (heuristisch) |
| `not_rationale` | Nein | Keine blosse Begruendung |
#### Pass 0b — Atomic Control Composition
Erstellt aus jedem validierten Obligation Candidate ein atomares Control
(LLM-gestuetzt mit Template-Fallback).
(LLM-gestuetzt mit Template-Fallback). Das `obligation_type` Feld wird
vom Parent-Obligation uebernommen.
**Datei:** `compliance/services/decomposition_pass.py`
**Test-Script:** `scripts/qa/test_pass0a.py` (standalone, speichert JSON)
---
@@ -1012,11 +1029,13 @@ Die Crosswalk-Matrix bildet diese N:M-Beziehung ab.
**Migration 061:** Decomposition-Tabellen
| Tabelle | Beschreibung |
| Tabelle / Feld | Beschreibung |
|---------|-------------|
| `obligation_candidates` | Extrahierte atomare Pflichten aus Rich Controls |
| `obligation_candidates.obligation_type` | `pflicht` / `empfehlung` / `kann` (3-Tier-Klassifizierung) |
| `canonical_controls.parent_control_uuid` | Self-Referenz zum Rich Control (neues Feld) |
| `canonical_controls.decomposition_method` | Zerlegungsmethode (neues Feld) |
| `canonical_controls.obligation_type` | Uebernommen von Obligation: pflicht/empfehlung/kann |
---

View File

@@ -567,7 +567,86 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/
---
## Pass 0a/0b: Atomare Control-Zerlegung
Die Pipeline v3 erweitert die 7-Stufen-Pipeline um einen Vor-Pass, der Rich Controls in atomare Controls zerlegt.
### Pass 0a: Obligation Extraction
Extrahiert individuelle normative Pflichten aus Rich Controls via LLM.
```mermaid
flowchart LR
A[Rich Control] -->|LLM| B[Obligations]
B --> C{Quality Gate}
C -->|Pass| D[validated]
C -->|Fail| E[rejected]
```
**3-Tier Klassifikation:**
| Typ | Erkennungsmuster | Beispiel |
|-----|-----------------|---------|
| **Pflicht** | muss, ist verpflichtet, hat sicherzustellen | "Der Verantwortliche MUSS ein Verzeichnis fuehren" |
| **Empfehlung** | soll, sollte, wird empfohlen | "Es SOLLTE eine Risikobewertung durchgefuehrt werden" |
| **Kann** | kann, darf, ist berechtigt | "Die Aufsichtsbehoerde KANN Geldbussen verhaengen" |
**Quality Gate (6 Regeln):**
1. Nur normative Aussagen (muss, sicherzustellen, verpflichtet)
2. Ein Hauptverb pro Obligation
3. Test-Obligations separat von operativen
4. Reporting-Obligations separat
5. Nicht auf Evidence-Ebene splitten
6. Parent-Link immer erhalten
### Pass 0b: Atomic Control Composition
Verwandelt jede validierte Obligation in ein eigenstaendiges atomares Control.
```mermaid
flowchart LR
A[Obligation] -->|LLM| B[Atomic Control]
B -->|Dedup Check| C{4-Stage Dedup}
C -->|NEW| D[Insert + Index]
C -->|LINK| E[Parent-Link]
C -->|REVIEW| F[Review-Queue]
```
**Konfiguration:**
| Variable | Default | Beschreibung |
|----------|---------|-------------|
| `DECOMPOSITION_LLM_MODEL` | `claude-sonnet-4-6` | LLM fuer Pass 0a/0b |
| `DECOMPOSITION_BATCH_SIZE` | `5` | Obligations pro LLM-Call |
| `DECOMPOSITION_LLM_TIMEOUT` | `120` | Timeout in Sekunden |
**Ergebnisse (Stand 2026-03-21):**
| Metrik | Wert |
|--------|------|
| Rich Controls (technisch) | ~6.800 |
| Atomare Controls (bisher) | 30 (PoC: 10x CRYP, AUTH, SEC) |
| Ziel nach Full Run | ~18.000 unique Master Controls |
| Obligations pro Rich Control | ~10 |
| Dedup-Reduktion erwartet | ~70% |
### Quelldateien (Pass 0a/0b)
| Datei | Beschreibung |
|-------|-------------|
| `compliance/services/decomposition_pass.py` | Pass 0a + 0b Logik |
| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine |
| `migrations/061_obligation_candidates.sql` | Obligation-Tabelle |
| `migrations/074_control_dedup.sql` | Dedup-Tabellen (Parent-Links, Review-Queue) |
| `tests/test_decomposition_pass.py` | 90 Tests |
| `tests/test_control_dedup.py` | 56 Tests |
---
## Verwandte Dokumentation
- [Canonical Control Library (CP-CLIB)](canonical-control-library.md) — Domains, Datenmodell, Too-Close-Detektor, CI/CD Validation
- [Deduplizierungs-Engine](dedup-engine.md) — 4-Stufen Dedup, Multi-Parent-Linking, Review-Queue
- [RAG Pipeline Benchmark](../../development/rag-pipeline-benchmark.md) — State-of-the-Art Vergleich, Optimierungsempfehlungen
- [Multi-Layer Control Architecture](canonical-control-library.md#multi-layer-control-architecture) — 10-Stage Pipeline-Erweiterung mit Obligations, Patterns, Crosswalk

View File

@@ -0,0 +1,253 @@
# Deduplizierungs-Engine (Control Dedup)
4-stufige Dedup-Pipeline zur Vermeidung doppelter atomarer Controls bei der Pass 0b Komposition. Kern-USP: **"1 Control erfuellt 5 Gesetze"** durch Multi-Parent-Linking.
**Backend:** `backend-compliance/compliance/services/control_dedup.py`
**Migration:** `backend-compliance/migrations/074_control_dedup.sql`
**Tests:** `backend-compliance/tests/test_control_dedup.py` (56 Tests)
---
## Motivation
Aus ~6.800 technischen Controls x ~10 Obligations pro Control entstehen ~68.000 atomare Kandidaten. Ziel: ~18.000 einzigartige Master Controls. Viele Obligations aus verschiedenen Gesetzen fuehren zum gleichen technischen Control (z.B. "MFA implementieren" in DSGVO, NIS2, AI Act).
**Problem:** Embedding-only Deduplizierung ist GEFAEHRLICH fuer Compliance.
!!! danger "False-Positive Beispiel"
- "Admin-Zugriffe muessen MFA nutzen" vs. "Remote-Zugriffe muessen MFA nutzen"
- Embedding sagt >0.9 aehnlich
- Aber es sind **ZWEI verschiedene Controls** (verschiedene Objekte!)
---
## 4-Stufen Entscheidungsbaum
```mermaid
flowchart TD
A[Kandidat-Control] --> B{Pattern-Gate}
B -->|pattern_id verschieden| N1[NEW CONTROL]
B -->|pattern_id gleich| C{Action-Check}
C -->|Action verschieden| N2[NEW CONTROL]
C -->|Action gleich| D{Object-Normalization}
D -->|Objekt verschieden| E{Similarity > 0.95?}
E -->|Ja| L1[LINK]
E -->|Nein| N3[NEW CONTROL]
D -->|Objekt gleich| F{Tiered Thresholds}
F -->|> 0.92| L2[LINK]
F -->|0.85 - 0.92| R[REVIEW QUEUE]
F -->|< 0.85| N4[NEW CONTROL]
```
### Stufe 1: Pattern-Gate (hart)
`pattern_id` muss uebereinstimmen. Verhindert ~80% der False Positives.
```python
if pattern_id != existing.pattern_id:
NEW CONTROL # Verschiedene Kontrollmuster = verschiedene Controls
```
### Stufe 2: Action-Check (hart)
Normalisierte Aktionsverben muessen uebereinstimmen. "Implementieren" vs. "Testen" = verschiedene Controls, auch bei gleichem Objekt.
```python
if normalize_action("implementieren") != normalize_action("testen"):
NEW CONTROL # "implement" != "test"
```
**Action-Normalisierung (Deutsch → Englisch):**
| Deutsche Verben | Kanonische Form |
|----------------|-----------------|
| implementieren, umsetzen, einrichten, aktivieren | `implement` |
| testen, pruefen, ueberpruefen, verifizieren | `test` |
| ueberwachen, monitoring, beobachten | `monitor` |
| verschluesseln | `encrypt` |
| protokollieren, aufzeichnen, loggen | `log` |
| beschraenken, einschraenken, begrenzen | `restrict` |
### Stufe 3: Object-Normalization (weich)
Compliance-Objekte werden auf kanonische Token normalisiert.
```python
normalize_object("Admin-Konten") "privileged_access"
normalize_object("Remote-Zugriff") "remote_access"
normalize_object("MFA") "multi_factor_auth"
```
Bei verschiedenen Objekten gilt ein hoeherer Schwellenwert (0.95 statt 0.92).
**Objekt-Normalisierung:**
| Eingabe | Kanonischer Token |
|---------|------------------|
| MFA, 2FA, Multi-Faktor-Authentifizierung | `multi_factor_auth` |
| Admin-Konten, privilegierte Zugriffe | `privileged_access` |
| Verschluesselung, Kryptografie | `encryption` |
| Schluessel, Key Management | `key_management` |
| TLS, SSL, HTTPS | `transport_encryption` |
| Firewall | `firewall` |
| Audit-Log, Protokoll, Logging | `audit_logging` |
### Stufe 4: Embedding Similarity (Qdrant)
Tiered Thresholds basierend auf Cosine-Similarity:
| Score | Verdict | Aktion |
|-------|---------|--------|
| > 0.95 | **LINK** | Bei verschiedenen Objekten |
| > 0.92 | **LINK** | Parent-Link hinzufuegen |
| 0.85 - 0.92 | **REVIEW** | In Review-Queue zur manuellen Pruefung |
| < 0.85 | **NEW** | Neues Control anlegen |
---
## Canonicalization Layer
Vor dem Embedding wird der deutsche Compliance-Text in normalisiertes Englisch transformiert:
```
"Administratoren muessen MFA verwenden"
→ "implement multi_factor_auth for administratoren verwenden"
→ Bessere Matches, weniger Embedding-Rauschen
```
Dies reduziert das Rauschen durch synonyme Formulierungen in verschiedenen Gesetzen.
---
## Multi-Parent-Linking (M:N)
Ein atomares Control kann mehrere Eltern-Controls aus verschiedenen Regulierungen haben:
```json
{
"control_id": "AUTH-1072-A01",
"parent_links": [
{"parent_control_id": "AUTH-1001", "source": "NIST IA-02(01)", "link_type": "decomposition"},
{"parent_control_id": "NIS2-045", "source": "NIS2 Art. 21", "link_type": "dedup_merge"}
]
}
```
### Datenbank-Schema
```sql
-- Migration 074: control_parent_links (M:N)
CREATE TABLE control_parent_links (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition',
confidence NUMERIC(3,2) DEFAULT 1.0,
source_regulation VARCHAR(100),
source_article VARCHAR(100),
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
created_at TIMESTAMPTZ DEFAULT NOW(),
CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
);
```
**Link-Typen:**
| Typ | Bedeutung |
|-----|-----------|
| `decomposition` | Aus Pass 0b Zerlegung |
| `dedup_merge` | Durch Dedup-Engine als Duplikat erkannt |
| `manual` | Manuell durch Reviewer verknuepft |
| `crosswalk` | Aus Crosswalk-Matrix uebernommen |
---
## Review-Queue
Borderline-Matches (Similarity 0.85-0.92) werden in die Review-Queue geschrieben:
```sql
-- Migration 074: control_dedup_reviews
CREATE TABLE control_dedup_reviews (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
candidate_control_id VARCHAR(30) NOT NULL,
candidate_title TEXT NOT NULL,
candidate_objective TEXT,
matched_control_uuid UUID REFERENCES canonical_controls(id),
matched_control_id VARCHAR(30),
similarity_score NUMERIC(4,3),
dedup_stage VARCHAR(40) NOT NULL,
review_status VARCHAR(20) DEFAULT 'pending',
-- pending → accepted_link | accepted_new | rejected
created_at TIMESTAMPTZ DEFAULT NOW()
);
```
---
## Qdrant Collection
```
Collection: atomic_controls
Dimension: 1024 (bge-m3)
Distance: COSINE
Payload: pattern_id, action_normalized, object_normalized, control_id, canonical_text
Index: pattern_id (keyword), action_normalized (keyword), object_normalized (keyword)
Query: IMMER mit filter: pattern_id == X (reduziert Suche drastisch)
```
---
## Integration in Pass 0b
Die Dedup-Engine ist optional in `DecompositionPass` integriert:
```python
decomp = DecompositionPass(db=session, dedup_enabled=True)
stats = await decomp.run_pass0b(limit=100, use_anthropic=True)
# Stats enthalten Dedup-Metriken:
# stats["dedup_linked"] = 15 (Duplikate → Parent-Link)
# stats["dedup_review"] = 3 (Borderline → Review-Queue)
# stats["controls_created"] = 82 (Neue Controls)
```
**Ablauf bei Pass 0b mit Dedup:**
1. LLM generiert atomares Control
2. Dedup-Engine prueft 4 Stufen
3. **LINK:** Kein neues Control, Parent-Link zu bestehendem
4. **REVIEW:** Kein neues Control, Eintrag in Review-Queue
5. **NEW:** Control anlegen + in Qdrant indexieren
---
## Konfiguration
| Umgebungsvariable | Default | Beschreibung |
|-------------------|---------|-------------|
| `DEDUP_ENABLED` | `true` | Dedup-Engine ein/ausschalten |
| `DEDUP_LINK_THRESHOLD` | `0.92` | Schwelle fuer automatisches Linking |
| `DEDUP_REVIEW_THRESHOLD` | `0.85` | Schwelle fuer Review-Queue |
| `DEDUP_LINK_THRESHOLD_DIFF_OBJ` | `0.95` | Schwelle bei verschiedenen Objekten |
| `DEDUP_QDRANT_COLLECTION` | `atomic_controls` | Qdrant-Collection fuer Dedup-Index |
| `QDRANT_URL` | `http://host.docker.internal:6333` | Qdrant-URL |
| `EMBEDDING_URL` | `http://embedding-service:8087` | Embedding-Service-URL |
---
## Quelldateien
| Datei | Beschreibung |
|-------|-------------|
| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine |
| `compliance/services/decomposition_pass.py` | Pass 0a/0b mit Dedup-Integration |
| `migrations/074_control_dedup.sql` | DB-Schema (parent_links, review_queue) |
| `tests/test_control_dedup.py` | 56 Unit-Tests |
---
## Verwandte Dokumentation
- [Control Generator Pipeline](control-generator-pipeline.md) — 7-Stufen RAG→Control Pipeline
- [Canonical Control Library](canonical-control-library.md) — Datenmodell, Domains, Similarity-Detektor

View File

@@ -107,6 +107,7 @@ nav:
- Policy-Bibliothek (29 Richtlinien): services/sdk-modules/policy-bibliothek.md
- Canonical Control Library (CP-CLIB): services/sdk-modules/canonical-control-library.md
- Control Generator Pipeline: services/sdk-modules/control-generator-pipeline.md
- Deduplizierungs-Engine: services/sdk-modules/dedup-engine.md
- Control Provenance Wiki: services/sdk-modules/control-provenance.md
- Strategie:
- Wettbewerbsanalyse & Roadmap: strategy/wettbewerbsanalyse.md
@@ -115,3 +116,5 @@ nav:
- Dokumentation: development/documentation.md
- CI/CD Pipeline: development/ci-cd-pipeline.md
- QA Control Quality: development/qa-control-quality.md
- RAG Pipeline Lessons Learned: development/rag-pipeline-lessons-learned.md
- RAG Pipeline Benchmark: development/rag-pipeline-benchmark.md

View File

@@ -1,11 +1,29 @@
"""Apply PDF QA results: update source_citation with correct article + article_type."""
"""
Apply PDF QA results: update source_citation with correct article_type + article.
Safety modes:
--safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
--dry-run: Show what would change without writing.
Usage:
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
"""
import os
import sys
import json
import psycopg2
import urllib.parse
from collections import Counter
RESULTS_FILE = "/tmp/pdf_qa_results.json"
# Parse args
dry_run = "--dry-run" in sys.argv
force_article = "--force-article" in sys.argv
# Load results
with open(RESULTS_FILE) as f:
results = json.load(f)
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
options="-c search_path=compliance,public"
)
# Update in batches
# Load current DB state for all affected controls
cur = conn.cursor()
updated = 0
ctrl_ids = [r["ctrl_id"] for r in results]
cur.execute("""
SELECT id,
source_citation->>'article' as article,
source_citation->>'article_type' as article_type,
source_citation->>'source' as source
FROM compliance.canonical_controls
WHERE id = ANY(%s::uuid[])
""", (ctrl_ids,))
db_state = {}
for row in cur.fetchall():
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
# Counters
stats = Counter()
updated_type = 0
updated_article = 0
updated_recital = 0
errors = 0
unchanged = 0
for i, r in enumerate(results):
ctrl_id = r["ctrl_id"]
article_label = r["article_label"]
article_type = r["article_type"] # preamble, article, annex, section, unknown
new_article = r["article_label"]
new_type = r["article_type"]
db = db_state.get(ctrl_id, {})
if not db:
stats["missing_in_db"] += 1
continue
old_type = db.get("article_type")
old_article = db.get("article", "").strip()
# Decide what to update
set_type = (old_type != new_type)
set_article = (not old_article) or (force_article and old_article != new_article)
set_recital = (new_type == "preamble")
if set_type:
stats["type_" + ("new" if not old_type else "changed")] += 1
else:
stats["type_unchanged"] += 1
if not old_article and set_article:
stats["article_new"] += 1
elif old_article and old_article != new_article:
if force_article:
stats["article_force_changed"] += 1
else:
stats["article_skipped"] += 1
else:
stats["article_unchanged"] += 1
if set_recital:
stats["recital"] += 1
if dry_run:
continue
try:
# Update source_citation: set article and article_type
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = source_citation
|| jsonb_build_object('article', %s, 'article_type', %s),
updated_at = now()
WHERE id = %s::uuid
AND (
source_citation->>'article' IS DISTINCT FROM %s
OR source_citation->>'article_type' IS DISTINCT FROM %s
)
""", (article_label, article_type, ctrl_id, article_label, article_type))
# Build JSONB update
updates = {}
if set_type:
updates["article_type"] = new_type
if set_article:
updates["article"] = new_article
if cur.rowcount > 0:
updated += 1
else:
unchanged += 1
if updates:
# Merge into source_citation
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
updated_at = now()
WHERE id = %s::uuid
""", (json.dumps(updates), ctrl_id))
if set_type:
updated_type += 1
if set_article:
updated_article += 1
# Mark preamble as recital_suspect
if set_recital:
cur.execute("""
UPDATE compliance.canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata, '{}'::jsonb),
'{recital_suspect}',
'true'::jsonb
),
updated_at = now()
WHERE id = %s::uuid
""", (ctrl_id,))
updated_recital += 1
except Exception as e:
errors += 1
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
conn.rollback()
continue
if (i + 1) % 500 == 0:
if (i + 1) % 1000 == 0:
conn.commit()
print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
print(f" Progress: {i+1}/{len(results)}")
conn.commit()
print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
if not dry_run:
conn.commit()
mode = "DRY-RUN" if dry_run else "APPLIED"
print(f"\n{'='*60}")
print(f" Mode: {mode}")
print(f"{'='*60}")
print(f"\n article_type:")
print(f" New (was NULL): {stats['type_new']:5d}")
print(f" Changed: {stats['type_changed']:5d}")
print(f" Unchanged: {stats['type_unchanged']:5d}")
print(f"\n article:")
print(f" New (was empty): {stats['article_new']:5d}")
if force_article:
print(f" Force-changed: {stats['article_force_changed']:5d}")
else:
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
print(f" Unchanged: {stats['article_unchanged']:5d}")
print(f"\n Preamble/Recital: {stats['recital']:5d}")
print(f" Missing in DB: {stats['missing_in_db']:5d}")
if not dry_run:
print(f"\n Updates written:")
print(f" article_type: {updated_type:5d}")
print(f" article: {updated_article:5d}")
print(f" recital_suspect: {updated_recital:5d}")
print(f" Errors: {errors:5d}")
# Verify: count by article_type
cur.execute("""

View File

@@ -0,0 +1,524 @@
#!/usr/bin/env python3
"""
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
Tests 5 representative gap articles from different sources.
Measures: quality (JSON valid, fields complete), response time, cost estimate.
Usage:
python3 benchmark_llm_controls.py
"""
import json
import time
import sys
import os
import requests
from pathlib import Path
# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY FIX: the LiteLLM API key was hard-coded here, i.e. a credential
# committed to the repository. Read it from the environment (same pattern as
# ANTHROPIC_API_KEY below) — and rotate the previously committed key.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Directory containing the regulation PDFs to extract article text from.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
try:
    import fitz  # PyMuPDF — optional; extract_article_text() returns "" without it
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
# ── Prompts (identical to control_generator.py) ─────────────────────
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the user prompt for control generation from one law article.

    Mirrors the prompt used by control_generator.py so benchmark results are
    comparable. The article text is truncated to 3000 chars to bound tokens.

    :param source_name: Human-readable source, e.g. "DSGVO (EU) 2016/679".
    :param article_label: Article/section reference, e.g. "Artikel 32" or "§ 26".
    :param article_text: Raw extracted article text (truncated to 3000 chars).
    :param license_type: License tag (e.g. "EU_LAW") shown to the model to
        signal that the original text may be reused.
    :returns: Complete German-language prompt string.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF Text Extraction ─────────────────────────────────────────────
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a PDF in PDF_DIR.

    :param pdf_file: File name inside PDF_DIR.
    :param article_label: Heading to locate, e.g. "Artikel 32", "§ 26", "AC-2".
    :param doc_type: One of "eu_regulation", "de_law", "nist", or anything else
        for a generic substring search.
    :returns: Up to 3000 chars of article text; "" when the PDF is missing or
        PyMuPDF is unavailable; a bracketed "[... nicht im PDF gefunden]"
        marker when the heading is not found. Callers detect failures via
        ``startswith("[")``, so every error marker must begin with "[".
    """
    import re
    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""
    # Concatenate all page text; page breaks become newlines.
    doc = fitz.open(str(path))
    full_text = ""
    for page in doc:
        full_text += page.get_text() + "\n"
    doc.close()
    # Find article boundaries
    if doc_type == "eu_regulation":
        # EU regulations use "Artikel N" headings on their own line.
        art_num = re.search(r'\d+', article_label)
        if not art_num:
            return ""
        num = int(art_num.group())
        # Find start of this article
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return f"[Artikel {num} nicht im PDF gefunden]"
        start = match.start()
        # Find start of next article (fallback: fixed 5000-char window)
        next_match = re.search(rf'\nArtikel\s+{num + 1}\s*\n', full_text)
        end = next_match.start() if next_match else start + 5000
        text = full_text[start:end].strip()
        return text[:3000]
    elif doc_type == "de_law":
        # German laws use "§ N" headings.
        para_num = re.search(r'\d+', article_label)
        if not para_num:
            return ""
        num = int(para_num.group())
        # FIX: the pattern and messages had lost their "§ "/"[" prefixes
        # (encoding mangling), so the not-found marker did not start with
        # "[" and slipped past the caller's startswith("[") error check.
        # NOTE(review): restored as "§\s+N" — confirm against the BDSG PDF.
        match = re.search(rf'§\s+{num}\b', full_text)
        if not match:
            return f"[§ {num} nicht im PDF gefunden]"
        start = match.start()
        next_match = re.search(rf'§\s+{num + 1}\b', full_text)
        end = next_match.start() if next_match else start + 5000
        text = full_text[start:end].strip()
        return text[:3000]
    elif doc_type == "nist":
        # Find NIST control family identifier at line start.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        return full_text[start:start + 3000].strip()
    else:
        # Generic section search: label anywhere on a line.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        return full_text[start:start + 3000].strip()
# ── API Calls ────────────────────────────────────────────────────────
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM chat-completions API.

    :returns: 4-tuple ``(response_text, duration_seconds, error, usage)``;
        ``error`` is None on success, ``usage`` is the provider's token-usage
        dict ({} on failure).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }
    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # FIX: this branch previously returned a 3-tuple, which made the
            # caller's 4-value unpacking raise ValueError on any non-200
            # response. All exit paths now return 4 values.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network/timeout/JSON failures: surface the error string, empty usage.
        return "", time.time() - t0, str(e), {}
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    :returns: 4-tuple ``(response_text, duration_seconds, error, usage)``;
        ``error`` is None on success, ``usage`` is the API's token-usage dict
        ({} on failure). (Docstring corrected: this always returns 4 values.)
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # First content block carries the text completion.
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network/timeout/JSON failures: surface the error string, empty usage.
        return "", time.time() - t0, str(e), {}
# ── Quality Assessment ───────────────────────────────────────────────
# Fields every generated control must contain (40 score points total) and
# optional fields that earn bonus points (15 score points total).
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def assess_quality(raw_text: str) -> dict:
    """Score a control-generation response on a 0-100 scale.

    Parses the model output as JSON (stripping a surrounding markdown code
    fence and falling back to the first ``{...}`` span found), then counts
    required/bonus fields and depth metrics. On unparseable input the report
    is returned with ``json_valid`` False and score 0; on success it also
    carries the parsed object under ``parsed_data``.
    """
    report = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    # Strip a surrounding ``` fence, keeping only the inner lines.
    body = raw_text.strip()
    if body.startswith("```"):
        fence_lines = body.split("\n")
        if fence_lines[-1].startswith("```"):
            body = "\n".join(fence_lines[1:-1])
        else:
            body = "\n".join(fence_lines[1:])

    # Parse JSON; on failure, retry with the first {...} span in the text.
    try:
        parsed = json.loads(body)
    except json.JSONDecodeError:
        import re
        embedded = re.search(r'\{[\s\S]*\}', body)
        if embedded is None:
            return report
        try:
            parsed = json.loads(embedded.group())
        except json.JSONDecodeError:
            return report
    # Arrays are allowed; only the first control is assessed.
    if isinstance(parsed, list):
        parsed = parsed[0] if parsed else {}

    report["json_valid"] = True

    def _filled(value, min_chars):
        # A field counts when it is a non-trivial string or non-empty list.
        if isinstance(value, str):
            return len(value) > min_chars
        if isinstance(value, list):
            return len(value) > 0
        return False

    report["required_fields"] = sum(
        1 for key in REQUIRED_FIELDS if _filled(parsed.get(key), 2))
    report["bonus_fields"] = sum(
        1 for key in BONUS_FIELDS if _filled(parsed.get(key), 0))

    # Depth metrics: list lengths (0 when the field is not a list).
    for field, metric in (("requirements", "requirements_count"),
                          ("test_procedure", "test_procedure_count"),
                          ("evidence", "evidence_count")):
        value = parsed.get(field, [])
        report[metric] = len(value) if isinstance(value, list) else 0
    report["title_length"] = len(parsed.get("title", ""))
    report["objective_length"] = len(parsed.get("objective", ""))

    # Score: 20 for valid JSON, up to 40 for required fields, up to 15 for
    # bonus fields, up to 15 for requirements depth, up to 9 for test steps,
    # plus 1 for a substantial objective.
    total = 20
    total += (report["required_fields"] / report["required_total"]) * 40
    total += (report["bonus_fields"] / report["bonus_total"]) * 15
    total += min(report["requirements_count"], 5) * 3
    total += min(report["test_procedure_count"], 3) * 3
    if report["objective_length"] > 50:
        total += 1
    report["score"] = round(total, 1)
    report["parsed_data"] = parsed
    return report
# ── Test Cases ───────────────────────────────────────────────────────
# Five representative gap articles across sources and document types; each
# entry feeds extract_article_text() (pdf/doc_type) and build_prompt()
# (source/article/license).
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Run the benchmark over TEST_CASES against both providers and print a summary.

    For each test case: extract the article text from its PDF, build the shared
    prompt, call LiteLLM (gpt-oss-120b) and Anthropic (Claude Sonnet), score
    both responses with assess_quality(), and collect per-test results. Ends
    with aggregate score/time/cost tables and writes detailed results to
    /tmp/benchmark_llm_results.json. Exits(1) when ANTHROPIC_API_KEY is unset
    or the LiteLLM endpoint is unreachable.
    """
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)
    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()
    # Pre-check LiteLLM reachability before burning time on PDF extraction.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)
    results = []
    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']}{tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")
        # Extract article text from PDF; a leading "[" marks a not-found error.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue
        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")
        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])
        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f" ERROR: {litellm_err}")
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")
        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")
        # Compare speed and quality per test.
        print(f"\n --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")
        results.append({
            "test": f"{tc['source']}{tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })
    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")
    if not results:
        print(" Keine Ergebnisse.")
        return
    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]
    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")
    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000
    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")
    # Extrapolate for 494 gap articles
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")
    # Save full results (raw responses truncated to 500 chars each).
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,200 @@
"""Match unmatched Blue Guide controls against the English PDF."""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
try:
import fitz
except ImportError:
print("ERROR: PyMuPDF (fitz) not installed")
exit(1)
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/blue_guide_2022_en.pdf")
def normalize(s):
    """Canonicalise PDF-extracted text for substring matching.

    Drops soft hyphens and zero-width spaces, expands typographic ligatures,
    straightens curly quotes/dashes/bullets, strips control characters,
    applies NFC normalisation, and collapses all whitespace runs to single
    spaces (trimmed at both ends).
    """
    replacements = (
        ('\u00ad', ''), ('\xad', ''),          # soft hyphens
        ('\u200b', ''), ('\u00a0', ' '),       # zero-width space, nbsp
        ('\ufb01', 'fi'), ('\ufb02', 'fl'),    # ligatures
        ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
        ('\u2019', "'"), ('\u2018', "'"),      # curly single quotes
        ('\u201c', '"'), ('\u201d', '"'),      # curly double quotes
        ('\u2013', '-'), ('\u2014', '-'),      # en/em dashes
        ('\u2022', '-'), ('\u00b7', '-'),      # bullets
    )
    for needle, replacement in replacements:
        s = s.replace(needle, replacement)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# Read EN PDF: concatenate all page text, then build a normalized copy that
# the per-control snippet search below runs against.
print(f"Reading {PDF_PATH}...")
doc = fitz.open(PDF_PATH)
text = ""
for page in doc:
    text += page.get_text() + "\n"
doc.close()
print(f" {len(text):,} chars")
text_norm = normalize(text)
# Build article index for EN Blue Guide
# EN Blue Guide uses "Article N" headings (not "Artikel N")
# Each index entry is (char position in raw text, label, type).
items = []
# Find where "Article 1" starts — content before is preamble/intro
art1_match = re.search(r'\nArticle\s+1\s*\n', text)
if not art1_match:
    # Try section-based structure instead
    print(" No 'Article N' headings found, trying section-based index...")
    for m in re.finditer(r'(?:^|\n)\s*(\d+(?:\.\d+)*)\.\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
else:
    # NOTE(review): art1_pos is computed but never used afterwards.
    art1_pos = art1_match.start()
    # Article headings
    for m in re.finditer(r'(?:^|\n)\s*Article\s+(\d+[a-z]?)\s*\n', text, re.MULTILINE):
        # NOTE(review): art_num is computed but never used; the label keeps
        # the raw "\d+[a-z]?" capture instead.
        art_num = int(re.match(r'(\d+)', m.group(1)).group(1))
        items.append((m.start(), f"Article {m.group(1)}", "article"))
    # Annex markers
    for m in re.finditer(r'(?:^|\n)\s*ANNEX\s+([IVXLC]+[a-z]?)\b', text, re.MULTILINE):
        items.append((m.start(), f"Annex {m.group(1)}", "annex"))
    # Also try numbered section headings as fallback
    for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
# Sort by position and keep only the first occurrence of each label.
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
    if label not in seen:
        seen.add(label)
        unique.append((pos, label, typ))
print(f" Index: {len(unique)} sections")
if unique[:5]:
    for pos, label, typ in unique[:5]:
        print(f" {label} [{typ}] @ pos {pos}")
# Precompute normalized positions so snippet hits in text_norm can be mapped
# back to the nearest preceding heading.
index_norm = []
for pos, label, typ in unique:
    norm_pos = len(normalize(text[:pos]))
    index_norm.append((norm_pos, label, typ))
# Connect to DB (DATABASE_URL is a standard postgres:// URL; search_path pins
# the compliance schema).
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get Blue Guide controls without article_type (unmatched)
cur.execute("""
    SELECT id, control_id, title, source_original_text,
           source_citation->>'article' as existing_article,
           source_citation->>'article_type' as existing_type,
           release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'EU Blue Guide 2022'
      AND source_original_text IS NOT NULL
      AND length(source_original_text) > 50
      AND (source_citation->>'article_type' IS NULL)
    ORDER BY control_id
""")
controls = cur.fetchall()
print(f"\nUnmatched Blue Guide controls: {len(controls)}")
# Match each control: look for a snippet of its normalized original text in
# the normalized PDF, trying several start offsets and snippet lengths.
results = []
found = 0
not_found = 0
for ctrl in controls:
    ctrl_id, control_id, title, orig_text, existing_art, existing_type, state = ctrl
    orig_norm = normalize(orig_text)
    if len(orig_norm) < 30:
        not_found += 1
        continue
    matched = False
    # Longer snippets first at each offset; a hit of >= 15 chars counts.
    for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        for length in [80, 60, 40, 30, 20]:
            start = max(0, int(len(orig_norm) * start_frac))
            snippet = orig_norm[start:start+length]
            if not snippet or len(snippet) < 15:
                continue
            pos = text_norm.find(snippet)
            if pos >= 0:
                # Find section: nearest heading at or before the hit position.
                label = "Unknown"
                typ = "unknown"
                for h_pos, h_label, h_type in reversed(index_norm):
                    if h_pos <= pos:
                        label = h_label
                        typ = h_type
                        break
                results.append({
                    "ctrl_id": str(ctrl_id),
                    "control_id": control_id,
                    "source": "EU Blue Guide 2022",
                    "article_label": label,
                    "article_type": typ,
                })
                found += 1
                is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]"
                print(f" {control_id:10s}: {label:25s} [{typ:8s}]{is_active}")
                matched = True
                break
        if matched:
            break
    if not matched:
        not_found += 1
        print(f" {control_id:10s}: NOT FOUND {title[:50]}")
print(f"\n{'='*50}")
print(f"Results: {found} matched, {not_found} not found out of {len(controls)}")
# Save results
out_path = "/tmp/blue_guide_en_results.json"
with open(out_path, 'w') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Saved to {out_path}")
# Apply results to DB: merge article/article_type into source_citation; the
# IS DISTINCT FROM guard makes the update idempotent across reruns.
if results:
    print(f"\nApplying {len(results)} results to DB...")
    applied = 0
    for r in results:
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s::uuid
              AND (source_citation->>'article' IS DISTINCT FROM %s
                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (r["article_label"], r["article_type"],
              r["ctrl_id"], r["article_label"], r["article_type"]))
        if cur.rowcount > 0:
            applied += 1
    conn.commit()
    print(f" Applied: {applied} controls updated")
    # Show type distribution
    type_counts = {}
    for r in results:
        t = r["article_type"]
        type_counts[t] = type_counts.get(t, 0) + 1
    if type_counts:
        print(f"\nArticle type distribution:")
        for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
            print(f" {t:12s}: {c:5d}")
conn.close()

188
scripts/qa/gap_analysis.py Normal file
View File

@@ -0,0 +1,188 @@
"""
Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls.
For each regulation PDF:
1. Extract all articles/sections from the PDF
2. Compare with controls in the DB that reference this article
3. Report gaps (articles with no controls)
Usage:
python3 gap_analysis.py # show all gaps
python3 gap_analysis.py --source "DSGVO" # filter by source
"""
import os
import sys
import json
import re
import psycopg2
import urllib.parse
from pathlib import Path
from collections import defaultdict
# Import from pdf_qa_all
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES
)
# Only analyze sources with significant control counts (skip sources with <5 controls)
MIN_CONTROLS = 5
def main():
    """Report regulation articles that have no controls in the DB.

    Reads active controls (grouped by source and article) from
    compliance.canonical_controls, rebuilds the article index of each
    regulation PDF via the pdf_qa_all helpers, and prints per-source
    coverage plus a gap list.

    CLI: ``--source <substring>`` restricts the run to matching sources
    (case-insensitive) and disables the MIN_CONTROLS filter.

    Side effects: prints the report and writes the full JSON report to
    /tmp/gap_analysis_results.json. Requires DATABASE_URL in the env.
    """
    # Minimal hand-rolled argv parsing for the single optional flag.
    source_filter = None
    if "--source" in sys.argv:
        idx = sys.argv.index("--source")
        if idx + 1 < len(sys.argv):
            source_filter = sys.argv[idx + 1]
    # DB connection
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=parsed.hostname, port=parsed.port or 5432,
        user=parsed.username, password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public"
    )
    cur = conn.cursor()
    # Get all active controls grouped by source with their article reference.
    cur.execute("""
        SELECT source_citation->>'source' as source,
               source_citation->>'article' as article,
               source_citation->>'article_type' as article_type,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        GROUP BY 1, 2, 3
        ORDER BY 1, 2
    """)
    # Build: source -> {article -> (type, count)}
    controls_by_source = defaultdict(dict)
    for source, article, art_type, cnt in cur.fetchall():
        if article:
            controls_by_source[source][article] = (art_type or "unknown", cnt)
    total_gaps = 0
    total_articles_checked = 0
    total_covered = 0
    gap_report = []
    sources_to_check = sorted(SOURCE_FILE_MAP.keys())
    if source_filter:
        sources_to_check = [s for s in sources_to_check if source_filter.lower() in s.lower()]
    for source_name in sources_to_check:
        filename = SOURCE_FILE_MAP.get(source_name)
        if filename is None:
            continue
        controls = controls_by_source.get(source_name, {})
        # Skip low-signal sources unless the user explicitly filtered.
        if len(controls) < MIN_CONTROLS and not source_filter:
            continue
        # Read PDF and build article index
        text = read_file(filename)
        if text is None:
            continue
        doc_type = classify_doc(source_name)
        max_art = MAX_ARTICLES.get(source_name)
        # Dispatch to the index builder matching the document family.
        if doc_type == "eu_regulation":
            index = build_eu_article_index(text, max_article=max_art)
        elif doc_type == "de_law":
            index = build_de_law_index(text)
        elif doc_type == "nist":
            index = build_nist_index(text)
        elif doc_type == "owasp":
            index = build_owasp_index(text, source_name)
        else:
            index = build_generic_index(text)
        if not index:
            continue
        # Only look at substantive articles (not preamble, not annex for gap analysis)
        substantive_types = {"article", "section", "control", "requirement", "category"}
        substantive_articles = [(pos, label, typ) for pos, label, typ in index if typ in substantive_types]
        preamble_articles = [(pos, label, typ) for pos, label, typ in index if typ == "preamble"]
        annex_articles = [(pos, label, typ) for pos, label, typ in index if typ == "annex"]
        # Check which articles have controls
        covered = []
        gaps = []
        for pos, label, typ in substantive_articles:
            if label in controls:
                covered.append(label)
            else:
                gaps.append((label, typ))
        total_articles_checked += len(substantive_articles)
        total_covered += len(covered)
        total_gaps += len(gaps)
        # Count preamble/annex controls
        preamble_controls = sum(1 for a in controls if controls[a][0] == "preamble")
        annex_controls = sum(1 for a in controls if controls[a][0] == "annex")
        coverage_pct = len(covered) / len(substantive_articles) * 100 if substantive_articles else 0
        print(f"\n{'='*70}")
        print(f"{source_name}")
        print(f" PDF articles: {len(substantive_articles)} substantive, "
              f"{len(preamble_articles)} preamble, {len(annex_articles)} annex")
        print(f" DB controls: {sum(v[1] for v in controls.values())} total "
              f"({preamble_controls} preamble, {annex_controls} annex)")
        print(f" Coverage: {len(covered)}/{len(substantive_articles)} "
              f"({coverage_pct:.0f}%)")
        if gaps:
            print(f" GAPS ({len(gaps)}):")
            for label, typ in gaps[:30]:  # limit output
                print(f" - {label} [{typ}]")
            if len(gaps) > 30:
                print(f" ... and {len(gaps)-30} more")
        gap_report.append({
            "source": source_name,
            "total_articles": len(substantive_articles),
            "covered": len(covered),
            "gaps": len(gaps),
            "coverage_pct": round(coverage_pct, 1),
            "gap_articles": [{"label": l, "type": t} for l, t in gaps],
        })
    # Summary
    print(f"\n{'='*70}")
    print("GAP ANALYSIS SUMMARY")
    print(f"{'='*70}")
    # FIX: the original printed len(gap_report) PLUS the count of every source
    # that merely has a file mapped, double-counting analyzed sources.
    # One gap_report entry is appended per analyzed source, so its length
    # is the correct count.
    print(f" Sources analyzed: {len(gap_report)}")
    print(f" Total articles in PDFs: {total_articles_checked}")
    print(f" Articles with controls: {total_covered}")
    print(f" Articles WITHOUT controls: {total_gaps}")
    if total_articles_checked:
        print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")
    print(f"\n Sources with gaps:")
    for r in sorted(gap_report, key=lambda x: -x["gaps"]):
        print(f" {r['source']:45s} {r['gaps']:4d} gaps "
              f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")
    # Save report
    out_path = "/tmp/gap_analysis_results.json"
    with open(out_path, 'w') as f:
        json.dump(gap_report, f, indent=2, ensure_ascii=False)
    print(f"\n Full report saved to {out_path}")
    conn.close()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,288 @@
"""Analyze NIST OSCAL data and compare with existing controls in DB."""
import os
import re
import json
import psycopg2
import urllib.parse
from collections import defaultdict
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
# ── Load SP 800-53 Rev 5 ──
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
sp853 = json.load(f)["catalog"]
print("=" * 70)
print("NIST SP 800-53 Rev 5 — OSCAL Catalog Analysis")
print("=" * 70)
print(f" UUID: {sp853.get('uuid', '?')}")
print(f" Last Modified: {sp853.get('metadata', {}).get('last-modified', '?')}")
# Count controls
families = sp853.get("groups", [])
total_base = 0
total_enhancements = 0
total_withdrawn = 0
total_active = 0
family_stats = []
for fam in families:
fam_id = fam.get("id", "?")
fam_title = fam.get("title", "?")
controls = fam.get("controls", [])
base = 0
enhancements = 0
withdrawn = 0
for ctrl in controls:
# Check if withdrawn
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
is_withdrawn = props.get("status") == "withdrawn"
if is_withdrawn:
withdrawn += 1
else:
base += 1
# Count enhancements
for enh in ctrl.get("controls", []):
enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
if enh_props.get("status") == "withdrawn":
withdrawn += 1
else:
enhancements += 1
family_stats.append((fam_id, fam_title, base, enhancements, withdrawn))
total_base += base
total_enhancements += enhancements
total_withdrawn += withdrawn
total_active = total_base + total_enhancements
print(f"\n Families: {len(families)}")
print(f" Base Controls: {total_base}")
print(f" Enhancements: {total_enhancements}")
print(f" Withdrawn: {total_withdrawn}")
print(f" TOTAL ACTIVE: {total_active}")
print(f"\n Per Family:")
print(f" {'ID':6s} {'Title':45s} {'Base':>5s} {'Enh':>5s} {'Wdrn':>5s}")
for fam_id, title, base, enh, wdrn in family_stats:
print(f" {fam_id:6s} {title[:45]:45s} {base:5d} {enh:5d} {wdrn:5d}")
# Show example control structure
print(f"\n Example Control (AC-6 Least Privilege):")
for fam in families:
for ctrl in fam.get("controls", []):
if ctrl["id"] == "ac-6":
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
print(f" ID: {ctrl['id']}")
print(f" Label: {props.get('label', '?')}")
print(f" Title: {ctrl['title']}")
for part in ctrl.get("parts", []):
if part.get("name") == "statement":
prose = part.get("prose", "")
print(f" Statement: {prose[:150]}...")
elif part.get("name") == "guidance":
prose = part.get("prose", "")
print(f" Guidance: {prose[:150]}...")
enh_count = len(ctrl.get("controls", []))
print(f" Enhancements: {enh_count}")
links = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]
print(f" Related: {', '.join(links[:8])}...")
break
# ── Load CSF 2.0 ──
print(f"\n{'='*70}")
print("NIST CSF 2.0 — OSCAL Catalog Analysis")
print("=" * 70)
with open(os.path.join(OSCAL_DIR, "csf-2.0-catalog.json")) as f:
csf = json.load(f)["catalog"]
csf_groups = csf.get("groups", [])
csf_total = 0
for grp in csf_groups:
func_title = grp.get("title", "?")
cats = grp.get("groups", [])
subcats = 0
for cat in cats:
subcats += len(cat.get("controls", []))
csf_total += subcats
print(f" {func_title:25s}: {len(cats):2d} categories, {subcats:3d} subcategories")
print(f" TOTAL: {csf_total} subcategories")
# ── Compare with existing DB controls ──
print(f"\n{'='*70}")
print("VERGLEICH: OSCAL vs. bestehende Controls in DB")
print("=" * 70)
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get existing NIST controls
cur.execute("""
SELECT control_id, title,
source_citation->>'source' as source,
source_citation->>'article' as article,
source_citation->>'article_type' as art_type,
release_state
FROM compliance.canonical_controls
WHERE source_citation->>'source' LIKE 'NIST%%'
ORDER BY source_citation->>'source', control_id
""")
nist_controls = cur.fetchall()
# Group by source
by_source = defaultdict(list)
for ctrl in nist_controls:
by_source[ctrl[2]].append(ctrl)
print(f"\n Bestehende NIST Controls in DB:")
for src in sorted(by_source.keys()):
ctrls = by_source[src]
active = sum(1 for c in ctrls if c[5] not in ('duplicate', 'too_close'))
with_article = sum(1 for c in ctrls if c[3])
print(f" {src:40s}: {len(ctrls):4d} total, {active:4d} active, {with_article:4d} mit article")
# For SP 800-53: which control families do we have?
sp853_existing = [c for c in nist_controls if 'SP 800-53' in (c[2] or '')]
existing_families = set()
existing_articles = set()
for ctrl in sp853_existing:
article = ctrl[3] or ""
if article:
# Extract family prefix (e.g., "AC-6" → "AC")
m = re.match(r'([A-Z]{2})-', article)
if m:
existing_families.add(m.group(1))
existing_articles.add(article)
print(f"\n SP 800-53 in DB:")
print(f" Total: {len(sp853_existing)}")
print(f" Families covered: {len(existing_families)}")
print(f" Unique articles: {len(existing_articles)}")
print(f" Families: {', '.join(sorted(existing_families))}")
# Compare: which OSCAL controls are NOT in our DB?
oscal_controls = {} # id → (label, title, statement)
for fam in families:
for ctrl in fam.get("controls", []):
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
if props.get("status") == "withdrawn":
continue
label = props.get("label", ctrl["id"].upper())
statement = ""
guidance = ""
for part in ctrl.get("parts", []):
if part.get("name") == "statement":
statement = part.get("prose", "")
# Also check sub-items
for sub in part.get("parts", []):
statement += " " + sub.get("prose", "")
elif part.get("name") == "guidance":
guidance = part.get("prose", "")
oscal_controls[label] = (ctrl["title"], statement[:500], guidance[:500])
# Enhancements
for enh in ctrl.get("controls", []):
enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
if enh_props.get("status") == "withdrawn":
continue
enh_label = enh_props.get("label", enh["id"].upper())
enh_statement = ""
enh_guidance = ""
for part in enh.get("parts", []):
if part.get("name") == "statement":
enh_statement = part.get("prose", "")
for sub in part.get("parts", []):
enh_statement += " " + sub.get("prose", "")
elif part.get("name") == "guidance":
enh_guidance = part.get("prose", "")
oscal_controls[enh_label] = (enh["title"], enh_statement[:500], enh_guidance[:500])
print(f"\n OSCAL SP 800-53 aktive Controls: {len(oscal_controls)}")
# Find missing: in OSCAL but not in DB
missing = []
covered = []
for label in sorted(oscal_controls.keys()):
if label in existing_articles:
covered.append(label)
else:
missing.append(label)
print(f" In DB vorhanden: {len(covered)}")
print(f" FEHLEND in DB: {len(missing)}")
# Missing by family
missing_by_fam = defaultdict(list)
for label in missing:
fam = label.split("-")[0]
missing_by_fam[fam].append(label)
print(f"\n Fehlende Controls nach Family:")
for fam in sorted(missing_by_fam.keys()):
ctrls = missing_by_fam[fam]
examples = ", ".join(ctrls[:5])
more = f" +{len(ctrls)-5}" if len(ctrls) > 5 else ""
print(f" {fam:4s}: {len(ctrls):3d} fehlend ({examples}{more})")
# Also check CSF 2.0
print(f"\n{'='*70}")
print("NIST CSF 2.0 — Vergleich mit DB")
print("=" * 70)
cur.execute("""
SELECT count(*), count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
FROM compliance.canonical_controls
WHERE source_citation->>'source' LIKE 'NIST Cybersecurity%%'
""")
csf_row = cur.fetchone()
print(f" CSF Controls in DB: {csf_row[0]} total, {csf_row[1]} active")
csf_subcats = 0
csf_ids = []
for grp in csf_groups:
for cat in grp.get("groups", []):
for subcat in cat.get("controls", []):
csf_subcats += 1
props = {p["name"]: p.get("value", "") for p in subcat.get("props", [])}
csf_ids.append(props.get("label", subcat["id"]))
print(f" CSF 2.0 OSCAL Subcategories: {csf_subcats}")
print(f" Beispiele: {', '.join(csf_ids[:10])}")
# ── Summary / Potential ──
print(f"\n{'='*70}")
print("POTENTIAL: Was OSCAL uns bringt")
print("=" * 70)
print(f"""
SP 800-53 Rev 5:
- {len(missing)} neue Controls möglich (aktuell {len(covered)} in DB)
- Jeder Control hat: Statement + Guidance + Assessment-Methoden
- Cross-References zwischen Controls (für Mapping)
- Maschinenlesbare Parameter (ODP)
- Public Domain — keine Lizenzprobleme
CSF 2.0:
- {csf_subcats} Subcategories als Compliance-Controls
- 6 Functions (Govern, Identify, Protect, Detect, Respond, Recover)
- Direkte Mappings zu SP 800-53 Controls
Nächste Schritte:
1. Fehlende SP 800-53 Controls importieren ({len(missing)} Controls)
2. Statement-Text als source_original_text verwenden
3. article_type='control', article=Label (z.B. 'AC-6')
4. CSF 2.0 als eigene Regulation importieren
5. Cross-References als Grundlage für Control-Mappings nutzen
""")
conn.close()

289
scripts/qa/oscal_import.py Normal file
View File

@@ -0,0 +1,289 @@
"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
import os
import re
import json
import uuid
import psycopg2
import urllib.parse
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
sp853 = json.load(f)["catalog"]
# ── Extract all OSCAL controls ──
def extract_controls(catalog):
    """Collect every active base control and enhancement from an OSCAL catalog.

    Walks catalog.groups (families) → family.controls (base controls) →
    control.controls (enhancements) and returns the flat dicts produced by
    extract_single(), skipping withdrawn entries (extract_single returns None).
    """
    collected = []
    for family in catalog.get("groups", []):
        family_name = family.get("title", "")
        for control in family.get("controls", []):
            parsed = extract_single(control, family_name)
            if parsed is not None:
                collected.append(parsed)
            # Enhancements are nested one level below their base control.
            for enhancement in control.get("controls", []):
                parsed = extract_single(enhancement, family_name)
                if parsed is not None:
                    collected.append(parsed)
    return collected
def extract_single(ctrl, family_title):
    """Flatten one OSCAL control (or enhancement) into a plain dict.

    Returns None for withdrawn controls. The statement text is assembled
    from the control's "statement" part plus its labelled sub- and
    sub-sub-items; guidance, related-control links and parameters are
    carried along. ``is_enhancement`` is inferred from a "(" in the label.
    """
    prop_map = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if prop_map.get("status") == "withdrawn":
        return None

    label = prop_map.get("label", ctrl["id"].upper())
    title = ctrl.get("title", "")

    def last_label(node):
        # Mirror the original semantics: the LAST "label" prop wins.
        found = ""
        for prop in node.get("props", []):
            if prop["name"] == "label":
                found = prop.get("value", "")
        return found

    # Main requirement text; a later "statement" part replaces an earlier one.
    statement = ""
    for part in ctrl.get("parts", []):
        if part.get("name") != "statement":
            continue
        statement = part.get("prose", "")
        # Sub-items (a., b., c., ...)
        for sub in part.get("parts", []):
            sub_prose = sub.get("prose", "")
            sub_lab = last_label(sub)
            if sub_lab:
                statement += f"\n{sub_lab} {sub_prose}"
            elif sub_prose:
                statement += f"\n{sub_prose}"
            # Nested sub-sub-items, indented two spaces.
            for subsub in sub.get("parts", []):
                ss_prose = subsub.get("prose", "")
                ss_lab = last_label(subsub)
                if ss_lab:
                    statement += f"\n  {ss_lab} {ss_prose}"
                elif ss_prose:
                    statement += f"\n  {ss_prose}"

    # Guidance: last "guidance" part wins (scan from the end).
    guidance = ""
    for part in reversed(ctrl.get("parts", [])):
        if part.get("name") == "guidance":
            guidance = part.get("prose", "")
            break

    # Cross-references to related controls.
    related = [link["href"].lstrip("#")
               for link in ctrl.get("links", [])
               if link.get("rel") == "related"]

    # Organization-defined parameters (ODPs).
    params = []
    for p in ctrl.get("params", []):
        guideline_text = "".join(g.get("prose", "") for g in p.get("guidelines", []))
        choices = list(p["select"].get("choice", [])) if "select" in p else []
        params.append({
            "id": p.get("id", ""),
            "label": p.get("label", ""),
            "guidelines": guideline_text,
            "choices": choices,
        })

    return {
        "label": label,
        "title": title,
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        "is_enhancement": "(" in label,
    }
# Flatten the catalog into one list of control dicts (base + enhancements).
all_oscal = extract_controls(sp853)
print(f"Total OSCAL active controls: {len(all_oscal)}")
# ── Normalize label for comparison ──
def normalize_label(label):
    """Canonicalize an SP 800-53 label for comparison.

    Strips zero padding after the dash and inside parentheses and
    uppercases, e.g. "ac-06(01)" → "AC-6(1)".
    """
    without_dash_pad = re.sub(r'-0+(\d)', r'-\1', label)
    without_paren_pad = re.sub(r'\(0+(\d+)\)', r'(\1)', without_dash_pad)
    return without_paren_pad.upper()
# ── DB connection ──
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get existing labels
cur.execute("""
SELECT DISTINCT source_citation->>'article' as article
FROM compliance.canonical_controls
WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
AND source_citation->>'article' IS NOT NULL
""")
existing_labels = set(normalize_label(r[0]) for r in cur.fetchall())
print(f"Existing DB labels (normalized): {len(existing_labels)}")
# Get highest control_id numbers per prefix
cur.execute("""
SELECT control_id FROM compliance.canonical_controls
WHERE control_id ~ '^[A-Z]+-[0-9]+$'
ORDER BY control_id
""")
existing_ids = set(r[0] for r in cur.fetchall())
# Find next available ID per prefix
def next_control_id(prefix, existing):
    """Return the highest numeric suffix already used for ``prefix`` (0 if none).

    Despite the name, this returns the current maximum, not the next id:
    callers increment the returned counter before minting a new id, so the
    first generated id becomes PREFIX-(max+1).
    """
    id_pattern = re.compile(rf'^{prefix}-(\d+)$')
    highest = 0
    for candidate in existing:
        match = id_pattern.match(candidate)
        if match:
            highest = max(highest, int(match.group(1)))
    return highest
# Map NIST families to our control_id prefixes
FAMILY_PREFIX = {
    "Access Control": "ACC",
    "Awareness and Training": "GOV",
    "Audit and Accountability": "LOG",
    "Assessment, Authorization, and Monitoring": "GOV",
    "Configuration Management": "COMP",
    "Contingency Planning": "INC",
    "Identification and Authentication": "AUTH",
    "Incident Response": "INC",
    "Maintenance": "COMP",
    "Media Protection": "DATA",
    "Physical and Environmental Protection": "SEC",
    "Planning": "GOV",
    "Program Management": "GOV",
    "Personnel Security": "GOV",
    "Personally Identifiable Information Processing and Transparency": "DATA",
    "Risk Assessment": "GOV",
    "System and Services Acquisition": "COMP",
    "System and Communications Protection": "NET",
    "System and Information Integrity": "SEC",
    "Supply Chain Risk Management": "COMP",
}
# Track next IDs
# Seed each prefix counter with the highest number already in the DB.
prefix_counters = {}
for prefix in set(FAMILY_PREFIX.values()):
    prefix_counters[prefix] = next_control_id(prefix, existing_ids)
print(f"Starting counters: {prefix_counters}")
# ── Filter to only new controls ──
to_import = []
for ctrl in all_oscal:
    norm = normalize_label(ctrl["label"])
    if norm not in existing_labels:
        to_import.append(ctrl)
print(f"\nControls to import: {len(to_import)}")
# ── Import ──
imported = 0
for ctrl in to_import:
    prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
    prefix_counters[prefix] += 1
    control_id = f"{prefix}-{prefix_counters[prefix]:04d}"
    # Build title: "NIST {label}: {title}"
    title = f"NIST {ctrl['label']}: {ctrl['title']}"
    # source_original_text = statement (the official requirement text)
    source_text = ctrl["statement"]
    if not source_text:
        source_text = ctrl["guidance"][:500] if ctrl["guidance"] else ctrl["title"]
    # objective = guidance text
    objective = ctrl["guidance"][:2000] if ctrl["guidance"] else ""
    # source_citation
    citation = {
        "source": "NIST SP 800-53 Rev. 5",
        "article": ctrl["label"],
        "article_type": "control",
        "source_type": "standard",
        "oscal_import": True,
    }
    if ctrl["related"]:
        citation["related_controls"] = ctrl["related"][:20]
    if ctrl["params"]:
        citation["parameters"] = [{"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]]
    # NOTE(review): loop-invariant constant — could be hoisted above the loop.
    FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'
    new_id = str(uuid.uuid4())
    cur.execute("""
        INSERT INTO compliance.canonical_controls
        (id, framework_id, control_id, title, objective, rationale,
         severity, source_original_text,
         source_citation, pipeline_version, release_state,
         generation_strategy, category)
        VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
    """, (
        new_id,
        FRAMEWORK_ID,
        control_id,
        title[:500],
        objective[:5000],
        source_text[:10000],
        json.dumps(citation, ensure_ascii=False),
        ctrl["family"],
    ))
    imported += 1
# Single commit — the whole import is one transaction.
conn.commit()
print(f"\nImported: {imported} new controls")
# ── Verify ──
cur.execute("""
    SELECT count(*),
           count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
""")
total, active = cur.fetchone()
print(f"\nSP 800-53 after import: {total} total, {active} active")
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\nDB release_state gesamt:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")
# ── Import stats by family ──
fam_counts = {}
for ctrl in to_import:
    fam = ctrl["family"]
    fam_counts[fam] = fam_counts.get(fam, 0) + 1
print(f"\nImportiert nach Family:")
for fam in sorted(fam_counts.keys()):
    print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")
conn.close()

274
scripts/qa/owasp_cleanup.py Normal file
View File

@@ -0,0 +1,274 @@
"""OWASP Cleanup:
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
2. Fix 47 wrong source attributions (found in different OWASP PDF)
"""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
try:
import fitz
except ImportError:
print("ERROR: PyMuPDF not installed")
exit(1)
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
def normalize(s):
    """Collapse PDF-extracted text into plain, single-spaced comparable text.

    Drops soft hyphens and zero-width characters, expands common ligatures,
    straightens smart quotes/dashes/bullets, strips control bytes, applies
    Unicode NFC, and squeezes all whitespace runs to single spaces.
    """
    replacements = (
        ('\u00ad', ''), ('\xad', ''),
        ('\u200b', ''), ('\u00a0', ' '),
        ('\ufb01', 'fi'), ('\ufb02', 'fl'),
        ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
        ('\u2019', "'"), ('\u2018', "'"),
        ('\u201c', '"'), ('\u201d', '"'),
        ('\u2013', '-'), ('\u2014', '-'),
        ('\u2022', '-'), ('\u00b7', '-'),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# Load OWASP PDFs
# Maps the DB's source name to the local PDF file name.
OWASP_PDFS = {
    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
}
# Extract and normalize the full text of each available PDF (missing files
# are silently skipped and simply won't match anything later).
pdf_norms = {}
for name, filename in OWASP_PDFS.items():
    path = os.path.join(PDF_DIR, filename)
    if not os.path.exists(path):
        continue
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text() + "\n"
    doc.close()
    pdf_norms[name] = normalize(text)
def build_owasp_index(text_norm, source_name):
    """Build a heading index for one OWASP document from normalized text.

    Picks the identifier pattern from the source name and returns a list of
    (position, label, type) tuples sorted by position, keeping only the
    first occurrence of each label. Unknown sources yield an empty list.
    """
    # Branch order matters: the API Top 10 name also contains "Top 10".
    if "Top 10" in source_name and "API" not in source_name:
        pattern, kind = r'(A\d{2}:\d{4})', "category"
    elif "API" in source_name:
        pattern, kind = r'(API\d+:\d{4})', "category"
    elif "ASVS" in source_name:
        pattern, kind = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
    elif "MASVS" in source_name:
        pattern, kind = r'(MASVS-[A-Z]+-\d+)', "requirement"
    else:
        return []

    hits = sorted(
        ((m.start(), m.group(1), kind) for m in re.finditer(pattern, text_norm)),
        key=lambda h: h[0],
    )
    unique = []
    seen_labels = set()
    for pos, label, typ in hits:
        if label in seen_labels:
            continue
        seen_labels.add(label)
        unique.append((pos, label, typ))
    return unique
# Precompute a heading index per loaded PDF for find_in_pdf().
pdf_indexes = {}
for name, norm in pdf_norms.items():
    pdf_indexes[name] = build_owasp_index(norm, name)
def find_in_pdf(orig_text, source_name):
    """Locate a control's text inside one OWASP PDF.

    Probes several snippet offsets/lengths of the normalized control text
    against the normalized PDF text; on a hit, returns (label, type) of the
    nearest preceding heading from the precomputed index ("Unknown" when no
    heading precedes the hit). Returns None when nothing matches or the
    control text is too short to match reliably.
    """
    haystack = pdf_norms.get(source_name)
    if not haystack:
        return None
    needle_src = normalize(orig_text)
    if len(needle_src) < 20:
        return None
    headings = pdf_indexes.get(source_name, [])

    def heading_at(match_pos):
        # Walk headings backwards; the last one starting at/before the hit wins.
        for h_pos, h_label, h_type in reversed(headings):
            if h_pos <= match_pos:
                return (h_label, h_type)
        return ("Unknown", "unknown")

    # Probe a few relative offsets, longest snippets first at each offset.
    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        begin = max(0, int(len(needle_src) * frac))
        for size in (80, 60, 40, 30, 20):
            probe = needle_src[begin:begin + size]
            if len(probe) < 15:
                continue
            hit = haystack.find(probe)
            if hit >= 0:
                return heading_at(hit)
    return None
# DB
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# ═══════════════════════════════════════════════════════════════
# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
# ═══════════════════════════════════════════════════════════════
print("=" * 60)
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
print("=" * 60)
# Candidates: active Top 10 controls whose text was never located in a PDF
# (article_type still NULL).
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_unmatched = cur.fetchall()
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")
# Separate: found in other OWASP PDF vs not found anywhere
to_mark_dup = []
to_fix_source = []
for ctrl in top10_unmatched:
    uid, cid, title, text, state = ctrl
    # Check if found in another OWASP PDF
    found_in = None
    found_result = None
    for other_src in OWASP_PDFS:
        if other_src == 'OWASP Top 10 (2021)':
            continue
        result = find_in_pdf(text, other_src)
        if result:
            found_in = other_src
            found_result = result
            break
    if found_in:
        to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1]))
    else:
        to_mark_dup.append((uid, cid))
print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")
# Mark as duplicate
dup_marked = 0
for uid, cid in to_mark_dup:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET release_state = 'duplicate'
        WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
    """, (uid,))
    if cur.rowcount > 0:
        dup_marked += 1
print(f" Marked as duplicate: {dup_marked}")
# ═══════════════════════════════════════════════════════════════
# STEP 2: Fix wrong source attributions across ALL OWASP sources
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("STEP 2: Fix wrong OWASP source attributions")
print("=" * 60)
all_fixes = list(to_fix_source)  # Start with Top 10 fixes
# Also check ASVS, SAMM, MASVS
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
    """, (source,))
    controls = cur.fetchall()
    for ctrl in controls:
        uid, cid, title, text = ctrl
        # Try own PDF first
        result = find_in_pdf(text, source)
        if result:
            # Found in own PDF! Update article info
            # (jsonb || merge only touches the article fields; the WHERE
            # clause keeps rowcount meaningful by skipping no-op updates).
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = source_citation ||
                    jsonb_build_object('article', %s, 'article_type', %s)
                WHERE id = %s
                  AND (source_citation->>'article' IS DISTINCT FROM %s
                       OR source_citation->>'article_type' IS DISTINCT FROM %s)
            """, (result[0], result[1], uid, result[0], result[1]))
            continue
        # Try other OWASP PDFs
        for other_src in OWASP_PDFS:
            if other_src == source:
                continue
            result = find_in_pdf(text, other_src)
            if result:
                all_fixes.append((uid, cid, other_src, result[0], result[1]))
                break
print(f" Total wrong-source controls found: {len(all_fixes)}")
# Apply source fixes
fixed = 0
for uid, cid, correct_source, label, typ in all_fixes:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET source_citation = source_citation ||
            jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
        WHERE id = %s
    """, (correct_source, label, typ, uid,))
    if cur.rowcount > 0:
        fixed += 1
        print(f" {cid:10s} → {correct_source} / {label} [{typ}]")
print(f" Fixed: {fixed} controls")
# Single commit for both steps.
conn.commit()
# ═══════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("ZUSAMMENFASSUNG")
print("=" * 60)
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
print(f" Wrong source attribution → fixed: {fixed}")
# Final counts
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\n DB release_state nach Cleanup:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
active = cur.fetchone()[0]
print(f"\n Aktive Controls: {active}")
conn.close()

View File

@@ -0,0 +1,316 @@
"""Match unmatched OWASP ASVS/SAMM/MASVS controls against GitHub Markdown sources."""
import os
import re
import unicodedata
import psycopg2
import urllib.parse
from pathlib import Path
GITHUB_DIR = Path(os.path.expanduser("~/rag-ingestion/owasp-github"))
def normalize(s):
    """Collapse extracted text into plain, single-spaced comparable text.

    Drops soft hyphens and zero-width characters, expands common ligatures,
    straightens smart quotes/dashes/bullets, strips control bytes, applies
    Unicode NFC, and squeezes all whitespace runs to single spaces.
    """
    replacements = (
        ('\u00ad', ''), ('\xad', ''),
        ('\u200b', ''), ('\u00a0', ' '),
        ('\ufb01', 'fi'), ('\ufb02', 'fl'),
        ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
        ('\u2019', "'"), ('\u2018', "'"),
        ('\u201c', '"'), ('\u201d', '"'),
        ('\u2013', '-'), ('\u2014', '-'),
        ('\u2022', '-'), ('\u00b7', '-'),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# ── Load Markdown sources ──
def load_markdown_dir(path, pattern="*.md"):
    """Load all files under ``path`` matching ``pattern`` (non-recursive).

    Returns {filename: text} in sorted filename order. Files are decoded as
    UTF-8 with undecodable bytes replaced, so decoding never raises.

    FIX: the original used a bare ``except: pass``, which also swallows
    KeyboardInterrupt/SystemExit and hides real bugs. Only OS-level read
    errors (permissions, broken symlinks, ...) are skipped now, keeping the
    best-effort behavior.
    """
    texts = {}
    for f in sorted(path.glob(pattern)):
        try:
            texts[f.name] = f.read_text(encoding='utf-8', errors='replace')
        except OSError:
            continue  # unreadable file — skip, best effort
    return texts
# ASVS 4.0 — V-files contain requirements
asvs_dir = GITHUB_DIR / "ASVS" / "4.0" / "en"
asvs_files = load_markdown_dir(asvs_dir)
asvs_full = "\n".join(asvs_files.values())
asvs_norm = normalize(asvs_full)
print(f"ASVS 4.0 Markdown: {len(asvs_files)} files, {len(asvs_full):,} chars")
# SAMM core — YAML + Markdown
samm_dir = GITHUB_DIR / "samm-core"
samm_texts = {}
# NOTE(review): the bare excepts below swallow every exception, including
# KeyboardInterrupt — consider narrowing to OSError.
for f in samm_dir.rglob("*.yml"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
for f in samm_dir.rglob("*.md"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
samm_full = "\n".join(samm_texts.values())
samm_norm = normalize(samm_full)
print(f"SAMM 2.0 source: {len(samm_texts)} files, {len(samm_full):,} chars")
# MASVS — control markdown files
masvs_dir = GITHUB_DIR / "masvs"
masvs_files = {}
for f in masvs_dir.rglob("*.md"):
    try:
        masvs_files[str(f.relative_to(masvs_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
masvs_full = "\n".join(masvs_files.values())
masvs_norm = normalize(masvs_full)
print(f"MASVS 2.0 source: {len(masvs_files)} files, {len(masvs_full):,} chars")
# API Security
api_dir = GITHUB_DIR / "api-security"
api_files = {}
for f in api_dir.rglob("*.md"):
    try:
        api_files[str(f.relative_to(api_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
api_full = "\n".join(api_files.values())
api_norm = normalize(api_full)
print(f"API Security source: {len(api_files)} files, {len(api_full):,} chars")
# Source → (normalized_text, index_builder)
SOURCE_GITHUB = {
    "OWASP ASVS 4.0": asvs_norm,
    "OWASP SAMM 2.0": samm_norm,
    "OWASP MASVS 2.0": masvs_norm,
    "OWASP API Security Top 10 (2023)": api_norm,
}
# Build indexes for each source
def build_asvs_index(text):
items = []
for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text):
items.append((m.start(), m.group(1), "requirement"))
items.sort(key=lambda x: x[0])
seen = set()
return [(p, l, t) for p, l, t in items if l not in seen and not seen.add(l)]
def build_samm_index(text):
    """Index SAMM section numbers and practice-name headings by position.

    Returns a position-sorted list of (offset, label, "section"),
    keeping only the first occurrence of each label.
    """
    items = []
    # SAMM practices have names like "Strategy & Metrics", sections numbered
    for m in re.finditer(r'(?:^|\s)(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
    # Also find practice identifiers
    for m in re.finditer(r'((?:Strategy|Education|Policy|Threat|Security Requirements|Secure Architecture|'
                         r'Secure Build|Secure Deployment|Defect Management|Environment Management|'
                         r'Incident Management|Requirements Testing|Security Testing|'
                         r'Design Review|Implementation Review|Operations Management)'
                         r'[^.\n]{0,30})', text):
        items.append((m.start(), m.group(1)[:50], "section"))
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of the previous comprehension that relied
    # on the side effect of set.add() inside the filter condition.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
def build_masvs_index(text):
    """Index MASVS requirement IDs (e.g. 'MASVS-STORAGE-1') by position.

    Returns a position-sorted list of (offset, label, "requirement"),
    keeping only the first occurrence of each label.
    """
    items = []
    for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text):
        items.append((m.start(), m.group(1), "requirement"))
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of the previous comprehension that relied
    # on the side effect of set.add() inside the filter condition.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
def build_api_index(text):
    """Index OWASP API Top 10 category IDs (e.g. 'API1:2023') by position.

    Returns a position-sorted list of (offset, label, "category"),
    keeping only the first occurrence of each label.
    """
    items = []
    for m in re.finditer(r'(API\d+:\d{4})', text):
        items.append((m.start(), m.group(1), "category"))
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of the previous comprehension that relied
    # on the side effect of set.add() inside the filter condition.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
# Maps each source name to the function that indexes its normalized corpus.
SOURCE_INDEX_BUILDERS = {
    "OWASP ASVS 4.0": build_asvs_index,
    "OWASP SAMM 2.0": build_samm_index,
    "OWASP MASVS 2.0": build_masvs_index,
    "OWASP API Security Top 10 (2023)": build_api_index,
}
# Build all indexes on normalized text
source_indexes = {}
for name, norm_text in SOURCE_GITHUB.items():
    builder = SOURCE_INDEX_BUILDERS[name]
    idx = builder(norm_text)
    source_indexes[name] = idx
    print(f" {name}: {len(idx)} index entries")
def find_text(orig_text, source_name):
    """Find control text in GitHub source. Returns (label, type) or None."""
    haystack = SOURCE_GITHUB.get(source_name)
    if not haystack:
        return None
    index_entries = source_indexes.get(source_name, [])
    needle = normalize(orig_text)
    if len(needle) < 20:
        return None
    # Probe several offsets and snippet lengths; mid-text snippets are tried
    # first since they are less likely to hit boilerplate than the start.
    for frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        for snip_len in [80, 60, 40, 30, 20]:
            begin = max(0, int(len(needle) * frac))
            probe = needle[begin:begin + snip_len]
            if not probe or len(probe) < 15:
                continue
            hit = haystack.find(probe)
            if hit < 0:
                continue
            # Walk the index backwards to find the nearest heading at or
            # before the hit position.
            label, typ = "Unknown", "unknown"
            for entry_pos, entry_label, entry_type in reversed(index_entries):
                if entry_pos <= hit:
                    label, typ = entry_label, entry_type
                    break
            return (label, typ)
    return None
def find_in_any_github(orig_text, exclude_source=None):
    """Try all GitHub sources."""
    for source_name in SOURCE_GITHUB:
        if source_name == exclude_source:
            continue
        hit = find_text(orig_text, source_name)
        if hit is not None:
            return (source_name, hit[0], hit[1])
    return None
# ── DB ──
# DATABASE_URL is required; a KeyError here is an intentional hard failure.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# ── Process each OWASP source ──
# For every active control that still lacks an article reference, try to
# locate its original text in the matching GitHub corpus first, then in the
# other corpora (cross-source).
total_matched = 0
total_cross = 0
total_not_found = 0
all_updates = []  # (uuid, control_id, source_name, article_label, article_type)
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP MASVS 2.0', 'OWASP API Security Top 10 (2023)']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text, release_state
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        ORDER BY control_id
    """, (source,))
    controls = cur.fetchall()
    if not controls:
        continue
    print(f"\n{'='*60}")
    print(f"{source} — {len(controls)} unmatched active")
    print(f"{'='*60}")
    matched = 0
    cross_matched = 0
    not_found = 0
    for ctrl in controls:
        uid, cid, title, text, state = ctrl
        # Try own GitHub source
        result = find_text(text, source)
        if result:
            matched += 1
            total_matched += 1
            all_updates.append((uid, cid, source, result[0], result[1]))
            print(f" {cid:10s} → {result[0]:30s} [{result[1]}]")
            continue
        # Try other GitHub sources
        cross = find_in_any_github(text, exclude_source=source)
        if cross:
            cross_matched += 1
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}] (CROSS)")
            continue
        not_found += 1
        total_not_found += 1
    print(f"\n Own source matched: {matched}")
    print(f" Cross-source: {cross_matched}")
    print(f" Not found: {not_found}")
# ── Also try OWASP Top 10 remaining unmatched (34 active left after dup marking) ──
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_remaining = cur.fetchall()
if top10_remaining:
    print(f"\n{'='*60}")
    print(f"OWASP Top 10 (2021) — {len(top10_remaining)} remaining unmatched active")
    print(f"{'='*60}")
    for ctrl in top10_remaining:
        uid, cid, title, text, state = ctrl
        # Top 10 has no corpus of its own in SOURCE_GITHUB, so only
        # cross-source lookup is possible here.
        cross = find_in_any_github(text)
        if cross:
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}]")
        else:
            total_not_found += 1
# ── Summary ──
print(f"\n{'='*60}")
print(f"ZUSAMMENFASSUNG")
print(f"{'='*60}")
print(f" Matched in eigener GitHub-Quelle: {total_matched}")
print(f" Cross-source matched: {total_cross}")
print(f" Nicht gefunden: {total_not_found}")
print(f" Total Updates: {len(all_updates)}")
# ── Apply updates ──
if all_updates:
    print(f"\nApplying {len(all_updates)} updates to DB...")
    applied = 0
    for uid, cid, correct_source, label, typ in all_updates:
        # Update article + article_type, and fix source if cross-matched
        # NOTE(review): only article/article_type are written here;
        # correct_source is carried in the tuple but never persisted —
        # confirm whether cross-matched sources should also be updated.
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s
              AND (source_citation->>'article' IS DISTINCT FROM %s
                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (label, typ, uid, label, typ))
        # rowcount == 0 means the row already carried the same values.
        if cur.rowcount > 0:
            applied += 1
    conn.commit()
    print(f" Applied: {applied} controls updated")
    # Type distribution
    type_counts = {}
    for _, _, _, _, typ in all_updates:
        type_counts[typ] = type_counts.get(typ, 0) + 1
    print(f"\n Article type distribution:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t:12s}: {c:5d}")
conn.close()

View File

@@ -0,0 +1,357 @@
"""Phase 5: Source Normalization + Duplicate Hard Delete.
Steps:
1. OSCAL controls: add source_regulation to generation_metadata
2. Fix 20 v3 controls with NULL source (tag as manually_reviewed)
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
6. Clean up canonical_processed_chunks generated_control_ids
Usage:
export DATABASE_URL='postgresql://...'
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
"""
import os
import sys
import json
import psycopg2
import urllib.parse
DRY_RUN = "--dry-run" in sys.argv
STEP_ONLY = None
for arg in sys.argv:
if arg.startswith("--step"):
idx = sys.argv.index(arg)
if idx + 1 < len(sys.argv):
STEP_ONLY = int(sys.argv[idx + 1])
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
def should_run(step):
    """Return True when *step* is selected, or when no --step filter is set."""
    if STEP_ONLY is None:
        return True
    return step == STEP_ONLY
# ══════════════════════════════════════════════════════════════════
# Step 1: OSCAL controls — add source_regulation to generation_metadata
# ══════════════════════════════════════════════════════════════════
if should_run(1):
    print("=" * 70)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print("=" * 70)
    # Count first so a dry run can report without mutating anything.
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls
        WHERE generation_strategy = 'oscal_import'
          AND (generation_metadata->>'source_regulation' IS NULL
               OR generation_metadata->>'source_regulation' = '')
    """)
    count = cur.fetchone()[0]
    print(f" OSCAL controls without source_regulation: {count}")
    if count > 0:
        if DRY_RUN:
            print(f" [DRY RUN] Would update {count} controls")
        else:
            # Assumes every OSCAL import in this DB stems from NIST SP
            # 800-53r5 — TODO confirm before re-running against other
            # OSCAL catalogs.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                    || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
                WHERE generation_strategy = 'oscal_import'
                  AND (generation_metadata->>'source_regulation' IS NULL
                       OR generation_metadata->>'source_regulation' = '')
            """)
            print(f" Updated: {cur.rowcount}")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 2: v3 controls with NULL source — tag source as best guess
# ══════════════════════════════════════════════════════════════════
if should_run(2):
    print("=" * 70)
    print("STEP 2: Fix v3 controls with NULL source")
    print("=" * 70)
    # These 20 controls are v3/document_grouped with no source or regulation.
    # Based on title analysis, they cover:
    # - Data protection/privacy topics (DSGVO-adjacent)
    # - Software security (OWASP/NIST-adjacent)
    # - Mobile security (OWASP MASVS-adjacent)
    # Mark them as 'needs_review' and add a flag.
    cur.execute("""
        SELECT id, control_id, title
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NULL
          AND pipeline_version = 3
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    v3_null = cur.fetchall()
    print(f" v3 controls with NULL source: {len(v3_null)}")
    if v3_null:
        if DRY_RUN:
            print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review")
        else:
            # One UPDATE per row keeps it simple; the volume is tiny (~20).
            for ctrl_id_uuid, control_id, title in v3_null:
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET release_state = 'needs_review',
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                            || '{"missing_source": true}'::jsonb
                    WHERE id = %s
                """, (ctrl_id_uuid,))
            print(f" Marked {len(v3_null)} as needs_review with missing_source flag")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 3: Fix empty-string source (DATA-631)
# ══════════════════════════════════════════════════════════════════
if should_run(3):
    print("=" * 70)
    print("STEP 3: Fix empty-string source")
    print("=" * 70)
    cur.execute("""
        SELECT id, control_id, title,
               generation_metadata->>'source_regulation' as reg
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = ''
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    empty_src = cur.fetchall()
    print(f" Controls with empty source: {len(empty_src)}")
    for ctrl_id_uuid, control_id, title, reg in empty_src:
        print(f" {control_id} | reg={reg} | {title[:60]}")
        # Only at_tkg is known to occur; anything else gets a visible
        # "Unbekannt (<reg>)" marker instead of silently staying empty.
        if reg == 'at_tkg':
            new_source = 'Telekommunikationsgesetz Oesterreich'
        else:
            new_source = f"Unbekannt ({reg})"
        if DRY_RUN:
            print(f" [DRY RUN] Would set source='{new_source}'")
        else:
            # json.dumps yields a properly quoted jsonb string literal
            # for jsonb_set.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation, '{source}', %s::jsonb
                )
                WHERE id = %s
            """, (json.dumps(new_source), ctrl_id_uuid))
            print(f" Set source='{new_source}'")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 4: Fix OWASP cross-source misattributions
# ══════════════════════════════════════════════════════════════════
if should_run(4):
    print("=" * 70)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print("=" * 70)
    # Controls where source_citation.source doesn't match the regulation_code
    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }
    # Strategy: Move controls to the regulation_code that matches their actual source
    # i.e., if a control has source='OWASP ASVS 4.0' but reg='owasp_top10_2021',
    # update the reg to 'owasp_asvs'
    SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()}
    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        cur.execute("""
            SELECT id, control_id, source_citation->>'source' as src
            FROM compliance.canonical_controls
            WHERE generation_metadata->>'source_regulation' = %s
              AND source_citation->>'source' <> %s
              AND release_state NOT IN ('duplicate', 'too_close')
        """, (reg_code, expected_source))
        mismatches = cur.fetchall()
        if mismatches:
            print(f"\n {reg_code} — {len(mismatches)} Mismatches:")
            for ctrl_id_uuid, control_id, actual_source in mismatches:
                correct_reg = SOURCE_TO_REG.get(actual_source)
                if correct_reg:
                    print(f" {control_id} | {actual_source} → reg={correct_reg}")
                    if not DRY_RUN:
                        cur.execute("""
                            UPDATE compliance.canonical_controls
                            SET generation_metadata = jsonb_set(
                                generation_metadata, '{source_regulation}', %s::jsonb
                            )
                            WHERE id = %s
                        """, (json.dumps(correct_reg), ctrl_id_uuid))
                    # Counted in dry runs too, so the "[DRY RUN] Would fix"
                    # figure matches what a real run would change.
                    total_fixed += 1
                else:
                    # Non-OWASP source under an OWASP regulation code;
                    # needs manual triage rather than an automatic move.
                    print(f" {control_id} | {actual_source} → no mapping found")
    if DRY_RUN:
        print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
    else:
        print(f"\n Fixed: {total_fixed} misattributions")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 5: Hard delete duplicate/too_close controls
# ══════════════════════════════════════════════════════════════════
if should_run(5):
    print("=" * 70)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print("=" * 70)
    # Verify no FK references
    # Safety gate: abort the whole script if any referencing table still
    # points at a control about to be deleted.
    for table, col in [
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    ]:
        # Table/column names come from the hardcoded list above, so the
        # f-string interpolation into SQL is safe here.
        cur.execute(f"""
            SELECT count(*)
            FROM compliance.{table} t
            JOIN compliance.canonical_controls cc ON cc.id = t.{col}
            WHERE cc.release_state IN ('duplicate', 'too_close')
        """)
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            print(f" ABORTING Step 5 — clean FK refs first!")
            sys.exit(1)
        else:
            print(f" {table}.{col}: 0 refs ✓")
    # Check self-references
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls child
        JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
        WHERE parent.release_state IN ('duplicate', 'too_close')
    """)
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
        print(f" ABORTING Step 5!")
        sys.exit(1)
    print(f" Self-references: 0 ✓")
    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        WHERE release_state IN ('duplicate', 'too_close')
        GROUP BY 1
    """)
    to_delete = {}
    for state, cnt in cur.fetchall():
        to_delete[state] = cnt
        print(f"\n {state}: {cnt}")
    total = sum(to_delete.values())
    print(f"\n TOTAL to delete: {total}")
    if DRY_RUN:
        print(f" [DRY RUN] Would delete {total} controls")
    else:
        cur.execute("""
            DELETE FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
        """)
        print(f" Deleted: {cur.rowcount} controls")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 6: Clean up canonical_processed_chunks generated_control_ids
# ══════════════════════════════════════════════════════════════════
if should_run(6):
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)
    if DRY_RUN and should_run(5):
        # A dry run never deletes in Step 5, so pruning against the current
        # control set would misreport; skip unless Step 6 runs alone.
        print(" [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        # Find chunks that reference non-existent controls
        cur.execute("""
            SELECT id, generated_control_ids
            FROM compliance.canonical_processed_chunks
            WHERE generated_control_ids IS NOT NULL
              AND generated_control_ids <> '[]'::jsonb
        """)
        chunks = cur.fetchall()
        print(f" Chunks with generated_control_ids: {len(chunks)}")
        # Get all existing control IDs
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = set(r[0] for r in cur.fetchall())
        print(f" Existing controls: {len(existing_ids)}")
        cleaned = 0
        for chunk_id, control_ids in chunks:
            # jsonb may arrive as a str or an already-decoded list —
            # presumably depending on psycopg2 adapter config; handle both.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if isinstance(control_ids, list):
                valid_ids = [cid for cid in control_ids if cid in existing_ids]
                if len(valid_ids) < len(control_ids):
                    # NOTE(review): 'removed' is computed but never reported
                    # — candidate for cleanup or for per-chunk logging.
                    removed = len(control_ids) - len(valid_ids)
                    cur.execute("""
                        UPDATE compliance.canonical_processed_chunks
                        SET generated_control_ids = %s::jsonb
                        WHERE id = %s
                    """, (json.dumps(valid_ids), chunk_id))
                    cleaned += 1
        print(f" Chunks cleaned: {cleaned}")
    print()
# ══════════════════════════════════════════════════════════════════
# Final summary
# ══════════════════════════════════════════════════════════════════
# Single commit at the end: all steps persist together or not at all
# (sys.exit(1) in Step 5 leaves the transaction uncommitted).
if not DRY_RUN:
    conn.commit()
    print("=" * 70)
    print("COMMITTED. Final state:")
    print("=" * 70)
else:
    print("=" * 70)
    print("[DRY RUN] No changes committed. Current state:")
    print("=" * 70)
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY 1
    ORDER BY count(*) DESC
""")
total = 0
active = 0
for state, cnt in cur.fetchall():
    total += cnt
    if state not in ('duplicate', 'too_close'):
        active += cnt
    print(f" {state:15s}: {cnt:5d}")
print(f"\n TOTAL: {total}")
print(f" AKTIV: {active}")
conn.close()

View File

@@ -0,0 +1,655 @@
#!/usr/bin/env python3
"""
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
Reads gap_analysis_results.json, extracts article text from PDFs,
calls Claude Sonnet to generate controls, inserts into DB.
Usage:
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
python3 phase74_generate_gap_controls.py # generate and insert
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
"""
import os
import sys
import json
import re
import time
import hashlib
import argparse
import psycopg2
import urllib.parse
import requests
from pathlib import Path
from collections import Counter
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES,
)
# ── Config ──────────────────────────────────────────────────────────
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model is overridable via env var; defaults to Claude Sonnet.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
PIPELINE_VERSION = 5
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
# PyMuPDF is optional; code paths that need it must check for None.
try:
    import fitz
except ImportError:
    fitz = None
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
# (human-readable source names as stored in source_citation → short codes).
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
# License info per regulation code (from REGULATION_LICENSE_MAP)
# rule 1 = original text may be reproduced verbatim; rule 2 = attribution
# license — presumably mirrors control_generator.py semantics; verify there.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
# Domain detection keywords
# Keyword hits over (lowercased) control text pick a domain code for the
# control_id prefix; see detect_domain() below.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
# ── Prompt (same as control_generator.py) ────────────────────────────
# NOTE: the prompt strings below are runtime payloads sent to the LLM (in
# German, matching the generated controls' language) — do not edit casually.
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
# Appended to the per-article prompt; defines applicability fields and the
# allowed industry / company-size / scope-signal vocabularies.
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
"Abfallwirtschaft", "Forschung"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
"processes_minors_data", "automated_decisions", "employee_monitoring",
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
# Closed category vocabulary the LLM must choose from.
CATEGORY_LIST = [
    "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
    "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
    "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
    "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
    "Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
    "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
    "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
def build_prompt(source_name, article_label, article_text, license_type):
    """Assemble the per-article German generation prompt for the LLM.

    source_name:   human-readable regulation/standard name for attribution.
    article_label: e.g. "Artikel 10" or "§ 42"; echoed to the model.
    article_text:  raw article text; truncated to 3000 chars to bound tokens.
    license_type:  license string telling the model verbatim use is allowed.
    Returns the complete user prompt (runtime payload — keep wording as is).
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF article extraction ───────────────────────────────────────────
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article from a PDF.

    pdf_file:      path handed to read_file() when full_text is not given.
    article_label: human label ("Artikel 10", "§ 42", or a NIST/OWASP
                   heading); for law types the first number in it selects
                   the article/paragraph.
    doc_type:      "eu_regulation", "de_law", "nist", or anything else
                   (generic heading search).
    Returns the article text capped at 3000 chars, or "" when not found.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""
    if doc_type == "eu_regulation":
        art_num_match = re.search(r'\d+', article_label)
        if not art_num_match:
            return ""
        num = int(art_num_match.group())
        # EU article headings sit on their own line: "\nArtikel <n>\n".
        pattern = rf'\nArtikel\s+{num}\s*\n'
        match = re.search(pattern, full_text)
        if not match:
            return ""
        start = match.start()
        next_pattern = rf'\nArtikel\s+{num + 1}\s*\n'
        next_match = re.search(next_pattern, full_text)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "de_law":
        para_match = re.search(r'\d+', article_label)
        if not para_match:
            return ""
        num = int(para_match.group())
        # BUGFIX: the pattern was rf'\\s+{num}\b', which the regex engine
        # reads as a literal backslash followed by "s+" — it can never match
        # German law text. Paragraphs are introduced by the section sign
        # ("§ 42"); \s* also tolerates the "§42" spelling.
        pattern = rf'§\s*{num}\b'
        match = re.search(pattern, full_text)
        if not match:
            return ""
        start = match.start()
        next_pattern = rf'§\s*{num + 1}\b'
        next_match = re.search(next_pattern, full_text)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "nist":
        # NIST control IDs/headings are matched at the beginning of a line.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
    else:
        # Generic / OWASP / ENISA
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
# ── Anthropic API ────────────────────────────────────────────────────
def call_anthropic(prompt, system_prompt):
    """Call Anthropic API. Returns (parsed_data, raw_text, usage, error)."""
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        response = requests.post(ANTHROPIC_URL, headers=request_headers, json=request_body, timeout=120)
        if response.status_code != 200:
            return None, "", {}, f"HTTP {response.status_code}: {response.text[:200]}"
        body = response.json()
        # The reply text lives in the first content block, if any.
        raw_text = body["content"][0]["text"] if body.get("content") else ""
        token_usage = body.get("usage", {})
        return parse_json(raw_text), raw_text, token_usage, None
    except Exception as exc:
        # Network failures, JSON decode errors etc. are reported as a
        # string in the error slot; no exception escapes to the caller.
        return None, "", {}, str(exc)
def parse_json(text):
    """Extract a JSON object from an LLM reply, tolerating ``` fences.

    Returns a dict (the first element when the reply is a JSON array),
    or None when nothing parseable is found.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line, and the closing one when present.
        fence_lines = cleaned.split("\n")
        body = fence_lines[1:]
        if fence_lines[-1].strip().startswith("```"):
            body = fence_lines[1:-1]
        cleaned = "\n".join(body).strip()
    try:
        result = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the widest {...} span embedded in surrounding prose.
        embedded = re.search(r'\{[\s\S]*\}', cleaned)
        if embedded:
            try:
                return json.loads(embedded.group())
            except json.JSONDecodeError:
                return None
        return None
    if isinstance(result, list):
        return result[0] if result else None
    return result
# ── Domain detection ─────────────────────────────────────────────────
def detect_domain(text):
    """Pick the domain code with the most keyword hits; 'SEC' as fallback.

    Ties keep the first-scoring domain in DOMAIN_KEYWORDS order, matching
    the previous max()-over-dict behavior.
    """
    haystack = text.lower()
    best_domain, best_score = "SEC", 0
    for code, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(kw in haystack for kw in keywords)
        if hits > best_score:
            best_domain, best_score = code, hits
    return best_domain
# ── Control ID generation ────────────────────────────────────────────
def generate_control_id(domain, cur):
    """Return the next free control_id for the given domain prefix.

    Queries MAX over the numeric suffix rather than sorting IDs as text,
    so e.g. COMP-1000 correctly outranks COMP-99. Falls back to
    '<PREFIX>-001' when no numbered control exists yet.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    highest = cur.fetchone()
    if highest is None or highest[0] is None:
        return f"{prefix}-001"
    return f"{prefix}-{highest[0] + 1}"
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """CLI entry point for Phase 7.4 gap filling.

    Reads gap_analysis_results.json and, for every gap article whose
    source PDF is mapped: extracts the article text, asks the Anthropic
    API to draft a control, and inserts it into
    compliance.canonical_controls. Supports --dry-run, --source substring
    filtering and --resume (skip articles that already have controls).
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load gap results (list of {"source": ..., "gap_articles": [...]})
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")
    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")
    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)

    def connect_db():
        """Create DB connection with TCP keepalive; returns (conn, cursor)."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()

    conn, cur = connect_db()

    def ensure_db():
        """Reconnect if connection is dead. Returns True when a reconnect happened."""
        nonlocal conn, cur
        try:
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False

    # Get framework UUID — every generated control is attached to bp_security_v1.
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]
    # If resuming, load existing (source, article) pairs so they can be skipped.
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")
    # Stats / accounting
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()
    # Pre-read PDFs (cache full text per source) — biggest sources first.
    pdf_cache = {}
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)
        if not filename:
            # No PDF mapped for this source — count every article as skipped.
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        print(f"\n{'='*70}")
        print(f"{source_name}{len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")
        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]
            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue
            # Skip non-substantive NIST sections (intro chapters 1-3)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue
            # Extract article text; anything under 30 chars is not worth a control.
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue
            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue
            # Call Anthropic — errors back off 5s before the next article.
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)
            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)
            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                time.sleep(5)
                continue
            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue
            # Ensure DB is alive before writing (API calls can outlast idle timeouts)
            ensure_db()
            # Build control from the LLM output, coercing every field defensively.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                domain = detect_domain(article_text)
            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"
            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }
            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }
            category = str(data.get("category", "")) or None
            # Insert into DB; ON CONFLICT keeps reruns idempotent per (framework, control_id).
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # NOTE(review): fetchone() after commit works because psycopg2
                # buffers the RETURNING result client-side at execute time.
                row = cur.fetchone()
                if row:
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    # RETURNING yields no row when ON CONFLICT suppressed the insert.
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")
            # Rate limit: ~0.5s between calls
            time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Cost estimate: $3 per 1M input tokens, $15 per 1M output tokens.
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000
    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()
    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs for downstream phases.
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")
    if errors:
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")
    conn.close()
# Script entry point.
if __name__ == "__main__":
    main()

218
scripts/qa/run_job.sh Executable file
View File

@@ -0,0 +1,218 @@
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────
# Robust job runner for QA scripts on Mac Mini
#
# Usage:
#   ./run_job.sh <script.py> [args...]   # start job
#   ./run_job.sh --status                # show running jobs
#   ./run_job.sh --kill <script.py>      # kill a running job
#   ./run_job.sh --log <script.py>       # tail log
#
# Features:
#   - Loads .env automatically (COMPLIANCE_DATABASE_URL → DATABASE_URL)
#   - PID-file prevents duplicate runs
#   - Unbuffered Python output
#   - Structured log files in /tmp/qa_jobs/
# ─────────────────────────────────────────────────────────────
set -euo pipefail

# Absolute paths: this script's directory and the repo root two levels up.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
# All PID and log files live in one scratch directory.
JOB_DIR="/tmp/qa_jobs"
mkdir -p "$JOB_DIR"
# ── Load .env ────────────────────────────────────────────────
# Export every variable from the project .env (if present), then fall
# back to COMPLIANCE_DATABASE_URL for DATABASE_URL when only the former
# is defined.
load_env() {
    local envfile="$PROJECT_DIR/.env"
    if [[ -f "$envfile" ]]; then
        # Export all vars from .env (set -a marks every assignment for export)
        set -a
        # shellcheck disable=SC1090
        source "$envfile"
        set +a
    fi
    # Map COMPLIANCE_DATABASE_URL → DATABASE_URL if needed
    if [[ -z "${DATABASE_URL:-}" && -n "${COMPLIANCE_DATABASE_URL:-}" ]]; then
        export DATABASE_URL="$COMPLIANCE_DATABASE_URL"
    fi
}
# ── Job name from script path ─────────────────────────────────
# A job is identified by the script's basename without the .py suffix;
# its PID and log files are derived from that name inside $JOB_DIR.
job_name() { basename "$1" .py; }
pid_file() { printf '%s/%s.pid\n' "$JOB_DIR" "$(job_name "$1")"; }
log_file() { printf '%s/%s.log\n' "$JOB_DIR" "$(job_name "$1")"; }
# ── Status ────────────────────────────────────────────────────
# List every job that has a PID file: running jobs get log statistics and
# the last log line; stale PID files (dead process) are removed.
show_status() {
    echo "═══════════════════════════════════════════════════════"
    echo "QA Job Status ($(date '+%Y-%m-%d %H:%M:%S'))"
    echo "═══════════════════════════════════════════════════════"
    local found=0
    for pidfile in "$JOB_DIR"/*.pid; do
        [[ -f "$pidfile" ]] || continue
        found=1
        local name
        name=$(basename "$pidfile" .pid)
        local pid
        pid=$(cat "$pidfile")
        local logf="$JOB_DIR/$name.log"
        if kill -0 "$pid" 2>/dev/null; then
            local lines errors last_line
            # macOS wc pads its output with spaces; arithmetic strips them.
            lines=$(wc -l < "$logf" 2>/dev/null) || lines=0
            lines=$(( ${lines:-0} ))
            # BUG FIX: `grep -c` prints "0" AND exits non-zero when there
            # are no matches, so the old `|| echo 0` produced a two-line
            # value ("0<newline>0"). Capture first, then default only when
            # the capture is empty (log file unreadable).
            errors=$(grep -c "ERROR" "$logf" 2>/dev/null) || true
            errors=${errors:-0}
            last_line=$(tail -1 "$logf" 2>/dev/null || echo "(empty)")
            echo "$name (PID $pid) — RUNNING"
            echo " Log: $logf ($lines lines, $errors errors)"
            echo " Last: $last_line"
        else
            echo "$name (PID $pid) — STOPPED"
            echo " Log: $logf"
            rm -f "$pidfile"
        fi
        echo ""
    done
    if [[ $found -eq 0 ]]; then
        echo " No jobs running."
    fi
}
# ── Kill ──────────────────────────────────────────────────────
# Terminate a running job by script name; the PID file is removed
# whether or not the process was still alive.
kill_job() {
    local script="$1" pf pid
    pf=$(pid_file "$script")
    if [[ ! -f "$pf" ]]; then
        echo "No PID file for $(job_name "$script")"
        return 1
    fi
    pid=$(cat "$pf")
    if kill -0 "$pid" 2>/dev/null; then
        kill "$pid"
        echo "Killed $(job_name "$script") (PID $pid)"
    else
        echo "Process $pid already stopped"
    fi
    rm -f "$pf"
}
# ── Tail log ──────────────────────────────────────────────────
# Print the last 50 lines of a job's log, or fail if none exists.
tail_log() {
    local script="$1" lf
    lf=$(log_file "$script")
    if [[ ! -f "$lf" ]]; then
        echo "No log file: $lf"
        return 1
    fi
    tail -50 "$lf"
}
# ── Start job ─────────────────────────────────────────────────
# Launch a QA script in the background with nohup, write a PID file,
# and verify the process survives its first few seconds.
start_job() {
    local script="$1"
    shift
    local args=("$@")
    # Resolve script path: as given, else relative to this directory.
    local script_path="$script"
    if [[ ! -f "$script_path" ]]; then
        script_path="$SCRIPT_DIR/$script"
    fi
    if [[ ! -f "$script_path" ]]; then
        echo "ERROR: Script not found: $script"
        return 1
    fi
    local name
    name=$(job_name "$script")
    local pf
    pf=$(pid_file "$script")
    local lf
    lf=$(log_file "$script")
    # Refuse to start a second instance of the same job; clear stale PID files.
    if [[ -f "$pf" ]]; then
        local existing_pid
        existing_pid=$(cat "$pf")
        if kill -0 "$existing_pid" 2>/dev/null; then
            echo "ERROR: $name already running (PID $existing_pid)"
            echo "Use: $0 --kill $script"
            return 1
        fi
        rm -f "$pf"
    fi
    # Load environment (.env → DATABASE_URL)
    load_env
    # Verify required env vars
    if [[ -z "${DATABASE_URL:-}" ]]; then
        echo "ERROR: DATABASE_URL not set (checked .env)"
        return 1
    fi
    # Start
    echo "Starting $name..."
    echo " Script: $script_path"
    echo " Args: ${args[*]:-none}"
    echo " Log: $lf"
    # BUG FIX: under `set -u` on bash 3.2 (the macOS default shell this
    # runner targets), expanding an empty array as "${args[@]}" aborts
    # with "unbound variable". ${args[@]+"${args[@]}"} expands to nothing
    # when the array is empty and to the quoted elements otherwise.
    nohup python3 -u "$script_path" ${args[@]+"${args[@]}"} > "$lf" 2>&1 &
    local pid=$!
    echo "$pid" > "$pf"
    echo " PID: $pid"
    echo ""
    # Wait a moment and check it started OK
    sleep 3
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "ERROR: Process died immediately. Log output:"
        cat "$lf"
        rm -f "$pf"
        return 1
    fi
    local lines
    lines=$(wc -l < "$lf" 2>/dev/null) || lines=0
    echo "Running OK ($lines log lines so far)"
    echo "Monitor with: $0 --status"
    echo "Tail log: $0 --log $script"
}
# ── Main ──────────────────────────────────────────────────────
# Dispatch on the first argument; anything that is not a recognized flag
# is treated as a script to start (extra args are passed through).
case "${1:-}" in
    --status|-s)
        show_status
        ;;
    --kill|-k)
        # Requires the script name as the second argument.
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --kill <script.py>"; exit 1; }
        kill_job "$2"
        ;;
    --log|-l)
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --log <script.py>"; exit 1; }
        tail_log "$2"
        ;;
    --help|-h|"")
        echo "Usage:"
        echo " $0 <script.py> [args...] Start a QA job"
        echo " $0 --status Show running jobs"
        echo " $0 --kill <script.py> Kill a running job"
        echo " $0 --log <script.py> Tail job log"
        ;;
    *)
        start_job "$@"
        ;;
esac

307
scripts/qa/sync_db.py Normal file
View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""Sync canonical control tables between production and local DB.
Modes:
--pull Production → Local (initial sync, full table copy)
--push Local → Production (incremental, only new obligation_candidates)
--loop Run --push every N minutes (default 60)
Usage:
python3 sync_db.py --pull # Full sync production → local
python3 sync_db.py --push # Push new obligations to production
python3 sync_db.py --loop 60 # Push every 60 minutes
python3 sync_db.py --pull --tables canonical_controls # Only one table
"""
import argparse
import json
import os
import sys
import time
import urllib.parse
import io
import psycopg2
import psycopg2.extras
import psycopg2.extensions
# Register JSON adapter so dicts are automatically converted to JSONB
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
# ── DB Config ────────────────────────────────────────────────────────
# NOTE(review): a live production password is hard-coded below as the
# fallback default. This leaks the credential to anyone with repository
# access — rotate it and make PROD_DATABASE_URL mandatory via the
# environment instead of shipping a default.
PROD_URL = os.environ.get(
    "PROD_DATABASE_URL",
    "postgresql://postgres:GmyFD3wnU1NrKBdpU1nwLdE8MLts0A0eez8L5XXdvUCe05lWnWfVp3C6JJ8Yrmt2"
    "@46.225.100.82:54321/postgres?sslmode=require",
)
LOCAL_URL = os.environ.get(
    "LOCAL_DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
)
# All synced objects live in this PostgreSQL schema.
SCHEMA = "compliance"
# Tables to sync (production → local) with --pull; each is dropped and
# recreated locally as a plain data copy.
SYNC_TABLES = [
    "canonical_control_frameworks",
    "canonical_control_licenses",
    "canonical_control_sources",
    "canonical_control_categories",
    "canonical_blocked_sources",
    "canonical_controls",
    "canonical_control_mappings",
    "canonical_processed_chunks",
    "canonical_generation_jobs",
    "control_patterns",
    "crosswalk_matrix",
    "obligation_extractions",
    "obligation_candidates",
]
def connect(url, label="DB"):
    """Open a psycopg2 connection from a URL.

    Enables TCP keepalives (so long idle syncs survive NAT timeouts),
    pins search_path to the compliance schema, honours an sslmode query
    parameter, and leaves autocommit off.
    """
    parts = urllib.parse.urlparse(url)
    query = dict(urllib.parse.parse_qsl(parts.query))
    connection = psycopg2.connect(
        host=parts.hostname,
        port=parts.port or 5432,
        user=parts.username,
        password=parts.password,
        dbname=parts.path.lstrip("/"),
        sslmode=query.get("sslmode", "prefer"),
        options=f"-c search_path={SCHEMA},public",
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5,
    )
    connection.autocommit = False
    print(f" Connected to {label} ({parts.hostname}:{parts.port or 5432})")
    return connection
def get_columns(cur, table):
    """Return the table's column names in ordinal order.

    Uses bind parameters instead of f-string interpolation so the schema
    and table names can never break (or inject into) the SQL.
    """
    cur.execute(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    return [r[0] for r in cur.fetchall()]
def pull_table(prod_conn, local_conn, table):
    """Copy one table wholesale from production to local.

    The local table is dropped and recreated as a bare data copy (no
    constraints, defaults or indexes — it exists only for local reads),
    then filled via execute_batch. Returns the number of rows copied.
    """
    prod_cur = prod_conn.cursor()
    local_cur = local_conn.cursor()
    # Check table exists on production
    prod_cur.execute(f"""
        SELECT 1 FROM pg_tables
        WHERE schemaname = '{SCHEMA}' AND tablename = '{table}'
    """)
    if not prod_cur.fetchone():
        print(f" SKIP {table} — not found on production")
        return 0
    # Drop local table
    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
    local_conn.commit()
    # Build simple CREATE TABLE (no constraints, no defaults — just for data)
    prod_cur.execute(f"""
        SELECT column_name, data_type, udt_name, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = '{SCHEMA}' AND table_name = '{table}'
        ORDER BY ordinal_position
    """)
    col_defs = prod_cur.fetchall()
    parts = []
    col_names = []
    jsonb_cols = set()
    for name, dtype, udt, max_len in col_defs:
        col_names.append(name)
        if dtype == "ARRAY":
            # information_schema reports array columns with udt "_<element>".
            type_map = {
                "_text": "text[]", "_varchar": "varchar[]",
                "_int4": "integer[]", "_uuid": "uuid[]",
                "_jsonb": "jsonb[]", "_float8": "float8[]",
            }
            sql_type = type_map.get(udt, f"{udt.lstrip('_')}[]")
        elif dtype == "USER-DEFINED" and udt == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif dtype == "USER-DEFINED":
            sql_type = udt
        elif dtype == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif max_len:
            # BUG FIX: the length qualifier must come from the column's own
            # character_maximum_length — the previous hard-coded "(70,526)"
            # produced invalid DDL like "character varying(70,526)".
            sql_type = f"{dtype}({max_len})"
        else:
            sql_type = dtype
        parts.append(f'"{name}" {sql_type}')
    ddl = f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})"
    local_cur.execute(ddl)
    local_conn.commit()
    # Fetch all rows from production
    col_list = ", ".join(f'"{c}"' for c in col_names)
    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
    rows = prod_cur.fetchall()
    if rows:
        # Wrap dict/list values in Json for JSONB columns so psycopg2 can
        # adapt them on insert.
        adapted_rows = []
        for row in rows:
            adapted = []
            for i, val in enumerate(row):
                if col_names[i] in jsonb_cols and isinstance(val, (dict, list)):
                    adapted.append(psycopg2.extras.Json(val))
                else:
                    adapted.append(val)
            adapted_rows.append(tuple(adapted))
        placeholders = ", ".join(["%s"] * len(col_names))
        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
        local_conn.commit()
    print(f" {table}: {len(rows)} rows")
    return len(rows)
def pull(tables=None):
    """Full sync from production to local, one table at a time.

    Per-table failures are reported and rolled back, then the loop
    continues with the next table.
    """
    print("\n=== PULL: Production → Local ===\n")
    prod_conn = connect(PROD_URL, "Production")
    local_conn = connect(LOCAL_URL, "Local")
    # Make sure the target schema exists before copying anything into it.
    schema_cur = local_conn.cursor()
    schema_cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    local_conn.commit()
    table_list = tables if tables else SYNC_TABLES
    copied = 0
    for table in table_list:
        try:
            copied += pull_table(prod_conn, local_conn, table)
        except Exception as exc:
            print(f" ERROR {table}: {exc}")
            local_conn.rollback()
            prod_conn.rollback()
    print(f"\n Total: {copied} rows synced")
    prod_conn.close()
    local_conn.close()
def push():
    """Incremental push: copy obligation_candidates that exist locally but
    not yet on production.

    candidate_id is used as the natural key; inserts are protected by
    ON CONFLICT DO NOTHING. Returns the number of rows pushed.
    """
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")
    local_conn = connect(LOCAL_URL, "Local")
    prod_conn = connect(PROD_URL, "Production")
    local_cur = local_conn.cursor()
    prod_cur = prod_conn.cursor()
    # Compare the candidate_id sets on both sides.
    local_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    local_ids = {r[0] for r in local_cur.fetchall()}
    if not local_ids:
        print(" No obligation_candidates in local DB")
        local_conn.close()
        prod_conn.close()
        return 0
    # Check which already exist on production
    prod_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    prod_ids = {r[0] for r in prod_cur.fetchall()}
    new_ids = local_ids - prod_ids
    if not new_ids:
        print(f" All {len(local_ids)} obligations already on production")
        local_conn.close()
        prod_conn.close()
        return 0
    print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")
    # Get columns
    columns = get_columns(local_cur, "obligation_candidates")
    col_list = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))
    # HARDENING: fetch the new rows with a bound ANY(%s) array instead of
    # interpolating quoted IDs into the SQL string — an ID containing a
    # quote can no longer break (or inject into) the query, and the
    # statement stays small regardless of how many IDs there are.
    local_cur.execute(
        f"""
        SELECT {col_list} FROM {SCHEMA}.obligation_candidates
        WHERE candidate_id = ANY(%s)
        """,
        (list(new_ids),),
    )
    rows = local_cur.fetchall()
    # Insert into production
    insert_sql = f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
    psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
    prod_conn.commit()
    print(f" Pushed {len(rows)} obligations to production")
    local_conn.close()
    prod_conn.close()
    return len(rows)
def loop(interval_min):
    """Run push() forever, once every interval_min minutes.

    Push errors are printed and swallowed so a transient network failure
    does not kill the loop; stop with Ctrl+C.
    """
    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
    print(f" Started at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Press Ctrl+C to stop\n")
    while True:
        try:
            if push():
                print(f" Next sync in {interval_min} min...")
        except Exception as exc:
            print(f" SYNC ERROR: {exc}")
        time.sleep(interval_min * 60)
def main():
    """CLI entry point: dispatch --pull / --push / --loop (not mutually
    exclusive; they run in that order when combined)."""
    parser = argparse.ArgumentParser(description="Sync canonical control tables")
    parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)")
    parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)")
    parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes")
    parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)")
    args = parser.parse_args()
    if not (args.pull or args.push or args.loop):
        parser.print_help()
        return
    if args.pull:
        pull(args.tables)
    if args.push:
        push()
    if args.loop:
        loop(args.loop)
# Script entry point.
if __name__ == "__main__":
    main()

470
scripts/qa/test_pass0a.py Normal file
View File

@@ -0,0 +1,470 @@
#!/usr/bin/env python3
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.
Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
Copies prompts and quality gate from decomposition_pass.py.
Usage:
python3 test_pass0a.py # 10 controls, Anthropic
python3 test_pass0a.py --limit 5 # 5 controls
python3 test_pass0a.py --source "DSGVO" # filter by source
python3 test_pass0a.py --dry-run # show controls, no LLM call
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.parse
import psycopg2
import requests
# ── Config ────────────────────────────────────────────────────────────
# API key must come from the environment; the model can be overridden
# via DECOMPOSITION_LLM_MODEL.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
# ── Prompts (from decomposition_pass.py) ──────────────────────────────
# NOTE: the prompt is intentionally German — the corpus and expected
# obligations are German-language. It is runtime text sent to the model;
# do not translate or reformat it.
SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.
REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.
Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Build the Pass-0a user prompt for one control.

    All arguments are pre-formatted display strings (see fmt_json). The
    answer format is pinned by an inline JSON-array example; the doubled
    braces are f-string escapes for literal JSON braces.
    """
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.
CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}
Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""
# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
# Tier 1: Pflicht (mandatory) — hard normative phrasings in German and
# English: "müssen", "ist/sind zu <verb>en", separable zu-infinitives
# ("zuteilen", "zuführen", ...), and shall/must/required.
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)
# Tier 2: Empfehlung (recommendation) — soft normative verbs
# ("soll/sollte", should/ensure/recommend) and common compliance verbs.
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)
# Tier 3: Kann (optional/permissive) — kann/darf/may/optional.
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)
# Union of all three tiers (backward compat: "any normative signal").
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)
# Markers that a sentence is justification/recital text, not an obligation.
_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)
# Markers for test/audit obligations (is_test_obligation).
_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)
# Markers for notification/reporting obligations (is_reporting_obligation).
_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)
def classify_obligation_type(txt):
    """Map text to its strongest normative tier.

    Priority order: pflicht > empfehlung > kann; defaults to
    "empfehlung" when no tier matches.
    """
    tiers = (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    )
    for pattern, tier in tiers:
        if pattern.search(txt):
            return tier
    return "empfehlung"
def quality_gate(obl_text, parent_uuid):
    """Validate and classify a single extracted obligation.

    Returns (flags, passed, confidence, obligation_type). Only the
    critical checks (not evidence-only, minimum length, parent link)
    gate `passed`; the normative-signal check is purely informational.
    Confidence is a weighted sum over the passing checks, with a small
    bonus for a "pflicht" classification, capped at 1.0.
    """
    flags = {}
    # 1. Normative signal (informational)
    flags["has_normative_signal"] = _NORMATIVE_RE.search(obl_text) is not None
    # 1b. Obligation type classification
    obl_type = classify_obligation_type(obl_text)
    flags["obligation_type"] = obl_type
    # 2. Single action — flag text that chains multiple obligations with
    # und/sowie/als auch followed by another normative verb.
    multi_verb_re = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    flags["single_action"] = multi_verb_re.search(obl_text) is None
    # 3. Not rationale — normative markers must at least match rationale markers.
    flags["not_rationale"] = len(_NORMATIVE_RE.findall(obl_text)) >= len(_RATIONALE_RE.findall(obl_text))
    # 4. Not evidence-only — reject text that starts like an evidence artifact.
    evidence_only_re = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )
    flags["not_evidence_only"] = evidence_only_re.match(obl_text.strip()) is None
    # 5. Min length
    flags["min_length"] = len(obl_text.strip()) >= 20
    # 6. Parent link
    flags["has_parent_link"] = bool(parent_uuid)
    # Weighted confidence over the passing boolean checks.
    weights = {
        "has_normative_signal": 0.25, "single_action": 0.20,
        "not_rationale": 0.20, "not_evidence_only": 0.15,
        "min_length": 0.10, "has_parent_link": 0.05,
    }
    confidence = sum(weight for key, weight in weights.items() if flags.get(key))
    # Bonus for pflicht classification
    if obl_type == "pflicht":
        confidence = min(confidence + 0.05, 1.0)
    # Pass check — has_normative_signal is NO LONGER critical
    critical = ("not_evidence_only", "min_length", "has_parent_link")
    passed = all(flags.get(key, False) for key in critical)
    return flags, passed, confidence, obl_type
# ── JSON parsing ──────────────────────────────────────────────────────
def parse_json_array(text):
    """Parse an LLM response into a list.

    Accepts a bare JSON array, a single JSON object (wrapped into a
    one-element list), or falls back to the first [...] span found in
    the text. Returns [] when nothing usable parses.
    """
    try:
        value = json.loads(text)
    except json.JSONDecodeError:
        value = None
    if isinstance(value, list):
        return value
    if isinstance(value, dict):
        return [value]
    bracket = re.search(r"\[[\s\S]*\]", text)
    if bracket is not None:
        try:
            fallback = json.loads(bracket.group())
        except json.JSONDecodeError:
            return []
        if isinstance(fallback, list):
            return fallback
    return []
# ── API call ──────────────────────────────────────────────────────────
def call_anthropic(prompt):
    """POST one prompt to the Anthropic Messages API.

    The system prompt is sent with ephemeral cache_control so repeated
    calls reuse the prompt cache. Returns (text, usage, error); error is
    None on success, and text is None on an HTTP error.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post(f"{ANTHROPIC_API_URL}/messages", headers=request_headers, json=body, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    blocks = data.get("content", [])
    reply = blocks[0].get("text", "") if blocks else ""
    return reply, data.get("usage", {}), None
# ── Format helpers ────────────────────────────────────────────────────
def fmt_json(val):
    """Render a JSONB value as display text.

    None becomes "", JSON strings are decoded first (returned verbatim if
    they do not parse), lists become one bulleted line per item, and
    anything else is str()-ed.
    """
    if val is None:
        return ""
    if isinstance(val, str):
        try:
            val = json.loads(val)
        except (json.JSONDecodeError, TypeError):
            return val
    if not isinstance(val, list):
        return str(val)
    return "\n".join(f" - {item}" for item in val)
# ── Main ──────────────────────────────────────────────────────────────
def main():
    """Pass 0a sample run: pull a diverse sample of canonical controls,
    ask the LLM to extract normative obligations from each, classify them
    (pflicht / empfehlung / kann) via quality_gate, print per-obligation
    results plus a summary with a cost projection, and dump everything to
    a JSON file under /tmp for later analysis.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    # The API key is only needed when we actually call the model.
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # DATABASE_URL is required — a KeyError here is an intentional hard fail.
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        # Resolve unqualified table names against the compliance schema first.
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()
    # Select diverse sample
    # Root controls only (no parent link), still active, with enough text
    # (>100 chars of objective+requirements) to be worth decomposing.
    query = """
        SELECT id, control_id, title, objective, requirements,
               test_procedure, source_citation, category
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
          AND parent_control_uuid IS NULL
          AND title IS NOT NULL AND objective IS NOT NULL
          AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
    """
    params = []
    if args.source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{args.source}%")
    # Group by source, randomized within each source, for sample diversity.
    query += " ORDER BY source_citation->>'source', random()"
    # args.limit is an int (argparse type=int), so direct interpolation is safe.
    query += f" LIMIT {args.limit}"
    cur.execute(query, params)
    controls = cur.fetchall()
    if not controls:
        print("No controls found.")
        return
    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")
    # Aggregates across all controls / obligations.
    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    all_results = []
    t_start = time.time()
    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row
        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            # source_citation may arrive as a dict (jsonb) or a JSON string.
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"
        print(f"\n{''*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f" Source: {source_str} | Category: {category or 'N/A'}")
        print(f" Objective: {(objective or '')[:200]}")
        # --dry-run: show the sample without spending tokens.
        if args.dry_run:
            print(" [DRY RUN]")
            continue
        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)
        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok
        obligations = parse_json_array(response_text)
        total_obls += len(obligations)
        print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{f' ({cached} cached)' if cached else ''}"
              f" | {len(obligations)} obligation(s)")
        for j, obl in enumerate(obligations, 1):
            obl_text = obl.get("obligation_text", "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))
            # Auto-detect
            # Regex backstop in case the model missed the test/reporting flags.
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True
            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1
            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"
            # Show type instead of PASS/REJECT
            type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
            if not passed:
                status = "REJECT"
            else:
                status = type_label.get(obl_type, "EMPFEHLUNG")
            # Names of the boolean gate checks that failed (shown on REJECT).
            failed = [k for k, v in flags.items()
                      if isinstance(v, bool) and not v]
            print(f"\n {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f" {obl_text}")
            print(f" Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f" Bedingung: {condition}")
            if not passed:
                print(f" Abgelehnt: {', '.join(failed)}")
            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": {k: v for k, v in flags.items()},
            })
        # Small pause between controls to stay under API rate limits.
        time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    # Pricing assumption: $3 / 1M input tokens, $15 / 1M output tokens.
    cost = (total_in * 3 + total_out * 15) / 1_000_000
    total_classified = sum(type_counts.values())
    print(f"\n\n{'='*70}")
    print(f"ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f" Controls: {len(controls)}")
    print(f" Obligations: {total_obls} ({total_obls/max(len(controls),1):.1f} pro Control)")
    print(f" ── Klassifizierung ──")
    print(f" Pflicht: {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
    print(f" Empfehlung: {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
    print(f" Kann: {type_counts['kann']}"
          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
    print(f" Rejected: {total_rejected}"
          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
          f" (nur evidence-only/zu kurz/kein parent)")
    print(f" ── Kosten ──")
    print(f" Laufzeit: {elapsed_total:.1f}s")
    print(f" Tokens: {total_in:,} in / {total_out:,} out")
    print(f" Kosten: ${cost:.4f}")
    # Linear extrapolation from this sample to a full corpus of 6000 controls.
    if len(controls) > 0 and not args.dry_run and total_obls > 0:
        n = 6000
        factor = n / len(controls)
        print(f"\n --- Hochrechnung auf {n:,} Controls ---")
        print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
        print(f" Kosten: ${cost * factor:.2f}")
        print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
        print(f" Obligations: ~{int(total_obls / len(controls) * n):,}")
        pf = int(type_counts['pflicht'] * factor)
        ef = int(type_counts['empfehlung'] * factor)
        kf = int(type_counts['kann'] * factor)
        print(f" Pflicht: ~{pf:,}")
        print(f" Empfehlung: ~{ef:,}")
        print(f" Kann: ~{kf:,}")
    # Save results JSON for later analysis
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        with open(out_path, "w") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n Ergebnisse gespeichert: {out_path}")
    conn.close()
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,308 @@
#!/usr/bin/env python3
"""Preview Pass 0b: Turn obligation candidates into atomic controls.
Picks a few obligations from Pass 0a results, calls LLM to compose
atomic controls, and writes them to canonical_controls with parent_control_uuid.
Usage:
python3 test_pass0b_preview.py --input /tmp/pass0a_results_60controls.json --limit 3
"""
import argparse
import json
import os
import re
import sys
import time
import uuid
import urllib.parse
import psycopg2
import psycopg2.extras
import requests
# Register JSON adapter
# Lets plain Python dicts bind directly as JSON/JSONB query parameters.
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
# API key comes from the environment; the model can be overridden per run.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.
Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
Antworte NUR als JSON. Keine Erklärungen."""
def build_pass0b_prompt(obl_text, action, obj, parent_title, category, source_ref):
    """Compose the Pass-0b user prompt: turn one normative obligation
    (plus context from its originating control) into a single atomic
    control, answered as a German JSON object.
    """
    prompt = f"""\
Erstelle aus der folgenden Pflicht ein atomares Control.
PFLICHT: {obl_text}
HANDLUNG: {action}
GEGENSTAND: {obj}
KONTEXT (Ursprungs-Control):
Titel: {parent_title}
Kategorie: {category}
Quellreferenz: {source_ref}
Antworte als JSON:
{{
"title": "Kurzer Titel (max 80 Zeichen, deutsch)",
"objective": "Was muss erreicht werden? (1-2 Sätze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
"test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
"evidence": ["Nachweis 1", "Nachweis 2"],
"severity": "critical|high|medium|low",
"category": "security|privacy|governance|operations|finance|reporting"
}}"""
    return prompt
def call_anthropic(prompt):
    """Send a single-turn user prompt to the Anthropic Messages API.

    The system prompt carries an ephemeral cache_control marker so the
    shared prefix can be served from the prompt cache on repeated calls.

    Returns a (text, usage, error) triple: on HTTP failure text is None
    and error holds "HTTP <status>: <snippet>"; on success error is None.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post("https://api.anthropic.com/v1/messages", headers=headers, json=payload, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    # Fix: the previous pattern data.get("content", [{}])[0] raised
    # IndexError when the API returned "content": [] — the [{}] default
    # only applies when the key is missing, not when the list is empty.
    content = data.get("content") or [{}]
    text = content[0].get("text", "")
    return text, data.get("usage", {}), None
def parse_json_object(text):
    """Parse a JSON object from LLM output, tolerating surrounding prose.

    Returns a dict, or None when no JSON *object* can be extracted.
    Fix: the previous version returned whatever json.loads produced, so a
    top-level JSON array or scalar leaked through and the caller crashed
    on result.get(...). Non-dict parses now fall through to the regex
    fallback and ultimately to None.
    """
    try:
        result = json.loads(text)
        if isinstance(result, dict):
            return result
    except json.JSONDecodeError:
        pass
    # Fallback: greedy span from the first '{' to the last '}'.
    match = re.search(r"\{[\s\S]*\}", text)
    if match:
        try:
            result = json.loads(match.group())
            if isinstance(result, dict):
                return result
        except json.JSONDecodeError:
            pass
    return None
def generate_control_id(domain, cur):
    """Allocate the next sequential control_id for a domain prefix.

    The prefix is the first four characters of `domain`, upper-cased.
    Queries the numeric maximum over existing "<PREFIX>-<n>" ids and
    returns "<PREFIX>-<n+1>"; returns "<PREFIX>-001" when none exist.
    Fix: the increment is now zero-padded to three digits (:03d) so new
    ids match the "<PREFIX>-001" convention instead of producing mixed
    formats like "DSGV-42" next to "DSGV-001".
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    if row and row[0] is not None:
        return f"{prefix}-{row[0] + 1:03d}"
    return f"{prefix}-001"
def main():
    """Pass 0b preview: load obligation candidates from a Pass-0a result
    file, compose one atomic control per picked obligation via the LLM,
    and insert each as a draft row in compliance.canonical_controls with
    parent_control_uuid pointing at the originating control.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="/tmp/pass0a_results_60controls.json")
    parser.add_argument("--limit", type=int, default=3, help="Number of obligations to process")
    parser.add_argument("--control", type=str, help="Pick obligations from this control_id")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load 0a results
    with open(args.input) as f:
        obligations = json.load(f)
    # Filter: only passed, pflicht or empfehlung
    # (note: only the `passed` flag is checked here; no type filter is applied)
    obligations = [o for o in obligations if o.get("passed", False)]
    if args.control:
        obligations = [o for o in obligations if o["control_id"] == args.control]
    # Pick diverse sample
    # First pass takes one obligation per distinct obligation_type; the
    # second pass tops the list up to --limit regardless of type.
    picked = []
    seen_types = set()
    for o in obligations:
        otype = o["obligation_type"]
        if otype not in seen_types and len(picked) < args.limit:
            picked.append(o)
            seen_types.add(otype)
    # Fill rest
    for o in obligations:
        if o not in picked and len(picked) < args.limit:
            picked.append(o)
    if not picked:
        print("No obligations found.")
        return
    # Connect to DB
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()
    # Get parent control info
    ctrl_ids = list(set(o["control_id"] for o in picked))
    cur.execute("""
        SELECT control_id, id, title, category, source_citation
        FROM compliance.canonical_controls
        WHERE control_id = ANY(%s)
    """, (ctrl_ids,))
    ctrl_map = {}
    for row in cur.fetchall():
        # source_citation may arrive as a dict (jsonb), a JSON string, or NULL.
        sc = row[4] if isinstance(row[4], dict) else (json.loads(row[4]) if row[4] else {})
        # Derive domain prefix from control_id (e.g. "DSGV" from "DSGV-001")
        prefix = row[0].split("-")[0] if "-" in row[0] else "COMP"
        ctrl_map[row[0]] = {
            "uuid": str(row[1]), "title": row[2], "category": row[3] or "",
            "source_ref": f"{sc.get('source', '')} {sc.get('article', '')}",
            "domain": prefix,
        }
    print("=" * 70)
    print(f"Pass 0b Preview — {len(picked)} Obligations → Atomic Controls")
    print("=" * 70)
    created = []
    for i, obl in enumerate(picked, 1):
        ctrl = ctrl_map.get(obl["control_id"], {})
        print(f"\n{''*70}")
        print(f"[{i}/{len(picked)}] {obl['control_id']}: [{obl['obligation_type'].upper()}]")
        print(f" Obligation: {obl['obligation_text'][:120]}")
        print(f" Parent: {ctrl.get('title', 'N/A')}")
        # --dry-run: show what would be processed without API/DB writes.
        if args.dry_run:
            print(" [DRY RUN]")
            continue
        prompt = build_pass0b_prompt(
            obl["obligation_text"], obl["action"], obl["object"],
            ctrl.get("title", ""), ctrl.get("category", ""),
            ctrl.get("source_ref", ""),
        )
        t0 = time.time()
        resp_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue
        result = parse_json_object(resp_text)
        if not result:
            print(f" PARSE ERROR: {resp_text[:200]}")
            continue
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        print(f" LLM: {elapsed:.1f}s | {in_tok} in / {out_tok} out")
        # Generate control_id
        domain = ctrl.get("domain", "COMP")
        new_control_id = generate_control_id(domain, cur)
        # Show result
        print(f"\n === ATOMIC CONTROL: {new_control_id} ===")
        print(f" Titel: {result.get('title', 'N/A')}")
        print(f" Ziel: {result.get('objective', 'N/A')}")
        print(f" Typ: {obl['obligation_type']}")
        reqs = result.get("requirements", [])
        if reqs:
            print(f" Anforderungen:")
            for r in reqs:
                print(f" - {r}")
        tests = result.get("test_procedure", [])
        if tests:
            print(f" Pruefverfahren:")
            for t in tests:
                print(f" - {t}")
        evidence = result.get("evidence", [])
        if evidence:
            print(f" Nachweise:")
            for e in evidence:
                print(f" - {e}")
        print(f" Severity: {result.get('severity', 'medium')}")
        print(f" Category: {result.get('category', 'governance')}")
        # Write to DB
        new_uuid = str(uuid.uuid4())
        parent_uuid = ctrl.get("uuid")
        source_cit = {}
        if ctrl.get("source_ref"):
            # source_ref was joined as "<source> <article>"; split it back apart.
            parts = ctrl["source_ref"].strip().split(" ", 1)
            source_cit = {"source": parts[0], "article": parts[1] if len(parts) > 1 else ""}
        # NOTE(review): framework_id is taken from an arbitrary framework row
        # (LIMIT 1 without ORDER BY) — confirm this is acceptable for previews.
        cur.execute("""
            INSERT INTO compliance.canonical_controls (
                id, control_id, title, objective, requirements, test_procedure,
                evidence, severity, category, release_state,
                source_citation, generation_metadata, generation_strategy,
                pipeline_version, parent_control_uuid, framework_id
            ) VALUES (
                %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s,
                %s, %s, %s,
                %s, %s,
                (SELECT id FROM compliance.canonical_control_frameworks LIMIT 1)
            )
        """, (
            new_uuid, new_control_id,
            result.get("title", ""),
            result.get("objective", ""),
            json.dumps(result.get("requirements", []), ensure_ascii=False),
            json.dumps(result.get("test_procedure", []), ensure_ascii=False),
            json.dumps(result.get("evidence", []), ensure_ascii=False),
            result.get("severity", "medium"),
            result.get("category", "governance"),
            "draft",
            psycopg2.extras.Json(source_cit),
            psycopg2.extras.Json({
                "obligation_type": obl["obligation_type"],
                "obligation_text": obl["obligation_text"],
                "pass0b_model": ANTHROPIC_MODEL,
                "decomposition_method": "pass0b_preview",
            }),
            "pass0b_atomic",
            6,  # pipeline_version
            parent_uuid,
        ))
        # Commit per row so generate_control_id sees the new MAX next iteration.
        conn.commit()
        created.append({
            "control_id": new_control_id,
            "title": result.get("title", ""),
            "obligation_type": obl["obligation_type"],
            "parent_control_id": obl["control_id"],
        })
        print(f" ✓ Geschrieben: {new_control_id} (parent: {obl['control_id']})")
        # Small pause between calls to stay under API rate limits.
        time.sleep(0.5)
    if created:
        print(f"\n{'='*70}")
        print(f"ERGEBNIS: {len(created)} atomare Controls erstellt")
        print(f"{'='*70}")
        for c in created:
            print(f" {c['control_id']}: {c['title']} [{c['obligation_type']}] (von {c['parent_control_id']})")
    conn.close()
if __name__ == "__main__":
    main()