-
{ev.type}: {ev.description}
+ {typeof ev === 'string' ? (
+
{ev}
+ ) : (
+
{ev.type}: {ev.description}
+ )}
))}
@@ -359,7 +392,18 @@ export function ControlDetail({
-
Pfad: {String(ctrl.generation_metadata.processing_path || '-')}
+ {ctrl.generation_metadata.processing_path && (
+
Pfad: {String(ctrl.generation_metadata.processing_path)}
+ )}
+ {ctrl.generation_metadata.decomposition_method && (
+
Methode: {String(ctrl.generation_metadata.decomposition_method)}
+ )}
+ {ctrl.generation_metadata.pass0b_model && (
+
LLM: {String(ctrl.generation_metadata.pass0b_model)}
+ )}
+ {ctrl.generation_metadata.obligation_type && (
+
Obligation-Typ: {String(ctrl.generation_metadata.obligation_type)}
+ )}
{ctrl.generation_metadata.similarity_status && (
Similarity: {String(ctrl.generation_metadata.similarity_status)}
)}
diff --git a/admin-compliance/app/sdk/control-library/components/helpers.tsx b/admin-compliance/app/sdk/control-library/components/helpers.tsx
index 146f7ac..f48f743 100644
--- a/admin-compliance/app/sdk/control-library/components/helpers.tsx
+++ b/admin-compliance/app/sdk/control-library/components/helpers.tsx
@@ -30,7 +30,7 @@ export interface CanonicalControl {
}
requirements: string[]
test_procedure: string[]
- evidence: EvidenceItem[]
+ evidence: (EvidenceItem | string)[]
severity: string
risk_score: number | null
implementation_effort: string | null
@@ -47,6 +47,10 @@ export interface CanonicalControl {
target_audience: string | string[] | null
generation_metadata?: Record
| null
generation_strategy?: string | null
+ parent_control_uuid?: string | null
+ parent_control_id?: string | null
+ parent_control_title?: string | null
+ decomposition_method?: string | null
created_at: string
updated_at: string
}
@@ -275,7 +279,26 @@ export function GenerationStrategyBadge({ strategy }: { strategy: string | null
if (strategy === 'document_grouped') {
return v2
}
- return null
+ if (strategy === 'phase74_gap_fill') {
+ return v5 Gap
+ }
+ if (strategy === 'pass0b_atomic') {
+ return Atomar
+ }
+ return {strategy}
+}
+
+export const OBLIGATION_TYPE_CONFIG: Record = {
+ pflicht: { bg: 'bg-red-100 text-red-700', label: 'Pflicht' },
+ empfehlung: { bg: 'bg-amber-100 text-amber-700', label: 'Empfehlung' },
+ kann: { bg: 'bg-green-100 text-green-700', label: 'Kann' },
+}
+
+export function ObligationTypeBadge({ type }: { type: string | null | undefined }) {
+ if (!type) return null
+ const config = OBLIGATION_TYPE_CONFIG[type]
+ if (!config) return null
+ return {config.label}
}
export function getDomain(controlId: string): string {
diff --git a/admin-compliance/app/sdk/control-library/page.tsx b/admin-compliance/app/sdk/control-library/page.tsx
index f67f80e..aaa5f32 100644
--- a/admin-compliance/app/sdk/control-library/page.tsx
+++ b/admin-compliance/app/sdk/control-library/page.tsx
@@ -9,7 +9,7 @@ import {
import {
CanonicalControl, Framework, BACKEND_URL, EMPTY_CONTROL,
SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge,
- GenerationStrategyBadge,
+ GenerationStrategyBadge, ObligationTypeBadge,
VERIFICATION_METHODS, CATEGORY_OPTIONS, TARGET_AUDIENCE_OPTIONS,
} from './components/helpers'
import { ControlForm } from './components/ControlForm'
@@ -762,6 +762,7 @@ export default function ControlLibraryPage() {
+
{ctrl.risk_score !== null && (
Score: {ctrl.risk_score}
)}
diff --git a/backend-compliance/compliance/api/canonical_control_routes.py b/backend-compliance/compliance/api/canonical_control_routes.py
index df44dda..992a5a2 100644
--- a/backend-compliance/compliance/api/canonical_control_routes.py
+++ b/backend-compliance/compliance/api/canonical_control_routes.py
@@ -174,6 +174,9 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
customer_visible, verification_method, category,
target_audience, generation_metadata, generation_strategy,
applicable_industries, applicable_company_size, scope_conditions,
+ parent_control_uuid, decomposition_method, pipeline_version,
+ (SELECT p.control_id FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_id,
+ (SELECT p.title FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_title,
created_at, updated_at"""
@@ -798,6 +801,11 @@ def _control_row(r) -> dict:
"applicable_industries": getattr(r, "applicable_industries", None),
"applicable_company_size": getattr(r, "applicable_company_size", None),
"scope_conditions": getattr(r, "scope_conditions", None),
+ "parent_control_uuid": str(r.parent_control_uuid) if getattr(r, "parent_control_uuid", None) else None,
+ "parent_control_id": getattr(r, "parent_control_id", None),
+ "parent_control_title": getattr(r, "parent_control_title", None),
+ "decomposition_method": getattr(r, "decomposition_method", None),
+ "pipeline_version": getattr(r, "pipeline_version", None),
"created_at": r.created_at.isoformat() if r.created_at else None,
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
}
diff --git a/backend-compliance/compliance/api/dsfa_routes.py b/backend-compliance/compliance/api/dsfa_routes.py
index dcd9ce7..f9c79c7 100644
--- a/backend-compliance/compliance/api/dsfa_routes.py
+++ b/backend-compliance/compliance/api/dsfa_routes.py
@@ -200,6 +200,9 @@ def _get_tenant_id(tenant_id: Optional[str]) -> str:
def _dsfa_to_response(row) -> dict:
"""Convert a DB row to a JSON-serializable dict."""
import json
+ # SQLAlchemy 2.0: Row objects need ._mapping for string-key access
+ if hasattr(row, "_mapping"):
+ row = row._mapping
def _parse_arr(val):
"""Parse a JSONB array field → list."""
@@ -558,8 +561,9 @@ async def create_dsfa(
).fetchone()
db.flush()
+ row_id = row._mapping["id"] if hasattr(row, "_mapping") else row[0]
_log_audit(
- db, tid, row["id"], "CREATE", request.created_by,
+ db, tid, row_id, "CREATE", request.created_by,
new_values={"title": request.title, "status": request.status},
)
db.commit()
diff --git a/backend-compliance/migrations/074_control_dedup.sql b/backend-compliance/migrations/074_control_dedup.sql
new file mode 100644
index 0000000..81cb495
--- /dev/null
+++ b/backend-compliance/migrations/074_control_dedup.sql
@@ -0,0 +1,73 @@
+-- Migration 074: Control Dedup Engine — DB Schema
+-- Supports the 4-stage dedup pipeline for atomic controls (Pass 0b).
+--
+-- Tables:
+-- 1. control_parent_links — M:N parent linking (one control → many regulations)
+-- 2. control_dedup_reviews — Review queue for borderline matches (0.85-0.92)
+
+BEGIN;
+
+-- =============================================================================
+-- 1. Control Parent Links (M:N)
+-- Enables "1 Control erfuellt 5 Gesetze" — the biggest USP.
+-- An atomic control can have multiple parent controls from different
+-- regulations/obligations. This replaces the 1:1 parent_control_uuid FK.
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS control_parent_links (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
+ parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
+ link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition'
+ CHECK (link_type IN ('decomposition', 'dedup_merge', 'manual', 'crosswalk')),
+ confidence NUMERIC(3,2) DEFAULT 1.0
+ CHECK (confidence >= 0 AND confidence <= 1),
+ source_regulation VARCHAR(100),
+ source_article VARCHAR(100),
+ obligation_candidate_id UUID REFERENCES obligation_candidates(id),
+ created_at TIMESTAMPTZ DEFAULT NOW(),
+ CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
+);
+
+CREATE INDEX IF NOT EXISTS idx_cpl_control ON control_parent_links(control_uuid);
+CREATE INDEX IF NOT EXISTS idx_cpl_parent ON control_parent_links(parent_control_uuid);
+CREATE INDEX IF NOT EXISTS idx_cpl_type ON control_parent_links(link_type);
+
+COMMENT ON TABLE control_parent_links IS
+ 'M:N parent links — one atomic control can fulfill multiple regulations/obligations. USP: "1 Control erfuellt 5 Gesetze"';
+
+-- =============================================================================
+-- 2. Control Dedup Reviews
+-- Queue for borderline matches (similarity 0.85-0.92) that need human review.
+-- Reviewed entries get status updated to accepted/rejected.
+-- =============================================================================
+
+CREATE TABLE IF NOT EXISTS control_dedup_reviews (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ candidate_control_id VARCHAR(30) NOT NULL,
+ candidate_title TEXT NOT NULL,
+ candidate_objective TEXT,
+ matched_control_uuid UUID REFERENCES canonical_controls(id),
+ matched_control_id VARCHAR(30),
+ similarity_score NUMERIC(4,3) DEFAULT 0.0,
+ dedup_stage VARCHAR(40) NOT NULL,
+ dedup_details JSONB DEFAULT '{}',
+ parent_control_uuid UUID REFERENCES canonical_controls(id),
+ obligation_candidate_id UUID REFERENCES obligation_candidates(id),
+ review_status VARCHAR(20) DEFAULT 'pending'
+ CHECK (review_status IN ('pending', 'accepted_link', 'accepted_new', 'rejected')),
+ reviewed_by VARCHAR(100),
+ reviewed_at TIMESTAMPTZ,
+ review_notes TEXT,
+ created_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+CREATE INDEX IF NOT EXISTS idx_cdr_status ON control_dedup_reviews(review_status);
+CREATE INDEX IF NOT EXISTS idx_cdr_matched ON control_dedup_reviews(matched_control_uuid);
+CREATE INDEX IF NOT EXISTS idx_cdr_parent ON control_dedup_reviews(parent_control_uuid);
+CREATE INDEX IF NOT EXISTS idx_cdr_stage ON control_dedup_reviews(dedup_stage);
+
+COMMENT ON TABLE control_dedup_reviews IS
+ 'Review queue for borderline dedup matches (similarity 0.85-0.92). Human decides: link or new control.';
+
+COMMIT;
diff --git a/backend-compliance/tests/test_canonical_control_routes.py b/backend-compliance/tests/test_canonical_control_routes.py
index 9097294..867537f 100644
--- a/backend-compliance/tests/test_canonical_control_routes.py
+++ b/backend-compliance/tests/test_canonical_control_routes.py
@@ -195,6 +195,11 @@ class TestControlRowConversion:
"release_state": "draft",
"tags": ["mfa"],
"generation_strategy": "ungrouped",
+ "parent_control_uuid": None,
+ "parent_control_id": None,
+ "parent_control_title": None,
+ "decomposition_method": None,
+ "pipeline_version": None,
"created_at": now,
"updated_at": now,
}
diff --git a/docs-src/development/qa-control-quality.md b/docs-src/development/qa-control-quality.md
index 7b26c43..3072e9d 100644
--- a/docs-src/development/qa-control-quality.md
+++ b/docs-src/development/qa-control-quality.md
@@ -2,7 +2,23 @@
## Übersicht
-Die Control Quality Pipeline prüft und verbessert die ~9.000 Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert.
+Die Control Quality Pipeline prüft und verbessert die Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert.
+
+Alle Scripts liegen in **`scripts/qa/`**. Starten auf dem Mac Mini via Runner-Script:
+
+```bash
+# Job starten (laedt .env automatisch, PID-Lock, unbuffered output)
+ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh <script.py> [args...]"
+
+# Status aller Jobs
+ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --status"
+
+# Log ansehen
+ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --log <script.py>"
+
+# Job stoppen
+ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --kill <script.py>"
+```
## Architektur
@@ -55,20 +71,24 @@ Jeder Control hat ein Feld `source_original_text` — der Chunk-Text aus dem Que
| Metrik | Wert |
|---|---|
-| Controls mit source_original_text | 7.943 |
-| Im PDF lokalisiert | **6.259 (79%)** |
-| Nicht gefunden (Sprachmismatch) | 1.651 |
-| Kein PDF vorhanden | 33 |
-| 100% Match-Rate | 19 Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53) |
+| Controls mit source_original_text | 5.741 (86%) |
+| Im PDF lokalisiert | **5.063 (88%)** |
+| Nicht gefunden | 649 |
+| Kein PDF vorhanden | 29 |
+| Recital_suspect markiert | 648 |
+| 100% Match-Rate | 20+ Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53, Blue Guide) |
+
+**Verlauf:** v1 (4.110, 52%) → v2 (6.091, 77%) → v3 (6.259, 79%) → v4 +Blue Guide EN (6.803, 86%) → v5 nach Cleanup (5.063/5.741, 88%)
### Nicht-matchende Controls
-| Ursache | Controls | Erklärung |
+| Ursache | Controls | Status |
|---|---|---|
-| Blue Guide EN vs. DE PDF | ~562 | Controls aus englischem PDF, wir haben nur deutsches |
-| OWASP multilingual | ~632 | Controls aus PT/AR/ID/ES-Übersetzungen |
+| ~~Blue Guide EN vs. DE PDF~~ | ~~562~~ | ✅ Gelöst — EN-PDF beschafft, 544/544 gematcht |
+| ~~OWASP Top 10 multilingual~~ | ~~324~~ | ✅ Als duplicate markiert — Übersetzungen ohne Mehrwert |
| CRA Encoding | ~76 | PDF-Ligaturen/Sonderzeichen-Differenzen |
| CISA Secure by Design | ~113 | Falsches PDF (ENISA statt CISA) |
+| OWASP ASVS | ~173 | PDF-Matching-Problem (meist EN) |
## Brute-Force-Suche
@@ -100,34 +120,276 @@ Controls aus Erwägungsgründen (`article_type = preamble`) sind **kein Nachteil
Die 1.195 v1-Controls **ohne** Originaltext sind manuell erstellt (`strategy=ungrouped`) und haben keine Chunk-Referenz.
-## DB-Status (Stand 2026-03-20)
+## OWASP Cleanup (2026-03-20)
+
+- **324 OWASP Top 10 multilingual Controls** → `duplicate` markiert (ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien)
+- **47 Controls** mit falscher Quellenzuordnung korrigiert (z.B. als "OWASP Top 10" getaggt, aber tatsächlich aus ASVS/SAMM/API/MASVS)
+- **~200 OWASP ASVS/SAMM/MASVS EN Controls** behalten — unique Content aus GitHub/Website, nicht im PDF auffindbar
+
+## NIST OSCAL Import (2026-03-20)
+
+**776 neue Controls** aus NIST SP 800-53 Rev 5 OSCAL (Public Domain, maschinenlesbar):
+
+- Quelle: `usnistgov/oscal-content` (JSON Catalog)
+- Vor allem **Control Enhancements** (z.B. AC-2(3), SC-7(8)) — die atomaren Unteranforderungen
+- Jeder Control enthält: Statement + Guidance + Assessment-Methoden + Cross-References + Parameters
+- `pipeline_version = 4`, `generation_strategy = 'oscal_import'`
+- Kein Pass 0a/0b nötig — Controls sind **bereits atomar**
+
+| Metrik | Vorher | Nachher |
+|---|---|---|
+| SP 800-53 Controls (aktiv) | 1.107 | **1.883** |
+| OSCAL-Abdeckung | 238/1.014 (23%) | **1.014/1.014 (100%)** |
+
+## Phase 5: RAG-Deduplizierung + Normalisierung (2026-03-20)
+
+### Durchgeführte Schritte
+
+| Schritt | Beschreibung | Controls |
+|---|---|---|
+| 5.1 | OSCAL Controls: `source_regulation` in generation_metadata gesetzt | 776 |
+| 5.2 | v3 Controls ohne Source → `needs_review` mit `missing_source` Flag | 20 |
+| 5.3 | Leerer Source-Name korrigiert (AT TKG) | 1 |
+| 5.4 | OWASP regulation_code Fehlzuordnungen korrigiert | 47 |
+| 5.5 | **duplicate/too_close Controls hart gelöscht** | **3.301** |
+| 5.6 | Processed Chunks bereinigt (gelöschte Control-IDs entfernt) | 2.520 |
+
+### Ergebnis
+
+- **Vorher:** 9.936 Controls (6.635 aktiv, 2.998 duplicate, 303 too_close)
+- **Nachher:** 6.635 Controls, **alle aktiv** (0 duplicate/too_close)
+- Alle regulation_codes haben jetzt einheitliche Source-Namen
+- OWASP-Controls sind korrekt ihren Quellen zugeordnet
+
+## DB-Status (Stand 2026-03-20, nach Phase 7.4)
| release_state | Count |
|---|---|
-| draft | 5.365 |
-| needs_review | 818 |
-| duplicate | 2.674 |
-| too_close | 303 |
-| **Aktiv** | **6.183** |
+| draft | ~6.030 |
+| needs_review | 838 |
+| **Gesamt** | **6.868** |
-## Scripts
+## Scripts (`scripts/qa/`)
-Alle QA-Scripts liegen in `scripts/qa/`:
+### Kern-QA (PDF-Matching)
| Script | Beschreibung |
|---|---|
-| `pdf_qa_all.py` | Haupt-QA: Controls gegen PDFs matchen |
-| `pdf_qa_inventory.py` | Inventar: Regulations, Controls, PDFs |
-| `apply_pdf_qa_results.py` | Ergebnisse in DB schreiben |
-| `preamble_dedup.py` | Preamble vs. Artikel Duplikat-Erkennung |
-| `qa_dedup_controls.py` | Jaccard-basierte Titel-Dedup |
-| `qa_normalize_sources.py` | Source-Namen normalisieren |
-| `db_status.py` | DB-Status-Übersicht |
+| `pdf_qa_all.py` | **Haupt-QA**: Controls gegen PDFs matchen, Artikel-Index aufbauen. Enthaelt `SOURCE_FILE_MAP`, alle Index-Builder (EU, DE, NIST, OWASP, generic). 526 Zeilen. |
+| `pdf_qa_inventory.py` | Inventar: Welche Regulations haben Controls, wie viele, welche PDFs existieren |
+| `apply_pdf_qa_results.py` | Ergebnisse aus `pdf_qa_all.py` in DB schreiben (`article_type`, `recital_suspect`) |
+| `pdf_article_lookup_poc.py` | POC: Control-Text in PDF lokalisieren, Headings von Cross-Refs unterscheiden |
-## Nächste Schritte
+### Lueckenanalyse + Control-Generierung
-1. **Blue Guide EN-PDF** beschaffen → +562 Controls matchen
-2. **CISA Secure by Design** echtes PDF finden → +113 Controls
-3. **Brute-Force Ergebnisse anwenden** — 44 falsche Source-Zuordnungen korrigieren
-4. **Frontend-Anzeige** — `article_type` im Control-Detail anzeigen
-5. **Continuous QA** — Bei neuen Controls automatisch PDF-Match prüfen
+| Script | Beschreibung |
+|---|---|
+| `gap_analysis.py` | **Phase 7.3**: Artikel im PDF vs. Controls in DB vergleichen, Luecken identifizieren |
+| `phase74_generate_gap_controls.py` | **Phase 7.4**: Neue Controls fuer Luecken via Anthropic API generieren. `pipeline_version=5`. 624 Zeilen. |
+| `benchmark_llm_controls.py` | LLM-Vergleich: gpt-oss-120b vs. Claude Sonnet fuer Control-Generierung |
+| `test_pass0a.py` | **Pass 0a Test**: Obligation Extraction + 3-Tier-Klassifizierung (Pflicht/Empfehlung/Kann). Standalone, speichert JSON. |
+
+### Deduplizierung + Normalisierung
+
+| Script | Beschreibung |
+|---|---|
+| `preamble_dedup.py` | Preamble vs. Artikel Duplikat-Erkennung (Jaccard >= 0.40) |
+| `qa_dedup_controls.py` | Jaccard-basierte Titel-Deduplizierung |
+| `qa_apply_and_dedup.py` | Ergebnisse anwenden + Duplikate in einem Schritt markieren |
+| `qa_normalize_sources.py` | Source-Namen normalisieren (kanonische Namen) |
+| `phase5_normalize_and_cleanup.py` | **Phase 5**: Normalisierung + 3.301 Duplikate hart loeschen |
+| `qa_delete_gpsr_dupe.py` | GPSR-Duplikate loeschen |
+| `delete_gpsr_prod.py` | GPSR-Duplikate aus Production-Qdrant entfernen |
+
+### Quellen-spezifische Scripts
+
+| Script | Beschreibung |
+|---|---|
+| `blue_guide_en_match.py` | Blue Guide EN-PDF matchen (544/544 Erfolg) |
+| `owasp_cleanup.py` | OWASP multilingual Cleanup (324 Duplikate) + Source-Fix (47 korrigiert) |
+| `owasp_github_match.py` | OWASP ASVS/SAMM/MASVS gegen GitHub-Markdown matchen |
+| `oscal_import.py` | NIST OSCAL Import (776 Controls aus JSON Catalog) |
+| `oscal_analysis.py` | NIST OSCAL Analyse: Abdeckung, fehlende Controls |
+
+### Diagnose + Utilities
+
+| Script | Beschreibung |
+|---|---|
+| `db_status.py` | DB-Status: release_state Counts, pipeline_version, source Verteilung |
+| `debug_low_match.py` | Debugging: Warum matchen Blue Guide / OWASP / CISA schlecht? |
+| `qa_article_map_all_chunks.py` | Alle Chunks Artikel-Nummern zuordnen (Bulk) |
+| `backfill_job_66228863.py` | Einmaliger Backfill-Job |
+| `sync_controls_to_prod.py` | Controls von Dev nach Production synchronisieren |
+
+### Runner
+
+| Script | Beschreibung |
+|---|---|
+| `run_job.sh` | **Job-Runner**: Laedt `.env`, PID-Lock, Monitoring (`--status`, `--log`, `--kill`) |
+
+## Phase 7: PDF-Validierung + Enrichment (2026-03-20)
+
+### 7.1 + 7.2: Controls gegen PDFs validiert + Ergebnisse angewendet ✅
+
+- 5.063 Controls erfolgreich im Original-PDF lokalisiert (88%)
+- `article_type` fuer alle gematchten Controls gesetzt
+- 648 Preamble-Controls als `recital_suspect` in `generation_metadata` markiert
+- 332 Controls nicht matchbar (OWASP ASVS 132, CISA 72, ENISA 38, OWASP SAMM 31, CRA 28)
+
+### 7.3: Lueckenanalyse ✅
+
+**494 Artikel-Luecken** in 15 Quellen identifiziert. Geschaetzt ~300 davon actionable.
+
+| Source | Luecken | Coverage | Bemerkung |
+|---|---:|---:|---|
+| AML-Verordnung | 91 | 5% | Kaum ingestiert |
+| MiCA | 71 | 52% | Grosse Verordnung |
+| NIST SP 800-53 | 59 | 83% | Meist Section-Header, nur SA-15 fehlt |
+| OWASP ASVS 4.0 | 47 | 35% | Requirement-Gruppen fehlen |
+| Batterieverordnung | 41 | 58% | |
+| DSGVO | 35 | 65% | Einige Governance/Aufsicht-Artikel |
+| ENISA ICS/SCADA | 34 | 31% | |
+| ENISA Supply Chain | 26 | 7% | |
+| CRA | 23 | 68% | |
+| NIS2 | 16 | 65% | |
+| KI-Verordnung | 15 | 87% | Fast komplett |
+| Maschinenverordnung | 5 | 91% | Fast komplett |
+
+### 7.4: Neue Controls fuer Luecken generieren ✅ (2026-03-20)
+
+Script: `phase74_generate_gap_controls.py --resume`
+
+- **494 Artikel-Luecken** in 15 Quellen → Anthropic Claude Sonnet 4.6
+- `pipeline_version = 5`, `generation_strategy = 'phase74_gap_fill'`
+- Direkt PDF-Text als Input (nicht RAG-Chunks)
+- Starten via: `run_job.sh phase74_generate_gap_controls.py --resume`
+
+**Ergebnis:**
+
+| Source | Luecken | Generiert |
+|---|---:|---:|
+| AML-Verordnung | 91 | 97 |
+| MiCA | 71 | 68 |
+| NIST SP 800-53 | 59 | 19 |
+| KI-Verordnung | 15 | 15 |
+| OWASP ASVS 4.0 | 47 | 11 |
+| Batterieverordnung | 41 | 9 |
+| DSGVO | 35 | 4 |
+| OWASP Top 10 | 12 | 3 |
+| NIS2 | 16 | 3 |
+| CRA | 23 | 3 |
+| OECD KI-Empfehlung | 4 | 1 |
+| **Gesamt** | **494** | **233** |
+
+Nicht generiert: 75 zu kurzer Text, 29 NIST-Intros, 11 Parse-Errors, 162 ID-Konflikte (COMP-1000 etc.).
+API-Kosten: ~$7,55 (109 min Laufzeit).
+
+## Pass 0a: Obligation Extraction — 3-Tier-Klassifizierung
+
+### Konzept
+
+Pass 0a zerlegt Rich Controls (~6.000) in **atomare Obligations** per LLM (Claude Sonnet 4.6).
+Jede Obligation wird durch den **Quality Gate** klassifiziert — nicht gefiltert:
+
+| obligation_type | Signal | Beispiel |
+|---|---|---|
+| **pflicht** | müssen, muss, ist zu, hat zu, shall, must, required | "Der Betreiber muss alle Daten verschluesseln" |
+| **empfehlung** | soll, sollen, should, sicherstellen, gewaehrleisten, dokumentieren | "Der Betreiber soll regelmaessige Audits durchfuehren" |
+| **kann** | kann, koennen, darf, duerfen, may, optional | "Der Betreiber kann zusaetzliche Massnahmen ergreifen" |
+
+**Wichtig:** Nichts wird mehr rejected wegen fehlendem normativem Signal. Obligations ohne Signal werden als `empfehlung` klassifiziert. Rejected werden nur noch: Evidence-Only, zu kurz (<20 Zeichen), fehlender Parent-Link.
+
+### Warum auch Empfehlungen behalten?
+
+Empfehlungen helfen Firmen, ihre Systeme sicherer zu machen — ueber das Pflichtprogramm hinaus. Im Frontend erhalten Kunden einen Marker, der klar anzeigt:
+
+- **Pflicht** = gesetzlich/regulatorisch vorgeschrieben
+- **Empfehlung** = Best Practice, freiwillig, aber wertvoll
+- **Kann** = optional, weitergehende Massnahme
+
+### Quality Gate — Kritische Flags
+
+| Flag | Kritisch? | Beschreibung |
+|---|---|---|
+| `has_normative_signal` | Nein | Informativer Check, kein Ablehnungsgrund |
+| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) |
+| `not_evidence_only` | **Ja** | Kein reiner Nachweis-Eintrag |
+| `min_length` | **Ja** | Mindestens 20 Zeichen |
+| `has_parent_link` | **Ja** | Verbindung zum Parent-Control |
+| `single_action` | Nein | Nur ein Hauptverb (heuristisch) |
+| `not_rationale` | Nein | Keine reine Begruendung |
+
+### Normative Signal Detection — Regex-Tiers
+
+```
+Tier 1 (Pflicht): muessen, muss, ist/sind/hat/haben zu + Infinitiv,
+ Compound-Verben (festzustellen, vorzunehmen),
+ Gerundivum (mitzuteilen, bereitzustellen),
+ shall, must, required
+
+Tier 2 (Empfehlung): soll, sollen, sollte, sollten,
+ gewaehrleisten, sicherstellen,
+ should, ensure, recommend,
+ dokumentieren, implementieren, ueberpruefen
+
+Tier 3 (Kann): kann, koennen, darf, duerfen, may, optional
+```
+
+### Testergebnisse (3 Iterationen, 2026-03-20)
+
+| Run | Controls | Obligations | Validated | Rejected | Kosten |
+|---|---:|---:|---:|---:|---:|
+| 1 (v0 Regex) | 10 | ~100 | 68% | 32% | $0,28 |
+| 2 (v1 Regex) | 50 | ~530 | 78% | 22% | $1,43 |
+| 3 (v2 Regex) | 50 | ~530 | 86% | 14% | $1,44 |
+| 4 (3-Tier) | 60 | — | — | — | — |
+
+Run 4 laeuft mit dem neuen Klassifizierer — statt PASS/REJECT wird jetzt PFLICHT/EMPFEHLUNG/KANN ausgegeben.
+
+### Scripts
+
+| Script | Beschreibung |
+|---|---|
+| `test_pass0a.py` | **Test-Script**: Standalone (kein SQLAlchemy), psycopg2 + Anthropic API. Speichert Ergebnisse als JSON. |
+
+```bash
+# Test mit 10 Controls
+run_job.sh test_pass0a.py --limit 10
+
+# Test mit bestimmter Quelle
+run_job.sh test_pass0a.py --limit 20 --source "DSGVO"
+
+# Ergebnisse: /tmp/pass0a_results_controls.json
+```
+
+### Backend-Code
+
+- **Klassifizierung:** `backend-compliance/compliance/services/decomposition_pass.py`
+ - `classify_obligation_type()` — 3-Tier-Klassifizierung
+ - `quality_gate()` — gibt `obligation_type` in Flags zurueck
+ - `passes_quality_gate()` — `has_normative_signal` nicht mehr kritisch
+ - `ObligationCandidate.obligation_type` — neues Feld
+
+### Hochrechnung (basierend auf 50-Control-Runs)
+
+| Metrik | Wert |
+|---|---|
+| Kosten pro Control | ~$0,029 |
+| Kosten fuer ~6.000 Controls | **~$172** |
+| Laufzeit (geschaetzt) | ~25h |
+| Obligations pro Control | ~10,5 |
+
+---
+
+## Naechste Schritte
+
+1. ~~**Phase 5 Cleanup** → 3.301 Duplikate geloescht, Source normalisiert~~ ✅
+2. ~~**Phase 6 Pipeline-Haertung** → Source aus REGULATION_LICENSE_MAP~~ ✅
+3. ~~**Phase 7.1-7.3** → PDF-Validierung + Enrichment + Lueckenanalyse~~ ✅
+4. ~~**Phase 7.4** → 233 neue Controls fuer Luecken generiert ($7,55)~~ ✅
+5. **Pass 0a** → Obligation Extraction mit 3-Tier-Klassifizierung (Tests laufen, ~$172)
+6. **Pass 0b** → Atomic Control Composition aus validierten Obligations
+7. **Pass 1-5** → Multi-Layer Migration (Code + 500 Tests bereits vorhanden)
+8. **Phase 8** → Qdrant Re-Ingestion (Runtime-Betrieb, ZULETZT)
+9. **needs_review Triage** — 838 Controls klassifizieren
+10. **Frontend** — `obligation_type` (Pflicht/Empfehlung/Kann) + `article_type` anzeigen
diff --git a/docs-src/development/rag-pipeline-benchmark.md b/docs-src/development/rag-pipeline-benchmark.md
new file mode 100644
index 0000000..2250a05
--- /dev/null
+++ b/docs-src/development/rag-pipeline-benchmark.md
@@ -0,0 +1,206 @@
+# RAG Pipeline Benchmark & Optimierungen
+
+Stand: 2026-03-21. Vergleich unserer Implementierung mit State of the Art. Priorisierte Empfehlungen nach Impact/Effort.
+
+---
+
+## Aktuelle Pipeline (Ist-Zustand)
+
+```mermaid
+flowchart LR
+ A[Dokumente] -->|Document Crawler| B[Chunks 512/50]
+ B -->|bge-m3| C[Qdrant Dense]
+ C -->|Cosine Search| D[Control Generator v2]
+ D -->|LLM| E[Rich Controls 6.373]
+ E -->|Pass 0a| F[Obligations]
+ F -->|Pass 0b| G[Atomare Controls]
+ G -->|4-Stage Dedup| H[Master Controls ~18K]
+```
+
+| Komponente | Implementierung | SOTA-Bewertung |
+|-----------|----------------|----------------|
+| **Chunking** | Rekursiv, 512 Zeichen, 50 Overlap | Zu klein fuer Rechtstexte |
+| **Embedding** | bge-m3 (1024-dim, Ollama) | Gut, aber nur Dense genutzt |
+| **Vector DB** | Qdrant mit Payload-Filtering | Hybrid Search nicht aktiviert |
+| **Retrieval** | Pure Dense Cosine Similarity | Kein Re-Ranking, kein BM25 |
+| **Extraktion** | 3-Tier (Exact → Embedding → LLM) | Solide Architektur |
+| **Dedup** | 4-Stage (Pattern → Action → Object → Embedding) | Ueberdurchschnittlich |
+| **QA** | 5-Metrik Similarity + PDF-QA Matching | Gut, RAGAS fehlt |
+
+---
+
+## Tier 1: Quick Wins (Tage, nicht Wochen)
+
+### 1. Chunk-Groesse erhoehen: 512 → 1024, Overlap 50 → 128
+
+**Problem:** NAACL 2025 Vectara-Studie zeigt: fuer analytische/juristische Queries sind 512-1024 Token optimal. Unsere 512-Zeichen-Chunks (= ~128 Token) sind deutlich zu klein.
+
+**Unsere Lessons Learned:** "Chunks werden mitten im Absatz abgeschnitten. Artikel- und Paragraphennummern fehlen."
+
+**Aenderung:** Config-Parameter in `ingest-phase-h.sh` anpassen.
+
+| Metrik | Vorher | Nachher |
+|--------|--------|---------|
+| Chunk Size | 512 chars (~128 Token) | 1024 chars (~256 Token) |
+| Overlap | 50 chars (10%) | 128 chars (12.5%) |
+
+**Impact:** HOCH | **Effort:** NIEDRIG
+
+### 2. Ollama JSON-Mode fuer Obligation Extraction
+
+**Problem:** `_parse_json` in `decomposition_pass.py` hat Regex-Fallback — das zeigt, dass LLM-Output nicht zuverlaessig JSON ist.
+
+**Aenderung:** `format: "json"` in Ollama-API-Calls setzen.
+
+**Impact:** MITTEL | **Effort:** NIEDRIG (1 Parameter)
+
+### 3. Chain-of-Thought Prompting fuer Pass 0a/0b
+
+**Problem:** LegalGPT-Framework zeigt: explizite Reasoning-Chains ("Erst Adressat identifizieren, dann Aktion, dann normative Staerke") verbessern Extraktionsqualitaet signifikant.
+
+**Impact:** MITTEL | **Effort:** NIEDRIG (Prompt Engineering)
+
+---
+
+## Tier 2: High Impact, Medium Effort (1-2 Wochen)
+
+### 4. Hybrid Search (Dense + Sparse) via Qdrant
+
+**Problem:** Reine Dense-Suche. Juristische Queries enthalten spezifische Begriffe ("DSGVO Art. 35", "Abs. 3"), die BM25/Sparse besser findet.
+
+**Loesungsansatz:** BGE-M3 generiert bereits Sparse Vectors — wir verwerfen sie aktuell!
+
+```
+Qdrant Query API:
+- Dense: bge-m3 Cosine (wie bisher)
+- Sparse: bge-m3 Sparse Vectors (neu)
+- Fusion: Reciprocal Rank Fusion (RRF)
+```
+
+**Benchmarks (Anthropic):** 49% weniger fehlgeschlagene Retrievals mit Contextual Retrieval, 67% mit Re-Ranking.
+
+**Impact:** SEHR HOCH | **Effort:** MITTEL
+
+### 5. Cross-Encoder Re-Ranking
+
+**Problem:** Top-5 Ergebnisse direkt an LLM — keine Qualitaetspruefung der Retrieval-Ergebnisse.
+
+**Loesungsansatz:** BGE Reranker v2 (MIT-Lizenz) auf Top-20 Ergebnisse, dann Top-5 an LLM.
+
+| Re-Ranker | Lizenz | Empfehlung |
+|-----------|--------|------------|
+| BGE Reranker v2 | MIT | Empfohlen |
+| Jina Reranker v2 | CC-BY-NC-4.0 | Nicht nutzbar (Lizenz!) |
+| ColBERT v2 | MIT | Spaeter |
+
+**Impact:** HOCH | **Effort:** MITTEL
+
+### 6. Cross-Regulation Dedup Pass
+
+**Problem:** Dedup filtert immer nach `pattern_id` — Controls aus DSGVO Art. 25 und NIS2 Art. 21 (beide Security-by-Design) werden nie verglichen.
+
+**Loesungsansatz:** Zweiter Qdrant-Search ohne `pattern_id`-Filter nach dem normalen Dedup-Pass.
+
+**Impact:** HOCH | **Effort:** MITTEL
+
+### 7. Automatische Regressionstests (Golden Set)
+
+**Problem:** Keine systematische Qualitaetsmessung nach Pipeline-Aenderungen.
+
+**Loesungsansatz:** 20-Chunk Golden Set → Control-Generation → Output-Stabilitaet pruefen.
+
+**Impact:** HOCH | **Effort:** NIEDRIG
+
+---
+
+## Tier 3: Strategische Investitionen (Wochen bis Monate)
+
+### 8. Artikel-Boundary Chunking
+
+Eigener Splitter fuer EU-Verordnungen und deutsche Gesetze: Split an "Art.", "Artikel", "Paragraph"-Grenzen statt nach Zeichenzahl.
+
+### 9. RAGAS Evaluation Pipeline
+
+[RAGAS](https://docs.ragas.io/) mit Golden Dataset (50-100 manuell verifizierte Control-to-Source Mappings). Metriken: Faithfulness, Answer Relevancy, Context Precision, Context Recall.
+
+### 10. BGE-M3 Fine-Tuning
+
+Fine-Tuning auf Compliance-Corpus (~6.373 Control-Titel/Objective-Paare). Research zeigt +10-30% Domain-Retrieval-Verbesserung.
+
+### 11. LLM-as-Judge
+
+Claude Sonnet bewertet jeden generierten Control auf Faithfulness zum Quelltext (~$0.01/Control).
+
+### 12. Active Learning aus Review-Queue
+
+Menschliche Entscheidungen der Dedup Review-Queue nutzen, um Schwellenwerte ueber die Zeit zu optimieren.
+
+---
+
+## Nicht empfohlen (niedriger ROI oder Konflikte)
+
+| Ansatz | Grund |
+|--------|-------|
+| Jina v3 Embeddings | **CC-BY-NC-4.0** — verletzt Open Source Policy |
+| Voyage-law-2 | API-only, proprietaer — kein Self-Hosting |
+| Semantic Chunking | Benchmarks zeigen keinen Vorteil gegenueber Recursive fuer strukturierte Dokumente |
+| HyDE als Primaerstrategie | Latenz (+43-60%) + Halluzinationsrisiko |
+| Knowledge Graph RAG | Massiver Aufwand, unklarer Gewinn bei strukturiertem Rechtskorpus |
+
+---
+
+## Embedding-Modell Vergleich
+
+| Modell | MTEB Score | Multilingual | Kontext | Lizenz | Bewertung |
+|--------|-----------|-------------|---------|--------|-----------|
+| **BGE-M3** (aktuell) | 63.0 | 100+ Sprachen | 8192 Token | MIT | Gut, Dense+Sparse+ColBERT |
+| Jina v3 | 65.5 | 89 Sprachen | 8192 Token | CC-BY-NC | Nicht nutzbar (Lizenz!) |
+| E5-Mistral-7B | ~65 | Gut | 4096 Token | MIT | Gross, hoher RAM |
+| Voyage-law-2 | Best Legal | EN Legal | 16K Token | Proprietaer | Nicht nutzbar (API-only) |
+
+**Fazit:** BGE-M3 bleibt die beste Wahl fuer unseren Stack. Sparse-Vectors aktivieren und Fine-Tuning bringen mehr als ein Modellwechsel.
+
+---
+
+## Test-Coverage Analyse
+
+### Pipeline-Module (567 Tests)
+
+| Modul | Tests | Bewertung | Fehlende Tests |
+|-------|-------|-----------|----------------|
+| Control Generator | 110 | Exzellent | 10-15 Edge Cases |
+| Obligation Extractor | 107 | Exzellent | 8-10 Edge Cases |
+| Decomposition Pass | 90 | Exzellent | 5-8 Edge Cases |
+| Pattern Matcher | 72 | Gut | 10-15 Edge Cases |
+| Control Dedup | 56 | Exzellent | 5-8 Edge Cases |
+| Control Composer | 54 | Gut | 8-10 Edge Cases |
+| Pipeline Adapter | 36 | Gut | 10-15 Edge Cases |
+| Citation Backfill | 20 | Moderat | 5-8 Edge Cases |
+| License Gate | 12 | Minimal | 5-8 Edge Cases |
+| RAG Client | 10 | Minimal | 5-8 Edge Cases |
+
+### Kritische Luecken (fehlende Tests)
+
+| Service | Datei | Prioritaet |
+|---------|-------|------------|
+| AI Compliance Assistant | `ai_compliance_assistant.py` | HOCH (25-30 Tests noetig) |
+| PDF Extractor | `pdf_extractor.py` | HOCH (20-25 Tests noetig) |
+| LLM Provider | `llm_provider.py` | HOCH (15-20 Tests noetig) |
+| Similarity Detector | `similarity_detector.py` | MITTEL (20-25 Tests noetig) |
+| Anchor Finder | `anchor_finder.py` | MITTEL |
+
+### Test-Infrastruktur
+
+**Fehlend:** Shared `conftest.py` mit gemeinsamen Fixtures (LLM-Mock, DB-Mock, Embedding-Mock). Aktuell sind Fixtures in jedem Test-File dupliziert.
+
+---
+
+## Quellen
+
+- [NAACL 2025 Vectara Chunking Study](https://blog.premai.io/rag-chunking-strategies-the-2026-benchmark-guide/)
+- [Anthropic Contextual Retrieval](https://www.anthropic.com/news/contextual-retrieval)
+- [Qdrant Hybrid Search Query API](https://qdrant.tech/articles/hybrid-search/)
+- [Structure-Aware Chunking for Legal (ACL 2025)](https://aclanthology.org/2025.justnlp-main.19/)
+- [RAGAS Evaluation Framework](https://docs.ragas.io/)
+- [BGE Reranker v2 (MIT)](https://huggingface.co/BAAI/bge-reranker-v2-m3)
+- [LegalGPT / CALLM Framework](https://www.emergentmind.com/topics/compliance-alignment-llm-callm)
diff --git a/docs-src/development/rag-pipeline-lessons-learned.md b/docs-src/development/rag-pipeline-lessons-learned.md
new file mode 100644
index 0000000..d74dcd7
--- /dev/null
+++ b/docs-src/development/rag-pipeline-lessons-learned.md
@@ -0,0 +1,223 @@
+# RAG Pipeline: Lessons Learned & Hardening
+
+## Übersicht
+
+Dieses Dokument beschreibt die Erkenntnisse aus dem Aufbau der RAG-Pipeline und die daraus abgeleiteten Maßnahmen zur Härtung. Es dient als Referenz für zukünftige Ingestion-Runs und Pipeline-Erweiterungen.
+
+## Architektur: Wann brauchen wir RAG vs. Direct PDF?
+
+### RAG ist nötig für:
+
+| Use Case | Warum RAG? |
+|---|---|
+| **Compliance Advisor (Chat)** | Semantische Suche über 38+ Dokumente in Echtzeit |
+| **Cross-Regulation Mapping** | "Zeige alle Anforderungen zu Verschlüsselung" über alle Quellen |
+| **Customer Scope-Filtering** | Nur Chunks aus relevanten Regulations für den Kunden |
+| **Inkrementelle Updates** | Neues Dokument → nur neue Chunks verarbeiten |
+
+### RAG ist NICHT nötig für:
+
+| Use Case | Besser: Direct PDF |
+|---|---|
+| **Control-Generierung (Batch)** | PDF → PyMuPDF → Strukturparser → Artikel-Index → API |
+| **PDF-QA/Verifizierung** | Substring-Match direkt im PDF (schneller, exakter) |
+| **Artikel/§-Extraktion** | Regex-basierte Extraktion aus PDF-Text |
+
+### Hybrid-Ansatz (Empfehlung)
+
+```
+Control-Generierung: PDF → Strukturparser → Artikel-Index → Anthropic API
+ (KEIN RAG nötig, direkt aus PDF)
+
+Runtime-Betrieb: Qdrant-RAG für semantische Suche, Chat, Scope-Analyse
+ (RAG mit angereicherten Chunks + Struktur-Metadaten)
+```
+
+## Fehler und Root Causes
+
+### 1. Doppelte Ingestion = Doppelte Controls
+
+**Problem:** Gleiche PDFs unter verschiedenen Namen ingestiert (z.B. "Maschinenverordnung" und "Verordnung (EU) 2023/1230") → unterschiedliche Chunks (anderes Chunking) → anderer Hash → doppelt verarbeitet → doppelte Controls.
+
+**Root Cause:**
+- `regulation_name` aus Chunk-Metadaten statt aus kanonischer Quelle
+- UNIQUE-Constraint nur `(chunk_hash, collection, document_version)` — nicht global
+- Kein Check ob `regulation_code` bereits in einer Collection existiert
+
+**Fix (implementiert):**
+- `REGULATION_LICENSE_MAP` enthält jetzt kanonische `name`-Werte die den DB-Einträgen entsprechen
+- `source_citation.source` wird aus `REGULATION_LICENSE_MAP.name` genommen, NICHT aus `chunk.regulation_name`
+- Phase 5 Cleanup: 3.301 Duplikate hart gelöscht
+
+**Fix (noch offen):**
+- Chunk-Hash UNIQUE Constraint global machen: `(chunk_hash, document_version)` statt `(chunk_hash, collection, document_version)`
+- Vor Ingestion: Check ob `regulation_code` bereits in einer Collection existiert
+
+### 2. Chunks verlieren Strukturinformation
+
+**Problem:** Chunks werden mitten im Absatz abgeschnitten. § und Artikelnummern fehlen in den Chunk-Metadaten. Kontext des Kapitels/Abschnitts geht verloren.
+
+**Root Cause:**
+- `chunk_strategy=recursive` mit `chunk_size=512, chunk_overlap=50` — zu kleine Chunks
+- Chunking beachtet keine Dokumentstruktur (Artikel-/Paragraphengrenzen)
+- Keine Einleitung/Kapitelkontext als Prefix
+
+**Empfehlung für Re-Ingestion:**
+- **Strukturiertes Chunking:** Chunks an Artikel-/Paragraphengrenzen schneiden
+- **Kontext-Prefix:** Kapiteleinleitung und übergeordnete Struktur mitliefern
+- **Metadaten anreichern:** `article`, `paragraph`, `article_type`, `section_hierarchy`
+- **Größere Chunks:** Mindestens 1024 Tokens, besser volle Artikel/Paragraphen
+
+### 3. Cross-Collection-Duplikate
+
+**Problem:** `nist_csf_2_0` in `bp_compliance_ce` (67 Chunks) UND `bp_compliance_datenschutz` (162 Chunks). EU-Verordnungen sowohl in `bp_compliance_ce` als auch `bp_compliance_gesetze`.
+
+**Root Cause:** Keine Collection-Zuordnungsregeln. Manuelle Zuweisung bei Ingestion.
+
+**Fix:** `cleanup-qdrant-duplicates.py` Script bereinigt Cross-Collection-Duplikate.
+
+**Empfehlung:** Klare Collection-Zuordnungsregeln:
+- `bp_compliance_ce` = EU-Verordnungen + internationale Standards
+- `bp_compliance_gesetze` = Deutsche + österreichische Gesetze (NUR nationale Gesetze)
+- `bp_compliance_datenschutz` = EDPB/WP29 Leitlinien + Privacy Frameworks
+
+### 4. OWASP Multilingual Controls
+
+**Problem:** 324 OWASP Top 10 Controls in ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien. Kein Mehrwert, aber 324 doppelte Controls generiert.
+
+**Root Cause:** Multilingual PDFs/GitHub-Quellen ohne Spracherkennung ingestiert.
+
+**Fix:** 324 als `duplicate` markiert und gelöscht.
+
+**Empfehlung:** Bei Ingestion Spracherkennung + Deduplizierung. Nur DE + EN behalten.
+
+### 5. Fehlende Artikel/Paragraph-Extraktion
+
+**Problem:** Chunks haben `article` und `paragraph` oft leer oder falsch. Die LLM-basierte Extraktion bei der Control-Generierung ist unzuverlässig.
+
+**Root Cause:** Ingestion-Pipeline extrahiert keine Strukturinformation aus dem PDF.
+
+**Fix (implementiert):** PDF-QA-Pipeline (`pdf_qa_all.py`) matched `source_original_text` gegen Original-PDFs und extrahiert korrekte Artikel/Paragraphen — 86% Match-Rate.
+
+**Empfehlung:** Bei Re-Ingestion Artikel und Paragraph direkt in den Chunk-Metadaten speichern.
+
+### 6. Job-Tracking nicht persistent
+
+**Problem:** Generation-Jobs laufen als Background-Tasks. Es wird nicht geloggt, welche Chunks verarbeitet wurden; der Status ist nur über die API abfragbar. Bei API-Timeout oder Restart geht der Fortschritt verloren.
+
+**Root Cause:** `asyncio.create_task()` hat keinen Recovery-Mechanismus.
+
+**Fix (teilweise):** `canonical_generation_jobs` Tabelle trackt Jobs. `canonical_processed_chunks` markiert verarbeitete Chunks.
+
+**Empfehlung:**
+- Job-Log in DB persistieren (nicht nur stdout)
+- Fortschritt in `canonical_generation_jobs.progress` als JSONB speichern
+- Chunk-Level-Status: verarbeitet / übersprungen / Fehler
+- Recovery-Fähigkeit: Job kann von letztem Checkpoint fortgesetzt werden
+
+## Empfohlene Metadaten für Re-Ingestion
+
+### Chunk-Level Metadaten (Qdrant Payload)
+
+```json
+{
+ "chunk_text": "...",
+ "regulation_code": "eu_2016_679",
+ "regulation_name_de": "DSGVO (EU) 2016/679",
+ "regulation_name_en": "GDPR (EU) 2016/679",
+ "article": "25",
+ "article_title": "Datenschutz durch Technikgestaltung und datenschutzfreundliche Voreinstellungen",
+ "article_type": "article",
+ "paragraph": "1",
+ "section_hierarchy": ["Kapitel IV", "Abschnitt 2", "Artikel 25"],
+ "chapter_context": "Kapitel IV — Verantwortlicher und Auftragsverarbeiter",
+ "pages": [45, 46],
+ "effective_date": "2018-05-25",
+ "publication_date": "2016-04-27",
+ "document_version": "2016-04-27",
+ "source_language": "de",
+ "source_url": "https://eur-lex.europa.eu/...",
+ "celex": "32016R0679",
+ "license": "EU_LAW",
+ "license_rule": 1,
+ "source_type": "law",
+ "category": "datenschutz",
+ "chunk_position": 42,
+ "total_chunks": 423
+}
+```
+
+### Dokument-Level Metadaten (Corpus Version)
+
+```json
+{
+ "regulation_code": "eu_2016_679",
+ "canonical_name_de": "DSGVO (EU) 2016/679",
+ "canonical_name_en": "GDPR (EU) 2016/679",
+ "document_type": "eu_regulation",
+ "effective_date": "2018-05-25",
+ "publication_date": "2016-04-27",
+ "supersedes": null,
+ "superseded_by": null,
+ "source_pdf": "gdpr_regulation_eu_2016_679.pdf",
+ "source_pdf_sha256": "abc123...",
+ "total_articles": 99,
+ "total_recitals": 173,
+ "total_annexes": 0,
+ "ingestion_date": "2026-03-20",
+ "ingestion_version": "v2"
+}
+```
+
+## Pipeline-Härtung Checkliste
+
+### Vor Ingestion
+
+- [ ] Prüfen ob `regulation_code` bereits in einer Collection existiert
+- [ ] PDF-SHA256 gegen bekannte PDFs prüfen (Duplikat-Erkennung)
+- [ ] `regulation_name` aus `REGULATION_LICENSE_MAP` verwenden, NICHT aus Chunk-Metadaten
+- [ ] Spracherkennung: Nur DE + EN ingestieren
+- [ ] Dokument-Metadaten (effective_date, publication_date) recherchieren
+
+### Während Ingestion
+
+- [ ] Strukturiertes Chunking an Artikel-/Paragraphengrenzen
+- [ ] Kontext-Prefix mit Kapiteleinleitung
+- [ ] Chunk-Metadaten anreichern (article, paragraph, article_type, section_hierarchy)
+- [ ] Fortschritt in DB loggen
+
+### Nach Ingestion
+
+- [ ] Chunk-Count pro `regulation_code` prüfen (Sanity Check)
+- [ ] PDF-QA gegen Original-PDF laufen lassen
+- [ ] Cross-Collection-Duplikat-Check
+- [ ] Corpus-Version in DB eintragen
+
+### Control-Generierung
+
+- [ ] `source_citation.source` aus `REGULATION_LICENSE_MAP.name`, NICHT aus Chunk-Metadaten
+- [ ] Harmonisierung: Threshold 0.85 für Duplikate innerhalb gleicher `regulation_code`
+- [ ] Cross-Regulation-Harmonisierung bei ähnlichen Themen (z.B. DSGVO Art. 25 ↔ NIS2 Art. 21)
+- [ ] Job-Fortschritt persistent in DB speichern
+
+## Workflow: Mac Mini → Production Sync
+
+```
+1. Mac Mini: PDF → Qdrant (lokal, http://macmini:6333)
+2. Mac Mini: Control-Generierung → PostgreSQL (shared, 46.225.100.82:54321)
+3. QA: PDF-Match, Dedup, Source-Normalisierung
+4. Qdrant Migration: macmini:6333 → qdrant-dev.breakpilot.ai (scripts/migrate-qdrant.py)
+5. Deploy: git push gitea → Coolify Build + Deploy
+```
+
+**WICHTIG:** PostgreSQL ist SHARED — Änderungen auf Mac Mini sind sofort in Production sichtbar. Qdrant hat getrennte Instanzen (lokal + production) und muss manuell synchronisiert werden.
+
+## Scripts
+
+| Script | Beschreibung |
+|---|---|
+| `scripts/ingest-phase-h.sh` | Haupt-Ingestion: 38 Dokumente → Qdrant |
+| `scripts/cleanup-qdrant-duplicates.py` | Qdrant Duplikat-Cleanup (8 Schritte) |
+| `scripts/migrate-qdrant.py` | Qdrant Migration: lokal → production |
+| `scripts/qa/phase5_normalize_and_cleanup.py` | DB Normalisierung + Hard Delete |
+| `scripts/qa/pdf_qa_all.py` | PDF-Match QA |
diff --git a/docs-src/services/sdk-modules/canonical-control-library.md b/docs-src/services/sdk-modules/canonical-control-library.md
index 2aadfa6..e07d3f6 100644
--- a/docs-src/services/sdk-modules/canonical-control-library.md
+++ b/docs-src/services/sdk-modules/canonical-control-library.md
@@ -96,6 +96,7 @@ erDiagram
varchar verification_method
varchar target_audience
varchar generation_strategy
+ varchar obligation_type
smallint pipeline_version
integer license_rule
jsonb source_citation
@@ -936,9 +937,11 @@ Drei Kompositions-Modi:
Zerlegt Rich Controls in atomare Controls. Laeuft VOR den Migration Passes 1-5.
-#### Pass 0a — Obligation Extraction
+#### Pass 0a — Obligation Extraction + 3-Tier-Klassifizierung
-Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM.
+Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM (Claude Sonnet 4.6).
+Jede Obligation wird als **pflicht**, **empfehlung** oder **kann** klassifiziert — nichts wird
+wegen fehlendem normativem Signal abgelehnt.
**6 Guardrails:**
@@ -949,23 +952,37 @@ Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM.
5. Nicht auf Evidence-Ebene zerlegen
6. Parent-Link immer erhalten
-**Quality Gate:** Jeder Kandidat wird gegen 6 Kriterien geprueft:
+**3-Tier Obligation Classification:**
-- `has_normative_signal` — Normatives Sprachsignal erkannt
-- `single_action` — Nur eine Handlung
-- `not_rationale` — Keine blosse Begruendung
-- `not_evidence_only` — Kein reines Evidence-Fragment
-- `min_length` — Mindestlaenge erreicht
-- `has_parent_link` — Referenz zum Rich Control
+| obligation_type | Signal-Beispiele | Bedeutung |
+|---|---|---|
+| `pflicht` | müssen, ist zu, shall, must, required | Gesetzliche/regulatorische Pflicht |
+| `empfehlung` | soll, should, sicherstellen, dokumentieren | Best Practice, freiwillig |
+| `kann` | kann, darf, may, optional | Optionale Massnahme |
-Kritische Checks: `has_normative_signal`, `not_evidence_only`, `min_length`, `has_parent_link`
+Obligations ohne erkennbares Signal werden als `empfehlung` klassifiziert (nicht rejected).
+Empfehlungen helfen Firmen, Systeme ueber das Pflichtprogramm hinaus zu sichern.
+
+**Quality Gate — Kritische Checks:**
+
+| Flag | Kritisch? | Beschreibung |
+|---|---|---|
+| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) |
+| `not_evidence_only` | **Ja** | Kein reines Evidence-Fragment |
+| `min_length` | **Ja** | Mindestlaenge (20 Zeichen) |
+| `has_parent_link` | **Ja** | Referenz zum Rich Control |
+| `has_normative_signal` | Nein | Informativer Check (nicht mehr Ablehnungsgrund) |
+| `single_action` | Nein | Nur eine Handlung (heuristisch) |
+| `not_rationale` | Nein | Keine blosse Begruendung |
#### Pass 0b — Atomic Control Composition
Erstellt aus jedem validierten Obligation Candidate ein atomares Control
-(LLM-gestuetzt mit Template-Fallback).
+(LLM-gestuetzt mit Template-Fallback). Das Feld `obligation_type` wird
+von der zugehoerigen Obligation uebernommen.
**Datei:** `compliance/services/decomposition_pass.py`
+**Test-Script:** `scripts/qa/test_pass0a.py` (standalone, speichert JSON)
---
@@ -1012,11 +1029,13 @@ Die Crosswalk-Matrix bildet diese N:M-Beziehung ab.
**Migration 061:** Decomposition-Tabellen
-| Tabelle | Beschreibung |
+| Tabelle / Feld | Beschreibung |
|---------|-------------|
| `obligation_candidates` | Extrahierte atomare Pflichten aus Rich Controls |
+| `obligation_candidates.obligation_type` | `pflicht` / `empfehlung` / `kann` (3-Tier-Klassifizierung) |
| `canonical_controls.parent_control_uuid` | Self-Referenz zum Rich Control (neues Feld) |
| `canonical_controls.decomposition_method` | Zerlegungsmethode (neues Feld) |
+| `canonical_controls.obligation_type` | Uebernommen von Obligation: pflicht/empfehlung/kann |
---
diff --git a/docs-src/services/sdk-modules/control-generator-pipeline.md b/docs-src/services/sdk-modules/control-generator-pipeline.md
index e9111ff..4ac03cc 100644
--- a/docs-src/services/sdk-modules/control-generator-pipeline.md
+++ b/docs-src/services/sdk-modules/control-generator-pipeline.md
@@ -567,7 +567,86 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/
---
+## Pass 0a/0b: Atomare Control-Zerlegung
+
+Die Pipeline v3 erweitert die 7-Stufen-Pipeline um einen Vor-Pass, der Rich Controls in atomare Controls zerlegt.
+
+### Pass 0a: Obligation Extraction
+
+Extrahiert individuelle normative Pflichten aus Rich Controls via LLM.
+
+```mermaid
+flowchart LR
+ A[Rich Control] -->|LLM| B[Obligations]
+ B --> C{Quality Gate}
+ C -->|Pass| D[validated]
+ C -->|Fail| E[rejected]
+```
+
+**3-Tier Klassifikation:**
+
+| Typ | Erkennungsmuster | Beispiel |
+|-----|-----------------|---------|
+| **Pflicht** | muss, ist verpflichtet, hat sicherzustellen | "Der Verantwortliche MUSS ein Verzeichnis fuehren" |
+| **Empfehlung** | soll, sollte, wird empfohlen | "Es SOLLTE eine Risikobewertung durchgefuehrt werden" |
+| **Kann** | kann, darf, ist berechtigt | "Die Aufsichtsbehoerde KANN Geldbussen verhaengen" |
+
+**Guardrails fuer die Extraktion (6 Regeln):**
+
+1. Nur normative Aussagen (muss, sicherzustellen, verpflichtet)
+2. Ein Hauptverb pro Obligation
+3. Test-Obligations separat von operativen
+4. Reporting-Obligations separat
+5. Nicht auf Evidence-Ebene splitten
+6. Parent-Link immer erhalten
+
+### Pass 0b: Atomic Control Composition
+
+Verwandelt jede validierte Obligation in ein eigenstaendiges atomares Control.
+
+```mermaid
+flowchart LR
+ A[Obligation] -->|LLM| B[Atomic Control]
+ B -->|Dedup Check| C{4-Stage Dedup}
+ C -->|NEW| D[Insert + Index]
+ C -->|LINK| E[Parent-Link]
+ C -->|REVIEW| F[Review-Queue]
+```
+
+**Konfiguration:**
+
+| Variable | Default | Beschreibung |
+|----------|---------|-------------|
+| `DECOMPOSITION_LLM_MODEL` | `claude-sonnet-4-6` | LLM fuer Pass 0a/0b |
+| `DECOMPOSITION_BATCH_SIZE` | `5` | Obligations pro LLM-Call |
+| `DECOMPOSITION_LLM_TIMEOUT` | `120` | Timeout in Sekunden |
+
+**Ergebnisse (Stand 2026-03-21):**
+
+| Metrik | Wert |
+|--------|------|
+| Rich Controls (technisch) | ~6.800 |
+| Atomare Controls (bisher) | 30 (PoC: je 10 aus CRYP, AUTH, SEC) |
+| Ziel nach Full Run | ~18.000 unique Master Controls |
+| Obligations pro Rich Control | ~10 |
+| Dedup-Reduktion erwartet | ~70% |
+
+### Quelldateien (Pass 0a/0b)
+
+| Datei | Beschreibung |
+|-------|-------------|
+| `compliance/services/decomposition_pass.py` | Pass 0a + 0b Logik |
+| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine |
+| `migrations/061_obligation_candidates.sql` | Obligation-Tabelle |
+| `migrations/074_control_dedup.sql` | Dedup-Tabellen (Parent-Links, Review-Queue) |
+| `tests/test_decomposition_pass.py` | 90 Tests |
+| `tests/test_control_dedup.py` | 56 Tests |
+
+---
+
## Verwandte Dokumentation
- [Canonical Control Library (CP-CLIB)](canonical-control-library.md) — Domains, Datenmodell, Too-Close-Detektor, CI/CD Validation
+- [Deduplizierungs-Engine](dedup-engine.md) — 4-Stufen Dedup, Multi-Parent-Linking, Review-Queue
+- [RAG Pipeline Benchmark](../../development/rag-pipeline-benchmark.md) — State-of-the-Art Vergleich, Optimierungsempfehlungen
- [Multi-Layer Control Architecture](canonical-control-library.md#multi-layer-control-architecture) — 10-Stage Pipeline-Erweiterung mit Obligations, Patterns, Crosswalk
diff --git a/docs-src/services/sdk-modules/dedup-engine.md b/docs-src/services/sdk-modules/dedup-engine.md
new file mode 100644
index 0000000..5fe0883
--- /dev/null
+++ b/docs-src/services/sdk-modules/dedup-engine.md
@@ -0,0 +1,253 @@
+# Deduplizierungs-Engine (Control Dedup)
+
+4-stufige Dedup-Pipeline zur Vermeidung doppelter atomarer Controls bei der Pass 0b Komposition. Kern-USP: **"1 Control erfuellt 5 Gesetze"** durch Multi-Parent-Linking.
+
+**Backend:** `backend-compliance/compliance/services/control_dedup.py`
+**Migration:** `backend-compliance/migrations/074_control_dedup.sql`
+**Tests:** `backend-compliance/tests/test_control_dedup.py` (56 Tests)
+
+---
+
+## Motivation
+
+Aus ~6.800 technischen Controls x ~10 Obligations pro Control entstehen ~68.000 atomare Kandidaten. Ziel: ~18.000 einzigartige Master Controls. Viele Obligations aus verschiedenen Gesetzen fuehren zum gleichen technischen Control (z.B. "MFA implementieren" in DSGVO, NIS2, AI Act).
+
+**Problem:** Embedding-only Deduplizierung ist GEFAEHRLICH fuer Compliance.
+
+!!! danger "False-Positive Beispiel"
+ - "Admin-Zugriffe muessen MFA nutzen" vs. "Remote-Zugriffe muessen MFA nutzen"
+ - Embedding sagt >0.9 aehnlich
+ - Aber es sind **ZWEI verschiedene Controls** (verschiedene Objekte!)
+
+---
+
+## 4-Stufen Entscheidungsbaum
+
+```mermaid
+flowchart TD
+ A[Kandidat-Control] --> B{Pattern-Gate}
+ B -->|pattern_id verschieden| N1[NEW CONTROL]
+ B -->|pattern_id gleich| C{Action-Check}
+ C -->|Action verschieden| N2[NEW CONTROL]
+ C -->|Action gleich| D{Object-Normalization}
+ D -->|Objekt verschieden| E{Similarity > 0.95?}
+ E -->|Ja| L1[LINK]
+ E -->|Nein| N3[NEW CONTROL]
+ D -->|Objekt gleich| F{Tiered Thresholds}
+ F -->|> 0.92| L2[LINK]
+ F -->|0.85 - 0.92| R[REVIEW QUEUE]
+ F -->|< 0.85| N4[NEW CONTROL]
+```
+
+### Stufe 1: Pattern-Gate (hart)
+
+`pattern_id` muss uebereinstimmen. Verhindert ~80% der False Positives.
+
+```python
+if pattern_id != existing.pattern_id:
+ → NEW CONTROL # Verschiedene Kontrollmuster = verschiedene Controls
+```
+
+### Stufe 2: Action-Check (hart)
+
+Normalisierte Aktionsverben muessen uebereinstimmen. "Implementieren" vs. "Testen" = verschiedene Controls, auch bei gleichem Objekt.
+
+```python
+if normalize_action("implementieren") != normalize_action("testen"):
+ → NEW CONTROL # "implement" != "test"
+```
+
+**Action-Normalisierung (Deutsch → Englisch):**
+
+| Deutsche Verben | Kanonische Form |
+|----------------|-----------------|
+| implementieren, umsetzen, einrichten, aktivieren | `implement` |
+| testen, pruefen, ueberpruefen, verifizieren | `test` |
+| ueberwachen, monitoring, beobachten | `monitor` |
+| verschluesseln | `encrypt` |
+| protokollieren, aufzeichnen, loggen | `log` |
+| beschraenken, einschraenken, begrenzen | `restrict` |
+
+### Stufe 3: Object-Normalization (weich)
+
+Compliance-Objekte werden auf kanonische Token normalisiert.
+
+```python
+normalize_object("Admin-Konten") → "privileged_access"
+normalize_object("Remote-Zugriff") → "remote_access"
+normalize_object("MFA") → "multi_factor_auth"
+```
+
+Bei verschiedenen Objekten gilt ein hoeherer Schwellenwert (0.95 statt 0.92).
+
+**Objekt-Normalisierung:**
+
+| Eingabe | Kanonischer Token |
+|---------|------------------|
+| MFA, 2FA, Multi-Faktor-Authentifizierung | `multi_factor_auth` |
+| Admin-Konten, privilegierte Zugriffe | `privileged_access` |
+| Verschluesselung, Kryptografie | `encryption` |
+| Schluessel, Key Management | `key_management` |
+| TLS, SSL, HTTPS | `transport_encryption` |
+| Firewall | `firewall` |
+| Audit-Log, Protokoll, Logging | `audit_logging` |
+
+### Stufe 4: Embedding Similarity (Qdrant)
+
+Tiered Thresholds basierend auf Cosine-Similarity:
+
+| Score | Verdict | Aktion |
+|-------|---------|--------|
+| > 0.95 | **LINK** | Bei verschiedenen Objekten: Parent-Link hinzufuegen (darunter NEW) |
+| > 0.92 | **LINK** | Bei gleichem Objekt: Parent-Link hinzufuegen |
+| 0.85 - 0.92 | **REVIEW** | Bei gleichem Objekt: In Review-Queue zur manuellen Pruefung |
+| < 0.85 | **NEW** | Bei gleichem Objekt: Neues Control anlegen |
+
+---
+
+## Canonicalization Layer
+
+Vor dem Embedding wird der deutsche Compliance-Text in normalisiertes Englisch transformiert:
+
+```
+"Administratoren muessen MFA verwenden"
+→ "implement multi_factor_auth for administratoren verwenden"
+→ Bessere Matches, weniger Embedding-Rauschen
+```
+
+Dies reduziert das Rauschen durch synonyme Formulierungen in verschiedenen Gesetzen.
+
+---
+
+## Multi-Parent-Linking (M:N)
+
+Ein atomares Control kann mehrere Eltern-Controls aus verschiedenen Regulierungen haben:
+
+```json
+{
+ "control_id": "AUTH-1072-A01",
+ "parent_links": [
+ {"parent_control_id": "AUTH-1001", "source": "NIST IA-02(01)", "link_type": "decomposition"},
+ {"parent_control_id": "NIS2-045", "source": "NIS2 Art. 21", "link_type": "dedup_merge"}
+ ]
+}
+```
+
+### Datenbank-Schema
+
+```sql
+-- Migration 074: control_parent_links (M:N)
+CREATE TABLE control_parent_links (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
+ parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
+ link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition',
+ confidence NUMERIC(3,2) DEFAULT 1.0,
+ source_regulation VARCHAR(100),
+ source_article VARCHAR(100),
+ obligation_candidate_id UUID REFERENCES obligation_candidates(id),
+ created_at TIMESTAMPTZ DEFAULT NOW(),
+ CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
+);
+```
+
+**Link-Typen:**
+
+| Typ | Bedeutung |
+|-----|-----------|
+| `decomposition` | Aus Pass 0b Zerlegung |
+| `dedup_merge` | Durch Dedup-Engine als Duplikat erkannt |
+| `manual` | Manuell durch Reviewer verknuepft |
+| `crosswalk` | Aus Crosswalk-Matrix uebernommen |
+
+---
+
+## Review-Queue
+
+Borderline-Matches (Similarity 0.85-0.92) werden in die Review-Queue geschrieben:
+
+```sql
+-- Migration 074: control_dedup_reviews
+CREATE TABLE control_dedup_reviews (
+ id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+ candidate_control_id VARCHAR(30) NOT NULL,
+ candidate_title TEXT NOT NULL,
+ candidate_objective TEXT,
+ matched_control_uuid UUID REFERENCES canonical_controls(id),
+ matched_control_id VARCHAR(30),
+ similarity_score NUMERIC(4,3),
+ dedup_stage VARCHAR(40) NOT NULL,
+ review_status VARCHAR(20) DEFAULT 'pending',
+ -- pending → accepted_link | accepted_new | rejected
+ created_at TIMESTAMPTZ DEFAULT NOW()
+);
+```
+
+---
+
+## Qdrant Collection
+
+```
+Collection: atomic_controls
+Dimension: 1024 (bge-m3)
+Distance: COSINE
+Payload: pattern_id, action_normalized, object_normalized, control_id, canonical_text
+Index: pattern_id (keyword), action_normalized (keyword), object_normalized (keyword)
+Query: IMMER mit filter: pattern_id == X (reduziert Suche drastisch)
+```
+
+---
+
+## Integration in Pass 0b
+
+Die Dedup-Engine ist optional in `DecompositionPass` integriert:
+
+```python
+decomp = DecompositionPass(db=session, dedup_enabled=True)
+stats = await decomp.run_pass0b(limit=100, use_anthropic=True)
+
+# Stats enthalten Dedup-Metriken:
+# stats["dedup_linked"] = 15 (Duplikate → Parent-Link)
+# stats["dedup_review"] = 3 (Borderline → Review-Queue)
+# stats["controls_created"] = 82 (Neue Controls)
+```
+
+**Ablauf bei Pass 0b mit Dedup:**
+
+1. LLM generiert atomares Control
+2. Dedup-Engine prueft 4 Stufen
+3. **LINK:** Kein neues Control, Parent-Link zu bestehendem
+4. **REVIEW:** Kein neues Control, Eintrag in Review-Queue
+5. **NEW:** Control anlegen + in Qdrant indexieren
+
+---
+
+## Konfiguration
+
+| Umgebungsvariable | Default | Beschreibung |
+|-------------------|---------|-------------|
+| `DEDUP_ENABLED` | `true` | Dedup-Engine ein/ausschalten |
+| `DEDUP_LINK_THRESHOLD` | `0.92` | Schwelle fuer automatisches Linking |
+| `DEDUP_REVIEW_THRESHOLD` | `0.85` | Schwelle fuer Review-Queue |
+| `DEDUP_LINK_THRESHOLD_DIFF_OBJ` | `0.95` | Schwelle bei verschiedenen Objekten |
+| `DEDUP_QDRANT_COLLECTION` | `atomic_controls` | Qdrant-Collection fuer Dedup-Index |
+| `QDRANT_URL` | `http://host.docker.internal:6333` | Qdrant-URL |
+| `EMBEDDING_URL` | `http://embedding-service:8087` | Embedding-Service-URL |
+
+---
+
+## Quelldateien
+
+| Datei | Beschreibung |
+|-------|-------------|
+| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine |
+| `compliance/services/decomposition_pass.py` | Pass 0a/0b mit Dedup-Integration |
+| `migrations/074_control_dedup.sql` | DB-Schema (parent_links, review_queue) |
+| `tests/test_control_dedup.py` | 56 Unit-Tests |
+
+---
+
+## Verwandte Dokumentation
+
+- [Control Generator Pipeline](control-generator-pipeline.md) — 7-Stufen RAG→Control Pipeline
+- [Canonical Control Library](canonical-control-library.md) — Datenmodell, Domains, Similarity-Detektor
diff --git a/mkdocs.yml b/mkdocs.yml
index 76949f5..cb52db1 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -107,6 +107,7 @@ nav:
- Policy-Bibliothek (29 Richtlinien): services/sdk-modules/policy-bibliothek.md
- Canonical Control Library (CP-CLIB): services/sdk-modules/canonical-control-library.md
- Control Generator Pipeline: services/sdk-modules/control-generator-pipeline.md
+ - Deduplizierungs-Engine: services/sdk-modules/dedup-engine.md
- Control Provenance Wiki: services/sdk-modules/control-provenance.md
- Strategie:
- Wettbewerbsanalyse & Roadmap: strategy/wettbewerbsanalyse.md
@@ -115,3 +116,5 @@ nav:
- Dokumentation: development/documentation.md
- CI/CD Pipeline: development/ci-cd-pipeline.md
- QA Control Quality: development/qa-control-quality.md
+ - RAG Pipeline Lessons Learned: development/rag-pipeline-lessons-learned.md
+ - RAG Pipeline Benchmark: development/rag-pipeline-benchmark.md
diff --git a/scripts/qa/apply_pdf_qa_results.py b/scripts/qa/apply_pdf_qa_results.py
index 6bdc3f7..785ba39 100644
--- a/scripts/qa/apply_pdf_qa_results.py
+++ b/scripts/qa/apply_pdf_qa_results.py
@@ -1,11 +1,29 @@
-"""Apply PDF QA results: update source_citation with correct article + article_type."""
+"""
+Apply PDF QA results: update source_citation with correct article_type + article.
+
+Safety modes:
+ --safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
+ --force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
+ --dry-run: Show what would change without writing.
+
+Usage:
+ python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
+ python3 apply_pdf_qa_results.py --dry-run # show changes without writing
+ python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
+"""
import os
+import sys
import json
import psycopg2
import urllib.parse
+from collections import Counter
RESULTS_FILE = "/tmp/pdf_qa_results.json"
+# Parse args
+dry_run = "--dry-run" in sys.argv
+force_article = "--force-article" in sys.argv
+
# Load results
with open(RESULTS_FILE) as f:
results = json.load(f)
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
options="-c search_path=compliance,public"
)
-# Update in batches
+# Load current DB state for all affected controls
cur = conn.cursor()
-updated = 0
+ctrl_ids = [r["ctrl_id"] for r in results]
+cur.execute("""
+ SELECT id,
+ source_citation->>'article' as article,
+ source_citation->>'article_type' as article_type,
+ source_citation->>'source' as source
+ FROM compliance.canonical_controls
+ WHERE id = ANY(%s::uuid[])
+""", (ctrl_ids,))
+db_state = {}
+for row in cur.fetchall():
+ db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
+
+# Counters
+stats = Counter()
+updated_type = 0
+updated_article = 0
+updated_recital = 0
errors = 0
-unchanged = 0
for i, r in enumerate(results):
ctrl_id = r["ctrl_id"]
- article_label = r["article_label"]
- article_type = r["article_type"] # preamble, article, annex, section, unknown
+ new_article = r["article_label"]
+ new_type = r["article_type"]
+ db = db_state.get(ctrl_id, {})
+
+ if not db:
+ stats["missing_in_db"] += 1
+ continue
+
+ old_type = db.get("article_type")
+ old_article = db.get("article", "").strip()
+
+ # Decide what to update
+ set_type = (old_type != new_type)
+ set_article = (not old_article) or (force_article and old_article != new_article)
+ set_recital = (new_type == "preamble")
+
+ if set_type:
+ stats["type_" + ("new" if not old_type else "changed")] += 1
+ else:
+ stats["type_unchanged"] += 1
+
+ if not old_article and set_article:
+ stats["article_new"] += 1
+ elif old_article and old_article != new_article:
+ if force_article:
+ stats["article_force_changed"] += 1
+ else:
+ stats["article_skipped"] += 1
+ else:
+ stats["article_unchanged"] += 1
+
+ if set_recital:
+ stats["recital"] += 1
+
+ if dry_run:
+ continue
try:
- # Update source_citation: set article and article_type
- cur.execute("""
- UPDATE compliance.canonical_controls
- SET source_citation = source_citation
- || jsonb_build_object('article', %s, 'article_type', %s),
- updated_at = now()
- WHERE id = %s::uuid
- AND (
- source_citation->>'article' IS DISTINCT FROM %s
- OR source_citation->>'article_type' IS DISTINCT FROM %s
- )
- """, (article_label, article_type, ctrl_id, article_label, article_type))
+ # Build JSONB update
+ updates = {}
+ if set_type:
+ updates["article_type"] = new_type
+ if set_article:
+ updates["article"] = new_article
- if cur.rowcount > 0:
- updated += 1
- else:
- unchanged += 1
+ if updates:
+ # Merge into source_citation
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
+ updated_at = now()
+ WHERE id = %s::uuid
+ """, (json.dumps(updates), ctrl_id))
+ if set_type:
+ updated_type += 1
+ if set_article:
+ updated_article += 1
+
+ # Mark preamble as recital_suspect
+ if set_recital:
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET generation_metadata = jsonb_set(
+ COALESCE(generation_metadata, '{}'::jsonb),
+ '{recital_suspect}',
+ 'true'::jsonb
+ ),
+ updated_at = now()
+ WHERE id = %s::uuid
+ """, (ctrl_id,))
+ updated_recital += 1
except Exception as e:
errors += 1
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
conn.rollback()
continue
- if (i + 1) % 500 == 0:
+ if (i + 1) % 1000 == 0:
conn.commit()
- print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
+ print(f" Progress: {i+1}/{len(results)}")
-conn.commit()
-print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
+if not dry_run:
+ conn.commit()
+
+mode = "DRY-RUN" if dry_run else "APPLIED"
+print(f"\n{'='*60}")
+print(f" Mode: {mode}")
+print(f"{'='*60}")
+print(f"\n article_type:")
+print(f" New (was NULL): {stats['type_new']:5d}")
+print(f" Changed: {stats['type_changed']:5d}")
+print(f" Unchanged: {stats['type_unchanged']:5d}")
+print(f"\n article:")
+print(f" New (was empty): {stats['article_new']:5d}")
+if force_article:
+ print(f" Force-changed: {stats['article_force_changed']:5d}")
+else:
+ print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
+print(f" Unchanged: {stats['article_unchanged']:5d}")
+print(f"\n Preamble/Recital: {stats['recital']:5d}")
+print(f" Missing in DB: {stats['missing_in_db']:5d}")
+
+if not dry_run:
+ print(f"\n Updates written:")
+ print(f" article_type: {updated_type:5d}")
+ print(f" article: {updated_article:5d}")
+ print(f" recital_suspect: {updated_recital:5d}")
+ print(f" Errors: {errors:5d}")
# Verify: count by article_type
cur.execute("""
diff --git a/scripts/qa/benchmark_llm_controls.py b/scripts/qa/benchmark_llm_controls.py
new file mode 100644
index 0000000..f6e5862
--- /dev/null
+++ b/scripts/qa/benchmark_llm_controls.py
@@ -0,0 +1,524 @@
+#!/usr/bin/env python3
+"""
+Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
+
+Tests 5 representative gap articles from different sources.
+Measures: quality (JSON valid, fields complete), response time, cost estimate.
+
+Usage:
+ python3 benchmark_llm_controls.py
+"""
+import json
+import time
+import sys
+import os
+import requests
+from pathlib import Path
+
+# ── Config ──────────────────────────────────────────────────────────
+LITELLM_URL = "https://llm-dev.meghsakha.com"
+LITELLM_MODEL = "gpt-oss-120b"
+LITELLM_API_KEY = "sk-0nAyxaMVbIqmz_ntnndzag"
+
+ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
+ANTHROPIC_MODEL = "claude-sonnet-4-6"
+ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
+
+PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
+
+try:
+ import fitz # PyMuPDF
+except ImportError:
+ print("PyMuPDF not available, using pre-extracted texts")
+ fitz = None
+
+# ── Prompts (identical to control_generator.py) ─────────────────────
+
+SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
+als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
+Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
+
+APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
+ Verwende ["all"] wenn der Control branchenuebergreifend gilt.
+ Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
+ "Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
+- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
+ Verwende ["all"] wenn keine Groessenbeschraenkung.
+ Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
+- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
+ {"requires_any": ["signal"], "description": "Erklaerung"}"""
+
+
+def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
+ return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
+Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
+
+WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
+Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
+
+Gib JSON zurück mit diesen Feldern:
+- title: Kurzer prägnanter Titel (max 100 Zeichen)
+- objective: Was soll erreicht werden? (1-3 Sätze)
+- rationale: Warum ist das wichtig? (1-2 Sätze)
+- requirements: Liste von konkreten Anforderungen (Strings)
+- test_procedure: Liste von Prüfschritten (Strings)
+- evidence: Liste von Nachweisdokumenten (Strings)
+- severity: low/medium/high/critical
+- tags: Liste von Tags
+- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
+- category: Inhaltliche Kategorie
+- target_audience: Liste der Zielgruppen
+- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
+- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
+{APPLICABILITY_PROMPT}
+
+Text: {article_text[:3000]}
+Quelle: {source_name}, {article_label}"""
+
+
+# ── PDF Text Extraction ─────────────────────────────────────────────
+
+def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
+ """Extract the text of a specific article from a PDF."""
+ import re
+
+ path = PDF_DIR / pdf_file
+ if not path.exists() or fitz is None:
+ return ""
+
+ doc = fitz.open(str(path))
+ full_text = ""
+ for page in doc:
+ full_text += page.get_text() + "\n"
+ doc.close()
+
+ # Find article boundaries
+ if doc_type == "eu_regulation":
+ # Find "Artikel N" heading
+ art_num = re.search(r'\d+', article_label)
+ if not art_num:
+ return ""
+ num = int(art_num.group())
+ # Find start of this article
+ pattern = rf'\nArtikel\s+{num}\s*\n'
+ match = re.search(pattern, full_text)
+ if not match:
+ return f"[Artikel {num} nicht im PDF gefunden]"
+ start = match.start()
+ # Find start of next article
+ next_pattern = rf'\nArtikel\s+{num+1}\s*\n'
+ next_match = re.search(next_pattern, full_text)
+ end = next_match.start() if next_match else start + 5000
+ text = full_text[start:end].strip()
+ return text[:3000]
+
+ elif doc_type == "de_law":
+ para_num = re.search(r'\d+', article_label)
+ if not para_num:
+ return ""
+ num = int(para_num.group())
+ pattern = rf'\n§\s+{num}\b'
+ match = re.search(pattern, full_text)
+ if not match:
+ return f"[§ {num} nicht im PDF gefunden]"
+ start = match.start()
+ next_pattern = rf'\n§\s+{num+1}\b'
+ next_match = re.search(next_pattern, full_text)
+ end = next_match.start() if next_match else start + 5000
+ text = full_text[start:end].strip()
+ return text[:3000]
+
+ elif doc_type == "nist":
+ # Find NIST control family
+ match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
+ if not match:
+ return f"[{article_label} nicht im PDF gefunden]"
+ start = match.start()
+ text = full_text[start:start+3000].strip()
+ return text
+
+ else:
+ # Generic section search
+ match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
+ if not match:
+ return f"[{article_label} nicht im PDF gefunden]"
+ start = match.start()
+ text = full_text[start:start+3000].strip()
+ return text
+
+
+# ── API Calls ────────────────────────────────────────────────────────
+
+def call_litellm(prompt: str, system_prompt: str) -> tuple:
+ """Call LiteLLM API. Returns (response_text, duration_seconds, error)."""
+ headers = {
+ "Content-Type": "application/json",
+ "Authorization": f"Bearer {LITELLM_API_KEY}",
+ }
+ payload = {
+ "model": LITELLM_MODEL,
+ "messages": [
+ {"role": "system", "content": system_prompt},
+ {"role": "user", "content": prompt},
+ ],
+ "temperature": 0.3,
+ "max_tokens": 4096,
+ "stream": False,
+ }
+
+ t0 = time.time()
+ try:
+ resp = requests.post(
+ f"{LITELLM_URL}/v1/chat/completions",
+ headers=headers,
+ json=payload,
+ timeout=180,
+ )
+ duration = time.time() - t0
+ if resp.status_code != 200:
+ return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}"
+ data = resp.json()
+ content = data["choices"][0]["message"]["content"]
+ usage = data.get("usage", {})
+ return content, duration, None, usage
+ except Exception as e:
+ return "", time.time() - t0, str(e), {}
+
+
+def call_anthropic(prompt: str, system_prompt: str) -> tuple:
+    """Call Anthropic API. Returns (response_text, duration_seconds, error, usage); error is None on success."""
+    headers = {
+        "x-api-key": ANTHROPIC_API_KEY,
+        "anthropic-version": "2023-06-01",
+        "content-type": "application/json",
+    }
+    payload = {
+        "model": ANTHROPIC_MODEL,
+        "max_tokens": 4096,
+        "system": system_prompt,
+        "messages": [{"role": "user", "content": prompt}],
+    }
+
+    t0 = time.time()
+    try:
+        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
+        duration = time.time() - t0
+        if resp.status_code != 200:
+            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
+        data = resp.json()
+        content = data["content"][0]["text"] if data.get("content") else ""  # text of the first content entry, "" if none
+        usage = data.get("usage", {})
+        return content, duration, None, usage
+    except Exception as e:
+        return "", time.time() - t0, str(e), {}  # any exception is reported through the error slot
+
+
+# ── Quality Assessment ───────────────────────────────────────────────
+
+REQUIRED_FIELDS = [
+ "title", "objective", "rationale", "requirements",
+ "test_procedure", "evidence", "severity", "domain",
+]
+
+BONUS_FIELDS = [
+ "tags", "category", "target_audience", "source_article",
+ "applicable_industries", "applicable_company_size",
+]
+
+
+def assess_quality(raw_text: str) -> dict:
+ """Assess the quality of a control generation response."""
+ result = {
+ "json_valid": False,
+ "required_fields": 0,
+ "required_total": len(REQUIRED_FIELDS),
+ "bonus_fields": 0,
+ "bonus_total": len(BONUS_FIELDS),
+ "requirements_count": 0,
+ "test_procedure_count": 0,
+ "evidence_count": 0,
+ "title_length": 0,
+ "objective_length": 0,
+ "score": 0,
+ }
+
+ # Try to parse JSON
+ text = raw_text.strip()
+ if text.startswith("```"):
+ lines = text.split("\n")
+ text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])
+
+ try:
+ data = json.loads(text)
+ if isinstance(data, list):
+ data = data[0] if data else {}
+ except json.JSONDecodeError:
+ # Try to find JSON object
+ import re
+ match = re.search(r'\{[\s\S]*\}', text)
+ if match:
+ try:
+ data = json.loads(match.group())
+ except json.JSONDecodeError:
+ return result
+ else:
+ return result
+
+ result["json_valid"] = True
+
+ # Check required fields
+ for f in REQUIRED_FIELDS:
+ val = data.get(f)
+ if val and (isinstance(val, str) and len(val) > 2 or isinstance(val, list) and len(val) > 0):
+ result["required_fields"] += 1
+
+ # Check bonus fields
+ for f in BONUS_FIELDS:
+ val = data.get(f)
+ if val and (isinstance(val, str) and len(val) > 0 or isinstance(val, list) and len(val) > 0):
+ result["bonus_fields"] += 1
+
+ # Depth metrics
+ reqs = data.get("requirements", [])
+ result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
+ tp = data.get("test_procedure", [])
+ result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
+ ev = data.get("evidence", [])
+ result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
+ result["title_length"] = len(data.get("title", ""))
+ result["objective_length"] = len(data.get("objective", ""))
+
+ # Score: 0-100
+ score = 0
+ score += 20 if result["json_valid"] else 0
+ score += (result["required_fields"] / result["required_total"]) * 40
+ score += (result["bonus_fields"] / result["bonus_total"]) * 15
+ score += min(result["requirements_count"], 5) * 3 # max 15 for 5+ requirements
+ score += min(result["test_procedure_count"], 3) * 3 # max 9 for 3+ tests
+ score += 1 if result["objective_length"] > 50 else 0
+ result["score"] = round(score, 1)
+
+ result["parsed_data"] = data
+ return result
+
+
+# ── Test Cases ───────────────────────────────────────────────────────
+
+TEST_CASES = [
+ {
+ "source": "DSGVO (EU) 2016/679",
+ "article": "Artikel 32",
+ "pdf": "dsgvo_2016_679.pdf",
+ "doc_type": "eu_regulation",
+ "license": "EU_LAW",
+ "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
+ },
+ {
+ "source": "KI-Verordnung (EU) 2024/1689",
+ "article": "Artikel 9",
+ "pdf": "ai_act_2024_1689.pdf",
+ "doc_type": "eu_regulation",
+ "license": "EU_LAW",
+ "description": "Risikomanagement für Hochrisiko-KI",
+ },
+ {
+ "source": "NIS2-Richtlinie (EU) 2022/2555",
+ "article": "Artikel 21",
+ "pdf": "nis2_2022_2555.pdf",
+ "doc_type": "eu_regulation",
+ "license": "EU_LAW",
+ "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
+ },
+ {
+ "source": "Cyber Resilience Act (CRA)",
+ "article": "Artikel 13",
+ "pdf": "cra_2024_2847.pdf",
+ "doc_type": "eu_regulation",
+ "license": "EU_LAW",
+ "description": "Pflichten der Hersteller",
+ },
+ {
+ "source": "Bundesdatenschutzgesetz (BDSG)",
+ "article": "§ 26",
+ "pdf": "bdsg.pdf",
+ "doc_type": "de_law",
+ "license": "DE_LAW",
+ "description": "Datenverarbeitung im Beschäftigungskontext",
+ },
+]
+
+
+# ── Main ─────────────────────────────────────────────────────────────
+
+def main():
+ if not ANTHROPIC_API_KEY:
+ print("ERROR: Set ANTHROPIC_API_KEY environment variable")
+ sys.exit(1)
+
+ print("=" * 80)
+ print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
+ print("=" * 80)
+ print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
+ print(f" Anthropic: {ANTHROPIC_MODEL}")
+ print(f" Tests: {len(TEST_CASES)}")
+ print()
+
+ # Pre-check LiteLLM
+ try:
+ r = requests.get(f"{LITELLM_URL}/v1/models",
+ headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
+ print(f" LiteLLM OK: {r.status_code}")
+ except Exception as e:
+ print(f" LiteLLM ERROR: {e}")
+ sys.exit(1)
+
+ results = []
+
+ for i, tc in enumerate(TEST_CASES):
+ print(f"\n{'='*80}")
+ print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']} — {tc['article']}")
+ print(f" {tc['description']}")
+ print(f"{'='*80}")
+
+ # Extract article text from PDF
+ article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
+ if not article_text or article_text.startswith("["):
+ print(f" WARNING: {article_text or 'Empty text'}")
+ continue
+
+ print(f" Text extracted: {len(article_text)} chars")
+ print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")
+
+ prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])
+
+ # ── Call LiteLLM ──
+ print(f"\n --- gpt-oss-120b ---")
+ litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
+ if litellm_err:
+ print(f" ERROR: {litellm_err}")
+ litellm_quality = {"json_valid": False, "score": 0}
+ else:
+ print(f" Time: {litellm_time:.1f}s")
+ print(f" Tokens: {litellm_usage}")
+ litellm_quality = assess_quality(litellm_raw)
+ print(f" JSON valid: {litellm_quality['json_valid']}")
+ print(f" Score: {litellm_quality['score']}/100")
+ print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
+ print(f" Requirements: {litellm_quality['requirements_count']}, "
+ f"Tests: {litellm_quality['test_procedure_count']}, "
+ f"Evidence: {litellm_quality['evidence_count']}")
+ if litellm_quality.get("parsed_data"):
+ d = litellm_quality["parsed_data"]
+ print(f" Title: {d.get('title', 'N/A')}")
+
+ # ── Call Anthropic ──
+ print(f"\n --- Claude Sonnet 4.6 ---")
+ anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
+ if anthropic_err:
+ print(f" ERROR: {anthropic_err}")
+ anthropic_quality = {"json_valid": False, "score": 0}
+ else:
+ print(f" Time: {anthropic_time:.1f}s")
+ print(f" Tokens: {anthropic_usage}")
+ anthropic_quality = assess_quality(anthropic_raw)
+ print(f" JSON valid: {anthropic_quality['json_valid']}")
+ print(f" Score: {anthropic_quality['score']}/100")
+ print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
+ print(f" Requirements: {anthropic_quality['requirements_count']}, "
+ f"Tests: {anthropic_quality['test_procedure_count']}, "
+ f"Evidence: {anthropic_quality['evidence_count']}")
+ if anthropic_quality.get("parsed_data"):
+ d = anthropic_quality["parsed_data"]
+ print(f" Title: {d.get('title', 'N/A')}")
+
+ # Compare
+ print(f"\n --- VERGLEICH ---")
+ speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
+ print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
+ f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
+ print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
+ f"Sonnet {anthropic_quality.get('score', 0)}/100")
+
+ results.append({
+ "test": f"{tc['source']} — {tc['article']}",
+ "litellm": {
+ "time": round(litellm_time, 1),
+ "score": litellm_quality.get("score", 0),
+ "json_valid": litellm_quality.get("json_valid", False),
+ "requirements": litellm_quality.get("requirements_count", 0),
+ "tests": litellm_quality.get("test_procedure_count", 0),
+ "usage": litellm_usage,
+ "raw": litellm_raw[:500] if litellm_raw else "",
+ },
+ "anthropic": {
+ "time": round(anthropic_time, 1),
+ "score": anthropic_quality.get("score", 0),
+ "json_valid": anthropic_quality.get("json_valid", False),
+ "requirements": anthropic_quality.get("requirements_count", 0),
+ "tests": anthropic_quality.get("test_procedure_count", 0),
+ "usage": anthropic_usage,
+ "raw": anthropic_raw[:500] if anthropic_raw else "",
+ },
+ })
+
+ # ── Summary ──────────────────────────────────────────────────────
+ print(f"\n\n{'='*80}")
+ print("ZUSAMMENFASSUNG")
+ print(f"{'='*80}")
+
+ if not results:
+ print(" Keine Ergebnisse.")
+ return
+
+ litellm_scores = [r["litellm"]["score"] for r in results]
+ anthropic_scores = [r["anthropic"]["score"] for r in results]
+ litellm_times = [r["litellm"]["time"] for r in results]
+ anthropic_times = [r["anthropic"]["time"] for r in results]
+
+ print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
+ print(f" {'-'*30} {'-'*15} {'-'*15}")
+ print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
+ f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
+ print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
+ f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
+ print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
+ f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
+ print(f" {'Avg Requirements':<30s} "
+ f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
+ f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
+ print(f" {'Avg Test Procedures':<30s} "
+ f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
+ f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")
+
+ # Cost estimate
+ # Claude Sonnet: ~$3/M input, ~$15/M output
+ # gpt-oss-120b: self-hosted = $0 API cost (only compute)
+ total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
+ total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
+ anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000
+
+ print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
+ print(f" gpt-oss-120b: $0.00 (self-hosted)")
+ print(f" Claude Sonnet: ${anthropic_cost:.4f} "
+ f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")
+
+ # Extrapolate for 494 gap articles
+ if results:
+ cost_per_control = anthropic_cost / len(results)
+ print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
+ print(f" gpt-oss-120b: $0.00")
+ print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
+ avg_time_120b = sum(litellm_times) / len(litellm_times)
+ avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
+ print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
+ print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")
+
+ # Save full results
+ out_path = "/tmp/benchmark_llm_results.json"
+ with open(out_path, 'w') as f:
+ json.dump(results, f, indent=2, ensure_ascii=False)
+ print(f"\n Detaillierte Ergebnisse: {out_path}")
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/qa/blue_guide_en_match.py b/scripts/qa/blue_guide_en_match.py
new file mode 100644
index 0000000..bad6974
--- /dev/null
+++ b/scripts/qa/blue_guide_en_match.py
@@ -0,0 +1,200 @@
+"""Match unmatched Blue Guide controls against the English PDF."""
+import os
+import re
+import json
+import unicodedata
+import psycopg2
+import urllib.parse
+
+try:
+ import fitz
+except ImportError:
+ print("ERROR: PyMuPDF (fitz) not installed")
+ exit(1)
+
+PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/blue_guide_2022_en.pdf")
+
+def normalize(s):
+ s = s.replace('\u00ad', '').replace('\xad', '')
+ s = s.replace('\u200b', '').replace('\u00a0', ' ')
+ s = s.replace('\ufb01', 'fi').replace('\ufb02', 'fl')
+ s = s.replace('\ufb00', 'ff').replace('\ufb03', 'ffi').replace('\ufb04', 'ffl')
+ s = s.replace('\u2019', "'").replace('\u2018', "'")
+ s = s.replace('\u201c', '"').replace('\u201d', '"')
+ s = s.replace('\u2013', '-').replace('\u2014', '-')
+ s = s.replace('\u2022', '-').replace('\u00b7', '-')
+ s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
+ s = unicodedata.normalize('NFC', s)
+ s = re.sub(r'\s+', ' ', s)
+ return s.strip()
+
+# Read EN PDF
+print(f"Reading {PDF_PATH}...")
+doc = fitz.open(PDF_PATH)
+text = ""
+for page in doc:
+ text += page.get_text() + "\n"
+doc.close()
+print(f" {len(text):,} chars")
+
+text_norm = normalize(text)
+
+# Build article index for EN Blue Guide
+# EN Blue Guide uses "Article N" headings (not "Artikel N")
+items = []
+
+# Find where "Article 1" starts — content before is preamble/intro
+art1_match = re.search(r'\nArticle\s+1\s*\n', text)
+if not art1_match:
+ # Try section-based structure instead
+ print(" No 'Article N' headings found, trying section-based index...")
+ for m in re.finditer(r'(?:^|\n)\s*(\d+(?:\.\d+)*)\.\s+[A-Z]', text, re.MULTILINE):
+ items.append((m.start(), f"Section {m.group(1)}", "section"))
+else:
+ art1_pos = art1_match.start()
+ # Article headings
+ for m in re.finditer(r'(?:^|\n)\s*Article\s+(\d+[a-z]?)\s*\n', text, re.MULTILINE):
+ art_num = int(re.match(r'(\d+)', m.group(1)).group(1))
+ items.append((m.start(), f"Article {m.group(1)}", "article"))
+
+ # Annex markers
+ for m in re.finditer(r'(?:^|\n)\s*ANNEX\s+([IVXLC]+[a-z]?)\b', text, re.MULTILINE):
+ items.append((m.start(), f"Annex {m.group(1)}", "annex"))
+
+# Also try numbered section headings as fallback
+for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
+ items.append((m.start(), f"Section {m.group(1)}", "section"))
+
+items.sort(key=lambda x: x[0])
+seen = set()
+unique = []
+for pos, label, typ in items:
+ if label not in seen:
+ seen.add(label)
+ unique.append((pos, label, typ))
+
+print(f" Index: {len(unique)} sections")
+if unique[:5]:
+ for pos, label, typ in unique[:5]:
+ print(f" {label} [{typ}] @ pos {pos}")
+
+# Precompute normalized positions
+index_norm = []
+for pos, label, typ in unique:
+ norm_pos = len(normalize(text[:pos]))
+ index_norm.append((norm_pos, label, typ))
+
+# Connect to DB
+db_url = os.environ['DATABASE_URL']
+parsed = urllib.parse.urlparse(db_url)
+conn = psycopg2.connect(
+ host=parsed.hostname, port=parsed.port or 5432,
+ user=parsed.username, password=parsed.password,
+ dbname=parsed.path.lstrip('/'),
+ options="-c search_path=compliance,public"
+)
+cur = conn.cursor()
+
+# Get Blue Guide controls without article_type (unmatched)
+cur.execute("""
+ SELECT id, control_id, title, source_original_text,
+ source_citation->>'article' as existing_article,
+ source_citation->>'article_type' as existing_type,
+ release_state
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' = 'EU Blue Guide 2022'
+ AND source_original_text IS NOT NULL
+ AND length(source_original_text) > 50
+ AND (source_citation->>'article_type' IS NULL)
+ ORDER BY control_id
+""")
+controls = cur.fetchall()
+print(f"\nUnmatched Blue Guide controls: {len(controls)}")
+
+# Match each control
+results = []
+found = 0
+not_found = 0
+
+for ctrl in controls:
+ ctrl_id, control_id, title, orig_text, existing_art, existing_type, state = ctrl
+ orig_norm = normalize(orig_text)
+ if len(orig_norm) < 30:
+ not_found += 1
+ continue
+
+ matched = False
+ for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
+ for length in [80, 60, 40, 30, 20]:
+ start = max(0, int(len(orig_norm) * start_frac))
+ snippet = orig_norm[start:start+length]
+ if not snippet or len(snippet) < 15:
+ continue
+ pos = text_norm.find(snippet)
+ if pos >= 0:
+ # Find section
+ label = "Unknown"
+ typ = "unknown"
+ for h_pos, h_label, h_type in reversed(index_norm):
+ if h_pos <= pos:
+ label = h_label
+ typ = h_type
+ break
+ results.append({
+ "ctrl_id": str(ctrl_id),
+ "control_id": control_id,
+ "source": "EU Blue Guide 2022",
+ "article_label": label,
+ "article_type": typ,
+ })
+ found += 1
+ is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]"
+ print(f" {control_id:10s}: {label:25s} [{typ:8s}]{is_active}")
+ matched = True
+ break
+ if matched:
+ break
+
+ if not matched:
+ not_found += 1
+ print(f" {control_id:10s}: NOT FOUND {title[:50]}")
+
+print(f"\n{'='*50}")
+print(f"Results: {found} matched, {not_found} not found out of {len(controls)}")
+
+# Save results
+out_path = "/tmp/blue_guide_en_results.json"
+with open(out_path, 'w') as f:
+ json.dump(results, f, indent=2, ensure_ascii=False)
+print(f"Saved to {out_path}")
+
+# Apply results to DB
+if results:
+ print(f"\nApplying {len(results)} results to DB...")
+ applied = 0
+ for r in results:
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET source_citation = source_citation ||
+ jsonb_build_object('article', %s, 'article_type', %s)
+ WHERE id = %s::uuid
+ AND (source_citation->>'article' IS DISTINCT FROM %s
+ OR source_citation->>'article_type' IS DISTINCT FROM %s)
+ """, (r["article_label"], r["article_type"],
+ r["ctrl_id"], r["article_label"], r["article_type"]))
+ if cur.rowcount > 0:
+ applied += 1
+ conn.commit()
+ print(f" Applied: {applied} controls updated")
+
+# Show type distribution
+type_counts = {}
+for r in results:
+ t = r["article_type"]
+ type_counts[t] = type_counts.get(t, 0) + 1
+if type_counts:
+ print(f"\nArticle type distribution:")
+ for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
+ print(f" {t:12s}: {c:5d}")
+
+conn.close()
diff --git a/scripts/qa/gap_analysis.py b/scripts/qa/gap_analysis.py
new file mode 100644
index 0000000..032599d
--- /dev/null
+++ b/scripts/qa/gap_analysis.py
@@ -0,0 +1,188 @@
+"""
+Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls.
+
+For each regulation PDF:
+1. Extract all articles/sections from the PDF
+2. Compare with controls in the DB that reference this article
+3. Report gaps (articles with no controls)
+
+Usage:
+ python3 gap_analysis.py # show all gaps
+ python3 gap_analysis.py --source "DSGVO" # filter by source
+"""
+import os
+import sys
+import json
+import re
+import psycopg2
+import urllib.parse
+from pathlib import Path
+from collections import defaultdict
+
+# Import from pdf_qa_all
+sys.path.insert(0, os.path.dirname(__file__))
+from pdf_qa_all import (
+ SOURCE_FILE_MAP, read_file, classify_doc, normalize,
+ build_eu_article_index, build_de_law_index, build_nist_index,
+ build_owasp_index, build_generic_index, MAX_ARTICLES
+)
+
+# Only analyze sources with significant control counts (skip sources with <5 controls)
+MIN_CONTROLS = 5
+
+
+def main():
+ source_filter = None
+ if "--source" in sys.argv:
+ idx = sys.argv.index("--source")
+ if idx + 1 < len(sys.argv):
+ source_filter = sys.argv[idx + 1]
+
+ # DB connection
+ db_url = os.environ['DATABASE_URL']
+ parsed = urllib.parse.urlparse(db_url)
+ conn = psycopg2.connect(
+ host=parsed.hostname, port=parsed.port or 5432,
+ user=parsed.username, password=parsed.password,
+ dbname=parsed.path.lstrip('/'),
+ options="-c search_path=compliance,public"
+ )
+ cur = conn.cursor()
+
+ # Get all controls grouped by source with their article
+ cur.execute("""
+ SELECT source_citation->>'source' as source,
+ source_citation->>'article' as article,
+ source_citation->>'article_type' as article_type,
+ count(*) as cnt
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' IS NOT NULL
+ AND release_state NOT IN ('duplicate', 'too_close')
+ GROUP BY 1, 2, 3
+ ORDER BY 1, 2
+ """)
+
+ # Build: source -> {article -> (type, count)}
+ controls_by_source = defaultdict(dict)
+ for source, article, art_type, cnt in cur.fetchall():
+ if article:
+ controls_by_source[source][article] = (art_type or "unknown", cnt)
+
+ total_gaps = 0
+ total_articles_checked = 0
+ total_covered = 0
+ gap_report = []
+
+ sources_to_check = sorted(SOURCE_FILE_MAP.keys())
+ if source_filter:
+ sources_to_check = [s for s in sources_to_check if source_filter.lower() in s.lower()]
+
+ for source_name in sources_to_check:
+ filename = SOURCE_FILE_MAP.get(source_name)
+ if filename is None:
+ continue
+
+ controls = controls_by_source.get(source_name, {})
+ if len(controls) < MIN_CONTROLS and not source_filter:
+ continue
+
+ # Read PDF and build article index
+ text = read_file(filename)
+ if text is None:
+ continue
+
+ doc_type = classify_doc(source_name)
+ max_art = MAX_ARTICLES.get(source_name)
+
+ if doc_type == "eu_regulation":
+ index = build_eu_article_index(text, max_article=max_art)
+ elif doc_type == "de_law":
+ index = build_de_law_index(text)
+ elif doc_type == "nist":
+ index = build_nist_index(text)
+ elif doc_type == "owasp":
+ index = build_owasp_index(text, source_name)
+ else:
+ index = build_generic_index(text)
+
+ if not index:
+ continue
+
+ # Only look at substantive articles (not preamble, not annex for gap analysis)
+ substantive_types = {"article", "section", "control", "requirement", "category"}
+ substantive_articles = [(pos, label, typ) for pos, label, typ in index if typ in substantive_types]
+
+ preamble_articles = [(pos, label, typ) for pos, label, typ in index if typ == "preamble"]
+ annex_articles = [(pos, label, typ) for pos, label, typ in index if typ == "annex"]
+
+ # Check which articles have controls
+ covered = []
+ gaps = []
+ for pos, label, typ in substantive_articles:
+ if label in controls:
+ covered.append(label)
+ else:
+ gaps.append((label, typ))
+
+ total_articles_checked += len(substantive_articles)
+ total_covered += len(covered)
+ total_gaps += len(gaps)
+
+ # Count preamble/annex controls
+ preamble_controls = sum(1 for a in controls if controls[a][0] == "preamble")
+ annex_controls = sum(1 for a in controls if controls[a][0] == "annex")
+
+ coverage_pct = len(covered) / len(substantive_articles) * 100 if substantive_articles else 0
+
+ print(f"\n{'='*70}")
+ print(f"{source_name}")
+ print(f" PDF articles: {len(substantive_articles)} substantive, "
+ f"{len(preamble_articles)} preamble, {len(annex_articles)} annex")
+ print(f" DB controls: {sum(v[1] for v in controls.values())} total "
+ f"({preamble_controls} preamble, {annex_controls} annex)")
+ print(f" Coverage: {len(covered)}/{len(substantive_articles)} "
+ f"({coverage_pct:.0f}%)")
+
+ if gaps:
+ print(f" GAPS ({len(gaps)}):")
+ for label, typ in gaps[:30]: # limit output
+ print(f" - {label} [{typ}]")
+ if len(gaps) > 30:
+ print(f" ... and {len(gaps)-30} more")
+
+ gap_report.append({
+ "source": source_name,
+ "total_articles": len(substantive_articles),
+ "covered": len(covered),
+ "gaps": len(gaps),
+ "coverage_pct": round(coverage_pct, 1),
+ "gap_articles": [{"label": l, "type": t} for l, t in gaps],
+ })
+
+ # Summary
+ print(f"\n{'='*70}")
+ print("GAP ANALYSIS SUMMARY")
+ print(f"{'='*70}")
+ print(f" Sources analyzed: {len([r for r in gap_report]) + len([s for s in sources_to_check if SOURCE_FILE_MAP.get(s)])}")
+ print(f" Total articles in PDFs: {total_articles_checked}")
+ print(f" Articles with controls: {total_covered}")
+ print(f" Articles WITHOUT controls: {total_gaps}")
+ if total_articles_checked:
+ print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")
+
+ print(f"\n Sources with gaps:")
+ for r in sorted(gap_report, key=lambda x: -x["gaps"]):
+ print(f" {r['source']:45s} {r['gaps']:4d} gaps "
+ f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")
+
+ # Save report
+ out_path = "/tmp/gap_analysis_results.json"
+ with open(out_path, 'w') as f:
+ json.dump(gap_report, f, indent=2, ensure_ascii=False)
+ print(f"\n Full report saved to {out_path}")
+
+ conn.close()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/qa/oscal_analysis.py b/scripts/qa/oscal_analysis.py
new file mode 100644
index 0000000..edfd103
--- /dev/null
+++ b/scripts/qa/oscal_analysis.py
@@ -0,0 +1,288 @@
+"""Analyze NIST OSCAL data and compare with existing controls in DB."""
+import os
+import re
+import json
+import psycopg2
+import urllib.parse
+from collections import defaultdict
+
+OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
+
+# ── Load SP 800-53 Rev 5 ──
+with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
+ sp853 = json.load(f)["catalog"]
+
+print("=" * 70)
+print("NIST SP 800-53 Rev 5 — OSCAL Catalog Analysis")
+print("=" * 70)
+print(f" UUID: {sp853.get('uuid', '?')}")
+print(f" Last Modified: {sp853.get('metadata', {}).get('last-modified', '?')}")
+
+# Count controls
+families = sp853.get("groups", [])
+total_base = 0
+total_enhancements = 0
+total_withdrawn = 0
+total_active = 0
+family_stats = []
+
+for fam in families:
+ fam_id = fam.get("id", "?")
+ fam_title = fam.get("title", "?")
+ controls = fam.get("controls", [])
+ base = 0
+ enhancements = 0
+ withdrawn = 0
+
+ for ctrl in controls:
+ # Check if withdrawn
+ props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
+ is_withdrawn = props.get("status") == "withdrawn"
+ if is_withdrawn:
+ withdrawn += 1
+ else:
+ base += 1
+
+ # Count enhancements
+ for enh in ctrl.get("controls", []):
+ enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
+ if enh_props.get("status") == "withdrawn":
+ withdrawn += 1
+ else:
+ enhancements += 1
+
+ family_stats.append((fam_id, fam_title, base, enhancements, withdrawn))
+ total_base += base
+ total_enhancements += enhancements
+ total_withdrawn += withdrawn
+
+total_active = total_base + total_enhancements
+print(f"\n Families: {len(families)}")
+print(f" Base Controls: {total_base}")
+print(f" Enhancements: {total_enhancements}")
+print(f" Withdrawn: {total_withdrawn}")
+print(f" TOTAL ACTIVE: {total_active}")
+
+print(f"\n Per Family:")
+print(f" {'ID':6s} {'Title':45s} {'Base':>5s} {'Enh':>5s} {'Wdrn':>5s}")
+for fam_id, title, base, enh, wdrn in family_stats:
+ print(f" {fam_id:6s} {title[:45]:45s} {base:5d} {enh:5d} {wdrn:5d}")
+
+# Show example control structure
+print(f"\n Example Control (AC-6 Least Privilege):")
+for fam in families:
+ for ctrl in fam.get("controls", []):
+ if ctrl["id"] == "ac-6":
+ props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
+ print(f" ID: {ctrl['id']}")
+ print(f" Label: {props.get('label', '?')}")
+ print(f" Title: {ctrl['title']}")
+ for part in ctrl.get("parts", []):
+ if part.get("name") == "statement":
+ prose = part.get("prose", "")
+ print(f" Statement: {prose[:150]}...")
+ elif part.get("name") == "guidance":
+ prose = part.get("prose", "")
+ print(f" Guidance: {prose[:150]}...")
+ enh_count = len(ctrl.get("controls", []))
+ print(f" Enhancements: {enh_count}")
+ links = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]
+ print(f" Related: {', '.join(links[:8])}...")
+ break
+
+# ── Load CSF 2.0 ──
+print(f"\n{'='*70}")
+print("NIST CSF 2.0 — OSCAL Catalog Analysis")
+print("=" * 70)
+
+with open(os.path.join(OSCAL_DIR, "csf-2.0-catalog.json")) as f:
+ csf = json.load(f)["catalog"]
+
+csf_groups = csf.get("groups", [])
+csf_total = 0
+for grp in csf_groups:
+ func_title = grp.get("title", "?")
+ cats = grp.get("groups", [])
+ subcats = 0
+ for cat in cats:
+ subcats += len(cat.get("controls", []))
+ csf_total += subcats
+ print(f" {func_title:25s}: {len(cats):2d} categories, {subcats:3d} subcategories")
+
+print(f" TOTAL: {csf_total} subcategories")
+
+# ── Compare with existing DB controls ──
+print(f"\n{'='*70}")
+print("VERGLEICH: OSCAL vs. bestehende Controls in DB")
+print("=" * 70)
+
+db_url = os.environ['DATABASE_URL']
+parsed = urllib.parse.urlparse(db_url)
+conn = psycopg2.connect(
+ host=parsed.hostname, port=parsed.port or 5432,
+ user=parsed.username, password=parsed.password,
+ dbname=parsed.path.lstrip('/'),
+ options="-c search_path=compliance,public"
+)
+cur = conn.cursor()
+
+# Get existing NIST controls
+cur.execute("""
+ SELECT control_id, title,
+ source_citation->>'source' as source,
+ source_citation->>'article' as article,
+ source_citation->>'article_type' as art_type,
+ release_state
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' LIKE 'NIST%%'
+ ORDER BY source_citation->>'source', control_id
+""")
+nist_controls = cur.fetchall()
+
+# Group by source
+by_source = defaultdict(list)
+for ctrl in nist_controls:
+ by_source[ctrl[2]].append(ctrl)
+
+print(f"\n Bestehende NIST Controls in DB:")
+for src in sorted(by_source.keys()):
+ ctrls = by_source[src]
+ active = sum(1 for c in ctrls if c[5] not in ('duplicate', 'too_close'))
+ with_article = sum(1 for c in ctrls if c[3])
+ print(f" {src:40s}: {len(ctrls):4d} total, {active:4d} active, {with_article:4d} mit article")
+
+# For SP 800-53: which control families do we have?
+sp853_existing = [c for c in nist_controls if 'SP 800-53' in (c[2] or '')]
+existing_families = set()
+existing_articles = set()
+for ctrl in sp853_existing:
+ article = ctrl[3] or ""
+ if article:
+ # Extract family prefix (e.g., "AC-6" → "AC")
+ m = re.match(r'([A-Z]{2})-', article)
+ if m:
+ existing_families.add(m.group(1))
+ existing_articles.add(article)
+
+print(f"\n SP 800-53 in DB:")
+print(f" Total: {len(sp853_existing)}")
+print(f" Families covered: {len(existing_families)}")
+print(f" Unique articles: {len(existing_articles)}")
+print(f" Families: {', '.join(sorted(existing_families))}")
+
+# Compare: which OSCAL controls are NOT in our DB?
+def _summarize_oscal_node(node):
+    """Summarize one OSCAL control or enhancement for the DB comparison.
+
+    Returns ``None`` when the node's ``status`` prop is ``withdrawn``;
+    otherwise ``(label, (title, statement, guidance))`` with statement and
+    guidance cut to 500 characters. The label falls back to the upper-cased
+    node id when no ``label`` prop exists.
+    """
+    props = {p["name"]: p.get("value", "") for p in node.get("props", [])}
+    if props.get("status") == "withdrawn":
+        return None
+    label = props.get("label", node["id"].upper())
+    statement = ""
+    guidance = ""
+    for part in node.get("parts", []):
+        if part.get("name") == "statement":
+            statement = part.get("prose", "")
+            # Direct sub-items are appended on the same line, space separated.
+            for sub in part.get("parts", []):
+                statement += " " + sub.get("prose", "")
+        elif part.get("name") == "guidance":
+            guidance = part.get("prose", "")
+    return label, (node["title"], statement[:500], guidance[:500])
+
+
+oscal_controls = {}  # label → (title, statement[:500], guidance[:500])
+for fam in families:
+    for ctrl in fam.get("controls", []):
+        base = _summarize_oscal_node(ctrl)
+        if base is None:
+            # A withdrawn base control is skipped together with its enhancements.
+            continue
+        oscal_controls[base[0]] = base[1]
+
+        # Enhancements
+        for enh in ctrl.get("controls", []):
+            extra = _summarize_oscal_node(enh)
+            if extra is not None:
+                oscal_controls[extra[0]] = extra[1]
+
+print(f"\n OSCAL SP 800-53 aktive Controls: {len(oscal_controls)}")
+
+# Find missing: in OSCAL but not in DB
+missing = []
+covered = []
+for label in sorted(oscal_controls.keys()):
+ if label in existing_articles:
+ covered.append(label)
+ else:
+ missing.append(label)
+
+print(f" In DB vorhanden: {len(covered)}")
+print(f" FEHLEND in DB: {len(missing)}")
+
+# Missing by family
+missing_by_fam = defaultdict(list)
+for label in missing:
+ fam = label.split("-")[0]
+ missing_by_fam[fam].append(label)
+
+print(f"\n Fehlende Controls nach Family:")
+for fam in sorted(missing_by_fam.keys()):
+ ctrls = missing_by_fam[fam]
+ examples = ", ".join(ctrls[:5])
+ more = f" +{len(ctrls)-5}" if len(ctrls) > 5 else ""
+ print(f" {fam:4s}: {len(ctrls):3d} fehlend ({examples}{more})")
+
+# Also check CSF 2.0
+print(f"\n{'='*70}")
+print("NIST CSF 2.0 — Vergleich mit DB")
+print("=" * 70)
+
+cur.execute("""
+ SELECT count(*), count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' LIKE 'NIST Cybersecurity%%'
+""")
+csf_row = cur.fetchone()
+print(f" CSF Controls in DB: {csf_row[0]} total, {csf_row[1]} active")
+
+csf_subcats = 0
+csf_ids = []
+for grp in csf_groups:
+ for cat in grp.get("groups", []):
+ for subcat in cat.get("controls", []):
+ csf_subcats += 1
+ props = {p["name"]: p.get("value", "") for p in subcat.get("props", [])}
+ csf_ids.append(props.get("label", subcat["id"]))
+
+print(f" CSF 2.0 OSCAL Subcategories: {csf_subcats}")
+print(f" Beispiele: {', '.join(csf_ids[:10])}")
+
+# ── Summary / Potential ──
+print(f"\n{'='*70}")
+print("POTENTIAL: Was OSCAL uns bringt")
+print("=" * 70)
+print(f"""
+ SP 800-53 Rev 5:
+ - {len(missing)} neue Controls möglich (aktuell {len(covered)} in DB)
+ - Jeder Control hat: Statement + Guidance + Assessment-Methoden
+ - Cross-References zwischen Controls (für Mapping)
+ - Maschinenlesbare Parameter (ODP)
+ - Public Domain — keine Lizenzprobleme
+
+ CSF 2.0:
+ - {csf_subcats} Subcategories als Compliance-Controls
+ - 6 Functions (Govern, Identify, Protect, Detect, Respond, Recover)
+ - Direkte Mappings zu SP 800-53 Controls
+
+ Nächste Schritte:
+ 1. Fehlende SP 800-53 Controls importieren ({len(missing)} Controls)
+ 2. Statement-Text als source_original_text verwenden
+ 3. article_type='control', article=Label (z.B. 'AC-6')
+ 4. CSF 2.0 als eigene Regulation importieren
+ 5. Cross-References als Grundlage für Control-Mappings nutzen
+""")
+
+conn.close()
diff --git a/scripts/qa/oscal_import.py b/scripts/qa/oscal_import.py
new file mode 100644
index 0000000..ff874d8
--- /dev/null
+++ b/scripts/qa/oscal_import.py
@@ -0,0 +1,289 @@
+"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
+import os
+import re
+import json
+import uuid
+import psycopg2
+import urllib.parse
+
+OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
+
+with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
+ sp853 = json.load(f)["catalog"]
+
+# ── Extract all OSCAL controls ──
+def extract_controls(catalog):
+    """Collect every active control and enhancement of an OSCAL catalog.
+
+    Walks ``catalog["groups"]`` (the families), each family's ``controls``
+    and each control's nested ``controls`` (its enhancements), and passes
+    every node to ``extract_single`` together with the family title.
+
+    Returns a list of the dicts produced by ``extract_single``; nodes for
+    which it returns a falsy value (withdrawn ones) are left out. A base
+    control always precedes its own enhancements in the list.
+    """
+    controls = []
+    for fam in catalog.get("groups", []):
+        fam_title = fam.get("title", "")
+        for ctrl in fam.get("controls", []):
+            # The base control and its enhancements share one extraction path.
+            for node in (ctrl, *ctrl.get("controls", [])):
+                result = extract_single(node, fam_title)
+                if result:
+                    controls.append(result)
+    return controls
+
+def _prop_value(node, prop_name):
+    """Return the value of the last prop named ``prop_name`` on ``node``, or ''."""
+    value = ""
+    for prop in node.get("props", []):
+        if prop["name"] == prop_name:
+            value = prop.get("value", "")
+    return value
+
+
+def _statement_item_lines(items, depth=0):
+    """Render nested statement items depth-first, one text fragment per item.
+
+    Each fragment starts with a newline followed by ``depth`` spaces, so items
+    directly below the statement are unindented and every further nesting
+    level is indented by one more space. Items with neither label nor prose
+    produce no fragment, but their children are still visited.
+    """
+    indent = " " * depth
+    lines = []
+    for item in items:
+        prose = item.get("prose", "")
+        label = _prop_value(item, "label")
+        if label:
+            lines.append(f"\n{indent}{label} {prose}")
+        elif prose:
+            lines.append(f"\n{indent}{prose}")
+        lines.extend(_statement_item_lines(item.get("parts", []), depth + 1))
+    return lines
+
+
+def extract_single(ctrl, family_title):
+    """Extract a single control or enhancement.
+
+    Args:
+        ctrl: One OSCAL control node (dict) with ``id`` and optional
+            ``title``, ``props``, ``parts``, ``links`` and ``params``.
+        family_title: Title of the family the node belongs to; copied into
+            the result unchanged.
+
+    Returns:
+        ``None`` when the node's ``status`` prop is ``withdrawn``; otherwise
+        a dict with ``label``, ``title``, ``family``, ``statement``,
+        ``guidance``, ``related``, ``params`` and ``is_enhancement``.
+
+    Statement items are included at every nesting depth, so text nested
+    below the second level is no longer lost.
+    """
+    props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
+    if props.get("status") == "withdrawn":
+        return None
+
+    label = props.get("label", ctrl["id"].upper())
+    title = ctrl.get("title", "")
+
+    # Main requirement text plus its labelled sub-items (a., b., 1., ...).
+    # If several statement/guidance parts exist, the last one wins.
+    statement = ""
+    guidance = ""
+    for part in ctrl.get("parts", []):
+        if part.get("name") == "statement":
+            statement = part.get("prose", "") + "".join(
+                _statement_item_lines(part.get("parts", []))
+            )
+        elif part.get("name") == "guidance":
+            guidance = part.get("prose", "")
+
+    # Cross-references: only links marked as "related", without the leading '#'.
+    related = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]
+
+    # Parameters
+    params = []
+    for p in ctrl.get("params", []):
+        params.append({
+            "id": p.get("id", ""),
+            "label": p.get("label", ""),
+            "guidelines": "".join(g.get("prose", "") for g in p.get("guidelines", [])),
+            "choices": list(p["select"].get("choice", [])) if "select" in p else [],
+        })
+
+    return {
+        "label": label,
+        "title": title,
+        "family": family_title,
+        "statement": statement.strip(),
+        "guidance": guidance.strip(),
+        "related": related,
+        "params": params,
+        # Enhancement labels carry a parenthesized number, e.g. "AC-2(1)".
+        "is_enhancement": "(" in label,
+    }
+
+all_oscal = extract_controls(sp853)
+print(f"Total OSCAL active controls: {len(all_oscal)}")
+
+# ── Normalize label for comparison ──
+def normalize_label(label):
+    """Canonicalize a control label so zero-padded and plain forms compare equal.
+
+    Leading zeros are dropped from the number after a hyphen and from a
+    parenthesized number, then the result is upper-cased, e.g.
+    ``"ac-06(01)"`` becomes ``"AC-6(1)"``.
+    """
+    unpadded_number = re.sub(r'-0+(\d)', r'-\1', label)
+    unpadded_enhancement = re.sub(r'\(0+(\d+)\)', r'(\1)', unpadded_number)
+    return unpadded_enhancement.upper()
+
+# ── DB connection ──
+db_url = os.environ['DATABASE_URL']
+parsed = urllib.parse.urlparse(db_url)
+conn = psycopg2.connect(
+ host=parsed.hostname, port=parsed.port or 5432,
+ user=parsed.username, password=parsed.password,
+ dbname=parsed.path.lstrip('/'),
+ options="-c search_path=compliance,public"
+)
+cur = conn.cursor()
+
+# Get existing labels
+cur.execute("""
+ SELECT DISTINCT source_citation->>'article' as article
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
+ AND source_citation->>'article' IS NOT NULL
+""")
+existing_labels = set(normalize_label(r[0]) for r in cur.fetchall())
+print(f"Existing DB labels (normalized): {len(existing_labels)}")
+
+# Get highest control_id numbers per prefix
+cur.execute("""
+ SELECT control_id FROM compliance.canonical_controls
+ WHERE control_id ~ '^[A-Z]+-[0-9]+$'
+ ORDER BY control_id
+""")
+existing_ids = set(r[0] for r in cur.fetchall())
+
+# Find next available ID per prefix
+def next_control_id(prefix, existing):
+    """Return the highest number already used for ``prefix`` in ``existing``.
+
+    Despite its name, this does not return the next free number: it returns
+    the current maximum N among ids shaped exactly like ``PREFIX-N`` (0 when
+    none match). The caller increments the value before building a new id.
+
+    Args:
+        prefix: Control-id prefix such as ``"SEC"``; matched literally.
+        existing: Iterable of existing control-id strings.
+
+    Returns:
+        The largest numeric suffix found for ``prefix`` as an int, or 0.
+    """
+    # re.escape keeps a prefix containing regex metacharacters from matching
+    # unrelated ids.
+    pattern = re.compile(rf'^{re.escape(prefix)}-(\d+)$')
+    return max(
+        (int(m.group(1)) for m in map(pattern.match, existing) if m),
+        default=0,
+    )
+
+# Map NIST families to our control_id prefixes
+FAMILY_PREFIX = {
+ "Access Control": "ACC",
+ "Awareness and Training": "GOV",
+ "Audit and Accountability": "LOG",
+ "Assessment, Authorization, and Monitoring": "GOV",
+ "Configuration Management": "COMP",
+ "Contingency Planning": "INC",
+ "Identification and Authentication": "AUTH",
+ "Incident Response": "INC",
+ "Maintenance": "COMP",
+ "Media Protection": "DATA",
+ "Physical and Environmental Protection": "SEC",
+ "Planning": "GOV",
+ "Program Management": "GOV",
+ "Personnel Security": "GOV",
+ "Personally Identifiable Information Processing and Transparency": "DATA",
+ "Risk Assessment": "GOV",
+ "System and Services Acquisition": "COMP",
+ "System and Communications Protection": "NET",
+ "System and Information Integrity": "SEC",
+ "Supply Chain Risk Management": "COMP",
+}
+
+# Highest number currently in use for each of our control_id prefixes;
+# incremented before each new id is built.
+prefix_counters = {
+    prefix: next_control_id(prefix, existing_ids)
+    for prefix in set(FAMILY_PREFIX.values())
+}
+print(f"Starting counters: {prefix_counters}")
+
+# ── Filter to only new controls ──
+# A control is new when its normalized label is not among the labels already
+# stored for this source.
+to_import = [
+    ctrl for ctrl in all_oscal
+    if normalize_label(ctrl["label"]) not in existing_labels
+]
+
+print(f"\nControls to import: {len(to_import)}")
+
+# ── Import ──
+# Value written to the framework_id column of every imported row. It does not
+# depend on the control, so it is defined once instead of on every iteration.
+FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'
+
+imported = 0
+try:
+    for ctrl in to_import:
+        # Unmapped families fall back to the COMP prefix.
+        prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
+        prefix_counters[prefix] += 1
+        control_id = f"{prefix}-{prefix_counters[prefix]:04d}"
+
+        # Build title: "NIST {label}: {title}"
+        title = f"NIST {ctrl['label']}: {ctrl['title']}"
+
+        # source_original_text = statement (the official requirement text);
+        # without a statement, fall back to a guidance excerpt, then the title.
+        source_text = ctrl["statement"]
+        if not source_text:
+            source_text = ctrl["guidance"][:500] if ctrl["guidance"] else ctrl["title"]
+
+        # objective = guidance text
+        objective = ctrl["guidance"][:2000] if ctrl["guidance"] else ""
+
+        # source_citation
+        citation = {
+            "source": "NIST SP 800-53 Rev. 5",
+            "article": ctrl["label"],
+            "article_type": "control",
+            "source_type": "standard",
+            "oscal_import": True,
+        }
+        if ctrl["related"]:
+            citation["related_controls"] = ctrl["related"][:20]
+        if ctrl["params"]:
+            citation["parameters"] = [{"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]]
+
+        new_id = str(uuid.uuid4())
+        cur.execute("""
+ INSERT INTO compliance.canonical_controls
+ (id, framework_id, control_id, title, objective, rationale,
+ severity, source_original_text,
+ source_citation, pipeline_version, release_state,
+ generation_strategy, category)
+ VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
+ """, (
+            new_id,
+            FRAMEWORK_ID,
+            control_id,
+            title[:500],
+            objective[:5000],
+            source_text[:10000],
+            json.dumps(citation, ensure_ascii=False),
+            ctrl["family"],
+        ))
+        imported += 1
+
+    # All rows are committed together, so a failure imports nothing.
+    conn.commit()
+except Exception:
+    # Discard the partial batch explicitly and release the connection
+    # before the error is reported.
+    conn.rollback()
+    conn.close()
+    raise
+print(f"\nImported: {imported} new controls")
+
+# ── Verify ──
+cur.execute("""
+ SELECT count(*),
+ count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
+""")
+total, active = cur.fetchone()
+print(f"\nSP 800-53 after import: {total} total, {active} active")
+
+cur.execute("""
+ SELECT release_state, count(*)
+ FROM compliance.canonical_controls
+ GROUP BY release_state
+ ORDER BY count(*) DESC
+""")
+print(f"\nDB release_state gesamt:")
+for row in cur.fetchall():
+ print(f" {row[0]:15s}: {row[1]:5d}")
+
+cur.execute("""
+ SELECT count(*)
+ FROM compliance.canonical_controls
+ WHERE release_state NOT IN ('duplicate', 'too_close')
+""")
+print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")
+
+# ── Import stats by family ──
+fam_counts = {}
+for ctrl in to_import:
+ fam = ctrl["family"]
+ fam_counts[fam] = fam_counts.get(fam, 0) + 1
+
+print(f"\nImportiert nach Family:")
+for fam in sorted(fam_counts.keys()):
+ print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")
+
+conn.close()
diff --git a/scripts/qa/owasp_cleanup.py b/scripts/qa/owasp_cleanup.py
new file mode 100644
index 0000000..5bcf2c0
--- /dev/null
+++ b/scripts/qa/owasp_cleanup.py
@@ -0,0 +1,274 @@
+"""OWASP Cleanup:
+1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
+2. Fix 47 wrong source attributions (found in different OWASP PDF)
+"""
+import os
+import re
+import json
+import unicodedata
+import psycopg2
+import urllib.parse
+
+try:
+ import fitz
+except ImportError:
+ print("ERROR: PyMuPDF not installed")
+ exit(1)
+
+PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
+
+def normalize(s):
+    """Bring text into a canonical form for substring comparison.
+
+    Removes soft hyphens, zero-width spaces and most ASCII control
+    characters, expands the ff/fi/fl/ffi/ffl ligatures, maps curly quotes to
+    straight ones and dashes, bullets and middle dots to ``-``, applies NFC
+    and finally collapses every whitespace run to one space and trims the ends.
+    """
+    char_map = str.maketrans({
+        '\u00ad': '', '\u200b': '', '\u00a0': ' ',
+        '\ufb01': 'fi', '\ufb02': 'fl',
+        '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
+        '\u2019': "'", '\u2018': "'",
+        '\u201c': '"', '\u201d': '"',
+        '\u2013': '-', '\u2014': '-',
+        '\u2022': '-', '\u00b7': '-',
+    })
+    cleaned = s.translate(char_map)
+    cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', cleaned)
+    cleaned = unicodedata.normalize('NFC', cleaned)
+    return re.sub(r'\s+', ' ', cleaned).strip()
+
+# Load OWASP PDFs
+OWASP_PDFS = {
+    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
+    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
+    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
+    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
+    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
+}
+
+# source name → normalized full text. Sources whose PDF file is missing have
+# no entry here.
+pdf_norms = {}
+for name, filename in OWASP_PDFS.items():
+    path = os.path.join(PDF_DIR, filename)
+    if not os.path.exists(path):
+        # Lookups against a source without text never match, which changes
+        # which controls are marked or re-attributed — so say so explicitly.
+        print(f"WARNING: {path} not found, {name} will not be searched")
+        continue
+    doc = fitz.open(path)
+    try:
+        text = "".join(page.get_text() + "\n" for page in doc)
+    finally:
+        # Release the document even if text extraction fails on some page.
+        doc.close()
+    pdf_norms[name] = normalize(text)
+
+def build_owasp_index(text_norm, source_name):
+    """Locate the first occurrence of every heading label in a normalized text.
+
+    The label pattern is chosen from ``source_name``:
+    Top 10 (non-API) → ``A01:2021`` style, API → ``API1:2023`` style,
+    MASVS → ``MASVS-XXX-1`` style, ASVS → ``V1.2.3`` style. Any other source
+    (e.g. SAMM) has no pattern and yields an empty index.
+
+    Args:
+        text_norm: Normalized document text to scan.
+        source_name: Source name used to pick the pattern.
+
+    Returns:
+        List of ``(position, label, type)`` tuples ordered by position, one
+        per distinct label, where type is ``"category"`` or ``"requirement"``.
+    """
+    if "Top 10" in source_name and "API" not in source_name:
+        pattern, typ = r'(A\d{2}:\d{4})', "category"
+    elif "API" in source_name:
+        pattern, typ = r'(API\d+:\d{4})', "category"
+    elif "MASVS" in source_name:
+        # Must be tested before "ASVS": "MASVS" contains "ASVS" as a substring,
+        # so the other order would index MASVS with the ASVS pattern.
+        pattern, typ = r'(MASVS-[A-Z]+-\d+)', "requirement"
+    elif "ASVS" in source_name:
+        pattern, typ = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
+    else:
+        return []
+
+    seen = set()
+    unique = []
+    # finditer yields matches in ascending position, so the result is sorted.
+    for m in re.finditer(pattern, text_norm):
+        label = m.group(1)
+        if label not in seen:
+            seen.add(label)
+            unique.append((m.start(), label, typ))
+    return unique
+
+# source name → heading index built from that source's normalized text
+pdf_indexes = {
+    src_name: build_owasp_index(src_text, src_name)
+    for src_name, src_text in pdf_norms.items()
+}
+
+def find_in_pdf(orig_text, source_name):
+    """Look up a control text in one source's PDF text.
+
+    Snippets of the normalized control text are probed at several start
+    offsets (25%, 10%, 50%, 0%, 75% of its length) and several lengths
+    (80 down to 20 characters); the first snippet found verbatim in the
+    PDF text decides the result.
+
+    Returns:
+        ``None`` if the source has no loaded text, the normalized control
+        text is shorter than 20 characters, or no snippet is found.
+        Otherwise ``(label, type)`` of the last indexed heading at or before
+        the match, or ``("Unknown", "unknown")`` when no heading precedes it.
+    """
+    haystack = pdf_norms.get(source_name)
+    if not haystack:
+        return None
+    needle = normalize(orig_text)
+    if len(needle) < 20:
+        return None
+    headings = pdf_indexes.get(source_name, [])
+
+    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
+        offset = max(0, int(len(needle) * frac))
+        for size in (80, 60, 40, 30, 20):
+            probe = needle[offset:offset + size]
+            # Probes cut short by the end of the text are too unspecific.
+            if len(probe) < 15:
+                continue
+            hit = haystack.find(probe)
+            if hit < 0:
+                continue
+            preceding = [(h_label, h_type) for h_pos, h_label, h_type in headings if h_pos <= hit]
+            return preceding[-1] if preceding else ("Unknown", "unknown")
+    return None
+
+# DB
+db_url = os.environ['DATABASE_URL']
+parsed = urllib.parse.urlparse(db_url)
+conn = psycopg2.connect(
+ host=parsed.hostname, port=parsed.port or 5432,
+ user=parsed.username, password=parsed.password,
+ dbname=parsed.path.lstrip('/'),
+ options="-c search_path=compliance,public"
+)
+cur = conn.cursor()
+
+# ═══════════════════════════════════════════════════════════════
+# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
+# ═══════════════════════════════════════════════════════════════
+print("=" * 60)
+print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
+print("=" * 60)
+
+cur.execute("""
+ SELECT id, control_id, title, source_original_text, release_state
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
+ AND source_citation->>'article_type' IS NULL
+ AND source_original_text IS NOT NULL
+ AND release_state NOT IN ('duplicate', 'too_close')
+ ORDER BY control_id
+""")
+top10_unmatched = cur.fetchall()
+print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")
+
+# Separate: found in other OWASP PDF vs not found anywhere
+other_sources = [src for src in OWASP_PDFS if src != 'OWASP Top 10 (2021)']
+to_mark_dup = []
+to_fix_source = []
+
+for uid, cid, title, text, state in top10_unmatched:
+    # The first other OWASP PDF containing the text wins.
+    found_in = None
+    found_result = None
+    for other_src in other_sources:
+        result = find_in_pdf(text, other_src)
+        if result:
+            found_in = other_src
+            found_result = result
+            break
+
+    if found_in:
+        to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1]))
+    else:
+        to_mark_dup.append((uid, cid))
+
+# Safety guard: "not found in any PDF" only means something if at least one
+# comparison PDF was actually loaded. Without any, every control would land in
+# to_mark_dup and be flagged as duplicate for the wrong reason.
+if to_mark_dup and not any(src in pdf_norms for src in other_sources):
+    print(" WARNING: no other OWASP PDF loaded — skipping duplicate marking")
+    to_mark_dup = []
+
+print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
+print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")
+
+# Mark as duplicate; rows already in a terminal state are left alone and not counted.
+dup_marked = 0
+for uid, cid in to_mark_dup:
+    cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET release_state = 'duplicate'
+ WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
+ """, (uid,))
+    if cur.rowcount > 0:
+        dup_marked += 1
+
+print(f" Marked as duplicate: {dup_marked}")
+
+# ═══════════════════════════════════════════════════════════════
+# STEP 2: Fix wrong source attributions across ALL OWASP sources
+# ═══════════════════════════════════════════════════════════════
+# For every non-Top-10 OWASP source, controls without an article_type are
+# searched in their own PDF first (then only article/article_type are set)
+# and otherwise in the other OWASP PDFs (then the source is corrected too).
+print(f"\n{'='*60}")
+print("STEP 2: Fix wrong OWASP source attributions")
+print("=" * 60)
+
+all_fixes = list(to_fix_source) # Start with Top 10 fixes
+
+# Also check ASVS, SAMM, API Security Top 10 and MASVS
+for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
+ cur.execute("""
+ SELECT id, control_id, title, source_original_text
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' = %s
+ AND source_citation->>'article_type' IS NULL
+ AND source_original_text IS NOT NULL
+ AND release_state NOT IN ('duplicate', 'too_close')
+ """, (source,))
+ controls = cur.fetchall()
+
+ for ctrl in controls:
+ uid, cid, title, text = ctrl
+ # Try own PDF first
+ result = find_in_pdf(text, source)
+ if result:
+ # Found in own PDF! Update article info
+ # The jsonb || operator merges the new keys into source_citation; the
+ # IS DISTINCT FROM conditions skip rows that already hold these values.
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET source_citation = source_citation ||
+ jsonb_build_object('article', %s, 'article_type', %s)
+ WHERE id = %s
+ AND (source_citation->>'article' IS DISTINCT FROM %s
+ OR source_citation->>'article_type' IS DISTINCT FROM %s)
+ """, (result[0], result[1], uid, result[0], result[1]))
+ continue
+
+ # Try other OWASP PDFs
+ # The first other source whose PDF contains the text is taken.
+ for other_src in OWASP_PDFS:
+ if other_src == source:
+ continue
+ result = find_in_pdf(text, other_src)
+ if result:
+ all_fixes.append((uid, cid, other_src, result[0], result[1]))
+ break
+
+print(f" Total wrong-source controls found: {len(all_fixes)}")
+
+# Apply source fixes
+# Each fix overwrites source, article and article_type of one control.
+fixed = 0
+for uid, cid, correct_source, label, typ in all_fixes:
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET source_citation = source_citation ||
+ jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
+ WHERE id = %s
+ """, (correct_source, label, typ, uid,))
+ if cur.rowcount > 0:
+ fixed += 1
+ # NOTE(review): indentation is lost in this patch — confirm whether the
+ # per-control line below is printed only for rows that were updated.
+ print(f" {cid:10s} → {correct_source} / {label} [{typ}]")
+
+print(f" Fixed: {fixed} controls")
+
+# Step 1 and Step 2 changes are committed together here.
+conn.commit()
+
+# ═══════════════════════════════════════════════════════════════
+# SUMMARY
+# ═══════════════════════════════════════════════════════════════
+print(f"\n{'='*60}")
+print("ZUSAMMENFASSUNG")
+print("=" * 60)
+print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
+print(f" Wrong source attribution → fixed: {fixed}")
+
+# Final counts
+cur.execute("""
+ SELECT release_state, count(*)
+ FROM compliance.canonical_controls
+ GROUP BY release_state
+ ORDER BY count(*) DESC
+""")
+print(f"\n DB release_state nach Cleanup:")
+for row in cur.fetchall():
+ print(f" {row[0]:15s}: {row[1]:5d}")
+
+cur.execute("""
+ SELECT count(*)
+ FROM compliance.canonical_controls
+ WHERE release_state NOT IN ('duplicate', 'too_close')
+""")
+active = cur.fetchone()[0]
+print(f"\n Aktive Controls: {active}")
+
+conn.close()
diff --git a/scripts/qa/owasp_github_match.py b/scripts/qa/owasp_github_match.py
new file mode 100644
index 0000000..16e7b71
--- /dev/null
+++ b/scripts/qa/owasp_github_match.py
@@ -0,0 +1,316 @@
+"""Match unmatched OWASP ASVS/SAMM/MASVS controls against GitHub Markdown sources."""
+import os
+import re
+import unicodedata
+import psycopg2
+import urllib.parse
+from pathlib import Path
+
+GITHUB_DIR = Path(os.path.expanduser("~/rag-ingestion/owasp-github"))
+
+# Character-level substitutions applied by normalize() before its regex passes.
+_NORMALIZE_TABLE = str.maketrans({
+    '\u00ad': None,                    # soft hyphen
+    '\u200b': None,                    # zero-width space
+    '\u00a0': ' ',                     # no-break space
+    '\ufb00': 'ff', '\ufb01': 'fi', '\ufb02': 'fl',
+    '\ufb03': 'ffi', '\ufb04': 'ffl',  # typographic ligatures
+    '\u2018': "'", '\u2019': "'",      # curly single quotes
+    '\u201c': '"', '\u201d': '"',      # curly double quotes
+    '\u2013': '-', '\u2014': '-',      # en / em dash
+    '\u2022': '-', '\u00b7': '-',      # bullet / middle dot
+})
+
+
+def normalize(s):
+    """Canonicalise text so control text and source text can be compared by substring.
+
+    Strips soft hyphens and zero-width spaces, expands ligatures, maps
+    typographic quotes, dashes and bullets to ASCII, removes control
+    characters (tab, newline and carriage return are kept for the whitespace
+    pass), applies NFC, collapses every whitespace run to one space and trims
+    the ends.
+    """
+    cleaned = s.translate(_NORMALIZE_TABLE)
+    cleaned = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', cleaned)
+    cleaned = unicodedata.normalize('NFC', cleaned)
+    return re.sub(r'\s+', ' ', cleaned).strip()
+
+# ── Load Markdown sources ──
+def load_markdown_dir(path, pattern="*.md"):
+    """Read the files directly inside ``path`` that match ``pattern``.
+
+    Args:
+        path: ``pathlib.Path`` of the directory to scan (not recursive).
+        pattern: glob pattern passed to ``Path.glob``.
+
+    Returns:
+        dict mapping each file's name to its text, inserted in sorted path
+        order. Undecodable bytes are replaced rather than raising.
+    """
+    texts = {}
+    for f in sorted(path.glob(pattern)):
+        try:
+            texts[f.name] = f.read_text(encoding='utf-8', errors='replace')
+        except OSError as exc:
+            # Loading stays best-effort, but only I/O failures are skipped and
+            # they are reported; a bare except would also hide Ctrl-C and bugs.
+            print(f"  WARN: could not read {f}: {exc}")
+    return texts
+
+def _read_tree(root, patterns):
+    """Recursively read files under ``root`` for each glob in ``patterns``.
+
+    Returns a dict mapping the path relative to ``root`` to the file text.
+    Patterns are processed in the given order, so a later pattern's files
+    follow an earlier pattern's files in the dict. Unreadable entries are
+    reported and skipped.
+    """
+    texts = {}
+    for pattern in patterns:
+        for f in root.rglob(pattern):
+            try:
+                texts[str(f.relative_to(root))] = f.read_text(encoding='utf-8', errors='replace')
+            except OSError as exc:
+                # Best-effort load: skip only I/O failures, and say so.
+                print(f"  WARN: could not read {f}: {exc}")
+    return texts
+
+
+# ASVS 4.0 — V-files contain requirements
+asvs_dir = GITHUB_DIR / "ASVS" / "4.0" / "en"
+asvs_files = load_markdown_dir(asvs_dir)
+asvs_full = "\n".join(asvs_files.values())
+asvs_norm = normalize(asvs_full)
+print(f"ASVS 4.0 Markdown: {len(asvs_files)} files, {len(asvs_full):,} chars")
+
+# SAMM core — YAML + Markdown
+samm_dir = GITHUB_DIR / "samm-core"
+samm_texts = _read_tree(samm_dir, ["*.yml", "*.md"])
+samm_full = "\n".join(samm_texts.values())
+samm_norm = normalize(samm_full)
+print(f"SAMM 2.0 source: {len(samm_texts)} files, {len(samm_full):,} chars")
+
+# MASVS — control markdown files
+masvs_dir = GITHUB_DIR / "masvs"
+masvs_files = _read_tree(masvs_dir, ["*.md"])
+masvs_full = "\n".join(masvs_files.values())
+masvs_norm = normalize(masvs_full)
+print(f"MASVS 2.0 source: {len(masvs_files)} files, {len(masvs_full):,} chars")
+
+# API Security
+api_dir = GITHUB_DIR / "api-security"
+api_files = _read_tree(api_dir, ["*.md"])
+api_full = "\n".join(api_files.values())
+api_norm = normalize(api_full)
+print(f"API Security source: {len(api_files)} files, {len(api_full):,} chars")
+
+# Source name → normalized full text of that source's GitHub files
+SOURCE_GITHUB = {
+    "OWASP ASVS 4.0": asvs_norm,
+    "OWASP SAMM 2.0": samm_norm,
+    "OWASP MASVS 2.0": masvs_norm,
+    "OWASP API Security Top 10 (2023)": api_norm,
+}
+
+# Build indexes for each source
+def build_asvs_index(text):
+    """Index ASVS identifiers such as ``V1.2`` or ``V1.2.3`` found in ``text``.
+
+    Returns ``(position, label, "requirement")`` tuples in ascending position,
+    keeping only the first occurrence of each label.
+    """
+    index = []
+    known = set()
+    for hit in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text):
+        label = hit.group(1)
+        if label in known:
+            continue
+        known.add(label)
+        index.append((hit.start(), label, "requirement"))
+    return index
+
+def build_samm_index(text):
+    """Index SAMM headings found in ``text``.
+
+    Two kinds of anchors are collected, both typed ``"section"``:
+    numbered headings (labelled ``Section <number>``) and known practice names
+    followed by up to 30 more characters on the same sentence/line (label
+    truncated to 50 characters). The result is ordered by position and keeps
+    only the first occurrence of each label.
+    """
+    # SAMM practices have names like "Strategy & Metrics", sections numbered
+    numbered = [
+        (hit.start(), f"Section {hit.group(1)}", "section")
+        for hit in re.finditer(r'(?:^|\s)(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text)
+    ]
+    # Also find practice identifiers
+    practices = [
+        (hit.start(), hit.group(1)[:50], "section")
+        for hit in re.finditer(
+            r'((?:Strategy|Education|Policy|Threat|Security Requirements|Secure Architecture|'
+            r'Secure Build|Secure Deployment|Defect Management|Environment Management|'
+            r'Incident Management|Requirements Testing|Security Testing|'
+            r'Design Review|Implementation Review|Operations Management)'
+            r'[^.\n]{0,30})', text)
+    ]
+    index = []
+    taken = set()
+    # sorted() is stable, so entries at equal positions keep numbered-first order.
+    for entry in sorted(numbered + practices, key=lambda e: e[0]):
+        if entry[1] in taken:
+            continue
+        taken.add(entry[1])
+        index.append(entry)
+    return index
+
+def build_masvs_index(text):
+    """Index MASVS control identifiers (``MASVS-<GROUP>-<n>``) found in ``text``.
+
+    Returns ``(position, label, "requirement")`` tuples ordered by position,
+    one per distinct label (first occurrence wins).
+    """
+    first_seen = {}
+    for hit in re.finditer(r'(MASVS-[A-Z]+-\d+)', text):
+        # setdefault keeps the earliest position; dicts preserve insertion order.
+        first_seen.setdefault(hit.group(1), hit.start())
+    return [(pos, label, "requirement") for label, pos in first_seen.items()]
+
+def build_api_index(text):
+    """Index API Security Top 10 identifiers (``API<n>:<yyyy>``) found in ``text``.
+
+    Returns ``(position, label, "category")`` tuples ordered by position,
+    keeping the first occurrence of each label.
+    """
+    index = []
+    labels = set()
+    for hit in re.finditer(r'(API\d+:\d{4})', text):
+        label = hit.group(1)
+        if label not in labels:
+            labels.add(label)
+            index.append((hit.start(), label, "category"))
+    return index
+
+# Source name → function that extracts (position, label, type) anchors from
+# that source's normalized text.
+SOURCE_INDEX_BUILDERS = {
+    "OWASP ASVS 4.0": build_asvs_index,
+    "OWASP SAMM 2.0": build_samm_index,
+    "OWASP MASVS 2.0": build_masvs_index,
+    "OWASP API Security Top 10 (2023)": build_api_index,
+}
+
+# Build all indexes on normalized text, so anchor positions are comparable
+# with offsets found in the same normalized text later on.
+source_indexes = {}
+for name in SOURCE_GITHUB:
+    idx = SOURCE_INDEX_BUILDERS[name](SOURCE_GITHUB[name])
+    source_indexes[name] = idx
+    print(f" {name}: {len(idx)} index entries")
+
+def find_text(orig_text, source_name):
+    """Locate a control's text inside one GitHub source.
+
+    Probes the normalized source text with snippets cut from the normalized
+    control text at several relative offsets and lengths (never shorter than
+    15 characters). On the first hit, the label and type of the last index
+    entry at or before the hit position are returned.
+
+    Returns:
+        ``(label, type)``; ``("Unknown", "unknown")`` when the hit precedes
+        every index entry; ``None`` when the source is unknown or empty, the
+        normalized control text is shorter than 20 characters, or no snippet
+        is found.
+    """
+    haystack = SOURCE_GITHUB.get(source_name)
+    if not haystack:
+        return None
+    headings = source_indexes.get(source_name, [])
+    needle = normalize(orig_text)
+    if len(needle) < 20:
+        return None
+
+    for start_frac in (0.25, 0.1, 0.5, 0.0, 0.75):
+        offset = max(0, int(len(needle) * start_frac))
+        for length in (80, 60, 40, 30, 20):
+            probe = needle[offset:offset + length]
+            if len(probe) < 15:
+                continue
+            hit = haystack.find(probe)
+            if hit < 0:
+                continue
+            # Walk the index backwards to find the closest preceding anchor.
+            for h_pos, h_label, h_type in reversed(headings):
+                if h_pos <= hit:
+                    return (h_label, h_type)
+            return ("Unknown", "unknown")
+    return None
+
+def find_in_any_github(orig_text, exclude_source=None):
+    """Search every GitHub source except ``exclude_source`` for ``orig_text``.
+
+    Sources are tried in ``SOURCE_GITHUB`` order. Returns
+    ``(source_name, label, type)`` for the first source where ``find_text``
+    succeeds, or ``None`` when none matches.
+    """
+    candidates = (name for name in SOURCE_GITHUB if name != exclude_source)
+    for name in candidates:
+        hit = find_text(orig_text, name)
+        if hit:
+            label, typ = hit
+            return (name, label, typ)
+    return None
+
+# ── DB ──
+# DATABASE_URL must be set (KeyError otherwise). urlparse() leaves the
+# username and password percent-encoded, so they are decoded here; without
+# that, credentials containing reserved characters (e.g. '@' written as %40)
+# are sent to the server in their encoded form and authentication fails.
+db_url = os.environ['DATABASE_URL']
+parsed = urllib.parse.urlparse(db_url)
+conn = psycopg2.connect(
+    host=parsed.hostname, port=parsed.port or 5432,
+    user=urllib.parse.unquote(parsed.username) if parsed.username else None,
+    password=urllib.parse.unquote(parsed.password) if parsed.password else None,
+    dbname=parsed.path.lstrip('/'),
+    options="-c search_path=compliance,public"
+)
+cur = conn.cursor()
+
+# ── Process each OWASP source ──
+# Running totals across all sources; all_updates collects one
+# (uid, cid, source, label, type) tuple per matched control and is written to
+# the DB later in the script.
+total_matched = 0
+total_cross = 0
+total_not_found = 0
+all_updates = []
+
+for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP MASVS 2.0', 'OWASP API Security Top 10 (2023)']:
+ # Controls attributed to this source that still have no article_type, have
+ # original text to search with, and are not marked duplicate/too_close.
+ cur.execute("""
+ SELECT id, control_id, title, source_original_text, release_state
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' = %s
+ AND source_citation->>'article_type' IS NULL
+ AND source_original_text IS NOT NULL
+ AND release_state NOT IN ('duplicate', 'too_close')
+ ORDER BY control_id
+ """, (source,))
+ controls = cur.fetchall()
+
+ if not controls:
+ continue
+
+ print(f"\n{'='*60}")
+ print(f"{source} — {len(controls)} unmatched active")
+ print(f"{'='*60}")
+
+ # Per-source counters, reported after the control loop.
+ matched = 0
+ cross_matched = 0
+ not_found = 0
+
+ for ctrl in controls:
+ uid, cid, title, text, state = ctrl
+
+ # Try own GitHub source
+ result = find_text(text, source)
+ if result:
+ matched += 1
+ total_matched += 1
+ all_updates.append((uid, cid, source, result[0], result[1]))
+ print(f" {cid:10s} → {result[0]:30s} [{result[1]}]")
+ continue
+
+ # Try other GitHub sources
+ # cross is (source_name, label, type); the matching source name is
+ # recorded in the update tuple instead of the control's current source.
+ cross = find_in_any_github(text, exclude_source=source)
+ if cross:
+ cross_matched += 1
+ total_cross += 1
+ all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
+ print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}] (CROSS)")
+ continue
+
+ not_found += 1
+ total_not_found += 1
+
+ print(f"\n Own source matched: {matched}")
+ print(f" Cross-source: {cross_matched}")
+ print(f" Not found: {not_found}")
+
+# ── Also try OWASP Top 10 remaining unmatched (34 active left after dup marking) ──
+# 'OWASP Top 10 (2021)' has no entry in SOURCE_GITHUB, so these controls can
+# only be matched against the other sources (find_in_any_github without an
+# exclusion); every hit therefore counts as a cross-source match.
+cur.execute("""
+ SELECT id, control_id, title, source_original_text, release_state
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
+ AND source_citation->>'article_type' IS NULL
+ AND source_original_text IS NOT NULL
+ AND release_state NOT IN ('duplicate', 'too_close')
+ ORDER BY control_id
+""")
+top10_remaining = cur.fetchall()
+if top10_remaining:
+ print(f"\n{'='*60}")
+ print(f"OWASP Top 10 (2021) — {len(top10_remaining)} remaining unmatched active")
+ print(f"{'='*60}")
+ for ctrl in top10_remaining:
+ uid, cid, title, text, state = ctrl
+ cross = find_in_any_github(text)
+ if cross:
+ total_cross += 1
+ all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
+ print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}]")
+ else:
+ total_not_found += 1
+
+# ── Summary ──
+# Totals over both passes above; nothing has been written to the DB yet.
+print(f"\n{'='*60}")
+print(f"ZUSAMMENFASSUNG")
+print(f"{'='*60}")
+print(f" Matched in eigener GitHub-Quelle: {total_matched}")
+print(f" Cross-source matched: {total_cross}")
+print(f" Nicht gefunden: {total_not_found}")
+print(f" Total Updates: {len(all_updates)}")
+
+# ── Apply updates ──
+# Each all_updates entry is (uid, cid, correct_source, label, typ), where
+# correct_source is the source the text was actually found in.
+if all_updates:
+    print(f"\nApplying {len(all_updates)} updates to DB...")
+    applied = 0
+    for uid, cid, correct_source, label, typ in all_updates:
+        # Update article + article_type, and fix source if cross-matched.
+        # 'source' has to be part of the merge: for a cross-source match the
+        # label comes from a different document than the control's current
+        # source, and writing the label alone would pair it with the wrong
+        # source. For own-source matches the value is unchanged.
+        cur.execute("""
+            UPDATE compliance.canonical_controls
+            SET source_citation = source_citation ||
+                jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
+            WHERE id = %s
+              AND (source_citation->>'source' IS DISTINCT FROM %s
+                   OR source_citation->>'article' IS DISTINCT FROM %s
+                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
+        """, (correct_source, label, typ, uid, correct_source, label, typ))
+        # The IS DISTINCT FROM guard makes rowcount count real changes only.
+        if cur.rowcount > 0:
+            applied += 1
+
+    conn.commit()
+    print(f" Applied: {applied} controls updated")
+
+    # Type distribution
+    type_counts = {}
+    for _, _, _, _, typ in all_updates:
+        type_counts[typ] = type_counts.get(typ, 0) + 1
+    print(f"\n Article type distribution:")
+    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
+        print(f" {t:12s}: {c:5d}")
+
+conn.close()
diff --git a/scripts/qa/phase5_normalize_and_cleanup.py b/scripts/qa/phase5_normalize_and_cleanup.py
new file mode 100644
index 0000000..bebd2fa
--- /dev/null
+++ b/scripts/qa/phase5_normalize_and_cleanup.py
@@ -0,0 +1,357 @@
+"""Phase 5: Source Normalization + Duplicate Hard Delete.
+
+Steps:
+ 1. OSCAL controls: add source_regulation to generation_metadata
+ 2. Flag 20 v3 controls with NULL source (release_state='needs_review', missing_source=true)
+ 3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
+ 4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
+ 5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
+ 6. Clean up canonical_processed_chunks generated_control_ids
+
+Usage:
+ export DATABASE_URL='postgresql://...'
+ python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
+"""
+import os
+import sys
+import json
+import psycopg2
+import urllib.parse
+
+def _parse_step(argv):
+    """Return the step selected with ``--step N`` or ``--step=N``, else None.
+
+    Exits with an error message when ``--step`` is present but has no value or
+    a non-integer value. Falling back to "run all steps" in that case would
+    also run the irreversible hard delete in step 5.
+    """
+    for pos, arg in enumerate(argv):
+        if arg == "--step":
+            value = argv[pos + 1] if pos + 1 < len(argv) else None
+        elif arg.startswith("--step="):
+            value = arg.split("=", 1)[1]
+        else:
+            continue
+        try:
+            return int(value)
+        except (TypeError, ValueError):
+            sys.exit(f"--step requires an integer step number, got {value!r}")
+    return None
+
+
+# --dry-run: report what would change; nothing is committed at the end.
+DRY_RUN = "--dry-run" in sys.argv
+# None means "run every step"; otherwise only the step with this number runs.
+STEP_ONLY = _parse_step(sys.argv)
+
+# DATABASE_URL must be set (KeyError otherwise). urlparse() leaves the
+# username and password percent-encoded, so they are decoded here; without
+# that, credentials containing reserved characters (e.g. '@' written as %40)
+# are sent to the server in their encoded form and authentication fails.
+db_url = os.environ['DATABASE_URL']
+parsed = urllib.parse.urlparse(db_url)
+conn = psycopg2.connect(
+    host=parsed.hostname, port=parsed.port or 5432,
+    user=urllib.parse.unquote(parsed.username) if parsed.username else None,
+    password=urllib.parse.unquote(parsed.password) if parsed.password else None,
+    dbname=parsed.path.lstrip('/'),
+    options="-c search_path=compliance,public"
+)
+cur = conn.cursor()
+
+def should_run(step):
+    """Tell whether ``step`` is selected: always when no ``--step`` filter is
+    set (``STEP_ONLY`` is None), otherwise only when it equals ``STEP_ONLY``."""
+    if STEP_ONLY is None:
+        return True
+    return STEP_ONLY == step
+
+
+# ══════════════════════════════════════════════════════════════════
+# Step 1: OSCAL controls — add source_regulation to generation_metadata
+# ══════════════════════════════════════════════════════════════════
+if should_run(1):
+ print("=" * 70)
+ print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
+ print("=" * 70)
+
+ # Count oscal_import controls whose source_regulation is missing or empty;
+ # the UPDATE below uses the same predicate.
+ cur.execute("""
+ SELECT count(*)
+ FROM compliance.canonical_controls
+ WHERE generation_strategy = 'oscal_import'
+ AND (generation_metadata->>'source_regulation' IS NULL
+ OR generation_metadata->>'source_regulation' = '')
+ """)
+ count = cur.fetchone()[0]
+ print(f" OSCAL controls without source_regulation: {count}")
+
+ if count > 0:
+ if DRY_RUN:
+ print(f" [DRY RUN] Would update {count} controls")
+ else:
+ # COALESCE guards against NULL generation_metadata; every affected
+ # row gets the fixed value "nist_sp800_53r5".
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
+ || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
+ WHERE generation_strategy = 'oscal_import'
+ AND (generation_metadata->>'source_regulation' IS NULL
+ OR generation_metadata->>'source_regulation' = '')
+ """)
+ print(f" Updated: {cur.rowcount}")
+ print()
+
+
+# ══════════════════════════════════════════════════════════════════
+# Step 2: v3 controls with NULL source — mark needs_review + missing_source flag
+# ══════════════════════════════════════════════════════════════════
+if should_run(2):
+ print("=" * 70)
+ print("STEP 2: Fix v3 controls with NULL source")
+ print("=" * 70)
+
+ # These 20 controls are v3/document_grouped with no source or regulation.
+ # Based on title analysis, they cover:
+ # - Data protection/privacy topics (DSGVO-adjacent)
+ # - Software security (OWASP/NIST-adjacent)
+ # - Mobile security (OWASP MASVS-adjacent)
+ # Mark them as 'needs_review' and add a flag.
+ # No source is guessed or written here; the controls are only flagged.
+ cur.execute("""
+ SELECT id, control_id, title
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' IS NULL
+ AND pipeline_version = 3
+ AND release_state NOT IN ('duplicate', 'too_close')
+ """)
+ v3_null = cur.fetchall()
+ print(f" v3 controls with NULL source: {len(v3_null)}")
+
+ if v3_null:
+ if DRY_RUN:
+ print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review")
+ else:
+ # release_state is overwritten whatever its previous value was
+ # (apart from the duplicate/too_close rows excluded above).
+ for ctrl_id_uuid, control_id, title in v3_null:
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET release_state = 'needs_review',
+ generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
+ || '{"missing_source": true}'::jsonb
+ WHERE id = %s
+ """, (ctrl_id_uuid,))
+ print(f" Marked {len(v3_null)} as needs_review with missing_source flag")
+ print()
+
+
+# ══════════════════════════════════════════════════════════════════
+# Step 3: Fix empty-string source (DATA-631)
+# ══════════════════════════════════════════════════════════════════
+if should_run(3):
+ print("=" * 70)
+ print("STEP 3: Fix empty-string source")
+ print("=" * 70)
+
+ # Active controls whose citation source is the empty string (not NULL),
+ # together with the regulation code stored in generation_metadata.
+ cur.execute("""
+ SELECT id, control_id, title,
+ generation_metadata->>'source_regulation' as reg
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'source' = ''
+ AND release_state NOT IN ('duplicate', 'too_close')
+ """)
+ empty_src = cur.fetchall()
+ print(f" Controls with empty source: {len(empty_src)}")
+
+ for ctrl_id_uuid, control_id, title, reg in empty_src:
+ # NOTE(review): title[:60] assumes title is never NULL — confirm.
+ print(f" {control_id} | reg={reg} | {title[:60]}")
+ # Only 'at_tkg' has a known source name; any other regulation code is
+ # recorded as "Unbekannt (<code>)".
+ if reg == 'at_tkg':
+ new_source = 'Telekommunikationsgesetz Oesterreich'
+ else:
+ new_source = f"Unbekannt ({reg})"
+
+ if DRY_RUN:
+ print(f" [DRY RUN] Would set source='{new_source}'")
+ else:
+ # json.dumps turns the name into a JSON string literal, which the
+ # %s::jsonb cast requires.
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET source_citation = jsonb_set(
+ source_citation, '{source}', %s::jsonb
+ )
+ WHERE id = %s
+ """, (json.dumps(new_source), ctrl_id_uuid))
+ print(f" Set source='{new_source}'")
+ print()
+
+
+# ══════════════════════════════════════════════════════════════════
+# Step 4: Fix OWASP cross-source misattributions
+# ══════════════════════════════════════════════════════════════════
+if should_run(4):
+ print("=" * 70)
+ print("STEP 4: Fix OWASP cross-source misattributions")
+ print("=" * 70)
+
+ # Controls where source_citation.source doesn't match the regulation_code
+ # Regulation code → the citation source name expected for that code.
+ OWASP_REG_TO_SOURCE = {
+ 'owasp_top10_2021': 'OWASP Top 10 (2021)',
+ 'owasp_asvs': 'OWASP ASVS 4.0',
+ 'owasp_masvs': 'OWASP MASVS 2.0',
+ 'owasp_samm': 'OWASP SAMM 2.0',
+ 'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
+ }
+
+ # Strategy: Move controls to the regulation_code that matches their actual source
+ # i.e., if a control has source='OWASP ASVS 4.0' but reg='owasp_top10_2021',
+ # update the reg to 'owasp_asvs'
+ # The citation source is treated as authoritative; only
+ # generation_metadata.source_regulation is rewritten.
+ SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()}
+
+ total_fixed = 0
+ for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
+ # `<>` is never true for a NULL source, so controls without a citation
+ # source are not reported as mismatches here.
+ cur.execute("""
+ SELECT id, control_id, source_citation->>'source' as src
+ FROM compliance.canonical_controls
+ WHERE generation_metadata->>'source_regulation' = %s
+ AND source_citation->>'source' <> %s
+ AND release_state NOT IN ('duplicate', 'too_close')
+ """, (reg_code, expected_source))
+ mismatches = cur.fetchall()
+
+ if mismatches:
+ print(f"\n {reg_code} → {len(mismatches)} Mismatches:")
+ for ctrl_id_uuid, control_id, actual_source in mismatches:
+ # Sources outside the five OWASP names above have no mapping and
+ # are only reported.
+ correct_reg = SOURCE_TO_REG.get(actual_source)
+ if correct_reg:
+ print(f" {control_id} | {actual_source} → reg={correct_reg}")
+ if not DRY_RUN:
+ cur.execute("""
+ UPDATE compliance.canonical_controls
+ SET generation_metadata = jsonb_set(
+ generation_metadata, '{source_regulation}', %s::jsonb
+ )
+ WHERE id = %s
+ """, (json.dumps(correct_reg), ctrl_id_uuid))
+ # NOTE(review): confirm this increment sits outside the
+ # `if not DRY_RUN` branch; otherwise the dry-run summary below
+ # always reports 0.
+ total_fixed += 1
+ else:
+ print(f" {control_id} | {actual_source} → no mapping found")
+
+ if DRY_RUN:
+ print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
+ else:
+ print(f"\n Fixed: {total_fixed} misattributions")
+ print()
+
+
+# ══════════════════════════════════════════════════════════════════
+# Step 5: Hard delete duplicate/too_close controls
+# ══════════════════════════════════════════════════════════════════
+if should_run(5):
+ print("=" * 70)
+ print("STEP 5: Hard delete duplicate/too_close controls")
+ print("=" * 70)
+
+ # Verify no FK references
+ # The table/column names interpolated into the SQL come only from this
+ # hard-coded list. Any remaining reference aborts the whole script with
+ # exit code 1; since the only commit in this script happens at the very
+ # end, changes made by earlier steps in the same run are then not committed
+ # (assuming autocommit is not enabled on the connection).
+ for table, col in [
+ ('canonical_control_mappings', 'control_id'),
+ ('obligation_extractions', 'control_uuid'),
+ ('crosswalk_matrix', 'master_control_uuid'),
+ ('obligation_candidates', 'parent_control_uuid'),
+ ]:
+ cur.execute(f"""
+ SELECT count(*)
+ FROM compliance.{table} t
+ JOIN compliance.canonical_controls cc ON cc.id = t.{col}
+ WHERE cc.release_state IN ('duplicate', 'too_close')
+ """)
+ fk_count = cur.fetchone()[0]
+ if fk_count > 0:
+ print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
+ print(f" ABORTING Step 5 — clean FK refs first!")
+ sys.exit(1)
+ else:
+ print(f" {table}.{col}: 0 refs ✓")
+
+ # Check self-references
+ # Counts every control whose parent is duplicate/too_close, including
+ # children that are themselves about to be deleted.
+ cur.execute("""
+ SELECT count(*)
+ FROM compliance.canonical_controls child
+ JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
+ WHERE parent.release_state IN ('duplicate', 'too_close')
+ """)
+ self_refs = cur.fetchone()[0]
+ if self_refs > 0:
+ print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
+ print(f" ABORTING Step 5!")
+ sys.exit(1)
+ print(f" Self-references: 0 ✓")
+
+ # Report how many rows each state contributes before deleting.
+ cur.execute("""
+ SELECT release_state, count(*)
+ FROM compliance.canonical_controls
+ WHERE release_state IN ('duplicate', 'too_close')
+ GROUP BY 1
+ """)
+ to_delete = {}
+ for state, cnt in cur.fetchall():
+ to_delete[state] = cnt
+ print(f"\n {state}: {cnt}")
+
+ total = sum(to_delete.values())
+ print(f"\n TOTAL to delete: {total}")
+
+ if DRY_RUN:
+ print(f" [DRY RUN] Would delete {total} controls")
+ else:
+ # Irreversible once the final commit runs.
+ cur.execute("""
+ DELETE FROM compliance.canonical_controls
+ WHERE release_state IN ('duplicate', 'too_close')
+ """)
+ print(f" Deleted: {cur.rowcount} controls")
+ print()
+
+
+# ══════════════════════════════════════════════════════════════════
+# Step 6: Clean up canonical_processed_chunks generated_control_ids
+# ══════════════════════════════════════════════════════════════════
+if should_run(6):
+    print("=" * 70)
+    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
+    print("=" * 70)
+
+    # In a dry run that also covers step 5 nothing was deleted, so there is
+    # nothing meaningful to clean yet.
+    if DRY_RUN and should_run(5):
+        print(" [DRY RUN] Skipping — depends on Step 5 deletion")
+    else:
+        # Find chunks that reference non-existent controls
+        cur.execute("""
+            SELECT id, generated_control_ids
+            FROM compliance.canonical_processed_chunks
+            WHERE generated_control_ids IS NOT NULL
+              AND generated_control_ids <> '[]'::jsonb
+        """)
+        chunks = cur.fetchall()
+        print(f" Chunks with generated_control_ids: {len(chunks)}")
+
+        # Get all existing control IDs
+        cur.execute("SELECT id::text FROM compliance.canonical_controls")
+        existing_ids = set(r[0] for r in cur.fetchall())
+        print(f" Existing controls: {len(existing_ids)}")
+
+        cleaned = 0
+        removed_ids = 0
+        for chunk_id, control_ids in chunks:
+            # The column may arrive as a JSON string or as an already-decoded
+            # list, depending on how the driver adapts it.
+            if isinstance(control_ids, str):
+                control_ids = json.loads(control_ids)
+            if not isinstance(control_ids, list):
+                continue
+            valid_ids = [cid for cid in control_ids if cid in existing_ids]
+            removed = len(control_ids) - len(valid_ids)
+            if removed == 0:
+                continue
+            cur.execute("""
+                UPDATE compliance.canonical_processed_chunks
+                SET generated_control_ids = %s::jsonb
+                WHERE id = %s
+            """, (json.dumps(valid_ids), chunk_id))
+            cleaned += 1
+            removed_ids += removed
+
+        print(f" Chunks cleaned: {cleaned}")
+        print(f" Stale control IDs removed: {removed_ids}")
+    print()
+
+
+# ══════════════════════════════════════════════════════════════════
+# Final summary
+# ══════════════════════════════════════════════════════════════════
+# This is the only commit in the script: in a dry run nothing is committed,
+# and the counts below show the state visible to this connection.
+if not DRY_RUN:
+ conn.commit()
+ print("=" * 70)
+ print("COMMITTED. Final state:")
+ print("=" * 70)
+else:
+ print("=" * 70)
+ print("[DRY RUN] No changes committed. Current state:")
+ print("=" * 70)
+
+cur.execute("""
+ SELECT release_state, count(*)
+ FROM compliance.canonical_controls
+ GROUP BY 1
+ ORDER BY count(*) DESC
+""")
+# total counts every control; active excludes duplicate/too_close rows.
+total = 0
+active = 0
+for state, cnt in cur.fetchall():
+ total += cnt
+ if state not in ('duplicate', 'too_close'):
+ active += cnt
+ print(f" {state:15s}: {cnt:5d}")
+
+print(f"\n TOTAL: {total}")
+print(f" AKTIV: {active}")
+
+conn.close()
diff --git a/scripts/qa/phase74_generate_gap_controls.py b/scripts/qa/phase74_generate_gap_controls.py
new file mode 100644
index 0000000..d83b6e8
--- /dev/null
+++ b/scripts/qa/phase74_generate_gap_controls.py
@@ -0,0 +1,655 @@
+#!/usr/bin/env python3
+"""
+Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
+
+Reads gap_analysis_results.json, extracts article text from PDFs,
+calls Claude Sonnet to generate controls, inserts into DB.
+
+Usage:
+ python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
+ python3 phase74_generate_gap_controls.py # generate and insert
+ python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
+ python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
+"""
+import os
+import sys
+import json
+import re
+import time
+import hashlib
+import argparse
+import psycopg2
+import urllib.parse
+import requests
+from pathlib import Path
+from collections import Counter
+
+sys.path.insert(0, os.path.dirname(__file__))
+from pdf_qa_all import (
+ SOURCE_FILE_MAP, read_file, classify_doc, normalize,
+ build_eu_article_index, build_de_law_index, build_nist_index,
+ build_owasp_index, build_generic_index, MAX_ARTICLES,
+)
+
+# ── Config ──────────────────────────────────────────────────────────
+ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
+ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
+ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
+PIPELINE_VERSION = 5
+GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
+PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
+
+try:
+ import fitz
+except ImportError:
+ fitz = None
+
+# ── Source name → regulation_code reverse map ────────────────────────
+# Built from REGULATION_LICENSE_MAP in control_generator.py
+SOURCE_TO_REGCODE = {
+ "DSGVO (EU) 2016/679": "eu_2016_679",
+ "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
+ "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
+ "Cyber Resilience Act (CRA)": "eu_2024_2847",
+ "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
+ "EU Blue Guide 2022": "eu_blue_guide_2022",
+ "Markets in Crypto-Assets (MiCA)": "mica",
+ "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
+ "AML-Verordnung": "amlr",
+ "Data Governance Act (DGA)": "dga",
+ "Data Act": "data_act",
+ "GPSR (EU) 2023/988": "gpsr",
+ "IFRS-Übernahmeverordnung": "ifrs",
+ "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
+ "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
+ "NIST SP 800-63-3": "nist_sp800_63_3",
+ "NIST AI Risk Management Framework": "nist_ai_rmf",
+ "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
+ "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
+ "OWASP Top 10 (2021)": "owasp_top10",
+ "OWASP ASVS 4.0": "owasp_asvs",
+ "OWASP SAMM 2.0": "owasp_samm",
+ "OWASP API Security Top 10 (2023)": "owasp_api_top10",
+ "OWASP MASVS 2.0": "owasp_masvs",
+ "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
+ "ENISA Supply Chain Good Practices": "enisa_supply_chain",
+ "CISA Secure by Design": "cisa_sbd",
+ "Bundesdatenschutzgesetz (BDSG)": "bdsg",
+ "Gewerbeordnung (GewO)": "gewo",
+ "Handelsgesetzbuch (HGB)": "hgb",
+ "Abgabenordnung (AO)": "ao",
+ "OECD KI-Empfehlung": "oecd_ai_principles",
+}
+
+# License info per regulation code (from REGULATION_LICENSE_MAP)
+LICENSE_MAP = {
+ "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
+ "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
+ "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
+ "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
+ "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
+ "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
+ "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
+ "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
+ "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
+ "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
+ "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
+ "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
+ "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
+ "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
+ "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
+ "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
+ "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
+ "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
+ "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
+ "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
+ "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
+}
+
+# Domain detection keywords
+DOMAIN_KEYWORDS = {
+ "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
+ "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
+ "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
+ "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
+ "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
+ "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
+ "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
+ "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
+ "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
+ "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
+ "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
+ "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
+ "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
+}
+
+# ── Prompt (same as control_generator.py) ────────────────────────────
+
+SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
+als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
+Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
+
+APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
+ Verwende ["all"] wenn der Control branchenuebergreifend gilt.
+ Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
+ "Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
+ "Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
+ "Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
+ "Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
+ "Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
+ "Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
+ "Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
+ "Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
+ "Abfallwirtschaft", "Forschung"
+- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
+ Verwende ["all"] wenn keine Groessenbeschraenkung.
+ Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
+- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
+ {"requires_any": ["signal"], "description": "Erklaerung"}
+ Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
+ "processes_minors_data", "automated_decisions", "employee_monitoring",
+ "video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
+
+CATEGORY_LIST = [
+ "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
+ "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
+ "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
+ "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
+ "Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
+ "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
+ "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
+]
+CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
+
+
+def build_prompt(source_name, article_label, article_text, license_type):
+ return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
+Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
+
+WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
+Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
+
+Gib JSON zurück mit diesen Feldern:
+- title: Kurzer prägnanter Titel (max 100 Zeichen)
+- objective: Was soll erreicht werden? (1-3 Sätze)
+- rationale: Warum ist das wichtig? (1-2 Sätze)
+- requirements: Liste von konkreten Anforderungen (Strings)
+- test_procedure: Liste von Prüfschritten (Strings)
+- evidence: Liste von Nachweisdokumenten (Strings)
+- severity: low/medium/high/critical
+- tags: Liste von Tags
+- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
+- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
+- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
+- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
+{APPLICABILITY_PROMPT}
+
+Text: {article_text[:3000]}
+Quelle: {source_name}, {article_label}"""
+
+
+# ── PDF article extraction ───────────────────────────────────────────
+
+def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
+    """Extract the text of a specific article/section from a source document.
+
+    Args:
+        pdf_file: File handed to read_file() when full_text is not supplied.
+        article_label: Label of the wanted article (e.g. "Artikel 10", "§ 42").
+        doc_type: "eu_regulation", "de_law", "nist"; anything else is generic.
+        full_text: Already extracted document text; skips re-reading the file.
+
+    Returns:
+        At most 3000 characters starting at the article heading, or "" when
+        the document is empty or the heading cannot be located.
+    """
+    if full_text is None:
+        full_text = read_file(pdf_file)
+    if not full_text:
+        return ""
+
+    if doc_type in ("eu_regulation", "de_law"):
+        # Both numbered formats work the same way; only the heading differs.
+        num_match = re.search(r'\d+', article_label)
+        if not num_match:
+            return ""
+        num = int(num_match.group())
+        if doc_type == "eu_regulation":
+            heading = r'\nArtikel\s+{}\s*\n'
+        else:
+            heading = r'\n§\s+{}\b'
+        match = re.search(heading.format(num), full_text)
+        if not match:
+            return ""
+        start = match.start()
+        # Look for the following heading only AFTER the current one. A hit
+        # earlier in the document (table of contents, cross-reference that
+        # wraps onto its own line) would put `end` before `start` and turn
+        # the slice below into an empty string.
+        next_heading = re.compile(heading.format(num + 1))
+        next_match = next_heading.search(full_text, start + 1)
+        # Without a following heading, fall back to a 5000-character window.
+        end = next_match.start() if next_match else min(start + 5000, len(full_text))
+        return full_text[start:end].strip()[:3000]
+
+    escaped = re.escape(article_label)
+    if doc_type == "nist":
+        # NIST labels are expected at the start of a line (after whitespace).
+        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
+    else:
+        # Generic / OWASP / ENISA: the label may appear anywhere on a line.
+        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
+    if not match:
+        return ""
+    start = match.start()
+    return full_text[start:start + 3000].strip()
+
+
+# ── Anthropic API ────────────────────────────────────────────────────
+
+def call_anthropic(prompt, system_prompt):
+ """Call Anthropic API. Returns (parsed_data, raw_text, usage, error)."""
+ headers = {
+ "x-api-key": ANTHROPIC_API_KEY,
+ "anthropic-version": "2023-06-01",
+ "content-type": "application/json",
+ }
+ payload = {
+ "model": ANTHROPIC_MODEL,
+ "max_tokens": 4096,
+ "system": system_prompt,
+ "messages": [{"role": "user", "content": prompt}],
+ }
+
+ try:
+ resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=120)
+ if resp.status_code != 200:
+ return None, "", {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
+ data = resp.json()
+ content = data["content"][0]["text"] if data.get("content") else ""
+ usage = data.get("usage", {})
+ parsed = parse_json(content)
+ return parsed, content, usage, None
+ except Exception as e:
+ return None, "", {}, str(e)
+
+
+def parse_json(text):
+    """Parse one control dict from an LLM response, handling markdown fences.
+
+    A JSON array yields its first object; anything that is not a JSON object
+    (or cannot be decoded at all) yields None, so callers can rely on .get()."""
+    text = text.strip()
+    if text.startswith("```"):
+        lines = text.split("\n")
+        text = "\n".join(lines[1:-1] if lines[-1].strip().startswith("```") else lines[1:]).strip()
+    try:
+        data = json.loads(text)
+    except json.JSONDecodeError:
+        # Prose around the payload: retry on the outermost {...} span.
+        match = re.search(r'\{[\s\S]*\}', text)
+        try:
+            data = json.loads(match.group()) if match else None
+        except json.JSONDecodeError:
+            return None
+    if isinstance(data, list):
+        data = next((item for item in data if isinstance(item, dict)), None)
+    return data if isinstance(data, dict) else None
+
+
+# ── Domain detection ─────────────────────────────────────────────────
+
+def detect_domain(text):
+    """Return the DOMAIN_KEYWORDS key with the most keyword hits in text ("SEC" if none)."""
+    haystack = text.lower()
+    best_domain, best_hits = "SEC", 0
+    for candidate, keywords in DOMAIN_KEYWORDS.items():
+        hits = len([kw for kw in keywords if kw in haystack])
+        # Strict ">" keeps the earliest domain on a tie, like max() over an ordered dict.
+        if hits > best_hits:
+            best_domain, best_hits = candidate, hits
+    return best_domain
+
+
+# ── Control ID generation ────────────────────────────────────────────
+
+def generate_control_id(domain, cur):
+    """Return the next free control_id for a domain prefix (e.g. "AUTH-042").
+
+    Takes MAX over the numeric suffix rather than the text value, because a
+    text sort ranks COMP-99 above COMP-1000. The number is zero-padded to
+    three digits so follow-up IDs match the "-001" used for a new prefix.
+    """
+    prefix = domain.upper()[:4]
+    cur.execute("""
+        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
+        FROM compliance.canonical_controls
+        WHERE control_id LIKE %s
+          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
+    """, (f"{prefix}-%",))
+    row = cur.fetchone()
+    highest = row[0] if row and row[0] is not None else 0
+    return f"{prefix}-{highest + 1:03d}"
+
+
+# ── Main ─────────────────────────────────────────────────────────────
+
+def main():
+ parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
+ parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
+ parser.add_argument("--source", type=str, help="Filter by source name substring")
+ parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
+ parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
+ args = parser.parse_args()
+
+ if not ANTHROPIC_API_KEY:
+ print("ERROR: Set ANTHROPIC_API_KEY")
+ sys.exit(1)
+
+ # Load gap results
+ with open(args.results) as f:
+ gaps = json.load(f)
+ total_gaps = sum(len(g["gap_articles"]) for g in gaps)
+ print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")
+
+ if args.source:
+ gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
+ total_gaps = sum(len(g["gap_articles"]) for g in gaps)
+ print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")
+
+ # DB connection with keepalive + reconnect helper
+ db_url = os.environ['DATABASE_URL']
+ parsed = urllib.parse.urlparse(db_url)
+
+ def connect_db():
+ """Create DB connection with TCP keepalive."""
+ c = psycopg2.connect(
+ host=parsed.hostname, port=parsed.port or 5432,
+ user=parsed.username, password=parsed.password,
+ dbname=parsed.path.lstrip('/'),
+ options="-c search_path=compliance,public",
+ keepalives=1, keepalives_idle=30,
+ keepalives_interval=10, keepalives_count=5,
+ )
+ return c, c.cursor()
+
+ conn, cur = connect_db()
+
+ def ensure_db():
+ """Reconnect if connection is dead."""
+ nonlocal conn, cur
+ try:
+ cur.execute("SELECT 1")
+ except Exception:
+ print(" [RECONNECT] DB connection lost, reconnecting...")
+ try:
+ conn.close()
+ except Exception:
+ pass
+ conn, cur = connect_db()
+ return True
+ return False
+
+ # Get framework UUID
+ cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
+ fw_row = cur.fetchone()
+ if not fw_row:
+ print("ERROR: Framework bp_security_v1 not found")
+ sys.exit(1)
+ framework_uuid = fw_row[0]
+
+ # If resuming, load existing articles per source
+ existing_articles = {}
+ if args.resume:
+ cur.execute("""
+ SELECT source_citation->>'source', source_citation->>'article'
+ FROM compliance.canonical_controls
+ WHERE source_citation->>'article' IS NOT NULL
+ """)
+ for src, art in cur.fetchall():
+ existing_articles.setdefault(src, set()).add(art)
+ print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")
+
+ # Stats
+ stats = Counter()
+ total_input_tokens = 0
+ total_output_tokens = 0
+ generated_ids = []
+ errors = []
+ t_start = time.time()
+
+ # Pre-read PDFs (cache full text per source)
+ pdf_cache = {}
+
+ for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
+ source_name = gap_source["source"]
+ gap_articles = gap_source["gap_articles"]
+ filename = SOURCE_FILE_MAP.get(source_name)
+ reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
+ license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
+ doc_type = classify_doc(source_name)
+
+ if not filename:
+ stats["skipped_no_pdf"] += len(gap_articles)
+ continue
+
+ # Read PDF once per source
+ if source_name not in pdf_cache:
+ pdf_cache[source_name] = read_file(filename)
+ full_text = pdf_cache[source_name]
+ if not full_text:
+ stats["skipped_no_pdf"] += len(gap_articles)
+ continue
+
+ print(f"\n{'='*70}")
+ print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
+ print(f"{'='*70}")
+
+ for gap in gap_articles:
+ article_label = gap["label"]
+ article_type = gap["type"]
+
+ # Skip if already has controls (resume mode)
+ if args.resume and article_label in existing_articles.get(source_name, set()):
+ stats["skipped_exists"] += 1
+ continue
+
+ # Skip non-substantive NIST sections (intro chapters)
+ if doc_type == "nist" and article_type == "section":
+ section_match = re.match(r'Section (\d+)', article_label)
+ if section_match and int(section_match.group(1)) <= 3:
+ stats["skipped_intro"] += 1
+ continue
+
+ # Extract article text
+ article_text = extract_article_text(filename, article_label, doc_type, full_text)
+ if not article_text or len(article_text) < 30:
+ stats["skipped_short_text"] += 1
+ print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
+ continue
+
+ if args.dry_run:
+ print(f" [DRY] {article_label} ({len(article_text)} chars)")
+ stats["would_generate"] += 1
+ continue
+
+ # Call Anthropic
+ prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
+ data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)
+
+ total_input_tokens += usage.get("input_tokens", 0)
+ total_output_tokens += usage.get("output_tokens", 0)
+
+ if error:
+ stats["api_error"] += 1
+ errors.append(f"{source_name} {article_label}: {error}")
+ print(f" ERROR {article_label}: {error}")
+ time.sleep(5)
+ continue
+
+ if not data:
+ stats["parse_error"] += 1
+ print(f" PARSE ERROR {article_label}")
+ continue
+
+ # Ensure DB is alive before writing
+ ensure_db()
+
+ # Build control
+ title = str(data.get("title", ""))[:200]
+ objective = str(data.get("objective", ""))
+ rationale = str(data.get("rationale", ""))
+ domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
+ if not domain or len(domain) < 2:
+ domain = detect_domain(article_text)
+
+ control_id = generate_control_id(domain, cur)
+ severity = str(data.get("severity", "medium")).lower()
+ if severity not in ("low", "medium", "high", "critical"):
+ severity = "medium"
+
+ requirements = data.get("requirements", [])
+ if not isinstance(requirements, list):
+ requirements = [str(requirements)]
+ test_procedure = data.get("test_procedure", [])
+ if not isinstance(test_procedure, list):
+ test_procedure = [str(test_procedure)]
+ evidence = data.get("evidence", [])
+ if not isinstance(evidence, list):
+ evidence = [str(evidence)]
+ tags = data.get("tags", [])
+ if not isinstance(tags, list):
+ tags = []
+ target_audience = data.get("target_audience", [])
+ if not isinstance(target_audience, list):
+ target_audience = []
+ applicable_industries = data.get("applicable_industries", ["all"])
+ if not isinstance(applicable_industries, list):
+ applicable_industries = ["all"]
+ applicable_company_size = data.get("applicable_company_size", ["all"])
+ if not isinstance(applicable_company_size, list):
+ applicable_company_size = ["all"]
+ scope_conditions = data.get("scope_conditions")
+
+ source_citation = {
+ "source": source_name,
+ "article": data.get("source_article", article_label),
+ "paragraph": data.get("source_paragraph", ""),
+ "article_type": article_type,
+ "license": license_info["license"],
+ "source_type": license_info["source_type"],
+ }
+
+ generation_metadata = {
+ "processing_path": "phase74_gap_fill",
+ "license_rule": license_info["rule"],
+ "source_regulation": reg_code,
+ "source_article": article_label,
+ "gap_fill": True,
+ }
+
+ category = str(data.get("category", "")) or None
+
+ # Insert into DB
+ try:
+ cur.execute("""
+ INSERT INTO compliance.canonical_controls (
+ framework_id, control_id, title, objective, rationale,
+ scope, requirements, test_procedure, evidence,
+ severity, risk_score, implementation_effort,
+ open_anchors, release_state, tags,
+ license_rule, source_original_text, source_citation,
+ customer_visible, generation_metadata,
+ verification_method, category, generation_strategy,
+ target_audience, pipeline_version,
+ applicable_industries, applicable_company_size, scope_conditions
+ ) VALUES (
+ %s, %s, %s, %s, %s,
+ %s, %s, %s, %s,
+ %s, %s, %s,
+ %s, %s, %s,
+ %s, %s, %s,
+ %s, %s,
+ %s, %s, %s,
+ %s, %s,
+ %s, %s, %s
+ )
+ ON CONFLICT (framework_id, control_id) DO NOTHING
+ RETURNING id
+ """, (
+ framework_uuid, control_id, title, objective, rationale,
+ json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
+ severity, 5, "m",
+ json.dumps([]), "draft", json.dumps(tags),
+ license_info["rule"], article_text, json.dumps(source_citation),
+ True, json.dumps(generation_metadata),
+ "document", category, "phase74_gap_fill",
+ json.dumps(target_audience), PIPELINE_VERSION,
+ json.dumps(applicable_industries), json.dumps(applicable_company_size),
+ json.dumps(scope_conditions) if scope_conditions else None,
+ ))
+ conn.commit()
+ row = cur.fetchone()
+ if row:
+ generated_ids.append(str(row[0]))
+ stats["generated"] += 1
+ print(f" OK {control_id}: {title[:60]}")
+ else:
+ stats["conflict"] += 1
+ print(f" CONFLICT {control_id} (already exists)")
+ except Exception as e:
+ conn.rollback()
+ stats["db_error"] += 1
+ errors.append(f"DB {control_id}: {str(e)[:100]}")
+ print(f" DB ERROR {control_id}: {str(e)[:100]}")
+
+ # Rate limit: ~0.5s between calls
+ time.sleep(0.5)
+
+ # ── Summary ──────────────────────────────────────────────────────
+ elapsed = time.time() - t_start
+ cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000
+
+ print(f"\n\n{'='*70}")
+ print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
+ print(f"{'='*70}")
+ print(f" Laufzeit: {elapsed/60:.1f} min")
+ print(f" API-Kosten: ${cost:.2f}")
+ print(f" Input Tokens: {total_input_tokens:,}")
+ print(f" Output Tokens: {total_output_tokens:,}")
+ print()
+ for key in sorted(stats.keys()):
+ print(f" {key:<25s}: {stats[key]:5d}")
+ print()
+
+ if generated_ids:
+ print(f" Neue Control-IDs: {len(generated_ids)}")
+ # Save generated IDs
+ with open("/tmp/phase74_generated_ids.json", 'w') as f:
+ json.dump(generated_ids, f)
+ print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")
+
+ if errors:
+ print(f"\n Fehler ({len(errors)}):")
+ for e in errors[:20]:
+ print(f" {e}")
+ if len(errors) > 20:
+ print(f" ... und {len(errors)-20} weitere")
+
+ conn.close()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/scripts/qa/run_job.sh b/scripts/qa/run_job.sh
new file mode 100755
index 0000000..4b5ea41
--- /dev/null
+++ b/scripts/qa/run_job.sh
@@ -0,0 +1,218 @@
+#!/usr/bin/env bash
+# ─────────────────────────────────────────────────────────────
+# Robust job runner for QA scripts on Mac Mini
+#
+# Usage:
+#   ./run_job.sh <script.py> [args...]   # start job
+#   ./run_job.sh --status                # show running jobs
+#   ./run_job.sh --kill <script.py>      # kill a running job
+#   ./run_job.sh --log <script.py>       # tail log
+#
+# Features:
+# - Loads .env automatically (COMPLIANCE_DATABASE_URL → DATABASE_URL)
+# - PID-file prevents duplicate runs
+# - Unbuffered Python output
+# - Structured log files in /tmp/qa_jobs/
+# ─────────────────────────────────────────────────────────────
+set -euo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
+PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
+JOB_DIR="/tmp/qa_jobs"
+mkdir -p "$JOB_DIR"
+
+# ── Load .env ────────────────────────────────────────────────
+load_env() {
+ local envfile="$PROJECT_DIR/.env"
+ if [[ -f "$envfile" ]]; then
+ # Export all vars from .env
+ set -a
+ # shellcheck disable=SC1090
+ source "$envfile"
+ set +a
+ fi
+ # Map COMPLIANCE_DATABASE_URL → DATABASE_URL if needed
+ if [[ -z "${DATABASE_URL:-}" && -n "${COMPLIANCE_DATABASE_URL:-}" ]]; then
+ export DATABASE_URL="$COMPLIANCE_DATABASE_URL"
+ fi
+}
+
+# ── Job name from script path ─────────────────────────────────
+job_name() {
+ basename "$1" .py
+}
+
+pid_file() {
+ echo "$JOB_DIR/$(job_name "$1").pid"
+}
+
+log_file() {
+ echo "$JOB_DIR/$(job_name "$1").log"
+}
+
+# ── Status ────────────────────────────────────────────────────
+show_status() {
+    # Print one entry per PID file in $JOB_DIR and remove PID files of dead jobs.
+    echo "═══════════════════════════════════════════════════════"
+    echo "QA Job Status ($(date '+%Y-%m-%d %H:%M:%S'))"
+    echo "═══════════════════════════════════════════════════════"
+    local found=0 name pid logf lines errors last_line
+    for pidfile in "$JOB_DIR"/*.pid; do
+        # Without any PID file the glob stays literal; skip that non-file.
+        [[ -f "$pidfile" ]] || continue
+        found=1
+        name=$(basename "$pidfile" .pid)
+        pid=$(cat "$pidfile")
+        logf="$JOB_DIR/$name.log"
+
+        if kill -0 "$pid" 2>/dev/null; then
+            lines=$(wc -l < "$logf" 2>/dev/null || echo 0)
+            # grep -c prints "0" AND exits 1 when nothing matches, so an inner
+            # "|| echo 0" would produce "0<newline>0"; apply the default outside.
+            errors=$(grep -c "ERROR" "$logf" 2>/dev/null) || errors=0
+            last_line=$(tail -1 "$logf" 2>/dev/null || echo "(empty)")
+            echo " ● $name (PID $pid) — RUNNING"
+            echo " Log: $logf ($lines lines, $errors errors)"
+            echo " Last: $last_line"
+        else
+            echo " ○ $name (PID $pid) — STOPPED"
+            echo " Log: $logf"
+            # Stale PID file: the process is gone.
+            rm -f "$pidfile"
+        fi
+        echo ""
+    done
+    if [[ $found -eq 0 ]]; then
+        echo " No jobs running."
+    fi
+}
+
+# ── Kill ──────────────────────────────────────────────────────
+kill_job() {
+ local script="$1"
+ local pf
+ pf=$(pid_file "$script")
+ if [[ ! -f "$pf" ]]; then
+ echo "No PID file for $(job_name "$script")"
+ return 1
+ fi
+ local pid
+ pid=$(cat "$pf")
+ if kill -0 "$pid" 2>/dev/null; then
+ kill "$pid"
+ echo "Killed $(job_name "$script") (PID $pid)"
+ else
+ echo "Process $pid already stopped"
+ fi
+ rm -f "$pf"
+}
+
+# ── Tail log ──────────────────────────────────────────────────
+tail_log() {
+ local script="$1"
+ local lf
+ lf=$(log_file "$script")
+ if [[ ! -f "$lf" ]]; then
+ echo "No log file: $lf"
+ return 1
+ fi
+ tail -50 "$lf"
+}
+
+# ── Start job ─────────────────────────────────────────────────
+start_job() {
+    # Start a Python script in the background with a PID file and a log file in $JOB_DIR.
+    local script="$1"
+    shift
+    local args=("$@")
+
+    # Resolve script path: as given first, then relative to this script's directory
+    local script_path="$script"
+    if [[ ! -f "$script_path" ]]; then
+        script_path="$SCRIPT_DIR/$script"
+    fi
+    if [[ ! -f "$script_path" ]]; then
+        echo "ERROR: Script not found: $script"
+        return 1
+    fi
+
+    local name pf lf
+    name=$(job_name "$script")
+    pf=$(pid_file "$script")
+    lf=$(log_file "$script")
+
+    # Check for already-running instance; a PID file of a dead process is discarded
+    if [[ -f "$pf" ]]; then
+        local existing_pid
+        existing_pid=$(cat "$pf")
+        if kill -0 "$existing_pid" 2>/dev/null; then
+            echo "ERROR: $name already running (PID $existing_pid)"
+            echo "Use: $0 --kill $script"
+            return 1
+        fi
+        rm -f "$pf"
+    fi
+
+    # Load environment
+    load_env
+
+    # Verify required env vars
+    if [[ -z "${DATABASE_URL:-}" ]]; then
+        echo "ERROR: DATABASE_URL not set (checked .env)"
+        return 1
+    fi
+
+    # Start
+    echo "Starting $name..."
+    echo " Script: $script_path"
+    echo " Args: ${args[*]:-none}"
+    echo " Log: $lf"
+
+    # With "set -u", Bash < 4.4 (macOS ships 3.2) treats an empty array as unset and
+    # aborts on "${args[@]}"; the ${var+...} form expands to nothing when no args exist.
+    nohup python3 -u "$script_path" ${args[@]+"${args[@]}"} > "$lf" 2>&1 &
+    local pid=$!
+    echo "$pid" > "$pf"
+    echo " PID: $pid"
+    echo ""
+
+    # Wait a moment and check it started OK
+    sleep 3
+    if ! kill -0 "$pid" 2>/dev/null; then
+        echo "ERROR: Process died immediately. Log output:"
+        cat "$lf"
+        rm -f "$pf"
+        return 1
+    fi
+
+    local lines
+    lines=$(wc -l < "$lf" 2>/dev/null || echo 0)
+    echo "Running OK ($lines log lines so far)"
+    echo "Monitor with: $0 --status"
+    echo "Tail log: $0 --log $script"
+}
+
+# ── Main ──────────────────────────────────────────────────────
+# Dispatch on the first argument: an option selects a management action,
+# anything else is treated as the script to start (plus its arguments).
+case "${1:-}" in
+    --status|-s) show_status ;;
+    --kill|-k)
+        [[ -n "${2:-}" ]] || { echo "Usage: $0 --kill <script.py>"; exit 1; }
+        kill_job "$2"
+        ;;
+    --log|-l)
+        [[ -n "${2:-}" ]] || { echo "Usage: $0 --log <script.py>"; exit 1; }
+        tail_log "$2"
+        ;;
+    --help|-h|"")
+        echo "Usage:"
+        echo "  $0 <script.py> [args...]   Start a QA job"
+        echo "  $0 --status                Show running jobs"
+        echo "  $0 --kill <script.py>      Kill a running job"
+        echo "  $0 --log <script.py>       Tail job log"
+        ;;
+    *)
+        start_job "$@"
+        ;;
+esac
diff --git a/scripts/qa/sync_db.py b/scripts/qa/sync_db.py
new file mode 100644
index 0000000..5ed5230
--- /dev/null
+++ b/scripts/qa/sync_db.py
@@ -0,0 +1,307 @@
+#!/usr/bin/env python3
+"""Sync canonical control tables between production and local DB.
+
+Modes:
+ --pull Production → Local (initial sync, full table copy)
+ --push Local → Production (incremental, only new obligation_candidates)
+ --loop Run --push every N minutes (default 60)
+
+Usage:
+ python3 sync_db.py --pull # Full sync production → local
+ python3 sync_db.py --push # Push new obligations to production
+ python3 sync_db.py --loop 60 # Push every 60 minutes
+ python3 sync_db.py --pull --tables canonical_controls # Only one table
+"""
+import argparse
+import json
+import os
+import sys
+import time
+import urllib.parse
+
+import io
+
+import psycopg2
+import psycopg2.extras
+import psycopg2.extensions
+
+# Register JSON adapter so dicts are automatically converted to JSONB
+psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
+
+# ── DB Config ────────────────────────────────────────────────────────
+
+# The production DSN must be supplied through the environment. A credential
+# must never be committed as a fallback default: everyone with read access to
+# the repository could otherwise reach the production database. The secret
+# that used to be embedded here has to be considered leaked and rotated.
+PROD_URL = os.environ.get("PROD_DATABASE_URL", "")
+if not PROD_URL:
+    # Fail fast. With an empty URL connect() would pass no host at all, so the
+    # "Production" connection could end up on whatever server the driver
+    # defaults to, and pull_table() drops local tables before copying.
+    sys.exit("ERROR: PROD_DATABASE_URL is not set")
+
+# Local development database; override with LOCAL_DATABASE_URL.
+LOCAL_URL = os.environ.get(
+    "LOCAL_DATABASE_URL",
+    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
+)
+
+# Database schema that holds every synced table; connect() also puts it first
+# on the connection's search_path.
+SCHEMA = "compliance"
+
+# Tables to sync (production → local)
+# pull() walks this list in order unless --tables narrows the selection.
+SYNC_TABLES = [
+ "canonical_control_frameworks",
+ "canonical_control_licenses",
+ "canonical_control_sources",
+ "canonical_control_categories",
+ "canonical_blocked_sources",
+ "canonical_controls",
+ "canonical_control_mappings",
+ "canonical_processed_chunks",
+ "canonical_generation_jobs",
+ "control_patterns",
+ "crosswalk_matrix",
+ "obligation_extractions",
+ "obligation_candidates",
+]
+
+
+def connect(url, label="DB"):
+    """Open a psycopg2 connection described by a ``postgresql://`` URL.
+
+    Args:
+        url: Connection URL. ``sslmode`` is taken from the query string and
+            defaults to ``prefer``; the port defaults to 5432.
+        label: Human-readable name used in the progress message.
+
+    Returns:
+        An open connection with autocommit disabled, TCP keepalives enabled
+        and ``search_path`` set to ``SCHEMA,public``.
+    """
+    parsed = urllib.parse.urlparse(url)
+    params = dict(urllib.parse.parse_qsl(parsed.query))
+    port = parsed.port or 5432
+
+    # urlparse() leaves percent-escapes in the URL components untouched, so a
+    # password such as "p%40ss" has to be decoded before it reaches the driver.
+    user = urllib.parse.unquote(parsed.username) if parsed.username else parsed.username
+    password = urllib.parse.unquote(parsed.password) if parsed.password else parsed.password
+
+    conn = psycopg2.connect(
+        host=parsed.hostname,
+        port=port,
+        user=user,
+        password=password,
+        dbname=urllib.parse.unquote(parsed.path.lstrip("/")),
+        sslmode=params.get("sslmode", "prefer"),
+        options=f"-c search_path={SCHEMA},public",
+        keepalives=1,
+        keepalives_idle=30,
+        keepalives_interval=10,
+        keepalives_count=5,
+    )
+    conn.autocommit = False
+    print(f" Connected to {label} ({parsed.hostname}:{port})")
+    return conn
+
+
+def get_columns(cur, table):
+    """Return the column names of ``SCHEMA.table`` in ordinal order.
+
+    Args:
+        cur: Open cursor on the database to inspect.
+        table: Unqualified table name.
+
+    Returns:
+        List of column names; empty when the table has no catalog entry.
+    """
+    # Schema and table are compared as values here, so they are bound as
+    # parameters instead of being formatted into the statement.
+    cur.execute(
+        """
+        SELECT column_name FROM information_schema.columns
+        WHERE table_schema = %s AND table_name = %s
+        ORDER BY ordinal_position
+        """,
+        (SCHEMA, table),
+    )
+    return [r[0] for r in cur.fetchall()]
+
+
+def pull_table(prod_conn, local_conn, table):
+    """Replace the local copy of ``table`` with the production contents.
+
+    The local table is recreated with column names and types only (no
+    constraints, no defaults) and filled via SELECT + INSERT. Production is
+    read completely before anything local is touched, and the local
+    drop/create/insert run in a single transaction that is committed at the
+    end; if an error occurs, rolling back ``local_conn`` keeps the previous
+    local table.
+
+    Args:
+        prod_conn: Open connection to production (read only here).
+        local_conn: Open connection to the local database.
+        table: Unqualified table name inside ``SCHEMA``.
+
+    Returns:
+        Number of rows copied; 0 when the table does not exist on production.
+    """
+    prod_cur = prod_conn.cursor()
+    local_cur = local_conn.cursor()
+
+    # Check table exists on production
+    prod_cur.execute(
+        "SELECT 1 FROM pg_tables WHERE schemaname = %s AND tablename = %s",
+        (SCHEMA, table),
+    )
+    if not prod_cur.fetchone():
+        print(f" SKIP {table} — not found on production")
+        return 0
+
+    # Column names and types for a simple CREATE TABLE (data only)
+    prod_cur.execute(
+        """
+        SELECT column_name, data_type, udt_name, character_maximum_length
+        FROM information_schema.columns
+        WHERE table_schema = %s AND table_name = %s
+        ORDER BY ordinal_position
+        """,
+        (SCHEMA, table),
+    )
+    col_defs = prod_cur.fetchall()
+
+    array_types = {
+        "_text": "text[]", "_varchar": "varchar[]",
+        "_int4": "integer[]", "_uuid": "uuid[]",
+        "_jsonb": "jsonb[]", "_float8": "float8[]",
+    }
+    parts = []
+    col_names = []
+    json_cols = set()
+    for name, dtype, udt, max_len in col_defs:
+        col_names.append(name)
+        if dtype == "ARRAY":
+            sql_type = array_types.get(udt, f"{udt.lstrip('_')}[]")
+        elif dtype == "USER-DEFINED" and udt == "jsonb":
+            sql_type = "jsonb"
+            json_cols.add(name)
+        elif dtype == "USER-DEFINED":
+            sql_type = udt
+        elif dtype in ("json", "jsonb"):
+            sql_type = dtype
+            json_cols.add(name)
+        elif max_len:
+            sql_type = f"{dtype}({max_len})"
+        else:
+            sql_type = dtype
+        parts.append(f'"{name}" {sql_type}')
+
+    # Fetch all rows from production before modifying the local database
+    col_list = ", ".join(f'"{c}"' for c in col_names)
+    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
+    rows = prod_cur.fetchall()
+
+    # Identifiers cannot be bound as parameters; `table` is formatted in as-is.
+    # No commit between the statements: drop, create and insert succeed or
+    # fail together.
+    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
+    local_cur.execute(f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})")
+
+    if rows:
+        # The driver hands JSON columns back as decoded Python values. Every
+        # non-NULL value is wrapped again, not only dict/list: a decoded JSON
+        # string or number would otherwise be sent as plain text/integer and
+        # be rejected by the JSON column.
+        json_idx = {i for i, c in enumerate(col_names) if c in json_cols}
+        adapted_rows = [
+            tuple(
+                psycopg2.extras.Json(val) if i in json_idx and val is not None else val
+                for i, val in enumerate(row)
+            )
+            for row in rows
+        ]
+        placeholders = ", ".join(["%s"] * len(col_names))
+        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
+        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
+
+    local_conn.commit()
+
+    print(f" {table}: {len(rows)} rows")
+    return len(rows)
+
+
+def pull(tables=None):
+    """Full sync: production → local.
+
+    Args:
+        tables: Optional list of table names; defaults to ``SYNC_TABLES``.
+
+    A failure on one table is reported and rolled back on both connections,
+    then the remaining tables are still processed. Both connections are
+    closed on every exit path.
+    """
+    print("\n=== PULL: Production → Local ===\n")
+
+    prod_conn = connect(PROD_URL, "Production")
+    try:
+        local_conn = connect(LOCAL_URL, "Local")
+    except Exception:
+        # Do not leave the production connection open when local is unreachable.
+        prod_conn.close()
+        raise
+
+    try:
+        # Ensure schema exists
+        local_cur = local_conn.cursor()
+        local_cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
+        local_conn.commit()
+
+        sync_list = tables if tables else SYNC_TABLES
+        total = 0
+
+        for table in sync_list:
+            try:
+                total += pull_table(prod_conn, local_conn, table)
+            except Exception as e:
+                print(f" ERROR {table}: {e}")
+                local_conn.rollback()
+                prod_conn.rollback()
+
+        print(f"\n Total: {total} rows synced")
+    finally:
+        prod_conn.close()
+        local_conn.close()
+
+
+def push():
+    """Incremental push: new obligation_candidates local → production.
+
+    Rows are matched on ``candidate_id``. Ids already present on production
+    are skipped, and the INSERT additionally uses ``ON CONFLICT DO NOTHING``.
+    Both connections are closed on every exit path, so repeated calls from
+    loop() do not accumulate open connections after an error.
+
+    Returns:
+        Number of rows sent to production (0 when there is nothing to push).
+    """
+    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")
+
+    local_conn = connect(LOCAL_URL, "Local")
+    try:
+        prod_conn = connect(PROD_URL, "Production")
+    except Exception:
+        local_conn.close()
+        raise
+
+    try:
+        local_cur = local_conn.cursor()
+        prod_cur = prod_conn.cursor()
+
+        # Use candidate_id as the unique key
+        local_cur.execute(f"SELECT candidate_id FROM {SCHEMA}.obligation_candidates")
+        local_ids = {r[0] for r in local_cur.fetchall()}
+
+        if not local_ids:
+            print(" No obligation_candidates in local DB")
+            return 0
+
+        # Check which already exist on production
+        prod_cur.execute(f"SELECT candidate_id FROM {SCHEMA}.obligation_candidates")
+        prod_ids = {r[0] for r in prod_cur.fetchall()}
+
+        new_ids = local_ids - prod_ids
+        if not new_ids:
+            print(f" All {len(local_ids)} obligations already on production")
+            return 0
+
+        print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")
+
+        # Get columns; quote them so reserved words or mixed case survive
+        columns = get_columns(local_cur, "obligation_candidates")
+        col_list = ", ".join(f'"{c}"' for c in columns)
+        placeholders = ", ".join(["%s"] * len(columns))
+
+        # Fetch new rows from local. The ids are bound as one array parameter
+        # instead of being pasted into the statement; comparing on the text
+        # form keeps this independent of the key's column type.
+        local_cur.execute(
+            f"SELECT {col_list} FROM {SCHEMA}.obligation_candidates "
+            "WHERE candidate_id::text = ANY(%s)",
+            ([str(i) for i in new_ids],),
+        )
+        rows = local_cur.fetchall()
+
+        # Insert into production
+        insert_sql = (
+            f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) "
+            f"VALUES ({placeholders}) ON CONFLICT DO NOTHING"
+        )
+        psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
+        prod_conn.commit()
+
+        print(f" Pushed {len(rows)} obligations to production")
+        return len(rows)
+    finally:
+        local_conn.close()
+        prod_conn.close()
+
+
+def loop(interval_min):
+    """Call push() forever, pausing ``interval_min`` minutes between rounds.
+
+    An exception raised by a round is printed and does not stop the loop.
+    """
+    started = time.strftime('%Y-%m-%d %H:%M:%S')
+    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
+    print(f" Started at {started}")
+    print(" Press Ctrl+C to stop\n")
+
+    while True:
+        try:
+            # The hint is only printed after a round that actually pushed rows.
+            if push():
+                print(f" Next sync in {interval_min} min...")
+        except Exception as err:
+            print(f" SYNC ERROR: {err}")
+        time.sleep(interval_min * 60)
+
+
+def main():
+    """Parse the command line and run the selected modes.
+
+    Modes run in the fixed order pull, push, loop. When none is selected the
+    help text is printed instead.
+    """
+    parser = argparse.ArgumentParser(description="Sync canonical control tables")
+    option_specs = (
+        ("--pull", {"action": "store_true", "help": "Production → Local (full copy)"}),
+        ("--push", {"action": "store_true", "help": "Local → Production (new obligations)"}),
+        ("--loop", {"type": int, "metavar": "MIN", "help": "Push every N minutes"}),
+        ("--tables", {"nargs": "+", "help": "Only sync specific tables (with --pull)"}),
+    )
+    for flag, options in option_specs:
+        parser.add_argument(flag, **options)
+    opts = parser.parse_args()
+
+    if opts.pull or opts.push or opts.loop:
+        if opts.pull:
+            pull(opts.tables)
+        if opts.push:
+            push()
+        if opts.loop:
+            loop(opts.loop)
+    else:
+        parser.print_help()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/qa/test_pass0a.py b/scripts/qa/test_pass0a.py
new file mode 100644
index 0000000..54df95c
--- /dev/null
+++ b/scripts/qa/test_pass0a.py
@@ -0,0 +1,470 @@
+#!/usr/bin/env python3
+"""Test Pass 0a (Obligation Extraction) on 5-10 controls.
+
+Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
+Copies prompts and quality gate from decomposition_pass.py.
+
+Usage:
+ python3 test_pass0a.py # 10 controls, Anthropic
+ python3 test_pass0a.py --limit 5 # 5 controls
+ python3 test_pass0a.py --source "DSGVO" # filter by source
+ python3 test_pass0a.py --dry-run # show controls, no LLM call
+"""
+import argparse
+import json
+import os
+import re
+import sys
+import time
+import urllib.parse
+
+import psycopg2
+import requests
+
+# ── Config ────────────────────────────────────────────────────────────
+# API key from the environment; empty string when unset (main() refuses to
+# run without it unless --dry-run is given).
+ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
+# Model id sent with each request; overridable via DECOMPOSITION_LLM_MODEL.
+ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
+# API base URL; call_anthropic() appends "/messages".
+ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
+
+# ── Prompts (from decomposition_pass.py) ──────────────────────────────
+
+# System prompt (German) sent with every request: tells the model to split a
+# control into atomic obligations and to answer with a JSON array only.
+SYSTEM_PROMPT = """\
+Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
+in einzelne atomare Pflichten.
+
+REGELN (STRIKT EINHALTEN):
+1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
+sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
+ist zu testen, shall, must, required.
+2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
+3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
+4. Meldepflichten SEPARAT (is_reporting_obligation=true).
+5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
+eigenes Control, sondern Evidence).
+6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
+— NICHT extrahieren.
+
+Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
+
+
+def build_prompt(title, objective, requirements, test_procedure, source_ref):
+ """Build the user message for one control.
+
+ All five arguments are interpolated verbatim into a German template that
+ ends with the JSON-array schema the model is asked to fill in.
+ """
+ return f"""\
+Analysiere das folgende Control und extrahiere alle einzelnen normativen \
+Pflichten als JSON-Array.
+
+CONTROL:
+Titel: {title}
+Ziel: {objective}
+Anforderungen: {requirements}
+Prüfverfahren: {test_procedure}
+Quellreferenz: {source_ref}
+
+Antworte als JSON-Array:
+[
+ {{
+ "obligation_text": "Kurze, präzise Formulierung der Pflicht",
+ "action": "Hauptverb/Handlung",
+ "object": "Gegenstand der Pflicht",
+ "condition": "Auslöser/Bedingung oder null",
+ "normative_strength": "must",
+ "is_test_obligation": false,
+ "is_reporting_obligation": false
+ }}
+]"""
+
+
+# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
+
+# Tier 1: Pflicht (mandatory)
+# German and English modal/obligation phrasing, including "ist/sind ... zu
+# <verb>en" constructions and separable verbs with an infixed "zu".
+_PFLICHT_RE = re.compile(
+ r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
+ r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
+ r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
+ r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
+ r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
+ r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
+ r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
+ r"|\bshall\b|\bmust\b|\brequired\b"
+ r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
+ r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
+ r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
+ r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
+ re.IGNORECASE,
+)
+# Tier 2: Empfehlung (recommendation)
+_EMPFEHLUNG_RE = re.compile(
+ r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
+ r"|\bgewährleisten\b|\bsicherstellen\b"
+ r"|\bshould\b|\bensure\b|\brecommend\w*\b"
+ r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
+ r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
+ r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
+ re.IGNORECASE,
+)
+# Tier 3: Kann (optional/permissive)
+_KANN_RE = re.compile(
+ r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
+ re.IGNORECASE,
+)
+# Union (backward compat)
+# Matches whenever any of the three tier patterns matches; quality_gate() uses
+# it for the normative-signal flag and for counting normative hits.
+_NORMATIVE_RE = re.compile(
+ _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
+ re.IGNORECASE,
+)
+# Markers of explanatory text (German and English); quality_gate() compares
+# their count with the normative hits.
+_RATIONALE_RE = re.compile(
+ r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
+ re.IGNORECASE,
+)
+# Vocabulary for testing/audit obligations; main() uses it to set the test
+# flag when the model did not.
+_TEST_RE = re.compile(
+ r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
+ r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
+ re.IGNORECASE,
+)
+# Vocabulary for reporting/notification obligations; main() uses it to set the
+# reporting flag when the model did not.
+_REPORTING_RE = re.compile(
+ r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
+ r"|\bnotif|\breport\b|\bbehörd",
+ re.IGNORECASE,
+)
+
+
+def classify_obligation_type(txt):
+    """Return the obligation tier of ``txt``.
+
+    The tiers are tried in the order pflicht, empfehlung, kann and the first
+    matching pattern wins. Text without any match falls back to "empfehlung".
+    """
+    tiers = (
+        (_PFLICHT_RE, "pflicht"),
+        (_EMPFEHLUNG_RE, "empfehlung"),
+        (_KANN_RE, "kann"),
+    )
+    for pattern, label in tiers:
+        if pattern.search(txt):
+            return label
+    return "empfehlung"
+
+
+def quality_gate(obl_text, parent_uuid):
+    """Validate and classify one obligation text.
+
+    Returns:
+        ``(flags, passed, confidence, obligation_type)``. ``flags`` maps each
+        check name to its boolean result (plus the ``obligation_type``
+        string). ``passed`` only depends on the evidence-only, minimum-length
+        and parent-link checks. ``confidence`` is the sum of the weights of
+        all satisfied checks, with a 0.05 bonus (capped at 1.0) for "pflicht".
+    """
+    has_signal = bool(_NORMATIVE_RE.search(obl_text))
+    obl_type = classify_obligation_type(obl_text)
+    trimmed = obl_text.strip()
+
+    # A conjunction followed later by another obligation verb suggests that
+    # the text bundles more than one action.
+    conjunction_then_verb = re.compile(
+        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
+        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
+        re.IGNORECASE,
+    )
+    # Texts that open with an artefact noun describe evidence, not a duty.
+    evidence_prefix = re.compile(
+        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
+        re.IGNORECASE,
+    )
+    normative_hits = _NORMATIVE_RE.findall(obl_text)
+    rationale_hits = _RATIONALE_RE.findall(obl_text)
+
+    flags = {
+        "has_normative_signal": has_signal,
+        "obligation_type": obl_type,
+        "single_action": conjunction_then_verb.search(obl_text) is None,
+        "not_rationale": len(normative_hits) >= len(rationale_hits),
+        "not_evidence_only": evidence_prefix.match(trimmed) is None,
+        "min_length": len(trimmed) >= 20,
+        "has_parent_link": bool(parent_uuid),
+    }
+
+    weights = {
+        "has_normative_signal": 0.25, "single_action": 0.20,
+        "not_rationale": 0.20, "not_evidence_only": 0.15,
+        "min_length": 0.10, "has_parent_link": 0.05,
+    }
+    confidence = 0
+    for check, outcome in flags.items():
+        if outcome and check in weights:
+            confidence += weights[check]
+    # Bonus for pflicht classification
+    if obl_type == "pflicht":
+        confidence = min(confidence + 0.05, 1.0)
+
+    # has_normative_signal is informational only and not part of the pass check
+    passed = all(
+        flags.get(check, False)
+        for check in ("not_evidence_only", "min_length", "has_parent_link")
+    )
+
+    return flags, passed, confidence, obl_type
+
+
+# ── JSON parsing ──────────────────────────────────────────────────────
+
+def parse_json_array(text):
+    """Extract a JSON array from model output.
+
+    The whole text is tried first: a list is returned as is, a single object
+    is wrapped in a list. Otherwise the span from the first "[" to the last
+    "]" is parsed. Returns an empty list when neither attempt yields a list.
+    """
+    try:
+        whole = json.loads(text)
+    except json.JSONDecodeError:
+        whole = None
+    if isinstance(whole, list):
+        return whole
+    if isinstance(whole, dict):
+        return [whole]
+
+    bracketed = re.search(r"\[[\s\S]*\]", text)
+    if bracketed is None:
+        return []
+    try:
+        embedded = json.loads(bracketed.group())
+    except json.JSONDecodeError:
+        return []
+    return embedded if isinstance(embedded, list) else []
+
+
+# ── API call ──────────────────────────────────────────────────────────
+
+def call_anthropic(prompt):
+    """Send one prompt to the Anthropic Messages API.
+
+    Args:
+        prompt: User message text.
+
+    Returns:
+        ``(text, usage, error)``. On success ``error`` is None. On any
+        failure (transport error or timeout, non-200 status, body that is not
+        JSON) ``text`` is None, ``usage`` is an empty dict and ``error`` is a
+        short description, so the caller can skip the item and continue.
+    """
+    headers = {
+        "x-api-key": ANTHROPIC_API_KEY,
+        "anthropic-version": "2023-06-01",
+        "content-type": "application/json",
+    }
+    payload = {
+        "model": ANTHROPIC_MODEL,
+        "max_tokens": 8192,
+        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
+        "messages": [{"role": "user", "content": prompt}],
+    }
+    try:
+        resp = requests.post(f"{ANTHROPIC_API_URL}/messages", headers=headers, json=payload, timeout=120)
+    except requests.RequestException as exc:
+        # A timeout or connection reset must not abort the whole sample run.
+        return None, {}, f"Request failed: {exc}"
+    if resp.status_code != 200:
+        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
+    try:
+        data = resp.json()
+    except ValueError as exc:
+        return None, {}, f"Invalid JSON response: {exc}"
+    usage = data.get("usage", {})
+    content = data.get("content") or []
+    text = content[0].get("text", "") if content else ""
+    return text, usage, None
+
+
+# ── Format helpers ────────────────────────────────────────────────────
+
+def fmt_json(val):
+    """Render a DB value for the prompt.
+
+    None becomes an empty string. A string is parsed as JSON where possible
+    and returned unchanged otherwise. A list (given or parsed) becomes one
+    " - item" line per element; anything else is passed through ``str()``.
+    """
+    if val is None:
+        return ""
+
+    decoded = val
+    if isinstance(val, str):
+        try:
+            decoded = json.loads(val)
+        except (json.JSONDecodeError, TypeError):
+            return val
+
+    if not isinstance(decoded, list):
+        return str(decoded)
+    bullet_lines = [f" - {entry}" for entry in decoded]
+    return "\n".join(bullet_lines)
+
+
+# ── Main ──────────────────────────────────────────────────────────────
+
+def _run_sample(cur, args):
+    """Select the sample, run Pass 0a on each control and print the report."""
+    # Select diverse sample
+    query = """
+        SELECT id, control_id, title, objective, requirements,
+               test_procedure, source_citation, category
+        FROM compliance.canonical_controls
+        WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
+          AND parent_control_uuid IS NULL
+          AND title IS NOT NULL AND objective IS NOT NULL
+          AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
+    """
+    params = []
+    if args.source:
+        query += " AND source_citation->>'source' ILIKE %s"
+        params.append(f"%{args.source}%")
+
+    query += " ORDER BY source_citation->>'source', random()"
+    # The limit is bound like every other value instead of being formatted in.
+    query += " LIMIT %s"
+    params.append(args.limit)
+
+    cur.execute(query, params)
+    controls = cur.fetchall()
+
+    if not controls:
+        print("No controls found.")
+        return
+
+    print(f"{'='*70}")
+    print(f"Pass 0a Test — {len(controls)} Controls")
+    print(f"Model: {ANTHROPIC_MODEL}")
+    print(f"{'='*70}")
+
+    total_in = total_out = total_obls = 0
+    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
+    total_rejected = 0  # only evidence-only / too-short / no-parent
+    all_results = []
+    t_start = time.time()
+
+    for i, row in enumerate(controls, 1):
+        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row
+
+        req_str = fmt_json(reqs)
+        test_str = fmt_json(test_proc)
+        source_str = ""
+        if src_cit:
+            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
+            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"
+
+        print(f"\n{'─'*70}")
+        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
+        print(f" Source: {source_str} | Category: {category or 'N/A'}")
+        print(f" Objective: {(objective or '')[:200]}")
+
+        if args.dry_run:
+            print(" [DRY RUN]")
+            continue
+
+        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)
+
+        t0 = time.time()
+        response_text, usage, error = call_anthropic(prompt)
+        elapsed = time.time() - t0
+
+        if error:
+            print(f" ERROR: {error}")
+            continue
+
+        in_tok = usage.get("input_tokens", 0)
+        out_tok = usage.get("output_tokens", 0)
+        cached = usage.get("cache_read_input_tokens", 0)
+        total_in += in_tok
+        total_out += out_tok
+
+        obligations = parse_json_array(response_text)
+        total_obls += len(obligations)
+
+        print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
+              f"{f' ({cached} cached)' if cached else ''}"
+              f" | {len(obligations)} obligation(s)")
+
+        for j, obl in enumerate(obligations, 1):
+            obl_text = obl.get("obligation_text", "")
+            action = obl.get("action", "")
+            obj = obl.get("object", "")
+            condition = obl.get("condition")
+            strength = obl.get("normative_strength", "must")
+            is_test = bool(obl.get("is_test_obligation", False))
+            is_report = bool(obl.get("is_reporting_obligation", False))
+
+            # Auto-detect flags the model left unset
+            if not is_test and _TEST_RE.search(obl_text):
+                is_test = True
+            if not is_report and _REPORTING_RE.search(obl_text):
+                is_report = True
+
+            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
+            if passed:
+                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
+            else:
+                total_rejected += 1
+
+            tag = ""
+            if is_test:
+                tag = " [TEST]"
+            elif is_report:
+                tag = " [MELDEPFLICHT]"
+
+            # Show type instead of PASS/REJECT
+            type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
+            if not passed:
+                status = "REJECT"
+            else:
+                status = type_label.get(obl_type, "EMPFEHLUNG")
+
+            failed = [k for k, v in flags.items()
+                      if isinstance(v, bool) and not v]
+
+            print(f"\n {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
+            print(f" {obl_text}")
+            print(f" Handlung: {action} | Gegenstand: {obj}")
+            if condition:
+                print(f" Bedingung: {condition}")
+            if not passed:
+                print(f" Abgelehnt: {', '.join(failed)}")
+
+            all_results.append({
+                "control_id": ctrl_id,
+                "obligation_text": obl_text,
+                "obligation_type": obl_type if passed else "rejected",
+                "action": action,
+                "object": obj,
+                "condition": condition,
+                "confidence": round(conf, 2),
+                "is_test": is_test,
+                "is_reporting": is_report,
+                "passed": passed,
+                "flags": {k: v for k, v in flags.items()},
+            })
+
+        # Short pause between API calls
+        time.sleep(0.5)
+
+    # ── Summary ──────────────────────────────────────────────────────
+    elapsed_total = time.time() - t_start
+    # Token prices per million (input 3, output 15) are hard-coded here
+    cost = (total_in * 3 + total_out * 15) / 1_000_000
+
+    print(f"\n\n{'='*70}")
+    print(f"ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
+    print(f"{'='*70}")
+    print(f" Controls: {len(controls)}")
+    print(f" Obligations: {total_obls} ({total_obls/max(len(controls),1):.1f} pro Control)")
+    print(f" ── Klassifizierung ──")
+    print(f" Pflicht: {type_counts['pflicht']}"
+          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
+    print(f" Empfehlung: {type_counts['empfehlung']}"
+          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
+    print(f" Kann: {type_counts['kann']}"
+          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
+    print(f" Rejected: {total_rejected}"
+          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
+          f" (nur evidence-only/zu kurz/kein parent)")
+    print(f" ── Kosten ──")
+    print(f" Laufzeit: {elapsed_total:.1f}s")
+    print(f" Tokens: {total_in:,} in / {total_out:,} out")
+    print(f" Kosten: ${cost:.4f}")
+
+    if len(controls) > 0 and not args.dry_run and total_obls > 0:
+        # Linear extrapolation from the sample to 6000 controls
+        n = 6000
+        factor = n / len(controls)
+        print(f"\n --- Hochrechnung auf {n:,} Controls ---")
+        print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
+        print(f" Kosten: ${cost * factor:.2f}")
+        print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
+        print(f" Obligations: ~{int(total_obls / len(controls) * n):,}")
+        pf = int(type_counts['pflicht'] * factor)
+        ef = int(type_counts['empfehlung'] * factor)
+        kf = int(type_counts['kann'] * factor)
+        print(f" Pflicht: ~{pf:,}")
+        print(f" Empfehlung: ~{ef:,}")
+        print(f" Kann: ~{kf:,}")
+
+    # Save results JSON for later analysis
+    if all_results:
+        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
+        with open(out_path, "w") as f:
+            json.dump(all_results, f, ensure_ascii=False, indent=2)
+        print(f"\n Ergebnisse gespeichert: {out_path}")
+
+
+def main():
+    """CLI entry point: sample controls, run Pass 0a on each, print a summary.
+
+    Exits with status 1 when no API key is configured and --dry-run is not
+    given. Reads the database URL from ``DATABASE_URL`` (KeyError if unset).
+    The connection is closed on every exit path, including the early return
+    for an empty sample and any exception raised while processing.
+    """
+    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
+    parser.add_argument("--limit", type=int, default=10)
+    parser.add_argument("--source", type=str)
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    if not ANTHROPIC_API_KEY and not args.dry_run:
+        print("ERROR: Set ANTHROPIC_API_KEY")
+        sys.exit(1)
+
+    db_url = os.environ["DATABASE_URL"]
+    p = urllib.parse.urlparse(db_url)
+    conn = psycopg2.connect(
+        host=p.hostname, port=p.port or 5432,
+        user=p.username, password=p.password,
+        dbname=p.path.lstrip("/"),
+        options="-c search_path=compliance,public",
+    )
+    try:
+        _run_sample(conn.cursor(), args)
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/qa/test_pass0b_preview.py b/scripts/qa/test_pass0b_preview.py
new file mode 100644
index 0000000..7b4a6af
--- /dev/null
+++ b/scripts/qa/test_pass0b_preview.py
@@ -0,0 +1,308 @@
+#!/usr/bin/env python3
+"""Preview Pass 0b: Turn obligation candidates into atomic controls.
+
+Picks a few obligations from Pass 0a results, calls LLM to compose
+atomic controls, and writes them to canonical_controls with parent_control_uuid.
+
+Usage:
+ python3 test_pass0b_preview.py --input /tmp/pass0a_results_60controls.json --limit 3
+"""
+import argparse
+import json
+import os
+import re
+import sys
+import time
+import uuid
+import urllib.parse
+
+import psycopg2
+import psycopg2.extras
+import requests
+
+# Register JSON adapter
+# After this call every dict passed as a query parameter is sent as JSON.
+# NOTE(review): psycopg2.extensions is not imported explicitly in this file;
+# this relies on it being loaded as a side effect of the psycopg2 imports above.
+psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
+
+# API key from the environment; empty string when unset (main() refuses to
+# run without it unless --dry-run is given).
+ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
+# Model id sent with each request; overridable via DECOMPOSITION_LLM_MODEL.
+ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
+
+# System prompt (German) sent with every request: asks the model to turn one
+# normative obligation into an actionable atomic control and to answer in JSON only.
+SYSTEM_PROMPT = """\
+Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
+normativen Pflicht ein praxisorientiertes, atomares Security Control.
+
+Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
+Antworte NUR als JSON. Keine Erklärungen."""
+
+
+def build_pass0b_prompt(obl_text, action, obj, parent_title, category, source_ref):
+ """Build the user message for one obligation.
+
+ The obligation (text, action, object) and the parent control context
+ (title, category, source reference) are interpolated verbatim into a German
+ template that ends with the JSON object schema the model is asked to fill in.
+ """
+ return f"""\
+Erstelle aus der folgenden Pflicht ein atomares Control.
+
+PFLICHT: {obl_text}
+HANDLUNG: {action}
+GEGENSTAND: {obj}
+
+KONTEXT (Ursprungs-Control):
+Titel: {parent_title}
+Kategorie: {category}
+Quellreferenz: {source_ref}
+
+Antworte als JSON:
+{{
+ "title": "Kurzer Titel (max 80 Zeichen, deutsch)",
+ "objective": "Was muss erreicht werden? (1-2 Sätze)",
+ "requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
+ "test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
+ "evidence": ["Nachweis 1", "Nachweis 2"],
+ "severity": "critical|high|medium|low",
+ "category": "security|privacy|governance|operations|finance|reporting"
+}}"""
+
+
+def call_anthropic(prompt):
+    """Send one prompt to the Anthropic Messages API.
+
+    Args:
+        prompt: User message text.
+
+    Returns:
+        ``(text, usage, error)``. On success ``error`` is None. On any
+        failure (transport error or timeout, non-200 status, body that is not
+        JSON) ``text`` is None, ``usage`` is an empty dict and ``error`` is a
+        short description. An empty ``content`` list yields an empty text.
+    """
+    headers = {
+        "x-api-key": ANTHROPIC_API_KEY,
+        "anthropic-version": "2023-06-01",
+        "content-type": "application/json",
+    }
+    payload = {
+        "model": ANTHROPIC_MODEL,
+        "max_tokens": 4096,
+        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
+        "messages": [{"role": "user", "content": prompt}],
+    }
+    try:
+        resp = requests.post("https://api.anthropic.com/v1/messages", headers=headers, json=payload, timeout=120)
+    except requests.RequestException as exc:
+        # A timeout or connection reset must not abort the whole preview run.
+        return None, {}, f"Request failed: {exc}"
+    if resp.status_code != 200:
+        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
+    try:
+        data = resp.json()
+    except ValueError as exc:
+        return None, {}, f"Invalid JSON response: {exc}"
+    # `content` may be present but empty; indexing [0] on it would raise.
+    content = data.get("content") or []
+    text = content[0].get("text", "") if content else ""
+    return text, data.get("usage", {}), None
+
+
+def parse_json_object(text):
+    """Extract a JSON object from model output.
+
+    The whole text is tried first; if it is not a JSON object, the span from
+    the first "{" to the last "}" is parsed instead.
+
+    Returns:
+        The parsed dict, or None when no JSON object can be recovered. A
+        top-level array, string or number is never returned, because the
+        caller accesses the result with ``.get``.
+    """
+    try:
+        whole = json.loads(text)
+    except json.JSONDecodeError:
+        whole = None
+    if isinstance(whole, dict):
+        return whole
+
+    match = re.search(r"\{[\s\S]*\}", text)
+    if match:
+        try:
+            embedded = json.loads(match.group())
+        except json.JSONDecodeError:
+            return None
+        if isinstance(embedded, dict):
+            return embedded
+    return None
+
+
+def generate_control_id(domain, cur):
+    """Return the next free control id for a domain prefix.
+
+    Args:
+        domain: Domain name; its first four characters, upper-cased, form the prefix.
+        cur: Open cursor used to look up the highest existing number.
+
+    Returns:
+        ``PREFIX-NNN`` where NNN is one above the highest purely numeric
+        suffix found for that prefix, or ``PREFIX-001`` when none exists.
+        The number is zero-padded to at least three digits in both cases, so
+        follow-up ids have the same shape as the initial one.
+    """
+    prefix = domain.upper()[:4]
+    cur.execute("""
+        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
+        FROM compliance.canonical_controls
+        WHERE control_id LIKE %s
+          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
+    """, (f"{prefix}-%",))
+    row = cur.fetchone()
+    highest = row[0] if row and row[0] is not None else 0
+    return f"{prefix}-{highest + 1:03d}"
+
+
+def _compose_controls(conn, picked, dry_run):
+    """Turn each picked obligation into an atomic control and store it."""
+    cur = conn.cursor()
+
+    # Get parent control info
+    ctrl_ids = list(set(o["control_id"] for o in picked))
+    cur.execute("""
+        SELECT control_id, id, title, category, source_citation
+        FROM compliance.canonical_controls
+        WHERE control_id = ANY(%s)
+    """, (ctrl_ids,))
+    ctrl_map = {}
+    for row in cur.fetchall():
+        sc = row[4] if isinstance(row[4], dict) else (json.loads(row[4]) if row[4] else {})
+        # Derive domain prefix from control_id (e.g. "DSGV" from "DSGV-001")
+        prefix = row[0].split("-")[0] if "-" in row[0] else "COMP"
+        ctrl_map[row[0]] = {
+            "uuid": str(row[1]), "title": row[2], "category": row[3] or "",
+            "source_ref": f"{sc.get('source', '')} {sc.get('article', '')}",
+            # Keep the structured fields. Rebuilding them later by splitting
+            # source_ref on the first space breaks multi-word source names.
+            "source": sc.get("source", ""),
+            "article": sc.get("article", ""),
+            "domain": prefix,
+        }
+
+    print("=" * 70)
+    print(f"Pass 0b Preview — {len(picked)} Obligations → Atomic Controls")
+    print("=" * 70)
+
+    created = []
+    for i, obl in enumerate(picked, 1):
+        ctrl = ctrl_map.get(obl["control_id"], {})
+        print(f"\n{'─'*70}")
+        print(f"[{i}/{len(picked)}] {obl['control_id']}: [{obl['obligation_type'].upper()}]")
+        print(f" Obligation: {obl['obligation_text'][:120]}")
+        print(f" Parent: {ctrl.get('title', 'N/A')}")
+
+        if dry_run:
+            print(" [DRY RUN]")
+            continue
+
+        prompt = build_pass0b_prompt(
+            obl["obligation_text"], obl["action"], obl["object"],
+            ctrl.get("title", ""), ctrl.get("category", ""),
+            ctrl.get("source_ref", ""),
+        )
+
+        t0 = time.time()
+        resp_text, usage, error = call_anthropic(prompt)
+        elapsed = time.time() - t0
+
+        if error:
+            print(f" ERROR: {error}")
+            continue
+
+        result = parse_json_object(resp_text)
+        if not result:
+            print(f" PARSE ERROR: {resp_text[:200]}")
+            continue
+
+        in_tok = usage.get("input_tokens", 0)
+        out_tok = usage.get("output_tokens", 0)
+        print(f" LLM: {elapsed:.1f}s | {in_tok} in / {out_tok} out")
+
+        # Generate control_id
+        domain = ctrl.get("domain", "COMP")
+        new_control_id = generate_control_id(domain, cur)
+
+        # Show result
+        print(f"\n === ATOMIC CONTROL: {new_control_id} ===")
+        print(f" Titel: {result.get('title', 'N/A')}")
+        print(f" Ziel: {result.get('objective', 'N/A')}")
+        print(f" Typ: {obl['obligation_type']}")
+        reqs = result.get("requirements", [])
+        if reqs:
+            print(f" Anforderungen:")
+            for r in reqs:
+                print(f" - {r}")
+        tests = result.get("test_procedure", [])
+        if tests:
+            print(f" Pruefverfahren:")
+            for t in tests:
+                print(f" - {t}")
+        evidence = result.get("evidence", [])
+        if evidence:
+            print(f" Nachweise:")
+            for e in evidence:
+                print(f" - {e}")
+        print(f" Severity: {result.get('severity', 'medium')}")
+        print(f" Category: {result.get('category', 'governance')}")
+
+        # Write to DB
+        new_uuid = str(uuid.uuid4())
+        parent_uuid = ctrl.get("uuid")
+        source_cit = {}
+        if ctrl:
+            # Copy the parent's citation fields unchanged
+            source_cit = {"source": ctrl["source"], "article": ctrl["article"]}
+
+        cur.execute("""
+            INSERT INTO compliance.canonical_controls (
+                id, control_id, title, objective, requirements, test_procedure,
+                evidence, severity, category, release_state,
+                source_citation, generation_metadata, generation_strategy,
+                pipeline_version, parent_control_uuid, framework_id
+            ) VALUES (
+                %s, %s, %s, %s, %s, %s,
+                %s, %s, %s, %s,
+                %s, %s, %s,
+                %s, %s,
+                (SELECT id FROM compliance.canonical_control_frameworks LIMIT 1)
+            )
+        """, (
+            new_uuid, new_control_id,
+            result.get("title", ""),
+            result.get("objective", ""),
+            json.dumps(result.get("requirements", []), ensure_ascii=False),
+            json.dumps(result.get("test_procedure", []), ensure_ascii=False),
+            json.dumps(result.get("evidence", []), ensure_ascii=False),
+            result.get("severity", "medium"),
+            result.get("category", "governance"),
+            "draft",
+            psycopg2.extras.Json(source_cit),
+            psycopg2.extras.Json({
+                "obligation_type": obl["obligation_type"],
+                "obligation_text": obl["obligation_text"],
+                "pass0b_model": ANTHROPIC_MODEL,
+                "decomposition_method": "pass0b_preview",
+            }),
+            "pass0b_atomic",
+            6,  # pipeline_version
+            parent_uuid,
+        ))
+        # Commit per control so the next generate_control_id() sees this row
+        conn.commit()
+
+        created.append({
+            "control_id": new_control_id,
+            "title": result.get("title", ""),
+            "obligation_type": obl["obligation_type"],
+            "parent_control_id": obl["control_id"],
+        })
+        print(f" ✓ Geschrieben: {new_control_id} (parent: {obl['control_id']})")
+
+        time.sleep(0.5)
+
+    if created:
+        print(f"\n{'='*70}")
+        print(f"ERGEBNIS: {len(created)} atomare Controls erstellt")
+        print(f"{'='*70}")
+        for c in created:
+            print(f" {c['control_id']}: {c['title']} [{c['obligation_type']}] (von {c['parent_control_id']})")
+
+
+def main():
+    """CLI entry point: pick obligations from a Pass 0a result file and store
+    one atomic control per obligation.
+
+    Exits with status 1 when no API key is configured and --dry-run is not
+    given. Reads the database URL from ``DATABASE_URL`` (KeyError if unset).
+    The connection is closed on every exit path, including exceptions.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--input", default="/tmp/pass0a_results_60controls.json")
+    parser.add_argument("--limit", type=int, default=3, help="Number of obligations to process")
+    parser.add_argument("--control", type=str, help="Pick obligations from this control_id")
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    if not ANTHROPIC_API_KEY and not args.dry_run:
+        print("ERROR: Set ANTHROPIC_API_KEY")
+        sys.exit(1)
+
+    # Load 0a results
+    with open(args.input) as f:
+        obligations = json.load(f)
+
+    # Filter: only obligations that passed the quality gate
+    obligations = [o for o in obligations if o.get("passed", False)]
+
+    if args.control:
+        obligations = [o for o in obligations if o["control_id"] == args.control]
+
+    # Pick diverse sample: first one obligation per type ...
+    picked = []
+    seen_types = set()
+    for o in obligations:
+        otype = o["obligation_type"]
+        if otype not in seen_types and len(picked) < args.limit:
+            picked.append(o)
+            seen_types.add(otype)
+    # ... then fill the remaining slots in input order
+    for o in obligations:
+        if o not in picked and len(picked) < args.limit:
+            picked.append(o)
+
+    if not picked:
+        print("No obligations found.")
+        return
+
+    # Connect to DB
+    db_url = os.environ["DATABASE_URL"]
+    p = urllib.parse.urlparse(db_url)
+    conn = psycopg2.connect(
+        host=p.hostname, port=p.port or 5432,
+        user=p.username, password=p.password,
+        dbname=p.path.lstrip("/"),
+        options="-c search_path=compliance,public",
+    )
+    try:
+        _compose_controls(conn, picked, args.dry_run)
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":
+    main()