diff --git a/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx b/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx index e899cde..dbc96d1 100644 --- a/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx +++ b/admin-compliance/app/sdk/control-library/components/ControlDetail.tsx @@ -9,6 +9,7 @@ import { import { CanonicalControl, EFFORT_LABELS, BACKEND_URL, SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge, + ObligationTypeBadge, GenerationStrategyBadge, VERIFICATION_METHODS, CATEGORY_OPTIONS, } from './helpers' @@ -125,6 +126,8 @@ export function ControlDetail({ + +

{ctrl.title}

@@ -239,6 +242,32 @@ export function ControlDetail({ )} + {/* Parent Control (atomare Controls) */} + {ctrl.parent_control_uuid && ( +
+
+ +

Atomares Control

+ +
+

+ Abgeleitet aus Eltern-Control{' '} + + {ctrl.parent_control_id || ctrl.parent_control_uuid} + + {ctrl.parent_control_title && ( + — {ctrl.parent_control_title} + )} +

+ {ctrl.generation_metadata?.obligation_text && ( +

+ Obligation: {String(ctrl.generation_metadata.obligation_text).slice(0, 300)} + {String(ctrl.generation_metadata.obligation_text).length > 300 ? '...' : ''} +

+ )} +
+ )} + {/* Impliziter Gesetzesbezug (Rule 3 — reformuliert, kein Originaltext) */} {!ctrl.source_citation && ctrl.open_anchors.length > 0 && (
@@ -297,7 +326,7 @@ export function ControlDetail({
)} - {/* Evidence */} + {/* Evidence — handles both {type, description} objects and plain strings */} {ctrl.evidence.length > 0 && (

Nachweise

@@ -305,7 +334,11 @@ export function ControlDetail({ {ctrl.evidence.map((ev, i) => (
-
{ev.type}: {ev.description}
+ {typeof ev === 'string' ? ( +
{ev}
+ ) : ( +
{ev.type}: {ev.description}
+ )}
))} @@ -359,7 +392,18 @@ export function ControlDetail({

Generierungsdetails (intern)

-

Pfad: {String(ctrl.generation_metadata.processing_path || '-')}

+ {ctrl.generation_metadata.processing_path && ( +

Pfad: {String(ctrl.generation_metadata.processing_path)}

+ )} + {ctrl.generation_metadata.decomposition_method && ( +

Methode: {String(ctrl.generation_metadata.decomposition_method)}

+ )} + {ctrl.generation_metadata.pass0b_model && ( +

LLM: {String(ctrl.generation_metadata.pass0b_model)}

+ )} + {ctrl.generation_metadata.obligation_type && ( +

Obligation-Typ: {String(ctrl.generation_metadata.obligation_type)}

+ )} {ctrl.generation_metadata.similarity_status && (

Similarity: {String(ctrl.generation_metadata.similarity_status)}

)} diff --git a/admin-compliance/app/sdk/control-library/components/helpers.tsx b/admin-compliance/app/sdk/control-library/components/helpers.tsx index 146f7ac..f48f743 100644 --- a/admin-compliance/app/sdk/control-library/components/helpers.tsx +++ b/admin-compliance/app/sdk/control-library/components/helpers.tsx @@ -30,7 +30,7 @@ export interface CanonicalControl { } requirements: string[] test_procedure: string[] - evidence: EvidenceItem[] + evidence: (EvidenceItem | string)[] severity: string risk_score: number | null implementation_effort: string | null @@ -47,6 +47,10 @@ export interface CanonicalControl { target_audience: string | string[] | null generation_metadata?: Record | null generation_strategy?: string | null + parent_control_uuid?: string | null + parent_control_id?: string | null + parent_control_title?: string | null + decomposition_method?: string | null created_at: string updated_at: string } @@ -275,7 +279,26 @@ export function GenerationStrategyBadge({ strategy }: { strategy: string | null if (strategy === 'document_grouped') { return v2 } - return null + if (strategy === 'phase74_gap_fill') { + return v5 Gap + } + if (strategy === 'pass0b_atomic') { + return Atomar + } + return {strategy} +} + +export const OBLIGATION_TYPE_CONFIG: Record = { + pflicht: { bg: 'bg-red-100 text-red-700', label: 'Pflicht' }, + empfehlung: { bg: 'bg-amber-100 text-amber-700', label: 'Empfehlung' }, + kann: { bg: 'bg-green-100 text-green-700', label: 'Kann' }, +} + +export function ObligationTypeBadge({ type }: { type: string | null | undefined }) { + if (!type) return null + const config = OBLIGATION_TYPE_CONFIG[type] + if (!config) return null + return {config.label} } export function getDomain(controlId: string): string { diff --git a/admin-compliance/app/sdk/control-library/page.tsx b/admin-compliance/app/sdk/control-library/page.tsx index f67f80e..aaa5f32 100644 --- a/admin-compliance/app/sdk/control-library/page.tsx +++ 
b/admin-compliance/app/sdk/control-library/page.tsx @@ -9,7 +9,7 @@ import { import { CanonicalControl, Framework, BACKEND_URL, EMPTY_CONTROL, SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge, - GenerationStrategyBadge, + GenerationStrategyBadge, ObligationTypeBadge, VERIFICATION_METHODS, CATEGORY_OPTIONS, TARGET_AUDIENCE_OPTIONS, } from './components/helpers' import { ControlForm } from './components/ControlForm' @@ -762,6 +762,7 @@ export default function ControlLibraryPage() { + {ctrl.risk_score !== null && ( Score: {ctrl.risk_score} )} diff --git a/backend-compliance/compliance/api/canonical_control_routes.py b/backend-compliance/compliance/api/canonical_control_routes.py index df44dda..992a5a2 100644 --- a/backend-compliance/compliance/api/canonical_control_routes.py +++ b/backend-compliance/compliance/api/canonical_control_routes.py @@ -174,6 +174,9 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale, customer_visible, verification_method, category, target_audience, generation_metadata, generation_strategy, applicable_industries, applicable_company_size, scope_conditions, + parent_control_uuid, decomposition_method, pipeline_version, + (SELECT p.control_id FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_id, + (SELECT p.title FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_title, created_at, updated_at""" @@ -798,6 +801,11 @@ def _control_row(r) -> dict: "applicable_industries": getattr(r, "applicable_industries", None), "applicable_company_size": getattr(r, "applicable_company_size", None), "scope_conditions": getattr(r, "scope_conditions", None), + "parent_control_uuid": str(r.parent_control_uuid) if getattr(r, "parent_control_uuid", None) else None, + "parent_control_id": getattr(r, "parent_control_id", None), + "parent_control_title": getattr(r, "parent_control_title", None), + 
"decomposition_method": getattr(r, "decomposition_method", None), + "pipeline_version": getattr(r, "pipeline_version", None), "created_at": r.created_at.isoformat() if r.created_at else None, "updated_at": r.updated_at.isoformat() if r.updated_at else None, } diff --git a/backend-compliance/compliance/api/dsfa_routes.py b/backend-compliance/compliance/api/dsfa_routes.py index dcd9ce7..f9c79c7 100644 --- a/backend-compliance/compliance/api/dsfa_routes.py +++ b/backend-compliance/compliance/api/dsfa_routes.py @@ -200,6 +200,9 @@ def _get_tenant_id(tenant_id: Optional[str]) -> str: def _dsfa_to_response(row) -> dict: """Convert a DB row to a JSON-serializable dict.""" import json + # SQLAlchemy 2.0: Row objects need ._mapping for string-key access + if hasattr(row, "_mapping"): + row = row._mapping def _parse_arr(val): """Parse a JSONB array field → list.""" @@ -558,8 +561,9 @@ async def create_dsfa( ).fetchone() db.flush() + row_id = row._mapping["id"] if hasattr(row, "_mapping") else row[0] _log_audit( - db, tid, row["id"], "CREATE", request.created_by, + db, tid, row_id, "CREATE", request.created_by, new_values={"title": request.title, "status": request.status}, ) db.commit() diff --git a/backend-compliance/migrations/074_control_dedup.sql b/backend-compliance/migrations/074_control_dedup.sql new file mode 100644 index 0000000..81cb495 --- /dev/null +++ b/backend-compliance/migrations/074_control_dedup.sql @@ -0,0 +1,73 @@ +-- Migration 074: Control Dedup Engine — DB Schema +-- Supports the 4-stage dedup pipeline for atomic controls (Pass 0b). +-- +-- Tables: +-- 1. control_parent_links — M:N parent linking (one control → many regulations) +-- 2. control_dedup_reviews — Review queue for borderline matches (0.85-0.92) + +BEGIN; + +-- ============================================================================= +-- 1. Control Parent Links (M:N) +-- Enables "1 Control erfuellt 5 Gesetze" — the biggest USP. 
+-- An atomic control can have multiple parent controls from different +-- regulations/obligations. This replaces the 1:1 parent_control_uuid FK. +-- ============================================================================= + +CREATE TABLE IF NOT EXISTS control_parent_links ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE, + parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE, + link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition' + CHECK (link_type IN ('decomposition', 'dedup_merge', 'manual', 'crosswalk')), + confidence NUMERIC(3,2) DEFAULT 1.0 + CHECK (confidence >= 0 AND confidence <= 1), + source_regulation VARCHAR(100), + source_article VARCHAR(100), + obligation_candidate_id UUID REFERENCES obligation_candidates(id), + created_at TIMESTAMPTZ DEFAULT NOW(), + CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid) +); + +CREATE INDEX IF NOT EXISTS idx_cpl_control ON control_parent_links(control_uuid); +CREATE INDEX IF NOT EXISTS idx_cpl_parent ON control_parent_links(parent_control_uuid); +CREATE INDEX IF NOT EXISTS idx_cpl_type ON control_parent_links(link_type); + +COMMENT ON TABLE control_parent_links IS + 'M:N parent links — one atomic control can fulfill multiple regulations/obligations. USP: "1 Control erfuellt 5 Gesetze"'; + +-- ============================================================================= +-- 2. Control Dedup Reviews +-- Queue for borderline matches (similarity 0.85-0.92) that need human review. +-- Reviewed entries get status updated to accepted/rejected. 
+-- ============================================================================= + +CREATE TABLE IF NOT EXISTS control_dedup_reviews ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + candidate_control_id VARCHAR(30) NOT NULL, + candidate_title TEXT NOT NULL, + candidate_objective TEXT, + matched_control_uuid UUID REFERENCES canonical_controls(id), + matched_control_id VARCHAR(30), + similarity_score NUMERIC(4,3) DEFAULT 0.0, + dedup_stage VARCHAR(40) NOT NULL, + dedup_details JSONB DEFAULT '{}', + parent_control_uuid UUID REFERENCES canonical_controls(id), + obligation_candidate_id UUID REFERENCES obligation_candidates(id), + review_status VARCHAR(20) DEFAULT 'pending' + CHECK (review_status IN ('pending', 'accepted_link', 'accepted_new', 'rejected')), + reviewed_by VARCHAR(100), + reviewed_at TIMESTAMPTZ, + review_notes TEXT, + created_at TIMESTAMPTZ DEFAULT NOW() +); + +CREATE INDEX IF NOT EXISTS idx_cdr_status ON control_dedup_reviews(review_status); +CREATE INDEX IF NOT EXISTS idx_cdr_matched ON control_dedup_reviews(matched_control_uuid); +CREATE INDEX IF NOT EXISTS idx_cdr_parent ON control_dedup_reviews(parent_control_uuid); +CREATE INDEX IF NOT EXISTS idx_cdr_stage ON control_dedup_reviews(dedup_stage); + +COMMENT ON TABLE control_dedup_reviews IS + 'Review queue for borderline dedup matches (similarity 0.85-0.92). 
Human decides: link or new control.'; + +COMMIT; diff --git a/backend-compliance/tests/test_canonical_control_routes.py b/backend-compliance/tests/test_canonical_control_routes.py index 9097294..867537f 100644 --- a/backend-compliance/tests/test_canonical_control_routes.py +++ b/backend-compliance/tests/test_canonical_control_routes.py @@ -195,6 +195,11 @@ class TestControlRowConversion: "release_state": "draft", "tags": ["mfa"], "generation_strategy": "ungrouped", + "parent_control_uuid": None, + "parent_control_id": None, + "parent_control_title": None, + "decomposition_method": None, + "pipeline_version": None, "created_at": now, "updated_at": now, } diff --git a/docs-src/development/qa-control-quality.md b/docs-src/development/qa-control-quality.md index 7b26c43..3072e9d 100644 --- a/docs-src/development/qa-control-quality.md +++ b/docs-src/development/qa-control-quality.md @@ -2,7 +2,23 @@ ## Übersicht -Die Control Quality Pipeline prüft und verbessert die ~9.000 Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert. +Die Control Quality Pipeline prüft und verbessert die Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert. + +Alle Scripts liegen in **`scripts/qa/`**. 
Starten auf dem Mac Mini via Runner-Script: + +```bash +# Job starten (laedt .env automatisch, PID-Lock, unbuffered output) +ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh <script.py> [args...]" + +# Status aller Jobs +ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --status" + +# Log ansehen +ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --log <job>" + +# Job stoppen +ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --kill <job>" +``` ## Architektur @@ -55,20 +71,24 @@ Jeder Control hat ein Feld `source_original_text` — der Chunk-Text aus dem Que | Metrik | Wert | |---|---| -| Controls mit source_original_text | 7.943 | -| Im PDF lokalisiert | **6.259 (79%)** | -| Nicht gefunden (Sprachmismatch) | 1.651 | -| Kein PDF vorhanden | 33 | -| 100% Match-Rate | 19 Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53) | +| Controls mit source_original_text | 5.741 (86%) | +| Im PDF lokalisiert | **5.063 (88%)** | +| Nicht gefunden | 649 | +| Kein PDF vorhanden | 29 | +| Recital_suspect markiert | 648 | +| 100% Match-Rate | 20+ Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53, Blue Guide) | + +**Verlauf:** v1 (4.110, 52%) → v2 (6.091, 77%) → v3 (6.259, 79%) → v4 +Blue Guide EN (6.803, 86%) → v5 nach Cleanup (5.063/5.741, 88%) ### Nicht-matchende Controls -| Ursache | Controls | Erklärung | +| Ursache | Controls | Status | |---|---|---| -| Blue Guide EN vs. DE PDF | ~562 | Controls aus englischem PDF, wir haben nur deutsches | -| OWASP multilingual | ~632 | Controls aus PT/AR/ID/ES-Übersetzungen | +| ~~Blue Guide EN vs. 
DE PDF~~ | ~~562~~ | ✅ Gelöst — EN-PDF beschafft, 544/544 gematcht | +| ~~OWASP Top 10 multilingual~~ | ~~324~~ | ✅ Als duplicate markiert — Übersetzungen ohne Mehrwert | | CRA Encoding | ~76 | PDF-Ligaturen/Sonderzeichen-Differenzen | | CISA Secure by Design | ~113 | Falsches PDF (ENISA statt CISA) | +| OWASP ASVS | ~173 | PDF-Matching-Problem (meist EN) | ## Brute-Force-Suche @@ -100,34 +120,276 @@ Controls aus Erwägungsgründen (`article_type = preamble`) sind **kein Nachteil Die 1.195 v1-Controls **ohne** Originaltext sind manuell erstellt (`strategy=ungrouped`) und haben keine Chunk-Referenz. -## DB-Status (Stand 2026-03-20) +## OWASP Cleanup (2026-03-20) + +- **324 OWASP Top 10 multilingual Controls** → `duplicate` markiert (ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien) +- **47 Controls** mit falscher Quellenzuordnung korrigiert (z.B. als "OWASP Top 10" getaggt, aber tatsächlich aus ASVS/SAMM/API/MASVS) +- **~200 OWASP ASVS/SAMM/MASVS EN Controls** behalten — unique Content aus GitHub/Website, nicht im PDF auffindbar + +## NIST OSCAL Import (2026-03-20) + +**776 neue Controls** aus NIST SP 800-53 Rev 5 OSCAL (Public Domain, maschinenlesbar): + +- Quelle: `usnistgov/oscal-content` (JSON Catalog) +- Vor allem **Control Enhancements** (z.B. 
AC-2(3), SC-7(8)) — die atomaren Unteranforderungen +- Jeder Control enthält: Statement + Guidance + Assessment-Methoden + Cross-References + Parameters +- `pipeline_version = 4`, `generation_strategy = 'oscal_import'` +- Kein Pass 0a/0b nötig — Controls sind **bereits atomar** + +| Metrik | Vorher | Nachher | +|---|---|---| +| SP 800-53 Controls (aktiv) | 1.107 | **1.883** | +| OSCAL-Abdeckung | 238/1.014 (23%) | **1.014/1.014 (100%)** | + +## Phase 5: RAG-Deduplizierung + Normalisierung (2026-03-20) + +### Durchgeführte Schritte + +| Schritt | Beschreibung | Controls | +|---|---|---| +| 5.1 | OSCAL Controls: `source_regulation` in generation_metadata gesetzt | 776 | +| 5.2 | v3 Controls ohne Source → `needs_review` mit `missing_source` Flag | 20 | +| 5.3 | Leerer Source-Name korrigiert (AT TKG) | 1 | +| 5.4 | OWASP regulation_code Fehlzuordnungen korrigiert | 47 | +| 5.5 | **duplicate/too_close Controls hart gelöscht** | **3.301** | +| 5.6 | Processed Chunks bereinigt (gelöschte Control-IDs entfernt) | 2.520 | + +### Ergebnis + +- **Vorher:** 9.936 Controls (6.635 aktiv, 2.998 duplicate, 303 too_close) +- **Nachher:** 6.635 Controls, **alle aktiv** (0 duplicate/too_close) +- Alle regulation_codes haben jetzt einheitliche Source-Namen +- OWASP-Controls sind korrekt ihren Quellen zugeordnet + +## DB-Status (Stand 2026-03-20, nach Phase 7.4) | release_state | Count | |---|---| -| draft | 5.365 | -| needs_review | 818 | -| duplicate | 2.674 | -| too_close | 303 | -| **Aktiv** | **6.183** | +| draft | ~6.030 | +| needs_review | 838 | +| **Gesamt** | **6.868** | -## Scripts +## Scripts (`scripts/qa/`) -Alle QA-Scripts liegen in `scripts/qa/`: +### Kern-QA (PDF-Matching) | Script | Beschreibung | |---|---| -| `pdf_qa_all.py` | Haupt-QA: Controls gegen PDFs matchen | -| `pdf_qa_inventory.py` | Inventar: Regulations, Controls, PDFs | -| `apply_pdf_qa_results.py` | Ergebnisse in DB schreiben | -| `preamble_dedup.py` | Preamble vs. 
Artikel Duplikat-Erkennung | -| `qa_dedup_controls.py` | Jaccard-basierte Titel-Dedup | -| `qa_normalize_sources.py` | Source-Namen normalisieren | -| `db_status.py` | DB-Status-Übersicht | +| `pdf_qa_all.py` | **Haupt-QA**: Controls gegen PDFs matchen, Artikel-Index aufbauen. Enthaelt `SOURCE_FILE_MAP`, alle Index-Builder (EU, DE, NIST, OWASP, generic). 526 Zeilen. | +| `pdf_qa_inventory.py` | Inventar: Welche Regulations haben Controls, wie viele, welche PDFs existieren | +| `apply_pdf_qa_results.py` | Ergebnisse aus `pdf_qa_all.py` in DB schreiben (`article_type`, `recital_suspect`) | +| `pdf_article_lookup_poc.py` | POC: Control-Text in PDF lokalisieren, Headings von Cross-Refs unterscheiden | -## Nächste Schritte +### Lueckenanalyse + Control-Generierung -1. **Blue Guide EN-PDF** beschaffen → +562 Controls matchen -2. **CISA Secure by Design** echtes PDF finden → +113 Controls -3. **Brute-Force Ergebnisse anwenden** — 44 falsche Source-Zuordnungen korrigieren -4. **Frontend-Anzeige** — `article_type` im Control-Detail anzeigen -5. **Continuous QA** — Bei neuen Controls automatisch PDF-Match prüfen +| Script | Beschreibung | +|---|---| +| `gap_analysis.py` | **Phase 7.3**: Artikel im PDF vs. Controls in DB vergleichen, Luecken identifizieren | +| `phase74_generate_gap_controls.py` | **Phase 7.4**: Neue Controls fuer Luecken via Anthropic API generieren. `pipeline_version=5`. 624 Zeilen. | +| `benchmark_llm_controls.py` | LLM-Vergleich: gpt-oss-120b vs. Claude Sonnet fuer Control-Generierung | +| `test_pass0a.py` | **Pass 0a Test**: Obligation Extraction + 3-Tier-Klassifizierung (Pflicht/Empfehlung/Kann). Standalone, speichert JSON. | + +### Deduplizierung + Normalisierung + +| Script | Beschreibung | +|---|---| +| `preamble_dedup.py` | Preamble vs. 
Artikel Duplikat-Erkennung (Jaccard >= 0.40) | +| `qa_dedup_controls.py` | Jaccard-basierte Titel-Deduplizierung | +| `qa_apply_and_dedup.py` | Ergebnisse anwenden + Duplikate in einem Schritt markieren | +| `qa_normalize_sources.py` | Source-Namen normalisieren (kanonische Namen) | +| `phase5_normalize_and_cleanup.py` | **Phase 5**: Normalisierung + 3.301 Duplikate hart loeschen | +| `qa_delete_gpsr_dupe.py` | GPSR-Duplikate loeschen | +| `delete_gpsr_prod.py` | GPSR-Duplikate aus Production-Qdrant entfernen | + +### Quellen-spezifische Scripts + +| Script | Beschreibung | +|---|---| +| `blue_guide_en_match.py` | Blue Guide EN-PDF matchen (544/544 Erfolg) | +| `owasp_cleanup.py` | OWASP multilingual Cleanup (324 Duplikate) + Source-Fix (47 korrigiert) | +| `owasp_github_match.py` | OWASP ASVS/SAMM/MASVS gegen GitHub-Markdown matchen | +| `oscal_import.py` | NIST OSCAL Import (776 Controls aus JSON Catalog) | +| `oscal_analysis.py` | NIST OSCAL Analyse: Abdeckung, fehlende Controls | + +### Diagnose + Utilities + +| Script | Beschreibung | +|---|---| +| `db_status.py` | DB-Status: release_state Counts, pipeline_version, source Verteilung | +| `debug_low_match.py` | Debugging: Warum matchen Blue Guide / OWASP / CISA schlecht? 
| +| `qa_article_map_all_chunks.py` | Alle Chunks Artikel-Nummern zuordnen (Bulk) | +| `backfill_job_66228863.py` | Einmaliger Backfill-Job | +| `sync_controls_to_prod.py` | Controls von Dev nach Production synchronisieren | + +### Runner + +| Script | Beschreibung | +|---|---| +| `run_job.sh` | **Job-Runner**: Laedt `.env`, PID-Lock, Monitoring (`--status`, `--log`, `--kill`) | + +## Phase 7: PDF-Validierung + Enrichment (2026-03-20) + +### 7.1 + 7.2: Controls gegen PDFs validiert + Ergebnisse angewendet ✅ + +- 5.063 Controls erfolgreich im Original-PDF lokalisiert (88%) +- `article_type` fuer alle gematchten Controls gesetzt +- 648 Preamble-Controls als `recital_suspect` in `generation_metadata` markiert +- 332 Controls nicht matchbar (OWASP ASVS 132, CISA 72, ENISA 38, OWASP SAMM 31, CRA 28) + +### 7.3: Lueckenanalyse ✅ + +**494 Artikel-Luecken** in 15 Quellen identifiziert. Geschaetzt ~300 davon actionable. + +| Source | Luecken | Coverage | Bemerkung | +|---|---:|---:|---| +| AML-Verordnung | 91 | 5% | Kaum ingestiert | +| MiCA | 71 | 52% | Grosse Verordnung | +| NIST SP 800-53 | 59 | 83% | Meist Section-Header, nur SA-15 fehlt | +| OWASP ASVS 4.0 | 47 | 35% | Requirement-Gruppen fehlen | +| Batterieverordnung | 41 | 58% | | +| DSGVO | 35 | 65% | Einige Governance/Aufsicht-Artikel | +| ENISA ICS/SCADA | 34 | 31% | | +| ENISA Supply Chain | 26 | 7% | | +| CRA | 23 | 68% | | +| NIS2 | 16 | 65% | | +| KI-Verordnung | 15 | 87% | Fast komplett | +| Maschinenverordnung | 5 | 91% | Fast komplett | + +### 7.4: Neue Controls fuer Luecken generieren ✅ (2026-03-20) + +Script: `phase74_generate_gap_controls.py --resume` + +- **494 Artikel-Luecken** in 15 Quellen → Anthropic Claude Sonnet 4.6 +- `pipeline_version = 5`, `generation_strategy = 'phase74_gap_fill'` +- Direkt PDF-Text als Input (nicht RAG-Chunks) +- Starten via: `run_job.sh phase74_generate_gap_controls.py --resume` + +**Ergebnis:** + +| Source | Luecken | Generiert | +|---|---:|---:| +| AML-Verordnung | 91 | 
97 | +| MiCA | 71 | 68 | +| NIST SP 800-53 | 59 | 19 | +| KI-Verordnung | 15 | 15 | +| OWASP ASVS 4.0 | 47 | 11 | +| Batterieverordnung | 41 | 9 | +| DSGVO | 35 | 4 | +| OWASP Top 10 | 12 | 3 | +| NIS2 | 16 | 3 | +| CRA | 23 | 3 | +| OECD KI-Empfehlung | 4 | 1 | +| **Gesamt** | **494** | **233** | + +Nicht generiert: 75 zu kurzer Text, 29 NIST-Intros, 11 Parse-Errors, 162 ID-Konflikte (COMP-1000 etc.). +API-Kosten: ~$7,55 (109 min Laufzeit). + +## Pass 0a: Obligation Extraction — 3-Tier-Klassifizierung + +### Konzept + +Pass 0a zerlegt Rich Controls (~6.000) in **atomare Obligations** per LLM (Claude Sonnet 4.6). +Jede Obligation wird durch den **Quality Gate** klassifiziert — nicht gefiltert: + +| obligation_type | Signal | Beispiel | +|---|---|---| +| **pflicht** | müssen, muss, ist zu, hat zu, shall, must, required | "Der Betreiber muss alle Daten verschluesseln" | +| **empfehlung** | soll, sollen, should, sicherstellen, gewaehrleisten, dokumentieren | "Der Betreiber soll regelmaessige Audits durchfuehren" | +| **kann** | kann, koennen, darf, duerfen, may, optional | "Der Betreiber kann zusaetzliche Massnahmen ergreifen" | + +**Wichtig:** Nichts wird mehr rejected wegen fehlendem normativem Signal. Obligations ohne Signal werden als `empfehlung` klassifiziert. Rejected werden nur noch: Evidence-Only, zu kurz (<20 Zeichen), fehlender Parent-Link. + +### Warum auch Empfehlungen behalten? + +Empfehlungen helfen Firmen, ihre Systeme sicherer zu machen — ueber das Pflichtprogramm hinaus. Im Frontend erhalten Kunden einen Marker, der klar anzeigt: + +- **Pflicht** = gesetzlich/regulatorisch vorgeschrieben +- **Empfehlung** = Best Practice, freiwillig, aber wertvoll +- **Kann** = optional, weitergehende Massnahme + +### Quality Gate — Kritische Flags + +| Flag | Kritisch? 
| Beschreibung | +|---|---|---| +| `has_normative_signal` | Nein | Informativer Check, kein Ablehnungsgrund | +| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) | +| `not_evidence_only` | **Ja** | Kein reiner Nachweis-Eintrag | +| `min_length` | **Ja** | Mindestens 20 Zeichen | +| `has_parent_link` | **Ja** | Verbindung zum Parent-Control | +| `single_action` | Nein | Nur ein Hauptverb (heuristisch) | +| `not_rationale` | Nein | Keine reine Begruendung | + +### Normative Signal Detection — Regex-Tiers + +``` +Tier 1 (Pflicht): muessen, muss, ist/sind/hat/haben zu + Infinitiv, + Compound-Verben (festzustellen, vorzunehmen), + Gerundivum (mitzuteilen, bereitzustellen), + shall, must, required + +Tier 2 (Empfehlung): soll, sollen, sollte, sollten, + gewaehrleisten, sicherstellen, + should, ensure, recommend, + dokumentieren, implementieren, ueberpruefen + +Tier 3 (Kann): kann, koennen, darf, duerfen, may, optional +``` + +### Testergebnisse (3 Iterationen, 2026-03-20) + +| Run | Controls | Obligations | Validated | Rejected | Kosten | +|---|---:|---:|---:|---:|---:| +| 1 (v0 Regex) | 10 | ~100 | 68% | 32% | $0,28 | +| 2 (v1 Regex) | 50 | ~530 | 78% | 22% | $1,43 | +| 3 (v2 Regex) | 50 | ~530 | 86% | 14% | $1,44 | +| 4 (3-Tier) | 60 | — | — | — | — | + +Run 4 laeuft mit dem neuen Klassifizierer — statt PASS/REJECT wird jetzt PFLICHT/EMPFEHLUNG/KANN ausgegeben. + +### Scripts + +| Script | Beschreibung | +|---|---| +| `test_pass0a.py` | **Test-Script**: Standalone (kein SQLAlchemy), psycopg2 + Anthropic API. Speichert Ergebnisse als JSON. 
| + +```bash +# Test mit 10 Controls +run_job.sh test_pass0a.py --limit 10 + +# Test mit bestimmter Quelle +run_job.sh test_pass0a.py --limit 20 --source "DSGVO" + +# Ergebnisse: /tmp/pass0a_results_controls.json +``` + +### Backend-Code + +- **Klassifizierung:** `backend-compliance/compliance/services/decomposition_pass.py` + - `classify_obligation_type()` — 3-Tier-Klassifizierung + - `quality_gate()` — gibt `obligation_type` in Flags zurueck + - `passes_quality_gate()` — `has_normative_signal` nicht mehr kritisch + - `ObligationCandidate.obligation_type` — neues Feld + +### Hochrechnung (basierend auf 50-Control-Runs) + +| Metrik | Wert | +|---|---| +| Kosten pro Control | ~$0,029 | +| Kosten fuer ~6.000 Controls | **~$172** | +| Laufzeit (geschaetzt) | ~25h | +| Obligations pro Control | ~10,5 | + +--- + +## Naechste Schritte + +1. ~~**Phase 5 Cleanup** → 3.301 Duplikate geloescht, Source normalisiert~~ ✅ +2. ~~**Phase 6 Pipeline-Haertung** → Source aus REGULATION_LICENSE_MAP~~ ✅ +3. ~~**Phase 7.1-7.3** → PDF-Validierung + Enrichment + Lueckenanalyse~~ ✅ +4. ~~**Phase 7.4** → 233 neue Controls fuer Luecken generiert ($7,55)~~ ✅ +5. **Pass 0a** → Obligation Extraction mit 3-Tier-Klassifizierung (Tests laufen, ~$172) +6. **Pass 0b** → Atomic Control Composition aus validierten Obligations +7. **Pass 1-5** → Multi-Layer Migration (Code + 500 Tests bereits vorhanden) +8. **Phase 8** → Qdrant Re-Ingestion (Runtime-Betrieb, ZULETZT) +9. **needs_review Triage** — 838 Controls klassifizieren +10. **Frontend** — `obligation_type` (Pflicht/Empfehlung/Kann) + `article_type` anzeigen diff --git a/docs-src/development/rag-pipeline-benchmark.md b/docs-src/development/rag-pipeline-benchmark.md new file mode 100644 index 0000000..2250a05 --- /dev/null +++ b/docs-src/development/rag-pipeline-benchmark.md @@ -0,0 +1,206 @@ +# RAG Pipeline Benchmark & Optimierungen + +Stand: 2026-03-21. Vergleich unserer Implementierung mit State of the Art. 
Priorisierte Empfehlungen nach Impact/Effort. + +--- + +## Aktuelle Pipeline (Ist-Zustand) + +```mermaid +flowchart LR + A[Dokumente] -->|Document Crawler| B[Chunks 512/50] + B -->|bge-m3| C[Qdrant Dense] + C -->|Cosine Search| D[Control Generator v2] + D -->|LLM| E[Rich Controls 6.373] + E -->|Pass 0a| F[Obligations] + F -->|Pass 0b| G[Atomare Controls] + G -->|4-Stage Dedup| H[Master Controls ~18K] +``` + +| Komponente | Implementierung | SOTA-Bewertung | +|-----------|----------------|----------------| +| **Chunking** | Rekursiv, 512 Zeichen, 50 Overlap | Zu klein fuer Rechtstexte | +| **Embedding** | bge-m3 (1024-dim, Ollama) | Gut, aber nur Dense genutzt | +| **Vector DB** | Qdrant mit Payload-Filtering | Hybrid Search nicht aktiviert | +| **Retrieval** | Pure Dense Cosine Similarity | Kein Re-Ranking, kein BM25 | +| **Extraktion** | 3-Tier (Exact → Embedding → LLM) | Solide Architektur | +| **Dedup** | 4-Stage (Pattern → Action → Object → Embedding) | Ueberdurchschnittlich | +| **QA** | 5-Metrik Similarity + PDF-QA Matching | Gut, RAGAS fehlt | + +--- + +## Tier 1: Quick Wins (Tage, nicht Wochen) + +### 1. Chunk-Groesse erhoehen: 512 → 1024, Overlap 50 → 128 + +**Problem:** NAACL 2025 Vectara-Studie zeigt: fuer analytische/juristische Queries sind 512-1024 Token optimal. Unsere 512-Zeichen-Chunks (= ~128 Token) sind deutlich zu klein. + +**Unsere Lessons Learned:** "Chunks werden mitten im Absatz abgeschnitten. Artikel- und Paragraphennummern fehlen." + +**Aenderung:** Config-Parameter in `ingest-phase-h.sh` anpassen. + +| Metrik | Vorher | Nachher | +|--------|--------|---------| +| Chunk Size | 512 chars (~128 Token) | 1024 chars (~256 Token) | +| Overlap | 50 chars (10%) | 128 chars (12.5%) | + +**Impact:** HOCH | **Effort:** NIEDRIG + +### 2. Ollama JSON-Mode fuer Obligation Extraction + +**Problem:** `_parse_json` in `decomposition_pass.py` hat Regex-Fallback — das zeigt, dass LLM-Output nicht zuverlaessig JSON ist. 
+ +**Aenderung:** `format: "json"` in Ollama-API-Calls setzen. + +**Impact:** MITTEL | **Effort:** NIEDRIG (1 Parameter) + +### 3. Chain-of-Thought Prompting fuer Pass 0a/0b + +**Problem:** LegalGPT-Framework zeigt: explizite Reasoning-Chains ("Erst Adressat identifizieren, dann Aktion, dann normative Staerke") verbessern Extraktionsqualitaet signifikant. + +**Impact:** MITTEL | **Effort:** NIEDRIG (Prompt Engineering) + +--- + +## Tier 2: High Impact, Medium Effort (1-2 Wochen) + +### 4. Hybrid Search (Dense + Sparse) via Qdrant + +**Problem:** Reine Dense-Suche. Juristische Queries enthalten spezifische Begriffe ("DSGVO Art. 35", "Abs. 3"), die BM25/Sparse besser findet. + +**Loesungsansatz:** BGE-M3 generiert bereits Sparse Vectors — wir verwerfen sie aktuell! + +``` +Qdrant Query API: +- Dense: bge-m3 Cosine (wie bisher) +- Sparse: bge-m3 Sparse Vectors (neu) +- Fusion: Reciprocal Rank Fusion (RRF) +``` + +**Benchmarks (Anthropic):** 49% weniger fehlgeschlagene Retrievals mit Contextual Retrieval, 67% mit Re-Ranking. + +**Impact:** SEHR HOCH | **Effort:** MITTEL + +### 5. Cross-Encoder Re-Ranking + +**Problem:** Top-5 Ergebnisse direkt an LLM — keine Qualitaetspruefung der Retrieval-Ergebnisse. + +**Loesungsansatz:** BGE Reranker v2 (MIT-Lizenz) auf Top-20 Ergebnisse, dann Top-5 an LLM. + +| Re-Ranker | Lizenz | Empfehlung | +|-----------|--------|------------| +| BGE Reranker v2 | MIT | Empfohlen | +| Jina Reranker v2 | Apache-2.0 | Alternative | +| ColBERT v2 | MIT | Spaeter | + +**Impact:** HOCH | **Effort:** MITTEL + +### 6. Cross-Regulation Dedup Pass + +**Problem:** Dedup filtert immer nach `pattern_id` — Controls aus DSGVO Art. 25 und NIS2 Art. 21 (beide Security-by-Design) werden nie verglichen. + +**Loesungsansatz:** Zweiter Qdrant-Search ohne `pattern_id`-Filter nach dem normalen Dedup-Pass. + +**Impact:** HOCH | **Effort:** MITTEL + +### 7. 
Automatische Regressionstests (Golden Set) + +**Problem:** Keine systematische Qualitaetsmessung nach Pipeline-Aenderungen. + +**Loesungsansatz:** 20-Chunk Golden Set → Control-Generation → Output-Stabilitaet pruefen. + +**Impact:** HOCH | **Effort:** NIEDRIG + +--- + +## Tier 3: Strategische Investitionen (Wochen bis Monate) + +### 8. Artikel-Boundary Chunking + +Eigener Splitter fuer EU-Verordnungen und deutsche Gesetze: Split an "Art.", "Artikel", "Paragraph"-Grenzen statt nach Zeichenzahl. + +### 9. RAGAS Evaluation Pipeline + +[RAGAS](https://docs.ragas.io/) mit Golden Dataset (50-100 manuell verifizierte Control-to-Source Mappings). Metriken: Faithfulness, Answer Relevancy, Context Precision, Context Recall. + +### 10. BGE-M3 Fine-Tuning + +Fine-Tuning auf Compliance-Corpus (~6.373 Control-Titel/Objective-Paare). Research zeigt +10-30% Domain-Retrieval-Verbesserung. + +### 11. LLM-as-Judge + +Claude Sonnet bewertet jeden generierten Control auf Faithfulness zum Quelltext (~$0.01/Control). + +### 12. Active Learning aus Review-Queue + +Menschliche Entscheidungen der Dedup Review-Queue nutzen, um Schwellenwerte ueber die Zeit zu optimieren. 
+ +--- + +## Nicht empfohlen (niedriger ROI oder Konflikte) + +| Ansatz | Grund | +|--------|-------| +| Jina v3 Embeddings | **CC-BY-NC-4.0** — verletzt Open Source Policy | +| Voyage-law-2 | API-only, proprietaer — kein Self-Hosting | +| Semantic Chunking | Benchmarks zeigen keinen Vorteil gegenueber Recursive fuer strukturierte Dokumente | +| HyDE als Primaerstrategie | Latenz (+43-60%) + Halluzinationsrisiko | +| Knowledge Graph RAG | Massiver Aufwand, unklarer Gewinn bei strukturiertem Rechtskorpus | + +--- + +## Embedding-Modell Vergleich + +| Modell | MTEB Score | Multilingual | Kontext | Lizenz | Bewertung | +|--------|-----------|-------------|---------|--------|-----------| +| **BGE-M3** (aktuell) | 63.0 | 100+ Sprachen | 8192 Token | MIT | Gut, Dense+Sparse+ColBERT | +| Jina v3 | 65.5 | 89 Sprachen | 8192 Token | CC-BY-NC | Nicht nutzbar (Lizenz!) | +| E5-Mistral-7B | ~65 | Gut | 4096 Token | MIT | Gross, hoher RAM | +| Voyage-law-2 | Best Legal | EN Legal | 16K Token | Proprietaer | Nicht nutzbar (API-only) | + +**Fazit:** BGE-M3 bleibt die beste Wahl fuer unseren Stack. Sparse-Vectors aktivieren und Fine-Tuning bringen mehr als ein Modellwechsel. 
+ +--- + +## Test-Coverage Analyse + +### Pipeline-Module (567 Tests) + +| Modul | Tests | Bewertung | Fehlende Tests | +|-------|-------|-----------|----------------| +| Control Generator | 110 | Exzellent | 10-15 Edge Cases | +| Obligation Extractor | 107 | Exzellent | 8-10 Edge Cases | +| Decomposition Pass | 90 | Exzellent | 5-8 Edge Cases | +| Pattern Matcher | 72 | Gut | 10-15 Edge Cases | +| Control Dedup | 56 | Exzellent | 5-8 Edge Cases | +| Control Composer | 54 | Gut | 8-10 Edge Cases | +| Pipeline Adapter | 36 | Gut | 10-15 Edge Cases | +| Citation Backfill | 20 | Moderat | 5-8 Edge Cases | +| License Gate | 12 | Minimal | 5-8 Edge Cases | +| RAG Client | 10 | Minimal | 5-8 Edge Cases | + +### Kritische Luecken (fehlende Tests) + +| Service | Datei | Prioritaet | +|---------|-------|------------| +| AI Compliance Assistant | `ai_compliance_assistant.py` | HOCH (25-30 Tests noetig) | +| PDF Extractor | `pdf_extractor.py` | HOCH (20-25 Tests noetig) | +| LLM Provider | `llm_provider.py` | HOCH (15-20 Tests noetig) | +| Similarity Detector | `similarity_detector.py` | MITTEL (20-25 Tests noetig) | +| Anchor Finder | `anchor_finder.py` | MITTEL | + +### Test-Infrastruktur + +**Fehlend:** Shared `conftest.py` mit gemeinsamen Fixtures (LLM-Mock, DB-Mock, Embedding-Mock). Aktuell sind Fixtures in jedem Test-File dupliziert. 
+ +--- + +## Quellen + +- [NAACL 2025 Vectara Chunking Study](https://blog.premai.io/rag-chunking-strategies-the-2026-benchmark-guide/) +- [Anthropic Contextual Retrieval](https://www.anthropic.com/news/contextual-retrieval) +- [Qdrant Hybrid Search Query API](https://qdrant.tech/articles/hybrid-search/) +- [Structure-Aware Chunking for Legal (ACL 2025)](https://aclanthology.org/2025.justnlp-main.19/) +- [RAGAS Evaluation Framework](https://docs.ragas.io/) +- [BGE Reranker v2 (MIT)](https://huggingface.co/BAAI/bge-reranker-v2-m3) +- [LegalGPT / CALLM Framework](https://www.emergentmind.com/topics/compliance-alignment-llm-callm) diff --git a/docs-src/development/rag-pipeline-lessons-learned.md b/docs-src/development/rag-pipeline-lessons-learned.md new file mode 100644 index 0000000..d74dcd7 --- /dev/null +++ b/docs-src/development/rag-pipeline-lessons-learned.md @@ -0,0 +1,223 @@ +# RAG Pipeline: Lessons Learned & Hardening + +## Übersicht + +Dieses Dokument beschreibt die Erkenntnisse aus dem Aufbau der RAG-Pipeline und die daraus abgeleiteten Maßnahmen zur Härtung. Es dient als Referenz für zukünftige Ingestion-Runs und Pipeline-Erweiterungen. + +## Architektur: Wann brauchen wir RAG vs. Direct PDF? + +### RAG ist nötig für: + +| Use Case | Warum RAG? 
| +|---|---| +| **Compliance Advisor (Chat)** | Semantische Suche über 38+ Dokumente in Echtzeit | +| **Cross-Regulation Mapping** | "Zeige alle Anforderungen zu Verschlüsselung" über alle Quellen | +| **Customer Scope-Filtering** | Nur Chunks aus relevanten Regulations für den Kunden | +| **Inkrementelle Updates** | Neues Dokument → nur neue Chunks verarbeiten | + +### RAG ist NICHT nötig für: + +| Use Case | Besser: Direct PDF | +|---|---| +| **Control-Generierung (Batch)** | PDF → PyMuPDF → Strukturparser → Artikel-Index → API | +| **PDF-QA/Verifizierung** | Substring-Match direkt im PDF (schneller, exakter) | +| **Artikel/§-Extraktion** | Regex-basierte Extraktion aus PDF-Text | + +### Hybrid-Ansatz (Empfehlung) + +``` +Control-Generierung: PDF → Strukturparser → Artikel-Index → Anthropic API + (KEIN RAG nötig, direkt aus PDF) + +Runtime-Betrieb: Qdrant-RAG für semantische Suche, Chat, Scope-Analyse + (RAG mit angereicherten Chunks + Struktur-Metadaten) +``` + +## Fehler und Root Causes + +### 1. Doppelte Ingestion = Doppelte Controls + +**Problem:** Gleiche PDFs unter verschiedenen Namen ingestiert (z.B. "Maschinenverordnung" und "Verordnung (EU) 2023/1230") → unterschiedliche Chunks (anderes Chunking) → anderer Hash → doppelt verarbeitet → doppelte Controls. 
+ +**Root Cause:** +- `regulation_name` aus Chunk-Metadaten statt aus kanonischer Quelle +- UNIQUE-Constraint nur `(chunk_hash, collection, document_version)` — nicht global +- Kein Check ob `regulation_code` bereits in einer Collection existiert + +**Fix (implementiert):** +- `REGULATION_LICENSE_MAP` enthält jetzt kanonische `name`-Werte die den DB-Einträgen entsprechen +- `source_citation.source` wird aus `REGULATION_LICENSE_MAP.name` genommen, NICHT aus `chunk.regulation_name` +- Phase 5 Cleanup: 3.301 Duplikate hart gelöscht + +**Fix (noch offen):** +- Chunk-Hash UNIQUE Constraint global machen: `(chunk_hash, document_version)` statt `(chunk_hash, collection, document_version)` +- Vor Ingestion: Check ob `regulation_code` bereits in einer Collection existiert + +### 2. Chunks verlieren Strukturinformation + +**Problem:** Chunks werden mitten im Absatz abgeschnitten. § und Artikelnummern fehlen in den Chunk-Metadaten. Kontext des Kapitels/Abschnitts geht verloren. + +**Root Cause:** +- `chunk_strategy=recursive` mit `chunk_size=512, chunk_overlap=50` — zu kleine Chunks +- Chunking beachtet keine Dokumentstruktur (Artikel-/Paragraphengrenzen) +- Keine Einleitung/Kapitelkontext als Prefix + +**Empfehlung für Re-Ingestion:** +- **Strukturiertes Chunking:** Chunks an Artikel-/Paragraphengrenzen schneiden +- **Kontext-Prefix:** Kapiteleinleitung und übergeordnete Struktur mitliefern +- **Metadaten anreichern:** `article`, `paragraph`, `article_type`, `section_hierarchy` +- **Größere Chunks:** Mindestens 1024 Tokens, besser volle Artikel/Paragraphen + +### 3. Cross-Collection-Duplikate + +**Problem:** `nist_csf_2_0` in `bp_compliance_ce` (67 Chunks) UND `bp_compliance_datenschutz` (162 Chunks). EU-Verordnungen sowohl in `bp_compliance_ce` als auch `bp_compliance_gesetze`. + +**Root Cause:** Keine Collection-Zuordnungsregeln. Manuelle Zuweisung bei Ingestion. + +**Fix:** `cleanup-qdrant-duplicates.py` Script bereinigt Cross-Collection-Duplikate. 
+ +**Empfehlung:** Klare Collection-Zuordnungsregeln: +- `bp_compliance_ce` = EU-Verordnungen + internationale Standards +- `bp_compliance_gesetze` = Deutsche + österreichische Gesetze (NUR nationale Gesetze) +- `bp_compliance_datenschutz` = EDPB/WP29 Leitlinien + Privacy Frameworks + +### 4. OWASP Multilingual Controls + +**Problem:** 324 OWASP Top 10 Controls in ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien. Kein Mehrwert, aber 324 doppelte Controls generiert. + +**Root Cause:** Multilingual PDFs/GitHub-Quellen ohne Spracherkennung ingestiert. + +**Fix:** 324 als `duplicate` markiert und gelöscht. + +**Empfehlung:** Bei Ingestion Spracherkennung + Deduplizierung. Nur DE + EN behalten. + +### 5. Fehlende Artikel/Paragraph-Extraktion + +**Problem:** Chunks haben `article` und `paragraph` oft leer oder falsch. Die LLM-basierte Extraktion bei der Control-Generierung ist unzuverlässig. + +**Root Cause:** Ingestion-Pipeline extrahiert keine Strukturinformation aus dem PDF. + +**Fix (implementiert):** PDF-QA-Pipeline (`pdf_qa_all.py`) gleicht `source_original_text` mit den Original-PDFs ab und extrahiert korrekte Artikel/Paragraphen — 86% Match-Rate. + +**Empfehlung:** Artikel/Paragraph bei Re-Ingestion direkt in den Chunk-Metadaten speichern. + +### 6. Job-Tracking nicht persistent + +**Problem:** Generation-Jobs laufen als Background-Tasks. Es wird nicht geloggt, welche Chunks verarbeitet wurden; der Status ist nur über die API abfragbar. Bei API-Timeout oder Restart geht der Fortschritt verloren. + +**Root Cause:** `asyncio.create_task()` hat keinen Recovery-Mechanismus. + +**Fix (teilweise):** `canonical_generation_jobs` Tabelle trackt Jobs. `canonical_processed_chunks` markiert verarbeitete Chunks. 
+ +**Empfehlung:** +- Job-Log in DB persistieren (nicht nur stdout) +- Fortschritt in `canonical_generation_jobs.progress` als JSONB speichern +- Chunk-Level-Status: verarbeitet / übersprungen / Fehler +- Recovery-Fähigkeit: Job kann von letztem Checkpoint fortgesetzt werden + +## Empfohlene Metadaten für Re-Ingestion + +### Chunk-Level Metadaten (Qdrant Payload) + +```json +{ + "chunk_text": "...", + "regulation_code": "eu_2016_679", + "regulation_name_de": "DSGVO (EU) 2016/679", + "regulation_name_en": "GDPR (EU) 2016/679", + "article": "25", + "article_title": "Datenschutz durch Technikgestaltung und datenschutzfreundliche Voreinstellungen", + "article_type": "article", + "paragraph": "1", + "section_hierarchy": ["Kapitel IV", "Abschnitt 2", "Artikel 25"], + "chapter_context": "Kapitel IV — Verantwortlicher und Auftragsverarbeiter", + "pages": [45, 46], + "effective_date": "2018-05-25", + "publication_date": "2016-04-27", + "document_version": "2016-04-27", + "source_language": "de", + "source_url": "https://eur-lex.europa.eu/...", + "celex": "32016R0679", + "license": "EU_LAW", + "license_rule": 1, + "source_type": "law", + "category": "datenschutz", + "chunk_position": 42, + "total_chunks": 423 +} +``` + +### Dokument-Level Metadaten (Corpus Version) + +```json +{ + "regulation_code": "eu_2016_679", + "canonical_name_de": "DSGVO (EU) 2016/679", + "canonical_name_en": "GDPR (EU) 2016/679", + "document_type": "eu_regulation", + "effective_date": "2018-05-25", + "publication_date": "2016-04-27", + "supersedes": null, + "superseded_by": null, + "source_pdf": "gdpr_regulation_eu_2016_679.pdf", + "source_pdf_sha256": "abc123...", + "total_articles": 99, + "total_recitals": 173, + "total_annexes": 0, + "ingestion_date": "2026-03-20", + "ingestion_version": "v2" +} +``` + +## Pipeline-Härtung Checkliste + +### Vor Ingestion + +- [ ] Prüfen ob `regulation_code` bereits in einer Collection existiert +- [ ] PDF-SHA256 gegen bekannte PDFs prüfen (Duplikat-Erkennung) +- [ 
] `regulation_name` aus `REGULATION_LICENSE_MAP` verwenden, NICHT aus Chunk-Metadaten +- [ ] Spracherkennung: Nur DE + EN ingestieren +- [ ] Dokument-Metadaten (effective_date, publication_date) recherchieren + +### Während Ingestion + +- [ ] Strukturiertes Chunking an Artikel-/Paragraphengrenzen +- [ ] Kontext-Prefix mit Kapiteleinleitung +- [ ] Chunk-Metadaten anreichern (article, paragraph, article_type, section_hierarchy) +- [ ] Fortschritt in DB loggen + +### Nach Ingestion + +- [ ] Chunk-Count pro `regulation_code` prüfen (Sanity Check) +- [ ] PDF-QA gegen Original-PDF laufen lassen +- [ ] Cross-Collection-Duplikat-Check +- [ ] Corpus-Version in DB eintragen + +### Control-Generierung + +- [ ] `source_citation.source` aus `REGULATION_LICENSE_MAP.name`, NICHT aus Chunk-Metadaten +- [ ] Harmonisierung: Threshold 0.85 für Duplikate innerhalb gleicher `regulation_code` +- [ ] Cross-Regulation-Harmonisierung bei ähnlichen Themen (z.B. DSGVO Art. 25 ↔ NIS2 Art. 21) +- [ ] Job-Fortschritt persistent in DB speichern + +## Workflow: Mac Mini → Production Sync + +``` +1. Mac Mini: PDF → Qdrant (lokal, http://macmini:6333) +2. Mac Mini: Control-Generierung → PostgreSQL (shared, 46.225.100.82:54321) +3. QA: PDF-Match, Dedup, Source-Normalisierung +4. Qdrant Migration: macmini:6333 → qdrant-dev.breakpilot.ai (scripts/migrate-qdrant.py) +5. Deploy: git push gitea → Coolify Build + Deploy +``` + +**WICHTIG:** PostgreSQL ist SHARED — Änderungen auf Mac Mini sind sofort in Production sichtbar. Qdrant hat getrennte Instanzen (lokal + production) und muss manuell synchronisiert werden. 
+ +## Scripts + +| Script | Beschreibung | +|---|---| +| `scripts/ingest-phase-h.sh` | Haupt-Ingestion: 38 Dokumente → Qdrant | +| `scripts/cleanup-qdrant-duplicates.py` | Qdrant Duplikat-Cleanup (8 Schritte) | +| `scripts/migrate-qdrant.py` | Qdrant Migration: lokal → production | +| `scripts/qa/phase5_normalize_and_cleanup.py` | DB Normalisierung + Hard Delete | +| `scripts/qa/pdf_qa_all.py` | PDF-Match QA | diff --git a/docs-src/services/sdk-modules/canonical-control-library.md b/docs-src/services/sdk-modules/canonical-control-library.md index 2aadfa6..e07d3f6 100644 --- a/docs-src/services/sdk-modules/canonical-control-library.md +++ b/docs-src/services/sdk-modules/canonical-control-library.md @@ -96,6 +96,7 @@ erDiagram varchar verification_method varchar target_audience varchar generation_strategy + varchar obligation_type smallint pipeline_version integer license_rule jsonb source_citation @@ -936,9 +937,11 @@ Drei Kompositions-Modi: Zerlegt Rich Controls in atomare Controls. Laeuft VOR den Migration Passes 1-5. -#### Pass 0a — Obligation Extraction +#### Pass 0a — Obligation Extraction + 3-Tier-Klassifizierung -Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM. +Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM (Claude Sonnet 4.6). +Jede Obligation wird als **pflicht**, **empfehlung** oder **kann** klassifiziert — nichts wird +wegen fehlendem normativem Signal abgelehnt. **6 Guardrails:** @@ -949,23 +952,37 @@ Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM. 5. Nicht auf Evidence-Ebene zerlegen 6. 
Parent-Link immer erhalten -**Quality Gate:** Jeder Kandidat wird gegen 6 Kriterien geprueft: +**3-Tier Obligation Classification:** -- `has_normative_signal` — Normatives Sprachsignal erkannt -- `single_action` — Nur eine Handlung -- `not_rationale` — Keine blosse Begruendung -- `not_evidence_only` — Kein reines Evidence-Fragment -- `min_length` — Mindestlaenge erreicht -- `has_parent_link` — Referenz zum Rich Control +| obligation_type | Signal-Beispiele | Bedeutung | +|---|---|---| +| `pflicht` | müssen, ist zu, shall, must, required | Gesetzliche/regulatorische Pflicht | +| `empfehlung` | soll, should, sicherstellen, dokumentieren | Best Practice, freiwillig | +| `kann` | kann, darf, may, optional | Optionale Massnahme | -Kritische Checks: `has_normative_signal`, `not_evidence_only`, `min_length`, `has_parent_link` +Obligations ohne erkennbares Signal werden als `empfehlung` klassifiziert (nicht rejected). +Empfehlungen helfen Firmen, Systeme ueber das Pflichtprogramm hinaus zu sichern. + +**Quality Gate — Kritische Checks:** + +| Flag | Kritisch? | Beschreibung | +|---|---|---| +| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) | +| `not_evidence_only` | **Ja** | Kein reines Evidence-Fragment | +| `min_length` | **Ja** | Mindestlaenge (20 Zeichen) | +| `has_parent_link` | **Ja** | Referenz zum Rich Control | +| `has_normative_signal` | Nein | Informativer Check (nicht mehr Ablehnungsgrund) | +| `single_action` | Nein | Nur eine Handlung (heuristisch) | +| `not_rationale` | Nein | Keine blosse Begruendung | #### Pass 0b — Atomic Control Composition Erstellt aus jedem validierten Obligation Candidate ein atomares Control -(LLM-gestuetzt mit Template-Fallback). +(LLM-gestuetzt mit Template-Fallback). Das `obligation_type` Feld wird +vom Parent-Obligation uebernommen. 
**Datei:** `compliance/services/decomposition_pass.py` +**Test-Script:** `scripts/qa/test_pass0a.py` (standalone, speichert JSON) --- @@ -1012,11 +1029,13 @@ Die Crosswalk-Matrix bildet diese N:M-Beziehung ab. **Migration 061:** Decomposition-Tabellen -| Tabelle | Beschreibung | +| Tabelle / Feld | Beschreibung | |---------|-------------| | `obligation_candidates` | Extrahierte atomare Pflichten aus Rich Controls | +| `obligation_candidates.obligation_type` | `pflicht` / `empfehlung` / `kann` (3-Tier-Klassifizierung) | | `canonical_controls.parent_control_uuid` | Self-Referenz zum Rich Control (neues Feld) | | `canonical_controls.decomposition_method` | Zerlegungsmethode (neues Feld) | +| `canonical_controls.obligation_type` | Uebernommen von Obligation: pflicht/empfehlung/kann | --- diff --git a/docs-src/services/sdk-modules/control-generator-pipeline.md b/docs-src/services/sdk-modules/control-generator-pipeline.md index e9111ff..4ac03cc 100644 --- a/docs-src/services/sdk-modules/control-generator-pipeline.md +++ b/docs-src/services/sdk-modules/control-generator-pipeline.md @@ -567,7 +567,86 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/ --- +## Pass 0a/0b: Atomare Control-Zerlegung + +Die Pipeline v3 erweitert die 7-Stufen-Pipeline um einen Vor-Pass, der Rich Controls in atomare Controls zerlegt. + +### Pass 0a: Obligation Extraction + +Extrahiert individuelle normative Pflichten aus Rich Controls via LLM. 
+ +```mermaid +flowchart LR + A[Rich Control] -->|LLM| B[Obligations] + B --> C{Quality Gate} + C -->|Pass| D[validated] + C -->|Fail| E[rejected] +``` + +**3-Tier Klassifikation:** + +| Typ | Erkennungsmuster | Beispiel | +|-----|-----------------|---------| +| **Pflicht** | muss, ist verpflichtet, hat sicherzustellen | "Der Verantwortliche MUSS ein Verzeichnis fuehren" | +| **Empfehlung** | soll, sollte, wird empfohlen | "Es SOLLTE eine Risikobewertung durchgefuehrt werden" | +| **Kann** | kann, darf, ist berechtigt | "Die Aufsichtsbehoerde KANN Geldbussen verhaengen" | + +**Guardrails (6 Regeln):** + +1. Nur normative Aussagen (muss, sicherzustellen, verpflichtet) +2. Ein Hauptverb pro Obligation +3. Test-Obligations separat von operativen +4. Reporting-Obligations separat +5. Nicht auf Evidence-Ebene splitten +6. Parent-Link immer erhalten + +### Pass 0b: Atomic Control Composition + +Verwandelt jede validierte Obligation in ein eigenstaendiges atomares Control. + +```mermaid +flowchart LR + A[Obligation] -->|LLM| B[Atomic Control] + B -->|Dedup Check| C{4-Stage Dedup} + C -->|NEW| D[Insert + Index] + C -->|LINK| E[Parent-Link] + C -->|REVIEW| F[Review-Queue] +``` + +**Konfiguration:** + +| Variable | Default | Beschreibung | +|----------|---------|-------------| +| `DECOMPOSITION_LLM_MODEL` | `claude-sonnet-4-6` | LLM fuer Pass 0a/0b | +| `DECOMPOSITION_BATCH_SIZE` | `5` | Obligations pro LLM-Call | +| `DECOMPOSITION_LLM_TIMEOUT` | `120` | Timeout in Sekunden | + +**Ergebnisse (Stand 2026-03-21):** + +| Metrik | Wert | +|--------|------| +| Rich Controls (technisch) | ~6.800 | +| Atomare Controls (bisher) | 30 (PoC: 10x CRYP, AUTH, SEC) | +| Ziel nach Full Run | ~18.000 unique Master Controls | +| Obligations pro Rich Control | ~10 | +| Dedup-Reduktion erwartet | ~70% | + +### Quelldateien (Pass 0a/0b) + +| Datei | Beschreibung | +|-------|-------------| +| `compliance/services/decomposition_pass.py` | Pass 0a + 0b Logik | +| 
`compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine | +| `migrations/061_obligation_candidates.sql` | Obligation-Tabelle | +| `migrations/074_control_dedup.sql` | Dedup-Tabellen (Parent-Links, Review-Queue) | +| `tests/test_decomposition_pass.py` | 90 Tests | +| `tests/test_control_dedup.py` | 56 Tests | + +--- + ## Verwandte Dokumentation - [Canonical Control Library (CP-CLIB)](canonical-control-library.md) — Domains, Datenmodell, Too-Close-Detektor, CI/CD Validation +- [Deduplizierungs-Engine](dedup-engine.md) — 4-Stufen Dedup, Multi-Parent-Linking, Review-Queue +- [RAG Pipeline Benchmark](../../development/rag-pipeline-benchmark.md) — State-of-the-Art Vergleich, Optimierungsempfehlungen - [Multi-Layer Control Architecture](canonical-control-library.md#multi-layer-control-architecture) — 10-Stage Pipeline-Erweiterung mit Obligations, Patterns, Crosswalk diff --git a/docs-src/services/sdk-modules/dedup-engine.md b/docs-src/services/sdk-modules/dedup-engine.md new file mode 100644 index 0000000..5fe0883 --- /dev/null +++ b/docs-src/services/sdk-modules/dedup-engine.md @@ -0,0 +1,253 @@ +# Deduplizierungs-Engine (Control Dedup) + +4-stufige Dedup-Pipeline zur Vermeidung doppelter atomarer Controls bei der Pass 0b Komposition. Kern-USP: **"1 Control erfuellt 5 Gesetze"** durch Multi-Parent-Linking. + +**Backend:** `backend-compliance/compliance/services/control_dedup.py` +**Migration:** `backend-compliance/migrations/074_control_dedup.sql` +**Tests:** `backend-compliance/tests/test_control_dedup.py` (56 Tests) + +--- + +## Motivation + +Aus ~6.800 technischen Controls x ~10 Obligations pro Control entstehen ~68.000 atomare Kandidaten. Ziel: ~18.000 einzigartige Master Controls. Viele Obligations aus verschiedenen Gesetzen fuehren zum gleichen technischen Control (z.B. "MFA implementieren" in DSGVO, NIS2, AI Act). + +**Problem:** Embedding-only Deduplizierung ist GEFAEHRLICH fuer Compliance. + +!!! 
danger "False-Positive Beispiel" + - "Admin-Zugriffe muessen MFA nutzen" vs. "Remote-Zugriffe muessen MFA nutzen" + - Embedding sagt >0.9 aehnlich + - Aber es sind **ZWEI verschiedene Controls** (verschiedene Objekte!) + +--- + +## 4-Stufen Entscheidungsbaum + +```mermaid +flowchart TD + A[Kandidat-Control] --> B{Pattern-Gate} + B -->|pattern_id verschieden| N1[NEW CONTROL] + B -->|pattern_id gleich| C{Action-Check} + C -->|Action verschieden| N2[NEW CONTROL] + C -->|Action gleich| D{Object-Normalization} + D -->|Objekt verschieden| E{Similarity > 0.95?} + E -->|Ja| L1[LINK] + E -->|Nein| N3[NEW CONTROL] + D -->|Objekt gleich| F{Tiered Thresholds} + F -->|> 0.92| L2[LINK] + F -->|0.85 - 0.92| R[REVIEW QUEUE] + F -->|< 0.85| N4[NEW CONTROL] +``` + +### Stufe 1: Pattern-Gate (hart) + +`pattern_id` muss uebereinstimmen. Verhindert ~80% der False Positives. + +```python +if pattern_id != existing.pattern_id: + → NEW CONTROL # Verschiedene Kontrollmuster = verschiedene Controls +``` + +### Stufe 2: Action-Check (hart) + +Normalisierte Aktionsverben muessen uebereinstimmen. "Implementieren" vs. "Testen" = verschiedene Controls, auch bei gleichem Objekt. + +```python +if normalize_action("implementieren") != normalize_action("testen"): + → NEW CONTROL # "implement" != "test" +``` + +**Action-Normalisierung (Deutsch → Englisch):** + +| Deutsche Verben | Kanonische Form | +|----------------|-----------------| +| implementieren, umsetzen, einrichten, aktivieren | `implement` | +| testen, pruefen, ueberpruefen, verifizieren | `test` | +| ueberwachen, monitoring, beobachten | `monitor` | +| verschluesseln | `encrypt` | +| protokollieren, aufzeichnen, loggen | `log` | +| beschraenken, einschraenken, begrenzen | `restrict` | + +### Stufe 3: Object-Normalization (weich) + +Compliance-Objekte werden auf kanonische Token normalisiert. 
+ +```python +normalize_object("Admin-Konten") → "privileged_access" +normalize_object("Remote-Zugriff") → "remote_access" +normalize_object("MFA") → "multi_factor_auth" +``` + +Bei verschiedenen Objekten gilt ein hoeherer Schwellenwert (0.95 statt 0.92). + +**Objekt-Normalisierung:** + +| Eingabe | Kanonischer Token | +|---------|------------------| +| MFA, 2FA, Multi-Faktor-Authentifizierung | `multi_factor_auth` | +| Admin-Konten, privilegierte Zugriffe | `privileged_access` | +| Verschluesselung, Kryptografie | `encryption` | +| Schluessel, Key Management | `key_management` | +| TLS, SSL, HTTPS | `transport_encryption` | +| Firewall | `firewall` | +| Audit-Log, Protokoll, Logging | `audit_logging` | + +### Stufe 4: Embedding Similarity (Qdrant) + +Tiered Thresholds basierend auf Cosine-Similarity: + +| Score | Verdict | Aktion | +|-------|---------|--------| +| > 0.95 | **LINK** | Bei verschiedenen Objekten | +| > 0.92 | **LINK** | Parent-Link hinzufuegen | +| 0.85 - 0.92 | **REVIEW** | In Review-Queue zur manuellen Pruefung | +| < 0.85 | **NEW** | Neues Control anlegen | + +--- + +## Canonicalization Layer + +Vor dem Embedding wird der deutsche Compliance-Text in normalisiertes Englisch transformiert: + +``` +"Administratoren muessen MFA verwenden" +→ "implement multi_factor_auth for administratoren verwenden" +→ Bessere Matches, weniger Embedding-Rauschen +``` + +Dies reduziert das Rauschen durch synonyme Formulierungen in verschiedenen Gesetzen. + +--- + +## Multi-Parent-Linking (M:N) + +Ein atomares Control kann mehrere Eltern-Controls aus verschiedenen Regulierungen haben: + +```json +{ + "control_id": "AUTH-1072-A01", + "parent_links": [ + {"parent_control_id": "AUTH-1001", "source": "NIST IA-02(01)", "link_type": "decomposition"}, + {"parent_control_id": "NIS2-045", "source": "NIS2 Art. 
21", "link_type": "dedup_merge"} + ] +} +``` + +### Datenbank-Schema + +```sql +-- Migration 074: control_parent_links (M:N) +CREATE TABLE control_parent_links ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + control_uuid UUID NOT NULL REFERENCES canonical_controls(id), + parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id), + link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition', + confidence NUMERIC(3,2) DEFAULT 1.0, + source_regulation VARCHAR(100), + source_article VARCHAR(100), + obligation_candidate_id UUID REFERENCES obligation_candidates(id), + created_at TIMESTAMPTZ DEFAULT NOW(), + CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid) +); +``` + +**Link-Typen:** + +| Typ | Bedeutung | +|-----|-----------| +| `decomposition` | Aus Pass 0b Zerlegung | +| `dedup_merge` | Durch Dedup-Engine als Duplikat erkannt | +| `manual` | Manuell durch Reviewer verknuepft | +| `crosswalk` | Aus Crosswalk-Matrix uebernommen | + +--- + +## Review-Queue + +Borderline-Matches (Similarity 0.85-0.92) werden in die Review-Queue geschrieben: + +```sql +-- Migration 074: control_dedup_reviews +CREATE TABLE control_dedup_reviews ( + id UUID PRIMARY KEY DEFAULT gen_random_uuid(), + candidate_control_id VARCHAR(30) NOT NULL, + candidate_title TEXT NOT NULL, + candidate_objective TEXT, + matched_control_uuid UUID REFERENCES canonical_controls(id), + matched_control_id VARCHAR(30), + similarity_score NUMERIC(4,3), + dedup_stage VARCHAR(40) NOT NULL, + review_status VARCHAR(20) DEFAULT 'pending', + -- pending → accepted_link | accepted_new | rejected + created_at TIMESTAMPTZ DEFAULT NOW() +); +``` + +--- + +## Qdrant Collection + +``` +Collection: atomic_controls +Dimension: 1024 (bge-m3) +Distance: COSINE +Payload: pattern_id, action_normalized, object_normalized, control_id, canonical_text +Index: pattern_id (keyword), action_normalized (keyword), object_normalized (keyword) +Query: IMMER mit filter: pattern_id == X (reduziert Suche drastisch) +``` 
+ +--- + +## Integration in Pass 0b + +Die Dedup-Engine ist optional in `DecompositionPass` integriert: + +```python +decomp = DecompositionPass(db=session, dedup_enabled=True) +stats = await decomp.run_pass0b(limit=100, use_anthropic=True) + +# Stats enthalten Dedup-Metriken: +# stats["dedup_linked"] = 15 (Duplikate → Parent-Link) +# stats["dedup_review"] = 3 (Borderline → Review-Queue) +# stats["controls_created"] = 82 (Neue Controls) +``` + +**Ablauf bei Pass 0b mit Dedup:** + +1. LLM generiert atomares Control +2. Dedup-Engine prueft 4 Stufen +3. **LINK:** Kein neues Control, Parent-Link zu bestehendem +4. **REVIEW:** Kein neues Control, Eintrag in Review-Queue +5. **NEW:** Control anlegen + in Qdrant indexieren + +--- + +## Konfiguration + +| Umgebungsvariable | Default | Beschreibung | +|-------------------|---------|-------------| +| `DEDUP_ENABLED` | `true` | Dedup-Engine ein/ausschalten | +| `DEDUP_LINK_THRESHOLD` | `0.92` | Schwelle fuer automatisches Linking | +| `DEDUP_REVIEW_THRESHOLD` | `0.85` | Schwelle fuer Review-Queue | +| `DEDUP_LINK_THRESHOLD_DIFF_OBJ` | `0.95` | Schwelle bei verschiedenen Objekten | +| `DEDUP_QDRANT_COLLECTION` | `atomic_controls` | Qdrant-Collection fuer Dedup-Index | +| `QDRANT_URL` | `http://host.docker.internal:6333` | Qdrant-URL | +| `EMBEDDING_URL` | `http://embedding-service:8087` | Embedding-Service-URL | + +--- + +## Quelldateien + +| Datei | Beschreibung | +|-------|-------------| +| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine | +| `compliance/services/decomposition_pass.py` | Pass 0a/0b mit Dedup-Integration | +| `migrations/074_control_dedup.sql` | DB-Schema (parent_links, review_queue) | +| `tests/test_control_dedup.py` | 56 Unit-Tests | + +--- + +## Verwandte Dokumentation + +- [Control Generator Pipeline](control-generator-pipeline.md) — 7-Stufen RAG→Control Pipeline +- [Canonical Control Library](canonical-control-library.md) — Datenmodell, Domains, Similarity-Detektor diff --git a/mkdocs.yml 
b/mkdocs.yml index 76949f5..cb52db1 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -107,6 +107,7 @@ nav: - Policy-Bibliothek (29 Richtlinien): services/sdk-modules/policy-bibliothek.md - Canonical Control Library (CP-CLIB): services/sdk-modules/canonical-control-library.md - Control Generator Pipeline: services/sdk-modules/control-generator-pipeline.md + - Deduplizierungs-Engine: services/sdk-modules/dedup-engine.md - Control Provenance Wiki: services/sdk-modules/control-provenance.md - Strategie: - Wettbewerbsanalyse & Roadmap: strategy/wettbewerbsanalyse.md @@ -115,3 +116,5 @@ nav: - Dokumentation: development/documentation.md - CI/CD Pipeline: development/ci-cd-pipeline.md - QA Control Quality: development/qa-control-quality.md + - RAG Pipeline Lessons Learned: development/rag-pipeline-lessons-learned.md + - RAG Pipeline Benchmark: development/rag-pipeline-benchmark.md diff --git a/scripts/qa/apply_pdf_qa_results.py b/scripts/qa/apply_pdf_qa_results.py index 6bdc3f7..785ba39 100644 --- a/scripts/qa/apply_pdf_qa_results.py +++ b/scripts/qa/apply_pdf_qa_results.py @@ -1,11 +1,29 @@ -"""Apply PDF QA results: update source_citation with correct article + article_type.""" +""" +Apply PDF QA results: update source_citation with correct article_type + article. + +Safety modes: + --safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect. + --force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable). + --dry-run: Show what would change without writing. 
+ +Usage: + python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles) + python3 apply_pdf_qa_results.py --dry-run # show changes without writing + python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles +""" import os +import sys import json import psycopg2 import urllib.parse +from collections import Counter RESULTS_FILE = "/tmp/pdf_qa_results.json" +# Parse args +dry_run = "--dry-run" in sys.argv +force_article = "--force-article" in sys.argv + # Load results with open(RESULTS_FILE) as f: results = json.load(f) @@ -21,35 +39,101 @@ conn = psycopg2.connect( options="-c search_path=compliance,public" ) -# Update in batches +# Load current DB state for all affected controls cur = conn.cursor() -updated = 0 +ctrl_ids = [r["ctrl_id"] for r in results] +cur.execute(""" + SELECT id, + source_citation->>'article' as article, + source_citation->>'article_type' as article_type, + source_citation->>'source' as source + FROM compliance.canonical_controls + WHERE id = ANY(%s::uuid[]) +""", (ctrl_ids,)) +db_state = {} +for row in cur.fetchall(): + db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]} + +# Counters +stats = Counter() +updated_type = 0 +updated_article = 0 +updated_recital = 0 errors = 0 -unchanged = 0 for i, r in enumerate(results): ctrl_id = r["ctrl_id"] - article_label = r["article_label"] - article_type = r["article_type"] # preamble, article, annex, section, unknown + new_article = r["article_label"] + new_type = r["article_type"] + db = db_state.get(ctrl_id, {}) + + if not db: + stats["missing_in_db"] += 1 + continue + + old_type = db.get("article_type") + old_article = db.get("article", "").strip() + + # Decide what to update + set_type = (old_type != new_type) + set_article = (not old_article) or (force_article and old_article != new_article) + set_recital = (new_type == "preamble") + + if set_type: + stats["type_" + ("new" if not old_type else "changed")] += 1 + 
else: + stats["type_unchanged"] += 1 + + if not old_article and set_article: + stats["article_new"] += 1 + elif old_article and old_article != new_article: + if force_article: + stats["article_force_changed"] += 1 + else: + stats["article_skipped"] += 1 + else: + stats["article_unchanged"] += 1 + + if set_recital: + stats["recital"] += 1 + + if dry_run: + continue try: - # Update source_citation: set article and article_type - cur.execute(""" - UPDATE compliance.canonical_controls - SET source_citation = source_citation - || jsonb_build_object('article', %s, 'article_type', %s), - updated_at = now() - WHERE id = %s::uuid - AND ( - source_citation->>'article' IS DISTINCT FROM %s - OR source_citation->>'article_type' IS DISTINCT FROM %s - ) - """, (article_label, article_type, ctrl_id, article_label, article_type)) + # Build JSONB update + updates = {} + if set_type: + updates["article_type"] = new_type + if set_article: + updates["article"] = new_article - if cur.rowcount > 0: - updated += 1 - else: - unchanged += 1 + if updates: + # Merge into source_citation + cur.execute(""" + UPDATE compliance.canonical_controls + SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb, + updated_at = now() + WHERE id = %s::uuid + """, (json.dumps(updates), ctrl_id)) + if set_type: + updated_type += 1 + if set_article: + updated_article += 1 + + # Mark preamble as recital_suspect + if set_recital: + cur.execute(""" + UPDATE compliance.canonical_controls + SET generation_metadata = jsonb_set( + COALESCE(generation_metadata, '{}'::jsonb), + '{recital_suspect}', + 'true'::jsonb + ), + updated_at = now() + WHERE id = %s::uuid + """, (ctrl_id,)) + updated_recital += 1 except Exception as e: errors += 1 @@ -58,12 +142,37 @@ for i, r in enumerate(results): conn.rollback() continue - if (i + 1) % 500 == 0: + if (i + 1) % 1000 == 0: conn.commit() - print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})") + print(f" 
Progress: {i+1}/{len(results)}") -conn.commit() -print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}") +if not dry_run: + conn.commit() + +mode = "DRY-RUN" if dry_run else "APPLIED" +print(f"\n{'='*60}") +print(f" Mode: {mode}") +print(f"{'='*60}") +print(f"\n article_type:") +print(f" New (was NULL): {stats['type_new']:5d}") +print(f" Changed: {stats['type_changed']:5d}") +print(f" Unchanged: {stats['type_unchanged']:5d}") +print(f"\n article:") +print(f" New (was empty): {stats['article_new']:5d}") +if force_article: + print(f" Force-changed: {stats['article_force_changed']:5d}") +else: + print(f" Differs (SKIPPED): {stats['article_skipped']:5d}") +print(f" Unchanged: {stats['article_unchanged']:5d}") +print(f"\n Preamble/Recital: {stats['recital']:5d}") +print(f" Missing in DB: {stats['missing_in_db']:5d}") + +if not dry_run: + print(f"\n Updates written:") + print(f" article_type: {updated_type:5d}") + print(f" article: {updated_article:5d}") + print(f" recital_suspect: {updated_recital:5d}") + print(f" Errors: {errors:5d}") # Verify: count by article_type cur.execute(""" diff --git a/scripts/qa/benchmark_llm_controls.py b/scripts/qa/benchmark_llm_controls.py new file mode 100644 index 0000000..f6e5862 --- /dev/null +++ b/scripts/qa/benchmark_llm_controls.py @@ -0,0 +1,524 @@ +#!/usr/bin/env python3 +""" +Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation. + +Tests 5 representative gap articles from different sources. +Measures: quality (JSON valid, fields complete), response time, cost estimate. 
# ── Config ──────────────────────────────────────────────────────────
# Endpoint and model of the LiteLLM gateway that serves gpt-oss-120b.
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# Secrets are read from the environment so that no credential is committed
# to the repository. NOTE(review): the key that used to be hard-coded on
# this line has been published in version control and should be rotated.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")

ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Directory the benchmark reads the regulation PDFs from.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
+Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}). + +WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung. +Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein. + +Gib JSON zurück mit diesen Feldern: +- title: Kurzer prägnanter Titel (max 100 Zeichen) +- objective: Was soll erreicht werden? (1-3 Sätze) +- rationale: Warum ist das wichtig? (1-2 Sätze) +- requirements: Liste von konkreten Anforderungen (Strings) +- test_procedure: Liste von Prüfschritten (Strings) +- evidence: Liste von Nachweisdokumenten (Strings) +- severity: low/medium/high/critical +- tags: Liste von Tags +- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV) +- category: Inhaltliche Kategorie +- target_audience: Liste der Zielgruppen +- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42") +- source_paragraph: Absatz-Referenz (z.B. "Absatz 5") +{APPLICABILITY_PROMPT} + +Text: {article_text[:3000]} +Quelle: {source_name}, {article_label}""" + + +# ── PDF Text Extraction ───────────────────────────────────────────── + +def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str: + """Extract the text of a specific article from a PDF.""" + import re + + path = PDF_DIR / pdf_file + if not path.exists() or fitz is None: + return "" + + doc = fitz.open(str(path)) + full_text = "" + for page in doc: + full_text += page.get_text() + "\n" + doc.close() + + # Find article boundaries + if doc_type == "eu_regulation": + # Find "Artikel N" heading + art_num = re.search(r'\d+', article_label) + if not art_num: + return "" + num = int(art_num.group()) + # Find start of this article + pattern = rf'\nArtikel\s+{num}\s*\n' + match = re.search(pattern, full_text) + if not match: + return f"[Artikel {num} nicht im PDF gefunden]" + start = match.start() + # Find start of next article + next_pattern = rf'\nArtikel\s+{num+1}\s*\n' + next_match = re.search(next_pattern, full_text) + end = 
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Send one chat-completion request to the LiteLLM gateway.

    Returns a 4-tuple ``(response_text, duration_seconds, error, usage)``.
    ``error`` is ``None`` on success and a short message otherwise; on every
    failure path ``response_text`` is ``""`` and ``usage`` is ``{}``.
    All paths return four items because the caller unpacks exactly four.
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }

    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # Previously returned only three items here, which made the
            # caller's four-way unpacking raise ValueError on any HTTP error.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # A null content would break the string handling downstream, so it
        # is reported as an empty response instead.
        content = data["choices"][0]["message"]["content"] or ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network errors, timeouts and malformed payloads are all reported
        # through the error slot rather than raised.
        return "", time.time() - t0, str(e), {}


def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Send one messages request to the Anthropic API.

    Returns the same 4-tuple as ``call_litellm``:
    ``(response_text, duration_seconds, error, usage)``.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # The text is taken from the first content block; an empty or
        # missing content list yields an empty response.
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
def _is_filled(val, min_str_len: int) -> bool:
    """Return True for a string longer than ``min_str_len`` or a non-empty list."""
    if isinstance(val, str):
        return len(val) > min_str_len
    if isinstance(val, list):
        return len(val) > 0
    return False


def assess_quality(raw_text: str) -> dict:
    """Score one control-generation response on a 0-100 scale.

    The response is parsed as JSON: an optional Markdown code fence is
    stripped first, and if direct parsing fails the outermost ``{...}`` span
    is tried. A JSON array is scored by its first element (an empty array
    counts as an empty object).

    Returns a dict of metrics. When no JSON object can be obtained —
    including valid JSON that is not an object, such as a number, a string
    or an array of non-objects — the metrics are returned with
    ``json_valid`` False and ``score`` 0 instead of raising. On success the
    parsed object is included under ``parsed_data``.

    Score: 20 for valid JSON, up to 40 for required fields, up to 15 for
    bonus fields, 3 per requirement (max 5), 3 per test step (max 3), and
    1 if the objective is longer than 50 characters.
    """
    import re  # imported locally, as in the original fallback branch

    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    # Strip a surrounding ```/```json fence if the model added one.
    text = (raw_text or "").strip()
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the outermost brace-delimited span in the text.
        match = re.search(r'\{[\s\S]*\}', text)
        if not match:
            return result
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            return result

    if isinstance(data, list):
        data = data[0] if data else {}
    if not isinstance(data, dict):
        # Valid JSON but not an object: there is no control to score.
        return result

    result["json_valid"] = True

    # Required string fields need more than 2 characters; bonus strings only
    # need to be non-empty. Lists must be non-empty in both cases.
    result["required_fields"] = sum(1 for f in REQUIRED_FIELDS if _is_filled(data.get(f), 2))
    result["bonus_fields"] = sum(1 for f in BONUS_FIELDS if _is_filled(data.get(f), 0))

    # Depth metrics; values of an unexpected type count as empty.
    for key, field in (
        ("requirements_count", "requirements"),
        ("test_procedure_count", "test_procedure"),
        ("evidence_count", "evidence"),
    ):
        items = data.get(field, [])
        result[key] = len(items) if isinstance(items, list) else 0
    for key, field in (("title_length", "title"), ("objective_length", "objective")):
        value = data.get(field, "")
        result[key] = len(value) if isinstance(value, str) else 0

    # Score: 0-100
    score = 20  # JSON is valid at this point
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3    # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    score += 1 if result["objective_length"] > 50 else 0
    result["score"] = round(score, 1)

    result["parsed_data"] = data
    return result
Anthropic: {ANTHROPIC_MODEL}") + print(f" Tests: {len(TEST_CASES)}") + print() + + # Pre-check LiteLLM + try: + r = requests.get(f"{LITELLM_URL}/v1/models", + headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10) + print(f" LiteLLM OK: {r.status_code}") + except Exception as e: + print(f" LiteLLM ERROR: {e}") + sys.exit(1) + + results = [] + + for i, tc in enumerate(TEST_CASES): + print(f"\n{'='*80}") + print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']} — {tc['article']}") + print(f" {tc['description']}") + print(f"{'='*80}") + + # Extract article text from PDF + article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"]) + if not article_text or article_text.startswith("["): + print(f" WARNING: {article_text or 'Empty text'}") + continue + + print(f" Text extracted: {len(article_text)} chars") + print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...") + + prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"]) + + # ── Call LiteLLM ── + print(f"\n --- gpt-oss-120b ---") + litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT) + if litellm_err: + print(f" ERROR: {litellm_err}") + litellm_quality = {"json_valid": False, "score": 0} + else: + print(f" Time: {litellm_time:.1f}s") + print(f" Tokens: {litellm_usage}") + litellm_quality = assess_quality(litellm_raw) + print(f" JSON valid: {litellm_quality['json_valid']}") + print(f" Score: {litellm_quality['score']}/100") + print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}") + print(f" Requirements: {litellm_quality['requirements_count']}, " + f"Tests: {litellm_quality['test_procedure_count']}, " + f"Evidence: {litellm_quality['evidence_count']}") + if litellm_quality.get("parsed_data"): + d = litellm_quality["parsed_data"] + print(f" Title: {d.get('title', 'N/A')}") + + # ── Call Anthropic ── + print(f"\n --- Claude Sonnet 4.6 ---") + anthropic_raw, 
anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT) + if anthropic_err: + print(f" ERROR: {anthropic_err}") + anthropic_quality = {"json_valid": False, "score": 0} + else: + print(f" Time: {anthropic_time:.1f}s") + print(f" Tokens: {anthropic_usage}") + anthropic_quality = assess_quality(anthropic_raw) + print(f" JSON valid: {anthropic_quality['json_valid']}") + print(f" Score: {anthropic_quality['score']}/100") + print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}") + print(f" Requirements: {anthropic_quality['requirements_count']}, " + f"Tests: {anthropic_quality['test_procedure_count']}, " + f"Evidence: {anthropic_quality['evidence_count']}") + if anthropic_quality.get("parsed_data"): + d = anthropic_quality["parsed_data"] + print(f" Title: {d.get('title', 'N/A')}") + + # Compare + print(f"\n --- VERGLEICH ---") + speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0 + print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s " + f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})") + print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs " + f"Sonnet {anthropic_quality.get('score', 0)}/100") + + results.append({ + "test": f"{tc['source']} — {tc['article']}", + "litellm": { + "time": round(litellm_time, 1), + "score": litellm_quality.get("score", 0), + "json_valid": litellm_quality.get("json_valid", False), + "requirements": litellm_quality.get("requirements_count", 0), + "tests": litellm_quality.get("test_procedure_count", 0), + "usage": litellm_usage, + "raw": litellm_raw[:500] if litellm_raw else "", + }, + "anthropic": { + "time": round(anthropic_time, 1), + "score": anthropic_quality.get("score", 0), + "json_valid": anthropic_quality.get("json_valid", False), + "requirements": anthropic_quality.get("requirements_count", 0), + "tests": anthropic_quality.get("test_procedure_count", 0), + 
"usage": anthropic_usage, + "raw": anthropic_raw[:500] if anthropic_raw else "", + }, + }) + + # ── Summary ────────────────────────────────────────────────────── + print(f"\n\n{'='*80}") + print("ZUSAMMENFASSUNG") + print(f"{'='*80}") + + if not results: + print(" Keine Ergebnisse.") + return + + litellm_scores = [r["litellm"]["score"] for r in results] + anthropic_scores = [r["anthropic"]["score"] for r in results] + litellm_times = [r["litellm"]["time"] for r in results] + anthropic_times = [r["anthropic"]["time"] for r in results] + + print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}") + print(f" {'-'*30} {'-'*15} {'-'*15}") + print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} " + f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}") + print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} " + f"{sum(anthropic_times)/len(anthropic_times):>13.1f}") + print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} " + f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}") + print(f" {'Avg Requirements':<30s} " + f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} " + f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}") + print(f" {'Avg Test Procedures':<30s} " + f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} " + f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}") + + # Cost estimate + # Claude Sonnet: ~$3/M input, ~$15/M output + # gpt-oss-120b: self-hosted = $0 API cost (only compute) + total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results) + total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results) + anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000 + + print(f"\n Kostenvergleich (fuer {len(results)} Controls):") + 
# Single-character substitutions applied by normalize() in one pass.
# None of the replacement strings contains a character that is itself a
# key, so one translate() call gives the same result as chained replaces.
_NORMALIZE_TABLE = str.maketrans({
    '\u00ad': '',      # soft hyphen
    '\u200b': '',      # zero-width space
    '\u00a0': ' ',     # no-break space
    '\ufb01': 'fi',    # ligatures
    '\ufb02': 'fl',
    '\ufb00': 'ff',
    '\ufb03': 'ffi',
    '\ufb04': 'ffl',
    '\u2019': "'",     # curly single quotes
    '\u2018': "'",
    '\u201c': '"',     # curly double quotes
    '\u201d': '"',
    '\u2013': '-',     # en dash
    '\u2014': '-',     # em dash
    '\u2022': '-',     # bullet
    '\u00b7': '-',     # middle dot
})
_CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
_WHITESPACE_RE = re.compile(r'\s+')


def normalize(s):
    """Canonicalize text so PDF-extracted and stored strings can be compared.

    Removes soft hyphens and zero-width spaces, expands ligatures, maps
    typographic quotes, dashes and bullets to ASCII, drops control
    characters (tab, newline and carriage return are kept and then folded
    with other whitespace), applies Unicode NFC, collapses whitespace runs
    to a single space and strips both ends.
    """
    s = s.translate(_NORMALIZE_TABLE)
    s = _CONTROL_CHARS_RE.sub('', s)
    s = unicodedata.normalize('NFC', s)
    s = _WHITESPACE_RE.sub(' ', s)
    return s.strip()
index_norm.append((norm_pos, label, typ)) + +# Connect to DB +db_url = os.environ['DATABASE_URL'] +parsed = urllib.parse.urlparse(db_url) +conn = psycopg2.connect( + host=parsed.hostname, port=parsed.port or 5432, + user=parsed.username, password=parsed.password, + dbname=parsed.path.lstrip('/'), + options="-c search_path=compliance,public" +) +cur = conn.cursor() + +# Get Blue Guide controls without article_type (unmatched) +cur.execute(""" + SELECT id, control_id, title, source_original_text, + source_citation->>'article' as existing_article, + source_citation->>'article_type' as existing_type, + release_state + FROM compliance.canonical_controls + WHERE source_citation->>'source' = 'EU Blue Guide 2022' + AND source_original_text IS NOT NULL + AND length(source_original_text) > 50 + AND (source_citation->>'article_type' IS NULL) + ORDER BY control_id +""") +controls = cur.fetchall() +print(f"\nUnmatched Blue Guide controls: {len(controls)}") + +# Match each control +results = [] +found = 0 +not_found = 0 + +for ctrl in controls: + ctrl_id, control_id, title, orig_text, existing_art, existing_type, state = ctrl + orig_norm = normalize(orig_text) + if len(orig_norm) < 30: + not_found += 1 + continue + + matched = False + for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]: + for length in [80, 60, 40, 30, 20]: + start = max(0, int(len(orig_norm) * start_frac)) + snippet = orig_norm[start:start+length] + if not snippet or len(snippet) < 15: + continue + pos = text_norm.find(snippet) + if pos >= 0: + # Find section + label = "Unknown" + typ = "unknown" + for h_pos, h_label, h_type in reversed(index_norm): + if h_pos <= pos: + label = h_label + typ = h_type + break + results.append({ + "ctrl_id": str(ctrl_id), + "control_id": control_id, + "source": "EU Blue Guide 2022", + "article_label": label, + "article_type": typ, + }) + found += 1 + is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]" + print(f" {control_id:10s}: {label:25s} [{typ:8s}]{is_active}") 
+ matched = True + break + if matched: + break + + if not matched: + not_found += 1 + print(f" {control_id:10s}: NOT FOUND {title[:50]}") + +print(f"\n{'='*50}") +print(f"Results: {found} matched, {not_found} not found out of {len(controls)}") + +# Save results +out_path = "/tmp/blue_guide_en_results.json" +with open(out_path, 'w') as f: + json.dump(results, f, indent=2, ensure_ascii=False) +print(f"Saved to {out_path}") + +# Apply results to DB +if results: + print(f"\nApplying {len(results)} results to DB...") + applied = 0 + for r in results: + cur.execute(""" + UPDATE compliance.canonical_controls + SET source_citation = source_citation || + jsonb_build_object('article', %s, 'article_type', %s) + WHERE id = %s::uuid + AND (source_citation->>'article' IS DISTINCT FROM %s + OR source_citation->>'article_type' IS DISTINCT FROM %s) + """, (r["article_label"], r["article_type"], + r["ctrl_id"], r["article_label"], r["article_type"])) + if cur.rowcount > 0: + applied += 1 + conn.commit() + print(f" Applied: {applied} controls updated") + +# Show type distribution +type_counts = {} +for r in results: + t = r["article_type"] + type_counts[t] = type_counts.get(t, 0) + 1 +if type_counts: + print(f"\nArticle type distribution:") + for t, c in sorted(type_counts.items(), key=lambda x: -x[1]): + print(f" {t:12s}: {c:5d}") + +conn.close() diff --git a/scripts/qa/gap_analysis.py b/scripts/qa/gap_analysis.py new file mode 100644 index 0000000..032599d --- /dev/null +++ b/scripts/qa/gap_analysis.py @@ -0,0 +1,188 @@ +""" +Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls. + +For each regulation PDF: +1. Extract all articles/sections from the PDF +2. Compare with controls in the DB that reference this article +3. 
def main():
    """Print a per-source gap report: document articles that have no control.

    Command line: an optional ``--source <text>`` restricts the run to sources
    whose name contains ``<text>`` (case-insensitive). Without the filter,
    sources with fewer than ``MIN_CONTROLS`` distinct articles in the DB are
    skipped.

    Reads the connection string from the ``DATABASE_URL`` environment variable
    (``KeyError`` if unset), prints the report to stdout and writes the
    per-source results as JSON to ``/tmp/gap_analysis_results.json``.
    """
    source_filter = None
    if "--source" in sys.argv:
        idx = sys.argv.index("--source")
        if idx + 1 < len(sys.argv):
            source_filter = sys.argv[idx + 1]

    # DB connection
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=parsed.hostname, port=parsed.port or 5432,
        user=parsed.username, password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public"
    )
    try:
        cur = conn.cursor()
        # Get all controls grouped by source with their article
        cur.execute("""
            SELECT source_citation->>'source' as source,
                   source_citation->>'article' as article,
                   source_citation->>'article_type' as article_type,
                   count(*) as cnt
            FROM compliance.canonical_controls
            WHERE source_citation->>'source' IS NOT NULL
              AND release_state NOT IN ('duplicate', 'too_close')
            GROUP BY 1, 2, 3
            ORDER BY 1, 2
        """)
        rows = cur.fetchall()
    finally:
        # The DB is only needed for this one query; release the connection
        # before the (slow) document parsing and even if the query fails.
        conn.close()

    # Build: source -> {article -> (type, count)}
    controls_by_source = defaultdict(dict)
    for source, article, art_type, cnt in rows:
        if article:
            controls_by_source[source][article] = (art_type or "unknown", cnt)

    total_gaps = 0
    total_articles_checked = 0
    total_covered = 0
    sources_analyzed = 0
    gap_report = []

    sources_to_check = sorted(SOURCE_FILE_MAP.keys())
    if source_filter:
        sources_to_check = [s for s in sources_to_check if source_filter.lower() in s.lower()]

    # Only these index entry types count as articles that need a control
    # (preamble and annex entries are reported separately).
    substantive_types = {"article", "section", "control", "requirement", "category"}

    for source_name in sources_to_check:
        filename = SOURCE_FILE_MAP.get(source_name)
        if filename is None:
            continue

        controls = controls_by_source.get(source_name, {})
        if len(controls) < MIN_CONTROLS and not source_filter:
            continue

        # Read the document and build its article index
        text = read_file(filename)
        if text is None:
            continue

        doc_type = classify_doc(source_name)
        max_art = MAX_ARTICLES.get(source_name)

        if doc_type == "eu_regulation":
            index = build_eu_article_index(text, max_article=max_art)
        elif doc_type == "de_law":
            index = build_de_law_index(text)
        elif doc_type == "nist":
            index = build_nist_index(text)
        elif doc_type == "owasp":
            index = build_owasp_index(text, source_name)
        else:
            index = build_generic_index(text)

        if not index:
            continue

        substantive_articles = [(pos, label, typ) for pos, label, typ in index if typ in substantive_types]
        preamble_articles = [(pos, label, typ) for pos, label, typ in index if typ == "preamble"]
        annex_articles = [(pos, label, typ) for pos, label, typ in index if typ == "annex"]

        # Check which articles have controls
        covered = []
        gaps = []
        for pos, label, typ in substantive_articles:
            if label in controls:
                covered.append(label)
            else:
                gaps.append((label, typ))

        sources_analyzed += 1
        total_articles_checked += len(substantive_articles)
        total_covered += len(covered)
        total_gaps += len(gaps)

        # Count preamble/annex controls
        preamble_controls = sum(1 for a in controls if controls[a][0] == "preamble")
        annex_controls = sum(1 for a in controls if controls[a][0] == "annex")

        coverage_pct = len(covered) / len(substantive_articles) * 100 if substantive_articles else 0

        print(f"\n{'='*70}")
        print(f"{source_name}")
        print(f" PDF articles: {len(substantive_articles)} substantive, "
              f"{len(preamble_articles)} preamble, {len(annex_articles)} annex")
        print(f" DB controls: {sum(v[1] for v in controls.values())} total "
              f"({preamble_controls} preamble, {annex_controls} annex)")
        print(f" Coverage: {len(covered)}/{len(substantive_articles)} "
              f"({coverage_pct:.0f}%)")

        if gaps:
            print(f" GAPS ({len(gaps)}):")
            for label, typ in gaps[:30]:  # limit output
                print(f" - {label} [{typ}]")
            if len(gaps) > 30:
                print(f" ... and {len(gaps)-30} more")

        gap_report.append({
            "source": source_name,
            "total_articles": len(substantive_articles),
            "covered": len(covered),
            "gaps": len(gaps),
            "coverage_pct": round(coverage_pct, 1),
            "gap_articles": [{"label": l, "type": t} for l, t in gaps],
        })

    # Summary
    print(f"\n{'='*70}")
    print("GAP ANALYSIS SUMMARY")
    print(f"{'='*70}")
    # Count only sources that actually reached the comparison step; the old
    # expression added the number of candidate sources on top of the report
    # length and therefore over-reported.
    print(f" Sources analyzed: {sources_analyzed}")
    print(f" Total articles in PDFs: {total_articles_checked}")
    print(f" Articles with controls: {total_covered}")
    print(f" Articles WITHOUT controls: {total_gaps}")
    if total_articles_checked:
        print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")

    print(f"\n Sources with gaps:")
    for r in sorted(gap_report, key=lambda x: -x["gaps"]):
        print(f" {r['source']:45s} {r['gaps']:4d} gaps "
              f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")

    # Save report. ensure_ascii=False emits non-ASCII labels verbatim, so the
    # file encoding must not depend on the locale.
    out_path = "/tmp/gap_analysis_results.json"
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(gap_report, f, indent=2, ensure_ascii=False)
    print(f"\n Full report saved to {out_path}")


if __name__ == "__main__":
    main()
controls in DB.""" +import os +import re +import json +import psycopg2 +import urllib.parse +from collections import defaultdict + +OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal") + +# ── Load SP 800-53 Rev 5 ── +with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f: + sp853 = json.load(f)["catalog"] + +print("=" * 70) +print("NIST SP 800-53 Rev 5 — OSCAL Catalog Analysis") +print("=" * 70) +print(f" UUID: {sp853.get('uuid', '?')}") +print(f" Last Modified: {sp853.get('metadata', {}).get('last-modified', '?')}") + +# Count controls +families = sp853.get("groups", []) +total_base = 0 +total_enhancements = 0 +total_withdrawn = 0 +total_active = 0 +family_stats = [] + +for fam in families: + fam_id = fam.get("id", "?") + fam_title = fam.get("title", "?") + controls = fam.get("controls", []) + base = 0 + enhancements = 0 + withdrawn = 0 + + for ctrl in controls: + # Check if withdrawn + props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])} + is_withdrawn = props.get("status") == "withdrawn" + if is_withdrawn: + withdrawn += 1 + else: + base += 1 + + # Count enhancements + for enh in ctrl.get("controls", []): + enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])} + if enh_props.get("status") == "withdrawn": + withdrawn += 1 + else: + enhancements += 1 + + family_stats.append((fam_id, fam_title, base, enhancements, withdrawn)) + total_base += base + total_enhancements += enhancements + total_withdrawn += withdrawn + +total_active = total_base + total_enhancements +print(f"\n Families: {len(families)}") +print(f" Base Controls: {total_base}") +print(f" Enhancements: {total_enhancements}") +print(f" Withdrawn: {total_withdrawn}") +print(f" TOTAL ACTIVE: {total_active}") + +print(f"\n Per Family:") +print(f" {'ID':6s} {'Title':45s} {'Base':>5s} {'Enh':>5s} {'Wdrn':>5s}") +for fam_id, title, base, enh, wdrn in family_stats: + print(f" {fam_id:6s} {title[:45]:45s} {base:5d} {enh:5d} {wdrn:5d}") + +# Show 
example control structure +print(f"\n Example Control (AC-6 Least Privilege):") +for fam in families: + for ctrl in fam.get("controls", []): + if ctrl["id"] == "ac-6": + props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])} + print(f" ID: {ctrl['id']}") + print(f" Label: {props.get('label', '?')}") + print(f" Title: {ctrl['title']}") + for part in ctrl.get("parts", []): + if part.get("name") == "statement": + prose = part.get("prose", "") + print(f" Statement: {prose[:150]}...") + elif part.get("name") == "guidance": + prose = part.get("prose", "") + print(f" Guidance: {prose[:150]}...") + enh_count = len(ctrl.get("controls", [])) + print(f" Enhancements: {enh_count}") + links = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"] + print(f" Related: {', '.join(links[:8])}...") + break + +# ── Load CSF 2.0 ── +print(f"\n{'='*70}") +print("NIST CSF 2.0 — OSCAL Catalog Analysis") +print("=" * 70) + +with open(os.path.join(OSCAL_DIR, "csf-2.0-catalog.json")) as f: + csf = json.load(f)["catalog"] + +csf_groups = csf.get("groups", []) +csf_total = 0 +for grp in csf_groups: + func_title = grp.get("title", "?") + cats = grp.get("groups", []) + subcats = 0 + for cat in cats: + subcats += len(cat.get("controls", [])) + csf_total += subcats + print(f" {func_title:25s}: {len(cats):2d} categories, {subcats:3d} subcategories") + +print(f" TOTAL: {csf_total} subcategories") + +# ── Compare with existing DB controls ── +print(f"\n{'='*70}") +print("VERGLEICH: OSCAL vs. 
bestehende Controls in DB") +print("=" * 70) + +db_url = os.environ['DATABASE_URL'] +parsed = urllib.parse.urlparse(db_url) +conn = psycopg2.connect( + host=parsed.hostname, port=parsed.port or 5432, + user=parsed.username, password=parsed.password, + dbname=parsed.path.lstrip('/'), + options="-c search_path=compliance,public" +) +cur = conn.cursor() + +# Get existing NIST controls +cur.execute(""" + SELECT control_id, title, + source_citation->>'source' as source, + source_citation->>'article' as article, + source_citation->>'article_type' as art_type, + release_state + FROM compliance.canonical_controls + WHERE source_citation->>'source' LIKE 'NIST%%' + ORDER BY source_citation->>'source', control_id +""") +nist_controls = cur.fetchall() + +# Group by source +by_source = defaultdict(list) +for ctrl in nist_controls: + by_source[ctrl[2]].append(ctrl) + +print(f"\n Bestehende NIST Controls in DB:") +for src in sorted(by_source.keys()): + ctrls = by_source[src] + active = sum(1 for c in ctrls if c[5] not in ('duplicate', 'too_close')) + with_article = sum(1 for c in ctrls if c[3]) + print(f" {src:40s}: {len(ctrls):4d} total, {active:4d} active, {with_article:4d} mit article") + +# For SP 800-53: which control families do we have? +sp853_existing = [c for c in nist_controls if 'SP 800-53' in (c[2] or '')] +existing_families = set() +existing_articles = set() +for ctrl in sp853_existing: + article = ctrl[3] or "" + if article: + # Extract family prefix (e.g., "AC-6" → "AC") + m = re.match(r'([A-Z]{2})-', article) + if m: + existing_families.add(m.group(1)) + existing_articles.add(article) + +print(f"\n SP 800-53 in DB:") +print(f" Total: {len(sp853_existing)}") +print(f" Families covered: {len(existing_families)}") +print(f" Unique articles: {len(existing_articles)}") +print(f" Families: {', '.join(sorted(existing_families))}") + +# Compare: which OSCAL controls are NOT in our DB? 
+oscal_controls = {} # id → (label, title, statement) +for fam in families: + for ctrl in fam.get("controls", []): + props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])} + if props.get("status") == "withdrawn": + continue + label = props.get("label", ctrl["id"].upper()) + statement = "" + guidance = "" + for part in ctrl.get("parts", []): + if part.get("name") == "statement": + statement = part.get("prose", "") + # Also check sub-items + for sub in part.get("parts", []): + statement += " " + sub.get("prose", "") + elif part.get("name") == "guidance": + guidance = part.get("prose", "") + + oscal_controls[label] = (ctrl["title"], statement[:500], guidance[:500]) + + # Enhancements + for enh in ctrl.get("controls", []): + enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])} + if enh_props.get("status") == "withdrawn": + continue + enh_label = enh_props.get("label", enh["id"].upper()) + enh_statement = "" + enh_guidance = "" + for part in enh.get("parts", []): + if part.get("name") == "statement": + enh_statement = part.get("prose", "") + for sub in part.get("parts", []): + enh_statement += " " + sub.get("prose", "") + elif part.get("name") == "guidance": + enh_guidance = part.get("prose", "") + oscal_controls[enh_label] = (enh["title"], enh_statement[:500], enh_guidance[:500]) + +print(f"\n OSCAL SP 800-53 aktive Controls: {len(oscal_controls)}") + +# Find missing: in OSCAL but not in DB +missing = [] +covered = [] +for label in sorted(oscal_controls.keys()): + if label in existing_articles: + covered.append(label) + else: + missing.append(label) + +print(f" In DB vorhanden: {len(covered)}") +print(f" FEHLEND in DB: {len(missing)}") + +# Missing by family +missing_by_fam = defaultdict(list) +for label in missing: + fam = label.split("-")[0] + missing_by_fam[fam].append(label) + +print(f"\n Fehlende Controls nach Family:") +for fam in sorted(missing_by_fam.keys()): + ctrls = missing_by_fam[fam] + examples = ", ".join(ctrls[:5]) + 
more = f" +{len(ctrls)-5}" if len(ctrls) > 5 else "" + print(f" {fam:4s}: {len(ctrls):3d} fehlend ({examples}{more})") + +# Also check CSF 2.0 +print(f"\n{'='*70}") +print("NIST CSF 2.0 — Vergleich mit DB") +print("=" * 70) + +cur.execute(""" + SELECT count(*), count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) + FROM compliance.canonical_controls + WHERE source_citation->>'source' LIKE 'NIST Cybersecurity%%' +""") +csf_row = cur.fetchone() +print(f" CSF Controls in DB: {csf_row[0]} total, {csf_row[1]} active") + +csf_subcats = 0 +csf_ids = [] +for grp in csf_groups: + for cat in grp.get("groups", []): + for subcat in cat.get("controls", []): + csf_subcats += 1 + props = {p["name"]: p.get("value", "") for p in subcat.get("props", [])} + csf_ids.append(props.get("label", subcat["id"])) + +print(f" CSF 2.0 OSCAL Subcategories: {csf_subcats}") +print(f" Beispiele: {', '.join(csf_ids[:10])}") + +# ── Summary / Potential ── +print(f"\n{'='*70}") +print("POTENTIAL: Was OSCAL uns bringt") +print("=" * 70) +print(f""" + SP 800-53 Rev 5: + - {len(missing)} neue Controls möglich (aktuell {len(covered)} in DB) + - Jeder Control hat: Statement + Guidance + Assessment-Methoden + - Cross-References zwischen Controls (für Mapping) + - Maschinenlesbare Parameter (ODP) + - Public Domain — keine Lizenzprobleme + + CSF 2.0: + - {csf_subcats} Subcategories als Compliance-Controls + - 6 Functions (Govern, Identify, Protect, Detect, Respond, Recover) + - Direkte Mappings zu SP 800-53 Controls + + Nächste Schritte: + 1. Fehlende SP 800-53 Controls importieren ({len(missing)} Controls) + 2. Statement-Text als source_original_text verwenden + 3. article_type='control', article=Label (z.B. 'AC-6') + 4. CSF 2.0 als eigene Regulation importieren + 5. 
def extract_controls(catalog):
    """Flatten an OSCAL catalog into a list of active control records.

    Walks every family in ``catalog["groups"]`` and, for each base control,
    the control itself followed by its enhancements (``ctrl["controls"]``).
    Entries for which :func:`extract_single` returns ``None`` (withdrawn)
    are left out. The family title is attached to every record.
    """
    records = []
    for family in catalog.get("groups", []):
        family_title = family.get("title", "")
        for ctrl in family.get("controls", []):
            # Base control first, then its enhancements, in catalog order.
            for candidate in (ctrl, *ctrl.get("controls", [])):
                record = extract_single(candidate, family_title)
                if record:
                    records.append(record)
    return records


def _part_label(part):
    """Return the value of the last ``label`` prop of an OSCAL part, or ''."""
    label = ""
    for prop in part.get("props", []):
        if prop.get("name") == "label":
            label = prop.get("value", "")
    return label


def _statement_lines(parts, depth=0):
    """Render nested statement items as lines, one space of indent per level.

    A labelled item becomes ``"<label> <prose>"``; an unlabelled item is
    emitted only when it has prose. Children are rendered directly after
    their parent, to any nesting depth.
    """
    indent = " " * depth
    lines = []
    for part in parts:
        prose = part.get("prose", "")
        label = _part_label(part)
        if label:
            lines.append(f"{indent}{label} {prose}")
        elif prose:
            lines.append(f"{indent}{prose}")
        lines.extend(_statement_lines(part.get("parts", []), depth + 1))
    return lines


def extract_single(ctrl, family_title):
    """Extract a single control or enhancement.

    Args:
        ctrl: OSCAL control object (dict); must contain ``"id"`` when it has
            no ``label`` prop.
        family_title: title of the family the control belongs to.

    Returns:
        ``None`` when the control's ``status`` prop is ``"withdrawn"``;
        otherwise a dict with keys ``label``, ``title``, ``family``,
        ``statement``, ``guidance``, ``related``, ``params`` and
        ``is_enhancement`` (true when the label contains ``"("``).
    """
    props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if props.get("status") == "withdrawn":
        return None

    label = props.get("label", ctrl["id"].upper())
    title = ctrl.get("title", "")

    # Statement = the part's own prose plus one line per (nested) sub-item.
    # If several parts are named "statement"/"guidance", the last one wins.
    statement = ""
    guidance = ""
    for part in ctrl.get("parts", []):
        name = part.get("name")
        if name == "statement":
            statement = part.get("prose", "")
            for line in _statement_lines(part.get("parts", [])):
                statement += f"\n{line}"
        elif name == "guidance":
            guidance = part.get("prose", "")

    # Cross-references: hrefs of "related" links without the leading '#'.
    related = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]

    # Parameters
    params = []
    for p in ctrl.get("params", []):
        choices = list(p["select"].get("choice", [])) if "select" in p else []
        params.append({
            "id": p.get("id", ""),
            "label": p.get("label", ""),
            "guidelines": "".join(g.get("prose", "") for g in p.get("guidelines", [])),
            "choices": choices,
        })

    return {
        "label": label,
        "title": title,
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        "is_enhancement": "(" in label,
    }


def normalize_label(label):
    """Canonicalize a control label for comparison.

    Strips zero padding after the dash and inside the parentheses and
    upper-cases the result, e.g. ``"ac-01(02)"`` -> ``"AC-1(2)"``.
    """
    label = re.sub(r'-0+(\d)', r'-\1', label)
    label = re.sub(r'\(0+(\d+)\)', r'(\1)', label)
    return label.upper()
def next_control_id(prefix, existing):
    """Return the highest number already used for ``prefix``, or 0 if none.

    Despite the name this is the *current maximum*, not the next free value:
    callers increment the result before building a new id such as
    ``SEC-1234``.

    Args:
        prefix: id prefix, matched literally (e.g. ``"SEC"``).
        existing: iterable of control ids; only ids of the exact form
            ``<prefix>-<digits>`` are considered.
    """
    # re.escape keeps a prefix containing regex metacharacters from matching
    # unrelated ids.
    pattern = re.compile(rf'^{re.escape(prefix)}-(\d+)$')
    matches = (pattern.match(eid) for eid in existing)
    return max((int(m.group(1)) for m in matches if m), default=0)
+for prefix in set(FAMILY_PREFIX.values()): + prefix_counters[prefix] = next_control_id(prefix, existing_ids) +print(f"Starting counters: {prefix_counters}") + +# ── Filter to only new controls ── +to_import = [] +for ctrl in all_oscal: + norm = normalize_label(ctrl["label"]) + if norm not in existing_labels: + to_import.append(ctrl) + +print(f"\nControls to import: {len(to_import)}") + +# ── Import ── +imported = 0 +for ctrl in to_import: + prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP") + prefix_counters[prefix] += 1 + control_id = f"{prefix}-{prefix_counters[prefix]:04d}" + + # Build title: "NIST {label}: {title}" + title = f"NIST {ctrl['label']}: {ctrl['title']}" + + # source_original_text = statement (the official requirement text) + source_text = ctrl["statement"] + if not source_text: + source_text = ctrl["guidance"][:500] if ctrl["guidance"] else ctrl["title"] + + # objective = guidance text + objective = ctrl["guidance"][:2000] if ctrl["guidance"] else "" + + # source_citation + citation = { + "source": "NIST SP 800-53 Rev. 
5", + "article": ctrl["label"], + "article_type": "control", + "source_type": "standard", + "oscal_import": True, + } + if ctrl["related"]: + citation["related_controls"] = ctrl["related"][:20] + if ctrl["params"]: + citation["parameters"] = [{"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]] + + FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf' + new_id = str(uuid.uuid4()) + cur.execute(""" + INSERT INTO compliance.canonical_controls + (id, framework_id, control_id, title, objective, rationale, + severity, source_original_text, + source_citation, pipeline_version, release_state, + generation_strategy, category) + VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s) + """, ( + new_id, + FRAMEWORK_ID, + control_id, + title[:500], + objective[:5000], + source_text[:10000], + json.dumps(citation, ensure_ascii=False), + ctrl["family"], + )) + imported += 1 + +conn.commit() +print(f"\nImported: {imported} new controls") + +# ── Verify ── +cur.execute(""" + SELECT count(*), + count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close')) + FROM compliance.canonical_controls + WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 
5' +""") +total, active = cur.fetchone() +print(f"\nSP 800-53 after import: {total} total, {active} active") + +cur.execute(""" + SELECT release_state, count(*) + FROM compliance.canonical_controls + GROUP BY release_state + ORDER BY count(*) DESC +""") +print(f"\nDB release_state gesamt:") +for row in cur.fetchall(): + print(f" {row[0]:15s}: {row[1]:5d}") + +cur.execute(""" + SELECT count(*) + FROM compliance.canonical_controls + WHERE release_state NOT IN ('duplicate', 'too_close') +""") +print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}") + +# ── Import stats by family ── +fam_counts = {} +for ctrl in to_import: + fam = ctrl["family"] + fam_counts[fam] = fam_counts.get(fam, 0) + 1 + +print(f"\nImportiert nach Family:") +for fam in sorted(fam_counts.keys()): + print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}") + +conn.close() diff --git a/scripts/qa/owasp_cleanup.py b/scripts/qa/owasp_cleanup.py new file mode 100644 index 0000000..5bcf2c0 --- /dev/null +++ b/scripts/qa/owasp_cleanup.py @@ -0,0 +1,274 @@ +"""OWASP Cleanup: +1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate' +2. 
# Single-character substitutions applied by normalize(): invisible characters
# are dropped, ligatures expanded, typographic quotes/dashes/bullets mapped
# to their ASCII counterparts.
_CHAR_MAP = {
    0x00AD: None,   # soft hyphen
    0x200B: None,   # zero-width space
    0x00A0: ' ',    # no-break space
    0xFB01: 'fi', 0xFB02: 'fl', 0xFB00: 'ff', 0xFB03: 'ffi', 0xFB04: 'ffl',
    0x2019: "'", 0x2018: "'",
    0x201C: '"', 0x201D: '"',
    0x2013: '-', 0x2014: '-',
    0x2022: '-', 0x00B7: '-',
}


def normalize(s):
    """Return ``s`` reduced to a form suitable for substring matching.

    Applies the character substitutions above, removes ASCII control
    characters other than tab/LF/CR, applies Unicode NFC, collapses every
    whitespace run to one space and strips both ends.
    """
    s = s.translate(_CHAR_MAP)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()


def build_owasp_index(text_norm, source_name):
    """Index the heading identifiers of an OWASP document.

    The identifier pattern is chosen from ``source_name``: Top 10
    (``A01:2021``), API Top 10 (``API1:2023``), MASVS (``MASVS-AUTH-1``) or
    ASVS (``V1.2.3``). Any other source yields an empty index.

    Returns:
        ``(position, label, type)`` tuples in document order, one per
        distinct label (first occurrence).
    """
    # "MASVS" must be tested before "ASVS": the former contains the latter,
    # so the reverse order would index MASVS documents with the ASVS pattern.
    if "Top 10" in source_name and "API" not in source_name:
        pattern, typ = r'(A\d{2}:\d{4})', "category"
    elif "API" in source_name:
        pattern, typ = r'(API\d+:\d{4})', "category"
    elif "MASVS" in source_name:
        pattern, typ = r'(MASVS-[A-Z]+-\d+)', "requirement"
    elif "ASVS" in source_name:
        pattern, typ = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
    else:
        return []

    seen = set()
    unique = []
    # finditer yields matches in increasing position, so no sort is needed.
    for m in re.finditer(pattern, text_norm):
        label = m.group(1)
        if label not in seen:
            seen.add(label)
            unique.append((m.start(), label, typ))
    return unique


def find_in_pdf(orig_text, source_name):
    """Find control text in a specific PDF. Returns (label, type) or None.

    Probes the normalized PDF text (module-level ``pdf_norms``) with
    snippets of the normalized control text taken at several relative
    offsets and lengths. The first snippet found is attributed to the last
    heading in ``pdf_indexes[source_name]`` at or before the hit, or to
    ``("Unknown", "unknown")`` when no heading precedes it. Texts shorter
    than 20 characters after normalization are not searched.
    """
    pdf_norm = pdf_norms.get(source_name)
    if not pdf_norm:
        return None
    orig_norm = normalize(orig_text)
    if len(orig_norm) < 20:
        return None
    idx = pdf_indexes.get(source_name, [])
    for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        start = max(0, int(len(orig_norm) * start_frac))
        for length in [80, 60, 40, 30, 20]:
            snippet = orig_norm[start:start+length]
            if len(snippet) < 15:
                continue
            pos = pdf_norm.find(snippet)
            if pos >= 0:
                for h_pos, h_label, h_type in reversed(idx):
                    if h_pos <= pos:
                        return (h_label, h_type)
                return ("Unknown", "unknown")
    return None
═══════════════════════════════════════════════════════════════ +# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate +# ═══════════════════════════════════════════════════════════════ +print("=" * 60) +print("STEP 1: OWASP Top 10 — multilingual controls → duplicate") +print("=" * 60) + +cur.execute(""" + SELECT id, control_id, title, source_original_text, release_state + FROM compliance.canonical_controls + WHERE source_citation->>'source' = 'OWASP Top 10 (2021)' + AND source_citation->>'article_type' IS NULL + AND source_original_text IS NOT NULL + AND release_state NOT IN ('duplicate', 'too_close') + ORDER BY control_id +""") +top10_unmatched = cur.fetchall() +print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}") + +# Separate: found in other OWASP PDF vs not found anywhere +to_mark_dup = [] +to_fix_source = [] + +for ctrl in top10_unmatched: + uid, cid, title, text, state = ctrl + + # Check if found in another OWASP PDF + found_in = None + found_result = None + for other_src in OWASP_PDFS: + if other_src == 'OWASP Top 10 (2021)': + continue + result = find_in_pdf(text, other_src) + if result: + found_in = other_src + found_result = result + break + + if found_in: + to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1])) + else: + to_mark_dup.append((uid, cid)) + +print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate") +print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution") + +# Mark as duplicate +dup_marked = 0 +for uid, cid in to_mark_dup: + cur.execute(""" + UPDATE compliance.canonical_controls + SET release_state = 'duplicate' + WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close') + """, (uid,)) + if cur.rowcount > 0: + dup_marked += 1 + +print(f" Marked as duplicate: {dup_marked}") + +# ═══════════════════════════════════════════════════════════════ +# STEP 2: Fix wrong source attributions across ALL OWASP sources +# 
═══════════════════════════════════════════════════════════════ +print(f"\n{'='*60}") +print("STEP 2: Fix wrong OWASP source attributions") +print("=" * 60) + +all_fixes = list(to_fix_source) # Start with Top 10 fixes + +# Also check ASVS, SAMM, MASVS +for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']: + cur.execute(""" + SELECT id, control_id, title, source_original_text + FROM compliance.canonical_controls + WHERE source_citation->>'source' = %s + AND source_citation->>'article_type' IS NULL + AND source_original_text IS NOT NULL + AND release_state NOT IN ('duplicate', 'too_close') + """, (source,)) + controls = cur.fetchall() + + for ctrl in controls: + uid, cid, title, text = ctrl + # Try own PDF first + result = find_in_pdf(text, source) + if result: + # Found in own PDF! Update article info + cur.execute(""" + UPDATE compliance.canonical_controls + SET source_citation = source_citation || + jsonb_build_object('article', %s, 'article_type', %s) + WHERE id = %s + AND (source_citation->>'article' IS DISTINCT FROM %s + OR source_citation->>'article_type' IS DISTINCT FROM %s) + """, (result[0], result[1], uid, result[0], result[1])) + continue + + # Try other OWASP PDFs + for other_src in OWASP_PDFS: + if other_src == source: + continue + result = find_in_pdf(text, other_src) + if result: + all_fixes.append((uid, cid, other_src, result[0], result[1])) + break + +print(f" Total wrong-source controls found: {len(all_fixes)}") + +# Apply source fixes +fixed = 0 +for uid, cid, correct_source, label, typ in all_fixes: + cur.execute(""" + UPDATE compliance.canonical_controls + SET source_citation = source_citation || + jsonb_build_object('source', %s, 'article', %s, 'article_type', %s) + WHERE id = %s + """, (correct_source, label, typ, uid,)) + if cur.rowcount > 0: + fixed += 1 + print(f" {cid:10s} → {correct_source} / {label} [{typ}]") + +print(f" Fixed: {fixed} controls") + +conn.commit() + +# 
# Single-character substitutions applied by normalize(): invisible characters
# are dropped, ligatures expanded, typographic quotes/dashes/bullets mapped
# to their ASCII counterparts.
_CHAR_MAP = {
    0x00AD: None,   # soft hyphen
    0x200B: None,   # zero-width space
    0x00A0: ' ',    # no-break space
    0xFB01: 'fi', 0xFB02: 'fl', 0xFB00: 'ff', 0xFB03: 'ffi', 0xFB04: 'ffl',
    0x2019: "'", 0x2018: "'",
    0x201C: '"', 0x201D: '"',
    0x2013: '-', 0x2014: '-',
    0x2022: '-', 0x00B7: '-',
}


def normalize(s):
    """Return ``s`` reduced to a form suitable for substring matching.

    Applies the character substitutions above, removes ASCII control
    characters other than tab/LF/CR, applies Unicode NFC, collapses every
    whitespace run to one space and strips both ends.
    """
    s = s.translate(_CHAR_MAP)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()


# ── Load Markdown sources ──
def load_markdown_dir(path, pattern="*.md"):
    """Read every file directly under ``path`` that matches ``pattern``.

    Args:
        path: ``pathlib.Path`` of the directory to scan (not recursive).
        pattern: glob pattern for the file names.

    Returns:
        Dict mapping file name -> file content, inserted in sorted path
        order. Content is decoded as UTF-8 with undecodable bytes replaced.
        Entries that cannot be read (e.g. a directory matching the pattern,
        missing permissions) are skipped.
    """
    texts = {}
    for f in sorted(path.glob(pattern)):
        try:
            texts[f.name] = f.read_text(encoding='utf-8', errors='replace')
        except OSError:
            # Best effort: one unreadable entry must not abort the load, but
            # KeyboardInterrupt/SystemExit are no longer swallowed.
            continue
    return texts
# Build indexes for each source
def _first_occurrences(items):
    """Sort ``(pos, label, type)`` tuples by position and keep, for every
    label, only its earliest entry."""
    seen = set()
    unique = []
    # sorted() is stable, so entries sharing a position keep insertion order.
    for pos, label, typ in sorted(items, key=lambda item: item[0]):
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique


def _index_pattern(text, pattern, typ):
    """Return ``(start, group 1, typ)`` for every match of ``pattern``."""
    return [(m.start(), m.group(1), typ) for m in re.finditer(pattern, text)]


def build_asvs_index(text):
    """Index ASVS requirement ids such as ``V2.1`` or ``V2.1.3``."""
    return _first_occurrences(
        _index_pattern(text, r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"))


def build_samm_index(text):
    """Index SAMM sections.

    Two kinds of entries are collected, both typed ``"section"``: numbered
    headings (``"Section 1.2"``, a dotted number followed by whitespace and
    a capital letter) and practice names from a fixed list, extended by up
    to 30 characters that are neither ``.`` nor a newline and truncated to
    50 characters.
    """
    items = []
    # SAMM practices have names like "Strategy & Metrics", sections numbered
    for m in re.finditer(r'(?:^|\s)(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
    # Also find practice identifiers
    for m in re.finditer(r'((?:Strategy|Education|Policy|Threat|Security Requirements|Secure Architecture|'
                         r'Secure Build|Secure Deployment|Defect Management|Environment Management|'
                         r'Incident Management|Requirements Testing|Security Testing|'
                         r'Design Review|Implementation Review|Operations Management)'
                         r'[^.\n]{0,30})', text):
        items.append((m.start(), m.group(1)[:50], "section"))
    return _first_occurrences(items)


def build_masvs_index(text):
    """Index MASVS control ids such as ``MASVS-AUTH-1``."""
    return _first_occurrences(
        _index_pattern(text, r'(MASVS-[A-Z]+-\d+)', "requirement"))


def build_api_index(text):
    """Index API Security Top 10 categories such as ``API1:2023``."""
    return _first_occurrences(
        _index_pattern(text, r'(API\d+:\d{4})', "category"))
def find_text(orig_text, source_name):
    """Locate a control's text inside one normalized GitHub source.

    Probes ``SOURCE_GITHUB[source_name]`` with snippets cut from the
    normalized control text at several relative offsets and widths. On the
    first hit, returns the ``(label, type)`` of the last entry in that
    source's index whose position is at or before the hit, or
    ``("Unknown", "unknown")`` when no such entry exists.

    Returns None when the source is missing or empty, when the normalized
    control text is shorter than 20 characters, or when no snippet is found.
    """
    haystack = SOURCE_GITHUB.get(source_name)
    if not haystack:
        return None
    headings = source_indexes.get(source_name, [])
    needle = normalize(orig_text)
    if len(needle) < 20:
        return None

    for fraction in (0.25, 0.1, 0.5, 0.0, 0.75):
        offset = max(0, int(len(needle) * fraction))
        for width in (80, 60, 40, 30, 20):
            probe = needle[offset:offset + width]
            # Probes shorter than 15 characters are skipped.
            if len(probe) < 15:
                continue
            hit = haystack.find(probe)
            if hit < 0:
                continue
            # Walk the index backwards to the closest entry not after the hit.
            return next(
                ((h_label, h_type)
                 for h_pos, h_label, h_type in reversed(headings)
                 if h_pos <= hit),
                ("Unknown", "unknown"),
            )
    return None


def find_in_any_github(orig_text, exclude_source=None):
    """Try every GitHub source except ``exclude_source``.

    Returns ``(source_name, label, type)`` for the first source in which
    ``find_text`` reports a match, or None when none matches.
    """
    for candidate in SOURCE_GITHUB:
        if candidate != exclude_source:
            found = find_text(orig_text, candidate)
            if found:
                label, kind = found[0], found[1]
                return (candidate, label, kind)
    return None
NOT NULL + AND release_state NOT IN ('duplicate', 'too_close') + ORDER BY control_id + """, (source,)) + controls = cur.fetchall() + + if not controls: + continue + + print(f"\n{'='*60}") + print(f"{source} — {len(controls)} unmatched active") + print(f"{'='*60}") + + matched = 0 + cross_matched = 0 + not_found = 0 + + for ctrl in controls: + uid, cid, title, text, state = ctrl + + # Try own GitHub source + result = find_text(text, source) + if result: + matched += 1 + total_matched += 1 + all_updates.append((uid, cid, source, result[0], result[1])) + print(f" {cid:10s} → {result[0]:30s} [{result[1]}]") + continue + + # Try other GitHub sources + cross = find_in_any_github(text, exclude_source=source) + if cross: + cross_matched += 1 + total_cross += 1 + all_updates.append((uid, cid, cross[0], cross[1], cross[2])) + print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}] (CROSS)") + continue + + not_found += 1 + total_not_found += 1 + + print(f"\n Own source matched: {matched}") + print(f" Cross-source: {cross_matched}") + print(f" Not found: {not_found}") + +# ── Also try OWASP Top 10 remaining unmatched (34 active left after dup marking) ── +cur.execute(""" + SELECT id, control_id, title, source_original_text, release_state + FROM compliance.canonical_controls + WHERE source_citation->>'source' = 'OWASP Top 10 (2021)' + AND source_citation->>'article_type' IS NULL + AND source_original_text IS NOT NULL + AND release_state NOT IN ('duplicate', 'too_close') + ORDER BY control_id +""") +top10_remaining = cur.fetchall() +if top10_remaining: + print(f"\n{'='*60}") + print(f"OWASP Top 10 (2021) — {len(top10_remaining)} remaining unmatched active") + print(f"{'='*60}") + for ctrl in top10_remaining: + uid, cid, title, text, state = ctrl + cross = find_in_any_github(text) + if cross: + total_cross += 1 + all_updates.append((uid, cid, cross[0], cross[1], cross[2])) + print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}]") + else: + total_not_found += 1 + +# 
── Summary ── +print(f"\n{'='*60}") +print(f"ZUSAMMENFASSUNG") +print(f"{'='*60}") +print(f" Matched in eigener GitHub-Quelle: {total_matched}") +print(f" Cross-source matched: {total_cross}") +print(f" Nicht gefunden: {total_not_found}") +print(f" Total Updates: {len(all_updates)}") + +# ── Apply updates ── +if all_updates: + print(f"\nApplying {len(all_updates)} updates to DB...") + applied = 0 + for uid, cid, correct_source, label, typ in all_updates: + # Update article + article_type, and fix source if cross-matched + cur.execute(""" + UPDATE compliance.canonical_controls + SET source_citation = source_citation || + jsonb_build_object('article', %s, 'article_type', %s) + WHERE id = %s + AND (source_citation->>'article' IS DISTINCT FROM %s + OR source_citation->>'article_type' IS DISTINCT FROM %s) + """, (label, typ, uid, label, typ)) + if cur.rowcount > 0: + applied += 1 + + conn.commit() + print(f" Applied: {applied} controls updated") + + # Type distribution + type_counts = {} + for _, _, _, _, typ in all_updates: + type_counts[typ] = type_counts.get(typ, 0) + 1 + print(f"\n Article type distribution:") + for t, c in sorted(type_counts.items(), key=lambda x: -x[1]): + print(f" {t:12s}: {c:5d}") + +conn.close() diff --git a/scripts/qa/phase5_normalize_and_cleanup.py b/scripts/qa/phase5_normalize_and_cleanup.py new file mode 100644 index 0000000..bebd2fa --- /dev/null +++ b/scripts/qa/phase5_normalize_and_cleanup.py @@ -0,0 +1,357 @@ +"""Phase 5: Source Normalization + Duplicate Hard Delete. + +Steps: + 1. OSCAL controls: add source_regulation to generation_metadata + 2. Fix 20 v3 controls with NULL source (tag as manually_reviewed) + 3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich) + 4. Fix OWASP cross-source misattributions (regulation_code vs actual source) + 5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs) + 6. 
Clean up canonical_processed_chunks generated_control_ids + +Usage: + export DATABASE_URL='postgresql://...' + python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N] +""" +import os +import sys +import json +import psycopg2 +import urllib.parse + +DRY_RUN = "--dry-run" in sys.argv +STEP_ONLY = None +for arg in sys.argv: + if arg.startswith("--step"): + idx = sys.argv.index(arg) + if idx + 1 < len(sys.argv): + STEP_ONLY = int(sys.argv[idx + 1]) + +db_url = os.environ['DATABASE_URL'] +parsed = urllib.parse.urlparse(db_url) +conn = psycopg2.connect( + host=parsed.hostname, port=parsed.port or 5432, + user=parsed.username, password=parsed.password, + dbname=parsed.path.lstrip('/'), + options="-c search_path=compliance,public" +) +cur = conn.cursor() + +def should_run(step): + return STEP_ONLY is None or STEP_ONLY == step + + +# ══════════════════════════════════════════════════════════════════ +# Step 1: OSCAL controls — add source_regulation to generation_metadata +# ══════════════════════════════════════════════════════════════════ +if should_run(1): + print("=" * 70) + print("STEP 1: OSCAL controls — source_regulation in generation_metadata") + print("=" * 70) + + cur.execute(""" + SELECT count(*) + FROM compliance.canonical_controls + WHERE generation_strategy = 'oscal_import' + AND (generation_metadata->>'source_regulation' IS NULL + OR generation_metadata->>'source_regulation' = '') + """) + count = cur.fetchone()[0] + print(f" OSCAL controls without source_regulation: {count}") + + if count > 0: + if DRY_RUN: + print(f" [DRY RUN] Would update {count} controls") + else: + cur.execute(""" + UPDATE compliance.canonical_controls + SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) + || '{"source_regulation": "nist_sp800_53r5"}'::jsonb + WHERE generation_strategy = 'oscal_import' + AND (generation_metadata->>'source_regulation' IS NULL + OR generation_metadata->>'source_regulation' = '') + """) + print(f" Updated: {cur.rowcount}") 
+ print() + + +# ══════════════════════════════════════════════════════════════════ +# Step 2: v3 controls with NULL source — tag source as best guess +# ══════════════════════════════════════════════════════════════════ +if should_run(2): + print("=" * 70) + print("STEP 2: Fix v3 controls with NULL source") + print("=" * 70) + + # These 20 controls are v3/document_grouped with no source or regulation. + # Based on title analysis, they cover: + # - Data protection/privacy topics (DSGVO-adjacent) + # - Software security (OWASP/NIST-adjacent) + # - Mobile security (OWASP MASVS-adjacent) + # Mark them as 'needs_review' and add a flag. + cur.execute(""" + SELECT id, control_id, title + FROM compliance.canonical_controls + WHERE source_citation->>'source' IS NULL + AND pipeline_version = 3 + AND release_state NOT IN ('duplicate', 'too_close') + """) + v3_null = cur.fetchall() + print(f" v3 controls with NULL source: {len(v3_null)}") + + if v3_null: + if DRY_RUN: + print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review") + else: + for ctrl_id_uuid, control_id, title in v3_null: + cur.execute(""" + UPDATE compliance.canonical_controls + SET release_state = 'needs_review', + generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) + || '{"missing_source": true}'::jsonb + WHERE id = %s + """, (ctrl_id_uuid,)) + print(f" Marked {len(v3_null)} as needs_review with missing_source flag") + print() + + +# ══════════════════════════════════════════════════════════════════ +# Step 3: Fix empty-string source (DATA-631) +# ══════════════════════════════════════════════════════════════════ +if should_run(3): + print("=" * 70) + print("STEP 3: Fix empty-string source") + print("=" * 70) + + cur.execute(""" + SELECT id, control_id, title, + generation_metadata->>'source_regulation' as reg + FROM compliance.canonical_controls + WHERE source_citation->>'source' = '' + AND release_state NOT IN ('duplicate', 'too_close') + """) + empty_src = cur.fetchall() + print(f" Controls 
with empty source: {len(empty_src)}") + + for ctrl_id_uuid, control_id, title, reg in empty_src: + print(f" {control_id} | reg={reg} | {title[:60]}") + if reg == 'at_tkg': + new_source = 'Telekommunikationsgesetz Oesterreich' + else: + new_source = f"Unbekannt ({reg})" + + if DRY_RUN: + print(f" [DRY RUN] Would set source='{new_source}'") + else: + cur.execute(""" + UPDATE compliance.canonical_controls + SET source_citation = jsonb_set( + source_citation, '{source}', %s::jsonb + ) + WHERE id = %s + """, (json.dumps(new_source), ctrl_id_uuid)) + print(f" Set source='{new_source}'") + print() + + +# ══════════════════════════════════════════════════════════════════ +# Step 4: Fix OWASP cross-source misattributions +# ══════════════════════════════════════════════════════════════════ +if should_run(4): + print("=" * 70) + print("STEP 4: Fix OWASP cross-source misattributions") + print("=" * 70) + + # Controls where source_citation.source doesn't match the regulation_code + OWASP_REG_TO_SOURCE = { + 'owasp_top10_2021': 'OWASP Top 10 (2021)', + 'owasp_asvs': 'OWASP ASVS 4.0', + 'owasp_masvs': 'OWASP MASVS 2.0', + 'owasp_samm': 'OWASP SAMM 2.0', + 'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)', + } + + # Strategy: Move controls to the regulation_code that matches their actual source + # i.e., if a control has source='OWASP ASVS 4.0' but reg='owasp_top10_2021', + # update the reg to 'owasp_asvs' + SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()} + + total_fixed = 0 + for reg_code, expected_source in OWASP_REG_TO_SOURCE.items(): + cur.execute(""" + SELECT id, control_id, source_citation->>'source' as src + FROM compliance.canonical_controls + WHERE generation_metadata->>'source_regulation' = %s + AND source_citation->>'source' <> %s + AND release_state NOT IN ('duplicate', 'too_close') + """, (reg_code, expected_source)) + mismatches = cur.fetchall() + + if mismatches: + print(f"\n {reg_code} → {len(mismatches)} Mismatches:") + for ctrl_id_uuid, 
control_id, actual_source in mismatches: + correct_reg = SOURCE_TO_REG.get(actual_source) + if correct_reg: + print(f" {control_id} | {actual_source} → reg={correct_reg}") + if not DRY_RUN: + cur.execute(""" + UPDATE compliance.canonical_controls + SET generation_metadata = jsonb_set( + generation_metadata, '{source_regulation}', %s::jsonb + ) + WHERE id = %s + """, (json.dumps(correct_reg), ctrl_id_uuid)) + total_fixed += 1 + else: + print(f" {control_id} | {actual_source} → no mapping found") + + if DRY_RUN: + print(f"\n [DRY RUN] Would fix {total_fixed} misattributions") + else: + print(f"\n Fixed: {total_fixed} misattributions") + print() + + +# ══════════════════════════════════════════════════════════════════ +# Step 5: Hard delete duplicate/too_close controls +# ══════════════════════════════════════════════════════════════════ +if should_run(5): + print("=" * 70) + print("STEP 5: Hard delete duplicate/too_close controls") + print("=" * 70) + + # Verify no FK references + for table, col in [ + ('canonical_control_mappings', 'control_id'), + ('obligation_extractions', 'control_uuid'), + ('crosswalk_matrix', 'master_control_uuid'), + ('obligation_candidates', 'parent_control_uuid'), + ]: + cur.execute(f""" + SELECT count(*) + FROM compliance.{table} t + JOIN compliance.canonical_controls cc ON cc.id = t.{col} + WHERE cc.release_state IN ('duplicate', 'too_close') + """) + fk_count = cur.fetchone()[0] + if fk_count > 0: + print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!") + print(f" ABORTING Step 5 — clean FK refs first!") + sys.exit(1) + else: + print(f" {table}.{col}: 0 refs ✓") + + # Check self-references + cur.execute(""" + SELECT count(*) + FROM compliance.canonical_controls child + JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid + WHERE parent.release_state IN ('duplicate', 'too_close') + """) + self_refs = cur.fetchone()[0] + if self_refs > 0: + print(f" WARNING: {self_refs} child controls reference 
dup/too_close parents!") + print(f" ABORTING Step 5!") + sys.exit(1) + print(f" Self-references: 0 ✓") + + cur.execute(""" + SELECT release_state, count(*) + FROM compliance.canonical_controls + WHERE release_state IN ('duplicate', 'too_close') + GROUP BY 1 + """) + to_delete = {} + for state, cnt in cur.fetchall(): + to_delete[state] = cnt + print(f"\n {state}: {cnt}") + + total = sum(to_delete.values()) + print(f"\n TOTAL to delete: {total}") + + if DRY_RUN: + print(f" [DRY RUN] Would delete {total} controls") + else: + cur.execute(""" + DELETE FROM compliance.canonical_controls + WHERE release_state IN ('duplicate', 'too_close') + """) + print(f" Deleted: {cur.rowcount} controls") + print() + + +# ══════════════════════════════════════════════════════════════════ +# Step 6: Clean up canonical_processed_chunks generated_control_ids +# ══════════════════════════════════════════════════════════════════ +if should_run(6): + print("=" * 70) + print("STEP 6: Clean up processed chunks (remove deleted control IDs)") + print("=" * 70) + + if DRY_RUN and should_run(5): + print(" [DRY RUN] Skipping — depends on Step 5 deletion") + else: + # Find chunks that reference non-existent controls + cur.execute(""" + SELECT id, generated_control_ids + FROM compliance.canonical_processed_chunks + WHERE generated_control_ids IS NOT NULL + AND generated_control_ids <> '[]'::jsonb + """) + chunks = cur.fetchall() + print(f" Chunks with generated_control_ids: {len(chunks)}") + + # Get all existing control IDs + cur.execute("SELECT id::text FROM compliance.canonical_controls") + existing_ids = set(r[0] for r in cur.fetchall()) + print(f" Existing controls: {len(existing_ids)}") + + cleaned = 0 + for chunk_id, control_ids in chunks: + if isinstance(control_ids, str): + control_ids = json.loads(control_ids) + if isinstance(control_ids, list): + valid_ids = [cid for cid in control_ids if cid in existing_ids] + if len(valid_ids) < len(control_ids): + removed = len(control_ids) - 
len(valid_ids) + cur.execute(""" + UPDATE compliance.canonical_processed_chunks + SET generated_control_ids = %s::jsonb + WHERE id = %s + """, (json.dumps(valid_ids), chunk_id)) + cleaned += 1 + + print(f" Chunks cleaned: {cleaned}") + print() + + +# ══════════════════════════════════════════════════════════════════ +# Final summary +# ══════════════════════════════════════════════════════════════════ +if not DRY_RUN: + conn.commit() + print("=" * 70) + print("COMMITTED. Final state:") + print("=" * 70) +else: + print("=" * 70) + print("[DRY RUN] No changes committed. Current state:") + print("=" * 70) + +cur.execute(""" + SELECT release_state, count(*) + FROM compliance.canonical_controls + GROUP BY 1 + ORDER BY count(*) DESC +""") +total = 0 +active = 0 +for state, cnt in cur.fetchall(): + total += cnt + if state not in ('duplicate', 'too_close'): + active += cnt + print(f" {state:15s}: {cnt:5d}") + +print(f"\n TOTAL: {total}") +print(f" AKTIV: {active}") + +conn.close() diff --git a/scripts/qa/phase74_generate_gap_controls.py b/scripts/qa/phase74_generate_gap_controls.py new file mode 100644 index 0000000..d83b6e8 --- /dev/null +++ b/scripts/qa/phase74_generate_gap_controls.py @@ -0,0 +1,655 @@ +#!/usr/bin/env python3 +""" +Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet. + +Reads gap_analysis_results.json, extracts article text from PDFs, +calls Claude Sonnet to generate controls, inserts into DB. 
+ +Usage: + python3 phase74_generate_gap_controls.py --dry-run # show what would be generated + python3 phase74_generate_gap_controls.py # generate and insert + python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source + python3 phase74_generate_gap_controls.py --resume # skip already-generated articles +""" +import os +import sys +import json +import re +import time +import hashlib +import argparse +import psycopg2 +import urllib.parse +import requests +from pathlib import Path +from collections import Counter + +sys.path.insert(0, os.path.dirname(__file__)) +from pdf_qa_all import ( + SOURCE_FILE_MAP, read_file, classify_doc, normalize, + build_eu_article_index, build_de_law_index, build_nist_index, + build_owasp_index, build_generic_index, MAX_ARTICLES, +) + +# ── Config ────────────────────────────────────────────────────────── +ANTHROPIC_URL = "https://api.anthropic.com/v1/messages" +ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6") +ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") +PIPELINE_VERSION = 5 +GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json" +PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs")) + +try: + import fitz +except ImportError: + fitz = None + +# ── Source name → regulation_code reverse map ──────────────────────── +# Built from REGULATION_LICENSE_MAP in control_generator.py +SOURCE_TO_REGCODE = { + "DSGVO (EU) 2016/679": "eu_2016_679", + "KI-Verordnung (EU) 2024/1689": "eu_2024_1689", + "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555", + "Cyber Resilience Act (CRA)": "eu_2024_2847", + "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230", + "EU Blue Guide 2022": "eu_blue_guide_2022", + "Markets in Crypto-Assets (MiCA)": "mica", + "Batterieverordnung (EU) 2023/1542": "eu_2023_1542", + "AML-Verordnung": "amlr", + "Data Governance Act (DGA)": "dga", + "Data Act": "data_act", + "GPSR (EU) 2023/988": "gpsr", + "IFRS-Übernahmeverordnung": "ifrs", + "NIST SP 800-53 Rev. 
5": "nist_sp800_53r5", + "NIST SP 800-207 (Zero Trust)": "nist_sp800_207", + "NIST SP 800-63-3": "nist_sp800_63_3", + "NIST AI Risk Management Framework": "nist_ai_rmf", + "NIST SP 800-218 (SSDF)": "nist_sp_800_218", + "NIST Cybersecurity Framework 2.0": "nist_csf_2_0", + "OWASP Top 10 (2021)": "owasp_top10", + "OWASP ASVS 4.0": "owasp_asvs", + "OWASP SAMM 2.0": "owasp_samm", + "OWASP API Security Top 10 (2023)": "owasp_api_top10", + "OWASP MASVS 2.0": "owasp_masvs", + "ENISA ICS/SCADA Dependencies": "enisa_ics_scada", + "ENISA Supply Chain Good Practices": "enisa_supply_chain", + "CISA Secure by Design": "cisa_sbd", + "Bundesdatenschutzgesetz (BDSG)": "bdsg", + "Gewerbeordnung (GewO)": "gewo", + "Handelsgesetzbuch (HGB)": "hgb", + "Abgabenordnung (AO)": "ao", + "OECD KI-Empfehlung": "oecd_ai_principles", +} + +# License info per regulation code (from REGULATION_LICENSE_MAP) +LICENSE_MAP = { + "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"}, + "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"}, + "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"}, + "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, 
"source_type": "standard"}, + "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"}, + "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"}, + "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"}, + "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"}, + "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"}, + "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"}, + "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"}, + "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"}, + "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"}, + "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"}, + "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"}, + "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"}, + "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"}, + "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"}, + "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"}, + "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"}, + "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"}, +} + +# Domain detection keywords +DOMAIN_KEYWORDS = { + "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"], + "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"], + "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"], + "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"], + "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"], + "ACC": ["zugriff", "access", "berechtigung", 
"autorisierung", "authorization", "rolle"], + "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"], + "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"], + "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"], + "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"], + "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"], + "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"], + "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"], +} + +# ── Prompt (same as control_generator.py) ──────────────────────────── + +SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text +als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung. +Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array.""" + +APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist. + Verwende ["all"] wenn der Control branchenuebergreifend gilt. 
+ Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel", + "Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung", + "Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie", + "Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil", + "Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik", + "Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation", + "Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei", + "Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft", + "Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste", + "Abfallwirtschaft", "Forschung" +- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control? + Verwende ["all"] wenn keine Groessenbeschraenkung. + Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise" +- scope_conditions: null wenn keine besonderen Bedingungen, sonst: + {"requires_any": ["signal"], "description": "Erklaerung"} + Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data", + "processes_minors_data", "automated_decisions", "employee_monitoring", + "video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """ + +CATEGORY_LIST = [ + "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen", + "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer", + "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit", + "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management", + "Produktsicherheit", "Marktüberwachung", "Supply Chain Security", + "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht", + "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness", +] +CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST) + + +def build_prompt(source_name, article_label, 
def build_prompt(source_name, article_label, article_text, license_type):
    """Build the user prompt that asks the model to structure an article as a control.

    The German prompt embeds the source name, license type and article label,
    the module-level ``CATEGORY_LIST_STR`` and ``APPLICABILITY_PROMPT``, and
    only the first 3000 characters of ``article_text``.

    Returns the prompt as a single string.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}

Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article from a document's full text.

    Args:
        pdf_file: Passed to ``read_file`` when ``full_text`` is not supplied.
        article_label: Label of the wanted article. For ``"eu_regulation"``
            and ``"de_law"`` only the first run of digits is used; for the
            other types the whole label is matched literally.
        doc_type: ``"eu_regulation"``, ``"de_law"``, ``"nist"``; any other
            value uses the generic line-based lookup.
        full_text: Already extracted text; skips reading the file.

    Returns:
        At most 3000 characters of stripped text starting at the article's
        heading, or ``""`` when the text is empty, the label has no number
        (numbered types), or the heading is not found.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""

    if doc_type in ("eu_regulation", "de_law"):
        num_match = re.search(r'\d+', article_label)
        if not num_match:
            return ""
        num = int(num_match.group())
        if doc_type == "eu_regulation":
            heading = rf'\nArtikel\s+{num}\s*\n'
            next_heading = rf'\nArtikel\s+{num + 1}\s*\n'
        else:
            heading = rf'\n§\s+{num}\b'
            next_heading = rf'\n§\s+{num + 1}\b'

        match = re.search(heading, full_text)
        if not match:
            return ""
        start = match.start()
        # Look for the following heading only AFTER the current one. Searching
        # from the beginning could hit an earlier occurrence (cross-reference,
        # table of contents), giving end < start and an empty result.
        # start + 1 skips just the leading newline of the current heading, so
        # an immediately following heading is still found.
        next_match = re.compile(next_heading).search(full_text, start + 1)
        # Without a following heading, fall back to a 5000-character window.
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]

    if doc_type == "nist":
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()

    # Generic / OWASP / ENISA: start at the beginning of the first line
    # that contains the label.
    escaped = re.escape(article_label)
    match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
    if not match:
        return ""
    start = match.start()
    return full_text[start:start + 3000].strip()
def call_anthropic(prompt, system_prompt):
    """Call the Anthropic messages API.

    Sends ``prompt`` as a single user message with ``system_prompt`` as the
    system text, using the module-level ``ANTHROPIC_URL``, ``ANTHROPIC_MODEL``
    and ``ANTHROPIC_API_KEY``.

    Returns:
        ``(parsed_data, raw_text, usage, error)``. On success ``error`` is
        None and ``parsed_data`` is the result of ``parse_json`` on the first
        content item's text. On a non-200 response or any exception the tuple
        is ``(None, "", {}, <message>)``; this function does not raise.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=120)
        if resp.status_code != 200:
            # Only the first 200 characters of the error body are reported.
            return None, "", {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
        data = resp.json()
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        parsed = parse_json(content)
        return parsed, content, usage, None
    except Exception as e:
        # Deliberate best effort: callers receive the failure as the 4th item.
        return None, "", {}, str(e)


def parse_json(text):
    """Parse JSON from an LLM response, handling markdown fences and prose.

    Steps:
      1. Strip the text; if it starts with a ``` fence, drop the fence line(s).
      2. Parse the remainder as JSON.
      3. If that fails, decode the JSON value that starts at the first ``{``
         and ignore whatever follows it.

    Returns:
        The parsed value; for a JSON array its first element (None for an
        empty array); None when nothing could be parsed.
    """
    text = text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        # Drop the opening fence and, when present, the closing fence.
        text = "\n".join(lines[1:-1] if lines[-1].strip().startswith("```") else lines[1:])
        text = text.strip()

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fallback for JSON wrapped in prose. raw_decode stops at the end of
        # the first object, so trailing text (even text containing braces) or
        # further array elements no longer make the whole parse fail.
        brace = text.find("{")
        if brace < 0:
            return None
        try:
            obj, _end = json.JSONDecoder().raw_decode(text, brace)
        except json.JSONDecodeError:
            return None
        return obj

    if isinstance(data, list):
        return data[0] if data else None
    return data


def detect_domain(text, keywords=None):
    """Pick the domain whose keywords occur most often in ``text``.

    Matching is a case-insensitive substring test; each keyword counts once.
    On a tie the domain listed first in the mapping wins.

    Args:
        text: Text to classify.
        keywords: Optional ``{domain: [keyword, ...]}`` mapping with
            lower-case keywords; defaults to the module-level
            ``DOMAIN_KEYWORDS``.

    Returns:
        The best-scoring domain code, or ``"SEC"`` when no keyword matches.
    """
    if keywords is None:
        keywords = DOMAIN_KEYWORDS
    text_lower = text.lower()
    best_domain, best_score = "SEC", 0
    for domain, domain_keywords in keywords.items():
        score = sum(1 for kw in domain_keywords if kw in text_lower)
        # Strictly greater: the earliest domain keeps a tie.
        if score > best_score:
            best_domain, best_score = domain, score
    return best_domain
def generate_control_id(domain, cur):
    """Generate the next available control_id for a domain prefix.

    Uses MAX(numeric suffix) to find the true highest number, avoiding the
    pitfalls of string-sorted IDs (e.g. COMP-99 > COMP-1000 in text sort).

    Args:
        domain: Domain name; its first four characters, upper-cased, form
            the ID prefix.
        cur: DB-API cursor used to query compliance.canonical_controls.

    Returns:
        ``PREFIX-NNN`` with the number zero-padded to at least three digits,
        so follow-up IDs match the ``PREFIX-001`` format of the first one.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    highest = row[0] if row and row[0] is not None else 0
    # :03d pads to three digits but never truncates larger numbers.
    return f"{prefix}-{int(highest) + 1:03d}"
) + return c, c.cursor() + + conn, cur = connect_db() + + def ensure_db(): + """Reconnect if connection is dead.""" + nonlocal conn, cur + try: + cur.execute("SELECT 1") + except Exception: + print(" [RECONNECT] DB connection lost, reconnecting...") + try: + conn.close() + except Exception: + pass + conn, cur = connect_db() + return True + return False + + # Get framework UUID + cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1") + fw_row = cur.fetchone() + if not fw_row: + print("ERROR: Framework bp_security_v1 not found") + sys.exit(1) + framework_uuid = fw_row[0] + + # If resuming, load existing articles per source + existing_articles = {} + if args.resume: + cur.execute(""" + SELECT source_citation->>'source', source_citation->>'article' + FROM compliance.canonical_controls + WHERE source_citation->>'article' IS NOT NULL + """) + for src, art in cur.fetchall(): + existing_articles.setdefault(src, set()).add(art) + print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs") + + # Stats + stats = Counter() + total_input_tokens = 0 + total_output_tokens = 0 + generated_ids = [] + errors = [] + t_start = time.time() + + # Pre-read PDFs (cache full text per source) + pdf_cache = {} + + for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])): + source_name = gap_source["source"] + gap_articles = gap_source["gap_articles"] + filename = SOURCE_FILE_MAP.get(source_name) + reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown") + license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"}) + doc_type = classify_doc(source_name) + + if not filename: + stats["skipped_no_pdf"] += len(gap_articles) + continue + + # Read PDF once per source + if source_name not in pdf_cache: + pdf_cache[source_name] = read_file(filename) + full_text = pdf_cache[source_name] + if not full_text: + stats["skipped_no_pdf"] += 
len(gap_articles) + continue + + print(f"\n{'='*70}") + print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})") + print(f"{'='*70}") + + for gap in gap_articles: + article_label = gap["label"] + article_type = gap["type"] + + # Skip if already has controls (resume mode) + if args.resume and article_label in existing_articles.get(source_name, set()): + stats["skipped_exists"] += 1 + continue + + # Skip non-substantive NIST sections (intro chapters) + if doc_type == "nist" and article_type == "section": + section_match = re.match(r'Section (\d+)', article_label) + if section_match and int(section_match.group(1)) <= 3: + stats["skipped_intro"] += 1 + continue + + # Extract article text + article_text = extract_article_text(filename, article_label, doc_type, full_text) + if not article_text or len(article_text) < 30: + stats["skipped_short_text"] += 1 + print(f" SKIP {article_label}: text too short ({len(article_text)} chars)") + continue + + if args.dry_run: + print(f" [DRY] {article_label} ({len(article_text)} chars)") + stats["would_generate"] += 1 + continue + + # Call Anthropic + prompt = build_prompt(source_name, article_label, article_text, license_info["license"]) + data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT) + + total_input_tokens += usage.get("input_tokens", 0) + total_output_tokens += usage.get("output_tokens", 0) + + if error: + stats["api_error"] += 1 + errors.append(f"{source_name} {article_label}: {error}") + print(f" ERROR {article_label}: {error}") + time.sleep(5) + continue + + if not data: + stats["parse_error"] += 1 + print(f" PARSE ERROR {article_label}") + continue + + # Ensure DB is alive before writing + ensure_db() + + # Build control + title = str(data.get("title", ""))[:200] + objective = str(data.get("objective", "")) + rationale = str(data.get("rationale", "")) + domain = str(data.get("domain", detect_domain(article_text))).upper()[:4] + if not domain or len(domain) < 2: + domain = 
detect_domain(article_text) + + control_id = generate_control_id(domain, cur) + severity = str(data.get("severity", "medium")).lower() + if severity not in ("low", "medium", "high", "critical"): + severity = "medium" + + requirements = data.get("requirements", []) + if not isinstance(requirements, list): + requirements = [str(requirements)] + test_procedure = data.get("test_procedure", []) + if not isinstance(test_procedure, list): + test_procedure = [str(test_procedure)] + evidence = data.get("evidence", []) + if not isinstance(evidence, list): + evidence = [str(evidence)] + tags = data.get("tags", []) + if not isinstance(tags, list): + tags = [] + target_audience = data.get("target_audience", []) + if not isinstance(target_audience, list): + target_audience = [] + applicable_industries = data.get("applicable_industries", ["all"]) + if not isinstance(applicable_industries, list): + applicable_industries = ["all"] + applicable_company_size = data.get("applicable_company_size", ["all"]) + if not isinstance(applicable_company_size, list): + applicable_company_size = ["all"] + scope_conditions = data.get("scope_conditions") + + source_citation = { + "source": source_name, + "article": data.get("source_article", article_label), + "paragraph": data.get("source_paragraph", ""), + "article_type": article_type, + "license": license_info["license"], + "source_type": license_info["source_type"], + } + + generation_metadata = { + "processing_path": "phase74_gap_fill", + "license_rule": license_info["rule"], + "source_regulation": reg_code, + "source_article": article_label, + "gap_fill": True, + } + + category = str(data.get("category", "")) or None + + # Insert into DB + try: + cur.execute(""" + INSERT INTO compliance.canonical_controls ( + framework_id, control_id, title, objective, rationale, + scope, requirements, test_procedure, evidence, + severity, risk_score, implementation_effort, + open_anchors, release_state, tags, + license_rule, source_original_text, 
source_citation, + customer_visible, generation_metadata, + verification_method, category, generation_strategy, + target_audience, pipeline_version, + applicable_industries, applicable_company_size, scope_conditions + ) VALUES ( + %s, %s, %s, %s, %s, + %s, %s, %s, %s, + %s, %s, %s, + %s, %s, %s, + %s, %s, %s, + %s, %s, + %s, %s, %s, + %s, %s, + %s, %s, %s + ) + ON CONFLICT (framework_id, control_id) DO NOTHING + RETURNING id + """, ( + framework_uuid, control_id, title, objective, rationale, + json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence), + severity, 5, "m", + json.dumps([]), "draft", json.dumps(tags), + license_info["rule"], article_text, json.dumps(source_citation), + True, json.dumps(generation_metadata), + "document", category, "phase74_gap_fill", + json.dumps(target_audience), PIPELINE_VERSION, + json.dumps(applicable_industries), json.dumps(applicable_company_size), + json.dumps(scope_conditions) if scope_conditions else None, + )) + conn.commit() + row = cur.fetchone() + if row: + generated_ids.append(str(row[0])) + stats["generated"] += 1 + print(f" OK {control_id}: {title[:60]}") + else: + stats["conflict"] += 1 + print(f" CONFLICT {control_id} (already exists)") + except Exception as e: + conn.rollback() + stats["db_error"] += 1 + errors.append(f"DB {control_id}: {str(e)[:100]}") + print(f" DB ERROR {control_id}: {str(e)[:100]}") + + # Rate limit: ~0.5s between calls + time.sleep(0.5) + + # ── Summary ────────────────────────────────────────────────────── + elapsed = time.time() - t_start + cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000 + + print(f"\n\n{'='*70}") + print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}") + print(f"{'='*70}") + print(f" Laufzeit: {elapsed/60:.1f} min") + print(f" API-Kosten: ${cost:.2f}") + print(f" Input Tokens: {total_input_tokens:,}") + print(f" Output Tokens: {total_output_tokens:,}") + print() + for key in sorted(stats.keys()): + print(f" 
{key:<25s}: {stats[key]:5d}") + print() + + if generated_ids: + print(f" Neue Control-IDs: {len(generated_ids)}") + # Save generated IDs + with open("/tmp/phase74_generated_ids.json", 'w') as f: + json.dump(generated_ids, f) + print(f" IDs gespeichert: /tmp/phase74_generated_ids.json") + + if errors: + print(f"\n Fehler ({len(errors)}):") + for e in errors[:20]: + print(f" {e}") + if len(errors) > 20: + print(f" ... und {len(errors)-20} weitere") + + conn.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/qa/run_job.sh b/scripts/qa/run_job.sh new file mode 100755 index 0000000..4b5ea41 --- /dev/null +++ b/scripts/qa/run_job.sh @@ -0,0 +1,218 @@ +#!/usr/bin/env bash +# ───────────────────────────────────────────────────────────── +# Robust job runner for QA scripts on Mac Mini +# +# Usage: +# ./run_job.sh [args...] # start job +# ./run_job.sh --status # show running jobs +# ./run_job.sh --kill # kill a running job +# ./run_job.sh --log # tail log +# +# Features: +# - Loads .env automatically (COMPLIANCE_DATABASE_URL → DATABASE_URL) +# - PID-file prevents duplicate runs +# - Unbuffered Python output +# - Structured log files in /tmp/qa_jobs/ +# ───────────────────────────────────────────────────────────── +set -euo pipefail + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." 
# ── Status ────────────────────────────────────────────────────
# Print one entry per PID file in $JOB_DIR. Jobs whose process is gone are
# reported as STOPPED and their stale PID file is removed.
show_status() {
    echo "═══════════════════════════════════════════════════════"
    echo "QA Job Status ($(date '+%Y-%m-%d %H:%M:%S'))"
    echo "═══════════════════════════════════════════════════════"
    local found=0
    local pidfile name pid logf lines errors last_line
    for pidfile in "$JOB_DIR"/*.pid; do
        # With no match the glob stays literal; skip it.
        [[ -f "$pidfile" ]] || continue
        found=1
        name=$(basename "$pidfile" .pid)
        pid=$(cat "$pidfile")
        logf="$JOB_DIR/$name.log"

        if kill -0 "$pid" 2>/dev/null; then
            lines=0
            errors=0
            last_line=""
            if [[ -f "$logf" ]]; then
                # BSD/macOS wc pads its output with spaces; strip them.
                lines=$(wc -l < "$logf" | tr -d '[:space:]')
                # grep -c already prints 0 on "no match" but exits 1, so
                # "|| echo 0" would produce "0<newline>0". Only neutralise
                # the exit status here.
                errors=$(grep -c "ERROR" "$logf" 2>/dev/null || true)
                last_line=$(tail -1 "$logf" 2>/dev/null || true)
            fi
            echo " ● $name (PID $pid) — RUNNING"
            echo " Log: $logf (${lines:-0} lines, ${errors:-0} errors)"
            echo " Last: ${last_line:-(empty)}"
        else
            echo " ○ $name (PID $pid) — STOPPED"
            echo " Log: $logf"
            rm -f "$pidfile"
        fi
        echo ""
    done
    if [[ $found -eq 0 ]]; then
        echo " No jobs running."
    fi
}
# ── Start job ─────────────────────────────────────────────────
# start_job <script> [args...]
# Launch <script> with python3 in the background (nohup, unbuffered), write
# its PID file and log under $JOB_DIR, and verify it survives the first
# three seconds. Returns 1 if the script is missing, already running,
# DATABASE_URL is unset, or the process dies immediately.
start_job() {
    local script="$1"
    shift
    local args=("$@")

    # Resolve script path: as given, else relative to this script's directory
    local script_path="$script"
    if [[ ! -f "$script_path" ]]; then
        script_path="$SCRIPT_DIR/$script"
    fi
    if [[ ! -f "$script_path" ]]; then
        echo "ERROR: Script not found: $script"
        return 1
    fi

    local name pf lf
    name=$(job_name "$script")
    pf=$(pid_file "$script")
    lf=$(log_file "$script")

    # Check for already-running instance; a stale PID file is removed
    if [[ -f "$pf" ]]; then
        local existing_pid
        existing_pid=$(cat "$pf")
        if kill -0 "$existing_pid" 2>/dev/null; then
            echo "ERROR: $name already running (PID $existing_pid)"
            echo "Use: $0 --kill $script"
            return 1
        fi
        rm -f "$pf"
    fi

    # Load environment
    load_env

    # Verify required env vars
    if [[ -z "${DATABASE_URL:-}" ]]; then
        echo "ERROR: DATABASE_URL not set (checked .env)"
        return 1
    fi

    # Start
    echo "Starting $name..."
    echo " Script: $script_path"
    echo " Args: ${args[*]:-none}"
    echo " Log: $lf"

    # ${args[@]+...} expands to nothing for an empty array. A bare
    # "${args[@]}" aborts with "unbound variable" under `set -u` on
    # bash < 4.4 (the stock macOS bash) when the job has no arguments.
    nohup python3 -u "$script_path" ${args[@]+"${args[@]}"} > "$lf" 2>&1 &
    local pid=$!
    echo "$pid" > "$pf"

    echo " PID: $pid"
    echo ""

    # Wait a moment and check it started OK
    sleep 3
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "ERROR: Process died immediately. Log output:"
        cat "$lf"
        rm -f "$pf"
        return 1
    fi

    local lines
    lines=$(wc -l < "$lf" 2>/dev/null || echo 0)
    echo "Running OK ($lines log lines so far)"
    echo "Monitor with: $0 --status"
    echo "Tail log: $0 --log $script"
}
# ── DB Config ────────────────────────────────────────────────────────

# SECURITY: the production URL (including its password) must come from the
# environment only. It used to be hard-coded here as a default; a credential
# that has been committed must be treated as leaked and rotated.
PROD_URL = os.environ.get("PROD_DATABASE_URL", "")
LOCAL_URL = os.environ.get(
    "LOCAL_DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
)

SCHEMA = "compliance"

# Tables to sync (production → local)
SYNC_TABLES = [
    "canonical_control_frameworks",
    "canonical_control_licenses",
    "canonical_control_sources",
    "canonical_control_categories",
    "canonical_blocked_sources",
    "canonical_controls",
    "canonical_control_mappings",
    "canonical_processed_chunks",
    "canonical_generation_jobs",
    "control_patterns",
    "crosswalk_matrix",
    "obligation_extractions",
    "obligation_candidates",
]


def connect(url, label="DB"):
    """Open a psycopg2 connection for a ``postgresql://`` URL.

    The connection uses TCP keepalives, ``search_path=<SCHEMA>,public`` and
    explicit transactions (autocommit off). ``sslmode`` is taken from the
    URL query string and defaults to "prefer".

    Args:
        url: Connection URL.
        label: Human-readable name used in log and error messages.

    Raises:
        ValueError: If ``url`` is empty. Failing here prevents psycopg2 from
            silently falling back to a default local connection when e.g.
            PROD_DATABASE_URL is not set.
    """
    if not url or not str(url).strip():
        raise ValueError(
            f"No connection URL configured for {label} "
            "(set PROD_DATABASE_URL / LOCAL_DATABASE_URL)"
        )
    parsed = urllib.parse.urlparse(url)
    params = dict(urllib.parse.parse_qsl(parsed.query))
    conn = psycopg2.connect(
        host=parsed.hostname,
        port=parsed.port or 5432,
        user=parsed.username,
        password=parsed.password,
        dbname=parsed.path.lstrip("/"),
        sslmode=params.get("sslmode", "prefer"),
        options=f"-c search_path={SCHEMA},public",
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5,
    )
    conn.autocommit = False
    print(f" Connected to {label} ({parsed.hostname}:{parsed.port or 5432})")
    return conn


def get_columns(cur, table):
    """Return the column names of ``SCHEMA.table`` in ordinal order."""
    # Bound parameters instead of string interpolation: the table name can
    # originate from the command line.
    cur.execute(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    return [r[0] for r in cur.fetchall()]
def push():
    """Incremental push: new obligation_candidates local → production.

    Rows whose ``candidate_id`` exists locally but not on production are
    inserted on production (``ON CONFLICT DO NOTHING``).

    Returns:
        Number of rows sent to production (0 when there is nothing to push).

    Both connections are always closed, including on errors, so repeated
    calls from ``loop`` do not accumulate open connections.
    """
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")

    local_conn = None
    prod_conn = None
    try:
        local_conn = connect(LOCAL_URL, "Local")
        prod_conn = connect(PROD_URL, "Production")
        local_cur = local_conn.cursor()
        prod_cur = prod_conn.cursor()

        # candidate_id is the unique key used to diff both sides.
        id_query = f"SELECT candidate_id FROM {SCHEMA}.obligation_candidates"
        local_cur.execute(id_query)
        local_ids = {r[0] for r in local_cur.fetchall()}
        if not local_ids:
            print(" No obligation_candidates in local DB")
            return 0

        prod_cur.execute(id_query)
        prod_ids = {r[0] for r in prod_cur.fetchall()}

        new_ids = local_ids - prod_ids
        if not new_ids:
            print(f" All {len(local_ids)} obligations already on production")
            return 0

        print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")

        columns = get_columns(local_cur, "obligation_candidates")
        # Quote identifiers, consistent with pull_table.
        col_list = ", ".join(f'"{c}"' for c in columns)
        placeholders = ", ".join(["%s"] * len(columns))

        # JSON/JSONB columns: psycopg2 would adapt a Python list to a SQL
        # ARRAY, so wrap such values explicitly (same as pull_table does).
        local_cur.execute(
            """
            SELECT column_name FROM information_schema.columns
            WHERE table_schema = %s AND table_name = %s
              AND udt_name IN ('json', 'jsonb')
            """,
            (SCHEMA, "obligation_candidates"),
        )
        json_cols = {r[0] for r in local_cur.fetchall()}

        # Pass the ids as one bound array parameter instead of splicing
        # quoted literals into the SQL text. Comparing as text keeps this
        # independent of the key column's type.
        local_cur.execute(
            f"SELECT {col_list} FROM {SCHEMA}.obligation_candidates "
            "WHERE candidate_id::text = ANY(%s)",
            ([str(i) for i in new_ids],),
        )
        rows = [
            tuple(
                psycopg2.extras.Json(val)
                if columns[idx] in json_cols and isinstance(val, (dict, list))
                else val
                for idx, val in enumerate(row)
            )
            for row in local_cur.fetchall()
        ]

        insert_sql = (
            f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) "
            f"VALUES ({placeholders}) ON CONFLICT DO NOTHING"
        )
        psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
        prod_conn.commit()

        print(f" Pushed {len(rows)} obligations to production")
        return len(rows)
    finally:
        # Closing without commit discards any half-finished transaction.
        for conn in (local_conn, prod_conn):
            if conn is not None:
                conn.close()
time.sleep(interval_min * 60) + + +def main(): + parser = argparse.ArgumentParser(description="Sync canonical control tables") + parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)") + parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)") + parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes") + parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)") + args = parser.parse_args() + + if not any([args.pull, args.push, args.loop]): + parser.print_help() + return + + if args.pull: + pull(args.tables) + + if args.push: + push() + + if args.loop: + loop(args.loop) + + +if __name__ == "__main__": + main() diff --git a/scripts/qa/test_pass0a.py b/scripts/qa/test_pass0a.py new file mode 100644 index 0000000..54df95c --- /dev/null +++ b/scripts/qa/test_pass0a.py @@ -0,0 +1,470 @@ +#!/usr/bin/env python3 +"""Test Pass 0a (Obligation Extraction) on 5-10 controls. + +Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests. +Copies prompts and quality gate from decomposition_pass.py. + +Usage: + python3 test_pass0a.py # 10 controls, Anthropic + python3 test_pass0a.py --limit 5 # 5 controls + python3 test_pass0a.py --source "DSGVO" # filter by source + python3 test_pass0a.py --dry-run # show controls, no LLM call +""" +import argparse +import json +import os +import re +import sys +import time +import urllib.parse + +import psycopg2 +import requests + +# ── Config ──────────────────────────────────────────────────────────── +ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") +ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6") +ANTHROPIC_API_URL = "https://api.anthropic.com/v1" + +# ── Prompts (from decomposition_pass.py) ────────────────────────────── + +SYSTEM_PROMPT = """\ +Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \ +in einzelne atomare Pflichten. 
+ +REGELN (STRIKT EINHALTEN): +1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \ +sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \ +ist zu testen, shall, must, required. +2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung. +3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true). +4. Meldepflichten SEPARAT (is_reporting_obligation=true). +5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \ +eigenes Control, sondern Evidence). +6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \ +— NICHT extrahieren. + +Antworte NUR mit einem JSON-Array. Keine Erklärungen.""" + + +def build_prompt(title, objective, requirements, test_procedure, source_ref): + return f"""\ +Analysiere das folgende Control und extrahiere alle einzelnen normativen \ +Pflichten als JSON-Array. + +CONTROL: +Titel: {title} +Ziel: {objective} +Anforderungen: {requirements} +Prüfverfahren: {test_procedure} +Quellreferenz: {source_ref} + +Antworte als JSON-Array: +[ + {{ + "obligation_text": "Kurze, präzise Formulierung der Pflicht", + "action": "Hauptverb/Handlung", + "object": "Gegenstand der Pflicht", + "condition": "Auslöser/Bedingung oder null", + "normative_strength": "must", + "is_test_obligation": false, + "is_reporting_obligation": false + }} +]""" + + +# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ── + +# Tier 1: Pflicht (mandatory) +_PFLICHT_RE = re.compile( + r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b" + r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b" + r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b" + r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b" + r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b" + r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b" + r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b" + r"|\bshall\b|\bmust\b|\brequired\b" + r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b" 
+ r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b" + r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b" + r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b", + re.IGNORECASE, +) +# Tier 2: Empfehlung (recommendation) +_EMPFEHLUNG_RE = re.compile( + r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b" + r"|\bgewährleisten\b|\bsicherstellen\b" + r"|\bshould\b|\bensure\b|\brecommend\w*\b" + r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b" + r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b" + r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b", + re.IGNORECASE, +) +# Tier 3: Kann (optional/permissive) +_KANN_RE = re.compile( + r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b", + re.IGNORECASE, +) +# Union (backward compat) +_NORMATIVE_RE = re.compile( + _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern, + re.IGNORECASE, +) +_RATIONALE_RE = re.compile( + r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b", + re.IGNORECASE, +) +_TEST_RE = re.compile( + r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b" + r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif", + re.IGNORECASE, +) +_REPORTING_RE = re.compile( + r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht" + r"|\bnotif|\breport\b|\bbehörd", + re.IGNORECASE, +) + + +def classify_obligation_type(txt): + """Classify: pflicht > empfehlung > kann > empfehlung (default).""" + if _PFLICHT_RE.search(txt): + return "pflicht" + if _EMPFEHLUNG_RE.search(txt): + return "empfehlung" + if _KANN_RE.search(txt): + return "kann" + return "empfehlung" + + +def quality_gate(obl_text, parent_uuid): + """Validate + classify obligation. Returns (flags_dict, passed_bool, confidence, obligation_type).""" + flags = {} + + # 1. Normative signal (informational) + flags["has_normative_signal"] = bool(_NORMATIVE_RE.search(obl_text)) + + # 1b. 
Obligation type classification + obl_type = classify_obligation_type(obl_text) + flags["obligation_type"] = obl_type + + # 2. Single action + multi_verb_re = re.compile( + r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren" + r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b", + re.IGNORECASE, + ) + flags["single_action"] = not bool(multi_verb_re.search(obl_text)) + + # 3. Not rationale + normative_count = len(_NORMATIVE_RE.findall(obl_text)) + rationale_count = len(_RATIONALE_RE.findall(obl_text)) + flags["not_rationale"] = normative_count >= rationale_count + + # 4. Not evidence-only + evidence_only_re = re.compile( + r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)", + re.IGNORECASE, + ) + flags["not_evidence_only"] = not bool(evidence_only_re.match(obl_text.strip())) + + # 5. Min length + flags["min_length"] = len(obl_text.strip()) >= 20 + + # 6. Parent link + flags["has_parent_link"] = bool(parent_uuid) + + # Confidence + weights = { + "has_normative_signal": 0.25, "single_action": 0.20, + "not_rationale": 0.20, "not_evidence_only": 0.15, + "min_length": 0.10, "has_parent_link": 0.05, + } + # Bonus for pflicht classification + confidence = sum(weights[k] for k, v in flags.items() if v and k in weights) + if obl_type == "pflicht": + confidence = min(confidence + 0.05, 1.0) + + # Pass check — has_normative_signal is NO LONGER critical + critical = ["not_evidence_only", "min_length", "has_parent_link"] + passed = all(flags.get(k, False) for k in critical) + + return flags, passed, confidence, obl_type + + +# ── JSON parsing ────────────────────────────────────────────────────── + +def parse_json_array(text): + try: + result = json.loads(text) + if isinstance(result, list): + return result + if isinstance(result, dict): + return [result] + except json.JSONDecodeError: + pass + match = re.search(r"\[[\s\S]*\]", text) + if match: + try: + result = json.loads(match.group()) + if isinstance(result, list): + return 
result + except json.JSONDecodeError: + pass + return [] + + +# ── API call ────────────────────────────────────────────────────────── + +def call_anthropic(prompt): + headers = { + "x-api-key": ANTHROPIC_API_KEY, + "anthropic-version": "2023-06-01", + "content-type": "application/json", + } + payload = { + "model": ANTHROPIC_MODEL, + "max_tokens": 8192, + "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}], + "messages": [{"role": "user", "content": prompt}], + } + resp = requests.post(f"{ANTHROPIC_API_URL}/messages", headers=headers, json=payload, timeout=120) + if resp.status_code != 200: + return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}" + data = resp.json() + usage = data.get("usage", {}) + content = data.get("content", []) + text = content[0].get("text", "") if content else "" + return text, usage, None + + +# ── Format helpers ──────────────────────────────────────────────────── + +def fmt_json(val): + if val is None: + return "" + if isinstance(val, str): + try: + val = json.loads(val) + except (json.JSONDecodeError, TypeError): + return val + if isinstance(val, list): + return "\n".join(f" - {item}" for item in val) + return str(val) + + +# ── Main ────────────────────────────────────────────────────────────── + +def main(): + parser = argparse.ArgumentParser(description="Test Pass 0a on small sample") + parser.add_argument("--limit", type=int, default=10) + parser.add_argument("--source", type=str) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + if not ANTHROPIC_API_KEY and not args.dry_run: + print("ERROR: Set ANTHROPIC_API_KEY") + sys.exit(1) + + db_url = os.environ["DATABASE_URL"] + p = urllib.parse.urlparse(db_url) + conn = psycopg2.connect( + host=p.hostname, port=p.port or 5432, + user=p.username, password=p.password, + dbname=p.path.lstrip("/"), + options="-c search_path=compliance,public", + ) + cur = conn.cursor() + + # Select diverse sample + query = 
""" + SELECT id, control_id, title, objective, requirements, + test_procedure, source_citation, category + FROM compliance.canonical_controls + WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close') + AND parent_control_uuid IS NULL + AND title IS NOT NULL AND objective IS NOT NULL + AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100 + """ + params = [] + if args.source: + query += " AND source_citation->>'source' ILIKE %s" + params.append(f"%{args.source}%") + + query += " ORDER BY source_citation->>'source', random()" + query += f" LIMIT {args.limit}" + + cur.execute(query, params) + controls = cur.fetchall() + + if not controls: + print("No controls found.") + return + + print(f"{'='*70}") + print(f"Pass 0a Test — {len(controls)} Controls") + print(f"Model: {ANTHROPIC_MODEL}") + print(f"{'='*70}") + + total_in = total_out = total_obls = 0 + type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0} + total_rejected = 0 # only evidence-only / too-short / no-parent + all_results = [] + t_start = time.time() + + for i, row in enumerate(controls, 1): + ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row + + req_str = fmt_json(reqs) + test_str = fmt_json(test_proc) + source_str = "" + if src_cit: + sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit) + source_str = f"{sc.get('source', '')} {sc.get('article', '')}" + + print(f"\n{'─'*70}") + print(f"[{i}/{len(controls)}] {ctrl_id}: {title}") + print(f" Source: {source_str} | Category: {category or 'N/A'}") + print(f" Objective: {(objective or '')[:200]}") + + if args.dry_run: + print(" [DRY RUN]") + continue + + prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str) + + t0 = time.time() + response_text, usage, error = call_anthropic(prompt) + elapsed = time.time() - t0 + + if error: + print(f" ERROR: {error}") + continue + + in_tok = usage.get("input_tokens", 0) + out_tok = usage.get("output_tokens", 0) + cached = 
usage.get("cache_read_input_tokens", 0) + total_in += in_tok + total_out += out_tok + + obligations = parse_json_array(response_text) + total_obls += len(obligations) + + print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out" + f"{f' ({cached} cached)' if cached else ''}" + f" | {len(obligations)} obligation(s)") + + for j, obl in enumerate(obligations, 1): + obl_text = obl.get("obligation_text", "") + action = obl.get("action", "") + obj = obl.get("object", "") + condition = obl.get("condition") + strength = obl.get("normative_strength", "must") + is_test = bool(obl.get("is_test_obligation", False)) + is_report = bool(obl.get("is_reporting_obligation", False)) + + # Auto-detect + if not is_test and _TEST_RE.search(obl_text): + is_test = True + if not is_report and _REPORTING_RE.search(obl_text): + is_report = True + + flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid)) + if passed: + type_counts[obl_type] = type_counts.get(obl_type, 0) + 1 + else: + total_rejected += 1 + + tag = "" + if is_test: + tag = " [TEST]" + elif is_report: + tag = " [MELDEPFLICHT]" + + # Show type instead of PASS/REJECT + type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"} + if not passed: + status = "REJECT" + else: + status = type_label.get(obl_type, "EMPFEHLUNG") + + failed = [k for k, v in flags.items() + if isinstance(v, bool) and not v] + + print(f"\n {j}. 
[{status}] conf={conf:.0%}{tag} strength={strength}") + print(f" {obl_text}") + print(f" Handlung: {action} | Gegenstand: {obj}") + if condition: + print(f" Bedingung: {condition}") + if not passed: + print(f" Abgelehnt: {', '.join(failed)}") + + all_results.append({ + "control_id": ctrl_id, + "obligation_text": obl_text, + "obligation_type": obl_type if passed else "rejected", + "action": action, + "object": obj, + "condition": condition, + "confidence": round(conf, 2), + "is_test": is_test, + "is_reporting": is_report, + "passed": passed, + "flags": {k: v for k, v in flags.items()}, + }) + + time.sleep(0.5) + + # ── Summary ────────────────────────────────────────────────────── + elapsed_total = time.time() - t_start + cost = (total_in * 3 + total_out * 15) / 1_000_000 + total_classified = sum(type_counts.values()) + + print(f"\n\n{'='*70}") + print(f"ZUSAMMENFASSUNG — 3-Tier-Klassifizierung") + print(f"{'='*70}") + print(f" Controls: {len(controls)}") + print(f" Obligations: {total_obls} ({total_obls/max(len(controls),1):.1f} pro Control)") + print(f" ── Klassifizierung ──") + print(f" Pflicht: {type_counts['pflicht']}" + f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)") + print(f" Empfehlung: {type_counts['empfehlung']}" + f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)") + print(f" Kann: {type_counts['kann']}" + f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)") + print(f" Rejected: {total_rejected}" + f" ({total_rejected*100/max(total_obls,1):.0f}%)" + f" (nur evidence-only/zu kurz/kein parent)") + print(f" ── Kosten ──") + print(f" Laufzeit: {elapsed_total:.1f}s") + print(f" Tokens: {total_in:,} in / {total_out:,} out") + print(f" Kosten: ${cost:.4f}") + + if len(controls) > 0 and not args.dry_run and total_obls > 0: + n = 6000 + factor = n / len(controls) + print(f"\n --- Hochrechnung auf {n:,} Controls ---") + print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out") + print(f" Kosten: ${cost * 
factor:.2f}") + print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h") + print(f" Obligations: ~{int(total_obls / len(controls) * n):,}") + pf = int(type_counts['pflicht'] * factor) + ef = int(type_counts['empfehlung'] * factor) + kf = int(type_counts['kann'] * factor) + print(f" Pflicht: ~{pf:,}") + print(f" Empfehlung: ~{ef:,}") + print(f" Kann: ~{kf:,}") + + # Save results JSON for later analysis + if all_results: + out_path = f"/tmp/pass0a_results_{len(controls)}controls.json" + with open(out_path, "w") as f: + json.dump(all_results, f, ensure_ascii=False, indent=2) + print(f"\n Ergebnisse gespeichert: {out_path}") + + conn.close() + + +if __name__ == "__main__": + main() diff --git a/scripts/qa/test_pass0b_preview.py b/scripts/qa/test_pass0b_preview.py new file mode 100644 index 0000000..7b4a6af --- /dev/null +++ b/scripts/qa/test_pass0b_preview.py @@ -0,0 +1,308 @@ +#!/usr/bin/env python3 +"""Preview Pass 0b: Turn obligation candidates into atomic controls. + +Picks a few obligations from Pass 0a results, calls LLM to compose +atomic controls, and writes them to canonical_controls with parent_control_uuid. + +Usage: + python3 test_pass0b_preview.py --input /tmp/pass0a_results_60controls.json --limit 3 +""" +import argparse +import json +import os +import re +import sys +import time +import uuid +import urllib.parse + +import psycopg2 +import psycopg2.extras +import requests + +# Register JSON adapter +psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json) + +ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "") +ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6") + +SYSTEM_PROMPT = """\ +Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \ +normativen Pflicht ein praxisorientiertes, atomares Security Control. + +Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase. +Antworte NUR als JSON. 
Keine Erklärungen.""" + + +def build_pass0b_prompt(obl_text, action, obj, parent_title, category, source_ref): + return f"""\ +Erstelle aus der folgenden Pflicht ein atomares Control. + +PFLICHT: {obl_text} +HANDLUNG: {action} +GEGENSTAND: {obj} + +KONTEXT (Ursprungs-Control): +Titel: {parent_title} +Kategorie: {category} +Quellreferenz: {source_ref} + +Antworte als JSON: +{{ + "title": "Kurzer Titel (max 80 Zeichen, deutsch)", + "objective": "Was muss erreicht werden? (1-2 Sätze)", + "requirements": ["Konkrete Anforderung 1", "Anforderung 2"], + "test_procedure": ["Prüfschritt 1", "Prüfschritt 2"], + "evidence": ["Nachweis 1", "Nachweis 2"], + "severity": "critical|high|medium|low", + "category": "security|privacy|governance|operations|finance|reporting" +}}""" + + +def call_anthropic(prompt): + headers = { + "x-api-key": ANTHROPIC_API_KEY, + "anthropic-version": "2023-06-01", + "content-type": "application/json", + } + payload = { + "model": ANTHROPIC_MODEL, + "max_tokens": 4096, + "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}], + "messages": [{"role": "user", "content": prompt}], + } + resp = requests.post("https://api.anthropic.com/v1/messages", headers=headers, json=payload, timeout=120) + if resp.status_code != 200: + return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}" + data = resp.json() + text = data.get("content", [{}])[0].get("text", "") + return text, data.get("usage", {}), None + + +def parse_json_object(text): + try: + return json.loads(text) + except json.JSONDecodeError: + match = re.search(r"\{[\s\S]*\}", text) + if match: + try: + return json.loads(match.group()) + except json.JSONDecodeError: + pass + return None + + +def generate_control_id(domain, cur): + prefix = domain.upper()[:4] + cur.execute(""" + SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER)) + FROM compliance.canonical_controls + WHERE control_id LIKE %s + AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$' + """, 
(f"{prefix}-%",)) + row = cur.fetchone() + if row and row[0] is not None: + return f"{prefix}-{row[0] + 1}" + return f"{prefix}-001" + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--input", default="/tmp/pass0a_results_60controls.json") + parser.add_argument("--limit", type=int, default=3, help="Number of obligations to process") + parser.add_argument("--control", type=str, help="Pick obligations from this control_id") + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + if not ANTHROPIC_API_KEY and not args.dry_run: + print("ERROR: Set ANTHROPIC_API_KEY") + sys.exit(1) + + # Load 0a results + with open(args.input) as f: + obligations = json.load(f) + + # Filter: only passed, pflicht or empfehlung + obligations = [o for o in obligations if o.get("passed", False)] + + if args.control: + obligations = [o for o in obligations if o["control_id"] == args.control] + + # Pick diverse sample + picked = [] + seen_types = set() + for o in obligations: + otype = o["obligation_type"] + if otype not in seen_types and len(picked) < args.limit: + picked.append(o) + seen_types.add(otype) + # Fill rest + for o in obligations: + if o not in picked and len(picked) < args.limit: + picked.append(o) + + if not picked: + print("No obligations found.") + return + + # Connect to DB + db_url = os.environ["DATABASE_URL"] + p = urllib.parse.urlparse(db_url) + conn = psycopg2.connect( + host=p.hostname, port=p.port or 5432, + user=p.username, password=p.password, + dbname=p.path.lstrip("/"), + options="-c search_path=compliance,public", + ) + cur = conn.cursor() + + # Get parent control info + ctrl_ids = list(set(o["control_id"] for o in picked)) + cur.execute(""" + SELECT control_id, id, title, category, source_citation + FROM compliance.canonical_controls + WHERE control_id = ANY(%s) + """, (ctrl_ids,)) + ctrl_map = {} + for row in cur.fetchall(): + sc = row[4] if isinstance(row[4], dict) else (json.loads(row[4]) if row[4] 
else {}) + # Derive domain prefix from control_id (e.g. "DSGV" from "DSGV-001") + prefix = row[0].split("-")[0] if "-" in row[0] else "COMP" + ctrl_map[row[0]] = { + "uuid": str(row[1]), "title": row[2], "category": row[3] or "", + "source_ref": f"{sc.get('source', '')} {sc.get('article', '')}", + "domain": prefix, + } + + print("=" * 70) + print(f"Pass 0b Preview — {len(picked)} Obligations → Atomic Controls") + print("=" * 70) + + created = [] + for i, obl in enumerate(picked, 1): + ctrl = ctrl_map.get(obl["control_id"], {}) + print(f"\n{'─'*70}") + print(f"[{i}/{len(picked)}] {obl['control_id']}: [{obl['obligation_type'].upper()}]") + print(f" Obligation: {obl['obligation_text'][:120]}") + print(f" Parent: {ctrl.get('title', 'N/A')}") + + if args.dry_run: + print(" [DRY RUN]") + continue + + prompt = build_pass0b_prompt( + obl["obligation_text"], obl["action"], obl["object"], + ctrl.get("title", ""), ctrl.get("category", ""), + ctrl.get("source_ref", ""), + ) + + t0 = time.time() + resp_text, usage, error = call_anthropic(prompt) + elapsed = time.time() - t0 + + if error: + print(f" ERROR: {error}") + continue + + result = parse_json_object(resp_text) + if not result: + print(f" PARSE ERROR: {resp_text[:200]}") + continue + + in_tok = usage.get("input_tokens", 0) + out_tok = usage.get("output_tokens", 0) + print(f" LLM: {elapsed:.1f}s | {in_tok} in / {out_tok} out") + + # Generate control_id + domain = ctrl.get("domain", "COMP") + new_control_id = generate_control_id(domain, cur) + + # Show result + print(f"\n === ATOMIC CONTROL: {new_control_id} ===") + print(f" Titel: {result.get('title', 'N/A')}") + print(f" Ziel: {result.get('objective', 'N/A')}") + print(f" Typ: {obl['obligation_type']}") + reqs = result.get("requirements", []) + if reqs: + print(f" Anforderungen:") + for r in reqs: + print(f" - {r}") + tests = result.get("test_procedure", []) + if tests: + print(f" Pruefverfahren:") + for t in tests: + print(f" - {t}") + evidence = result.get("evidence", 
[]) + if evidence: + print(f" Nachweise:") + for e in evidence: + print(f" - {e}") + print(f" Severity: {result.get('severity', 'medium')}") + print(f" Category: {result.get('category', 'governance')}") + + # Write to DB + new_uuid = str(uuid.uuid4()) + parent_uuid = ctrl.get("uuid") + source_cit = {} + if ctrl.get("source_ref"): + parts = ctrl["source_ref"].strip().split(" ", 1) + source_cit = {"source": parts[0], "article": parts[1] if len(parts) > 1 else ""} + + cur.execute(""" + INSERT INTO compliance.canonical_controls ( + id, control_id, title, objective, requirements, test_procedure, + evidence, severity, category, release_state, + source_citation, generation_metadata, generation_strategy, + pipeline_version, parent_control_uuid, framework_id + ) VALUES ( + %s, %s, %s, %s, %s, %s, + %s, %s, %s, %s, + %s, %s, %s, + %s, %s, + (SELECT id FROM compliance.canonical_control_frameworks LIMIT 1) + ) + """, ( + new_uuid, new_control_id, + result.get("title", ""), + result.get("objective", ""), + json.dumps(result.get("requirements", []), ensure_ascii=False), + json.dumps(result.get("test_procedure", []), ensure_ascii=False), + json.dumps(result.get("evidence", []), ensure_ascii=False), + result.get("severity", "medium"), + result.get("category", "governance"), + "draft", + psycopg2.extras.Json(source_cit), + psycopg2.extras.Json({ + "obligation_type": obl["obligation_type"], + "obligation_text": obl["obligation_text"], + "pass0b_model": ANTHROPIC_MODEL, + "decomposition_method": "pass0b_preview", + }), + "pass0b_atomic", + 6, # pipeline_version + parent_uuid, + )) + conn.commit() + + created.append({ + "control_id": new_control_id, + "title": result.get("title", ""), + "obligation_type": obl["obligation_type"], + "parent_control_id": obl["control_id"], + }) + print(f" ✓ Geschrieben: {new_control_id} (parent: {obl['control_id']})") + + time.sleep(0.5) + + if created: + print(f"\n{'='*70}") + print(f"ERGEBNIS: {len(created)} atomare Controls erstellt") + 
print(f"{'='*70}") + for c in created: + print(f" {c['control_id']}: {c['title']} [{c['obligation_type']}] (von {c['parent_control_id']})") + + conn.close() + + +if __name__ == "__main__": + main()