feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped

- Control Library: parent control display, ObligationTypeBadge,
  GenerationStrategyBadge variants, evidence string fallback
- API: expose parent_control_uuid/id/title in canonical controls
- Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility
- Migration 074: control_parent_links + control_dedup_reviews tables
- QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup,
  phase5 normalize, phase74 gap fill, sync_db, run_job
- Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-21 11:56:08 +01:00
parent c52dbdb8f1
commit 643b26618f
28 changed files with 5781 additions and 75 deletions

View File

@@ -9,6 +9,7 @@ import {
import {
CanonicalControl, EFFORT_LABELS, BACKEND_URL,
SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge,
ObligationTypeBadge, GenerationStrategyBadge,
VERIFICATION_METHODS, CATEGORY_OPTIONS,
} from './helpers'
@@ -125,6 +126,8 @@ export function ControlDetail({
<VerificationMethodBadge method={ctrl.verification_method} />
<CategoryBadge category={ctrl.category} />
<TargetAudienceBadge audience={ctrl.target_audience} />
<GenerationStrategyBadge strategy={ctrl.generation_strategy} />
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
</div>
<h2 className="text-lg font-semibold text-gray-900 mt-1">{ctrl.title}</h2>
</div>
@@ -239,6 +242,32 @@ export function ControlDetail({
</section>
)}
{/* Parent Control (atomare Controls) */}
{ctrl.parent_control_uuid && (
<section className="bg-violet-50 border border-violet-200 rounded-lg p-4">
<div className="flex items-center gap-2 mb-1">
<GitMerge className="w-4 h-4 text-violet-600" />
<h3 className="text-sm font-semibold text-violet-900">Atomares Control</h3>
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
</div>
<p className="text-sm text-violet-800">
Abgeleitet aus Eltern-Control{' '}
<span className="font-mono font-semibold text-purple-700 bg-purple-100 px-1.5 py-0.5 rounded">
{ctrl.parent_control_id || ctrl.parent_control_uuid}
</span>
{ctrl.parent_control_title && (
<span className="text-violet-700 ml-1"> {ctrl.parent_control_title}</span>
)}
</p>
{ctrl.generation_metadata?.obligation_text && (
<p className="text-xs text-violet-600 mt-2 bg-violet-100/50 rounded p-2">
Obligation: {String(ctrl.generation_metadata.obligation_text).slice(0, 300)}
{String(ctrl.generation_metadata.obligation_text).length > 300 ? '...' : ''}
</p>
)}
</section>
)}
{/* Impliziter Gesetzesbezug (Rule 3 — reformuliert, kein Originaltext) */}
{!ctrl.source_citation && ctrl.open_anchors.length > 0 && (
<section className="bg-amber-50 border border-amber-200 rounded-lg p-3">
@@ -297,7 +326,7 @@ export function ControlDetail({
</section>
)}
{/* Evidence */}
{/* Evidence — handles both {type, description} objects and plain strings */}
{ctrl.evidence.length > 0 && (
<section>
<h3 className="text-sm font-semibold text-gray-900 mb-2">Nachweise</h3>
@@ -305,7 +334,11 @@ export function ControlDetail({
{ctrl.evidence.map((ev, i) => (
<div key={i} className="flex items-start gap-2 text-sm text-gray-700">
<FileText className="w-4 h-4 text-gray-400 flex-shrink-0 mt-0.5" />
<div><span className="font-medium">{ev.type}:</span> {ev.description}</div>
{typeof ev === 'string' ? (
<div>{ev}</div>
) : (
<div><span className="font-medium">{ev.type}:</span> {ev.description}</div>
)}
</div>
))}
</div>
@@ -359,7 +392,18 @@ export function ControlDetail({
<h3 className="text-sm font-semibold text-gray-700">Generierungsdetails (intern)</h3>
</div>
<div className="text-xs text-gray-600 space-y-1">
<p>Pfad: {String(ctrl.generation_metadata.processing_path || '-')}</p>
{ctrl.generation_metadata.processing_path && (
<p>Pfad: {String(ctrl.generation_metadata.processing_path)}</p>
)}
{ctrl.generation_metadata.decomposition_method && (
<p>Methode: {String(ctrl.generation_metadata.decomposition_method)}</p>
)}
{ctrl.generation_metadata.pass0b_model && (
<p>LLM: {String(ctrl.generation_metadata.pass0b_model)}</p>
)}
{ctrl.generation_metadata.obligation_type && (
<p>Obligation-Typ: {String(ctrl.generation_metadata.obligation_type)}</p>
)}
{ctrl.generation_metadata.similarity_status && (
<p className="text-red-600">Similarity: {String(ctrl.generation_metadata.similarity_status)}</p>
)}

View File

@@ -30,7 +30,7 @@ export interface CanonicalControl {
}
requirements: string[]
test_procedure: string[]
evidence: EvidenceItem[]
evidence: (EvidenceItem | string)[]
severity: string
risk_score: number | null
implementation_effort: string | null
@@ -47,6 +47,10 @@ export interface CanonicalControl {
target_audience: string | string[] | null
generation_metadata?: Record<string, unknown> | null
generation_strategy?: string | null
parent_control_uuid?: string | null
parent_control_id?: string | null
parent_control_title?: string | null
decomposition_method?: string | null
created_at: string
updated_at: string
}
@@ -275,7 +279,26 @@ export function GenerationStrategyBadge({ strategy }: { strategy: string | null
if (strategy === 'document_grouped') {
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-emerald-100 text-emerald-700">v2</span>
}
return null
if (strategy === 'phase74_gap_fill') {
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">v5 Gap</span>
}
if (strategy === 'pass0b_atomic') {
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-violet-100 text-violet-700">Atomar</span>
}
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-500">{strategy}</span>
}
// Lookup table for obligation-type badges (3-tier classification from Pass 0a).
// Keys match the lowercase `obligation_type` values stored in generation_metadata;
// `bg` carries the Tailwind color classes, `label` the German display text.
export const OBLIGATION_TYPE_CONFIG: Record<string, { bg: string; label: string }> = {
  pflicht:    { label: 'Pflicht',    bg: 'bg-red-100 text-red-700' },
  empfehlung: { label: 'Empfehlung', bg: 'bg-amber-100 text-amber-700' },
  kann:       { label: 'Kann',       bg: 'bg-green-100 text-green-700' },
}
// Renders a small colored pill for an obligation type (Pflicht/Empfehlung/Kann).
// Returns null for missing types and for values not present in
// OBLIGATION_TYPE_CONFIG, so callers can pass generation_metadata fields directly.
export function ObligationTypeBadge({ type }: { type: string | null | undefined }) {
  const cfg = type ? OBLIGATION_TYPE_CONFIG[type] : undefined
  if (!cfg) return null
  return <span className={`inline-flex items-center px-2 py-0.5 rounded text-xs font-medium ${cfg.bg}`}>{cfg.label}</span>
}
export function getDomain(controlId: string): string {

View File

@@ -9,7 +9,7 @@ import {
import {
CanonicalControl, Framework, BACKEND_URL, EMPTY_CONTROL,
SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge,
GenerationStrategyBadge,
GenerationStrategyBadge, ObligationTypeBadge,
VERIFICATION_METHODS, CATEGORY_OPTIONS, TARGET_AUDIENCE_OPTIONS,
} from './components/helpers'
import { ControlForm } from './components/ControlForm'
@@ -762,6 +762,7 @@ export default function ControlLibraryPage() {
<CategoryBadge category={ctrl.category} />
<TargetAudienceBadge audience={ctrl.target_audience} />
<GenerationStrategyBadge strategy={ctrl.generation_strategy} />
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
{ctrl.risk_score !== null && (
<span className="text-xs text-gray-400">Score: {ctrl.risk_score}</span>
)}

View File

@@ -174,6 +174,9 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
customer_visible, verification_method, category,
target_audience, generation_metadata, generation_strategy,
applicable_industries, applicable_company_size, scope_conditions,
parent_control_uuid, decomposition_method, pipeline_version,
(SELECT p.control_id FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_id,
(SELECT p.title FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_title,
created_at, updated_at"""
@@ -798,6 +801,11 @@ def _control_row(r) -> dict:
"applicable_industries": getattr(r, "applicable_industries", None),
"applicable_company_size": getattr(r, "applicable_company_size", None),
"scope_conditions": getattr(r, "scope_conditions", None),
"parent_control_uuid": str(r.parent_control_uuid) if getattr(r, "parent_control_uuid", None) else None,
"parent_control_id": getattr(r, "parent_control_id", None),
"parent_control_title": getattr(r, "parent_control_title", None),
"decomposition_method": getattr(r, "decomposition_method", None),
"pipeline_version": getattr(r, "pipeline_version", None),
"created_at": r.created_at.isoformat() if r.created_at else None,
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
}

View File

@@ -200,6 +200,9 @@ def _get_tenant_id(tenant_id: Optional[str]) -> str:
def _dsfa_to_response(row) -> dict:
"""Convert a DB row to a JSON-serializable dict."""
import json
# SQLAlchemy 2.0: Row objects need ._mapping for string-key access
if hasattr(row, "_mapping"):
row = row._mapping
def _parse_arr(val):
"""Parse a JSONB array field → list."""
@@ -558,8 +561,9 @@ async def create_dsfa(
).fetchone()
db.flush()
row_id = row._mapping["id"] if hasattr(row, "_mapping") else row[0]
_log_audit(
db, tid, row["id"], "CREATE", request.created_by,
db, tid, row_id, "CREATE", request.created_by,
new_values={"title": request.title, "status": request.status},
)
db.commit()

View File

@@ -0,0 +1,73 @@
-- Migration 074: Control Dedup Engine — DB Schema
-- Supports the 4-stage dedup pipeline for atomic controls (Pass 0b).
--
-- Tables:
-- 1. control_parent_links — M:N parent linking (one control → many regulations)
-- 2. control_dedup_reviews — Review queue for borderline matches (0.85-0.92)
BEGIN;
-- =============================================================================
-- 1. Control Parent Links (M:N)
-- Enables "1 Control erfuellt 5 Gesetze" — the biggest USP.
-- An atomic control can have multiple parent controls from different
-- regulations/obligations. This replaces the 1:1 parent_control_uuid FK.
-- =============================================================================
CREATE TABLE IF NOT EXISTS control_parent_links (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Both ends reference canonical_controls; cascade so links vanish with either control.
control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
-- NOTE(review): link_type semantics assumed from the names (Pass 0b split,
-- dedup merge, manual curation, framework crosswalk) — confirm with pipeline docs.
link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition'
CHECK (link_type IN ('decomposition', 'dedup_merge', 'manual', 'crosswalk')),
-- Match confidence in [0, 1]; defaults to certain (1.0) for non-dedup links.
confidence NUMERIC(3,2) DEFAULT 1.0
CHECK (confidence >= 0 AND confidence <= 1),
source_regulation VARCHAR(100),
source_article VARCHAR(100),
-- No ON DELETE action: deleting an obligation candidate with links will fail (NO ACTION).
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
created_at TIMESTAMPTZ DEFAULT NOW(),
-- At most one link per (child, parent) pair, regardless of link_type.
CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
);
CREATE INDEX IF NOT EXISTS idx_cpl_control ON control_parent_links(control_uuid);
CREATE INDEX IF NOT EXISTS idx_cpl_parent ON control_parent_links(parent_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cpl_type ON control_parent_links(link_type);
COMMENT ON TABLE control_parent_links IS
'M:N parent links — one atomic control can fulfill multiple regulations/obligations. USP: "1 Control erfuellt 5 Gesetze"';
-- =============================================================================
-- 2. Control Dedup Reviews
-- Queue for borderline matches (similarity 0.85-0.92) that need human review.
-- Reviewed entries get status updated to accepted/rejected.
-- =============================================================================
CREATE TABLE IF NOT EXISTS control_dedup_reviews (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Candidate is stored denormalized (id/title/objective) — it may not exist
-- in canonical_controls yet, so no FK on the candidate side.
candidate_control_id VARCHAR(30) NOT NULL,
candidate_title TEXT NOT NULL,
candidate_objective TEXT,
matched_control_uuid UUID REFERENCES canonical_controls(id),
matched_control_id VARCHAR(30),
-- NUMERIC(4,3): similarity with 3 decimal places; review band is 0.85-0.92.
similarity_score NUMERIC(4,3) DEFAULT 0.0,
dedup_stage VARCHAR(40) NOT NULL,
dedup_details JSONB DEFAULT '{}',
parent_control_uuid UUID REFERENCES canonical_controls(id),
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
-- Reviewer outcome: accepted_link (link to match), accepted_new (keep as new
-- control), or rejected; entries start as pending.
review_status VARCHAR(20) DEFAULT 'pending'
CHECK (review_status IN ('pending', 'accepted_link', 'accepted_new', 'rejected')),
reviewed_by VARCHAR(100),
reviewed_at TIMESTAMPTZ,
review_notes TEXT,
created_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE INDEX IF NOT EXISTS idx_cdr_status ON control_dedup_reviews(review_status);
CREATE INDEX IF NOT EXISTS idx_cdr_matched ON control_dedup_reviews(matched_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cdr_parent ON control_dedup_reviews(parent_control_uuid);
CREATE INDEX IF NOT EXISTS idx_cdr_stage ON control_dedup_reviews(dedup_stage);
COMMENT ON TABLE control_dedup_reviews IS
'Review queue for borderline dedup matches (similarity 0.85-0.92). Human decides: link or new control.';
COMMIT;

View File

@@ -195,6 +195,11 @@ class TestControlRowConversion:
"release_state": "draft",
"tags": ["mfa"],
"generation_strategy": "ungrouped",
"parent_control_uuid": None,
"parent_control_id": None,
"parent_control_title": None,
"decomposition_method": None,
"pipeline_version": None,
"created_at": now,
"updated_at": now,
}

View File

@@ -2,7 +2,23 @@
## Übersicht
Die Control Quality Pipeline prüft und verbessert die ~9.000 Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert.
Die Control Quality Pipeline prüft und verbessert die Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert.
Alle Scripts liegen in **`scripts/qa/`**. Starten auf dem Mac Mini via Runner-Script:
```bash
# Job starten (laedt .env automatisch, PID-Lock, unbuffered output)
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh <script.py> [args...]"
# Status aller Jobs
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --status"
# Log ansehen
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --log <script.py>"
# Job stoppen
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --kill <script.py>"
```
## Architektur
@@ -55,20 +71,24 @@ Jeder Control hat ein Feld `source_original_text` — der Chunk-Text aus dem Que
| Metrik | Wert |
|---|---|
| Controls mit source_original_text | 7.943 |
| Im PDF lokalisiert | **6.259 (79%)** |
| Nicht gefunden (Sprachmismatch) | 1.651 |
| Kein PDF vorhanden | 33 |
| 100% Match-Rate | 19 Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53) |
| Controls mit source_original_text | 5.751 (86%) |
| Im PDF lokalisiert | **5.063 (88%)** |
| Nicht gefunden | 649 |
| Kein PDF vorhanden | 29 |
| Recital_suspect markiert | 648 |
| 100% Match-Rate | 20+ Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53, Blue Guide) |
**Verlauf:** v1 (4.110, 52%) → v2 (6.091, 77%) → v3 (6.259, 79%) → v4 +Blue Guide EN (6.803, 86%) → v5 nach Cleanup (5.063/5.751, 88%)
### Nicht-matchende Controls
| Ursache | Controls | Erklärung |
| Ursache | Controls | Status |
|---|---|---|
| Blue Guide EN vs. DE PDF | ~562 | Controls aus englischem PDF, wir haben nur deutsches |
| OWASP multilingual | ~632 | Controls aus PT/AR/ID/ES-Übersetzungen |
| ~~Blue Guide EN vs. DE PDF~~ | ~~562~~ | ✅ Gelöst — EN-PDF beschafft, 544/544 gematcht |
| ~~OWASP Top 10 multilingual~~ | ~~324~~ | ✅ Als duplicate markiert — Übersetzungen ohne Mehrwert |
| CRA Encoding | ~76 | PDF-Ligaturen/Sonderzeichen-Differenzen |
| CISA Secure by Design | ~113 | Falsches PDF (ENISA statt CISA) |
| OWASP ASVS | ~173 | PDF-Matching-Problem (meist EN) |
## Brute-Force-Suche
@@ -100,34 +120,276 @@ Controls aus Erwägungsgründen (`article_type = preamble`) sind **kein Nachteil
Die 1.195 v1-Controls **ohne** Originaltext sind manuell erstellt (`strategy=ungrouped`) und haben keine Chunk-Referenz.
## DB-Status (Stand 2026-03-20)
## OWASP Cleanup (2026-03-20)
- **324 OWASP Top 10 multilingual Controls** → `duplicate` markiert (ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien)
- **47 Controls** mit falscher Quellenzuordnung korrigiert (z.B. als "OWASP Top 10" getaggt, aber tatsächlich aus ASVS/SAMM/API/MASVS)
- **~200 OWASP ASVS/SAMM/MASVS EN Controls** behalten — unique Content aus GitHub/Website, nicht im PDF auffindbar
## NIST OSCAL Import (2026-03-20)
**776 neue Controls** aus NIST SP 800-53 Rev 5 OSCAL (Public Domain, maschinenlesbar):
- Quelle: `usnistgov/oscal-content` (JSON Catalog)
- Vor allem **Control Enhancements** (z.B. AC-2(3), SC-7(8)) — die atomaren Unteranforderungen
- Jeder Control enthält: Statement + Guidance + Assessment-Methoden + Cross-References + Parameters
- `pipeline_version = 4`, `generation_strategy = 'oscal_import'`
- Kein Pass 0a/0b nötig — Controls sind **bereits atomar**
| Metrik | Vorher | Nachher |
|---|---|---|
| SP 800-53 Controls (aktiv) | 1.107 | **1.883** |
| OSCAL-Abdeckung | 238/1.014 (23%) | **1.014/1.014 (100%)** |
## Phase 5: RAG-Deduplizierung + Normalisierung (2026-03-20)
### Durchgeführte Schritte
| Schritt | Beschreibung | Controls |
|---|---|---|
| 5.1 | OSCAL Controls: `source_regulation` in generation_metadata gesetzt | 776 |
| 5.2 | v3 Controls ohne Source → `needs_review` mit `missing_source` Flag | 20 |
| 5.3 | Leerer Source-Name korrigiert (AT TKG) | 1 |
| 5.4 | OWASP regulation_code Fehlzuordnungen korrigiert | 47 |
| 5.5 | **duplicate/too_close Controls hart gelöscht** | **3.301** |
| 5.6 | Processed Chunks bereinigt (gelöschte Control-IDs entfernt) | 2.520 |
### Ergebnis
- **Vorher:** 9.936 Controls (6.635 aktiv, 2.998 duplicate, 303 too_close)
- **Nachher:** 6.635 Controls, **alle aktiv** (0 duplicate/too_close)
- Alle regulation_codes haben jetzt einheitliche Source-Namen
- OWASP-Controls sind korrekt ihren Quellen zugeordnet
## DB-Status (Stand 2026-03-20, nach Phase 7.4)
| release_state | Count |
|---|---|
| draft | 5.365 |
| needs_review | 818 |
| duplicate | 2.674 |
| too_close | 303 |
| **Aktiv** | **6.183** |
| draft | ~6.030 |
| needs_review | 838 |
| **Gesamt** | **6.868** |
## Scripts
## Scripts (`scripts/qa/`)
Alle QA-Scripts liegen in `scripts/qa/`:
### Kern-QA (PDF-Matching)
| Script | Beschreibung |
|---|---|
| `pdf_qa_all.py` | Haupt-QA: Controls gegen PDFs matchen |
| `pdf_qa_inventory.py` | Inventar: Regulations, Controls, PDFs |
| `apply_pdf_qa_results.py` | Ergebnisse in DB schreiben |
| `preamble_dedup.py` | Preamble vs. Artikel Duplikat-Erkennung |
| `qa_dedup_controls.py` | Jaccard-basierte Titel-Dedup |
| `qa_normalize_sources.py` | Source-Namen normalisieren |
| `db_status.py` | DB-Status-Übersicht |
| `pdf_qa_all.py` | **Haupt-QA**: Controls gegen PDFs matchen, Artikel-Index aufbauen. Enthaelt `SOURCE_FILE_MAP`, alle Index-Builder (EU, DE, NIST, OWASP, generic). 526 Zeilen. |
| `pdf_qa_inventory.py` | Inventar: Welche Regulations haben Controls, wie viele, welche PDFs existieren |
| `apply_pdf_qa_results.py` | Ergebnisse aus `pdf_qa_all.py` in DB schreiben (`article_type`, `recital_suspect`) |
| `pdf_article_lookup_poc.py` | POC: Control-Text in PDF lokalisieren, Headings von Cross-Refs unterscheiden |
## Nächste Schritte
### Lueckenanalyse + Control-Generierung
1. **Blue Guide EN-PDF** beschaffen → +562 Controls matchen
2. **CISA Secure by Design** echtes PDF finden → +113 Controls
3. **Brute-Force Ergebnisse anwenden** — 44 falsche Source-Zuordnungen korrigieren
4. **Frontend-Anzeige** — `article_type` im Control-Detail anzeigen
5. **Continuous QA** — Bei neuen Controls automatisch PDF-Match prüfen
| Script | Beschreibung |
|---|---|
| `gap_analysis.py` | **Phase 7.3**: Artikel im PDF vs. Controls in DB vergleichen, Luecken identifizieren |
| `phase74_generate_gap_controls.py` | **Phase 7.4**: Neue Controls fuer Luecken via Anthropic API generieren. `pipeline_version=5`. 624 Zeilen. |
| `benchmark_llm_controls.py` | LLM-Vergleich: gpt-oss-120b vs. Claude Sonnet fuer Control-Generierung |
| `test_pass0a.py` | **Pass 0a Test**: Obligation Extraction + 3-Tier-Klassifizierung (Pflicht/Empfehlung/Kann). Standalone, speichert JSON. |
### Deduplizierung + Normalisierung
| Script | Beschreibung |
|---|---|
| `preamble_dedup.py` | Preamble vs. Artikel Duplikat-Erkennung (Jaccard >= 0.40) |
| `qa_dedup_controls.py` | Jaccard-basierte Titel-Deduplizierung |
| `qa_apply_and_dedup.py` | Ergebnisse anwenden + Duplikate in einem Schritt markieren |
| `qa_normalize_sources.py` | Source-Namen normalisieren (kanonische Namen) |
| `phase5_normalize_and_cleanup.py` | **Phase 5**: Normalisierung + 3.301 Duplikate hart loeschen |
| `qa_delete_gpsr_dupe.py` | GPSR-Duplikate loeschen |
| `delete_gpsr_prod.py` | GPSR-Duplikate aus Production-Qdrant entfernen |
### Quellen-spezifische Scripts
| Script | Beschreibung |
|---|---|
| `blue_guide_en_match.py` | Blue Guide EN-PDF matchen (544/544 Erfolg) |
| `owasp_cleanup.py` | OWASP multilingual Cleanup (324 Duplikate) + Source-Fix (47 korrigiert) |
| `owasp_github_match.py` | OWASP ASVS/SAMM/MASVS gegen GitHub-Markdown matchen |
| `oscal_import.py` | NIST OSCAL Import (776 Controls aus JSON Catalog) |
| `oscal_analysis.py` | NIST OSCAL Analyse: Abdeckung, fehlende Controls |
### Diagnose + Utilities
| Script | Beschreibung |
|---|---|
| `db_status.py` | DB-Status: release_state Counts, pipeline_version, source Verteilung |
| `debug_low_match.py` | Debugging: Warum matchen Blue Guide / OWASP / CISA schlecht? |
| `qa_article_map_all_chunks.py` | Alle Chunks Artikel-Nummern zuordnen (Bulk) |
| `backfill_job_66228863.py` | Einmaliger Backfill-Job |
| `sync_controls_to_prod.py` | Controls von Dev nach Production synchronisieren |
### Runner
| Script | Beschreibung |
|---|---|
| `run_job.sh` | **Job-Runner**: Laedt `.env`, PID-Lock, Monitoring (`--status`, `--log`, `--kill`) |
## Phase 7: PDF-Validierung + Enrichment (2026-03-20)
### 7.1 + 7.2: Controls gegen PDFs validiert + Ergebnisse angewendet ✅
- 5.063 Controls erfolgreich im Original-PDF lokalisiert (88%)
- `article_type` fuer alle gematchten Controls gesetzt
- 648 Preamble-Controls als `recital_suspect` in `generation_metadata` markiert
- 332 Controls nicht matchbar (OWASP ASVS 132, CISA 72, ENISA 38, OWASP SAMM 31, CRA 28)
### 7.3: Lueckenanalyse ✅
**494 Artikel-Luecken** in 15 Quellen identifiziert. Geschaetzt ~300 davon actionable.
| Source | Luecken | Coverage | Bemerkung |
|---|---:|---:|---|
| AML-Verordnung | 91 | 5% | Kaum ingestiert |
| MiCA | 71 | 52% | Grosse Verordnung |
| NIST SP 800-53 | 59 | 83% | Meist Section-Header, nur SA-15 fehlt |
| OWASP ASVS 4.0 | 47 | 35% | Requirement-Gruppen fehlen |
| Batterieverordnung | 41 | 58% | |
| DSGVO | 35 | 65% | Einige Governance/Aufsicht-Artikel |
| ENISA ICS/SCADA | 34 | 31% | |
| ENISA Supply Chain | 26 | 7% | |
| CRA | 23 | 68% | |
| NIS2 | 16 | 65% | |
| KI-Verordnung | 15 | 87% | Fast komplett |
| Maschinenverordnung | 5 | 91% | Fast komplett |
### 7.4: Neue Controls fuer Luecken generieren ✅ (2026-03-20)
Script: `phase74_generate_gap_controls.py --resume`
- **494 Artikel-Luecken** in 15 Quellen → Anthropic Claude Sonnet 4.6
- `pipeline_version = 5`, `generation_strategy = 'phase74_gap_fill'`
- Direkt PDF-Text als Input (nicht RAG-Chunks)
- Starten via: `run_job.sh phase74_generate_gap_controls.py --resume`
**Ergebnis:**
| Source | Luecken | Generiert |
|---|---:|---:|
| AML-Verordnung | 91 | 97 |
| MiCA | 71 | 68 |
| NIST SP 800-53 | 59 | 19 |
| KI-Verordnung | 15 | 15 |
| OWASP ASVS 4.0 | 47 | 11 |
| Batterieverordnung | 41 | 9 |
| DSGVO | 35 | 4 |
| OWASP Top 10 | 12 | 3 |
| NIS2 | 16 | 3 |
| CRA | 23 | 3 |
| OECD KI-Empfehlung | 4 | 1 |
| **Gesamt** | **494** | **233** |
Nicht generiert: 75 wegen zu kurzem Text, 29 NIST-Intros, 11 Parse-Errors, 162 ID-Konflikte (COMP-1000 etc.).
API-Kosten: ~$7,55 (109 min Laufzeit).
## Pass 0a: Obligation Extraction — 3-Tier-Klassifizierung
### Konzept
Pass 0a zerlegt Rich Controls (~6.000) in **atomare Obligations** per LLM (Claude Sonnet 4.6).
Jede Obligation wird durch den **Quality Gate** klassifiziert — nicht gefiltert:
| obligation_type | Signal | Beispiel |
|---|---|---|
| **pflicht** | müssen, muss, ist zu, hat zu, shall, must, required | "Der Betreiber muss alle Daten verschluesseln" |
| **empfehlung** | soll, sollen, should, sicherstellen, gewaehrleisten, dokumentieren | "Der Betreiber soll regelmaessige Audits durchfuehren" |
| **kann** | kann, koennen, darf, duerfen, may, optional | "Der Betreiber kann zusaetzliche Massnahmen ergreifen" |
**Wichtig:** Nichts wird mehr rejected wegen fehlendem normativem Signal. Obligations ohne Signal werden als `empfehlung` klassifiziert. Rejected werden nur noch: Evidence-Only, zu kurz (<20 Zeichen), fehlender Parent-Link.
### Warum auch Empfehlungen behalten?
Empfehlungen helfen Firmen, ihre Systeme sicherer zu machen — ueber das Pflichtprogramm hinaus. Im Frontend erhalten Kunden einen Marker, der klar anzeigt:
- **Pflicht** = gesetzlich/regulatorisch vorgeschrieben
- **Empfehlung** = Best Practice, freiwillig, aber wertvoll
- **Kann** = optional, weitergehende Massnahme
### Quality Gate — Kritische Flags
| Flag | Kritisch? | Beschreibung |
|---|---|---|
| `has_normative_signal` | Nein | Informativer Check, kein Ablehnungsgrund |
| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) |
| `not_evidence_only` | **Ja** | Kein reiner Nachweis-Eintrag |
| `min_length` | **Ja** | Mindestens 20 Zeichen |
| `has_parent_link` | **Ja** | Verbindung zum Parent-Control |
| `single_action` | Nein | Nur ein Hauptverb (heuristisch) |
| `not_rationale` | Nein | Keine reine Begruendung |
### Normative Signal Detection — Regex-Tiers
```
Tier 1 (Pflicht): muessen, muss, ist/sind/hat/haben zu + Infinitiv,
Compound-Verben (festzustellen, vorzunehmen),
Gerundivum (mitzuteilen, bereitzustellen),
shall, must, required
Tier 2 (Empfehlung): soll, sollen, sollte, sollten,
gewaehrleisten, sicherstellen,
should, ensure, recommend,
dokumentieren, implementieren, ueberpruefen
Tier 3 (Kann): kann, koennen, darf, duerfen, may, optional
```
### Testergebnisse (3 Iterationen, 2026-03-20)
| Run | Controls | Obligations | Validated | Rejected | Kosten |
|---|---:|---:|---:|---:|---:|
| 1 (v0 Regex) | 10 | ~100 | 68% | 32% | $0,28 |
| 2 (v1 Regex) | 50 | ~530 | 78% | 22% | $1,43 |
| 3 (v2 Regex) | 50 | ~530 | 86% | 14% | $1,44 |
| 4 (3-Tier) | 60 | — | — | — | — |
Run 4 laeuft mit dem neuen Klassifizierer — statt PASS/REJECT wird jetzt PFLICHT/EMPFEHLUNG/KANN ausgegeben.
### Scripts
| Script | Beschreibung |
|---|---|
| `test_pass0a.py` | **Test-Script**: Standalone (kein SQLAlchemy), psycopg2 + Anthropic API. Speichert Ergebnisse als JSON. |
```bash
# Test mit 10 Controls
run_job.sh test_pass0a.py --limit 10
# Test mit bestimmter Quelle
run_job.sh test_pass0a.py --limit 20 --source "DSGVO"
# Ergebnisse: /tmp/pass0a_results_<N>controls.json
```
### Backend-Code
- **Klassifizierung:** `backend-compliance/compliance/services/decomposition_pass.py`
- `classify_obligation_type()` — 3-Tier-Klassifizierung
- `quality_gate()` — gibt `obligation_type` in Flags zurueck
- `passes_quality_gate()` — `has_normative_signal` nicht mehr kritisch
- `ObligationCandidate.obligation_type` — neues Feld
### Hochrechnung (basierend auf 50-Control-Runs)
| Metrik | Wert |
|---|---|
| Kosten pro Control | ~$0,029 |
| Kosten fuer ~6.000 Controls | **~$172** |
| Laufzeit (geschaetzt) | ~25h |
| Obligations pro Control | ~10,5 |
---
## Naechste Schritte
1. ~~**Phase 5 Cleanup** → 3.301 Duplikate geloescht, Source normalisiert~~
2. ~~**Phase 6 Pipeline-Haertung** → Source aus REGULATION_LICENSE_MAP~~
3. ~~**Phase 7.1-7.3** → PDF-Validierung + Enrichment + Lueckenanalyse~~
4. ~~**Phase 7.4** → 233 neue Controls fuer Luecken generiert ($7,55)~~
5. **Pass 0a** → Obligation Extraction mit 3-Tier-Klassifizierung (Tests laufen, ~$172)
6. **Pass 0b** → Atomic Control Composition aus validierten Obligations
7. **Pass 1-5** → Multi-Layer Migration (Code + 500 Tests bereits vorhanden)
8. **Phase 8** → Qdrant Re-Ingestion (Runtime-Betrieb, ZULETZT)
9. **needs_review Triage** — 838 Controls klassifizieren
10. **Frontend** — `obligation_type` (Pflicht/Empfehlung/Kann) + `article_type` anzeigen

View File

@@ -0,0 +1,206 @@
# RAG Pipeline Benchmark & Optimierungen
Stand: 2026-03-21. Vergleich unserer Implementierung mit State of the Art. Priorisierte Empfehlungen nach Impact/Effort.
---
## Aktuelle Pipeline (Ist-Zustand)
```mermaid
flowchart LR
A[Dokumente] -->|Document Crawler| B[Chunks 512/50]
B -->|bge-m3| C[Qdrant Dense]
C -->|Cosine Search| D[Control Generator v2]
D -->|LLM| E[Rich Controls 6.373]
E -->|Pass 0a| F[Obligations]
F -->|Pass 0b| G[Atomare Controls]
G -->|4-Stage Dedup| H[Master Controls ~18K]
```
| Komponente | Implementierung | SOTA-Bewertung |
|-----------|----------------|----------------|
| **Chunking** | Rekursiv, 512 Zeichen, 50 Overlap | Zu klein fuer Rechtstexte |
| **Embedding** | bge-m3 (1024-dim, Ollama) | Gut, aber nur Dense genutzt |
| **Vector DB** | Qdrant mit Payload-Filtering | Hybrid Search nicht aktiviert |
| **Retrieval** | Pure Dense Cosine Similarity | Kein Re-Ranking, kein BM25 |
| **Extraktion** | 3-Tier (Exact → Embedding → LLM) | Solide Architektur |
| **Dedup** | 4-Stage (Pattern → Action → Object → Embedding) | Ueberdurchschnittlich |
| **QA** | 5-Metrik Similarity + PDF-QA Matching | Gut, RAGAS fehlt |
---
## Tier 1: Quick Wins (Tage, nicht Wochen)
### 1. Chunk-Groesse erhoehen: 512 → 1024, Overlap 50 → 128
**Problem:** NAACL 2025 Vectara-Studie zeigt: fuer analytische/juristische Queries sind 512-1024 Token optimal. Unsere 512-Zeichen-Chunks (= ~128 Token) sind deutlich zu klein.
**Unsere Lessons Learned:** "Chunks werden mitten im Absatz abgeschnitten. Artikel- und Paragraphennummern fehlen."
**Aenderung:** Config-Parameter in `ingest-phase-h.sh` anpassen.
| Metrik | Vorher | Nachher |
|--------|--------|---------|
| Chunk Size | 512 chars (~128 Token) | 1024 chars (~256 Token) |
| Overlap | 50 chars (10%) | 128 chars (12.5%) |
**Impact:** HOCH | **Effort:** NIEDRIG
### 2. Ollama JSON-Mode fuer Obligation Extraction
**Problem:** `_parse_json` in `decomposition_pass.py` hat Regex-Fallback — das zeigt, dass LLM-Output nicht zuverlaessig JSON ist.
**Aenderung:** `format: "json"` in Ollama-API-Calls setzen.
**Impact:** MITTEL | **Effort:** NIEDRIG (1 Parameter)
### 3. Chain-of-Thought Prompting fuer Pass 0a/0b
**Problem:** LegalGPT-Framework zeigt: explizite Reasoning-Chains ("Erst Addressat identifizieren, dann Aktion, dann normative Staerke") verbessern Extraktionsqualitaet signifikant.
**Impact:** MITTEL | **Effort:** NIEDRIG (Prompt Engineering)
---
## Tier 2: High Impact, Medium Effort (1-2 Wochen)
### 4. Hybrid Search (Dense + Sparse) via Qdrant
**Problem:** Reine Dense-Suche. Juristische Queries enthalten spezifische Begriffe ("DSGVO Art. 35", "Abs. 3"), die BM25/Sparse besser findet.
**Loesungsansatz:** BGE-M3 generiert bereits Sparse Vectors — wir verwerfen sie aktuell!
```
Qdrant Query API:
- Dense: bge-m3 Cosine (wie bisher)
- Sparse: bge-m3 Sparse Vectors (neu)
- Fusion: Reciprocal Rank Fusion (RRF)
```
**Benchmarks (Anthropic):** 49% weniger fehlgeschlagene Retrievals mit Contextual Retrieval, 67% mit Re-Ranking.
**Impact:** SEHR HOCH | **Effort:** MITTEL
### 5. Cross-Encoder Re-Ranking
**Problem:** Top-5 Ergebnisse direkt an LLM — keine Qualitaetspruefung der Retrieval-Ergebnisse.
**Loesungsansatz:** BGE Reranker v2 (MIT-Lizenz) auf Top-20 Ergebnisse, dann Top-5 an LLM.
| Re-Ranker | Lizenz | Empfehlung |
|-----------|--------|------------|
| BGE Reranker v2 | MIT | Empfohlen |
| Jina Reranker v2 | Apache-2.0 | Alternative |
| ColBERT v2 | MIT | Spaeter |
**Impact:** HOCH | **Effort:** MITTEL
### 6. Cross-Regulation Dedup Pass
**Problem:** Dedup filtert immer nach `pattern_id` — Controls aus DSGVO Art. 25 und NIS2 Art. 21 (beide Security-by-Design) werden nie verglichen.
**Loesungsansatz:** Zweiter Qdrant-Search ohne `pattern_id`-Filter nach dem normalen Dedup-Pass.
**Impact:** HOCH | **Effort:** MITTEL
### 7. Automatische Regressionstests (Golden Set)
**Problem:** Keine systematische Qualitaetsmessung nach Pipeline-Aenderungen.
**Loesungsansatz:** 20-Chunk Golden Set → Control-Generation → Output-Stabilitaet pruefen.
**Impact:** HOCH | **Effort:** NIEDRIG
---
## Tier 3: Strategische Investitionen (Wochen bis Monate)
### 8. Artikel-Boundary Chunking
Eigener Splitter fuer EU-Verordnungen und deutsche Gesetze: Split an "Art.", "Artikel", "Paragraph"-Grenzen statt nach Zeichenzahl.
### 9. RAGAS Evaluation Pipeline
[RAGAS](https://docs.ragas.io/) mit Golden Dataset (50-100 manuell verifizierte Control-to-Source Mappings). Metriken: Faithfulness, Answer Relevancy, Context Precision, Context Recall.
### 10. BGE-M3 Fine-Tuning
Fine-Tuning auf Compliance-Corpus (~6.373 Control-Titel/Objective-Paare). Research zeigt +10-30% Domain-Retrieval-Verbesserung.
### 11. LLM-as-Judge
Claude Sonnet bewertet jeden generierten Control auf Faithfulness zum Quelltext (~$0.01/Control).
### 12. Active Learning aus Review-Queue
Menschliche Entscheidungen der Dedup Review-Queue nutzen, um Schwellenwerte ueber die Zeit zu optimieren.
---
## Nicht empfohlen (niedriger ROI oder Konflikte)
| Ansatz | Grund |
|--------|-------|
| Jina v3 Embeddings | **CC-BY-NC-4.0** — verletzt Open Source Policy |
| Voyage-law-2 | API-only, proprietaer — kein Self-Hosting |
| Semantic Chunking | Benchmarks zeigen keinen Vorteil gegenueber Recursive fuer strukturierte Dokumente |
| HyDE als Primaerstrategie | Latenz (+43-60%) + Halluzinationsrisiko |
| Knowledge Graph RAG | Massiver Aufwand, unklarer Gewinn bei strukturiertem Rechtskorpus |
---
## Embedding-Modell Vergleich
| Modell | MTEB Score | Multilingual | Kontext | Lizenz | Bewertung |
|--------|-----------|-------------|---------|--------|-----------|
| **BGE-M3** (aktuell) | 63.0 | 100+ Sprachen | 8192 Token | MIT | Gut, Dense+Sparse+ColBERT |
| Jina v3 | 65.5 | 89 Sprachen | 8192 Token | CC-BY-NC | Nicht nutzbar (Lizenz!) |
| E5-Mistral-7B | ~65 | Gut | 4096 Token | MIT | Gross, hoher RAM |
| Voyage-law-2 | Best Legal | EN Legal | 16K Token | Proprietaer | Nicht nutzbar (API-only) |
**Fazit:** BGE-M3 bleibt die beste Wahl fuer unseren Stack. Sparse-Vectors aktivieren und Fine-Tuning bringen mehr als ein Modellwechsel.
---
## Test-Coverage Analyse
### Pipeline-Module (567 Tests)
| Modul | Tests | Bewertung | Fehlende Tests |
|-------|-------|-----------|----------------|
| Control Generator | 110 | Exzellent | 10-15 Edge Cases |
| Obligation Extractor | 107 | Exzellent | 8-10 Edge Cases |
| Decomposition Pass | 90 | Exzellent | 5-8 Edge Cases |
| Pattern Matcher | 72 | Gut | 10-15 Edge Cases |
| Control Dedup | 56 | Exzellent | 5-8 Edge Cases |
| Control Composer | 54 | Gut | 8-10 Edge Cases |
| Pipeline Adapter | 36 | Gut | 10-15 Edge Cases |
| Citation Backfill | 20 | Moderat | 5-8 Edge Cases |
| License Gate | 12 | Minimal | 5-8 Edge Cases |
| RAG Client | 10 | Minimal | 5-8 Edge Cases |
### Kritische Luecken (fehlende Tests)
| Service | Datei | Prioritaet |
|---------|-------|------------|
| AI Compliance Assistant | `ai_compliance_assistant.py` | HOCH (25-30 Tests noetig) |
| PDF Extractor | `pdf_extractor.py` | HOCH (20-25 Tests noetig) |
| LLM Provider | `llm_provider.py` | HOCH (15-20 Tests noetig) |
| Similarity Detector | `similarity_detector.py` | MITTEL (20-25 Tests noetig) |
| Anchor Finder | `anchor_finder.py` | MITTEL |
### Test-Infrastruktur
**Fehlend:** Shared `conftest.py` mit gemeinsamen Fixtures (LLM-Mock, DB-Mock, Embedding-Mock). Aktuell sind Fixtures in jedem Test-File dupliziert.
---
## Quellen
- [NAACL 2025 Vectara Chunking Study](https://blog.premai.io/rag-chunking-strategies-the-2026-benchmark-guide/)
- [Anthropic Contextual Retrieval](https://www.anthropic.com/news/contextual-retrieval)
- [Qdrant Hybrid Search Query API](https://qdrant.tech/articles/hybrid-search/)
- [Structure-Aware Chunking for Legal (ACL 2025)](https://aclanthology.org/2025.justnlp-main.19/)
- [RAGAS Evaluation Framework](https://docs.ragas.io/)
- [BGE Reranker v2 (MIT)](https://huggingface.co/BAAI/bge-reranker-v2-m3)
- [LegalGPT / CALLM Framework](https://www.emergentmind.com/topics/compliance-alignment-llm-callm)

View File

@@ -0,0 +1,223 @@
# RAG Pipeline: Lessons Learned & Hardening
## Übersicht
Dieses Dokument beschreibt die Erkenntnisse aus dem Aufbau der RAG-Pipeline und die daraus abgeleiteten Maßnahmen zur Härtung. Es dient als Referenz für zukünftige Ingestion-Runs und Pipeline-Erweiterungen.
## Architektur: Wann brauchen wir RAG vs. Direct PDF?
### RAG ist nötig für:
| Use Case | Warum RAG? |
|---|---|
| **Compliance Advisor (Chat)** | Semantische Suche über 38+ Dokumente in Echtzeit |
| **Cross-Regulation Mapping** | "Zeige alle Anforderungen zu Verschlüsselung" über alle Quellen |
| **Customer Scope-Filtering** | Nur Chunks aus relevanten Regulations für den Kunden |
| **Inkrementelle Updates** | Neues Dokument → nur neue Chunks verarbeiten |
### RAG ist NICHT nötig für:
| Use Case | Besser: Direct PDF |
|---|---|
| **Control-Generierung (Batch)** | PDF → PyMuPDF → Strukturparser → Artikel-Index → API |
| **PDF-QA/Verifizierung** | Substring-Match direkt im PDF (schneller, exakter) |
| **Artikel/§-Extraktion** | Regex-basierte Extraktion aus PDF-Text |
### Hybrid-Ansatz (Empfehlung)
```
Control-Generierung: PDF → Strukturparser → Artikel-Index → Anthropic API
(KEIN RAG nötig, direkt aus PDF)
Runtime-Betrieb: Qdrant-RAG für semantische Suche, Chat, Scope-Analyse
(RAG mit angereicherten Chunks + Struktur-Metadaten)
```
## Fehler und Root Causes
### 1. Doppelte Ingestion = Doppelte Controls
**Problem:** Gleiche PDFs unter verschiedenen Namen ingestiert (z.B. "Maschinenverordnung" und "Verordnung (EU) 2023/1230") → unterschiedliche Chunks (anderes Chunking) → anderer Hash → doppelt verarbeitet → doppelte Controls.
**Root Cause:**
- `regulation_name` aus Chunk-Metadaten statt aus kanonischer Quelle
- UNIQUE-Constraint nur `(chunk_hash, collection, document_version)` — nicht global
- Kein Check ob `regulation_code` bereits in einer Collection existiert
**Fix (implementiert):**
- `REGULATION_LICENSE_MAP` enthält jetzt kanonische `name`-Werte die den DB-Einträgen entsprechen
- `source_citation.source` wird aus `REGULATION_LICENSE_MAP.name` genommen, NICHT aus `chunk.regulation_name`
- Phase 5 Cleanup: 3.301 Duplikate hart gelöscht
**Fix (noch offen):**
- Chunk-Hash UNIQUE Constraint global machen: `(chunk_hash, document_version)` statt `(chunk_hash, collection, document_version)`
- Vor Ingestion: Check ob `regulation_code` bereits in einer Collection existiert
### 2. Chunks verlieren Strukturinformation
**Problem:** Chunks werden mitten im Absatz abgeschnitten. § und Artikelnummern fehlen in den Chunk-Metadaten. Kontext des Kapitels/Abschnitts geht verloren.
**Root Cause:**
- `chunk_strategy=recursive` mit `chunk_size=512, chunk_overlap=50` — zu kleine Chunks
- Chunking beachtet keine Dokumentstruktur (Artikel-/Paragraphengrenzen)
- Keine Einleitung/Kapitelkontext als Prefix
**Empfehlung für Re-Ingestion:**
- **Strukturiertes Chunking:** Chunks an Artikel-/Paragraphengrenzen schneiden
- **Kontext-Prefix:** Kapiteleinleitung und übergeordnete Struktur mitliefern
- **Metadaten anreichern:** `article`, `paragraph`, `article_type`, `section_hierarchy`
- **Größere Chunks:** Mindestens 1024 Tokens, besser volle Artikel/Paragraphen
### 3. Cross-Collection-Duplikate
**Problem:** `nist_csf_2_0` in `bp_compliance_ce` (67 Chunks) UND `bp_compliance_datenschutz` (162 Chunks). EU-Verordnungen sowohl in `bp_compliance_ce` als auch `bp_compliance_gesetze`.
**Root Cause:** Keine Collection-Zuordnungsregeln. Manuelle Zuweisung bei Ingestion.
**Fix:** `cleanup-qdrant-duplicates.py` Script bereinigt Cross-Collection-Duplikate.
**Empfehlung:** Klare Collection-Zuordnungsregeln:
- `bp_compliance_ce` = EU-Verordnungen + internationale Standards
- `bp_compliance_gesetze` = Deutsche + österreichische Gesetze (NUR nationale Gesetze)
- `bp_compliance_datenschutz` = EDPB/WP29 Leitlinien + Privacy Frameworks
### 4. OWASP Multilingual Controls
**Problem:** 324 OWASP Top 10 Controls in ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien. Kein Mehrwert, aber 324 doppelte Controls generiert.
**Root Cause:** Multilingual PDFs/GitHub-Quellen ohne Spracherkennung ingestiert.
**Fix:** 324 als `duplicate` markiert und gelöscht.
**Empfehlung:** Bei Ingestion Spracherkennung + Deduplizierung. Nur DE + EN behalten.
### 5. Fehlende Artikel/Paragraph-Extraktion
**Problem:** Chunks haben `article` und `paragraph` oft leer oder falsch. Die LLM-basierte Extraktion bei der Control-Generierung ist unzuverlässig.
**Root Cause:** Ingestion-Pipeline extrahiert keine Strukturinformation aus dem PDF.
**Fix (implementiert):** PDF-QA-Pipeline (`pdf_qa_all.py`) matched `source_original_text` gegen Original-PDFs und extrahiert korrekte Artikel/Paragraphen — 86% Match-Rate.
**Empfehlung:** Bei Re-Ingestion direkt in den Chunk-Metadaten speichern.
### 6. Job-Tracking nicht persistent
**Problem:** Generation-Jobs laufen als Background-Tasks. Kein Logging, welche Chunks verarbeitet, Status nur über API abfragbar. Bei API-Timeout oder Restart geht der Fortschritt verloren.
**Root Cause:** `asyncio.create_task()` hat keinen Recovery-Mechanismus.
**Fix (teilweise):** `canonical_generation_jobs` Tabelle trackt Jobs. `canonical_processed_chunks` markiert verarbeitete Chunks.
**Empfehlung:**
- Job-Log in DB persistieren (nicht nur stdout)
- Fortschritt in `canonical_generation_jobs.progress` als JSONB speichern
- Chunk-Level-Status: verarbeitet / übersprungen / Fehler
- Recovery-Fähigkeit: Job kann von letztem Checkpoint fortgesetzt werden
## Empfohlene Metadaten für Re-Ingestion
### Chunk-Level Metadaten (Qdrant Payload)
```json
{
"chunk_text": "...",
"regulation_code": "eu_2016_679",
"regulation_name_de": "DSGVO (EU) 2016/679",
"regulation_name_en": "GDPR (EU) 2016/679",
"article": "25",
"article_title": "Datenschutz durch Technikgestaltung und datenschutzfreundliche Voreinstellungen",
"article_type": "article",
"paragraph": "1",
"section_hierarchy": ["Kapitel IV", "Abschnitt 2", "Artikel 25"],
"chapter_context": "Kapitel IV — Verantwortlicher und Auftragsverarbeiter",
"pages": [45, 46],
"effective_date": "2018-05-25",
"publication_date": "2016-04-27",
"document_version": "2016-04-27",
"source_language": "de",
"source_url": "https://eur-lex.europa.eu/...",
"celex": "32016R0679",
"license": "EU_LAW",
"license_rule": 1,
"source_type": "law",
"category": "datenschutz",
"chunk_position": 42,
"total_chunks": 423
}
```
### Dokument-Level Metadaten (Corpus Version)
```json
{
"regulation_code": "eu_2016_679",
"canonical_name_de": "DSGVO (EU) 2016/679",
"canonical_name_en": "GDPR (EU) 2016/679",
"document_type": "eu_regulation",
"effective_date": "2018-05-25",
"publication_date": "2016-04-27",
"supersedes": null,
"superseded_by": null,
"source_pdf": "gdpr_regulation_eu_2016_679.pdf",
"source_pdf_sha256": "abc123...",
"total_articles": 99,
"total_recitals": 173,
"total_annexes": 0,
"ingestion_date": "2026-03-20",
"ingestion_version": "v2"
}
```
## Pipeline-Härtung Checkliste
### Vor Ingestion
- [ ] Prüfen ob `regulation_code` bereits in einer Collection existiert
- [ ] PDF-SHA256 gegen bekannte PDFs prüfen (Duplikat-Erkennung)
- [ ] `regulation_name` aus `REGULATION_LICENSE_MAP` verwenden, NICHT aus Chunk-Metadaten
- [ ] Spracherkennung: Nur DE + EN ingestieren
- [ ] Dokument-Metadaten (effective_date, publication_date) recherchieren
### Während Ingestion
- [ ] Strukturiertes Chunking an Artikel-/Paragraphengrenzen
- [ ] Kontext-Prefix mit Kapiteleinleitung
- [ ] Chunk-Metadaten anreichern (article, paragraph, article_type, section_hierarchy)
- [ ] Fortschritt in DB loggen
### Nach Ingestion
- [ ] Chunk-Count pro `regulation_code` prüfen (Sanity Check)
- [ ] PDF-QA gegen Original-PDF laufen lassen
- [ ] Cross-Collection-Duplikat-Check
- [ ] Corpus-Version in DB eintragen
### Control-Generierung
- [ ] `source_citation.source` aus `REGULATION_LICENSE_MAP.name`, NICHT aus Chunk-Metadaten
- [ ] Harmonisierung: Threshold 0.85 für Duplikate innerhalb gleicher `regulation_code`
- [ ] Cross-Regulation-Harmonisierung bei ähnlichen Themen (z.B. DSGVO Art. 25 ↔ NIS2 Art. 21)
- [ ] Job-Fortschritt persistent in DB speichern
## Workflow: Mac Mini → Production Sync
```
1. Mac Mini: PDF → Qdrant (lokal, http://macmini:6333)
2. Mac Mini: Control-Generierung → PostgreSQL (shared, 46.225.100.82:54321)
3. QA: PDF-Match, Dedup, Source-Normalisierung
4. Qdrant Migration: macmini:6333 → qdrant-dev.breakpilot.ai (scripts/migrate-qdrant.py)
5. Deploy: git push gitea → Coolify Build + Deploy
```
**WICHTIG:** PostgreSQL ist SHARED — Änderungen auf Mac Mini sind sofort in Production sichtbar. Qdrant hat getrennte Instanzen (lokal + production) und muss manuell synchronisiert werden.
## Scripts
| Script | Beschreibung |
|---|---|
| `scripts/ingest-phase-h.sh` | Haupt-Ingestion: 38 Dokumente → Qdrant |
| `scripts/cleanup-qdrant-duplicates.py` | Qdrant Duplikat-Cleanup (8 Schritte) |
| `scripts/migrate-qdrant.py` | Qdrant Migration: lokal → production |
| `scripts/qa/phase5_normalize_and_cleanup.py` | DB Normalisierung + Hard Delete |
| `scripts/qa/pdf_qa_all.py` | PDF-Match QA |

View File

@@ -96,6 +96,7 @@ erDiagram
varchar verification_method
varchar target_audience
varchar generation_strategy
varchar obligation_type
smallint pipeline_version
integer license_rule
jsonb source_citation
@@ -936,9 +937,11 @@ Drei Kompositions-Modi:
Zerlegt Rich Controls in atomare Controls. Laeuft VOR den Migration Passes 1-5.
#### Pass 0a — Obligation Extraction
#### Pass 0a — Obligation Extraction + 3-Tier-Klassifizierung
Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM.
Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM (Claude Sonnet 4.6).
Jede Obligation wird als **pflicht**, **empfehlung** oder **kann** klassifiziert — nichts wird
wegen fehlendem normativem Signal abgelehnt.
**6 Guardrails:**
@@ -949,23 +952,37 @@ Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM.
5. Nicht auf Evidence-Ebene zerlegen
6. Parent-Link immer erhalten
**Quality Gate:** Jeder Kandidat wird gegen 6 Kriterien geprueft:
**3-Tier Obligation Classification:**
- `has_normative_signal` — Normatives Sprachsignal erkannt
- `single_action` — Nur eine Handlung
- `not_rationale` — Keine blosse Begruendung
- `not_evidence_only` — Kein reines Evidence-Fragment
- `min_length` — Mindestlaenge erreicht
- `has_parent_link` — Referenz zum Rich Control
| obligation_type | Signal-Beispiele | Bedeutung |
|---|---|---|
| `pflicht` | müssen, ist zu, shall, must, required | Gesetzliche/regulatorische Pflicht |
| `empfehlung` | soll, should, sicherstellen, dokumentieren | Best Practice, freiwillig |
| `kann` | kann, darf, may, optional | Optionale Massnahme |
Kritische Checks: `has_normative_signal`, `not_evidence_only`, `min_length`, `has_parent_link`
Obligations ohne erkennbares Signal werden als `empfehlung` klassifiziert (nicht rejected).
Empfehlungen helfen Firmen, Systeme ueber das Pflichtprogramm hinaus zu sichern.
**Quality Gate — Kritische Checks:**
| Flag | Kritisch? | Beschreibung |
|---|---|---|
| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) |
| `not_evidence_only` | **Ja** | Kein reines Evidence-Fragment |
| `min_length` | **Ja** | Mindestlaenge (20 Zeichen) |
| `has_parent_link` | **Ja** | Referenz zum Rich Control |
| `has_normative_signal` | Nein | Informativer Check (nicht mehr Ablehnungsgrund) |
| `single_action` | Nein | Nur eine Handlung (heuristisch) |
| `not_rationale` | Nein | Keine blosse Begruendung |
#### Pass 0b — Atomic Control Composition
Erstellt aus jedem validierten Obligation Candidate ein atomares Control
(LLM-gestuetzt mit Template-Fallback).
(LLM-gestuetzt mit Template-Fallback). Das `obligation_type` Feld wird
vom Parent-Obligation uebernommen.
**Datei:** `compliance/services/decomposition_pass.py`
**Test-Script:** `scripts/qa/test_pass0a.py` (standalone, speichert JSON)
---
@@ -1012,11 +1029,13 @@ Die Crosswalk-Matrix bildet diese N:M-Beziehung ab.
**Migration 061:** Decomposition-Tabellen
| Tabelle | Beschreibung |
| Tabelle / Feld | Beschreibung |
|---------|-------------|
| `obligation_candidates` | Extrahierte atomare Pflichten aus Rich Controls |
| `obligation_candidates.obligation_type` | `pflicht` / `empfehlung` / `kann` (3-Tier-Klassifizierung) |
| `canonical_controls.parent_control_uuid` | Self-Referenz zum Rich Control (neues Feld) |
| `canonical_controls.decomposition_method` | Zerlegungsmethode (neues Feld) |
| `canonical_controls.obligation_type` | Uebernommen von Obligation: pflicht/empfehlung/kann |
---

View File

@@ -567,7 +567,86 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/
---
## Pass 0a/0b: Atomare Control-Zerlegung
Die Pipeline v3 erweitert die 7-Stufen-Pipeline um einen Vor-Pass, der Rich Controls in atomare Controls zerlegt.
### Pass 0a: Obligation Extraction
Extrahiert individuelle normative Pflichten aus Rich Controls via LLM.
```mermaid
flowchart LR
A[Rich Control] -->|LLM| B[Obligations]
B --> C{Quality Gate}
C -->|Pass| D[validated]
C -->|Fail| E[rejected]
```
**3-Tier Klassifikation:**
| Typ | Erkennungsmuster | Beispiel |
|-----|-----------------|---------|
| **Pflicht** | muss, ist verpflichtet, hat sicherzustellen | "Der Verantwortliche MUSS ein Verzeichnis fuehren" |
| **Empfehlung** | soll, sollte, wird empfohlen | "Es SOLLTE eine Risikobewertung durchgefuehrt werden" |
| **Kann** | kann, darf, ist berechtigt | "Die Aufsichtsbehoerde KANN Geldbussen verhaengen" |
**Quality Gate (6 Regeln):**
1. Nur normative Aussagen (muss, sicherzustellen, verpflichtet)
2. Ein Hauptverb pro Obligation
3. Test-Obligations separat von operativen
4. Reporting-Obligations separat
5. Nicht auf Evidence-Ebene splitten
6. Parent-Link immer erhalten
### Pass 0b: Atomic Control Composition
Verwandelt jede validierte Obligation in ein eigenstaendiges atomares Control.
```mermaid
flowchart LR
A[Obligation] -->|LLM| B[Atomic Control]
B -->|Dedup Check| C{4-Stage Dedup}
C -->|NEW| D[Insert + Index]
C -->|LINK| E[Parent-Link]
C -->|REVIEW| F[Review-Queue]
```
**Konfiguration:**
| Variable | Default | Beschreibung |
|----------|---------|-------------|
| `DECOMPOSITION_LLM_MODEL` | `claude-sonnet-4-6` | LLM fuer Pass 0a/0b |
| `DECOMPOSITION_BATCH_SIZE` | `5` | Obligations pro LLM-Call |
| `DECOMPOSITION_LLM_TIMEOUT` | `120` | Timeout in Sekunden |
**Ergebnisse (Stand 2026-03-21):**
| Metrik | Wert |
|--------|------|
| Rich Controls (technisch) | ~6.800 |
| Atomare Controls (bisher) | 30 (PoC: 10x CRYP, AUTH, SEC) |
| Ziel nach Full Run | ~18.000 unique Master Controls |
| Obligations pro Rich Control | ~10 |
| Dedup-Reduktion erwartet | ~70% |
### Quelldateien (Pass 0a/0b)
| Datei | Beschreibung |
|-------|-------------|
| `compliance/services/decomposition_pass.py` | Pass 0a + 0b Logik |
| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine |
| `migrations/061_obligation_candidates.sql` | Obligation-Tabelle |
| `migrations/074_control_dedup.sql` | Dedup-Tabellen (Parent-Links, Review-Queue) |
| `tests/test_decomposition_pass.py` | 90 Tests |
| `tests/test_control_dedup.py` | 56 Tests |
---
## Verwandte Dokumentation
- [Canonical Control Library (CP-CLIB)](canonical-control-library.md) — Domains, Datenmodell, Too-Close-Detektor, CI/CD Validation
- [Deduplizierungs-Engine](dedup-engine.md) — 4-Stufen Dedup, Multi-Parent-Linking, Review-Queue
- [RAG Pipeline Benchmark](../../development/rag-pipeline-benchmark.md) — State-of-the-Art Vergleich, Optimierungsempfehlungen
- [Multi-Layer Control Architecture](canonical-control-library.md#multi-layer-control-architecture) — 10-Stage Pipeline-Erweiterung mit Obligations, Patterns, Crosswalk

View File

@@ -0,0 +1,253 @@
# Deduplizierungs-Engine (Control Dedup)
4-stufige Dedup-Pipeline zur Vermeidung doppelter atomarer Controls bei der Pass 0b Komposition. Kern-USP: **"1 Control erfuellt 5 Gesetze"** durch Multi-Parent-Linking.
**Backend:** `backend-compliance/compliance/services/control_dedup.py`
**Migration:** `backend-compliance/migrations/074_control_dedup.sql`
**Tests:** `backend-compliance/tests/test_control_dedup.py` (56 Tests)
---
## Motivation
Aus ~6.800 technischen Controls x ~10 Obligations pro Control entstehen ~68.000 atomare Kandidaten. Ziel: ~18.000 einzigartige Master Controls. Viele Obligations aus verschiedenen Gesetzen fuehren zum gleichen technischen Control (z.B. "MFA implementieren" in DSGVO, NIS2, AI Act).
**Problem:** Embedding-only Deduplizierung ist GEFAEHRLICH fuer Compliance.
!!! danger "False-Positive Beispiel"
- "Admin-Zugriffe muessen MFA nutzen" vs. "Remote-Zugriffe muessen MFA nutzen"
- Embedding sagt >0.9 aehnlich
- Aber es sind **ZWEI verschiedene Controls** (verschiedene Objekte!)
---
## 4-Stufen Entscheidungsbaum
```mermaid
flowchart TD
A[Kandidat-Control] --> B{Pattern-Gate}
B -->|pattern_id verschieden| N1[NEW CONTROL]
B -->|pattern_id gleich| C{Action-Check}
C -->|Action verschieden| N2[NEW CONTROL]
C -->|Action gleich| D{Object-Normalization}
D -->|Objekt verschieden| E{Similarity > 0.95?}
E -->|Ja| L1[LINK]
E -->|Nein| N3[NEW CONTROL]
D -->|Objekt gleich| F{Tiered Thresholds}
F -->|> 0.92| L2[LINK]
F -->|0.85 - 0.92| R[REVIEW QUEUE]
F -->|< 0.85| N4[NEW CONTROL]
```
### Stufe 1: Pattern-Gate (hart)
`pattern_id` muss uebereinstimmen. Verhindert ~80% der False Positives.
```python
if pattern_id != existing.pattern_id:
NEW CONTROL # Verschiedene Kontrollmuster = verschiedene Controls
```
### Stufe 2: Action-Check (hart)
Normalisierte Aktionsverben muessen uebereinstimmen. "Implementieren" vs. "Testen" = verschiedene Controls, auch bei gleichem Objekt.
```python
if normalize_action("implementieren") != normalize_action("testen"):
NEW CONTROL # "implement" != "test"
```
**Action-Normalisierung (Deutsch → Englisch):**
| Deutsche Verben | Kanonische Form |
|----------------|-----------------|
| implementieren, umsetzen, einrichten, aktivieren | `implement` |
| testen, pruefen, ueberpruefen, verifizieren | `test` |
| ueberwachen, monitoring, beobachten | `monitor` |
| verschluesseln | `encrypt` |
| protokollieren, aufzeichnen, loggen | `log` |
| beschraenken, einschraenken, begrenzen | `restrict` |
### Stufe 3: Object-Normalization (weich)
Compliance-Objekte werden auf kanonische Token normalisiert.
```python
normalize_object("Admin-Konten") "privileged_access"
normalize_object("Remote-Zugriff") "remote_access"
normalize_object("MFA") "multi_factor_auth"
```
Bei verschiedenen Objekten gilt ein hoeherer Schwellenwert (0.95 statt 0.92).
**Objekt-Normalisierung:**
| Eingabe | Kanonischer Token |
|---------|------------------|
| MFA, 2FA, Multi-Faktor-Authentifizierung | `multi_factor_auth` |
| Admin-Konten, privilegierte Zugriffe | `privileged_access` |
| Verschluesselung, Kryptografie | `encryption` |
| Schluessel, Key Management | `key_management` |
| TLS, SSL, HTTPS | `transport_encryption` |
| Firewall | `firewall` |
| Audit-Log, Protokoll, Logging | `audit_logging` |
### Stufe 4: Embedding Similarity (Qdrant)
Tiered Thresholds basierend auf Cosine-Similarity:
| Score | Verdict | Aktion |
|-------|---------|--------|
| > 0.95 | **LINK** | Bei verschiedenen Objekten |
| > 0.92 | **LINK** | Parent-Link hinzufuegen |
| 0.85 - 0.92 | **REVIEW** | In Review-Queue zur manuellen Pruefung |
| < 0.85 | **NEW** | Neues Control anlegen |
---
## Canonicalization Layer
Vor dem Embedding wird der deutsche Compliance-Text in normalisiertes Englisch transformiert:
```
"Administratoren muessen MFA verwenden"
→ "implement multi_factor_auth for administratoren verwenden"
→ Bessere Matches, weniger Embedding-Rauschen
```
Dies reduziert das Rauschen durch synonyme Formulierungen in verschiedenen Gesetzen.
---
## Multi-Parent-Linking (M:N)
Ein atomares Control kann mehrere Eltern-Controls aus verschiedenen Regulierungen haben:
```json
{
"control_id": "AUTH-1072-A01",
"parent_links": [
{"parent_control_id": "AUTH-1001", "source": "NIST IA-02(01)", "link_type": "decomposition"},
{"parent_control_id": "NIS2-045", "source": "NIS2 Art. 21", "link_type": "dedup_merge"}
]
}
```
### Datenbank-Schema
```sql
-- Migration 074: control_parent_links (M:N)
CREATE TABLE control_parent_links (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition',
confidence NUMERIC(3,2) DEFAULT 1.0,
source_regulation VARCHAR(100),
source_article VARCHAR(100),
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
created_at TIMESTAMPTZ DEFAULT NOW(),
CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
);
```
**Link-Typen:**
| Typ | Bedeutung |
|-----|-----------|
| `decomposition` | Aus Pass 0b Zerlegung |
| `dedup_merge` | Durch Dedup-Engine als Duplikat erkannt |
| `manual` | Manuell durch Reviewer verknuepft |
| `crosswalk` | Aus Crosswalk-Matrix uebernommen |
---
## Review-Queue
Borderline-Matches (Similarity 0.85-0.92) werden in die Review-Queue geschrieben:
```sql
-- Migration 074: control_dedup_reviews
CREATE TABLE control_dedup_reviews (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
candidate_control_id VARCHAR(30) NOT NULL,
candidate_title TEXT NOT NULL,
candidate_objective TEXT,
matched_control_uuid UUID REFERENCES canonical_controls(id),
matched_control_id VARCHAR(30),
similarity_score NUMERIC(4,3),
dedup_stage VARCHAR(40) NOT NULL,
review_status VARCHAR(20) DEFAULT 'pending',
-- pending → accepted_link | accepted_new | rejected
created_at TIMESTAMPTZ DEFAULT NOW()
);
```
---
## Qdrant Collection
```
Collection: atomic_controls
Dimension: 1024 (bge-m3)
Distance: COSINE
Payload: pattern_id, action_normalized, object_normalized, control_id, canonical_text
Index: pattern_id (keyword), action_normalized (keyword), object_normalized (keyword)
Query: IMMER mit filter: pattern_id == X (reduziert Suche drastisch)
```
---
## Integration in Pass 0b
Die Dedup-Engine ist optional in `DecompositionPass` integriert:
```python
decomp = DecompositionPass(db=session, dedup_enabled=True)
stats = await decomp.run_pass0b(limit=100, use_anthropic=True)
# Stats enthalten Dedup-Metriken:
# stats["dedup_linked"] = 15 (Duplikate → Parent-Link)
# stats["dedup_review"] = 3 (Borderline → Review-Queue)
# stats["controls_created"] = 82 (Neue Controls)
```
**Ablauf bei Pass 0b mit Dedup:**
1. LLM generiert atomares Control
2. Dedup-Engine prueft 4 Stufen
3. **LINK:** Kein neues Control, Parent-Link zu bestehendem
4. **REVIEW:** Kein neues Control, Eintrag in Review-Queue
5. **NEW:** Control anlegen + in Qdrant indexieren
---
## Konfiguration
| Umgebungsvariable | Default | Beschreibung |
|-------------------|---------|-------------|
| `DEDUP_ENABLED` | `true` | Dedup-Engine ein/ausschalten |
| `DEDUP_LINK_THRESHOLD` | `0.92` | Schwelle fuer automatisches Linking |
| `DEDUP_REVIEW_THRESHOLD` | `0.85` | Schwelle fuer Review-Queue |
| `DEDUP_LINK_THRESHOLD_DIFF_OBJ` | `0.95` | Schwelle bei verschiedenen Objekten |
| `DEDUP_QDRANT_COLLECTION` | `atomic_controls` | Qdrant-Collection fuer Dedup-Index |
| `QDRANT_URL` | `http://host.docker.internal:6333` | Qdrant-URL |
| `EMBEDDING_URL` | `http://embedding-service:8087` | Embedding-Service-URL |
---
## Quelldateien
| Datei | Beschreibung |
|-------|-------------|
| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine |
| `compliance/services/decomposition_pass.py` | Pass 0a/0b mit Dedup-Integration |
| `migrations/074_control_dedup.sql` | DB-Schema (parent_links, review_queue) |
| `tests/test_control_dedup.py` | 56 Unit-Tests |
---
## Verwandte Dokumentation
- [Control Generator Pipeline](control-generator-pipeline.md) — 7-Stufen RAG→Control Pipeline
- [Canonical Control Library](canonical-control-library.md) — Datenmodell, Domains, Similarity-Detektor

View File

@@ -107,6 +107,7 @@ nav:
- Policy-Bibliothek (29 Richtlinien): services/sdk-modules/policy-bibliothek.md
- Canonical Control Library (CP-CLIB): services/sdk-modules/canonical-control-library.md
- Control Generator Pipeline: services/sdk-modules/control-generator-pipeline.md
- Deduplizierungs-Engine: services/sdk-modules/dedup-engine.md
- Control Provenance Wiki: services/sdk-modules/control-provenance.md
- Strategie:
- Wettbewerbsanalyse & Roadmap: strategy/wettbewerbsanalyse.md
@@ -115,3 +116,5 @@ nav:
- Dokumentation: development/documentation.md
- CI/CD Pipeline: development/ci-cd-pipeline.md
- QA Control Quality: development/qa-control-quality.md
- RAG Pipeline Lessons Learned: development/rag-pipeline-lessons-learned.md
- RAG Pipeline Benchmark: development/rag-pipeline-benchmark.md

View File

@@ -1,11 +1,29 @@
"""Apply PDF QA results: update source_citation with correct article + article_type."""
"""
Apply PDF QA results: update source_citation with correct article_type + article.
Safety modes:
--safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
--dry-run: Show what would change without writing.
Usage:
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
"""
import os
import sys
import json
import psycopg2
import urllib.parse
from collections import Counter
RESULTS_FILE = "/tmp/pdf_qa_results.json"
# Parse args
dry_run = "--dry-run" in sys.argv
force_article = "--force-article" in sys.argv
# Load results
with open(RESULTS_FILE) as f:
results = json.load(f)
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
options="-c search_path=compliance,public"
)
# Update in batches
# Load current DB state for all affected controls
cur = conn.cursor()
updated = 0
ctrl_ids = [r["ctrl_id"] for r in results]
cur.execute("""
SELECT id,
source_citation->>'article' as article,
source_citation->>'article_type' as article_type,
source_citation->>'source' as source
FROM compliance.canonical_controls
WHERE id = ANY(%s::uuid[])
""", (ctrl_ids,))
db_state = {}
for row in cur.fetchall():
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
# Counters
stats = Counter()
updated_type = 0
updated_article = 0
updated_recital = 0
errors = 0
unchanged = 0
for i, r in enumerate(results):
ctrl_id = r["ctrl_id"]
article_label = r["article_label"]
article_type = r["article_type"] # preamble, article, annex, section, unknown
new_article = r["article_label"]
new_type = r["article_type"]
db = db_state.get(ctrl_id, {})
if not db:
stats["missing_in_db"] += 1
continue
old_type = db.get("article_type")
old_article = db.get("article", "").strip()
# Decide what to update
set_type = (old_type != new_type)
set_article = (not old_article) or (force_article and old_article != new_article)
set_recital = (new_type == "preamble")
if set_type:
stats["type_" + ("new" if not old_type else "changed")] += 1
else:
stats["type_unchanged"] += 1
if not old_article and set_article:
stats["article_new"] += 1
elif old_article and old_article != new_article:
if force_article:
stats["article_force_changed"] += 1
else:
stats["article_skipped"] += 1
else:
stats["article_unchanged"] += 1
if set_recital:
stats["recital"] += 1
if dry_run:
continue
try:
# Update source_citation: set article and article_type
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = source_citation
|| jsonb_build_object('article', %s, 'article_type', %s),
updated_at = now()
WHERE id = %s::uuid
AND (
source_citation->>'article' IS DISTINCT FROM %s
OR source_citation->>'article_type' IS DISTINCT FROM %s
)
""", (article_label, article_type, ctrl_id, article_label, article_type))
# Build JSONB update
updates = {}
if set_type:
updates["article_type"] = new_type
if set_article:
updates["article"] = new_article
if cur.rowcount > 0:
updated += 1
else:
unchanged += 1
if updates:
# Merge into source_citation
cur.execute("""
UPDATE compliance.canonical_controls
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
updated_at = now()
WHERE id = %s::uuid
""", (json.dumps(updates), ctrl_id))
if set_type:
updated_type += 1
if set_article:
updated_article += 1
# Mark preamble as recital_suspect
if set_recital:
cur.execute("""
UPDATE compliance.canonical_controls
SET generation_metadata = jsonb_set(
COALESCE(generation_metadata, '{}'::jsonb),
'{recital_suspect}',
'true'::jsonb
),
updated_at = now()
WHERE id = %s::uuid
""", (ctrl_id,))
updated_recital += 1
except Exception as e:
errors += 1
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
conn.rollback()
continue
if (i + 1) % 500 == 0:
if (i + 1) % 1000 == 0:
conn.commit()
print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
print(f" Progress: {i+1}/{len(results)}")
conn.commit()
print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
if not dry_run:
conn.commit()
mode = "DRY-RUN" if dry_run else "APPLIED"
print(f"\n{'='*60}")
print(f" Mode: {mode}")
print(f"{'='*60}")
print(f"\n article_type:")
print(f" New (was NULL): {stats['type_new']:5d}")
print(f" Changed: {stats['type_changed']:5d}")
print(f" Unchanged: {stats['type_unchanged']:5d}")
print(f"\n article:")
print(f" New (was empty): {stats['article_new']:5d}")
if force_article:
print(f" Force-changed: {stats['article_force_changed']:5d}")
else:
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
print(f" Unchanged: {stats['article_unchanged']:5d}")
print(f"\n Preamble/Recital: {stats['recital']:5d}")
print(f" Missing in DB: {stats['missing_in_db']:5d}")
if not dry_run:
print(f"\n Updates written:")
print(f" article_type: {updated_type:5d}")
print(f" article: {updated_article:5d}")
print(f" recital_suspect: {updated_recital:5d}")
print(f" Errors: {errors:5d}")
# Verify: count by article_type
cur.execute("""

View File

@@ -0,0 +1,524 @@
#!/usr/bin/env python3
"""
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.
Tests 5 representative gap articles from different sources.
Measures: quality (JSON valid, fields complete), response time, cost estimate.
Usage:
python3 benchmark_llm_controls.py
"""
import json
import time
import sys
import os
import requests
from pathlib import Path
# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY FIX: the LiteLLM API key was hard-coded here, i.e. a credential
# committed to the repository. Read it from the environment (same pattern as
# ANTHROPIC_API_KEY below) — and rotate the previously committed key.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
# Directory containing the regulation PDFs to extract article text from.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
try:
    import fitz  # PyMuPDF — optional; extract_article_text() returns "" without it
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
# ── Prompts (identical to control_generator.py) ─────────────────────
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Build the user prompt for control generation from one law article.

    Mirrors the prompt used by control_generator.py so benchmark results are
    comparable. The article text is truncated to 3000 chars to bound tokens.

    :param source_name: Human-readable source, e.g. "DSGVO (EU) 2016/679".
    :param article_label: Article/section reference, e.g. "Artikel 32" or "§ 26".
    :param article_text: Raw extracted article text (truncated to 3000 chars).
    :param license_type: License tag (e.g. "EU_LAW") shown to the model to
        signal that the original text may be reused.
    :returns: Complete German-language prompt string.
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)
- category: Inhaltliche Kategorie
- target_audience: Liste der Zielgruppen
- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF Text Extraction ─────────────────────────────────────────────
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a PDF in PDF_DIR.

    :param pdf_file: File name inside PDF_DIR.
    :param article_label: Heading to locate, e.g. "Artikel 32", "§ 26", "AC-2".
    :param doc_type: One of "eu_regulation", "de_law", "nist", or anything else
        for a generic substring search.
    :returns: Up to 3000 chars of article text; "" when the PDF is missing or
        PyMuPDF is unavailable; a bracketed "[... nicht im PDF gefunden]"
        marker when the heading is not found. Callers detect failures via
        ``startswith("[")``, so every error marker must begin with "[".
    """
    import re
    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""
    # Concatenate all page text; page breaks become newlines.
    doc = fitz.open(str(path))
    full_text = ""
    for page in doc:
        full_text += page.get_text() + "\n"
    doc.close()
    # Find article boundaries
    if doc_type == "eu_regulation":
        # EU regulations use "Artikel N" headings on their own line.
        art_num = re.search(r'\d+', article_label)
        if not art_num:
            return ""
        num = int(art_num.group())
        # Find start of this article
        match = re.search(rf'\nArtikel\s+{num}\s*\n', full_text)
        if not match:
            return f"[Artikel {num} nicht im PDF gefunden]"
        start = match.start()
        # Find start of next article (fallback: fixed 5000-char window)
        next_match = re.search(rf'\nArtikel\s+{num + 1}\s*\n', full_text)
        end = next_match.start() if next_match else start + 5000
        text = full_text[start:end].strip()
        return text[:3000]
    elif doc_type == "de_law":
        # German laws use "§ N" headings.
        para_num = re.search(r'\d+', article_label)
        if not para_num:
            return ""
        num = int(para_num.group())
        # FIX: the pattern and messages had lost their "§ "/"[" prefixes
        # (encoding mangling), so the not-found marker did not start with
        # "[" and slipped past the caller's startswith("[") error check.
        # NOTE(review): restored as "§\s+N" — confirm against the BDSG PDF.
        match = re.search(rf'§\s+{num}\b', full_text)
        if not match:
            return f"[§ {num} nicht im PDF gefunden]"
        start = match.start()
        next_match = re.search(rf'§\s+{num + 1}\b', full_text)
        end = next_match.start() if next_match else start + 5000
        text = full_text[start:end].strip()
        return text[:3000]
    elif doc_type == "nist":
        # Find NIST control family identifier at line start.
        match = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        return full_text[start:start + 3000].strip()
    else:
        # Generic section search: label anywhere on a line.
        match = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        return full_text[start:start + 3000].strip()
# ── API Calls ────────────────────────────────────────────────────────
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM chat-completions API.

    :returns: 4-tuple ``(response_text, duration_seconds, error, usage)``;
        ``error`` is None on success, ``usage`` is the provider's token-usage
        dict ({} on failure).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }
    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # FIX: this branch previously returned a 3-tuple, which made the
            # caller's 4-value unpacking raise ValueError on any non-200
            # response. All exit paths now return 4 values.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network/timeout/JSON failures: surface the error string, empty usage.
        return "", time.time() - t0, str(e), {}
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    :returns: 4-tuple ``(response_text, duration_seconds, error, usage)``;
        ``error`` is None on success, ``usage`` is the API's token-usage dict
        ({} on failure). (Docstring corrected: this always returns 4 values.)
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    t0 = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=headers, json=payload, timeout=180)
        duration = time.time() - t0
        if resp.status_code != 200:
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        # First content block carries the text completion.
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        # Network/timeout/JSON failures: surface the error string, empty usage.
        return "", time.time() - t0, str(e), {}
# ── Quality Assessment ───────────────────────────────────────────────
# Fields every generated control must contain (40 score points total) and
# optional fields that earn bonus points (15 score points total).
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def assess_quality(raw_text: str) -> dict:
    """Score a control-generation response on a 0-100 scale.

    Parses the model output as JSON (stripping a surrounding markdown code
    fence and falling back to the first ``{...}`` span found), then counts
    required/bonus fields and depth metrics. On unparseable input the report
    is returned with ``json_valid`` False and score 0; on success it also
    carries the parsed object under ``parsed_data``.
    """
    report = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    # Strip a surrounding ``` fence, keeping only the inner lines.
    body = raw_text.strip()
    if body.startswith("```"):
        fence_lines = body.split("\n")
        if fence_lines[-1].startswith("```"):
            body = "\n".join(fence_lines[1:-1])
        else:
            body = "\n".join(fence_lines[1:])

    # Parse JSON; on failure, retry with the first {...} span in the text.
    try:
        parsed = json.loads(body)
    except json.JSONDecodeError:
        import re
        embedded = re.search(r'\{[\s\S]*\}', body)
        if embedded is None:
            return report
        try:
            parsed = json.loads(embedded.group())
        except json.JSONDecodeError:
            return report
    # Arrays are allowed; only the first control is assessed.
    if isinstance(parsed, list):
        parsed = parsed[0] if parsed else {}

    report["json_valid"] = True

    def _filled(value, min_chars):
        # A field counts when it is a non-trivial string or non-empty list.
        if isinstance(value, str):
            return len(value) > min_chars
        if isinstance(value, list):
            return len(value) > 0
        return False

    report["required_fields"] = sum(
        1 for key in REQUIRED_FIELDS if _filled(parsed.get(key), 2))
    report["bonus_fields"] = sum(
        1 for key in BONUS_FIELDS if _filled(parsed.get(key), 0))

    # Depth metrics: list lengths (0 when the field is not a list).
    for field, metric in (("requirements", "requirements_count"),
                          ("test_procedure", "test_procedure_count"),
                          ("evidence", "evidence_count")):
        value = parsed.get(field, [])
        report[metric] = len(value) if isinstance(value, list) else 0
    report["title_length"] = len(parsed.get("title", ""))
    report["objective_length"] = len(parsed.get("objective", ""))

    # Score: 20 for valid JSON, up to 40 for required fields, up to 15 for
    # bonus fields, up to 15 for requirements depth, up to 9 for test steps,
    # plus 1 for a substantial objective.
    total = 20
    total += (report["required_fields"] / report["required_total"]) * 40
    total += (report["bonus_fields"] / report["bonus_total"]) * 15
    total += min(report["requirements_count"], 5) * 3
    total += min(report["test_procedure_count"], 3) * 3
    if report["objective_length"] > 50:
        total += 1
    report["score"] = round(total, 1)
    report["parsed_data"] = parsed
    return report
# ── Test Cases ───────────────────────────────────────────────────────
# Five representative gap articles across sources and document types; each
# entry feeds extract_article_text() (pdf/doc_type) and build_prompt()
# (source/article/license).
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """Run the benchmark over TEST_CASES against both providers and print a summary.

    For each test case: extract the article text from its PDF, build the shared
    prompt, call LiteLLM (gpt-oss-120b) and Anthropic (Claude Sonnet), score
    both responses with assess_quality(), and collect per-test results. Ends
    with aggregate score/time/cost tables and writes detailed results to
    /tmp/benchmark_llm_results.json. Exits(1) when ANTHROPIC_API_KEY is unset
    or the LiteLLM endpoint is unreachable.
    """
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)
    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()
    # Pre-check LiteLLM reachability before burning time on PDF extraction.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)
    results = []
    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']}{tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")
        # Extract article text from PDF; a leading "[" marks a not-found error.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue
        print(f" Text extracted: {len(article_text)} chars")
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")
        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])
        # ── Call LiteLLM ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f" ERROR: {litellm_err}")
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")
        # ── Call Anthropic ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")
        # Compare speed and quality per test.
        print(f"\n --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")
        results.append({
            "test": f"{tc['source']}{tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })
    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")
    if not results:
        print(" Keine Ergebnisse.")
        return
    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]
    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")
    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000
    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")
    # Extrapolate for 494 gap articles
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")
    # Save full results (raw responses truncated to 500 chars each).
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,200 @@
"""Match unmatched Blue Guide controls against the English PDF."""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
try:
import fitz
except ImportError:
print("ERROR: PyMuPDF (fitz) not installed")
exit(1)
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/blue_guide_2022_en.pdf")
def normalize(s):
    """Canonicalise PDF-extracted text for substring matching.

    Drops soft hyphens and zero-width spaces, expands typographic ligatures,
    straightens curly quotes/dashes/bullets, strips control characters,
    applies NFC normalisation, and collapses all whitespace runs to single
    spaces (trimmed at both ends).
    """
    replacements = (
        ('\u00ad', ''), ('\xad', ''),          # soft hyphens
        ('\u200b', ''), ('\u00a0', ' '),       # zero-width space, nbsp
        ('\ufb01', 'fi'), ('\ufb02', 'fl'),    # ligatures
        ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
        ('\u2019', "'"), ('\u2018', "'"),      # curly single quotes
        ('\u201c', '"'), ('\u201d', '"'),      # curly double quotes
        ('\u2013', '-'), ('\u2014', '-'),      # en/em dashes
        ('\u2022', '-'), ('\u00b7', '-'),      # bullets
    )
    for needle, replacement in replacements:
        s = s.replace(needle, replacement)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# Read EN PDF: concatenate all page text, then build a normalized copy that
# the per-control snippet search below runs against.
print(f"Reading {PDF_PATH}...")
doc = fitz.open(PDF_PATH)
text = ""
for page in doc:
    text += page.get_text() + "\n"
doc.close()
print(f" {len(text):,} chars")
text_norm = normalize(text)
# Build article index for EN Blue Guide
# EN Blue Guide uses "Article N" headings (not "Artikel N")
# Each index entry is (char position in raw text, label, type).
items = []
# Find where "Article 1" starts — content before is preamble/intro
art1_match = re.search(r'\nArticle\s+1\s*\n', text)
if not art1_match:
    # Try section-based structure instead
    print(" No 'Article N' headings found, trying section-based index...")
    for m in re.finditer(r'(?:^|\n)\s*(\d+(?:\.\d+)*)\.\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
else:
    # NOTE(review): art1_pos is computed but never used afterwards.
    art1_pos = art1_match.start()
    # Article headings
    for m in re.finditer(r'(?:^|\n)\s*Article\s+(\d+[a-z]?)\s*\n', text, re.MULTILINE):
        # NOTE(review): art_num is computed but never used; the label keeps
        # the raw "\d+[a-z]?" capture instead.
        art_num = int(re.match(r'(\d+)', m.group(1)).group(1))
        items.append((m.start(), f"Article {m.group(1)}", "article"))
    # Annex markers
    for m in re.finditer(r'(?:^|\n)\s*ANNEX\s+([IVXLC]+[a-z]?)\b', text, re.MULTILINE):
        items.append((m.start(), f"Annex {m.group(1)}", "annex"))
    # Also try numbered section headings as fallback
    for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
# Sort by position and keep only the first occurrence of each label.
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
    if label not in seen:
        seen.add(label)
        unique.append((pos, label, typ))
print(f" Index: {len(unique)} sections")
if unique[:5]:
    for pos, label, typ in unique[:5]:
        print(f" {label} [{typ}] @ pos {pos}")
# Precompute normalized positions so snippet hits in text_norm can be mapped
# back to the nearest preceding heading.
index_norm = []
for pos, label, typ in unique:
    norm_pos = len(normalize(text[:pos]))
    index_norm.append((norm_pos, label, typ))
# Connect to DB (DATABASE_URL is a standard postgres:// URL; search_path pins
# the compliance schema).
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get Blue Guide controls without article_type (unmatched)
cur.execute("""
    SELECT id, control_id, title, source_original_text,
           source_citation->>'article' as existing_article,
           source_citation->>'article_type' as existing_type,
           release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'EU Blue Guide 2022'
      AND source_original_text IS NOT NULL
      AND length(source_original_text) > 50
      AND (source_citation->>'article_type' IS NULL)
    ORDER BY control_id
""")
controls = cur.fetchall()
print(f"\nUnmatched Blue Guide controls: {len(controls)}")
# Match each control: look for a snippet of its normalized original text in
# the normalized PDF, trying several start offsets and snippet lengths.
results = []
found = 0
not_found = 0
for ctrl in controls:
    ctrl_id, control_id, title, orig_text, existing_art, existing_type, state = ctrl
    orig_norm = normalize(orig_text)
    if len(orig_norm) < 30:
        not_found += 1
        continue
    matched = False
    # Longer snippets first at each offset; a hit of >= 15 chars counts.
    for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        for length in [80, 60, 40, 30, 20]:
            start = max(0, int(len(orig_norm) * start_frac))
            snippet = orig_norm[start:start+length]
            if not snippet or len(snippet) < 15:
                continue
            pos = text_norm.find(snippet)
            if pos >= 0:
                # Find section: nearest heading at or before the hit position.
                label = "Unknown"
                typ = "unknown"
                for h_pos, h_label, h_type in reversed(index_norm):
                    if h_pos <= pos:
                        label = h_label
                        typ = h_type
                        break
                results.append({
                    "ctrl_id": str(ctrl_id),
                    "control_id": control_id,
                    "source": "EU Blue Guide 2022",
                    "article_label": label,
                    "article_type": typ,
                })
                found += 1
                is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]"
                print(f" {control_id:10s}: {label:25s} [{typ:8s}]{is_active}")
                matched = True
                break
        if matched:
            break
    if not matched:
        not_found += 1
        print(f" {control_id:10s}: NOT FOUND {title[:50]}")
print(f"\n{'='*50}")
print(f"Results: {found} matched, {not_found} not found out of {len(controls)}")
# Save results
out_path = "/tmp/blue_guide_en_results.json"
with open(out_path, 'w') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Saved to {out_path}")
# Apply results to DB: merge article/article_type into source_citation; the
# IS DISTINCT FROM guard makes the update idempotent across reruns.
if results:
    print(f"\nApplying {len(results)} results to DB...")
    applied = 0
    for r in results:
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s::uuid
              AND (source_citation->>'article' IS DISTINCT FROM %s
                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (r["article_label"], r["article_type"],
              r["ctrl_id"], r["article_label"], r["article_type"]))
        if cur.rowcount > 0:
            applied += 1
    conn.commit()
    print(f" Applied: {applied} controls updated")
    # Show type distribution
    type_counts = {}
    for r in results:
        t = r["article_type"]
        type_counts[t] = type_counts.get(t, 0) + 1
    if type_counts:
        print(f"\nArticle type distribution:")
        for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
            print(f" {t:12s}: {c:5d}")
conn.close()

188
scripts/qa/gap_analysis.py Normal file
View File

@@ -0,0 +1,188 @@
"""
Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls.
For each regulation PDF:
1. Extract all articles/sections from the PDF
2. Compare with controls in the DB that reference this article
3. Report gaps (articles with no controls)
Usage:
python3 gap_analysis.py # show all gaps
python3 gap_analysis.py --source "DSGVO" # filter by source
"""
import os
import sys
import json
import re
import psycopg2
import urllib.parse
from pathlib import Path
from collections import defaultdict
# Import from pdf_qa_all
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES
)
# Only analyze sources with significant control counts (skip sources with <5 controls)
MIN_CONTROLS = 5
def main():
    """Report regulation articles that have no controls in the DB.

    Reads active controls (grouped by source and article) from
    compliance.canonical_controls, rebuilds the article index of each
    regulation PDF via the pdf_qa_all helpers, and prints per-source
    coverage plus a gap list.

    CLI: ``--source <substring>`` restricts the run to matching sources
    (case-insensitive) and disables the MIN_CONTROLS filter.

    Side effects: prints the report and writes the full JSON report to
    /tmp/gap_analysis_results.json. Requires DATABASE_URL in the env.
    """
    # Minimal hand-rolled argv parsing for the single optional flag.
    source_filter = None
    if "--source" in sys.argv:
        idx = sys.argv.index("--source")
        if idx + 1 < len(sys.argv):
            source_filter = sys.argv[idx + 1]
    # DB connection
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=parsed.hostname, port=parsed.port or 5432,
        user=parsed.username, password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public"
    )
    cur = conn.cursor()
    # Get all active controls grouped by source with their article reference.
    cur.execute("""
        SELECT source_citation->>'source' as source,
               source_citation->>'article' as article,
               source_citation->>'article_type' as article_type,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        GROUP BY 1, 2, 3
        ORDER BY 1, 2
    """)
    # Build: source -> {article -> (type, count)}
    controls_by_source = defaultdict(dict)
    for source, article, art_type, cnt in cur.fetchall():
        if article:
            controls_by_source[source][article] = (art_type or "unknown", cnt)
    total_gaps = 0
    total_articles_checked = 0
    total_covered = 0
    gap_report = []
    sources_to_check = sorted(SOURCE_FILE_MAP.keys())
    if source_filter:
        sources_to_check = [s for s in sources_to_check if source_filter.lower() in s.lower()]
    for source_name in sources_to_check:
        filename = SOURCE_FILE_MAP.get(source_name)
        if filename is None:
            continue
        controls = controls_by_source.get(source_name, {})
        # Skip low-signal sources unless the user explicitly filtered.
        if len(controls) < MIN_CONTROLS and not source_filter:
            continue
        # Read PDF and build article index
        text = read_file(filename)
        if text is None:
            continue
        doc_type = classify_doc(source_name)
        max_art = MAX_ARTICLES.get(source_name)
        # Dispatch to the index builder matching the document family.
        if doc_type == "eu_regulation":
            index = build_eu_article_index(text, max_article=max_art)
        elif doc_type == "de_law":
            index = build_de_law_index(text)
        elif doc_type == "nist":
            index = build_nist_index(text)
        elif doc_type == "owasp":
            index = build_owasp_index(text, source_name)
        else:
            index = build_generic_index(text)
        if not index:
            continue
        # Only look at substantive articles (not preamble, not annex for gap analysis)
        substantive_types = {"article", "section", "control", "requirement", "category"}
        substantive_articles = [(pos, label, typ) for pos, label, typ in index if typ in substantive_types]
        preamble_articles = [(pos, label, typ) for pos, label, typ in index if typ == "preamble"]
        annex_articles = [(pos, label, typ) for pos, label, typ in index if typ == "annex"]
        # Check which articles have controls
        covered = []
        gaps = []
        for pos, label, typ in substantive_articles:
            if label in controls:
                covered.append(label)
            else:
                gaps.append((label, typ))
        total_articles_checked += len(substantive_articles)
        total_covered += len(covered)
        total_gaps += len(gaps)
        # Count preamble/annex controls
        preamble_controls = sum(1 for a in controls if controls[a][0] == "preamble")
        annex_controls = sum(1 for a in controls if controls[a][0] == "annex")
        coverage_pct = len(covered) / len(substantive_articles) * 100 if substantive_articles else 0
        print(f"\n{'='*70}")
        print(f"{source_name}")
        print(f" PDF articles: {len(substantive_articles)} substantive, "
              f"{len(preamble_articles)} preamble, {len(annex_articles)} annex")
        print(f" DB controls: {sum(v[1] for v in controls.values())} total "
              f"({preamble_controls} preamble, {annex_controls} annex)")
        print(f" Coverage: {len(covered)}/{len(substantive_articles)} "
              f"({coverage_pct:.0f}%)")
        if gaps:
            print(f" GAPS ({len(gaps)}):")
            for label, typ in gaps[:30]:  # limit output
                print(f" - {label} [{typ}]")
            if len(gaps) > 30:
                print(f" ... and {len(gaps)-30} more")
        gap_report.append({
            "source": source_name,
            "total_articles": len(substantive_articles),
            "covered": len(covered),
            "gaps": len(gaps),
            "coverage_pct": round(coverage_pct, 1),
            "gap_articles": [{"label": l, "type": t} for l, t in gaps],
        })
    # Summary
    print(f"\n{'='*70}")
    print("GAP ANALYSIS SUMMARY")
    print(f"{'='*70}")
    # FIX: the original printed len(gap_report) PLUS the count of every source
    # that merely has a file mapped, double-counting analyzed sources.
    # One gap_report entry is appended per analyzed source, so its length
    # is the correct count.
    print(f" Sources analyzed: {len(gap_report)}")
    print(f" Total articles in PDFs: {total_articles_checked}")
    print(f" Articles with controls: {total_covered}")
    print(f" Articles WITHOUT controls: {total_gaps}")
    if total_articles_checked:
        print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")
    print(f"\n Sources with gaps:")
    for r in sorted(gap_report, key=lambda x: -x["gaps"]):
        print(f" {r['source']:45s} {r['gaps']:4d} gaps "
              f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")
    # Save report
    out_path = "/tmp/gap_analysis_results.json"
    with open(out_path, 'w') as f:
        json.dump(gap_report, f, indent=2, ensure_ascii=False)
    print(f"\n Full report saved to {out_path}")
    conn.close()
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,288 @@
"""Analyze NIST OSCAL data and compare with existing controls in DB."""
import os
import re
import json
import psycopg2
import urllib.parse
from collections import defaultdict
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
# ── Load SP 800-53 Rev 5 ──
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
sp853 = json.load(f)["catalog"]
print("=" * 70)
print("NIST SP 800-53 Rev 5 — OSCAL Catalog Analysis")
print("=" * 70)
print(f" UUID: {sp853.get('uuid', '?')}")
print(f" Last Modified: {sp853.get('metadata', {}).get('last-modified', '?')}")
# Count controls
families = sp853.get("groups", [])
total_base = 0
total_enhancements = 0
total_withdrawn = 0
total_active = 0
family_stats = []
for fam in families:
fam_id = fam.get("id", "?")
fam_title = fam.get("title", "?")
controls = fam.get("controls", [])
base = 0
enhancements = 0
withdrawn = 0
for ctrl in controls:
# Check if withdrawn
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
is_withdrawn = props.get("status") == "withdrawn"
if is_withdrawn:
withdrawn += 1
else:
base += 1
# Count enhancements
for enh in ctrl.get("controls", []):
enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
if enh_props.get("status") == "withdrawn":
withdrawn += 1
else:
enhancements += 1
family_stats.append((fam_id, fam_title, base, enhancements, withdrawn))
total_base += base
total_enhancements += enhancements
total_withdrawn += withdrawn
total_active = total_base + total_enhancements
print(f"\n Families: {len(families)}")
print(f" Base Controls: {total_base}")
print(f" Enhancements: {total_enhancements}")
print(f" Withdrawn: {total_withdrawn}")
print(f" TOTAL ACTIVE: {total_active}")
print(f"\n Per Family:")
print(f" {'ID':6s} {'Title':45s} {'Base':>5s} {'Enh':>5s} {'Wdrn':>5s}")
for fam_id, title, base, enh, wdrn in family_stats:
print(f" {fam_id:6s} {title[:45]:45s} {base:5d} {enh:5d} {wdrn:5d}")
# Show example control structure
print(f"\n Example Control (AC-6 Least Privilege):")
for fam in families:
for ctrl in fam.get("controls", []):
if ctrl["id"] == "ac-6":
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
print(f" ID: {ctrl['id']}")
print(f" Label: {props.get('label', '?')}")
print(f" Title: {ctrl['title']}")
for part in ctrl.get("parts", []):
if part.get("name") == "statement":
prose = part.get("prose", "")
print(f" Statement: {prose[:150]}...")
elif part.get("name") == "guidance":
prose = part.get("prose", "")
print(f" Guidance: {prose[:150]}...")
enh_count = len(ctrl.get("controls", []))
print(f" Enhancements: {enh_count}")
links = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]
print(f" Related: {', '.join(links[:8])}...")
break
# ── Load CSF 2.0 ──
print(f"\n{'='*70}")
print("NIST CSF 2.0 — OSCAL Catalog Analysis")
print("=" * 70)
with open(os.path.join(OSCAL_DIR, "csf-2.0-catalog.json")) as f:
csf = json.load(f)["catalog"]
csf_groups = csf.get("groups", [])
csf_total = 0
for grp in csf_groups:
func_title = grp.get("title", "?")
cats = grp.get("groups", [])
subcats = 0
for cat in cats:
subcats += len(cat.get("controls", []))
csf_total += subcats
print(f" {func_title:25s}: {len(cats):2d} categories, {subcats:3d} subcategories")
print(f" TOTAL: {csf_total} subcategories")
# ── Compare with existing DB controls ──
print(f"\n{'='*70}")
print("VERGLEICH: OSCAL vs. bestehende Controls in DB")
print("=" * 70)
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get existing NIST controls
cur.execute("""
SELECT control_id, title,
source_citation->>'source' as source,
source_citation->>'article' as article,
source_citation->>'article_type' as art_type,
release_state
FROM compliance.canonical_controls
WHERE source_citation->>'source' LIKE 'NIST%%'
ORDER BY source_citation->>'source', control_id
""")
nist_controls = cur.fetchall()
# Group by source
by_source = defaultdict(list)
for ctrl in nist_controls:
by_source[ctrl[2]].append(ctrl)
print(f"\n Bestehende NIST Controls in DB:")
for src in sorted(by_source.keys()):
ctrls = by_source[src]
active = sum(1 for c in ctrls if c[5] not in ('duplicate', 'too_close'))
with_article = sum(1 for c in ctrls if c[3])
print(f" {src:40s}: {len(ctrls):4d} total, {active:4d} active, {with_article:4d} mit article")
# For SP 800-53: which control families do we have?
sp853_existing = [c for c in nist_controls if 'SP 800-53' in (c[2] or '')]
existing_families = set()
existing_articles = set()
for ctrl in sp853_existing:
article = ctrl[3] or ""
if article:
# Extract family prefix (e.g., "AC-6" → "AC")
m = re.match(r'([A-Z]{2})-', article)
if m:
existing_families.add(m.group(1))
existing_articles.add(article)
print(f"\n SP 800-53 in DB:")
print(f" Total: {len(sp853_existing)}")
print(f" Families covered: {len(existing_families)}")
print(f" Unique articles: {len(existing_articles)}")
print(f" Families: {', '.join(sorted(existing_families))}")
# Compare: which OSCAL controls are NOT in our DB?
oscal_controls = {} # id → (label, title, statement)
for fam in families:
for ctrl in fam.get("controls", []):
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
if props.get("status") == "withdrawn":
continue
label = props.get("label", ctrl["id"].upper())
statement = ""
guidance = ""
for part in ctrl.get("parts", []):
if part.get("name") == "statement":
statement = part.get("prose", "")
# Also check sub-items
for sub in part.get("parts", []):
statement += " " + sub.get("prose", "")
elif part.get("name") == "guidance":
guidance = part.get("prose", "")
oscal_controls[label] = (ctrl["title"], statement[:500], guidance[:500])
# Enhancements
for enh in ctrl.get("controls", []):
enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
if enh_props.get("status") == "withdrawn":
continue
enh_label = enh_props.get("label", enh["id"].upper())
enh_statement = ""
enh_guidance = ""
for part in enh.get("parts", []):
if part.get("name") == "statement":
enh_statement = part.get("prose", "")
for sub in part.get("parts", []):
enh_statement += " " + sub.get("prose", "")
elif part.get("name") == "guidance":
enh_guidance = part.get("prose", "")
oscal_controls[enh_label] = (enh["title"], enh_statement[:500], enh_guidance[:500])
print(f"\n OSCAL SP 800-53 aktive Controls: {len(oscal_controls)}")
# Find missing: in OSCAL but not in DB
missing = []
covered = []
for label in sorted(oscal_controls.keys()):
if label in existing_articles:
covered.append(label)
else:
missing.append(label)
print(f" In DB vorhanden: {len(covered)}")
print(f" FEHLEND in DB: {len(missing)}")
# Missing by family
missing_by_fam = defaultdict(list)
for label in missing:
fam = label.split("-")[0]
missing_by_fam[fam].append(label)
print(f"\n Fehlende Controls nach Family:")
for fam in sorted(missing_by_fam.keys()):
ctrls = missing_by_fam[fam]
examples = ", ".join(ctrls[:5])
more = f" +{len(ctrls)-5}" if len(ctrls) > 5 else ""
print(f" {fam:4s}: {len(ctrls):3d} fehlend ({examples}{more})")
# Also check CSF 2.0
print(f"\n{'='*70}")
print("NIST CSF 2.0 — Vergleich mit DB")
print("=" * 70)
cur.execute("""
SELECT count(*), count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
FROM compliance.canonical_controls
WHERE source_citation->>'source' LIKE 'NIST Cybersecurity%%'
""")
csf_row = cur.fetchone()
print(f" CSF Controls in DB: {csf_row[0]} total, {csf_row[1]} active")
csf_subcats = 0
csf_ids = []
for grp in csf_groups:
for cat in grp.get("groups", []):
for subcat in cat.get("controls", []):
csf_subcats += 1
props = {p["name"]: p.get("value", "") for p in subcat.get("props", [])}
csf_ids.append(props.get("label", subcat["id"]))
print(f" CSF 2.0 OSCAL Subcategories: {csf_subcats}")
print(f" Beispiele: {', '.join(csf_ids[:10])}")
# ── Summary / Potential ──
print(f"\n{'='*70}")
print("POTENTIAL: Was OSCAL uns bringt")
print("=" * 70)
print(f"""
SP 800-53 Rev 5:
- {len(missing)} neue Controls möglich (aktuell {len(covered)} in DB)
- Jeder Control hat: Statement + Guidance + Assessment-Methoden
- Cross-References zwischen Controls (für Mapping)
- Maschinenlesbare Parameter (ODP)
- Public Domain — keine Lizenzprobleme
CSF 2.0:
- {csf_subcats} Subcategories als Compliance-Controls
- 6 Functions (Govern, Identify, Protect, Detect, Respond, Recover)
- Direkte Mappings zu SP 800-53 Controls
Nächste Schritte:
1. Fehlende SP 800-53 Controls importieren ({len(missing)} Controls)
2. Statement-Text als source_original_text verwenden
3. article_type='control', article=Label (z.B. 'AC-6')
4. CSF 2.0 als eigene Regulation importieren
5. Cross-References als Grundlage für Control-Mappings nutzen
""")
conn.close()

289
scripts/qa/oscal_import.py Normal file
View File

@@ -0,0 +1,289 @@
"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
import os
import re
import json
import uuid
import psycopg2
import urllib.parse
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
sp853 = json.load(f)["catalog"]
# ── Extract all OSCAL controls ──
def extract_controls(catalog):
    """Collect every active base control and enhancement from an OSCAL catalog.

    Walks catalog.groups (families) → family.controls (base controls) →
    control.controls (enhancements) and returns the flat dicts produced by
    extract_single(), skipping withdrawn entries (extract_single returns None).
    """
    collected = []
    for family in catalog.get("groups", []):
        family_name = family.get("title", "")
        for control in family.get("controls", []):
            parsed = extract_single(control, family_name)
            if parsed is not None:
                collected.append(parsed)
            # Enhancements are nested one level below their base control.
            for enhancement in control.get("controls", []):
                parsed = extract_single(enhancement, family_name)
                if parsed is not None:
                    collected.append(parsed)
    return collected
def extract_single(ctrl, family_title):
    """Flatten one OSCAL control (or enhancement) into a plain dict.

    Returns None for withdrawn controls. The statement text is assembled
    from the control's "statement" part plus its labelled sub- and
    sub-sub-items; guidance, related-control links and parameters are
    carried along. ``is_enhancement`` is inferred from a "(" in the label.
    """
    prop_map = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if prop_map.get("status") == "withdrawn":
        return None

    label = prop_map.get("label", ctrl["id"].upper())
    title = ctrl.get("title", "")

    def last_label(node):
        # Mirror the original semantics: the LAST "label" prop wins.
        found = ""
        for prop in node.get("props", []):
            if prop["name"] == "label":
                found = prop.get("value", "")
        return found

    # Main requirement text; a later "statement" part replaces an earlier one.
    statement = ""
    for part in ctrl.get("parts", []):
        if part.get("name") != "statement":
            continue
        statement = part.get("prose", "")
        # Sub-items (a., b., c., ...)
        for sub in part.get("parts", []):
            sub_prose = sub.get("prose", "")
            sub_lab = last_label(sub)
            if sub_lab:
                statement += f"\n{sub_lab} {sub_prose}"
            elif sub_prose:
                statement += f"\n{sub_prose}"
            # Nested sub-sub-items, indented two spaces.
            for subsub in sub.get("parts", []):
                ss_prose = subsub.get("prose", "")
                ss_lab = last_label(subsub)
                if ss_lab:
                    statement += f"\n  {ss_lab} {ss_prose}"
                elif ss_prose:
                    statement += f"\n  {ss_prose}"

    # Guidance: last "guidance" part wins (scan from the end).
    guidance = ""
    for part in reversed(ctrl.get("parts", [])):
        if part.get("name") == "guidance":
            guidance = part.get("prose", "")
            break

    # Cross-references to related controls.
    related = [link["href"].lstrip("#")
               for link in ctrl.get("links", [])
               if link.get("rel") == "related"]

    # Organization-defined parameters (ODPs).
    params = []
    for p in ctrl.get("params", []):
        guideline_text = "".join(g.get("prose", "") for g in p.get("guidelines", []))
        choices = list(p["select"].get("choice", [])) if "select" in p else []
        params.append({
            "id": p.get("id", ""),
            "label": p.get("label", ""),
            "guidelines": guideline_text,
            "choices": choices,
        })

    return {
        "label": label,
        "title": title,
        "family": family_title,
        "statement": statement.strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": params,
        "is_enhancement": "(" in label,
    }
# Flatten the catalog into one list of control dicts (base + enhancements).
all_oscal = extract_controls(sp853)
print(f"Total OSCAL active controls: {len(all_oscal)}")
# ── Normalize label for comparison ──
def normalize_label(label):
    """Canonicalize an SP 800-53 label for comparison.

    Strips zero padding after the dash and inside parentheses and
    uppercases, e.g. "ac-06(01)" → "AC-6(1)".
    """
    without_dash_pad = re.sub(r'-0+(\d)', r'-\1', label)
    without_paren_pad = re.sub(r'\(0+(\d+)\)', r'(\1)', without_dash_pad)
    return without_paren_pad.upper()
# ── DB connection ──
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
# Get existing labels
cur.execute("""
SELECT DISTINCT source_citation->>'article' as article
FROM compliance.canonical_controls
WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
AND source_citation->>'article' IS NOT NULL
""")
existing_labels = set(normalize_label(r[0]) for r in cur.fetchall())
print(f"Existing DB labels (normalized): {len(existing_labels)}")
# Get highest control_id numbers per prefix
cur.execute("""
SELECT control_id FROM compliance.canonical_controls
WHERE control_id ~ '^[A-Z]+-[0-9]+$'
ORDER BY control_id
""")
existing_ids = set(r[0] for r in cur.fetchall())
# Find next available ID per prefix
def next_control_id(prefix, existing):
    """Return the highest numeric suffix already used for ``prefix`` (0 if none).

    Despite the name, this returns the current maximum, not the next id:
    callers increment the returned counter before minting a new id, so the
    first generated id becomes PREFIX-(max+1).
    """
    id_pattern = re.compile(rf'^{prefix}-(\d+)$')
    highest = 0
    for candidate in existing:
        match = id_pattern.match(candidate)
        if match:
            highest = max(highest, int(match.group(1)))
    return highest
# Map NIST families to our control_id prefixes
FAMILY_PREFIX = {
    "Access Control": "ACC",
    "Awareness and Training": "GOV",
    "Audit and Accountability": "LOG",
    "Assessment, Authorization, and Monitoring": "GOV",
    "Configuration Management": "COMP",
    "Contingency Planning": "INC",
    "Identification and Authentication": "AUTH",
    "Incident Response": "INC",
    "Maintenance": "COMP",
    "Media Protection": "DATA",
    "Physical and Environmental Protection": "SEC",
    "Planning": "GOV",
    "Program Management": "GOV",
    "Personnel Security": "GOV",
    "Personally Identifiable Information Processing and Transparency": "DATA",
    "Risk Assessment": "GOV",
    "System and Services Acquisition": "COMP",
    "System and Communications Protection": "NET",
    "System and Information Integrity": "SEC",
    "Supply Chain Risk Management": "COMP",
}
# Track next IDs
# Seed each prefix counter with the highest number already in the DB.
prefix_counters = {}
for prefix in set(FAMILY_PREFIX.values()):
    prefix_counters[prefix] = next_control_id(prefix, existing_ids)
print(f"Starting counters: {prefix_counters}")
# ── Filter to only new controls ──
to_import = []
for ctrl in all_oscal:
    norm = normalize_label(ctrl["label"])
    if norm not in existing_labels:
        to_import.append(ctrl)
print(f"\nControls to import: {len(to_import)}")
# ── Import ──
imported = 0
for ctrl in to_import:
    prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
    prefix_counters[prefix] += 1
    control_id = f"{prefix}-{prefix_counters[prefix]:04d}"
    # Build title: "NIST {label}: {title}"
    title = f"NIST {ctrl['label']}: {ctrl['title']}"
    # source_original_text = statement (the official requirement text)
    source_text = ctrl["statement"]
    if not source_text:
        source_text = ctrl["guidance"][:500] if ctrl["guidance"] else ctrl["title"]
    # objective = guidance text
    objective = ctrl["guidance"][:2000] if ctrl["guidance"] else ""
    # source_citation
    citation = {
        "source": "NIST SP 800-53 Rev. 5",
        "article": ctrl["label"],
        "article_type": "control",
        "source_type": "standard",
        "oscal_import": True,
    }
    if ctrl["related"]:
        citation["related_controls"] = ctrl["related"][:20]
    if ctrl["params"]:
        citation["parameters"] = [{"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]]
    # NOTE(review): loop-invariant constant — could be hoisted above the loop.
    FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'
    new_id = str(uuid.uuid4())
    cur.execute("""
        INSERT INTO compliance.canonical_controls
        (id, framework_id, control_id, title, objective, rationale,
         severity, source_original_text,
         source_citation, pipeline_version, release_state,
         generation_strategy, category)
        VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
    """, (
        new_id,
        FRAMEWORK_ID,
        control_id,
        title[:500],
        objective[:5000],
        source_text[:10000],
        json.dumps(citation, ensure_ascii=False),
        ctrl["family"],
    ))
    imported += 1
# Single commit — the whole import is one transaction.
conn.commit()
print(f"\nImported: {imported} new controls")
# ── Verify ──
cur.execute("""
    SELECT count(*),
           count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
""")
total, active = cur.fetchone()
print(f"\nSP 800-53 after import: {total} total, {active} active")
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\nDB release_state gesamt:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")
# ── Import stats by family ──
fam_counts = {}
for ctrl in to_import:
    fam = ctrl["family"]
    fam_counts[fam] = fam_counts.get(fam, 0) + 1
print(f"\nImportiert nach Family:")
for fam in sorted(fam_counts.keys()):
    print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")
conn.close()

274
scripts/qa/owasp_cleanup.py Normal file
View File

@@ -0,0 +1,274 @@
"""OWASP Cleanup:
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
2. Fix 47 wrong source attributions (found in different OWASP PDF)
"""
import os
import re
import json
import unicodedata
import psycopg2
import urllib.parse
try:
import fitz
except ImportError:
print("ERROR: PyMuPDF not installed")
exit(1)
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
def normalize(s):
    """Collapse PDF-extracted text into plain, single-spaced comparable text.

    Drops soft hyphens and zero-width characters, expands common ligatures,
    straightens smart quotes/dashes/bullets, strips control bytes, applies
    Unicode NFC, and squeezes all whitespace runs to single spaces.
    """
    replacements = (
        ('\u00ad', ''), ('\xad', ''),
        ('\u200b', ''), ('\u00a0', ' '),
        ('\ufb01', 'fi'), ('\ufb02', 'fl'),
        ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
        ('\u2019', "'"), ('\u2018', "'"),
        ('\u201c', '"'), ('\u201d', '"'),
        ('\u2013', '-'), ('\u2014', '-'),
        ('\u2022', '-'), ('\u00b7', '-'),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# Load OWASP PDFs
# Maps the DB's source name to the local PDF file name.
OWASP_PDFS = {
    "OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
    "OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
    "OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
    "OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
}
# Extract and normalize the full text of each available PDF (missing files
# are silently skipped and simply won't match anything later).
pdf_norms = {}
for name, filename in OWASP_PDFS.items():
    path = os.path.join(PDF_DIR, filename)
    if not os.path.exists(path):
        continue
    doc = fitz.open(path)
    text = ""
    for page in doc:
        text += page.get_text() + "\n"
    doc.close()
    pdf_norms[name] = normalize(text)
def build_owasp_index(text_norm, source_name):
    """Build a heading index for one OWASP document from normalized text.

    Picks the identifier pattern from the source name and returns a list of
    (position, label, type) tuples sorted by position, keeping only the
    first occurrence of each label. Unknown sources yield an empty list.
    """
    # Branch order matters: the API Top 10 name also contains "Top 10".
    if "Top 10" in source_name and "API" not in source_name:
        pattern, kind = r'(A\d{2}:\d{4})', "category"
    elif "API" in source_name:
        pattern, kind = r'(API\d+:\d{4})', "category"
    elif "ASVS" in source_name:
        pattern, kind = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
    elif "MASVS" in source_name:
        pattern, kind = r'(MASVS-[A-Z]+-\d+)', "requirement"
    else:
        return []

    hits = sorted(
        ((m.start(), m.group(1), kind) for m in re.finditer(pattern, text_norm)),
        key=lambda h: h[0],
    )
    unique = []
    seen_labels = set()
    for pos, label, typ in hits:
        if label in seen_labels:
            continue
        seen_labels.add(label)
        unique.append((pos, label, typ))
    return unique
# Precompute a heading index per loaded PDF for find_in_pdf().
pdf_indexes = {}
for name, norm in pdf_norms.items():
    pdf_indexes[name] = build_owasp_index(norm, name)
def find_in_pdf(orig_text, source_name):
    """Locate a control's text inside one OWASP PDF.

    Probes several snippet offsets/lengths of the normalized control text
    against the normalized PDF text; on a hit, returns (label, type) of the
    nearest preceding heading from the precomputed index ("Unknown" when no
    heading precedes the hit). Returns None when nothing matches or the
    control text is too short to match reliably.
    """
    haystack = pdf_norms.get(source_name)
    if not haystack:
        return None
    needle_src = normalize(orig_text)
    if len(needle_src) < 20:
        return None
    headings = pdf_indexes.get(source_name, [])

    def heading_at(match_pos):
        # Walk headings backwards; the last one starting at/before the hit wins.
        for h_pos, h_label, h_type in reversed(headings):
            if h_pos <= match_pos:
                return (h_label, h_type)
        return ("Unknown", "unknown")

    # Probe a few relative offsets, longest snippets first at each offset.
    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        begin = max(0, int(len(needle_src) * frac))
        for size in (80, 60, 40, 30, 20):
            probe = needle_src[begin:begin + size]
            if len(probe) < 15:
                continue
            hit = haystack.find(probe)
            if hit >= 0:
                return heading_at(hit)
    return None
# DB
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# ═══════════════════════════════════════════════════════════════
# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
# ═══════════════════════════════════════════════════════════════
print("=" * 60)
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
print("=" * 60)
# Candidates: active Top 10 controls whose text was never located in a PDF
# (article_type still NULL).
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_unmatched = cur.fetchall()
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")
# Separate: found in other OWASP PDF vs not found anywhere
to_mark_dup = []
to_fix_source = []
for ctrl in top10_unmatched:
    uid, cid, title, text, state = ctrl
    # Check if found in another OWASP PDF
    found_in = None
    found_result = None
    for other_src in OWASP_PDFS:
        if other_src == 'OWASP Top 10 (2021)':
            continue
        result = find_in_pdf(text, other_src)
        if result:
            found_in = other_src
            found_result = result
            break
    if found_in:
        to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1]))
    else:
        to_mark_dup.append((uid, cid))
print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")
# Mark as duplicate
dup_marked = 0
for uid, cid in to_mark_dup:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET release_state = 'duplicate'
        WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
    """, (uid,))
    if cur.rowcount > 0:
        dup_marked += 1
print(f" Marked as duplicate: {dup_marked}")
# ═══════════════════════════════════════════════════════════════
# STEP 2: Fix wrong source attributions across ALL OWASP sources
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("STEP 2: Fix wrong OWASP source attributions")
print("=" * 60)
all_fixes = list(to_fix_source)  # Start with Top 10 fixes
# Also check ASVS, SAMM, MASVS
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
    """, (source,))
    controls = cur.fetchall()
    for ctrl in controls:
        uid, cid, title, text = ctrl
        # Try own PDF first
        result = find_in_pdf(text, source)
        if result:
            # Found in own PDF! Update article info
            # (jsonb || merge only touches the article fields; the WHERE
            # clause keeps rowcount meaningful by skipping no-op updates).
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = source_citation ||
                    jsonb_build_object('article', %s, 'article_type', %s)
                WHERE id = %s
                  AND (source_citation->>'article' IS DISTINCT FROM %s
                       OR source_citation->>'article_type' IS DISTINCT FROM %s)
            """, (result[0], result[1], uid, result[0], result[1]))
            continue
        # Try other OWASP PDFs
        for other_src in OWASP_PDFS:
            if other_src == source:
                continue
            result = find_in_pdf(text, other_src)
            if result:
                all_fixes.append((uid, cid, other_src, result[0], result[1]))
                break
print(f" Total wrong-source controls found: {len(all_fixes)}")
# Apply source fixes
fixed = 0
for uid, cid, correct_source, label, typ in all_fixes:
    cur.execute("""
        UPDATE compliance.canonical_controls
        SET source_citation = source_citation ||
            jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
        WHERE id = %s
    """, (correct_source, label, typ, uid,))
    if cur.rowcount > 0:
        fixed += 1
        print(f" {cid:10s} → {correct_source} / {label} [{typ}]")
print(f" Fixed: {fixed} controls")
# Single commit for both steps.
conn.commit()
# ═══════════════════════════════════════════════════════════════
# SUMMARY
# ═══════════════════════════════════════════════════════════════
print(f"\n{'='*60}")
print("ZUSAMMENFASSUNG")
print("=" * 60)
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
print(f" Wrong source attribution → fixed: {fixed}")
# Final counts
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY release_state
    ORDER BY count(*) DESC
""")
print(f"\n DB release_state nach Cleanup:")
for row in cur.fetchall():
    print(f" {row[0]:15s}: {row[1]:5d}")
cur.execute("""
    SELECT count(*)
    FROM compliance.canonical_controls
    WHERE release_state NOT IN ('duplicate', 'too_close')
""")
active = cur.fetchone()[0]
print(f"\n Aktive Controls: {active}")
conn.close()

View File

@@ -0,0 +1,316 @@
"""Match unmatched OWASP ASVS/SAMM/MASVS controls against GitHub Markdown sources."""
import os
import re
import unicodedata
import psycopg2
import urllib.parse
from pathlib import Path
GITHUB_DIR = Path(os.path.expanduser("~/rag-ingestion/owasp-github"))
def normalize(s):
    """Collapse extracted text into plain, single-spaced comparable text.

    Drops soft hyphens and zero-width characters, expands common ligatures,
    straightens smart quotes/dashes/bullets, strips control bytes, applies
    Unicode NFC, and squeezes all whitespace runs to single spaces.
    """
    replacements = (
        ('\u00ad', ''), ('\xad', ''),
        ('\u200b', ''), ('\u00a0', ' '),
        ('\ufb01', 'fi'), ('\ufb02', 'fl'),
        ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
        ('\u2019', "'"), ('\u2018', "'"),
        ('\u201c', '"'), ('\u201d', '"'),
        ('\u2013', '-'), ('\u2014', '-'),
        ('\u2022', '-'), ('\u00b7', '-'),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
# ── Load Markdown sources ──
def load_markdown_dir(path, pattern="*.md"):
    """Load all files under ``path`` matching ``pattern`` (non-recursive).

    Returns {filename: text} in sorted filename order. Files are decoded as
    UTF-8 with undecodable bytes replaced, so decoding never raises.

    FIX: the original used a bare ``except: pass``, which also swallows
    KeyboardInterrupt/SystemExit and hides real bugs. Only OS-level read
    errors (permissions, broken symlinks, ...) are skipped now, keeping the
    best-effort behavior.
    """
    texts = {}
    for f in sorted(path.glob(pattern)):
        try:
            texts[f.name] = f.read_text(encoding='utf-8', errors='replace')
        except OSError:
            continue  # unreadable file — skip, best effort
    return texts
# ASVS 4.0 — V-files contain requirements
asvs_dir = GITHUB_DIR / "ASVS" / "4.0" / "en"
asvs_files = load_markdown_dir(asvs_dir)
asvs_full = "\n".join(asvs_files.values())
asvs_norm = normalize(asvs_full)
print(f"ASVS 4.0 Markdown: {len(asvs_files)} files, {len(asvs_full):,} chars")
# SAMM core — YAML + Markdown
samm_dir = GITHUB_DIR / "samm-core"
samm_texts = {}
# NOTE(review): the bare excepts below swallow every exception, including
# KeyboardInterrupt — consider narrowing to OSError.
for f in samm_dir.rglob("*.yml"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
for f in samm_dir.rglob("*.md"):
    try:
        samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
samm_full = "\n".join(samm_texts.values())
samm_norm = normalize(samm_full)
print(f"SAMM 2.0 source: {len(samm_texts)} files, {len(samm_full):,} chars")
# MASVS — control markdown files
masvs_dir = GITHUB_DIR / "masvs"
masvs_files = {}
for f in masvs_dir.rglob("*.md"):
    try:
        masvs_files[str(f.relative_to(masvs_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
masvs_full = "\n".join(masvs_files.values())
masvs_norm = normalize(masvs_full)
print(f"MASVS 2.0 source: {len(masvs_files)} files, {len(masvs_full):,} chars")
# API Security
api_dir = GITHUB_DIR / "api-security"
api_files = {}
for f in api_dir.rglob("*.md"):
    try:
        api_files[str(f.relative_to(api_dir))] = f.read_text(encoding='utf-8', errors='replace')
    except:
        pass
api_full = "\n".join(api_files.values())
api_norm = normalize(api_full)
print(f"API Security source: {len(api_files)} files, {len(api_full):,} chars")
# Source → (normalized_text, index_builder)
SOURCE_GITHUB = {
    "OWASP ASVS 4.0": asvs_norm,
    "OWASP SAMM 2.0": samm_norm,
    "OWASP MASVS 2.0": masvs_norm,
    "OWASP API Security Top 10 (2023)": api_norm,
}
# Build indexes for each source
def build_asvs_index(text):
items = []
for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text):
items.append((m.start(), m.group(1), "requirement"))
items.sort(key=lambda x: x[0])
seen = set()
return [(p, l, t) for p, l, t in items if l not in seen and not seen.add(l)]
def build_samm_index(text):
    """Index SAMM section numbers and practice-name headings by position.

    Returns a position-sorted list of (offset, label, "section"),
    keeping only the first occurrence of each label.
    """
    items = []
    # SAMM practices have names like "Strategy & Metrics", sections numbered
    for m in re.finditer(r'(?:^|\s)(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
    # Also find practice identifiers
    for m in re.finditer(r'((?:Strategy|Education|Policy|Threat|Security Requirements|Secure Architecture|'
                         r'Secure Build|Secure Deployment|Defect Management|Environment Management|'
                         r'Incident Management|Requirements Testing|Security Testing|'
                         r'Design Review|Implementation Review|Operations Management)'
                         r'[^.\n]{0,30})', text):
        items.append((m.start(), m.group(1)[:50], "section"))
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of the previous comprehension that relied
    # on the side effect of set.add() inside the filter condition.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
def build_masvs_index(text):
    """Index MASVS requirement IDs (e.g. 'MASVS-STORAGE-1') by position.

    Returns a position-sorted list of (offset, label, "requirement"),
    keeping only the first occurrence of each label.
    """
    items = []
    for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text):
        items.append((m.start(), m.group(1), "requirement"))
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of the previous comprehension that relied
    # on the side effect of set.add() inside the filter condition.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
def build_api_index(text):
    """Index OWASP API Top 10 category IDs (e.g. 'API1:2023') by position.

    Returns a position-sorted list of (offset, label, "category"),
    keeping only the first occurrence of each label.
    """
    items = []
    for m in re.finditer(r'(API\d+:\d{4})', text):
        items.append((m.start(), m.group(1), "category"))
    items.sort(key=lambda x: x[0])
    # Explicit dedup loop instead of the previous comprehension that relied
    # on the side effect of set.add() inside the filter condition.
    seen = set()
    unique = []
    for pos, label, typ in items:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, typ))
    return unique
# Maps each source name to the function that indexes its normalized corpus.
SOURCE_INDEX_BUILDERS = {
    "OWASP ASVS 4.0": build_asvs_index,
    "OWASP SAMM 2.0": build_samm_index,
    "OWASP MASVS 2.0": build_masvs_index,
    "OWASP API Security Top 10 (2023)": build_api_index,
}
# Build all indexes on normalized text
source_indexes = {}
for name, norm_text in SOURCE_GITHUB.items():
    builder = SOURCE_INDEX_BUILDERS[name]
    idx = builder(norm_text)
    source_indexes[name] = idx
    print(f" {name}: {len(idx)} index entries")
def find_text(orig_text, source_name):
    """Find control text in GitHub source. Returns (label, type) or None."""
    haystack = SOURCE_GITHUB.get(source_name)
    if not haystack:
        return None
    index_entries = source_indexes.get(source_name, [])
    needle = normalize(orig_text)
    if len(needle) < 20:
        return None
    # Probe several offsets and snippet lengths; mid-text snippets are tried
    # first since they are less likely to hit boilerplate than the start.
    for frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        for snip_len in [80, 60, 40, 30, 20]:
            begin = max(0, int(len(needle) * frac))
            probe = needle[begin:begin + snip_len]
            if not probe or len(probe) < 15:
                continue
            hit = haystack.find(probe)
            if hit < 0:
                continue
            # Walk the index backwards to find the nearest heading at or
            # before the hit position.
            label, typ = "Unknown", "unknown"
            for entry_pos, entry_label, entry_type in reversed(index_entries):
                if entry_pos <= hit:
                    label, typ = entry_label, entry_type
                    break
            return (label, typ)
    return None
def find_in_any_github(orig_text, exclude_source=None):
    """Try all GitHub sources."""
    for source_name in SOURCE_GITHUB:
        if source_name == exclude_source:
            continue
        hit = find_text(orig_text, source_name)
        if hit is not None:
            return (source_name, hit[0], hit[1])
    return None
# ── DB ──
# DATABASE_URL is required; a KeyError here is an intentional hard failure.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    # Resolve unqualified table names in the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
# ── Process each OWASP source ──
# For every active control that still lacks an article reference, try to
# locate its original text in the matching GitHub corpus first, then in the
# other corpora (cross-source).
total_matched = 0
total_cross = 0
total_not_found = 0
all_updates = []  # (uuid, control_id, source_name, article_label, article_type)
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP MASVS 2.0', 'OWASP API Security Top 10 (2023)']:
    cur.execute("""
        SELECT id, control_id, title, source_original_text, release_state
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = %s
          AND source_citation->>'article_type' IS NULL
          AND source_original_text IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        ORDER BY control_id
    """, (source,))
    controls = cur.fetchall()
    if not controls:
        continue
    print(f"\n{'='*60}")
    print(f"{source} — {len(controls)} unmatched active")
    print(f"{'='*60}")
    matched = 0
    cross_matched = 0
    not_found = 0
    for ctrl in controls:
        uid, cid, title, text, state = ctrl
        # Try own GitHub source
        result = find_text(text, source)
        if result:
            matched += 1
            total_matched += 1
            all_updates.append((uid, cid, source, result[0], result[1]))
            print(f" {cid:10s} → {result[0]:30s} [{result[1]}]")
            continue
        # Try other GitHub sources
        cross = find_in_any_github(text, exclude_source=source)
        if cross:
            cross_matched += 1
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}] (CROSS)")
            continue
        not_found += 1
        total_not_found += 1
    print(f"\n Own source matched: {matched}")
    print(f" Cross-source: {cross_matched}")
    print(f" Not found: {not_found}")
# ── Also try OWASP Top 10 remaining unmatched (34 active left after dup marking) ──
cur.execute("""
    SELECT id, control_id, title, source_original_text, release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
      AND source_citation->>'article_type' IS NULL
      AND source_original_text IS NOT NULL
      AND release_state NOT IN ('duplicate', 'too_close')
    ORDER BY control_id
""")
top10_remaining = cur.fetchall()
if top10_remaining:
    print(f"\n{'='*60}")
    print(f"OWASP Top 10 (2021) — {len(top10_remaining)} remaining unmatched active")
    print(f"{'='*60}")
    for ctrl in top10_remaining:
        uid, cid, title, text, state = ctrl
        # Top 10 has no corpus of its own in SOURCE_GITHUB, so only
        # cross-source lookup is possible here.
        cross = find_in_any_github(text)
        if cross:
            total_cross += 1
            all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
            print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}]")
        else:
            total_not_found += 1
# ── Summary ──
print(f"\n{'='*60}")
print(f"ZUSAMMENFASSUNG")
print(f"{'='*60}")
print(f" Matched in eigener GitHub-Quelle: {total_matched}")
print(f" Cross-source matched: {total_cross}")
print(f" Nicht gefunden: {total_not_found}")
print(f" Total Updates: {len(all_updates)}")
# ── Apply updates ──
if all_updates:
    print(f"\nApplying {len(all_updates)} updates to DB...")
    applied = 0
    for uid, cid, correct_source, label, typ in all_updates:
        # Update article + article_type, and fix source if cross-matched
        # NOTE(review): only article/article_type are written here;
        # correct_source is carried in the tuple but never persisted —
        # confirm whether cross-matched sources should also be updated.
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s
              AND (source_citation->>'article' IS DISTINCT FROM %s
                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (label, typ, uid, label, typ))
        # rowcount == 0 means the row already carried the same values.
        if cur.rowcount > 0:
            applied += 1
    conn.commit()
    print(f" Applied: {applied} controls updated")
    # Type distribution
    type_counts = {}
    for _, _, _, _, typ in all_updates:
        type_counts[typ] = type_counts.get(typ, 0) + 1
    print(f"\n Article type distribution:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t:12s}: {c:5d}")
conn.close()

View File

@@ -0,0 +1,357 @@
"""Phase 5: Source Normalization + Duplicate Hard Delete.
Steps:
1. OSCAL controls: add source_regulation to generation_metadata
2. Fix 20 v3 controls with NULL source (tag as manually_reviewed)
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
6. Clean up canonical_processed_chunks generated_control_ids
Usage:
export DATABASE_URL='postgresql://...'
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
"""
import os
import sys
import json
import psycopg2
import urllib.parse
DRY_RUN = "--dry-run" in sys.argv
STEP_ONLY = None
for arg in sys.argv:
if arg.startswith("--step"):
idx = sys.argv.index(arg)
if idx + 1 < len(sys.argv):
STEP_ONLY = int(sys.argv[idx + 1])
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
host=parsed.hostname, port=parsed.port or 5432,
user=parsed.username, password=parsed.password,
dbname=parsed.path.lstrip('/'),
options="-c search_path=compliance,public"
)
cur = conn.cursor()
def should_run(step):
    """Return True when *step* is selected, or when no --step filter is set."""
    if STEP_ONLY is None:
        return True
    return step == STEP_ONLY
# ══════════════════════════════════════════════════════════════════
# Step 1: OSCAL controls — add source_regulation to generation_metadata
# ══════════════════════════════════════════════════════════════════
if should_run(1):
    print("=" * 70)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print("=" * 70)
    # Count first so a dry run can report without mutating anything.
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls
        WHERE generation_strategy = 'oscal_import'
          AND (generation_metadata->>'source_regulation' IS NULL
               OR generation_metadata->>'source_regulation' = '')
    """)
    count = cur.fetchone()[0]
    print(f" OSCAL controls without source_regulation: {count}")
    if count > 0:
        if DRY_RUN:
            print(f" [DRY RUN] Would update {count} controls")
        else:
            # Assumes every OSCAL import in this DB stems from NIST SP
            # 800-53r5 — TODO confirm before re-running against other
            # OSCAL catalogs.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                    || '{"source_regulation": "nist_sp800_53r5"}'::jsonb
                WHERE generation_strategy = 'oscal_import'
                  AND (generation_metadata->>'source_regulation' IS NULL
                       OR generation_metadata->>'source_regulation' = '')
            """)
            print(f" Updated: {cur.rowcount}")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 2: v3 controls with NULL source — tag source as best guess
# ══════════════════════════════════════════════════════════════════
if should_run(2):
    print("=" * 70)
    print("STEP 2: Fix v3 controls with NULL source")
    print("=" * 70)
    # These 20 controls are v3/document_grouped with no source or regulation.
    # Based on title analysis, they cover:
    # - Data protection/privacy topics (DSGVO-adjacent)
    # - Software security (OWASP/NIST-adjacent)
    # - Mobile security (OWASP MASVS-adjacent)
    # Mark them as 'needs_review' and add a flag.
    cur.execute("""
        SELECT id, control_id, title
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NULL
          AND pipeline_version = 3
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    v3_null = cur.fetchall()
    print(f" v3 controls with NULL source: {len(v3_null)}")
    if v3_null:
        if DRY_RUN:
            print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review")
        else:
            # One UPDATE per row keeps it simple; the volume is tiny (~20).
            for ctrl_id_uuid, control_id, title in v3_null:
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET release_state = 'needs_review',
                        generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                            || '{"missing_source": true}'::jsonb
                    WHERE id = %s
                """, (ctrl_id_uuid,))
            print(f" Marked {len(v3_null)} as needs_review with missing_source flag")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 3: Fix empty-string source (DATA-631)
# ══════════════════════════════════════════════════════════════════
if should_run(3):
    print("=" * 70)
    print("STEP 3: Fix empty-string source")
    print("=" * 70)
    cur.execute("""
        SELECT id, control_id, title,
               generation_metadata->>'source_regulation' as reg
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = ''
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    empty_src = cur.fetchall()
    print(f" Controls with empty source: {len(empty_src)}")
    for ctrl_id_uuid, control_id, title, reg in empty_src:
        print(f" {control_id} | reg={reg} | {title[:60]}")
        # Only at_tkg is known to occur; anything else gets a visible
        # "Unbekannt (<reg>)" marker instead of silently staying empty.
        if reg == 'at_tkg':
            new_source = 'Telekommunikationsgesetz Oesterreich'
        else:
            new_source = f"Unbekannt ({reg})"
        if DRY_RUN:
            print(f" [DRY RUN] Would set source='{new_source}'")
        else:
            # json.dumps yields a properly quoted jsonb string literal
            # for jsonb_set.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation, '{source}', %s::jsonb
                )
                WHERE id = %s
            """, (json.dumps(new_source), ctrl_id_uuid))
            print(f" Set source='{new_source}'")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 4: Fix OWASP cross-source misattributions
# ══════════════════════════════════════════════════════════════════
if should_run(4):
    print("=" * 70)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print("=" * 70)
    # Controls where source_citation.source doesn't match the regulation_code
    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }
    # Strategy: Move controls to the regulation_code that matches their actual source
    # i.e., if a control has source='OWASP ASVS 4.0' but reg='owasp_top10_2021',
    # update the reg to 'owasp_asvs'
    SOURCE_TO_REG = {v: k for k, v in OWASP_REG_TO_SOURCE.items()}
    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        cur.execute("""
            SELECT id, control_id, source_citation->>'source' as src
            FROM compliance.canonical_controls
            WHERE generation_metadata->>'source_regulation' = %s
              AND source_citation->>'source' <> %s
              AND release_state NOT IN ('duplicate', 'too_close')
        """, (reg_code, expected_source))
        mismatches = cur.fetchall()
        if mismatches:
            print(f"\n {reg_code} — {len(mismatches)} Mismatches:")
            for ctrl_id_uuid, control_id, actual_source in mismatches:
                correct_reg = SOURCE_TO_REG.get(actual_source)
                if correct_reg:
                    print(f" {control_id} | {actual_source} → reg={correct_reg}")
                    if not DRY_RUN:
                        cur.execute("""
                            UPDATE compliance.canonical_controls
                            SET generation_metadata = jsonb_set(
                                generation_metadata, '{source_regulation}', %s::jsonb
                            )
                            WHERE id = %s
                        """, (json.dumps(correct_reg), ctrl_id_uuid))
                    # Counted in dry runs too, so the "[DRY RUN] Would fix"
                    # figure matches what a real run would change.
                    total_fixed += 1
                else:
                    # Non-OWASP source under an OWASP regulation code;
                    # needs manual triage rather than an automatic move.
                    print(f" {control_id} | {actual_source} → no mapping found")
    if DRY_RUN:
        print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
    else:
        print(f"\n Fixed: {total_fixed} misattributions")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 5: Hard delete duplicate/too_close controls
# ══════════════════════════════════════════════════════════════════
if should_run(5):
    print("=" * 70)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print("=" * 70)
    # Verify no FK references
    # Safety gate: abort the whole script if any referencing table still
    # points at a control about to be deleted.
    for table, col in [
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    ]:
        # Table/column names come from the hardcoded list above, so the
        # f-string interpolation into SQL is safe here.
        cur.execute(f"""
            SELECT count(*)
            FROM compliance.{table} t
            JOIN compliance.canonical_controls cc ON cc.id = t.{col}
            WHERE cc.release_state IN ('duplicate', 'too_close')
        """)
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            print(f" ABORTING Step 5 — clean FK refs first!")
            sys.exit(1)
        else:
            print(f" {table}.{col}: 0 refs ✓")
    # Check self-references
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls child
        JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
        WHERE parent.release_state IN ('duplicate', 'too_close')
    """)
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
        print(f" ABORTING Step 5!")
        sys.exit(1)
    print(f" Self-references: 0 ✓")
    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        WHERE release_state IN ('duplicate', 'too_close')
        GROUP BY 1
    """)
    to_delete = {}
    for state, cnt in cur.fetchall():
        to_delete[state] = cnt
        print(f"\n {state}: {cnt}")
    total = sum(to_delete.values())
    print(f"\n TOTAL to delete: {total}")
    if DRY_RUN:
        print(f" [DRY RUN] Would delete {total} controls")
    else:
        cur.execute("""
            DELETE FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
        """)
        print(f" Deleted: {cur.rowcount} controls")
    print()
# ══════════════════════════════════════════════════════════════════
# Step 6: Clean up canonical_processed_chunks generated_control_ids
# ══════════════════════════════════════════════════════════════════
if should_run(6):
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)
    if DRY_RUN and should_run(5):
        # A dry run never deletes in Step 5, so pruning against the current
        # control set would misreport; skip unless Step 6 runs alone.
        print(" [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        # Find chunks that reference non-existent controls
        cur.execute("""
            SELECT id, generated_control_ids
            FROM compliance.canonical_processed_chunks
            WHERE generated_control_ids IS NOT NULL
              AND generated_control_ids <> '[]'::jsonb
        """)
        chunks = cur.fetchall()
        print(f" Chunks with generated_control_ids: {len(chunks)}")
        # Get all existing control IDs
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = set(r[0] for r in cur.fetchall())
        print(f" Existing controls: {len(existing_ids)}")
        cleaned = 0
        for chunk_id, control_ids in chunks:
            # jsonb may arrive as a str or an already-decoded list —
            # presumably depending on psycopg2 adapter config; handle both.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if isinstance(control_ids, list):
                valid_ids = [cid for cid in control_ids if cid in existing_ids]
                if len(valid_ids) < len(control_ids):
                    # NOTE(review): 'removed' is computed but never reported
                    # — candidate for cleanup or for per-chunk logging.
                    removed = len(control_ids) - len(valid_ids)
                    cur.execute("""
                        UPDATE compliance.canonical_processed_chunks
                        SET generated_control_ids = %s::jsonb
                        WHERE id = %s
                    """, (json.dumps(valid_ids), chunk_id))
                    cleaned += 1
        print(f" Chunks cleaned: {cleaned}")
    print()
# ══════════════════════════════════════════════════════════════════
# Final summary
# ══════════════════════════════════════════════════════════════════
# Single commit at the end: all steps persist together or not at all
# (sys.exit(1) in Step 5 leaves the transaction uncommitted).
if not DRY_RUN:
    conn.commit()
    print("=" * 70)
    print("COMMITTED. Final state:")
    print("=" * 70)
else:
    print("=" * 70)
    print("[DRY RUN] No changes committed. Current state:")
    print("=" * 70)
cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY 1
    ORDER BY count(*) DESC
""")
total = 0
active = 0
for state, cnt in cur.fetchall():
    total += cnt
    if state not in ('duplicate', 'too_close'):
        active += cnt
    print(f" {state:15s}: {cnt:5d}")
print(f"\n TOTAL: {total}")
print(f" AKTIV: {active}")
conn.close()

View File

@@ -0,0 +1,655 @@
#!/usr/bin/env python3
"""
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
Reads gap_analysis_results.json, extracts article text from PDFs,
calls Claude Sonnet to generate controls, inserts into DB.
Usage:
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
python3 phase74_generate_gap_controls.py # generate and insert
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
"""
import os
import sys
import json
import re
import time
import hashlib
import argparse
import psycopg2
import urllib.parse
import requests
from pathlib import Path
from collections import Counter
sys.path.insert(0, os.path.dirname(__file__))
from pdf_qa_all import (
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
build_eu_article_index, build_de_law_index, build_nist_index,
build_owasp_index, build_generic_index, MAX_ARTICLES,
)
# ── Config ──────────────────────────────────────────────────────────
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
# Model is overridable via env var; defaults to Claude Sonnet.
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
PIPELINE_VERSION = 5
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
# PyMuPDF is optional; code paths that need it must check for None.
try:
    import fitz
except ImportError:
    fitz = None
# ── Source name → regulation_code reverse map ────────────────────────
# Built from REGULATION_LICENSE_MAP in control_generator.py
# (human-readable source names as stored in source_citation → short codes).
SOURCE_TO_REGCODE = {
    "DSGVO (EU) 2016/679": "eu_2016_679",
    "KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
    "NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
    "Cyber Resilience Act (CRA)": "eu_2024_2847",
    "Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
    "EU Blue Guide 2022": "eu_blue_guide_2022",
    "Markets in Crypto-Assets (MiCA)": "mica",
    "Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
    "AML-Verordnung": "amlr",
    "Data Governance Act (DGA)": "dga",
    "Data Act": "data_act",
    "GPSR (EU) 2023/988": "gpsr",
    "IFRS-Übernahmeverordnung": "ifrs",
    "NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
    "NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
    "NIST SP 800-63-3": "nist_sp800_63_3",
    "NIST AI Risk Management Framework": "nist_ai_rmf",
    "NIST SP 800-218 (SSDF)": "nist_sp_800_218",
    "NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
    "OWASP Top 10 (2021)": "owasp_top10",
    "OWASP ASVS 4.0": "owasp_asvs",
    "OWASP SAMM 2.0": "owasp_samm",
    "OWASP API Security Top 10 (2023)": "owasp_api_top10",
    "OWASP MASVS 2.0": "owasp_masvs",
    "ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
    "ENISA Supply Chain Good Practices": "enisa_supply_chain",
    "CISA Secure by Design": "cisa_sbd",
    "Bundesdatenschutzgesetz (BDSG)": "bdsg",
    "Gewerbeordnung (GewO)": "gewo",
    "Handelsgesetzbuch (HGB)": "hgb",
    "Abgabenordnung (AO)": "ao",
    "OECD KI-Empfehlung": "oecd_ai_principles",
}
# License info per regulation code (from REGULATION_LICENSE_MAP)
# rule 1 = original text may be reproduced verbatim; rule 2 = attribution
# license — presumably mirrors control_generator.py semantics; verify there.
LICENSE_MAP = {
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
    "mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
    "nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
    "enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
    "cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
}
# Domain detection keywords
# Keyword hits over (lowercased) control text pick a domain code for the
# control_id prefix; see detect_domain() below.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
    "CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
    "NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
    "DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
    "LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
    "ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
    "SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
    "INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
    "AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
    "COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
    "GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
    "FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
    "ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
}
# ── Prompt (same as control_generator.py) ────────────────────────────
# NOTE: the prompt strings below are runtime payloads sent to the LLM (in
# German, matching the generated controls' language) — do not edit casually.
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
# Appended to the per-article prompt; defines applicability fields and the
# allowed industry / company-size / scope-signal vocabularies.
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
"Abfallwirtschaft", "Forschung"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
"processes_minors_data", "automated_decisions", "employee_monitoring",
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
# Closed category vocabulary the LLM must choose from.
CATEGORY_LIST = [
    "Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
    "Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
    "Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
    "Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
    "Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
    "Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
    "Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
]
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
def build_prompt(source_name, article_label, article_text, license_type):
    """Assemble the per-article German generation prompt for the LLM.

    source_name:   human-readable regulation/standard name for attribution.
    article_label: e.g. "Artikel 10" or "§ 42"; echoed to the model.
    article_text:  raw article text; truncated to 3000 chars to bound tokens.
    license_type:  license string telling the model verbatim use is allowed.
    Returns the complete user prompt (runtime payload — keep wording as is).
    """
    return f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}
Text: {article_text[:3000]}
Quelle: {source_name}, {article_label}"""
# ── PDF article extraction ───────────────────────────────────────────
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article from a PDF.

    pdf_file:      path handed to read_file() when full_text is not given.
    article_label: human label ("Artikel 10", "§ 42", or a NIST/OWASP
                   heading); for law types the first number in it selects
                   the article/paragraph.
    doc_type:      "eu_regulation", "de_law", "nist", or anything else
                   (generic heading search).
    Returns the article text capped at 3000 chars, or "" when not found.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""
    if doc_type == "eu_regulation":
        art_num_match = re.search(r'\d+', article_label)
        if not art_num_match:
            return ""
        num = int(art_num_match.group())
        # EU article headings sit on their own line: "\nArtikel <n>\n".
        pattern = rf'\nArtikel\s+{num}\s*\n'
        match = re.search(pattern, full_text)
        if not match:
            return ""
        start = match.start()
        next_pattern = rf'\nArtikel\s+{num + 1}\s*\n'
        next_match = re.search(next_pattern, full_text)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "de_law":
        para_match = re.search(r'\d+', article_label)
        if not para_match:
            return ""
        num = int(para_match.group())
        # BUGFIX: the pattern was rf'\\s+{num}\b', which the regex engine
        # reads as a literal backslash followed by "s+" — it can never match
        # German law text. Paragraphs are introduced by the section sign
        # ("§ 42"); \s* also tolerates the "§42" spelling.
        pattern = rf'§\s*{num}\b'
        match = re.search(pattern, full_text)
        if not match:
            return ""
        start = match.start()
        next_pattern = rf'§\s*{num + 1}\b'
        next_match = re.search(next_pattern, full_text)
        end = next_match.start() if next_match else min(start + 5000, len(full_text))
        return full_text[start:end].strip()[:3000]
    elif doc_type == "nist":
        # NIST control IDs/headings are matched at the beginning of a line.
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n)\s*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
    else:
        # Generic / OWASP / ENISA
        escaped = re.escape(article_label)
        match = re.search(rf'(?:^|\n).*{escaped}\b', full_text)
        if not match:
            return ""
        start = match.start()
        return full_text[start:start + 3000].strip()
# ── Anthropic API ────────────────────────────────────────────────────
def call_anthropic(prompt, system_prompt):
    """Call Anthropic API. Returns (parsed_data, raw_text, usage, error)."""
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    request_body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }
    try:
        response = requests.post(ANTHROPIC_URL, headers=request_headers, json=request_body, timeout=120)
        if response.status_code != 200:
            return None, "", {}, f"HTTP {response.status_code}: {response.text[:200]}"
        body = response.json()
        # The reply text lives in the first content block, if any.
        raw_text = body["content"][0]["text"] if body.get("content") else ""
        token_usage = body.get("usage", {})
        return parse_json(raw_text), raw_text, token_usage, None
    except Exception as exc:
        # Network failures, JSON decode errors etc. are reported as a
        # string in the error slot; no exception escapes to the caller.
        return None, "", {}, str(exc)
def parse_json(text):
    """Extract a JSON object from an LLM reply, tolerating ``` fences.

    Returns a dict (the first element when the reply is a JSON array),
    or None when nothing parseable is found.
    """
    cleaned = text.strip()
    if cleaned.startswith("```"):
        # Drop the opening fence line, and the closing one when present.
        fence_lines = cleaned.split("\n")
        body = fence_lines[1:]
        if fence_lines[-1].strip().startswith("```"):
            body = fence_lines[1:-1]
        cleaned = "\n".join(body).strip()
    try:
        result = json.loads(cleaned)
    except json.JSONDecodeError:
        # Fall back to the widest {...} span embedded in surrounding prose.
        embedded = re.search(r'\{[\s\S]*\}', cleaned)
        if embedded:
            try:
                return json.loads(embedded.group())
            except json.JSONDecodeError:
                return None
        return None
    if isinstance(result, list):
        return result[0] if result else None
    return result
# ── Domain detection ─────────────────────────────────────────────────
def detect_domain(text):
    """Pick the domain code with the most keyword hits; 'SEC' as fallback.

    Ties keep the first-scoring domain in DOMAIN_KEYWORDS order, matching
    the previous max()-over-dict behavior.
    """
    haystack = text.lower()
    best_domain, best_score = "SEC", 0
    for code, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(kw in haystack for kw in keywords)
        if hits > best_score:
            best_domain, best_score = code, hits
    return best_domain
# ── Control ID generation ────────────────────────────────────────────
def generate_control_id(domain, cur):
    """Return the next free control_id for the given domain prefix.

    Queries MAX over the numeric suffix rather than sorting IDs as text,
    so e.g. COMP-1000 correctly outranks COMP-99. Falls back to
    '<PREFIX>-001' when no numbered control exists yet.
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    highest = cur.fetchone()
    if highest is None or highest[0] is None:
        return f"{prefix}-001"
    return f"{prefix}-{highest[0] + 1}"
# ── Main ─────────────────────────────────────────────────────────────
def main():
    """CLI entry point for Phase 7.4 gap filling.

    Reads gap_analysis_results.json and, for every gap article whose
    source PDF is mapped: extracts the article text, asks the Anthropic
    API to draft a control, and inserts it into
    compliance.canonical_controls. Supports --dry-run, --source substring
    filtering and --resume (skip articles that already have controls).
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load gap results (list of {"source": ..., "gap_articles": [...]})
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")
    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")
    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)

    def connect_db():
        """Create DB connection with TCP keepalive; returns (conn, cursor)."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()

    conn, cur = connect_db()

    def ensure_db():
        """Reconnect if connection is dead. Returns True when a reconnect happened."""
        nonlocal conn, cur
        try:
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False

    # Get framework UUID — every generated control is attached to bp_security_v1.
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]
    # If resuming, load existing (source, article) pairs so they can be skipped.
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")
    # Stats / accounting
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()
    # Pre-read PDFs (cache full text per source) — biggest sources first.
    pdf_cache = {}
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)
        if not filename:
            # No PDF mapped for this source — count every article as skipped.
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue
        print(f"\n{'='*70}")
        print(f"{source_name}{len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")
        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]
            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue
            # Skip non-substantive NIST sections (intro chapters 1-3)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue
            # Extract article text; anything under 30 chars is not worth a control.
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                print(f" SKIP {article_label}: text too short ({len(article_text)} chars)")
                continue
            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue
            # Call Anthropic — errors back off 5s before the next article.
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)
            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)
            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                time.sleep(5)
                continue
            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue
            # Ensure DB is alive before writing (API calls can outlast idle timeouts)
            ensure_db()
            # Build control from the LLM output, coercing every field defensively.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                domain = detect_domain(article_text)
            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"
            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")
            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }
            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }
            category = str(data.get("category", "")) or None
            # Insert into DB; ON CONFLICT keeps reruns idempotent per (framework, control_id).
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # NOTE(review): fetchone() after commit works because psycopg2
                # buffers the RETURNING result client-side at execute time.
                row = cur.fetchone()
                if row:
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    # RETURNING yields no row when ON CONFLICT suppressed the insert.
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")
            # Rate limit: ~0.5s between calls
            time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Cost estimate: $3 per 1M input tokens, $15 per 1M output tokens.
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000
    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()
    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs for downstream phases.
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(f" IDs gespeichert: /tmp/phase74_generated_ids.json")
    if errors:
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")
    conn.close()
# Script entry point.
if __name__ == "__main__":
    main()

218
scripts/qa/run_job.sh Executable file
View File

@@ -0,0 +1,218 @@
#!/usr/bin/env bash
# ─────────────────────────────────────────────────────────────
# Robust job runner for QA scripts on Mac Mini
#
# Usage:
#   ./run_job.sh <script.py> [args...]   # start job
#   ./run_job.sh --status                # show running jobs
#   ./run_job.sh --kill <script.py>      # kill a running job
#   ./run_job.sh --log <script.py>       # tail log
#
# Features:
#   - Loads .env automatically (COMPLIANCE_DATABASE_URL → DATABASE_URL)
#   - PID-file prevents duplicate runs
#   - Unbuffered Python output
#   - Structured log files in /tmp/qa_jobs/
# ─────────────────────────────────────────────────────────────
set -euo pipefail

# Absolute paths: this script's directory and the repo root two levels up.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
# All PID and log files live in one scratch directory.
JOB_DIR="/tmp/qa_jobs"
mkdir -p "$JOB_DIR"
# ── Load .env ────────────────────────────────────────────────
# Export every variable from the project .env (if present), then fall
# back to COMPLIANCE_DATABASE_URL for DATABASE_URL when only the former
# is defined.
load_env() {
    local envfile="$PROJECT_DIR/.env"
    if [[ -f "$envfile" ]]; then
        # Export all vars from .env (set -a marks every assignment for export)
        set -a
        # shellcheck disable=SC1090
        source "$envfile"
        set +a
    fi
    # Map COMPLIANCE_DATABASE_URL → DATABASE_URL if needed
    if [[ -z "${DATABASE_URL:-}" && -n "${COMPLIANCE_DATABASE_URL:-}" ]]; then
        export DATABASE_URL="$COMPLIANCE_DATABASE_URL"
    fi
}
# ── Job name from script path ─────────────────────────────────
# A job is identified by the script's basename without the .py suffix;
# its PID and log files are derived from that name inside $JOB_DIR.
job_name() { basename "$1" .py; }
pid_file() { printf '%s/%s.pid\n' "$JOB_DIR" "$(job_name "$1")"; }
log_file() { printf '%s/%s.log\n' "$JOB_DIR" "$(job_name "$1")"; }
# ── Status ────────────────────────────────────────────────────
# List every job that has a PID file: running jobs get log statistics and
# the last log line; stale PID files (dead process) are removed.
show_status() {
    echo "═══════════════════════════════════════════════════════"
    echo "QA Job Status ($(date '+%Y-%m-%d %H:%M:%S'))"
    echo "═══════════════════════════════════════════════════════"
    local found=0
    for pidfile in "$JOB_DIR"/*.pid; do
        [[ -f "$pidfile" ]] || continue
        found=1
        local name
        name=$(basename "$pidfile" .pid)
        local pid
        pid=$(cat "$pidfile")
        local logf="$JOB_DIR/$name.log"
        if kill -0 "$pid" 2>/dev/null; then
            local lines errors last_line
            # macOS wc pads its output with spaces; arithmetic strips them.
            lines=$(wc -l < "$logf" 2>/dev/null) || lines=0
            lines=$(( ${lines:-0} ))
            # BUG FIX: `grep -c` prints "0" AND exits non-zero when there
            # are no matches, so the old `|| echo 0` produced a two-line
            # value ("0<newline>0"). Capture first, then default only when
            # the capture is empty (log file unreadable).
            errors=$(grep -c "ERROR" "$logf" 2>/dev/null) || true
            errors=${errors:-0}
            last_line=$(tail -1 "$logf" 2>/dev/null || echo "(empty)")
            echo "$name (PID $pid) — RUNNING"
            echo " Log: $logf ($lines lines, $errors errors)"
            echo " Last: $last_line"
        else
            echo "$name (PID $pid) — STOPPED"
            echo " Log: $logf"
            rm -f "$pidfile"
        fi
        echo ""
    done
    if [[ $found -eq 0 ]]; then
        echo " No jobs running."
    fi
}
# ── Kill ──────────────────────────────────────────────────────
# Terminate a running job by script name; the PID file is removed
# whether or not the process was still alive.
kill_job() {
    local script="$1" pf pid
    pf=$(pid_file "$script")
    if [[ ! -f "$pf" ]]; then
        echo "No PID file for $(job_name "$script")"
        return 1
    fi
    pid=$(cat "$pf")
    if kill -0 "$pid" 2>/dev/null; then
        kill "$pid"
        echo "Killed $(job_name "$script") (PID $pid)"
    else
        echo "Process $pid already stopped"
    fi
    rm -f "$pf"
}
# ── Tail log ──────────────────────────────────────────────────
# Print the last 50 lines of a job's log, or fail if none exists.
tail_log() {
    local script="$1" lf
    lf=$(log_file "$script")
    if [[ ! -f "$lf" ]]; then
        echo "No log file: $lf"
        return 1
    fi
    tail -50 "$lf"
}
# ── Start job ─────────────────────────────────────────────────
# Launch a QA script in the background with nohup, write a PID file,
# and verify the process survives its first few seconds.
start_job() {
    local script="$1"
    shift
    local args=("$@")
    # Resolve script path: as given, else relative to this directory.
    local script_path="$script"
    if [[ ! -f "$script_path" ]]; then
        script_path="$SCRIPT_DIR/$script"
    fi
    if [[ ! -f "$script_path" ]]; then
        echo "ERROR: Script not found: $script"
        return 1
    fi
    local name
    name=$(job_name "$script")
    local pf
    pf=$(pid_file "$script")
    local lf
    lf=$(log_file "$script")
    # Refuse to start a second instance of the same job; clear stale PID files.
    if [[ -f "$pf" ]]; then
        local existing_pid
        existing_pid=$(cat "$pf")
        if kill -0 "$existing_pid" 2>/dev/null; then
            echo "ERROR: $name already running (PID $existing_pid)"
            echo "Use: $0 --kill $script"
            return 1
        fi
        rm -f "$pf"
    fi
    # Load environment (.env → DATABASE_URL)
    load_env
    # Verify required env vars
    if [[ -z "${DATABASE_URL:-}" ]]; then
        echo "ERROR: DATABASE_URL not set (checked .env)"
        return 1
    fi
    # Start
    echo "Starting $name..."
    echo " Script: $script_path"
    echo " Args: ${args[*]:-none}"
    echo " Log: $lf"
    # BUG FIX: under `set -u` on bash 3.2 (the macOS default shell this
    # runner targets), expanding an empty array as "${args[@]}" aborts
    # with "unbound variable". ${args[@]+"${args[@]}"} expands to nothing
    # when the array is empty and to the quoted elements otherwise.
    nohup python3 -u "$script_path" ${args[@]+"${args[@]}"} > "$lf" 2>&1 &
    local pid=$!
    echo "$pid" > "$pf"
    echo " PID: $pid"
    echo ""
    # Wait a moment and check it started OK
    sleep 3
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "ERROR: Process died immediately. Log output:"
        cat "$lf"
        rm -f "$pf"
        return 1
    fi
    local lines
    lines=$(wc -l < "$lf" 2>/dev/null) || lines=0
    echo "Running OK ($lines log lines so far)"
    echo "Monitor with: $0 --status"
    echo "Tail log: $0 --log $script"
}
# ── Main ──────────────────────────────────────────────────────
# Dispatch on the first argument; anything that is not a recognized flag
# is treated as a script to start (extra args are passed through).
case "${1:-}" in
    --status|-s)
        show_status
        ;;
    --kill|-k)
        # Requires the script name as the second argument.
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --kill <script.py>"; exit 1; }
        kill_job "$2"
        ;;
    --log|-l)
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --log <script.py>"; exit 1; }
        tail_log "$2"
        ;;
    --help|-h|"")
        echo "Usage:"
        echo " $0 <script.py> [args...] Start a QA job"
        echo " $0 --status Show running jobs"
        echo " $0 --kill <script.py> Kill a running job"
        echo " $0 --log <script.py> Tail job log"
        ;;
    *)
        start_job "$@"
        ;;
esac

307
scripts/qa/sync_db.py Normal file
View File

@@ -0,0 +1,307 @@
#!/usr/bin/env python3
"""Sync canonical control tables between production and local DB.
Modes:
--pull Production → Local (initial sync, full table copy)
--push Local → Production (incremental, only new obligation_candidates)
--loop Run --push every N minutes (default 60)
Usage:
python3 sync_db.py --pull # Full sync production → local
python3 sync_db.py --push # Push new obligations to production
python3 sync_db.py --loop 60 # Push every 60 minutes
python3 sync_db.py --pull --tables canonical_controls # Only one table
"""
import argparse
import json
import os
import sys
import time
import urllib.parse
import io
import psycopg2
import psycopg2.extras
import psycopg2.extensions
# Register JSON adapter so dicts are automatically converted to JSONB
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
# ── DB Config ────────────────────────────────────────────────────────
# NOTE(review): a live production password is hard-coded below as the
# fallback default. This leaks the credential to anyone with repository
# access — rotate it and make PROD_DATABASE_URL mandatory via the
# environment instead of shipping a default.
PROD_URL = os.environ.get(
    "PROD_DATABASE_URL",
    "postgresql://postgres:GmyFD3wnU1NrKBdpU1nwLdE8MLts0A0eez8L5XXdvUCe05lWnWfVp3C6JJ8Yrmt2"
    "@46.225.100.82:54321/postgres?sslmode=require",
)
LOCAL_URL = os.environ.get(
    "LOCAL_DATABASE_URL",
    "postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
)
# All synced objects live in this PostgreSQL schema.
SCHEMA = "compliance"
# Tables to sync (production → local) with --pull; each is dropped and
# recreated locally as a plain data copy.
SYNC_TABLES = [
    "canonical_control_frameworks",
    "canonical_control_licenses",
    "canonical_control_sources",
    "canonical_control_categories",
    "canonical_blocked_sources",
    "canonical_controls",
    "canonical_control_mappings",
    "canonical_processed_chunks",
    "canonical_generation_jobs",
    "control_patterns",
    "crosswalk_matrix",
    "obligation_extractions",
    "obligation_candidates",
]
def connect(url, label="DB"):
    """Open a psycopg2 connection from a URL.

    Enables TCP keepalives (so long idle syncs survive NAT timeouts),
    pins search_path to the compliance schema, honours an sslmode query
    parameter, and leaves autocommit off.
    """
    parts = urllib.parse.urlparse(url)
    query = dict(urllib.parse.parse_qsl(parts.query))
    connection = psycopg2.connect(
        host=parts.hostname,
        port=parts.port or 5432,
        user=parts.username,
        password=parts.password,
        dbname=parts.path.lstrip("/"),
        sslmode=query.get("sslmode", "prefer"),
        options=f"-c search_path={SCHEMA},public",
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5,
    )
    connection.autocommit = False
    print(f" Connected to {label} ({parts.hostname}:{parts.port or 5432})")
    return connection
def get_columns(cur, table):
    """Return the table's column names in ordinal order.

    Uses bind parameters instead of f-string interpolation so the schema
    and table names can never break (or inject into) the SQL.
    """
    cur.execute(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    return [r[0] for r in cur.fetchall()]
def pull_table(prod_conn, local_conn, table):
    """Copy one table wholesale from production to local.

    The local table is dropped and recreated as a bare data copy (no
    constraints, defaults or indexes — it exists only for local reads),
    then filled via execute_batch. Returns the number of rows copied.
    """
    prod_cur = prod_conn.cursor()
    local_cur = local_conn.cursor()
    # Check table exists on production
    prod_cur.execute(f"""
        SELECT 1 FROM pg_tables
        WHERE schemaname = '{SCHEMA}' AND tablename = '{table}'
    """)
    if not prod_cur.fetchone():
        print(f" SKIP {table} — not found on production")
        return 0
    # Drop local table
    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
    local_conn.commit()
    # Build simple CREATE TABLE (no constraints, no defaults — just for data)
    prod_cur.execute(f"""
        SELECT column_name, data_type, udt_name, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = '{SCHEMA}' AND table_name = '{table}'
        ORDER BY ordinal_position
    """)
    col_defs = prod_cur.fetchall()
    parts = []
    col_names = []
    jsonb_cols = set()
    for name, dtype, udt, max_len in col_defs:
        col_names.append(name)
        if dtype == "ARRAY":
            # information_schema reports array columns with udt "_<element>".
            type_map = {
                "_text": "text[]", "_varchar": "varchar[]",
                "_int4": "integer[]", "_uuid": "uuid[]",
                "_jsonb": "jsonb[]", "_float8": "float8[]",
            }
            sql_type = type_map.get(udt, f"{udt.lstrip('_')}[]")
        elif dtype == "USER-DEFINED" and udt == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif dtype == "USER-DEFINED":
            sql_type = udt
        elif dtype == "jsonb":
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif max_len:
            # BUG FIX: the length qualifier must come from the column's own
            # character_maximum_length — the previous hard-coded "(70,526)"
            # produced invalid DDL like "character varying(70,526)".
            sql_type = f"{dtype}({max_len})"
        else:
            sql_type = dtype
        parts.append(f'"{name}" {sql_type}')
    ddl = f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})"
    local_cur.execute(ddl)
    local_conn.commit()
    # Fetch all rows from production
    col_list = ", ".join(f'"{c}"' for c in col_names)
    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
    rows = prod_cur.fetchall()
    if rows:
        # Wrap dict/list values in Json for JSONB columns so psycopg2 can
        # adapt them on insert.
        adapted_rows = []
        for row in rows:
            adapted = []
            for i, val in enumerate(row):
                if col_names[i] in jsonb_cols and isinstance(val, (dict, list)):
                    adapted.append(psycopg2.extras.Json(val))
                else:
                    adapted.append(val)
            adapted_rows.append(tuple(adapted))
        placeholders = ", ".join(["%s"] * len(col_names))
        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
        local_conn.commit()
    print(f" {table}: {len(rows)} rows")
    return len(rows)
def pull(tables=None):
    """Full sync from production to local, one table at a time.

    Per-table failures are reported and rolled back, then the loop
    continues with the next table.
    """
    print("\n=== PULL: Production → Local ===\n")
    prod_conn = connect(PROD_URL, "Production")
    local_conn = connect(LOCAL_URL, "Local")
    # Make sure the target schema exists before copying anything into it.
    schema_cur = local_conn.cursor()
    schema_cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    local_conn.commit()
    table_list = tables if tables else SYNC_TABLES
    copied = 0
    for table in table_list:
        try:
            copied += pull_table(prod_conn, local_conn, table)
        except Exception as exc:
            print(f" ERROR {table}: {exc}")
            local_conn.rollback()
            prod_conn.rollback()
    print(f"\n Total: {copied} rows synced")
    prod_conn.close()
    local_conn.close()
def push():
    """Incremental push: copy obligation_candidates that exist locally but
    not yet on production.

    candidate_id is used as the natural key; inserts are protected by
    ON CONFLICT DO NOTHING. Returns the number of rows pushed.
    """
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")
    local_conn = connect(LOCAL_URL, "Local")
    prod_conn = connect(PROD_URL, "Production")
    local_cur = local_conn.cursor()
    prod_cur = prod_conn.cursor()
    # Compare the candidate_id sets on both sides.
    local_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    local_ids = {r[0] for r in local_cur.fetchall()}
    if not local_ids:
        print(" No obligation_candidates in local DB")
        local_conn.close()
        prod_conn.close()
        return 0
    # Check which already exist on production
    prod_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    prod_ids = {r[0] for r in prod_cur.fetchall()}
    new_ids = local_ids - prod_ids
    if not new_ids:
        print(f" All {len(local_ids)} obligations already on production")
        local_conn.close()
        prod_conn.close()
        return 0
    print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")
    # Get columns
    columns = get_columns(local_cur, "obligation_candidates")
    col_list = ", ".join(columns)
    placeholders = ", ".join(["%s"] * len(columns))
    # HARDENING: fetch the new rows with a bound ANY(%s) array instead of
    # interpolating quoted IDs into the SQL string — an ID containing a
    # quote can no longer break (or inject into) the query, and the
    # statement stays small regardless of how many IDs there are.
    local_cur.execute(
        f"""
        SELECT {col_list} FROM {SCHEMA}.obligation_candidates
        WHERE candidate_id = ANY(%s)
        """,
        (list(new_ids),),
    )
    rows = local_cur.fetchall()
    # Insert into production
    insert_sql = f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
    psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
    prod_conn.commit()
    print(f" Pushed {len(rows)} obligations to production")
    local_conn.close()
    prod_conn.close()
    return len(rows)
def loop(interval_min):
    """Run push() forever, once every interval_min minutes.

    Push errors are printed and swallowed so a transient network failure
    does not kill the loop; stop with Ctrl+C.
    """
    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
    print(f" Started at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(f" Press Ctrl+C to stop\n")
    while True:
        try:
            if push():
                print(f" Next sync in {interval_min} min...")
        except Exception as exc:
            print(f" SYNC ERROR: {exc}")
        time.sleep(interval_min * 60)
def main():
    """CLI entry point: dispatch --pull / --push / --loop (not mutually
    exclusive; they run in that order when combined)."""
    parser = argparse.ArgumentParser(description="Sync canonical control tables")
    parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)")
    parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)")
    parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes")
    parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)")
    args = parser.parse_args()
    if not (args.pull or args.push or args.loop):
        parser.print_help()
        return
    if args.pull:
        pull(args.tables)
    if args.push:
        push()
    if args.loop:
        loop(args.loop)
# Script entry point.
if __name__ == "__main__":
    main()

470
scripts/qa/test_pass0a.py Normal file
View File

@@ -0,0 +1,470 @@
#!/usr/bin/env python3
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.
Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
Copies prompts and quality gate from decomposition_pass.py.
Usage:
python3 test_pass0a.py # 10 controls, Anthropic
python3 test_pass0a.py --limit 5 # 5 controls
python3 test_pass0a.py --source "DSGVO" # filter by source
python3 test_pass0a.py --dry-run # show controls, no LLM call
"""
import argparse
import json
import os
import re
import sys
import time
import urllib.parse
import psycopg2
import requests
# ── Config ────────────────────────────────────────────────────────────
# API key must come from the environment; the model can be overridden
# via DECOMPOSITION_LLM_MODEL.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
# ── Prompts (from decomposition_pass.py) ──────────────────────────────
# NOTE: the prompt is intentionally German — the corpus and expected
# obligations are German-language. It is runtime text sent to the model;
# do not translate or reformat it.
SYSTEM_PROMPT = """\
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
in einzelne atomare Pflichten.
REGELN (STRIKT EINHALTEN):
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
ist zu testen, shall, must, required.
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
eigenes Control, sondern Evidence).
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
— NICHT extrahieren.
Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Build the Pass-0a user prompt for one control.

    All arguments are pre-formatted display strings (see fmt_json). The
    answer format is pinned by an inline JSON-array example; the doubled
    braces are f-string escapes for literal JSON braces.
    """
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.
CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}
Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""
# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
# Tier 1: Pflicht (mandatory) — hard normative phrasings in German and
# English: "müssen", "ist/sind zu <verb>en", separable zu-infinitives
# ("zuteilen", "zuführen", ...), and shall/must/required.
_PFLICHT_RE = re.compile(
    r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
    r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
    r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
    r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
    r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
    r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
    r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
    r"|\bshall\b|\bmust\b|\brequired\b"
    r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
    r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
    r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
    r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
    re.IGNORECASE,
)
# Tier 2: Empfehlung (recommendation) — soft normative verbs
# ("soll/sollte", should/ensure/recommend) and common compliance verbs.
_EMPFEHLUNG_RE = re.compile(
    r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
    r"|\bgewährleisten\b|\bsicherstellen\b"
    r"|\bshould\b|\bensure\b|\brecommend\w*\b"
    r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
    r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
    r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
    re.IGNORECASE,
)
# Tier 3: Kann (optional/permissive) — kann/darf/may/optional.
_KANN_RE = re.compile(
    r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
    re.IGNORECASE,
)
# Union of all three tiers (backward compat: "any normative signal").
_NORMATIVE_RE = re.compile(
    _PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
    re.IGNORECASE,
)
# Markers that a sentence is justification/recital text, not an obligation.
_RATIONALE_RE = re.compile(
    r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
    re.IGNORECASE,
)
# Markers for test/audit obligations (is_test_obligation).
_TEST_RE = re.compile(
    r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
    r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
    re.IGNORECASE,
)
# Markers for notification/reporting obligations (is_reporting_obligation).
_REPORTING_RE = re.compile(
    r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
    r"|\bnotif|\breport\b|\bbehörd",
    re.IGNORECASE,
)
def classify_obligation_type(txt):
    """Map text to its strongest normative tier.

    Priority order: pflicht > empfehlung > kann; defaults to
    "empfehlung" when no tier matches.
    """
    tiers = (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    )
    for pattern, tier in tiers:
        if pattern.search(txt):
            return tier
    return "empfehlung"
def quality_gate(obl_text, parent_uuid):
    """Validate and classify a single extracted obligation.

    Returns (flags, passed, confidence, obligation_type). Only the
    critical checks (not evidence-only, minimum length, parent link)
    gate `passed`; the normative-signal check is purely informational.
    Confidence is a weighted sum over the passing checks, with a small
    bonus for a "pflicht" classification, capped at 1.0.
    """
    flags = {}
    # 1. Normative signal (informational)
    flags["has_normative_signal"] = _NORMATIVE_RE.search(obl_text) is not None
    # 1b. Obligation type classification
    obl_type = classify_obligation_type(obl_text)
    flags["obligation_type"] = obl_type
    # 2. Single action — flag text that chains multiple obligations with
    # und/sowie/als auch followed by another normative verb.
    multi_verb_re = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    flags["single_action"] = multi_verb_re.search(obl_text) is None
    # 3. Not rationale — normative markers must at least match rationale markers.
    flags["not_rationale"] = len(_NORMATIVE_RE.findall(obl_text)) >= len(_RATIONALE_RE.findall(obl_text))
    # 4. Not evidence-only — reject text that starts like an evidence artifact.
    evidence_only_re = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )
    flags["not_evidence_only"] = evidence_only_re.match(obl_text.strip()) is None
    # 5. Min length
    flags["min_length"] = len(obl_text.strip()) >= 20
    # 6. Parent link
    flags["has_parent_link"] = bool(parent_uuid)
    # Weighted confidence over the passing boolean checks.
    weights = {
        "has_normative_signal": 0.25, "single_action": 0.20,
        "not_rationale": 0.20, "not_evidence_only": 0.15,
        "min_length": 0.10, "has_parent_link": 0.05,
    }
    confidence = sum(weight for key, weight in weights.items() if flags.get(key))
    # Bonus for pflicht classification
    if obl_type == "pflicht":
        confidence = min(confidence + 0.05, 1.0)
    # Pass check — has_normative_signal is NO LONGER critical
    critical = ("not_evidence_only", "min_length", "has_parent_link")
    passed = all(flags.get(key, False) for key in critical)
    return flags, passed, confidence, obl_type
# ── JSON parsing ──────────────────────────────────────────────────────
def parse_json_array(text):
    """Parse an LLM response into a list.

    Accepts a bare JSON array, a single JSON object (wrapped into a
    one-element list), or falls back to the first [...] span found in
    the text. Returns [] when nothing usable parses.
    """
    try:
        value = json.loads(text)
    except json.JSONDecodeError:
        value = None
    if isinstance(value, list):
        return value
    if isinstance(value, dict):
        return [value]
    bracket = re.search(r"\[[\s\S]*\]", text)
    if bracket is not None:
        try:
            fallback = json.loads(bracket.group())
        except json.JSONDecodeError:
            return []
        if isinstance(fallback, list):
            return fallback
    return []
# ── API call ──────────────────────────────────────────────────────────
def call_anthropic(prompt):
    """POST one prompt to the Anthropic Messages API.

    The system prompt is sent with ephemeral cache_control so repeated
    calls reuse the prompt cache. Returns (text, usage, error); error is
    None on success, and text is None on an HTTP error.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post(f"{ANTHROPIC_API_URL}/messages", headers=request_headers, json=body, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    blocks = data.get("content", [])
    reply = blocks[0].get("text", "") if blocks else ""
    return reply, data.get("usage", {}), None
# ── Format helpers ────────────────────────────────────────────────────
def fmt_json(val):
    """Render a JSONB value as display text.

    None becomes "", JSON strings are decoded first (returned verbatim if
    they do not parse), lists become one bulleted line per item, and
    anything else is str()-ed.
    """
    if val is None:
        return ""
    if isinstance(val, str):
        try:
            val = json.loads(val)
        except (json.JSONDecodeError, TypeError):
            return val
    if not isinstance(val, list):
        return str(val)
    return "\n".join(f" - {item}" for item in val)
# ── Main ──────────────────────────────────────────────────────────────
def main():
    """Pass 0a sample run: pull a diverse sample of canonical controls,
    ask the LLM to extract normative obligations from each, classify them
    (pflicht / empfehlung / kann) via quality_gate, print per-obligation
    results plus a summary with a cost projection, and dump everything to
    a JSON file under /tmp for later analysis.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    # The API key is only needed when we actually call the model.
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # DATABASE_URL is required — a KeyError here is an intentional hard fail.
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        # Resolve unqualified table names against the compliance schema first.
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()
    # Select diverse sample
    # Root controls only (no parent link), still active, with enough text
    # (>100 chars of objective+requirements) to be worth decomposing.
    query = """
        SELECT id, control_id, title, objective, requirements,
               test_procedure, source_citation, category
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
          AND parent_control_uuid IS NULL
          AND title IS NOT NULL AND objective IS NOT NULL
          AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
    """
    params = []
    if args.source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{args.source}%")
    # Group by source, randomized within each source, for sample diversity.
    query += " ORDER BY source_citation->>'source', random()"
    # args.limit is an int (argparse type=int), so direct interpolation is safe.
    query += f" LIMIT {args.limit}"
    cur.execute(query, params)
    controls = cur.fetchall()
    if not controls:
        print("No controls found.")
        return
    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")
    # Aggregates across all controls / obligations.
    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    all_results = []
    t_start = time.time()
    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row
        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            # source_citation may arrive as a dict (jsonb) or a JSON string.
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"
        print(f"\n{''*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f" Source: {source_str} | Category: {category or 'N/A'}")
        print(f" Objective: {(objective or '')[:200]}")
        # --dry-run: show the sample without spending tokens.
        if args.dry_run:
            print(" [DRY RUN]")
            continue
        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)
        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok
        obligations = parse_json_array(response_text)
        total_obls += len(obligations)
        print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{f' ({cached} cached)' if cached else ''}"
              f" | {len(obligations)} obligation(s)")
        for j, obl in enumerate(obligations, 1):
            obl_text = obl.get("obligation_text", "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))
            # Auto-detect
            # Regex backstop in case the model missed the test/reporting flags.
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True
            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1
            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"
            # Show type instead of PASS/REJECT
            type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
            if not passed:
                status = "REJECT"
            else:
                status = type_label.get(obl_type, "EMPFEHLUNG")
            # Names of the boolean gate checks that failed (shown on REJECT).
            failed = [k for k, v in flags.items()
                      if isinstance(v, bool) and not v]
            print(f"\n {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f" {obl_text}")
            print(f" Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f" Bedingung: {condition}")
            if not passed:
                print(f" Abgelehnt: {', '.join(failed)}")
            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": {k: v for k, v in flags.items()},
            })
        # Small pause between controls to stay under API rate limits.
        time.sleep(0.5)
    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    # Pricing assumption: $3 / 1M input tokens, $15 / 1M output tokens.
    cost = (total_in * 3 + total_out * 15) / 1_000_000
    total_classified = sum(type_counts.values())
    print(f"\n\n{'='*70}")
    print(f"ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f" Controls: {len(controls)}")
    print(f" Obligations: {total_obls} ({total_obls/max(len(controls),1):.1f} pro Control)")
    print(f" ── Klassifizierung ──")
    print(f" Pflicht: {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
    print(f" Empfehlung: {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
    print(f" Kann: {type_counts['kann']}"
          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
    print(f" Rejected: {total_rejected}"
          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
          f" (nur evidence-only/zu kurz/kein parent)")
    print(f" ── Kosten ──")
    print(f" Laufzeit: {elapsed_total:.1f}s")
    print(f" Tokens: {total_in:,} in / {total_out:,} out")
    print(f" Kosten: ${cost:.4f}")
    # Linear extrapolation from this sample to a full corpus of 6000 controls.
    if len(controls) > 0 and not args.dry_run and total_obls > 0:
        n = 6000
        factor = n / len(controls)
        print(f"\n --- Hochrechnung auf {n:,} Controls ---")
        print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
        print(f" Kosten: ${cost * factor:.2f}")
        print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
        print(f" Obligations: ~{int(total_obls / len(controls) * n):,}")
        pf = int(type_counts['pflicht'] * factor)
        ef = int(type_counts['empfehlung'] * factor)
        kf = int(type_counts['kann'] * factor)
        print(f" Pflicht: ~{pf:,}")
        print(f" Empfehlung: ~{ef:,}")
        print(f" Kann: ~{kf:,}")
    # Save results JSON for later analysis
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        with open(out_path, "w") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n Ergebnisse gespeichert: {out_path}")
    conn.close()
if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,308 @@
#!/usr/bin/env python3
"""Preview Pass 0b: Turn obligation candidates into atomic controls.
Picks a few obligations from Pass 0a results, calls LLM to compose
atomic controls, and writes them to canonical_controls with parent_control_uuid.
Usage:
python3 test_pass0b_preview.py --input /tmp/pass0a_results_60controls.json --limit 3
"""
import argparse
import json
import os
import re
import sys
import time
import uuid
import urllib.parse
import psycopg2
import psycopg2.extras
import requests
# Register JSON adapter
# Lets plain Python dicts bind directly as JSON/JSONB query parameters.
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
# API key comes from the environment; the model can be overridden per run.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.
Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
Antworte NUR als JSON. Keine Erklärungen."""
def build_pass0b_prompt(obl_text, action, obj, parent_title, category, source_ref):
    """Compose the Pass-0b user prompt: turn one normative obligation
    (plus context from its originating control) into a single atomic
    control, answered as a German JSON object.
    """
    prompt = f"""\
Erstelle aus der folgenden Pflicht ein atomares Control.
PFLICHT: {obl_text}
HANDLUNG: {action}
GEGENSTAND: {obj}
KONTEXT (Ursprungs-Control):
Titel: {parent_title}
Kategorie: {category}
Quellreferenz: {source_ref}
Antworte als JSON:
{{
"title": "Kurzer Titel (max 80 Zeichen, deutsch)",
"objective": "Was muss erreicht werden? (1-2 Sätze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
"test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
"evidence": ["Nachweis 1", "Nachweis 2"],
"severity": "critical|high|medium|low",
"category": "security|privacy|governance|operations|finance|reporting"
}}"""
    return prompt
def call_anthropic(prompt):
    """Send a single-turn user prompt to the Anthropic Messages API.

    The system prompt carries an ephemeral cache_control marker so the
    shared prefix can be served from the prompt cache on repeated calls.

    Returns a (text, usage, error) triple: on HTTP failure text is None
    and error holds "HTTP <status>: <snippet>"; on success error is None.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post("https://api.anthropic.com/v1/messages", headers=headers, json=payload, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    # Fix: the previous pattern data.get("content", [{}])[0] raised
    # IndexError when the API returned "content": [] — the [{}] default
    # only applies when the key is missing, not when the list is empty.
    content = data.get("content") or [{}]
    text = content[0].get("text", "")
    return text, data.get("usage", {}), None
def parse_json_object(text):
    """Parse a JSON object from LLM output, tolerating surrounding prose.

    Returns a dict, or None when no JSON *object* can be extracted.
    Fix: the previous version returned whatever json.loads produced, so a
    top-level JSON array or scalar leaked through and the caller crashed
    on result.get(...). Non-dict parses now fall through to the regex
    fallback and ultimately to None.
    """
    try:
        result = json.loads(text)
        if isinstance(result, dict):
            return result
    except json.JSONDecodeError:
        pass
    # Fallback: greedy span from the first '{' to the last '}'.
    match = re.search(r"\{[\s\S]*\}", text)
    if match:
        try:
            result = json.loads(match.group())
            if isinstance(result, dict):
                return result
        except json.JSONDecodeError:
            pass
    return None
def generate_control_id(domain, cur):
    """Allocate the next sequential control_id for a domain prefix.

    The prefix is the first four characters of `domain`, upper-cased.
    Queries the numeric maximum over existing "<PREFIX>-<n>" ids and
    returns "<PREFIX>-<n+1>"; returns "<PREFIX>-001" when none exist.
    Fix: the increment is now zero-padded to three digits (:03d) so new
    ids match the "<PREFIX>-001" convention instead of producing mixed
    formats like "DSGV-42" next to "DSGV-001".
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    if row and row[0] is not None:
        return f"{prefix}-{row[0] + 1:03d}"
    return f"{prefix}-001"
def main():
    """Pass 0b preview: load obligation candidates from a Pass-0a result
    file, compose one atomic control per picked obligation via the LLM,
    and insert each as a draft row in compliance.canonical_controls with
    parent_control_uuid pointing at the originating control.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="/tmp/pass0a_results_60controls.json")
    parser.add_argument("--limit", type=int, default=3, help="Number of obligations to process")
    parser.add_argument("--control", type=str, help="Pick obligations from this control_id")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()
    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)
    # Load 0a results
    with open(args.input) as f:
        obligations = json.load(f)
    # Filter: only passed, pflicht or empfehlung
    # (note: only the `passed` flag is checked here; no type filter is applied)
    obligations = [o for o in obligations if o.get("passed", False)]
    if args.control:
        obligations = [o for o in obligations if o["control_id"] == args.control]
    # Pick diverse sample
    # First pass takes one obligation per distinct obligation_type; the
    # second pass tops the list up to --limit regardless of type.
    picked = []
    seen_types = set()
    for o in obligations:
        otype = o["obligation_type"]
        if otype not in seen_types and len(picked) < args.limit:
            picked.append(o)
            seen_types.add(otype)
    # Fill rest
    for o in obligations:
        if o not in picked and len(picked) < args.limit:
            picked.append(o)
    if not picked:
        print("No obligations found.")
        return
    # Connect to DB
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()
    # Get parent control info
    ctrl_ids = list(set(o["control_id"] for o in picked))
    cur.execute("""
        SELECT control_id, id, title, category, source_citation
        FROM compliance.canonical_controls
        WHERE control_id = ANY(%s)
    """, (ctrl_ids,))
    ctrl_map = {}
    for row in cur.fetchall():
        # source_citation may arrive as a dict (jsonb), a JSON string, or NULL.
        sc = row[4] if isinstance(row[4], dict) else (json.loads(row[4]) if row[4] else {})
        # Derive domain prefix from control_id (e.g. "DSGV" from "DSGV-001")
        prefix = row[0].split("-")[0] if "-" in row[0] else "COMP"
        ctrl_map[row[0]] = {
            "uuid": str(row[1]), "title": row[2], "category": row[3] or "",
            "source_ref": f"{sc.get('source', '')} {sc.get('article', '')}",
            "domain": prefix,
        }
    print("=" * 70)
    print(f"Pass 0b Preview — {len(picked)} Obligations → Atomic Controls")
    print("=" * 70)
    created = []
    for i, obl in enumerate(picked, 1):
        ctrl = ctrl_map.get(obl["control_id"], {})
        print(f"\n{''*70}")
        print(f"[{i}/{len(picked)}] {obl['control_id']}: [{obl['obligation_type'].upper()}]")
        print(f" Obligation: {obl['obligation_text'][:120]}")
        print(f" Parent: {ctrl.get('title', 'N/A')}")
        # --dry-run: show what would be processed without API/DB writes.
        if args.dry_run:
            print(" [DRY RUN]")
            continue
        prompt = build_pass0b_prompt(
            obl["obligation_text"], obl["action"], obl["object"],
            ctrl.get("title", ""), ctrl.get("category", ""),
            ctrl.get("source_ref", ""),
        )
        t0 = time.time()
        resp_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0
        if error:
            print(f" ERROR: {error}")
            continue
        result = parse_json_object(resp_text)
        if not result:
            print(f" PARSE ERROR: {resp_text[:200]}")
            continue
        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        print(f" LLM: {elapsed:.1f}s | {in_tok} in / {out_tok} out")
        # Generate control_id
        domain = ctrl.get("domain", "COMP")
        new_control_id = generate_control_id(domain, cur)
        # Show result
        print(f"\n === ATOMIC CONTROL: {new_control_id} ===")
        print(f" Titel: {result.get('title', 'N/A')}")
        print(f" Ziel: {result.get('objective', 'N/A')}")
        print(f" Typ: {obl['obligation_type']}")
        reqs = result.get("requirements", [])
        if reqs:
            print(f" Anforderungen:")
            for r in reqs:
                print(f" - {r}")
        tests = result.get("test_procedure", [])
        if tests:
            print(f" Pruefverfahren:")
            for t in tests:
                print(f" - {t}")
        evidence = result.get("evidence", [])
        if evidence:
            print(f" Nachweise:")
            for e in evidence:
                print(f" - {e}")
        print(f" Severity: {result.get('severity', 'medium')}")
        print(f" Category: {result.get('category', 'governance')}")
        # Write to DB
        new_uuid = str(uuid.uuid4())
        parent_uuid = ctrl.get("uuid")
        source_cit = {}
        if ctrl.get("source_ref"):
            # source_ref was joined as "<source> <article>"; split it back apart.
            parts = ctrl["source_ref"].strip().split(" ", 1)
            source_cit = {"source": parts[0], "article": parts[1] if len(parts) > 1 else ""}
        # NOTE(review): framework_id is taken from an arbitrary framework row
        # (LIMIT 1 without ORDER BY) — confirm this is acceptable for previews.
        cur.execute("""
            INSERT INTO compliance.canonical_controls (
                id, control_id, title, objective, requirements, test_procedure,
                evidence, severity, category, release_state,
                source_citation, generation_metadata, generation_strategy,
                pipeline_version, parent_control_uuid, framework_id
            ) VALUES (
                %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s,
                %s, %s, %s,
                %s, %s,
                (SELECT id FROM compliance.canonical_control_frameworks LIMIT 1)
            )
        """, (
            new_uuid, new_control_id,
            result.get("title", ""),
            result.get("objective", ""),
            json.dumps(result.get("requirements", []), ensure_ascii=False),
            json.dumps(result.get("test_procedure", []), ensure_ascii=False),
            json.dumps(result.get("evidence", []), ensure_ascii=False),
            result.get("severity", "medium"),
            result.get("category", "governance"),
            "draft",
            psycopg2.extras.Json(source_cit),
            psycopg2.extras.Json({
                "obligation_type": obl["obligation_type"],
                "obligation_text": obl["obligation_text"],
                "pass0b_model": ANTHROPIC_MODEL,
                "decomposition_method": "pass0b_preview",
            }),
            "pass0b_atomic",
            6,  # pipeline_version
            parent_uuid,
        ))
        # Commit per row so generate_control_id sees the new MAX next iteration.
        conn.commit()
        created.append({
            "control_id": new_control_id,
            "title": result.get("title", ""),
            "obligation_type": obl["obligation_type"],
            "parent_control_id": obl["control_id"],
        })
        print(f" ✓ Geschrieben: {new_control_id} (parent: {obl['control_id']})")
        # Small pause between calls to stay under API rate limits.
        time.sleep(0.5)
    if created:
        print(f"\n{'='*70}")
        print(f"ERGEBNIS: {len(created)} atomare Controls erstellt")
        print(f"{'='*70}")
        for c in created:
            print(f" {c['control_id']}: {c['title']} [{c['obligation_type']}] (von {c['parent_control_id']})")
    conn.close()
if __name__ == "__main__":
    main()