feat: Control Library UI, dedup migration, QA tooling, docs
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 31s
CI/CD / test-python-backend-compliance (push) Successful in 1m35s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
- Control Library: parent control display, ObligationTypeBadge, GenerationStrategyBadge variants, evidence string fallback - API: expose parent_control_uuid/id/title in canonical controls - Fix: DSFA SQLAlchemy 2.0 Row._mapping compatibility - Migration 074: control_parent_links + control_dedup_reviews tables - QA scripts: benchmark, gap analysis, OSCAL import, OWASP cleanup, phase5 normalize, phase74 gap fill, sync_db, run_job - Docs: dedup engine, RAG benchmark, lessons learned, pipeline docs Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -9,6 +9,7 @@ import {
|
||||
import {
|
||||
CanonicalControl, EFFORT_LABELS, BACKEND_URL,
|
||||
SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge,
|
||||
ObligationTypeBadge, GenerationStrategyBadge,
|
||||
VERIFICATION_METHODS, CATEGORY_OPTIONS,
|
||||
} from './helpers'
|
||||
|
||||
@@ -125,6 +126,8 @@ export function ControlDetail({
|
||||
<VerificationMethodBadge method={ctrl.verification_method} />
|
||||
<CategoryBadge category={ctrl.category} />
|
||||
<TargetAudienceBadge audience={ctrl.target_audience} />
|
||||
<GenerationStrategyBadge strategy={ctrl.generation_strategy} />
|
||||
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
|
||||
</div>
|
||||
<h2 className="text-lg font-semibold text-gray-900 mt-1">{ctrl.title}</h2>
|
||||
</div>
|
||||
@@ -239,6 +242,32 @@ export function ControlDetail({
|
||||
</section>
|
||||
)}
|
||||
|
||||
{/* Parent Control (atomare Controls) */}
|
||||
{ctrl.parent_control_uuid && (
|
||||
<section className="bg-violet-50 border border-violet-200 rounded-lg p-4">
|
||||
<div className="flex items-center gap-2 mb-1">
|
||||
<GitMerge className="w-4 h-4 text-violet-600" />
|
||||
<h3 className="text-sm font-semibold text-violet-900">Atomares Control</h3>
|
||||
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
|
||||
</div>
|
||||
<p className="text-sm text-violet-800">
|
||||
Abgeleitet aus Eltern-Control{' '}
|
||||
<span className="font-mono font-semibold text-purple-700 bg-purple-100 px-1.5 py-0.5 rounded">
|
||||
{ctrl.parent_control_id || ctrl.parent_control_uuid}
|
||||
</span>
|
||||
{ctrl.parent_control_title && (
|
||||
<span className="text-violet-700 ml-1">— {ctrl.parent_control_title}</span>
|
||||
)}
|
||||
</p>
|
||||
{ctrl.generation_metadata?.obligation_text && (
|
||||
<p className="text-xs text-violet-600 mt-2 bg-violet-100/50 rounded p-2">
|
||||
Obligation: {String(ctrl.generation_metadata.obligation_text).slice(0, 300)}
|
||||
{String(ctrl.generation_metadata.obligation_text).length > 300 ? '...' : ''}
|
||||
</p>
|
||||
)}
|
||||
</section>
|
||||
)}
|
||||
|
||||
{/* Impliziter Gesetzesbezug (Rule 3 — reformuliert, kein Originaltext) */}
|
||||
{!ctrl.source_citation && ctrl.open_anchors.length > 0 && (
|
||||
<section className="bg-amber-50 border border-amber-200 rounded-lg p-3">
|
||||
@@ -297,7 +326,7 @@ export function ControlDetail({
|
||||
</section>
|
||||
)}
|
||||
|
||||
{/* Evidence */}
|
||||
{/* Evidence — handles both {type, description} objects and plain strings */}
|
||||
{ctrl.evidence.length > 0 && (
|
||||
<section>
|
||||
<h3 className="text-sm font-semibold text-gray-900 mb-2">Nachweise</h3>
|
||||
@@ -305,7 +334,11 @@ export function ControlDetail({
|
||||
{ctrl.evidence.map((ev, i) => (
|
||||
<div key={i} className="flex items-start gap-2 text-sm text-gray-700">
|
||||
<FileText className="w-4 h-4 text-gray-400 flex-shrink-0 mt-0.5" />
|
||||
<div><span className="font-medium">{ev.type}:</span> {ev.description}</div>
|
||||
{typeof ev === 'string' ? (
|
||||
<div>{ev}</div>
|
||||
) : (
|
||||
<div><span className="font-medium">{ev.type}:</span> {ev.description}</div>
|
||||
)}
|
||||
</div>
|
||||
))}
|
||||
</div>
|
||||
@@ -359,7 +392,18 @@ export function ControlDetail({
|
||||
<h3 className="text-sm font-semibold text-gray-700">Generierungsdetails (intern)</h3>
|
||||
</div>
|
||||
<div className="text-xs text-gray-600 space-y-1">
|
||||
<p>Pfad: {String(ctrl.generation_metadata.processing_path || '-')}</p>
|
||||
{ctrl.generation_metadata.processing_path && (
|
||||
<p>Pfad: {String(ctrl.generation_metadata.processing_path)}</p>
|
||||
)}
|
||||
{ctrl.generation_metadata.decomposition_method && (
|
||||
<p>Methode: {String(ctrl.generation_metadata.decomposition_method)}</p>
|
||||
)}
|
||||
{ctrl.generation_metadata.pass0b_model && (
|
||||
<p>LLM: {String(ctrl.generation_metadata.pass0b_model)}</p>
|
||||
)}
|
||||
{ctrl.generation_metadata.obligation_type && (
|
||||
<p>Obligation-Typ: {String(ctrl.generation_metadata.obligation_type)}</p>
|
||||
)}
|
||||
{ctrl.generation_metadata.similarity_status && (
|
||||
<p className="text-red-600">Similarity: {String(ctrl.generation_metadata.similarity_status)}</p>
|
||||
)}
|
||||
|
||||
@@ -30,7 +30,7 @@ export interface CanonicalControl {
|
||||
}
|
||||
requirements: string[]
|
||||
test_procedure: string[]
|
||||
evidence: EvidenceItem[]
|
||||
evidence: (EvidenceItem | string)[]
|
||||
severity: string
|
||||
risk_score: number | null
|
||||
implementation_effort: string | null
|
||||
@@ -47,6 +47,10 @@ export interface CanonicalControl {
|
||||
target_audience: string | string[] | null
|
||||
generation_metadata?: Record<string, unknown> | null
|
||||
generation_strategy?: string | null
|
||||
parent_control_uuid?: string | null
|
||||
parent_control_id?: string | null
|
||||
parent_control_title?: string | null
|
||||
decomposition_method?: string | null
|
||||
created_at: string
|
||||
updated_at: string
|
||||
}
|
||||
@@ -275,7 +279,26 @@ export function GenerationStrategyBadge({ strategy }: { strategy: string | null
|
||||
if (strategy === 'document_grouped') {
|
||||
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-emerald-100 text-emerald-700">v2</span>
|
||||
}
|
||||
return null
|
||||
if (strategy === 'phase74_gap_fill') {
|
||||
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-blue-100 text-blue-700">v5 Gap</span>
|
||||
}
|
||||
if (strategy === 'pass0b_atomic') {
|
||||
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-violet-100 text-violet-700">Atomar</span>
|
||||
}
|
||||
return <span className="inline-flex items-center px-1.5 py-0.5 rounded text-xs font-medium bg-gray-100 text-gray-500">{strategy}</span>
|
||||
}
|
||||
|
||||
export const OBLIGATION_TYPE_CONFIG: Record<string, { bg: string; label: string }> = {
|
||||
pflicht: { bg: 'bg-red-100 text-red-700', label: 'Pflicht' },
|
||||
empfehlung: { bg: 'bg-amber-100 text-amber-700', label: 'Empfehlung' },
|
||||
kann: { bg: 'bg-green-100 text-green-700', label: 'Kann' },
|
||||
}
|
||||
|
||||
export function ObligationTypeBadge({ type }: { type: string | null | undefined }) {
|
||||
if (!type) return null
|
||||
const config = OBLIGATION_TYPE_CONFIG[type]
|
||||
if (!config) return null
|
||||
return <span className={`inline-flex items-center px-2 py-0.5 rounded text-xs font-medium ${config.bg}`}>{config.label}</span>
|
||||
}
|
||||
|
||||
export function getDomain(controlId: string): string {
|
||||
|
||||
@@ -9,7 +9,7 @@ import {
|
||||
import {
|
||||
CanonicalControl, Framework, BACKEND_URL, EMPTY_CONTROL,
|
||||
SeverityBadge, StateBadge, LicenseRuleBadge, VerificationMethodBadge, CategoryBadge, TargetAudienceBadge,
|
||||
GenerationStrategyBadge,
|
||||
GenerationStrategyBadge, ObligationTypeBadge,
|
||||
VERIFICATION_METHODS, CATEGORY_OPTIONS, TARGET_AUDIENCE_OPTIONS,
|
||||
} from './components/helpers'
|
||||
import { ControlForm } from './components/ControlForm'
|
||||
@@ -762,6 +762,7 @@ export default function ControlLibraryPage() {
|
||||
<CategoryBadge category={ctrl.category} />
|
||||
<TargetAudienceBadge audience={ctrl.target_audience} />
|
||||
<GenerationStrategyBadge strategy={ctrl.generation_strategy} />
|
||||
<ObligationTypeBadge type={ctrl.generation_metadata?.obligation_type as string} />
|
||||
{ctrl.risk_score !== null && (
|
||||
<span className="text-xs text-gray-400">Score: {ctrl.risk_score}</span>
|
||||
)}
|
||||
|
||||
@@ -174,6 +174,9 @@ _CONTROL_COLS = """id, framework_id, control_id, title, objective, rationale,
|
||||
customer_visible, verification_method, category,
|
||||
target_audience, generation_metadata, generation_strategy,
|
||||
applicable_industries, applicable_company_size, scope_conditions,
|
||||
parent_control_uuid, decomposition_method, pipeline_version,
|
||||
(SELECT p.control_id FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_id,
|
||||
(SELECT p.title FROM canonical_controls p WHERE p.id = canonical_controls.parent_control_uuid) AS parent_control_title,
|
||||
created_at, updated_at"""
|
||||
|
||||
|
||||
@@ -798,6 +801,11 @@ def _control_row(r) -> dict:
|
||||
"applicable_industries": getattr(r, "applicable_industries", None),
|
||||
"applicable_company_size": getattr(r, "applicable_company_size", None),
|
||||
"scope_conditions": getattr(r, "scope_conditions", None),
|
||||
"parent_control_uuid": str(r.parent_control_uuid) if getattr(r, "parent_control_uuid", None) else None,
|
||||
"parent_control_id": getattr(r, "parent_control_id", None),
|
||||
"parent_control_title": getattr(r, "parent_control_title", None),
|
||||
"decomposition_method": getattr(r, "decomposition_method", None),
|
||||
"pipeline_version": getattr(r, "pipeline_version", None),
|
||||
"created_at": r.created_at.isoformat() if r.created_at else None,
|
||||
"updated_at": r.updated_at.isoformat() if r.updated_at else None,
|
||||
}
|
||||
|
||||
@@ -200,6 +200,9 @@ def _get_tenant_id(tenant_id: Optional[str]) -> str:
|
||||
def _dsfa_to_response(row) -> dict:
|
||||
"""Convert a DB row to a JSON-serializable dict."""
|
||||
import json
|
||||
# SQLAlchemy 2.0: Row objects need ._mapping for string-key access
|
||||
if hasattr(row, "_mapping"):
|
||||
row = row._mapping
|
||||
|
||||
def _parse_arr(val):
|
||||
"""Parse a JSONB array field → list."""
|
||||
@@ -558,8 +561,9 @@ async def create_dsfa(
|
||||
).fetchone()
|
||||
|
||||
db.flush()
|
||||
row_id = row._mapping["id"] if hasattr(row, "_mapping") else row[0]
|
||||
_log_audit(
|
||||
db, tid, row["id"], "CREATE", request.created_by,
|
||||
db, tid, row_id, "CREATE", request.created_by,
|
||||
new_values={"title": request.title, "status": request.status},
|
||||
)
|
||||
db.commit()
|
||||
|
||||
73
backend-compliance/migrations/074_control_dedup.sql
Normal file
73
backend-compliance/migrations/074_control_dedup.sql
Normal file
@@ -0,0 +1,73 @@
|
||||
-- Migration 074: Control Dedup Engine — DB Schema
|
||||
-- Supports the 4-stage dedup pipeline for atomic controls (Pass 0b).
|
||||
--
|
||||
-- Tables:
|
||||
-- 1. control_parent_links — M:N parent linking (one control → many regulations)
|
||||
-- 2. control_dedup_reviews — Review queue for borderline matches (0.85-0.92)
|
||||
|
||||
BEGIN;
|
||||
|
||||
-- =============================================================================
|
||||
-- 1. Control Parent Links (M:N)
|
||||
-- Enables "1 Control erfuellt 5 Gesetze" — the biggest USP.
|
||||
-- An atomic control can have multiple parent controls from different
|
||||
-- regulations/obligations. This replaces the 1:1 parent_control_uuid FK.
|
||||
-- =============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS control_parent_links (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
|
||||
parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id) ON DELETE CASCADE,
|
||||
link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition'
|
||||
CHECK (link_type IN ('decomposition', 'dedup_merge', 'manual', 'crosswalk')),
|
||||
confidence NUMERIC(3,2) DEFAULT 1.0
|
||||
CHECK (confidence >= 0 AND confidence <= 1),
|
||||
source_regulation VARCHAR(100),
|
||||
source_article VARCHAR(100),
|
||||
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
|
||||
created_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_cpl_control ON control_parent_links(control_uuid);
|
||||
CREATE INDEX IF NOT EXISTS idx_cpl_parent ON control_parent_links(parent_control_uuid);
|
||||
CREATE INDEX IF NOT EXISTS idx_cpl_type ON control_parent_links(link_type);
|
||||
|
||||
COMMENT ON TABLE control_parent_links IS
|
||||
'M:N parent links — one atomic control can fulfill multiple regulations/obligations. USP: "1 Control erfuellt 5 Gesetze"';
|
||||
|
||||
-- =============================================================================
|
||||
-- 2. Control Dedup Reviews
|
||||
-- Queue for borderline matches (similarity 0.85-0.92) that need human review.
|
||||
-- Reviewed entries get status updated to accepted/rejected.
|
||||
-- =============================================================================
|
||||
|
||||
CREATE TABLE IF NOT EXISTS control_dedup_reviews (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
candidate_control_id VARCHAR(30) NOT NULL,
|
||||
candidate_title TEXT NOT NULL,
|
||||
candidate_objective TEXT,
|
||||
matched_control_uuid UUID REFERENCES canonical_controls(id),
|
||||
matched_control_id VARCHAR(30),
|
||||
similarity_score NUMERIC(4,3) DEFAULT 0.0,
|
||||
dedup_stage VARCHAR(40) NOT NULL,
|
||||
dedup_details JSONB DEFAULT '{}',
|
||||
parent_control_uuid UUID REFERENCES canonical_controls(id),
|
||||
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
|
||||
review_status VARCHAR(20) DEFAULT 'pending'
|
||||
CHECK (review_status IN ('pending', 'accepted_link', 'accepted_new', 'rejected')),
|
||||
reviewed_by VARCHAR(100),
|
||||
reviewed_at TIMESTAMPTZ,
|
||||
review_notes TEXT,
|
||||
created_at TIMESTAMPTZ DEFAULT NOW()
|
||||
);
|
||||
|
||||
CREATE INDEX IF NOT EXISTS idx_cdr_status ON control_dedup_reviews(review_status);
|
||||
CREATE INDEX IF NOT EXISTS idx_cdr_matched ON control_dedup_reviews(matched_control_uuid);
|
||||
CREATE INDEX IF NOT EXISTS idx_cdr_parent ON control_dedup_reviews(parent_control_uuid);
|
||||
CREATE INDEX IF NOT EXISTS idx_cdr_stage ON control_dedup_reviews(dedup_stage);
|
||||
|
||||
COMMENT ON TABLE control_dedup_reviews IS
|
||||
'Review queue for borderline dedup matches (similarity 0.85-0.92). Human decides: link or new control.';
|
||||
|
||||
COMMIT;
|
||||
@@ -195,6 +195,11 @@ class TestControlRowConversion:
|
||||
"release_state": "draft",
|
||||
"tags": ["mfa"],
|
||||
"generation_strategy": "ungrouped",
|
||||
"parent_control_uuid": None,
|
||||
"parent_control_id": None,
|
||||
"parent_control_title": None,
|
||||
"decomposition_method": None,
|
||||
"pipeline_version": None,
|
||||
"created_at": now,
|
||||
"updated_at": now,
|
||||
}
|
||||
|
||||
@@ -2,7 +2,23 @@
|
||||
|
||||
## Übersicht
|
||||
|
||||
Die Control Quality Pipeline prüft und verbessert die ~9.000 Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert.
|
||||
Die Control Quality Pipeline prüft und verbessert die Canonical Controls der Compliance-Bibliothek. Sie nutzt **PDF-basierte Verifizierung** als Ground Truth — jeder Control-Originaltext wird direkt im Quelldokument (PDF) lokalisiert.
|
||||
|
||||
Alle Scripts liegen in **`scripts/qa/`**. Starten auf dem Mac Mini via Runner-Script:
|
||||
|
||||
```bash
|
||||
# Job starten (laedt .env automatisch, PID-Lock, unbuffered output)
|
||||
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh <script.py> [args...]"
|
||||
|
||||
# Status aller Jobs
|
||||
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --status"
|
||||
|
||||
# Log ansehen
|
||||
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --log <script.py>"
|
||||
|
||||
# Job stoppen
|
||||
ssh macmini "bash ~/Projekte/breakpilot-compliance/scripts/qa/run_job.sh --kill <script.py>"
|
||||
```
|
||||
|
||||
## Architektur
|
||||
|
||||
@@ -55,20 +71,24 @@ Jeder Control hat ein Feld `source_original_text` — der Chunk-Text aus dem Que
|
||||
|
||||
| Metrik | Wert |
|
||||
|---|---|
|
||||
| Controls mit source_original_text | 7.943 |
|
||||
| Im PDF lokalisiert | **6.259 (79%)** |
|
||||
| Nicht gefunden (Sprachmismatch) | 1.651 |
|
||||
| Kein PDF vorhanden | 33 |
|
||||
| 100% Match-Rate | 19 Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53) |
|
||||
| Controls mit source_original_text | 5.751 (86%) |
|
||||
| Im PDF lokalisiert | **5.063 (88%)** |
|
||||
| Nicht gefunden | 649 |
|
||||
| Kein PDF vorhanden | 29 |
|
||||
| Recital_suspect markiert | 648 |
|
||||
| 100% Match-Rate | 20+ Regulations (inkl. DSGVO, KI-VO, NIS2, NIST 800-53, Blue Guide) |
|
||||
|
||||
**Verlauf:** v1 (4.110, 52%) → v2 (6.091, 77%) → v3 (6.259, 79%) → v4 +Blue Guide EN (6.803, 86%) → v5 nach Cleanup (5.063/5.741, 88%)
|
||||
|
||||
### Nicht-matchende Controls
|
||||
|
||||
| Ursache | Controls | Erklärung |
|
||||
| Ursache | Controls | Status |
|
||||
|---|---|---|
|
||||
| Blue Guide EN vs. DE PDF | ~562 | Controls aus englischem PDF, wir haben nur deutsches |
|
||||
| OWASP multilingual | ~632 | Controls aus PT/AR/ID/ES-Übersetzungen |
|
||||
| ~~Blue Guide EN vs. DE PDF~~ | ~~562~~ | ✅ Gelöst — EN-PDF beschafft, 544/544 gematcht |
|
||||
| ~~OWASP Top 10 multilingual~~ | ~~324~~ | ✅ Als duplicate markiert — Übersetzungen ohne Mehrwert |
|
||||
| CRA Encoding | ~76 | PDF-Ligaturen/Sonderzeichen-Differenzen |
|
||||
| CISA Secure by Design | ~113 | Falsches PDF (ENISA statt CISA) |
|
||||
| OWASP ASVS | ~173 | PDF-Matching-Problem (meist EN) |
|
||||
|
||||
## Brute-Force-Suche
|
||||
|
||||
@@ -100,34 +120,276 @@ Controls aus Erwägungsgründen (`article_type = preamble`) sind **kein Nachteil
|
||||
|
||||
Die 1.195 v1-Controls **ohne** Originaltext sind manuell erstellt (`strategy=ungrouped`) und haben keine Chunk-Referenz.
|
||||
|
||||
## DB-Status (Stand 2026-03-20)
|
||||
## OWASP Cleanup (2026-03-20)
|
||||
|
||||
- **324 OWASP Top 10 multilingual Controls** → `duplicate` markiert (ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien)
|
||||
- **47 Controls** mit falscher Quellenzuordnung korrigiert (z.B. als "OWASP Top 10" getaggt, aber tatsächlich aus ASVS/SAMM/API/MASVS)
|
||||
- **~200 OWASP ASVS/SAMM/MASVS EN Controls** behalten — unique Content aus GitHub/Website, nicht im PDF auffindbar
|
||||
|
||||
## NIST OSCAL Import (2026-03-20)
|
||||
|
||||
**776 neue Controls** aus NIST SP 800-53 Rev 5 OSCAL (Public Domain, maschinenlesbar):
|
||||
|
||||
- Quelle: `usnistgov/oscal-content` (JSON Catalog)
|
||||
- Vor allem **Control Enhancements** (z.B. AC-2(3), SC-7(8)) — die atomaren Unteranforderungen
|
||||
- Jeder Control enthält: Statement + Guidance + Assessment-Methoden + Cross-References + Parameters
|
||||
- `pipeline_version = 4`, `generation_strategy = 'oscal_import'`
|
||||
- Kein Pass 0a/0b nötig — Controls sind **bereits atomar**
|
||||
|
||||
| Metrik | Vorher | Nachher |
|
||||
|---|---|---|
|
||||
| SP 800-53 Controls (aktiv) | 1.107 | **1.883** |
|
||||
| OSCAL-Abdeckung | 238/1.014 (23%) | **1.014/1.014 (100%)** |
|
||||
|
||||
## Phase 5: RAG-Deduplizierung + Normalisierung (2026-03-20)
|
||||
|
||||
### Durchgeführte Schritte
|
||||
|
||||
| Schritt | Beschreibung | Controls |
|
||||
|---|---|---|
|
||||
| 5.1 | OSCAL Controls: `source_regulation` in generation_metadata gesetzt | 776 |
|
||||
| 5.2 | v3 Controls ohne Source → `needs_review` mit `missing_source` Flag | 20 |
|
||||
| 5.3 | Leerer Source-Name korrigiert (AT TKG) | 1 |
|
||||
| 5.4 | OWASP regulation_code Fehlzuordnungen korrigiert | 47 |
|
||||
| 5.5 | **duplicate/too_close Controls hart gelöscht** | **3.301** |
|
||||
| 5.6 | Processed Chunks bereinigt (gelöschte Control-IDs entfernt) | 2.520 |
|
||||
|
||||
### Ergebnis
|
||||
|
||||
- **Vorher:** 9.936 Controls (6.635 aktiv, 2.998 duplicate, 303 too_close)
|
||||
- **Nachher:** 6.635 Controls, **alle aktiv** (0 duplicate/too_close)
|
||||
- Alle regulation_codes haben jetzt einheitliche Source-Namen
|
||||
- OWASP-Controls sind korrekt ihren Quellen zugeordnet
|
||||
|
||||
## DB-Status (Stand 2026-03-20, nach Phase 7.4)
|
||||
|
||||
| release_state | Count |
|
||||
|---|---|
|
||||
| draft | 5.365 |
|
||||
| needs_review | 818 |
|
||||
| duplicate | 2.674 |
|
||||
| too_close | 303 |
|
||||
| **Aktiv** | **6.183** |
|
||||
| draft | ~6.030 |
|
||||
| needs_review | 838 |
|
||||
| **Gesamt** | **6.868** |
|
||||
|
||||
## Scripts
|
||||
## Scripts (`scripts/qa/`)
|
||||
|
||||
Alle QA-Scripts liegen in `scripts/qa/`:
|
||||
### Kern-QA (PDF-Matching)
|
||||
|
||||
| Script | Beschreibung |
|
||||
|---|---|
|
||||
| `pdf_qa_all.py` | Haupt-QA: Controls gegen PDFs matchen |
|
||||
| `pdf_qa_inventory.py` | Inventar: Regulations, Controls, PDFs |
|
||||
| `apply_pdf_qa_results.py` | Ergebnisse in DB schreiben |
|
||||
| `preamble_dedup.py` | Preamble vs. Artikel Duplikat-Erkennung |
|
||||
| `qa_dedup_controls.py` | Jaccard-basierte Titel-Dedup |
|
||||
| `qa_normalize_sources.py` | Source-Namen normalisieren |
|
||||
| `db_status.py` | DB-Status-Übersicht |
|
||||
| `pdf_qa_all.py` | **Haupt-QA**: Controls gegen PDFs matchen, Artikel-Index aufbauen. Enthaelt `SOURCE_FILE_MAP`, alle Index-Builder (EU, DE, NIST, OWASP, generic). 526 Zeilen. |
|
||||
| `pdf_qa_inventory.py` | Inventar: Welche Regulations haben Controls, wie viele, welche PDFs existieren |
|
||||
| `apply_pdf_qa_results.py` | Ergebnisse aus `pdf_qa_all.py` in DB schreiben (`article_type`, `recital_suspect`) |
|
||||
| `pdf_article_lookup_poc.py` | POC: Control-Text in PDF lokalisieren, Headings von Cross-Refs unterscheiden |
|
||||
|
||||
## Nächste Schritte
|
||||
### Lueckenanalyse + Control-Generierung
|
||||
|
||||
1. **Blue Guide EN-PDF** beschaffen → +562 Controls matchen
|
||||
2. **CISA Secure by Design** echtes PDF finden → +113 Controls
|
||||
3. **Brute-Force Ergebnisse anwenden** — 44 falsche Source-Zuordnungen korrigieren
|
||||
4. **Frontend-Anzeige** — `article_type` im Control-Detail anzeigen
|
||||
5. **Continuous QA** — Bei neuen Controls automatisch PDF-Match prüfen
|
||||
| Script | Beschreibung |
|
||||
|---|---|
|
||||
| `gap_analysis.py` | **Phase 7.3**: Artikel im PDF vs. Controls in DB vergleichen, Luecken identifizieren |
|
||||
| `phase74_generate_gap_controls.py` | **Phase 7.4**: Neue Controls fuer Luecken via Anthropic API generieren. `pipeline_version=5`. 624 Zeilen. |
|
||||
| `benchmark_llm_controls.py` | LLM-Vergleich: gpt-oss-120b vs. Claude Sonnet fuer Control-Generierung |
|
||||
| `test_pass0a.py` | **Pass 0a Test**: Obligation Extraction + 3-Tier-Klassifizierung (Pflicht/Empfehlung/Kann). Standalone, speichert JSON. |
|
||||
|
||||
### Deduplizierung + Normalisierung
|
||||
|
||||
| Script | Beschreibung |
|
||||
|---|---|
|
||||
| `preamble_dedup.py` | Preamble vs. Artikel Duplikat-Erkennung (Jaccard >= 0.40) |
|
||||
| `qa_dedup_controls.py` | Jaccard-basierte Titel-Deduplizierung |
|
||||
| `qa_apply_and_dedup.py` | Ergebnisse anwenden + Duplikate in einem Schritt markieren |
|
||||
| `qa_normalize_sources.py` | Source-Namen normalisieren (kanonische Namen) |
|
||||
| `phase5_normalize_and_cleanup.py` | **Phase 5**: Normalisierung + 3.301 Duplikate hart loeschen |
|
||||
| `qa_delete_gpsr_dupe.py` | GPSR-Duplikate loeschen |
|
||||
| `delete_gpsr_prod.py` | GPSR-Duplikate aus Production-Qdrant entfernen |
|
||||
|
||||
### Quellen-spezifische Scripts
|
||||
|
||||
| Script | Beschreibung |
|
||||
|---|---|
|
||||
| `blue_guide_en_match.py` | Blue Guide EN-PDF matchen (544/544 Erfolg) |
|
||||
| `owasp_cleanup.py` | OWASP multilingual Cleanup (324 Duplikate) + Source-Fix (47 korrigiert) |
|
||||
| `owasp_github_match.py` | OWASP ASVS/SAMM/MASVS gegen GitHub-Markdown matchen |
|
||||
| `oscal_import.py` | NIST OSCAL Import (776 Controls aus JSON Catalog) |
|
||||
| `oscal_analysis.py` | NIST OSCAL Analyse: Abdeckung, fehlende Controls |
|
||||
|
||||
### Diagnose + Utilities
|
||||
|
||||
| Script | Beschreibung |
|
||||
|---|---|
|
||||
| `db_status.py` | DB-Status: release_state Counts, pipeline_version, source Verteilung |
|
||||
| `debug_low_match.py` | Debugging: Warum matchen Blue Guide / OWASP / CISA schlecht? |
|
||||
| `qa_article_map_all_chunks.py` | Alle Chunks Artikel-Nummern zuordnen (Bulk) |
|
||||
| `backfill_job_66228863.py` | Einmaliger Backfill-Job |
|
||||
| `sync_controls_to_prod.py` | Controls von Dev nach Production synchronisieren |
|
||||
|
||||
### Runner
|
||||
|
||||
| Script | Beschreibung |
|
||||
|---|---|
|
||||
| `run_job.sh` | **Job-Runner**: Laedt `.env`, PID-Lock, Monitoring (`--status`, `--log`, `--kill`) |
|
||||
|
||||
## Phase 7: PDF-Validierung + Enrichment (2026-03-20)
|
||||
|
||||
### 7.1 + 7.2: Controls gegen PDFs validiert + Ergebnisse angewendet ✅
|
||||
|
||||
- 5.063 Controls erfolgreich im Original-PDF lokalisiert (88%)
|
||||
- `article_type` fuer alle gematchten Controls gesetzt
|
||||
- 648 Preamble-Controls als `recital_suspect` in `generation_metadata` markiert
|
||||
- 332 Controls nicht matchbar (OWASP ASVS 132, CISA 72, ENISA 38, OWASP SAMM 31, CRA 28)
|
||||
|
||||
### 7.3: Lueckenanalyse ✅
|
||||
|
||||
**494 Artikel-Luecken** in 15 Quellen identifiziert. Geschaetzt ~300 davon actionable.
|
||||
|
||||
| Source | Luecken | Coverage | Bemerkung |
|
||||
|---|---:|---:|---|
|
||||
| AML-Verordnung | 91 | 5% | Kaum ingestiert |
|
||||
| MiCA | 71 | 52% | Grosse Verordnung |
|
||||
| NIST SP 800-53 | 59 | 83% | Meist Section-Header, nur SA-15 fehlt |
|
||||
| OWASP ASVS 4.0 | 47 | 35% | Requirement-Gruppen fehlen |
|
||||
| Batterieverordnung | 41 | 58% | |
|
||||
| DSGVO | 35 | 65% | Einige Governance/Aufsicht-Artikel |
|
||||
| ENISA ICS/SCADA | 34 | 31% | |
|
||||
| ENISA Supply Chain | 26 | 7% | |
|
||||
| CRA | 23 | 68% | |
|
||||
| NIS2 | 16 | 65% | |
|
||||
| KI-Verordnung | 15 | 87% | Fast komplett |
|
||||
| Maschinenverordnung | 5 | 91% | Fast komplett |
|
||||
|
||||
### 7.4: Neue Controls fuer Luecken generieren ✅ (2026-03-20)
|
||||
|
||||
Script: `phase74_generate_gap_controls.py --resume`
|
||||
|
||||
- **494 Artikel-Luecken** in 15 Quellen → Anthropic Claude Sonnet 4.6
|
||||
- `pipeline_version = 5`, `generation_strategy = 'phase74_gap_fill'`
|
||||
- Direkt PDF-Text als Input (nicht RAG-Chunks)
|
||||
- Starten via: `run_job.sh phase74_generate_gap_controls.py --resume`
|
||||
|
||||
**Ergebnis:**
|
||||
|
||||
| Source | Luecken | Generiert |
|
||||
|---|---:|---:|
|
||||
| AML-Verordnung | 91 | 97 |
|
||||
| MiCA | 71 | 68 |
|
||||
| NIST SP 800-53 | 59 | 19 |
|
||||
| KI-Verordnung | 15 | 15 |
|
||||
| OWASP ASVS 4.0 | 47 | 11 |
|
||||
| Batterieverordnung | 41 | 9 |
|
||||
| DSGVO | 35 | 4 |
|
||||
| OWASP Top 10 | 12 | 3 |
|
||||
| NIS2 | 16 | 3 |
|
||||
| CRA | 23 | 3 |
|
||||
| OECD KI-Empfehlung | 4 | 1 |
|
||||
| **Gesamt** | **494** | **233** |
|
||||
|
||||
Nicht generiert: 75 zu kurzer Text, 29 NIST-Intros, 11 Parse-Errors, 162 ID-Konflikte (COMP-1000 etc.).
|
||||
API-Kosten: ~$7,55 (109 min Laufzeit).
|
||||
|
||||
## Pass 0a: Obligation Extraction — 3-Tier-Klassifizierung
|
||||
|
||||
### Konzept
|
||||
|
||||
Pass 0a zerlegt Rich Controls (~6.000) in **atomare Obligations** per LLM (Claude Sonnet 4.6).
|
||||
Jede Obligation wird durch den **Quality Gate** klassifiziert — nicht gefiltert:
|
||||
|
||||
| obligation_type | Signal | Beispiel |
|
||||
|---|---|---|
|
||||
| **pflicht** | müssen, muss, ist zu, hat zu, shall, must, required | "Der Betreiber muss alle Daten verschluesseln" |
|
||||
| **empfehlung** | soll, sollen, should, sicherstellen, gewaehrleisten, dokumentieren | "Der Betreiber soll regelmaessige Audits durchfuehren" |
|
||||
| **kann** | kann, koennen, darf, duerfen, may, optional | "Der Betreiber kann zusaetzliche Massnahmen ergreifen" |
|
||||
|
||||
**Wichtig:** Nichts wird mehr rejected wegen fehlendem normativem Signal. Obligations ohne Signal werden als `empfehlung` klassifiziert. Rejected werden nur noch: Evidence-Only, zu kurz (<20 Zeichen), fehlender Parent-Link.
|
||||
|
||||
### Warum auch Empfehlungen behalten?
|
||||
|
||||
Empfehlungen helfen Firmen, ihre Systeme sicherer zu machen — ueber das Pflichtprogramm hinaus. Im Frontend erhalten Kunden einen Marker, der klar anzeigt:
|
||||
|
||||
- **Pflicht** = gesetzlich/regulatorisch vorgeschrieben
|
||||
- **Empfehlung** = Best Practice, freiwillig, aber wertvoll
|
||||
- **Kann** = optional, weitergehende Massnahme
|
||||
|
||||
### Quality Gate — Kritische Flags
|
||||
|
||||
| Flag | Kritisch? | Beschreibung |
|
||||
|---|---|---|
|
||||
| `has_normative_signal` | Nein | Informativer Check, kein Ablehnungsgrund |
|
||||
| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) |
|
||||
| `not_evidence_only` | **Ja** | Kein reiner Nachweis-Eintrag |
|
||||
| `min_length` | **Ja** | Mindestens 20 Zeichen |
|
||||
| `has_parent_link` | **Ja** | Verbindung zum Parent-Control |
|
||||
| `single_action` | Nein | Nur ein Hauptverb (heuristisch) |
|
||||
| `not_rationale` | Nein | Keine reine Begruendung |
|
||||
|
||||
### Normative Signal Detection — Regex-Tiers
|
||||
|
||||
```
|
||||
Tier 1 (Pflicht): muessen, muss, ist/sind/hat/haben zu + Infinitiv,
|
||||
Compound-Verben (festzustellen, vorzunehmen),
|
||||
Gerundivum (mitzuteilen, bereitzustellen),
|
||||
shall, must, required
|
||||
|
||||
Tier 2 (Empfehlung): soll, sollen, sollte, sollten,
|
||||
gewaehrleisten, sicherstellen,
|
||||
should, ensure, recommend,
|
||||
dokumentieren, implementieren, ueberpruefen
|
||||
|
||||
Tier 3 (Kann): kann, koennen, darf, duerfen, may, optional
|
||||
```
|
||||
|
||||
### Testergebnisse (3 Iterationen, 2026-03-20)
|
||||
|
||||
| Run | Controls | Obligations | Validated | Rejected | Kosten |
|
||||
|---|---:|---:|---:|---:|---:|
|
||||
| 1 (v0 Regex) | 10 | ~100 | 68% | 32% | $0,28 |
|
||||
| 2 (v1 Regex) | 50 | ~530 | 78% | 22% | $1,43 |
|
||||
| 3 (v2 Regex) | 50 | ~530 | 86% | 14% | $1,44 |
|
||||
| 4 (3-Tier) | 60 | — | — | — | — |
|
||||
|
||||
Run 4 laeuft mit dem neuen Klassifizierer — statt PASS/REJECT wird jetzt PFLICHT/EMPFEHLUNG/KANN ausgegeben.
|
||||
|
||||
### Scripts
|
||||
|
||||
| Script | Beschreibung |
|
||||
|---|---|
|
||||
| `test_pass0a.py` | **Test-Script**: Standalone (kein SQLAlchemy), psycopg2 + Anthropic API. Speichert Ergebnisse als JSON. |
|
||||
|
||||
```bash
|
||||
# Test mit 10 Controls
|
||||
run_job.sh test_pass0a.py --limit 10
|
||||
|
||||
# Test mit bestimmter Quelle
|
||||
run_job.sh test_pass0a.py --limit 20 --source "DSGVO"
|
||||
|
||||
# Ergebnisse: /tmp/pass0a_results_<N>controls.json
|
||||
```
|
||||
|
||||
### Backend-Code
|
||||
|
||||
- **Klassifizierung:** `backend-compliance/compliance/services/decomposition_pass.py`
|
||||
- `classify_obligation_type()` — 3-Tier-Klassifizierung
|
||||
- `quality_gate()` — gibt `obligation_type` in Flags zurueck
|
||||
- `passes_quality_gate()` — `has_normative_signal` nicht mehr kritisch
|
||||
- `ObligationCandidate.obligation_type` — neues Feld
|
||||
|
||||
### Hochrechnung (basierend auf 50-Control-Runs)
|
||||
|
||||
| Metrik | Wert |
|
||||
|---|---|
|
||||
| Kosten pro Control | ~$0,029 |
|
||||
| Kosten fuer ~6.000 Controls | **~$172** |
|
||||
| Laufzeit (geschaetzt) | ~25h |
|
||||
| Obligations pro Control | ~10,5 |
|
||||
|
||||
---
|
||||
|
||||
## Naechste Schritte
|
||||
|
||||
1. ~~**Phase 5 Cleanup** → 3.301 Duplikate geloescht, Source normalisiert~~ ✅
|
||||
2. ~~**Phase 6 Pipeline-Haertung** → Source aus REGULATION_LICENSE_MAP~~ ✅
|
||||
3. ~~**Phase 7.1-7.3** → PDF-Validierung + Enrichment + Lueckenanalyse~~ ✅
|
||||
4. ~~**Phase 7.4** → 233 neue Controls fuer Luecken generiert ($7,55)~~ ✅
|
||||
5. **Pass 0a** → Obligation Extraction mit 3-Tier-Klassifizierung (Tests laufen, ~$172)
|
||||
6. **Pass 0b** → Atomic Control Composition aus validierten Obligations
|
||||
7. **Pass 1-5** → Multi-Layer Migration (Code + 500 Tests bereits vorhanden)
|
||||
8. **Phase 8** → Qdrant Re-Ingestion (Runtime-Betrieb, ZULETZT)
|
||||
9. **needs_review Triage** — 838 Controls klassifizieren
|
||||
10. **Frontend** — `obligation_type` (Pflicht/Empfehlung/Kann) + `article_type` anzeigen
|
||||
|
||||
206
docs-src/development/rag-pipeline-benchmark.md
Normal file
206
docs-src/development/rag-pipeline-benchmark.md
Normal file
@@ -0,0 +1,206 @@
|
||||
# RAG Pipeline Benchmark & Optimierungen
|
||||
|
||||
Stand: 2026-03-21. Vergleich unserer Implementierung mit State of the Art. Priorisierte Empfehlungen nach Impact/Effort.
|
||||
|
||||
---
|
||||
|
||||
## Aktuelle Pipeline (Ist-Zustand)
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
A[Dokumente] -->|Document Crawler| B[Chunks 512/50]
|
||||
B -->|bge-m3| C[Qdrant Dense]
|
||||
C -->|Cosine Search| D[Control Generator v2]
|
||||
D -->|LLM| E[Rich Controls 6.373]
|
||||
E -->|Pass 0a| F[Obligations]
|
||||
F -->|Pass 0b| G[Atomare Controls]
|
||||
G -->|4-Stage Dedup| H[Master Controls ~18K]
|
||||
```
|
||||
|
||||
| Komponente | Implementierung | SOTA-Bewertung |
|
||||
|-----------|----------------|----------------|
|
||||
| **Chunking** | Rekursiv, 512 Zeichen, 50 Overlap | Zu klein fuer Rechtstexte |
|
||||
| **Embedding** | bge-m3 (1024-dim, Ollama) | Gut, aber nur Dense genutzt |
|
||||
| **Vector DB** | Qdrant mit Payload-Filtering | Hybrid Search nicht aktiviert |
|
||||
| **Retrieval** | Pure Dense Cosine Similarity | Kein Re-Ranking, kein BM25 |
|
||||
| **Extraktion** | 3-Tier (Exact → Embedding → LLM) | Solide Architektur |
|
||||
| **Dedup** | 4-Stage (Pattern → Action → Object → Embedding) | Ueberdurchschnittlich |
|
||||
| **QA** | 5-Metrik Similarity + PDF-QA Matching | Gut, RAGAS fehlt |
|
||||
|
||||
---
|
||||
|
||||
## Tier 1: Quick Wins (Tage, nicht Wochen)
|
||||
|
||||
### 1. Chunk-Groesse erhoehen: 512 → 1024, Overlap 50 → 128
|
||||
|
||||
**Problem:** NAACL 2025 Vectara-Studie zeigt: fuer analytische/juristische Queries sind 512-1024 Token optimal. Unsere 512-Zeichen-Chunks (= ~128 Token) sind deutlich zu klein.
|
||||
|
||||
**Unsere Lessons Learned:** "Chunks werden mitten im Absatz abgeschnitten. Artikel- und Paragraphennummern fehlen."
|
||||
|
||||
**Aenderung:** Config-Parameter in `ingest-phase-h.sh` anpassen.
|
||||
|
||||
| Metrik | Vorher | Nachher |
|
||||
|--------|--------|---------|
|
||||
| Chunk Size | 512 chars (~128 Token) | 1024 chars (~256 Token) |
|
||||
| Overlap | 50 chars (10%) | 128 chars (12.5%) |
|
||||
|
||||
**Impact:** HOCH | **Effort:** NIEDRIG
|
||||
|
||||
### 2. Ollama JSON-Mode fuer Obligation Extraction
|
||||
|
||||
**Problem:** `_parse_json` in `decomposition_pass.py` hat Regex-Fallback — das zeigt, dass LLM-Output nicht zuverlaessig JSON ist.
|
||||
|
||||
**Aenderung:** `format: "json"` in Ollama-API-Calls setzen.
|
||||
|
||||
**Impact:** MITTEL | **Effort:** NIEDRIG (1 Parameter)
|
||||
|
||||
### 3. Chain-of-Thought Prompting fuer Pass 0a/0b
|
||||
|
||||
**Problem:** LegalGPT-Framework zeigt: explizite Reasoning-Chains ("Erst Addressat identifizieren, dann Aktion, dann normative Staerke") verbessern Extraktionsqualitaet signifikant.
|
||||
|
||||
**Impact:** MITTEL | **Effort:** NIEDRIG (Prompt Engineering)
|
||||
|
||||
---
|
||||
|
||||
## Tier 2: High Impact, Medium Effort (1-2 Wochen)
|
||||
|
||||
### 4. Hybrid Search (Dense + Sparse) via Qdrant
|
||||
|
||||
**Problem:** Reine Dense-Suche. Juristische Queries enthalten spezifische Begriffe ("DSGVO Art. 35", "Abs. 3"), die BM25/Sparse besser findet.
|
||||
|
||||
**Loesungsansatz:** BGE-M3 generiert bereits Sparse Vectors — wir verwerfen sie aktuell!
|
||||
|
||||
```
|
||||
Qdrant Query API:
|
||||
- Dense: bge-m3 Cosine (wie bisher)
|
||||
- Sparse: bge-m3 Sparse Vectors (neu)
|
||||
- Fusion: Reciprocal Rank Fusion (RRF)
|
||||
```
|
||||
|
||||
**Benchmarks (Anthropic):** 49% weniger fehlgeschlagene Retrievals mit Contextual Retrieval, 67% in Kombination mit zusaetzlichem Re-Ranking.
|
||||
|
||||
**Impact:** SEHR HOCH | **Effort:** MITTEL
|
||||
|
||||
### 5. Cross-Encoder Re-Ranking
|
||||
|
||||
**Problem:** Top-5 Ergebnisse direkt an LLM — keine Qualitaetspruefung der Retrieval-Ergebnisse.
|
||||
|
||||
**Loesungsansatz:** BGE Reranker v2 (MIT-Lizenz) auf Top-20 Ergebnisse, dann Top-5 an LLM.
|
||||
|
||||
| Re-Ranker | Lizenz | Empfehlung |
|
||||
|-----------|--------|------------|
|
||||
| BGE Reranker v2 | MIT | Empfohlen |
|
||||
| Jina Reranker v2 | Apache-2.0 | Alternative |
|
||||
| ColBERT v2 | MIT | Spaeter |
|
||||
|
||||
**Impact:** HOCH | **Effort:** MITTEL
|
||||
|
||||
### 6. Cross-Regulation Dedup Pass
|
||||
|
||||
**Problem:** Dedup filtert immer nach `pattern_id` — Controls aus DSGVO Art. 25 und NIS2 Art. 21 (beide Security-by-Design) werden nie verglichen.
|
||||
|
||||
**Loesungsansatz:** Zweiter Qdrant-Search ohne `pattern_id`-Filter nach dem normalen Dedup-Pass.
|
||||
|
||||
**Impact:** HOCH | **Effort:** MITTEL
|
||||
|
||||
### 7. Automatische Regressionstests (Golden Set)
|
||||
|
||||
**Problem:** Keine systematische Qualitaetsmessung nach Pipeline-Aenderungen.
|
||||
|
||||
**Loesungsansatz:** 20-Chunk Golden Set → Control-Generation → Output-Stabilitaet pruefen.
|
||||
|
||||
**Impact:** HOCH | **Effort:** NIEDRIG
|
||||
|
||||
---
|
||||
|
||||
## Tier 3: Strategische Investitionen (Wochen bis Monate)
|
||||
|
||||
### 8. Artikel-Boundary Chunking
|
||||
|
||||
Eigener Splitter fuer EU-Verordnungen und deutsche Gesetze: Split an "Art.", "Artikel", "Paragraph"-Grenzen statt nach Zeichenzahl.
|
||||
|
||||
### 9. RAGAS Evaluation Pipeline
|
||||
|
||||
[RAGAS](https://docs.ragas.io/) mit Golden Dataset (50-100 manuell verifizierte Control-to-Source Mappings). Metriken: Faithfulness, Answer Relevancy, Context Precision, Context Recall.
|
||||
|
||||
### 10. BGE-M3 Fine-Tuning
|
||||
|
||||
Fine-Tuning auf Compliance-Corpus (~6.373 Control-Titel/Objective-Paare). Research zeigt +10-30% Domain-Retrieval-Verbesserung.
|
||||
|
||||
### 11. LLM-as-Judge
|
||||
|
||||
Claude Sonnet bewertet jeden generierten Control auf Faithfulness zum Quelltext (~$0.01/Control).
|
||||
|
||||
### 12. Active Learning aus Review-Queue
|
||||
|
||||
Menschliche Entscheidungen der Dedup Review-Queue nutzen, um Schwellenwerte ueber die Zeit zu optimieren.
|
||||
|
||||
---
|
||||
|
||||
## Nicht empfohlen (niedriger ROI oder Konflikte)
|
||||
|
||||
| Ansatz | Grund |
|
||||
|--------|-------|
|
||||
| Jina v3 Embeddings | **CC-BY-NC-4.0** — verletzt Open Source Policy |
|
||||
| Voyage-law-2 | API-only, proprietaer — kein Self-Hosting |
|
||||
| Semantic Chunking | Benchmarks zeigen keinen Vorteil gegenueber Recursive fuer strukturierte Dokumente |
|
||||
| HyDE als Primaerstrategie | Latenz (+43-60%) + Halluzinationsrisiko |
|
||||
| Knowledge Graph RAG | Massiver Aufwand, unklarer Gewinn bei strukturiertem Rechtskorpus |
|
||||
|
||||
---
|
||||
|
||||
## Embedding-Modell Vergleich
|
||||
|
||||
| Modell | MTEB Score | Multilingual | Kontext | Lizenz | Bewertung |
|
||||
|--------|-----------|-------------|---------|--------|-----------|
|
||||
| **BGE-M3** (aktuell) | 63.0 | 100+ Sprachen | 8192 Token | MIT | Gut, Dense+Sparse+ColBERT |
|
||||
| Jina v3 | 65.5 | 89 Sprachen | 8192 Token | CC-BY-NC | Nicht nutzbar (Lizenz!) |
|
||||
| E5-Mistral-7B | ~65 | Gut | 4096 Token | MIT | Gross, hoher RAM |
|
||||
| Voyage-law-2 | Best Legal | EN Legal | 16K Token | Proprietaer | Nicht nutzbar (API-only) |
|
||||
|
||||
**Fazit:** BGE-M3 bleibt die beste Wahl fuer unseren Stack. Sparse-Vectors aktivieren und Fine-Tuning bringen mehr als ein Modellwechsel.
|
||||
|
||||
---
|
||||
|
||||
## Test-Coverage Analyse
|
||||
|
||||
### Pipeline-Module (567 Tests)
|
||||
|
||||
| Modul | Tests | Bewertung | Fehlende Tests |
|
||||
|-------|-------|-----------|----------------|
|
||||
| Control Generator | 110 | Exzellent | 10-15 Edge Cases |
|
||||
| Obligation Extractor | 107 | Exzellent | 8-10 Edge Cases |
|
||||
| Decomposition Pass | 90 | Exzellent | 5-8 Edge Cases |
|
||||
| Pattern Matcher | 72 | Gut | 10-15 Edge Cases |
|
||||
| Control Dedup | 56 | Exzellent | 5-8 Edge Cases |
|
||||
| Control Composer | 54 | Gut | 8-10 Edge Cases |
|
||||
| Pipeline Adapter | 36 | Gut | 10-15 Edge Cases |
|
||||
| Citation Backfill | 20 | Moderat | 5-8 Edge Cases |
|
||||
| License Gate | 12 | Minimal | 5-8 Edge Cases |
|
||||
| RAG Client | 10 | Minimal | 5-8 Edge Cases |
|
||||
|
||||
### Kritische Luecken (fehlende Tests)
|
||||
|
||||
| Service | Datei | Prioritaet |
|
||||
|---------|-------|------------|
|
||||
| AI Compliance Assistant | `ai_compliance_assistant.py` | HOCH (25-30 Tests noetig) |
|
||||
| PDF Extractor | `pdf_extractor.py` | HOCH (20-25 Tests noetig) |
|
||||
| LLM Provider | `llm_provider.py` | HOCH (15-20 Tests noetig) |
|
||||
| Similarity Detector | `similarity_detector.py` | MITTEL (20-25 Tests noetig) |
|
||||
| Anchor Finder | `anchor_finder.py` | MITTEL |
|
||||
|
||||
### Test-Infrastruktur
|
||||
|
||||
**Fehlend:** Shared `conftest.py` mit gemeinsamen Fixtures (LLM-Mock, DB-Mock, Embedding-Mock). Aktuell sind Fixtures in jedem Test-File dupliziert.
|
||||
|
||||
---
|
||||
|
||||
## Quellen
|
||||
|
||||
- [NAACL 2025 Vectara Chunking Study](https://blog.premai.io/rag-chunking-strategies-the-2026-benchmark-guide/)
|
||||
- [Anthropic Contextual Retrieval](https://www.anthropic.com/news/contextual-retrieval)
|
||||
- [Qdrant Hybrid Search Query API](https://qdrant.tech/articles/hybrid-search/)
|
||||
- [Structure-Aware Chunking for Legal (ACL 2025)](https://aclanthology.org/2025.justnlp-main.19/)
|
||||
- [RAGAS Evaluation Framework](https://docs.ragas.io/)
|
||||
- [BGE Reranker v2 (MIT)](https://huggingface.co/BAAI/bge-reranker-v2-m3)
|
||||
- [LegalGPT / CALLM Framework](https://www.emergentmind.com/topics/compliance-alignment-llm-callm)
|
||||
223
docs-src/development/rag-pipeline-lessons-learned.md
Normal file
@@ -0,0 +1,223 @@
|
||||
# RAG Pipeline: Lessons Learned & Hardening
|
||||
|
||||
## Übersicht
|
||||
|
||||
Dieses Dokument beschreibt die Erkenntnisse aus dem Aufbau der RAG-Pipeline und die daraus abgeleiteten Maßnahmen zur Härtung. Es dient als Referenz für zukünftige Ingestion-Runs und Pipeline-Erweiterungen.
|
||||
|
||||
## Architektur: Wann brauchen wir RAG vs. Direct PDF?
|
||||
|
||||
### RAG ist nötig für:
|
||||
|
||||
| Use Case | Warum RAG? |
|
||||
|---|---|
|
||||
| **Compliance Advisor (Chat)** | Semantische Suche über 38+ Dokumente in Echtzeit |
|
||||
| **Cross-Regulation Mapping** | "Zeige alle Anforderungen zu Verschlüsselung" über alle Quellen |
|
||||
| **Customer Scope-Filtering** | Nur Chunks aus relevanten Regulations für den Kunden |
|
||||
| **Inkrementelle Updates** | Neues Dokument → nur neue Chunks verarbeiten |
|
||||
|
||||
### RAG ist NICHT nötig für:
|
||||
|
||||
| Use Case | Besser: Direct PDF |
|
||||
|---|---|
|
||||
| **Control-Generierung (Batch)** | PDF → PyMuPDF → Strukturparser → Artikel-Index → API |
|
||||
| **PDF-QA/Verifizierung** | Substring-Match direkt im PDF (schneller, exakter) |
|
||||
| **Artikel/§-Extraktion** | Regex-basierte Extraktion aus PDF-Text |
|
||||
|
||||
### Hybrid-Ansatz (Empfehlung)
|
||||
|
||||
```
|
||||
Control-Generierung: PDF → Strukturparser → Artikel-Index → Anthropic API
|
||||
(KEIN RAG nötig, direkt aus PDF)
|
||||
|
||||
Runtime-Betrieb: Qdrant-RAG für semantische Suche, Chat, Scope-Analyse
|
||||
(RAG mit angereicherten Chunks + Struktur-Metadaten)
|
||||
```
|
||||
|
||||
## Fehler und Root Causes
|
||||
|
||||
### 1. Doppelte Ingestion = Doppelte Controls
|
||||
|
||||
**Problem:** Gleiche PDFs unter verschiedenen Namen ingestiert (z.B. "Maschinenverordnung" und "Verordnung (EU) 2023/1230") → unterschiedliche Chunks (anderes Chunking) → anderer Hash → doppelt verarbeitet → doppelte Controls.
|
||||
|
||||
**Root Cause:**
|
||||
- `regulation_name` aus Chunk-Metadaten statt aus kanonischer Quelle
|
||||
- UNIQUE-Constraint nur `(chunk_hash, collection, document_version)` — nicht global
|
||||
- Kein Check ob `regulation_code` bereits in einer Collection existiert
|
||||
|
||||
**Fix (implementiert):**
|
||||
- `REGULATION_LICENSE_MAP` enthält jetzt kanonische `name`-Werte die den DB-Einträgen entsprechen
|
||||
- `source_citation.source` wird aus `REGULATION_LICENSE_MAP.name` genommen, NICHT aus `chunk.regulation_name`
|
||||
- Phase 5 Cleanup: 3.301 Duplikate hart gelöscht
|
||||
|
||||
**Fix (noch offen):**
|
||||
- Chunk-Hash UNIQUE Constraint global machen: `(chunk_hash, document_version)` statt `(chunk_hash, collection, document_version)`
|
||||
- Vor Ingestion: Check ob `regulation_code` bereits in einer Collection existiert
|
||||
|
||||
### 2. Chunks verlieren Strukturinformation
|
||||
|
||||
**Problem:** Chunks werden mitten im Absatz abgeschnitten. § und Artikelnummern fehlen in den Chunk-Metadaten. Kontext des Kapitels/Abschnitts geht verloren.
|
||||
|
||||
**Root Cause:**
|
||||
- `chunk_strategy=recursive` mit `chunk_size=512, chunk_overlap=50` — zu kleine Chunks
|
||||
- Chunking beachtet keine Dokumentstruktur (Artikel-/Paragraphengrenzen)
|
||||
- Keine Einleitung/Kapitelkontext als Prefix
|
||||
|
||||
**Empfehlung für Re-Ingestion:**
|
||||
- **Strukturiertes Chunking:** Chunks an Artikel-/Paragraphengrenzen schneiden
|
||||
- **Kontext-Prefix:** Kapiteleinleitung und übergeordnete Struktur mitliefern
|
||||
- **Metadaten anreichern:** `article`, `paragraph`, `article_type`, `section_hierarchy`
|
||||
- **Größere Chunks:** Mindestens 1024 Tokens, besser volle Artikel/Paragraphen
|
||||
|
||||
### 3. Cross-Collection-Duplikate
|
||||
|
||||
**Problem:** `nist_csf_2_0` in `bp_compliance_ce` (67 Chunks) UND `bp_compliance_datenschutz` (162 Chunks). EU-Verordnungen sowohl in `bp_compliance_ce` als auch `bp_compliance_gesetze`.
|
||||
|
||||
**Root Cause:** Keine Collection-Zuordnungsregeln. Manuelle Zuweisung bei Ingestion.
|
||||
|
||||
**Fix:** `cleanup-qdrant-duplicates.py` Script bereinigt Cross-Collection-Duplikate.
|
||||
|
||||
**Empfehlung:** Klare Collection-Zuordnungsregeln:
|
||||
- `bp_compliance_ce` = EU-Verordnungen + internationale Standards
|
||||
- `bp_compliance_gesetze` = Deutsche + österreichische Gesetze (NUR nationale Gesetze)
|
||||
- `bp_compliance_datenschutz` = EDPB/WP29 Leitlinien + Privacy Frameworks
|
||||
|
||||
### 4. OWASP Multilingual Controls
|
||||
|
||||
**Problem:** 324 OWASP Top 10 Controls in ZH, AR, ID, FR, ES, PT — Übersetzungen derselben 10 Kategorien. Kein Mehrwert, aber 324 doppelte Controls generiert.
|
||||
|
||||
**Root Cause:** Multilingual PDFs/GitHub-Quellen ohne Spracherkennung ingestiert.
|
||||
|
||||
**Fix:** 324 als `duplicate` markiert und gelöscht.
|
||||
|
||||
**Empfehlung:** Bei Ingestion Spracherkennung + Deduplizierung. Nur DE + EN behalten.
|
||||
|
||||
### 5. Fehlende Artikel/Paragraph-Extraktion
|
||||
|
||||
**Problem:** Chunks haben `article` und `paragraph` oft leer oder falsch. Die LLM-basierte Extraktion bei der Control-Generierung ist unzuverlässig.
|
||||
|
||||
**Root Cause:** Ingestion-Pipeline extrahiert keine Strukturinformation aus dem PDF.
|
||||
|
||||
**Fix (implementiert):** PDF-QA-Pipeline (`pdf_qa_all.py`) matched `source_original_text` gegen Original-PDFs und extrahiert korrekte Artikel/Paragraphen — 86% Match-Rate.
|
||||
|
||||
**Empfehlung:** Bei Re-Ingestion direkt in den Chunk-Metadaten speichern.
|
||||
|
||||
### 6. Job-Tracking nicht persistent
|
||||
|
||||
**Problem:** Generation-Jobs laufen als Background-Tasks. Kein Logging, welche Chunks verarbeitet, Status nur über API abfragbar. Bei API-Timeout oder Restart geht der Fortschritt verloren.
|
||||
|
||||
**Root Cause:** `asyncio.create_task()` hat keinen Recovery-Mechanismus.
|
||||
|
||||
**Fix (teilweise):** `canonical_generation_jobs` Tabelle trackt Jobs. `canonical_processed_chunks` markiert verarbeitete Chunks.
|
||||
|
||||
**Empfehlung:**
|
||||
- Job-Log in DB persistieren (nicht nur stdout)
|
||||
- Fortschritt in `canonical_generation_jobs.progress` als JSONB speichern
|
||||
- Chunk-Level-Status: verarbeitet / übersprungen / Fehler
|
||||
- Recovery-Fähigkeit: Job kann von letztem Checkpoint fortgesetzt werden
|
||||
|
||||
## Empfohlene Metadaten für Re-Ingestion
|
||||
|
||||
### Chunk-Level Metadaten (Qdrant Payload)
|
||||
|
||||
```json
|
||||
{
|
||||
"chunk_text": "...",
|
||||
"regulation_code": "eu_2016_679",
|
||||
"regulation_name_de": "DSGVO (EU) 2016/679",
|
||||
"regulation_name_en": "GDPR (EU) 2016/679",
|
||||
"article": "25",
|
||||
"article_title": "Datenschutz durch Technikgestaltung und datenschutzfreundliche Voreinstellungen",
|
||||
"article_type": "article",
|
||||
"paragraph": "1",
|
||||
"section_hierarchy": ["Kapitel IV", "Abschnitt 2", "Artikel 25"],
|
||||
"chapter_context": "Kapitel IV — Verantwortlicher und Auftragsverarbeiter",
|
||||
"pages": [45, 46],
|
||||
"effective_date": "2018-05-25",
|
||||
"publication_date": "2016-04-27",
|
||||
"document_version": "2016-04-27",
|
||||
"source_language": "de",
|
||||
"source_url": "https://eur-lex.europa.eu/...",
|
||||
"celex": "32016R0679",
|
||||
"license": "EU_LAW",
|
||||
"license_rule": 1,
|
||||
"source_type": "law",
|
||||
"category": "datenschutz",
|
||||
"chunk_position": 42,
|
||||
"total_chunks": 423
|
||||
}
|
||||
```
|
||||
|
||||
### Dokument-Level Metadaten (Corpus Version)
|
||||
|
||||
```json
|
||||
{
|
||||
"regulation_code": "eu_2016_679",
|
||||
"canonical_name_de": "DSGVO (EU) 2016/679",
|
||||
"canonical_name_en": "GDPR (EU) 2016/679",
|
||||
"document_type": "eu_regulation",
|
||||
"effective_date": "2018-05-25",
|
||||
"publication_date": "2016-04-27",
|
||||
"supersedes": null,
|
||||
"superseded_by": null,
|
||||
"source_pdf": "gdpr_regulation_eu_2016_679.pdf",
|
||||
"source_pdf_sha256": "abc123...",
|
||||
"total_articles": 99,
|
||||
"total_recitals": 173,
|
||||
"total_annexes": 0,
|
||||
"ingestion_date": "2026-03-20",
|
||||
"ingestion_version": "v2"
|
||||
}
|
||||
```
|
||||
|
||||
## Pipeline-Härtung Checkliste
|
||||
|
||||
### Vor Ingestion
|
||||
|
||||
- [ ] Prüfen ob `regulation_code` bereits in einer Collection existiert
|
||||
- [ ] PDF-SHA256 gegen bekannte PDFs prüfen (Duplikat-Erkennung)
|
||||
- [ ] `regulation_name` aus `REGULATION_LICENSE_MAP` verwenden, NICHT aus Chunk-Metadaten
|
||||
- [ ] Spracherkennung: Nur DE + EN ingestieren
|
||||
- [ ] Dokument-Metadaten (effective_date, publication_date) recherchieren
|
||||
|
||||
### Während Ingestion
|
||||
|
||||
- [ ] Strukturiertes Chunking an Artikel-/Paragraphengrenzen
|
||||
- [ ] Kontext-Prefix mit Kapiteleinleitung
|
||||
- [ ] Chunk-Metadaten anreichern (article, paragraph, article_type, section_hierarchy)
|
||||
- [ ] Fortschritt in DB loggen
|
||||
|
||||
### Nach Ingestion
|
||||
|
||||
- [ ] Chunk-Count pro `regulation_code` prüfen (Sanity Check)
|
||||
- [ ] PDF-QA gegen Original-PDF laufen lassen
|
||||
- [ ] Cross-Collection-Duplikat-Check
|
||||
- [ ] Corpus-Version in DB eintragen
|
||||
|
||||
### Control-Generierung
|
||||
|
||||
- [ ] `source_citation.source` aus `REGULATION_LICENSE_MAP.name`, NICHT aus Chunk-Metadaten
|
||||
- [ ] Harmonisierung: Threshold 0.85 für Duplikate innerhalb gleicher `regulation_code`
|
||||
- [ ] Cross-Regulation-Harmonisierung bei ähnlichen Themen (z.B. DSGVO Art. 25 ↔ NIS2 Art. 21)
|
||||
- [ ] Job-Fortschritt persistent in DB speichern
|
||||
|
||||
## Workflow: Mac Mini → Production Sync
|
||||
|
||||
```
|
||||
1. Mac Mini: PDF → Qdrant (lokal, http://macmini:6333)
|
||||
2. Mac Mini: Control-Generierung → PostgreSQL (shared, 46.225.100.82:54321)
|
||||
3. QA: PDF-Match, Dedup, Source-Normalisierung
|
||||
4. Qdrant Migration: macmini:6333 → qdrant-dev.breakpilot.ai (scripts/migrate-qdrant.py)
|
||||
5. Deploy: git push gitea → Coolify Build + Deploy
|
||||
```
|
||||
|
||||
**WICHTIG:** PostgreSQL ist SHARED — Änderungen auf Mac Mini sind sofort in Production sichtbar. Qdrant hat getrennte Instanzen (lokal + production) und muss manuell synchronisiert werden.
|
||||
|
||||
## Scripts
|
||||
|
||||
| Script | Beschreibung |
|
||||
|---|---|
|
||||
| `scripts/ingest-phase-h.sh` | Haupt-Ingestion: 38 Dokumente → Qdrant |
|
||||
| `scripts/cleanup-qdrant-duplicates.py` | Qdrant Duplikat-Cleanup (8 Schritte) |
|
||||
| `scripts/migrate-qdrant.py` | Qdrant Migration: lokal → production |
|
||||
| `scripts/qa/phase5_normalize_and_cleanup.py` | DB Normalisierung + Hard Delete |
|
||||
| `scripts/qa/pdf_qa_all.py` | PDF-Match QA |
|
||||
@@ -96,6 +96,7 @@ erDiagram
|
||||
varchar verification_method
|
||||
varchar target_audience
|
||||
varchar generation_strategy
|
||||
varchar obligation_type
|
||||
smallint pipeline_version
|
||||
integer license_rule
|
||||
jsonb source_citation
|
||||
@@ -936,9 +937,11 @@ Drei Kompositions-Modi:
|
||||
|
||||
Zerlegt Rich Controls in atomare Controls. Laeuft VOR den Migration Passes 1-5.
|
||||
|
||||
#### Pass 0a — Obligation Extraction
|
||||
#### Pass 0a — Obligation Extraction + 3-Tier-Klassifizierung
|
||||
|
||||
Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM.
|
||||
Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM (Claude Sonnet 4.6).
|
||||
Jede Obligation wird als **pflicht**, **empfehlung** oder **kann** klassifiziert — nichts wird
|
||||
wegen fehlendem normativem Signal abgelehnt.
|
||||
|
||||
**6 Guardrails:**
|
||||
|
||||
@@ -949,23 +952,37 @@ Extrahiert einzelne normative Pflichten aus einem Rich Control per LLM.
|
||||
5. Nicht auf Evidence-Ebene zerlegen
|
||||
6. Parent-Link immer erhalten
|
||||
|
||||
**Quality Gate:** Jeder Kandidat wird gegen 6 Kriterien geprueft:
|
||||
**3-Tier Obligation Classification:**
|
||||
|
||||
- `has_normative_signal` — Normatives Sprachsignal erkannt
|
||||
- `single_action` — Nur eine Handlung
|
||||
- `not_rationale` — Keine blosse Begruendung
|
||||
- `not_evidence_only` — Kein reines Evidence-Fragment
|
||||
- `min_length` — Mindestlaenge erreicht
|
||||
- `has_parent_link` — Referenz zum Rich Control
|
||||
| obligation_type | Signal-Beispiele | Bedeutung |
|
||||
|---|---|---|
|
||||
| `pflicht` | müssen, ist zu, shall, must, required | Gesetzliche/regulatorische Pflicht |
|
||||
| `empfehlung` | soll, should, sicherstellen, dokumentieren | Best Practice, freiwillig |
|
||||
| `kann` | kann, darf, may, optional | Optionale Massnahme |
|
||||
|
||||
Kritische Checks: `not_evidence_only`, `min_length`, `has_parent_link`
|
||||
Obligations ohne erkennbares Signal werden als `empfehlung` klassifiziert (nicht rejected).
|
||||
Empfehlungen helfen Firmen, Systeme ueber das Pflichtprogramm hinaus zu sichern.
|
||||
|
||||
**Quality Gate — Kritische Checks:**
|
||||
|
||||
| Flag | Kritisch? | Beschreibung |
|
||||
|---|---|---|
|
||||
| `obligation_type` | — | Klassifizierung (pflicht/empfehlung/kann) |
|
||||
| `not_evidence_only` | **Ja** | Kein reines Evidence-Fragment |
|
||||
| `min_length` | **Ja** | Mindestlaenge (20 Zeichen) |
|
||||
| `has_parent_link` | **Ja** | Referenz zum Rich Control |
|
||||
| `has_normative_signal` | Nein | Informativer Check (nicht mehr Ablehnungsgrund) |
|
||||
| `single_action` | Nein | Nur eine Handlung (heuristisch) |
|
||||
| `not_rationale` | Nein | Keine blosse Begruendung |
|
||||
|
||||
#### Pass 0b — Atomic Control Composition
|
||||
|
||||
Erstellt aus jedem validierten Obligation Candidate ein atomares Control
|
||||
(LLM-gestuetzt mit Template-Fallback).
|
||||
(LLM-gestuetzt mit Template-Fallback). Das `obligation_type` Feld wird
|
||||
vom Parent-Obligation uebernommen.
|
||||
|
||||
**Datei:** `compliance/services/decomposition_pass.py`
|
||||
**Test-Script:** `scripts/qa/test_pass0a.py` (standalone, speichert JSON)
|
||||
|
||||
---
|
||||
|
||||
@@ -1012,11 +1029,13 @@ Die Crosswalk-Matrix bildet diese N:M-Beziehung ab.
|
||||
|
||||
**Migration 061:** Decomposition-Tabellen
|
||||
|
||||
| Tabelle | Beschreibung |
|
||||
| Tabelle / Feld | Beschreibung |
|
||||
|---------|-------------|
|
||||
| `obligation_candidates` | Extrahierte atomare Pflichten aus Rich Controls |
|
||||
| `obligation_candidates.obligation_type` | `pflicht` / `empfehlung` / `kann` (3-Tier-Klassifizierung) |
|
||||
| `canonical_controls.parent_control_uuid` | Self-Referenz zum Rich Control (neues Feld) |
|
||||
| `canonical_controls.decomposition_method` | Zerlegungsmethode (neues Feld) |
|
||||
| `canonical_controls.obligation_type` | Uebernommen von Obligation: pflicht/empfehlung/kann |
|
||||
|
||||
---
|
||||
|
||||
|
||||
@@ -567,7 +567,86 @@ curl -X POST https://api-dev.breakpilot.ai/api/compliance/v1/canonical/generate/
|
||||
|
||||
---
|
||||
|
||||
## Pass 0a/0b: Atomare Control-Zerlegung
|
||||
|
||||
Die Pipeline v3 erweitert die 7-Stufen-Pipeline um einen Vor-Pass, der Rich Controls in atomare Controls zerlegt.
|
||||
|
||||
### Pass 0a: Obligation Extraction
|
||||
|
||||
Extrahiert individuelle normative Pflichten aus Rich Controls via LLM.
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
A[Rich Control] -->|LLM| B[Obligations]
|
||||
B --> C{Quality Gate}
|
||||
C -->|Pass| D[validated]
|
||||
C -->|Fail| E[rejected]
|
||||
```
|
||||
|
||||
**3-Tier Klassifikation:**
|
||||
|
||||
| Typ | Erkennungsmuster | Beispiel |
|
||||
|-----|-----------------|---------|
|
||||
| **Pflicht** | muss, ist verpflichtet, hat sicherzustellen | "Der Verantwortliche MUSS ein Verzeichnis fuehren" |
|
||||
| **Empfehlung** | soll, sollte, wird empfohlen | "Es SOLLTE eine Risikobewertung durchgefuehrt werden" |
|
||||
| **Kann** | kann, darf, ist berechtigt | "Die Aufsichtsbehoerde KANN Geldbussen verhaengen" |
|
||||
|
||||
**Quality Gate (6 Regeln):**
|
||||
|
||||
1. Nur normative Aussagen (muss, sicherzustellen, verpflichtet)
|
||||
2. Ein Hauptverb pro Obligation
|
||||
3. Test-Obligations separat von operativen
|
||||
4. Reporting-Obligations separat
|
||||
5. Nicht auf Evidence-Ebene splitten
|
||||
6. Parent-Link immer erhalten
|
||||
|
||||
### Pass 0b: Atomic Control Composition
|
||||
|
||||
Verwandelt jede validierte Obligation in ein eigenstaendiges atomares Control.
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
A[Obligation] -->|LLM| B[Atomic Control]
|
||||
B -->|Dedup Check| C{4-Stage Dedup}
|
||||
C -->|NEW| D[Insert + Index]
|
||||
C -->|LINK| E[Parent-Link]
|
||||
C -->|REVIEW| F[Review-Queue]
|
||||
```
|
||||
|
||||
**Konfiguration:**
|
||||
|
||||
| Variable | Default | Beschreibung |
|
||||
|----------|---------|-------------|
|
||||
| `DECOMPOSITION_LLM_MODEL` | `claude-sonnet-4-6` | LLM fuer Pass 0a/0b |
|
||||
| `DECOMPOSITION_BATCH_SIZE` | `5` | Obligations pro LLM-Call |
|
||||
| `DECOMPOSITION_LLM_TIMEOUT` | `120` | Timeout in Sekunden |
|
||||
|
||||
**Ergebnisse (Stand 2026-03-21):**
|
||||
|
||||
| Metrik | Wert |
|
||||
|--------|------|
|
||||
| Rich Controls (technisch) | ~6.800 |
|
||||
| Atomare Controls (bisher) | 30 (PoC: 10x CRYP, AUTH, SEC) |
|
||||
| Ziel nach Full Run | ~18.000 unique Master Controls |
|
||||
| Obligations pro Rich Control | ~10 |
|
||||
| Dedup-Reduktion erwartet | ~70% |
|
||||
|
||||
### Quelldateien (Pass 0a/0b)
|
||||
|
||||
| Datei | Beschreibung |
|
||||
|-------|-------------|
|
||||
| `compliance/services/decomposition_pass.py` | Pass 0a + 0b Logik |
|
||||
| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine |
|
||||
| `migrations/061_obligation_candidates.sql` | Obligation-Tabelle |
|
||||
| `migrations/074_control_dedup.sql` | Dedup-Tabellen (Parent-Links, Review-Queue) |
|
||||
| `tests/test_decomposition_pass.py` | 90 Tests |
|
||||
| `tests/test_control_dedup.py` | 56 Tests |
|
||||
|
||||
---
|
||||
|
||||
## Verwandte Dokumentation
|
||||
|
||||
- [Canonical Control Library (CP-CLIB)](canonical-control-library.md) — Domains, Datenmodell, Too-Close-Detektor, CI/CD Validation
|
||||
- [Deduplizierungs-Engine](dedup-engine.md) — 4-Stufen Dedup, Multi-Parent-Linking, Review-Queue
|
||||
- [RAG Pipeline Benchmark](../../development/rag-pipeline-benchmark.md) — State-of-the-Art Vergleich, Optimierungsempfehlungen
|
||||
- [Multi-Layer Control Architecture](canonical-control-library.md#multi-layer-control-architecture) — 10-Stage Pipeline-Erweiterung mit Obligations, Patterns, Crosswalk
|
||||
|
||||
253
docs-src/services/sdk-modules/dedup-engine.md
Normal file
@@ -0,0 +1,253 @@
|
||||
# Deduplizierungs-Engine (Control Dedup)
|
||||
|
||||
4-stufige Dedup-Pipeline zur Vermeidung doppelter atomarer Controls bei der Pass 0b Komposition. Kern-USP: **"1 Control erfuellt 5 Gesetze"** durch Multi-Parent-Linking.
|
||||
|
||||
**Backend:** `backend-compliance/compliance/services/control_dedup.py`
|
||||
**Migration:** `backend-compliance/migrations/074_control_dedup.sql`
|
||||
**Tests:** `backend-compliance/tests/test_control_dedup.py` (56 Tests)
|
||||
|
||||
---
|
||||
|
||||
## Motivation
|
||||
|
||||
Aus ~6.800 technischen Controls x ~10 Obligations pro Control entstehen ~68.000 atomare Kandidaten. Ziel: ~18.000 einzigartige Master Controls. Viele Obligations aus verschiedenen Gesetzen fuehren zum gleichen technischen Control (z.B. "MFA implementieren" in DSGVO, NIS2, AI Act).
|
||||
|
||||
**Problem:** Embedding-only Deduplizierung ist GEFAEHRLICH fuer Compliance.
|
||||
|
||||
!!! danger "False-Positive Beispiel"
|
||||
- "Admin-Zugriffe muessen MFA nutzen" vs. "Remote-Zugriffe muessen MFA nutzen"
|
||||
- Embedding sagt >0.9 aehnlich
|
||||
- Aber es sind **ZWEI verschiedene Controls** (verschiedene Objekte!)
|
||||
|
||||
---
|
||||
|
||||
## 4-Stufen Entscheidungsbaum
|
||||
|
||||
```mermaid
flowchart TD
    A[Kandidat-Control] --> B{Pattern-Gate}
    B -->|pattern_id verschieden| N1[NEW CONTROL]
    B -->|pattern_id gleich| C{Action-Check}
    C -->|Action verschieden| N2[NEW CONTROL]
    C -->|Action gleich| D{Object-Normalization}
    D -->|Objekt verschieden| E{Similarity > 0.95?}
    E -->|Ja| L1[LINK]
    E -->|Nein| N3[NEW CONTROL]
    D -->|Objekt gleich| F{Tiered Thresholds}
    F -->|> 0.92| L2[LINK]
    F -->|0.85 - 0.92| R[REVIEW QUEUE]
    F -->|< 0.85| N4[NEW CONTROL]
```
|
||||
|
||||
### Stufe 1: Pattern-Gate (hart)
|
||||
|
||||
`pattern_id` muss uebereinstimmen. Verhindert ~80% der False Positives.
|
||||
|
||||
```python
|
||||
if pattern_id != existing.pattern_id:
|
||||
→ NEW CONTROL # Verschiedene Kontrollmuster = verschiedene Controls
|
||||
```
|
||||
|
||||
### Stufe 2: Action-Check (hart)
|
||||
|
||||
Normalisierte Aktionsverben muessen uebereinstimmen. "Implementieren" vs. "Testen" = verschiedene Controls, auch bei gleichem Objekt.
|
||||
|
||||
```python
|
||||
if normalize_action("implementieren") != normalize_action("testen"):
|
||||
→ NEW CONTROL # "implement" != "test"
|
||||
```
|
||||
|
||||
**Action-Normalisierung (Deutsch → Englisch):**
|
||||
|
||||
| Deutsche Verben | Kanonische Form |
|
||||
|----------------|-----------------|
|
||||
| implementieren, umsetzen, einrichten, aktivieren | `implement` |
|
||||
| testen, pruefen, ueberpruefen, verifizieren | `test` |
|
||||
| ueberwachen, monitoring, beobachten | `monitor` |
|
||||
| verschluesseln | `encrypt` |
|
||||
| protokollieren, aufzeichnen, loggen | `log` |
|
||||
| beschraenken, einschraenken, begrenzen | `restrict` |
|
||||
|
||||
### Stufe 3: Object-Normalization (weich)
|
||||
|
||||
Compliance-Objekte werden auf kanonische Token normalisiert.
|
||||
|
||||
```python
|
||||
normalize_object("Admin-Konten") → "privileged_access"
|
||||
normalize_object("Remote-Zugriff") → "remote_access"
|
||||
normalize_object("MFA") → "multi_factor_auth"
|
||||
```
|
||||
|
||||
Bei verschiedenen Objekten gilt ein hoeherer Schwellenwert (0.95 statt 0.92).
|
||||
|
||||
**Objekt-Normalisierung:**
|
||||
|
||||
| Eingabe | Kanonischer Token |
|
||||
|---------|------------------|
|
||||
| MFA, 2FA, Multi-Faktor-Authentifizierung | `multi_factor_auth` |
|
||||
| Admin-Konten, privilegierte Zugriffe | `privileged_access` |
|
||||
| Verschluesselung, Kryptografie | `encryption` |
|
||||
| Schluessel, Key Management | `key_management` |
|
||||
| TLS, SSL, HTTPS | `transport_encryption` |
|
||||
| Firewall | `firewall` |
|
||||
| Audit-Log, Protokoll, Logging | `audit_logging` |
|
||||
|
||||
### Stufe 4: Embedding Similarity (Qdrant)
|
||||
|
||||
Tiered Thresholds basierend auf Cosine-Similarity:
|
||||
|
||||
| Score | Verdict | Aktion |
|
||||
|-------|---------|--------|
|
||||
| > 0.95 | **LINK** | Bei verschiedenen Objekten |
|
||||
| > 0.92 | **LINK** | Parent-Link hinzufuegen |
|
||||
| 0.85 - 0.92 | **REVIEW** | In Review-Queue zur manuellen Pruefung |
|
||||
| < 0.85 | **NEW** | Neues Control anlegen |
|
||||
|
||||
---
|
||||
|
||||
## Canonicalization Layer
|
||||
|
||||
Vor dem Embedding wird der deutsche Compliance-Text in normalisiertes Englisch transformiert:
|
||||
|
||||
```
|
||||
"Administratoren muessen MFA verwenden"
|
||||
→ "implement multi_factor_auth for administratoren verwenden"
|
||||
→ Bessere Matches, weniger Embedding-Rauschen
|
||||
```
|
||||
|
||||
Dies reduziert das Rauschen durch synonyme Formulierungen in verschiedenen Gesetzen.
|
||||
|
||||
---
|
||||
|
||||
## Multi-Parent-Linking (M:N)
|
||||
|
||||
Ein atomares Control kann mehrere Eltern-Controls aus verschiedenen Regulierungen haben:
|
||||
|
||||
```json
|
||||
{
|
||||
"control_id": "AUTH-1072-A01",
|
||||
"parent_links": [
|
||||
{"parent_control_id": "AUTH-1001", "source": "NIST IA-02(01)", "link_type": "decomposition"},
|
||||
{"parent_control_id": "NIS2-045", "source": "NIS2 Art. 21", "link_type": "dedup_merge"}
|
||||
]
|
||||
}
|
||||
```
|
||||
|
||||
### Datenbank-Schema
|
||||
|
||||
```sql
|
||||
-- Migration 074: control_parent_links (M:N)
|
||||
CREATE TABLE control_parent_links (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
|
||||
parent_control_uuid UUID NOT NULL REFERENCES canonical_controls(id),
|
||||
link_type VARCHAR(30) NOT NULL DEFAULT 'decomposition',
|
||||
confidence NUMERIC(3,2) DEFAULT 1.0,
|
||||
source_regulation VARCHAR(100),
|
||||
source_article VARCHAR(100),
|
||||
obligation_candidate_id UUID REFERENCES obligation_candidates(id),
|
||||
created_at TIMESTAMPTZ DEFAULT NOW(),
|
||||
CONSTRAINT uq_parent_link UNIQUE (control_uuid, parent_control_uuid)
|
||||
);
|
||||
```
|
||||
|
||||
**Link-Typen:**
|
||||
|
||||
| Typ | Bedeutung |
|
||||
|-----|-----------|
|
||||
| `decomposition` | Aus Pass 0b Zerlegung |
|
||||
| `dedup_merge` | Durch Dedup-Engine als Duplikat erkannt |
|
||||
| `manual` | Manuell durch Reviewer verknuepft |
|
||||
| `crosswalk` | Aus Crosswalk-Matrix uebernommen |
|
||||
|
||||
---
|
||||
|
||||
## Review-Queue
|
||||
|
||||
Borderline-Matches (Similarity 0.85-0.92) werden in die Review-Queue geschrieben:
|
||||
|
||||
```sql
|
||||
-- Migration 074: control_dedup_reviews
|
||||
CREATE TABLE control_dedup_reviews (
|
||||
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
|
||||
candidate_control_id VARCHAR(30) NOT NULL,
|
||||
candidate_title TEXT NOT NULL,
|
||||
candidate_objective TEXT,
|
||||
matched_control_uuid UUID REFERENCES canonical_controls(id),
|
||||
matched_control_id VARCHAR(30),
|
||||
similarity_score NUMERIC(4,3),
|
||||
dedup_stage VARCHAR(40) NOT NULL,
|
||||
review_status VARCHAR(20) DEFAULT 'pending',
|
||||
-- pending → accepted_link | accepted_new | rejected
|
||||
created_at TIMESTAMPTZ DEFAULT NOW()
|
||||
);
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Qdrant Collection
|
||||
|
||||
```
|
||||
Collection: atomic_controls
|
||||
Dimension: 1024 (bge-m3)
|
||||
Distance: COSINE
|
||||
Payload: pattern_id, action_normalized, object_normalized, control_id, canonical_text
|
||||
Index: pattern_id (keyword), action_normalized (keyword), object_normalized (keyword)
|
||||
Query: IMMER mit filter: pattern_id == X (reduziert Suche drastisch)
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Integration in Pass 0b
|
||||
|
||||
Die Dedup-Engine ist optional in `DecompositionPass` integriert:
|
||||
|
||||
```python
|
||||
decomp = DecompositionPass(db=session, dedup_enabled=True)
|
||||
stats = await decomp.run_pass0b(limit=100, use_anthropic=True)
|
||||
|
||||
# Stats enthalten Dedup-Metriken:
|
||||
# stats["dedup_linked"] = 15 (Duplikate → Parent-Link)
|
||||
# stats["dedup_review"] = 3 (Borderline → Review-Queue)
|
||||
# stats["controls_created"] = 82 (Neue Controls)
|
||||
```
|
||||
|
||||
**Ablauf bei Pass 0b mit Dedup:**
|
||||
|
||||
1. LLM generiert atomares Control
|
||||
2. Dedup-Engine prueft 4 Stufen
|
||||
3. **LINK:** Kein neues Control, Parent-Link zu bestehendem
|
||||
4. **REVIEW:** Kein neues Control, Eintrag in Review-Queue
|
||||
5. **NEW:** Control anlegen + in Qdrant indexieren
|
||||
|
||||
---
|
||||
|
||||
## Konfiguration
|
||||
|
||||
| Umgebungsvariable | Default | Beschreibung |
|
||||
|-------------------|---------|-------------|
|
||||
| `DEDUP_ENABLED` | `true` | Dedup-Engine ein/ausschalten |
|
||||
| `DEDUP_LINK_THRESHOLD` | `0.92` | Schwelle fuer automatisches Linking |
|
||||
| `DEDUP_REVIEW_THRESHOLD` | `0.85` | Schwelle fuer Review-Queue |
|
||||
| `DEDUP_LINK_THRESHOLD_DIFF_OBJ` | `0.95` | Schwelle bei verschiedenen Objekten |
|
||||
| `DEDUP_QDRANT_COLLECTION` | `atomic_controls` | Qdrant-Collection fuer Dedup-Index |
|
||||
| `QDRANT_URL` | `http://host.docker.internal:6333` | Qdrant-URL |
|
||||
| `EMBEDDING_URL` | `http://embedding-service:8087` | Embedding-Service-URL |
|
||||
|
||||
---
|
||||
|
||||
## Quelldateien
|
||||
|
||||
| Datei | Beschreibung |
|
||||
|-------|-------------|
|
||||
| `compliance/services/control_dedup.py` | 4-Stufen Dedup-Engine |
|
||||
| `compliance/services/decomposition_pass.py` | Pass 0a/0b mit Dedup-Integration |
|
||||
| `migrations/074_control_dedup.sql` | DB-Schema (parent_links, review_queue) |
|
||||
| `tests/test_control_dedup.py` | 56 Unit-Tests |
|
||||
|
||||
---
|
||||
|
||||
## Verwandte Dokumentation
|
||||
|
||||
- [Control Generator Pipeline](control-generator-pipeline.md) — 7-Stufen RAG→Control Pipeline
|
||||
- [Canonical Control Library](canonical-control-library.md) — Datenmodell, Domains, Similarity-Detektor
|
||||
@@ -107,6 +107,7 @@ nav:
|
||||
- Policy-Bibliothek (29 Richtlinien): services/sdk-modules/policy-bibliothek.md
|
||||
- Canonical Control Library (CP-CLIB): services/sdk-modules/canonical-control-library.md
|
||||
- Control Generator Pipeline: services/sdk-modules/control-generator-pipeline.md
|
||||
- Deduplizierungs-Engine: services/sdk-modules/dedup-engine.md
|
||||
- Control Provenance Wiki: services/sdk-modules/control-provenance.md
|
||||
- Strategie:
|
||||
- Wettbewerbsanalyse & Roadmap: strategy/wettbewerbsanalyse.md
|
||||
@@ -115,3 +116,5 @@ nav:
|
||||
- Dokumentation: development/documentation.md
|
||||
- CI/CD Pipeline: development/ci-cd-pipeline.md
|
||||
- QA Control Quality: development/qa-control-quality.md
|
||||
- RAG Pipeline Lessons Learned: development/rag-pipeline-lessons-learned.md
|
||||
- RAG Pipeline Benchmark: development/rag-pipeline-benchmark.md
|
||||
|
||||
@@ -1,11 +1,29 @@
|
||||
"""Apply PDF QA results: update source_citation with correct article + article_type."""
|
||||
"""
|
||||
Apply PDF QA results: update source_citation with correct article_type + article.
|
||||
|
||||
Safety modes:
|
||||
--safe (default): Only set article_type. Set article only when empty. Mark preamble as recital_suspect.
|
||||
--force-article: Also overwrite existing articles (CAREFUL: NIST substring matching is unreliable).
|
||||
--dry-run: Show what would change without writing.
|
||||
|
||||
Usage:
|
||||
python3 apply_pdf_qa_results.py # safe mode (apply article_type + empty articles)
|
||||
python3 apply_pdf_qa_results.py --dry-run # show changes without writing
|
||||
python3 apply_pdf_qa_results.py --force-article # also overwrite existing articles
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from collections import Counter
|
||||
|
||||
RESULTS_FILE = "/tmp/pdf_qa_results.json"
|
||||
|
||||
# Parse args
|
||||
dry_run = "--dry-run" in sys.argv
|
||||
force_article = "--force-article" in sys.argv
|
||||
|
||||
# Load results
|
||||
with open(RESULTS_FILE) as f:
|
||||
results = json.load(f)
|
||||
@@ -21,35 +39,101 @@ conn = psycopg2.connect(
|
||||
options="-c search_path=compliance,public"
|
||||
)
|
||||
|
||||
# Update in batches
|
||||
# Load current DB state for all affected controls
|
||||
cur = conn.cursor()
|
||||
updated = 0
|
||||
ctrl_ids = [r["ctrl_id"] for r in results]
|
||||
cur.execute("""
|
||||
SELECT id,
|
||||
source_citation->>'article' as article,
|
||||
source_citation->>'article_type' as article_type,
|
||||
source_citation->>'source' as source
|
||||
FROM compliance.canonical_controls
|
||||
WHERE id = ANY(%s::uuid[])
|
||||
""", (ctrl_ids,))
|
||||
db_state = {}
|
||||
for row in cur.fetchall():
|
||||
db_state[str(row[0])] = {"article": row[1] or "", "article_type": row[2], "source": row[3]}
|
||||
|
||||
# Counters
|
||||
stats = Counter()
|
||||
updated_type = 0
|
||||
updated_article = 0
|
||||
updated_recital = 0
|
||||
errors = 0
|
||||
unchanged = 0
|
||||
|
||||
for i, r in enumerate(results):
|
||||
ctrl_id = r["ctrl_id"]
|
||||
article_label = r["article_label"]
|
||||
article_type = r["article_type"] # preamble, article, annex, section, unknown
|
||||
new_article = r["article_label"]
|
||||
new_type = r["article_type"]
|
||||
db = db_state.get(ctrl_id, {})
|
||||
|
||||
if not db:
|
||||
stats["missing_in_db"] += 1
|
||||
continue
|
||||
|
||||
old_type = db.get("article_type")
|
||||
old_article = db.get("article", "").strip()
|
||||
|
||||
# Decide what to update
|
||||
set_type = (old_type != new_type)
|
||||
set_article = (not old_article) or (force_article and old_article != new_article)
|
||||
set_recital = (new_type == "preamble")
|
||||
|
||||
if set_type:
|
||||
stats["type_" + ("new" if not old_type else "changed")] += 1
|
||||
else:
|
||||
stats["type_unchanged"] += 1
|
||||
|
||||
if not old_article and set_article:
|
||||
stats["article_new"] += 1
|
||||
elif old_article and old_article != new_article:
|
||||
if force_article:
|
||||
stats["article_force_changed"] += 1
|
||||
else:
|
||||
stats["article_skipped"] += 1
|
||||
else:
|
||||
stats["article_unchanged"] += 1
|
||||
|
||||
if set_recital:
|
||||
stats["recital"] += 1
|
||||
|
||||
if dry_run:
|
||||
continue
|
||||
|
||||
try:
|
||||
# Update source_citation: set article and article_type
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = source_citation
|
||||
|| jsonb_build_object('article', %s, 'article_type', %s),
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
AND (
|
||||
source_citation->>'article' IS DISTINCT FROM %s
|
||||
OR source_citation->>'article_type' IS DISTINCT FROM %s
|
||||
)
|
||||
""", (article_label, article_type, ctrl_id, article_label, article_type))
|
||||
# Build JSONB update
|
||||
updates = {}
|
||||
if set_type:
|
||||
updates["article_type"] = new_type
|
||||
if set_article:
|
||||
updates["article"] = new_article
|
||||
|
||||
if cur.rowcount > 0:
|
||||
updated += 1
|
||||
else:
|
||||
unchanged += 1
|
||||
if updates:
|
||||
# Merge into source_citation
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = COALESCE(source_citation, '{}'::jsonb) || %s::jsonb,
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
""", (json.dumps(updates), ctrl_id))
|
||||
if set_type:
|
||||
updated_type += 1
|
||||
if set_article:
|
||||
updated_article += 1
|
||||
|
||||
# Mark preamble as recital_suspect
|
||||
if set_recital:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET generation_metadata = jsonb_set(
|
||||
COALESCE(generation_metadata, '{}'::jsonb),
|
||||
'{recital_suspect}',
|
||||
'true'::jsonb
|
||||
),
|
||||
updated_at = now()
|
||||
WHERE id = %s::uuid
|
||||
""", (ctrl_id,))
|
||||
updated_recital += 1
|
||||
|
||||
except Exception as e:
|
||||
errors += 1
|
||||
@@ -58,12 +142,37 @@ for i, r in enumerate(results):
|
||||
conn.rollback()
|
||||
continue
|
||||
|
||||
if (i + 1) % 500 == 0:
|
||||
if (i + 1) % 1000 == 0:
|
||||
conn.commit()
|
||||
print(f" Progress: {i+1}/{len(results)} (updated: {updated}, unchanged: {unchanged}, errors: {errors})")
|
||||
print(f" Progress: {i+1}/{len(results)}")
|
||||
|
||||
conn.commit()
|
||||
print(f"\nDone: {updated} updated, {unchanged} unchanged, {errors} errors out of {len(results)}")
|
||||
if not dry_run:
|
||||
conn.commit()
|
||||
|
||||
mode = "DRY-RUN" if dry_run else "APPLIED"
|
||||
print(f"\n{'='*60}")
|
||||
print(f" Mode: {mode}")
|
||||
print(f"{'='*60}")
|
||||
print(f"\n article_type:")
|
||||
print(f" New (was NULL): {stats['type_new']:5d}")
|
||||
print(f" Changed: {stats['type_changed']:5d}")
|
||||
print(f" Unchanged: {stats['type_unchanged']:5d}")
|
||||
print(f"\n article:")
|
||||
print(f" New (was empty): {stats['article_new']:5d}")
|
||||
if force_article:
|
||||
print(f" Force-changed: {stats['article_force_changed']:5d}")
|
||||
else:
|
||||
print(f" Differs (SKIPPED): {stats['article_skipped']:5d}")
|
||||
print(f" Unchanged: {stats['article_unchanged']:5d}")
|
||||
print(f"\n Preamble/Recital: {stats['recital']:5d}")
|
||||
print(f" Missing in DB: {stats['missing_in_db']:5d}")
|
||||
|
||||
if not dry_run:
|
||||
print(f"\n Updates written:")
|
||||
print(f" article_type: {updated_type:5d}")
|
||||
print(f" article: {updated_article:5d}")
|
||||
print(f" recital_suspect: {updated_recital:5d}")
|
||||
print(f" Errors: {errors:5d}")
|
||||
|
||||
# Verify: count by article_type
|
||||
cur.execute("""
|
||||
|
||||
524
scripts/qa/benchmark_llm_controls.py
Normal file
524
scripts/qa/benchmark_llm_controls.py
Normal file
@@ -0,0 +1,524 @@
|
||||
#!/usr/bin/env python3
"""
Phase 7.4 Benchmark: Compare gpt-oss-120b vs Claude Sonnet for Control Generation.

Tests 5 representative gap articles from different sources.
Measures: quality (JSON valid, fields complete), response time, cost estimate.

Usage:
    python3 benchmark_llm_controls.py
"""
import json
import time
import sys
import os
import requests
from pathlib import Path

# ── Config ──────────────────────────────────────────────────────────
LITELLM_URL = "https://llm-dev.meghsakha.com"
LITELLM_MODEL = "gpt-oss-120b"
# SECURITY FIX: an API key was previously hard-coded here and committed to the
# repo. Read it from the environment instead — and rotate the leaked key.
LITELLM_API_KEY = os.environ.get("LITELLM_API_KEY", "")

ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
ANTHROPIC_MODEL = "claude-sonnet-4-6"
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")

# Directory holding the source-law PDFs used for article extraction.
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))

try:
    import fitz  # PyMuPDF — optional; without it extraction returns empty text
except ImportError:
    print("PyMuPDF not available, using pre-extracted texts")
    fitz = None
|
||||
|
||||
# ── Prompts (identical to control_generator.py) ─────────────────────

# System prompt sent with every benchmark request; instructs the model to
# answer with JSON only (German, matching the production control generator).
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""

# Shared prompt fragment describing the applicability fields; appended to the
# field list in build_prompt() below.
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
Moegliche Werte: "all", "Technologie / IT", "Finanzdienstleistungen", "Gesundheitswesen",
"Produktion / Industrie", "Energie", "Telekommunikation", "Oeffentlicher Dienst"
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
Verwende ["all"] wenn keine Groessenbeschraenkung.
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
{"requires_any": ["signal"], "description": "Erklaerung"}"""
|
||||
|
||||
|
||||
def build_prompt(source_name: str, article_label: str, article_text: str, license_type: str) -> str:
    """Assemble the German user prompt asking the LLM to structure one law
    article as a security/compliance control (JSON output expected).

    The article text is capped at 3000 characters to keep the prompt within
    the token budget.
    """
    excerpt = article_text[:3000]
    header = (
        "Strukturiere den folgenden Gesetzestext als Security/Compliance Control.\n"
        f"Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).\n"
        "\n"
        "WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.\n"
        "Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.\n"
        "\n"
    )
    field_spec = (
        "Gib JSON zurück mit diesen Feldern:\n"
        "- title: Kurzer prägnanter Titel (max 100 Zeichen)\n"
        "- objective: Was soll erreicht werden? (1-3 Sätze)\n"
        "- rationale: Warum ist das wichtig? (1-2 Sätze)\n"
        "- requirements: Liste von konkreten Anforderungen (Strings)\n"
        "- test_procedure: Liste von Prüfschritten (Strings)\n"
        "- evidence: Liste von Nachweisdokumenten (Strings)\n"
        "- severity: low/medium/high/critical\n"
        "- tags: Liste von Tags\n"
        "- domain: Fachgebiet (AUTH/CRYP/NET/DATA/LOG/ACC/SEC/INC/AI/COMP/GOV)\n"
        "- category: Inhaltliche Kategorie\n"
        "- target_audience: Liste der Zielgruppen\n"
        '- source_article: Artikel-Referenz (z.B. "Artikel 10", "§ 42")\n'
        '- source_paragraph: Absatz-Referenz (z.B. "Absatz 5")\n'
        f"{APPLICABILITY_PROMPT}\n"
    )
    footer = (
        "\n"
        f"Text: {excerpt}\n"
        f"Quelle: {source_name}, {article_label}"
    )
    return header + field_spec + footer
|
||||
|
||||
|
||||
# ── PDF Text Extraction ─────────────────────────────────────────────
|
||||
|
||||
def extract_article_text(pdf_file: str, article_label: str, doc_type: str) -> str:
    """Extract the text of a specific article/section from a source PDF.

    Args:
        pdf_file: File name relative to PDF_DIR.
        article_label: Human label of the target unit ("Artikel 32", "§ 26",
            "IA-2", ...).
        doc_type: One of "eu_regulation", "de_law", "nist", or anything else
            for a generic label search.

    Returns:
        Up to 3000 characters of the matched section, "" when the PDF is
        missing / PyMuPDF is unavailable / the label has no number, or a
        bracketed German "nicht im PDF gefunden" marker when the heading
        could not be located.
    """
    import re

    path = PDF_DIR / pdf_file
    if not path.exists() or fitz is None:
        return ""

    doc = fitz.open(str(path))
    full_text = "".join(page.get_text() + "\n" for page in doc)
    doc.close()

    def _numbered_section(head_fmt: str, display: str) -> str:
        # Slice from the numbered heading up to the next heading (num+1),
        # falling back to a 5000-char window when no successor heading exists.
        # NOTE(review): assumes heading numbers are consecutive — a gap in
        # article numbering would over-extend the slice; the 3000-char cap
        # bounds the damage.
        num_match = re.search(r'\d+', article_label)
        if not num_match:
            return ""
        num = int(num_match.group())
        start_match = re.search(head_fmt.format(num=num), full_text)
        if not start_match:
            return f"[{display} {num} nicht im PDF gefunden]"
        start = start_match.start()
        next_match = re.search(head_fmt.format(num=num + 1), full_text)
        end = next_match.start() if next_match else start + 5000
        return full_text[start:end].strip()[:3000]

    def _label_section(pattern: str) -> str:
        # Take a fixed 3000-char window starting at the first pattern match.
        match = re.search(pattern, full_text)
        if not match:
            return f"[{article_label} nicht im PDF gefunden]"
        start = match.start()
        return full_text[start:start + 3000].strip()

    if doc_type == "eu_regulation":
        # EU regulations use "Artikel N" headings on their own line.
        return _numbered_section(r'\nArtikel\s+{num}\s*\n', "Artikel")
    elif doc_type == "de_law":
        # German federal law uses "§ N" section markers.
        return _numbered_section(r'\n§\s+{num}\b', "§")
    elif doc_type == "nist":
        # NIST control IDs (e.g. "IA-2") appear verbatim at line start.
        return _label_section(rf'(?:^|\n)\s*{re.escape(article_label)}\b')
    else:
        # Generic fallback: any line containing the label.
        return _label_section(rf'(?:^|\n).*{re.escape(article_label)}\b')
|
||||
|
||||
|
||||
# ── API Calls ────────────────────────────────────────────────────────
|
||||
|
||||
def call_litellm(prompt: str, system_prompt: str) -> tuple:
    """Call the LiteLLM chat-completions endpoint.

    Returns:
        A 4-tuple ``(response_text, duration_seconds, error, usage)`` where
        ``error`` is None on success and ``usage`` is the provider's token
        usage dict ({} on failure).
    """
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {LITELLM_API_KEY}",
    }
    payload = {
        "model": LITELLM_MODEL,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": prompt},
        ],
        "temperature": 0.3,
        "max_tokens": 4096,
        "stream": False,
    }

    t0 = time.time()
    try:
        resp = requests.post(
            f"{LITELLM_URL}/v1/chat/completions",
            headers=headers,
            json=payload,
            timeout=180,
        )
        duration = time.time() - t0
        if resp.status_code != 200:
            # BUG FIX: this path previously returned a 3-tuple, while every
            # other path — and the caller's 4-value unpacking — expects 4
            # elements; any non-200 response crashed with a ValueError.
            return "", duration, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        content = data["choices"][0]["message"]["content"]
        usage = data.get("usage", {})
        return content, duration, None, usage
    except Exception as e:
        return "", time.time() - t0, str(e), {}
|
||||
|
||||
|
||||
def call_anthropic(prompt: str, system_prompt: str) -> tuple:
    """Call the Anthropic Messages API.

    Returns a 4-tuple ``(response_text, duration_seconds, error, usage)``;
    ``error`` is None on success, ``usage`` is {} when the call failed.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    started = time.time()
    try:
        resp = requests.post(ANTHROPIC_URL, headers=request_headers, json=body, timeout=180)
        elapsed = time.time() - started
        if resp.status_code != 200:
            return "", elapsed, f"HTTP {resp.status_code}: {resp.text[:200]}", {}
        data = resp.json()
        blocks = data.get("content")
        # The Messages API returns a list of content blocks; take the first.
        text = blocks[0]["text"] if blocks else ""
        return text, elapsed, None, data.get("usage", {})
    except Exception as exc:
        return "", time.time() - started, str(exc), {}
|
||||
|
||||
|
||||
# ── Quality Assessment ───────────────────────────────────────────────

# Fields a control MUST contain to count as complete (40 score points total).
REQUIRED_FIELDS = [
    "title", "objective", "rationale", "requirements",
    "test_procedure", "evidence", "severity", "domain",
]

# Nice-to-have fields (15 score points total).
BONUS_FIELDS = [
    "tags", "category", "target_audience", "source_article",
    "applicable_industries", "applicable_company_size",
]


def assess_quality(raw_text: str) -> dict:
    """Score a raw LLM control-generation response on a 0-100 scale.

    Parses *raw_text* as JSON (tolerating markdown code fences and JSON
    embedded in surrounding prose), then measures field completeness and
    depth.

    Returns:
        A dict with ``json_valid``, required/bonus field counts, depth
        metrics (requirements/test_procedure/evidence counts, title and
        objective lengths), the composite ``score`` (0-100), and — only when
        parsing succeeded — ``parsed_data`` holding the parsed object.
    """
    result = {
        "json_valid": False,
        "required_fields": 0,
        "required_total": len(REQUIRED_FIELDS),
        "bonus_fields": 0,
        "bonus_total": len(BONUS_FIELDS),
        "requirements_count": 0,
        "test_procedure_count": 0,
        "evidence_count": 0,
        "title_length": 0,
        "objective_length": 0,
        "score": 0,
    }

    # Strip a surrounding markdown code fence (```json ... ```), if present.
    text = raw_text.strip()
    if text.startswith("```"):
        lines = text.split("\n")
        text = "\n".join(lines[1:-1] if lines[-1].startswith("```") else lines[1:])

    try:
        data = json.loads(text)
    except json.JSONDecodeError:
        # Fallback: find the first {...} span embedded in surrounding prose.
        import re
        match = re.search(r'\{[\s\S]*\}', text)
        if not match:
            return result
        try:
            data = json.loads(match.group())
        except json.JSONDecodeError:
            return result

    # Multi-control responses come back as an array; score the first entry.
    if isinstance(data, list):
        data = data[0] if data else {}
    if not isinstance(data, dict):
        # BUG FIX: valid JSON that is not an object (a bare string/number)
        # previously crashed on data.get(); treat it as an unusable response.
        return result

    result["json_valid"] = True

    def _filled(val, min_str_len: int) -> bool:
        # A field counts when it is a string longer than min_str_len or a
        # non-empty list; any other type does not count.
        if isinstance(val, str):
            return len(val) > min_str_len
        if isinstance(val, list):
            return len(val) > 0
        return False

    for f in REQUIRED_FIELDS:
        if _filled(data.get(f), 2):
            result["required_fields"] += 1
    for f in BONUS_FIELDS:
        if _filled(data.get(f), 0):
            result["bonus_fields"] += 1

    # Depth metrics — guard against non-list / non-string values from the LLM.
    reqs = data.get("requirements", [])
    result["requirements_count"] = len(reqs) if isinstance(reqs, list) else 0
    tp = data.get("test_procedure", [])
    result["test_procedure_count"] = len(tp) if isinstance(tp, list) else 0
    ev = data.get("evidence", [])
    result["evidence_count"] = len(ev) if isinstance(ev, list) else 0
    title = data.get("title", "")
    result["title_length"] = len(title) if isinstance(title, str) else 0
    objective = data.get("objective", "")
    result["objective_length"] = len(objective) if isinstance(objective, str) else 0

    # Score: 0-100. 20 baseline for valid JSON (the previous conditional here
    # was unreachable — json_valid is always True at this point), 40 for
    # required fields, 15 for bonus fields, rest for depth.
    score = 20.0
    score += (result["required_fields"] / result["required_total"]) * 40
    score += (result["bonus_fields"] / result["bonus_total"]) * 15
    score += min(result["requirements_count"], 5) * 3   # max 15 for 5+ requirements
    score += min(result["test_procedure_count"], 3) * 3  # max 9 for 3+ tests
    score += 1 if result["objective_length"] > 50 else 0
    result["score"] = round(score, 1)

    result["parsed_data"] = data
    return result
|
||||
|
||||
|
||||
# ── Test Cases ───────────────────────────────────────────────────────

# Each entry drives one benchmark round against BOTH models.
# Keys: source (law name shown in the prompt), article (heading to extract),
# pdf (file in PDF_DIR), doc_type (extraction strategy), license (text-reuse
# hint for the prompt), description (human context, printed in the report).
TEST_CASES = [
    {
        "source": "DSGVO (EU) 2016/679",
        "article": "Artikel 32",
        "pdf": "dsgvo_2016_679.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Sicherheit der Verarbeitung — Kernthema Datenschutz",
    },
    {
        "source": "KI-Verordnung (EU) 2024/1689",
        "article": "Artikel 9",
        "pdf": "ai_act_2024_1689.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Risikomanagement für Hochrisiko-KI",
    },
    {
        "source": "NIS2-Richtlinie (EU) 2022/2555",
        "article": "Artikel 21",
        "pdf": "nis2_2022_2555.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Cybersicherheitsrisikomanagement — NIS2 Kernpflicht",
    },
    {
        "source": "Cyber Resilience Act (CRA)",
        "article": "Artikel 13",
        "pdf": "cra_2024_2847.pdf",
        "doc_type": "eu_regulation",
        "license": "EU_LAW",
        "description": "Pflichten der Hersteller",
    },
    {
        "source": "Bundesdatenschutzgesetz (BDSG)",
        "article": "§ 26",
        "pdf": "bdsg.pdf",
        "doc_type": "de_law",
        "license": "DE_LAW",
        "description": "Datenverarbeitung im Beschäftigungskontext",
    },
]
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Run the LLM benchmark: gpt-oss-120b (via LiteLLM) vs Claude Sonnet.

    For each TEST_CASES entry: extract the article text from its PDF,
    build one shared prompt, call both backends, score each response with
    assess_quality(), print a per-test comparison, then print an aggregate
    summary with a cost/time extrapolation and dump all results to
    /tmp/benchmark_llm_results.json.

    Exits with status 1 if ANTHROPIC_API_KEY is unset or the LiteLLM
    endpoint is unreachable.
    """
    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY environment variable")
        sys.exit(1)

    print("=" * 80)
    print("LLM BENCHMARK: gpt-oss-120b vs Claude Sonnet 4.6")
    print("=" * 80)
    print(f" LiteLLM: {LITELLM_URL} / {LITELLM_MODEL}")
    print(f" Anthropic: {ANTHROPIC_MODEL}")
    print(f" Tests: {len(TEST_CASES)}")
    print()

    # Pre-check LiteLLM: fail fast if the gateway is unreachable rather
    # than erroring on every single test case.
    try:
        r = requests.get(f"{LITELLM_URL}/v1/models",
                         headers={"Authorization": f"Bearer {LITELLM_API_KEY}"}, timeout=10)
        print(f" LiteLLM OK: {r.status_code}")
    except Exception as e:
        print(f" LiteLLM ERROR: {e}")
        sys.exit(1)

    results = []

    for i, tc in enumerate(TEST_CASES):
        print(f"\n{'='*80}")
        print(f"TEST {i+1}/{len(TEST_CASES)}: {tc['source']} — {tc['article']}")
        print(f" {tc['description']}")
        print(f"{'='*80}")

        # Extract article text from PDF.
        # NOTE(review): the startswith("[") check suggests the extractor
        # signals errors as bracketed messages — confirm in extract_article_text.
        article_text = extract_article_text(tc["pdf"], tc["article"], tc["doc_type"])
        if not article_text or article_text.startswith("["):
            print(f" WARNING: {article_text or 'Empty text'}")
            continue

        print(f" Text extracted: {len(article_text)} chars")
        # chr(10) is "\n"; flatten newlines for the one-line preview.
        print(f" First 120 chars: {article_text[:120].replace(chr(10), ' ')}...")

        # Both models receive the identical prompt for a fair comparison.
        prompt = build_prompt(tc["source"], tc["article"], article_text, tc["license"])

        # ── Call LiteLLM (gpt-oss-120b) ──
        print(f"\n --- gpt-oss-120b ---")
        litellm_raw, litellm_time, litellm_err, litellm_usage = call_litellm(prompt, SYSTEM_PROMPT)
        if litellm_err:
            print(f" ERROR: {litellm_err}")
            # Minimal stub so the comparison/result code below can .get() safely.
            litellm_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {litellm_time:.1f}s")
            print(f" Tokens: {litellm_usage}")
            litellm_quality = assess_quality(litellm_raw)
            print(f" JSON valid: {litellm_quality['json_valid']}")
            print(f" Score: {litellm_quality['score']}/100")
            print(f" Required fields: {litellm_quality['required_fields']}/{litellm_quality['required_total']}")
            print(f" Requirements: {litellm_quality['requirements_count']}, "
                  f"Tests: {litellm_quality['test_procedure_count']}, "
                  f"Evidence: {litellm_quality['evidence_count']}")
            if litellm_quality.get("parsed_data"):
                d = litellm_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")

        # ── Call Anthropic (Claude Sonnet) — same flow as above ──
        print(f"\n --- Claude Sonnet 4.6 ---")
        anthropic_raw, anthropic_time, anthropic_err, anthropic_usage = call_anthropic(prompt, SYSTEM_PROMPT)
        if anthropic_err:
            print(f" ERROR: {anthropic_err}")
            anthropic_quality = {"json_valid": False, "score": 0}
        else:
            print(f" Time: {anthropic_time:.1f}s")
            print(f" Tokens: {anthropic_usage}")
            anthropic_quality = assess_quality(anthropic_raw)
            print(f" JSON valid: {anthropic_quality['json_valid']}")
            print(f" Score: {anthropic_quality['score']}/100")
            print(f" Required fields: {anthropic_quality['required_fields']}/{anthropic_quality['required_total']}")
            print(f" Requirements: {anthropic_quality['requirements_count']}, "
                  f"Tests: {anthropic_quality['test_procedure_count']}, "
                  f"Evidence: {anthropic_quality['evidence_count']}")
            if anthropic_quality.get("parsed_data"):
                d = anthropic_quality["parsed_data"]
                print(f" Title: {d.get('title', 'N/A')}")

        # Compare the two runs. speed_ratio > 1 means 120b was slower.
        print(f"\n --- VERGLEICH ---")
        speed_ratio = litellm_time / anthropic_time if anthropic_time > 0 else 0
        print(f" Speed: 120b {litellm_time:.1f}s vs Sonnet {anthropic_time:.1f}s "
              f"({'120b ' + str(round(speed_ratio, 1)) + 'x langsamer' if speed_ratio > 1 else '120b schneller'})")
        print(f" Score: 120b {litellm_quality.get('score', 0)}/100 vs "
              f"Sonnet {anthropic_quality.get('score', 0)}/100")

        # Persisted per-test record; raw responses are truncated to 500 chars
        # to keep the JSON dump small.
        results.append({
            "test": f"{tc['source']} — {tc['article']}",
            "litellm": {
                "time": round(litellm_time, 1),
                "score": litellm_quality.get("score", 0),
                "json_valid": litellm_quality.get("json_valid", False),
                "requirements": litellm_quality.get("requirements_count", 0),
                "tests": litellm_quality.get("test_procedure_count", 0),
                "usage": litellm_usage,
                "raw": litellm_raw[:500] if litellm_raw else "",
            },
            "anthropic": {
                "time": round(anthropic_time, 1),
                "score": anthropic_quality.get("score", 0),
                "json_valid": anthropic_quality.get("json_valid", False),
                "requirements": anthropic_quality.get("requirements_count", 0),
                "tests": anthropic_quality.get("test_procedure_count", 0),
                "usage": anthropic_usage,
                "raw": anthropic_raw[:500] if anthropic_raw else "",
            },
        })

    # ── Summary ──────────────────────────────────────────────────────
    print(f"\n\n{'='*80}")
    print("ZUSAMMENFASSUNG")
    print(f"{'='*80}")

    if not results:
        print(" Keine Ergebnisse.")
        return

    litellm_scores = [r["litellm"]["score"] for r in results]
    anthropic_scores = [r["anthropic"]["score"] for r in results]
    litellm_times = [r["litellm"]["time"] for r in results]
    anthropic_times = [r["anthropic"]["time"] for r in results]

    # Side-by-side metrics table.
    print(f"\n {'Metrik':<30s} {'gpt-oss-120b':>15s} {'Claude Sonnet':>15s}")
    print(f" {'-'*30} {'-'*15} {'-'*15}")
    print(f" {'Avg Score (0-100)':<30s} {sum(litellm_scores)/len(litellm_scores):>13.1f} "
          f"{sum(anthropic_scores)/len(anthropic_scores):>13.1f}")
    print(f" {'Avg Time (s)':<30s} {sum(litellm_times)/len(litellm_times):>13.1f} "
          f"{sum(anthropic_times)/len(anthropic_times):>13.1f}")
    print(f" {'JSON Valid':<30s} {sum(1 for r in results if r['litellm']['json_valid']):>12d}/{len(results)} "
          f"{sum(1 for r in results if r['anthropic']['json_valid']):>12d}/{len(results)}")
    print(f" {'Avg Requirements':<30s} "
          f"{sum(r['litellm']['requirements'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['requirements'] for r in results)/len(results):>13.1f}")
    print(f" {'Avg Test Procedures':<30s} "
          f"{sum(r['litellm']['tests'] for r in results)/len(results):>13.1f} "
          f"{sum(r['anthropic']['tests'] for r in results)/len(results):>13.1f}")

    # Cost estimate
    # Claude Sonnet: ~$3/M input, ~$15/M output
    # gpt-oss-120b: self-hosted = $0 API cost (only compute)
    total_anthropic_input = sum(r["anthropic"]["usage"].get("input_tokens", 0) for r in results)
    total_anthropic_output = sum(r["anthropic"]["usage"].get("output_tokens", 0) for r in results)
    anthropic_cost = (total_anthropic_input * 3 + total_anthropic_output * 15) / 1_000_000

    print(f"\n Kostenvergleich (fuer {len(results)} Controls):")
    print(f" gpt-oss-120b: $0.00 (self-hosted)")
    print(f" Claude Sonnet: ${anthropic_cost:.4f} "
          f"({total_anthropic_input} input + {total_anthropic_output} output tokens)")

    # Extrapolate for 494 gap articles
    # NOTE(review): `if results` is always true here — the empty case
    # already returned above; the guard is redundant but harmless.
    if results:
        cost_per_control = anthropic_cost / len(results)
        print(f"\n Hochrechnung fuer 494 Luecken-Artikel:")
        print(f" gpt-oss-120b: $0.00")
        print(f" Claude Sonnet: ${cost_per_control * 494:.2f}")
        avg_time_120b = sum(litellm_times) / len(litellm_times)
        avg_time_sonnet = sum(anthropic_times) / len(anthropic_times)
        print(f" Zeit 120b: {avg_time_120b * 494 / 60:.0f} min ({avg_time_120b * 494 / 3600:.1f}h)")
        print(f" Zeit Sonnet: {avg_time_sonnet * 494 / 60:.0f} min ({avg_time_sonnet * 494 / 3600:.1f}h)")

    # Save full results as JSON (ensure_ascii=False keeps German umlauts readable).
    out_path = "/tmp/benchmark_llm_results.json"
    with open(out_path, 'w') as f:
        json.dump(results, f, indent=2, ensure_ascii=False)
    print(f"\n Detaillierte Ergebnisse: {out_path}")


if __name__ == "__main__":
    main()
|
||||
200
scripts/qa/blue_guide_en_match.py
Normal file
200
scripts/qa/blue_guide_en_match.py
Normal file
@@ -0,0 +1,200 @@
|
||||
"""Match unmatched Blue Guide controls against the English PDF."""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import unicodedata
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
print("ERROR: PyMuPDF (fitz) not installed")
|
||||
exit(1)
|
||||
|
||||
PDF_PATH = os.path.expanduser("~/rag-ingestion/pdfs/blue_guide_2022_en.pdf")
|
||||
|
||||
# Single-pass character map: soft hyphen and zero-width space removed,
# NBSP -> plain space, common Latin ligatures expanded (NFC does not
# decompose them), curly quotes / dashes / bullet chars ASCII-folded.
# Built once at module load instead of 16 chained .replace() calls per
# invocation; this also drops the redundant duplicate '\xad' replace
# ('\u00ad' and '\xad' are the same code point).
_CHAR_MAP = str.maketrans({
    '\u00ad': '',    # soft hyphen
    '\u200b': '',    # zero-width space
    '\u00a0': ' ',   # no-break space
    '\ufb01': 'fi', '\ufb02': 'fl',
    '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
    '\u2019': "'", '\u2018': "'",
    '\u201c': '"', '\u201d': '"',
    '\u2013': '-', '\u2014': '-',
    '\u2022': '-', '\u00b7': '-',
})
# C0 control characters except \t (\x09), \n (\x0a), \r (\x0d).
_CTRL_RE = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')
_WS_RE = re.compile(r'\s+')


def normalize(s):
    """Normalize PDF-extracted text for robust substring matching.

    Folds typographic characters to ASCII equivalents, strips control
    characters, applies Unicode NFC, and collapses all whitespace runs
    to single spaces.

    Args:
        s: Raw text (e.g. from PyMuPDF page extraction).

    Returns:
        str: Normalized, stripped text.
    """
    s = s.translate(_CHAR_MAP)
    s = _CTRL_RE.sub('', s)
    s = unicodedata.normalize('NFC', s)
    return _WS_RE.sub(' ', s).strip()
|
||||
|
||||
# ── Step 1: Read the English Blue Guide PDF into one big string ──
print(f"Reading {PDF_PATH}...")
doc = fitz.open(PDF_PATH)
text = ""
for page in doc:
    text += page.get_text() + "\n"
doc.close()
print(f" {len(text):,} chars")

# Normalized copy used for snippet matching below (same normalize() is
# applied to each control's source text, so offsets are comparable).
text_norm = normalize(text)

# ── Step 2: Build a heading index for the EN Blue Guide ──
# EN Blue Guide uses "Article N" headings (not "Artikel N").
# items: list of (char_offset_in_raw_text, label, type)
items = []

# Find where "Article 1" starts — content before is preamble/intro.
art1_match = re.search(r'\nArticle\s+1\s*\n', text)
if not art1_match:
    # Try section-based structure instead (e.g. "4.1.2. TITLE...").
    print(" No 'Article N' headings found, trying section-based index...")
    for m in re.finditer(r'(?:^|\n)\s*(\d+(?:\.\d+)*)\.\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))
else:
    # NOTE(review): art1_pos is assigned but never used below — dead code.
    art1_pos = art1_match.start()
    # Article headings ("Article 12", "Article 12a", ...)
    for m in re.finditer(r'(?:^|\n)\s*Article\s+(\d+[a-z]?)\s*\n', text, re.MULTILINE):
        # NOTE(review): art_num is computed but never used — dead assignment.
        art_num = int(re.match(r'(\d+)', m.group(1)).group(1))
        items.append((m.start(), f"Article {m.group(1)}", "article"))

    # Annex markers (roman numerals, optional lowercase suffix)
    for m in re.finditer(r'(?:^|\n)\s*ANNEX\s+([IVXLC]+[a-z]?)\b', text, re.MULTILINE):
        items.append((m.start(), f"Annex {m.group(1)}", "annex"))

    # Also try numbered section headings as fallback
    for m in re.finditer(r'(?:^|\n)\s*(\d+\.\d+(?:\.\d+)?)\s+[A-Z]', text, re.MULTILINE):
        items.append((m.start(), f"Section {m.group(1)}", "section"))

# Sort by position and keep only the first occurrence of each label
# (TOC entries and body headings can both match the same regexes).
items.sort(key=lambda x: x[0])
seen = set()
unique = []
for pos, label, typ in items:
    if label not in seen:
        seen.add(label)
        unique.append((pos, label, typ))

print(f" Index: {len(unique)} sections")
if unique[:5]:
    for pos, label, typ in unique[:5]:
        print(f" {label} [{typ}] @ pos {pos}")

# Precompute the index positions in NORMALIZED coordinates so they can be
# compared against text_norm.find() results. Normalizing text[:pos] for
# every heading is O(n^2) in total but runs only once per script invocation.
index_norm = []
for pos, label, typ in unique:
    norm_pos = len(normalize(text[:pos]))
    index_norm.append((norm_pos, label, typ))

# ── Step 3: Connect to the compliance DB ──
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    options="-c search_path=compliance,public"
)
cur = conn.cursor()

# Get Blue Guide controls that still lack an article_type (i.e. the
# unmatched remainder after the German-PDF matching pass).
cur.execute("""
    SELECT id, control_id, title, source_original_text,
           source_citation->>'article' as existing_article,
           source_citation->>'article_type' as existing_type,
           release_state
    FROM compliance.canonical_controls
    WHERE source_citation->>'source' = 'EU Blue Guide 2022'
      AND source_original_text IS NOT NULL
      AND length(source_original_text) > 50
      AND (source_citation->>'article_type' IS NULL)
    ORDER BY control_id
""")
controls = cur.fetchall()
print(f"\nUnmatched Blue Guide controls: {len(controls)}")

# ── Step 4: Match each control's source text against the EN PDF ──
# Strategy: try snippets taken from several relative positions within the
# control text, longest first; the first snippet found in text_norm wins,
# and the nearest preceding heading supplies the article label/type.
results = []
found = 0
not_found = 0

for ctrl in controls:
    ctrl_id, control_id, title, orig_text, existing_art, existing_type, state = ctrl
    orig_norm = normalize(orig_text)
    if len(orig_norm) < 30:
        # Too short to match reliably.
        not_found += 1
        continue

    matched = False
    # Start fractions: middle-ish first (0.25) — presumably because the
    # start/end of the stored text often differs from the PDF (headings,
    # truncation); TODO confirm the ordering rationale.
    for start_frac in [0.25, 0.1, 0.5, 0.0, 0.75]:
        for length in [80, 60, 40, 30, 20]:
            start = max(0, int(len(orig_norm) * start_frac))
            snippet = orig_norm[start:start+length]
            if not snippet or len(snippet) < 15:
                continue
            pos = text_norm.find(snippet)
            if pos >= 0:
                # Find the nearest heading at or before the match position.
                label = "Unknown"
                typ = "unknown"
                for h_pos, h_label, h_type in reversed(index_norm):
                    if h_pos <= pos:
                        label = h_label
                        typ = h_type
                        break
                results.append({
                    "ctrl_id": str(ctrl_id),
                    "control_id": control_id,
                    "source": "EU Blue Guide 2022",
                    "article_label": label,
                    "article_type": typ,
                })
                found += 1
                is_active = "" if state not in ('duplicate', 'too_close') else " [DUP]"
                print(f" {control_id:10s}: {label:25s} [{typ:8s}]{is_active}")
                matched = True
                break
        if matched:
            break

    if not matched:
        not_found += 1
        print(f" {control_id:10s}: NOT FOUND {title[:50]}")

print(f"\n{'='*50}")
print(f"Results: {found} matched, {not_found} not found out of {len(controls)}")

# ── Step 5: Save results to JSON for inspection/replay ──
out_path = "/tmp/blue_guide_en_results.json"
with open(out_path, 'w') as f:
    json.dump(results, f, indent=2, ensure_ascii=False)
print(f"Saved to {out_path}")

# ── Step 6: Apply results to the DB ──
# The WHERE clause makes the update idempotent: rows whose citation
# already holds the same article/article_type are skipped (rowcount 0).
if results:
    print(f"\nApplying {len(results)} results to DB...")
    applied = 0
    for r in results:
        cur.execute("""
            UPDATE compliance.canonical_controls
            SET source_citation = source_citation ||
                jsonb_build_object('article', %s, 'article_type', %s)
            WHERE id = %s::uuid
              AND (source_citation->>'article' IS DISTINCT FROM %s
                   OR source_citation->>'article_type' IS DISTINCT FROM %s)
        """, (r["article_label"], r["article_type"],
              r["ctrl_id"], r["article_label"], r["article_type"]))
        if cur.rowcount > 0:
            applied += 1
    conn.commit()
    print(f" Applied: {applied} controls updated")

# Show type distribution of the matched labels (empty when no results).
type_counts = {}
for r in results:
    t = r["article_type"]
    type_counts[t] = type_counts.get(t, 0) + 1
if type_counts:
    print(f"\nArticle type distribution:")
    for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
        print(f" {t:12s}: {c:5d}")

conn.close()
|
||||
188
scripts/qa/gap_analysis.py
Normal file
188
scripts/qa/gap_analysis.py
Normal file
@@ -0,0 +1,188 @@
|
||||
"""
|
||||
Phase 7.3: Gap Analysis — Identify articles/sections WITHOUT controls.
|
||||
|
||||
For each regulation PDF:
|
||||
1. Extract all articles/sections from the PDF
|
||||
2. Compare with controls in the DB that reference this article
|
||||
3. Report gaps (articles with no controls)
|
||||
|
||||
Usage:
|
||||
python3 gap_analysis.py # show all gaps
|
||||
python3 gap_analysis.py --source "DSGVO" # filter by source
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
from collections import defaultdict
|
||||
|
||||
# Import from pdf_qa_all
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from pdf_qa_all import (
|
||||
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
|
||||
build_eu_article_index, build_de_law_index, build_nist_index,
|
||||
build_owasp_index, build_generic_index, MAX_ARTICLES
|
||||
)
|
||||
|
||||
# Only analyze sources with significant control counts (skip sources with
# fewer than this many controls in the DB). main() bypasses this threshold
# when an explicit --source filter is given.
MIN_CONTROLS = 5
|
||||
|
||||
|
||||
def main():
    """Compare per-source PDF article indexes against DB controls; report gaps.

    Optional CLI flag ``--source NAME`` restricts the analysis to sources
    whose name contains NAME (case-insensitive); when given, the
    MIN_CONTROLS threshold is bypassed. Requires DATABASE_URL in the
    environment. Prints a per-source coverage report plus a summary, and
    writes the full JSON report to /tmp/gap_analysis_results.json.
    """
    # Minimal hand-rolled parsing of the single optional value flag.
    source_filter = None
    if "--source" in sys.argv:
        idx = sys.argv.index("--source")
        if idx + 1 < len(sys.argv):
            source_filter = sys.argv[idx + 1]

    # DB connection (search_path pinned to the compliance schema).
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=parsed.hostname, port=parsed.port or 5432,
        user=parsed.username, password=parsed.password,
        dbname=parsed.path.lstrip('/'),
        options="-c search_path=compliance,public"
    )
    cur = conn.cursor()

    # Count controls per (source, article, article_type), excluding rows
    # already flagged by the dedup pipeline.
    cur.execute("""
        SELECT source_citation->>'source' as source,
               source_citation->>'article' as article,
               source_citation->>'article_type' as article_type,
               count(*) as cnt
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NOT NULL
          AND release_state NOT IN ('duplicate', 'too_close')
        GROUP BY 1, 2, 3
        ORDER BY 1, 2
    """)

    # Build: source -> {article -> (type, count)}
    controls_by_source = defaultdict(dict)
    for source, article, art_type, cnt in cur.fetchall():
        if article:
            controls_by_source[source][article] = (art_type or "unknown", cnt)

    total_gaps = 0
    total_articles_checked = 0
    total_covered = 0
    gap_report = []

    sources_to_check = sorted(SOURCE_FILE_MAP.keys())
    if source_filter:
        sources_to_check = [s for s in sources_to_check if source_filter.lower() in s.lower()]

    for source_name in sources_to_check:
        filename = SOURCE_FILE_MAP.get(source_name)
        if filename is None:
            continue

        controls = controls_by_source.get(source_name, {})
        # Skip thin sources unless the user asked for one explicitly.
        if len(controls) < MIN_CONTROLS and not source_filter:
            continue

        # Read the PDF and build its article index.
        text = read_file(filename)
        if text is None:
            continue

        doc_type = classify_doc(source_name)
        max_art = MAX_ARTICLES.get(source_name)

        # Pick the index builder matching the document family.
        if doc_type == "eu_regulation":
            index = build_eu_article_index(text, max_article=max_art)
        elif doc_type == "de_law":
            index = build_de_law_index(text)
        elif doc_type == "nist":
            index = build_nist_index(text)
        elif doc_type == "owasp":
            index = build_owasp_index(text, source_name)
        else:
            index = build_generic_index(text)

        if not index:
            continue

        # Only substantive articles count toward gap analysis; preamble
        # and annex entries are reported but not treated as gaps.
        substantive_types = {"article", "section", "control", "requirement", "category"}
        substantive_articles = [(pos, label, typ) for pos, label, typ in index
                                if typ in substantive_types]
        preamble_articles = [(pos, label, typ) for pos, label, typ in index
                             if typ == "preamble"]
        annex_articles = [(pos, label, typ) for pos, label, typ in index
                          if typ == "annex"]

        # Partition substantive articles into covered vs. gaps.
        covered = []
        gaps = []
        for pos, label, typ in substantive_articles:
            if label in controls:
                covered.append(label)
            else:
                gaps.append((label, typ))

        total_articles_checked += len(substantive_articles)
        total_covered += len(covered)
        total_gaps += len(gaps)

        # Count controls attributed to preamble/annex material.
        preamble_controls = sum(1 for a in controls if controls[a][0] == "preamble")
        annex_controls = sum(1 for a in controls if controls[a][0] == "annex")

        coverage_pct = len(covered) / len(substantive_articles) * 100 if substantive_articles else 0

        print(f"\n{'='*70}")
        print(f"{source_name}")
        print(f" PDF articles: {len(substantive_articles)} substantive, "
              f"{len(preamble_articles)} preamble, {len(annex_articles)} annex")
        print(f" DB controls: {sum(v[1] for v in controls.values())} total "
              f"({preamble_controls} preamble, {annex_controls} annex)")
        print(f" Coverage: {len(covered)}/{len(substantive_articles)} "
              f"({coverage_pct:.0f}%)")

        if gaps:
            print(f" GAPS ({len(gaps)}):")
            for label, typ in gaps[:30]:  # limit output
                print(f" - {label} [{typ}]")
            if len(gaps) > 30:
                print(f" ... and {len(gaps)-30} more")

        # One report entry per analyzed source.
        gap_report.append({
            "source": source_name,
            "total_articles": len(substantive_articles),
            "covered": len(covered),
            "gaps": len(gaps),
            "coverage_pct": round(coverage_pct, 1),
            "gap_articles": [{"label": l, "type": t} for l, t in gaps],
        })

    # Summary
    print(f"\n{'='*70}")
    print("GAP ANALYSIS SUMMARY")
    print(f"{'='*70}")
    # FIX: previously this printed len(gap_report) PLUS the count of every
    # mapped source, double-counting the analyzed sources. Each analyzed
    # source appends exactly one entry to gap_report, so its length is the
    # number of sources analyzed.
    print(f" Sources analyzed: {len(gap_report)}")
    print(f" Total articles in PDFs: {total_articles_checked}")
    print(f" Articles with controls: {total_covered}")
    print(f" Articles WITHOUT controls: {total_gaps}")
    if total_articles_checked:
        print(f" Overall coverage: {total_covered/total_articles_checked*100:.1f}%")

    print(f"\n Sources with gaps:")
    for r in sorted(gap_report, key=lambda x: -x["gaps"]):
        print(f" {r['source']:45s} {r['gaps']:4d} gaps "
              f"({r['covered']}/{r['total_articles']} = {r['coverage_pct']}%)")

    # Save report
    out_path = "/tmp/gap_analysis_results.json"
    with open(out_path, 'w') as f:
        json.dump(gap_report, f, indent=2, ensure_ascii=False)
    print(f"\n Full report saved to {out_path}")

    conn.close()


if __name__ == "__main__":
    main()
|
||||
288
scripts/qa/oscal_analysis.py
Normal file
288
scripts/qa/oscal_analysis.py
Normal file
@@ -0,0 +1,288 @@
|
||||
"""Analyze NIST OSCAL data and compare with existing controls in DB."""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from collections import defaultdict
|
||||
|
||||
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")
|
||||
|
||||
# ── Load SP 800-53 Rev 5 ──
|
||||
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
|
||||
sp853 = json.load(f)["catalog"]
|
||||
|
||||
print("=" * 70)
|
||||
print("NIST SP 800-53 Rev 5 — OSCAL Catalog Analysis")
|
||||
print("=" * 70)
|
||||
print(f" UUID: {sp853.get('uuid', '?')}")
|
||||
print(f" Last Modified: {sp853.get('metadata', {}).get('last-modified', '?')}")
|
||||
|
||||
# Count controls
|
||||
families = sp853.get("groups", [])
|
||||
total_base = 0
|
||||
total_enhancements = 0
|
||||
total_withdrawn = 0
|
||||
total_active = 0
|
||||
family_stats = []
|
||||
|
||||
for fam in families:
|
||||
fam_id = fam.get("id", "?")
|
||||
fam_title = fam.get("title", "?")
|
||||
controls = fam.get("controls", [])
|
||||
base = 0
|
||||
enhancements = 0
|
||||
withdrawn = 0
|
||||
|
||||
for ctrl in controls:
|
||||
# Check if withdrawn
|
||||
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
|
||||
is_withdrawn = props.get("status") == "withdrawn"
|
||||
if is_withdrawn:
|
||||
withdrawn += 1
|
||||
else:
|
||||
base += 1
|
||||
|
||||
# Count enhancements
|
||||
for enh in ctrl.get("controls", []):
|
||||
enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
|
||||
if enh_props.get("status") == "withdrawn":
|
||||
withdrawn += 1
|
||||
else:
|
||||
enhancements += 1
|
||||
|
||||
family_stats.append((fam_id, fam_title, base, enhancements, withdrawn))
|
||||
total_base += base
|
||||
total_enhancements += enhancements
|
||||
total_withdrawn += withdrawn
|
||||
|
||||
total_active = total_base + total_enhancements
|
||||
print(f"\n Families: {len(families)}")
|
||||
print(f" Base Controls: {total_base}")
|
||||
print(f" Enhancements: {total_enhancements}")
|
||||
print(f" Withdrawn: {total_withdrawn}")
|
||||
print(f" TOTAL ACTIVE: {total_active}")
|
||||
|
||||
print(f"\n Per Family:")
|
||||
print(f" {'ID':6s} {'Title':45s} {'Base':>5s} {'Enh':>5s} {'Wdrn':>5s}")
|
||||
for fam_id, title, base, enh, wdrn in family_stats:
|
||||
print(f" {fam_id:6s} {title[:45]:45s} {base:5d} {enh:5d} {wdrn:5d}")
|
||||
|
||||
# Show example control structure
|
||||
print(f"\n Example Control (AC-6 Least Privilege):")
|
||||
for fam in families:
|
||||
for ctrl in fam.get("controls", []):
|
||||
if ctrl["id"] == "ac-6":
|
||||
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
|
||||
print(f" ID: {ctrl['id']}")
|
||||
print(f" Label: {props.get('label', '?')}")
|
||||
print(f" Title: {ctrl['title']}")
|
||||
for part in ctrl.get("parts", []):
|
||||
if part.get("name") == "statement":
|
||||
prose = part.get("prose", "")
|
||||
print(f" Statement: {prose[:150]}...")
|
||||
elif part.get("name") == "guidance":
|
||||
prose = part.get("prose", "")
|
||||
print(f" Guidance: {prose[:150]}...")
|
||||
enh_count = len(ctrl.get("controls", []))
|
||||
print(f" Enhancements: {enh_count}")
|
||||
links = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]
|
||||
print(f" Related: {', '.join(links[:8])}...")
|
||||
break
|
||||
|
||||
# ── Load CSF 2.0 ──
|
||||
print(f"\n{'='*70}")
|
||||
print("NIST CSF 2.0 — OSCAL Catalog Analysis")
|
||||
print("=" * 70)
|
||||
|
||||
with open(os.path.join(OSCAL_DIR, "csf-2.0-catalog.json")) as f:
|
||||
csf = json.load(f)["catalog"]
|
||||
|
||||
csf_groups = csf.get("groups", [])
|
||||
csf_total = 0
|
||||
for grp in csf_groups:
|
||||
func_title = grp.get("title", "?")
|
||||
cats = grp.get("groups", [])
|
||||
subcats = 0
|
||||
for cat in cats:
|
||||
subcats += len(cat.get("controls", []))
|
||||
csf_total += subcats
|
||||
print(f" {func_title:25s}: {len(cats):2d} categories, {subcats:3d} subcategories")
|
||||
|
||||
print(f" TOTAL: {csf_total} subcategories")
|
||||
|
||||
# ── Compare with existing DB controls ──
|
||||
print(f"\n{'='*70}")
|
||||
print("VERGLEICH: OSCAL vs. bestehende Controls in DB")
|
||||
print("=" * 70)
|
||||
|
||||
db_url = os.environ['DATABASE_URL']
|
||||
parsed = urllib.parse.urlparse(db_url)
|
||||
conn = psycopg2.connect(
|
||||
host=parsed.hostname, port=parsed.port or 5432,
|
||||
user=parsed.username, password=parsed.password,
|
||||
dbname=parsed.path.lstrip('/'),
|
||||
options="-c search_path=compliance,public"
|
||||
)
|
||||
cur = conn.cursor()
|
||||
|
||||
# Get existing NIST controls
|
||||
cur.execute("""
|
||||
SELECT control_id, title,
|
||||
source_citation->>'source' as source,
|
||||
source_citation->>'article' as article,
|
||||
source_citation->>'article_type' as art_type,
|
||||
release_state
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' LIKE 'NIST%%'
|
||||
ORDER BY source_citation->>'source', control_id
|
||||
""")
|
||||
nist_controls = cur.fetchall()
|
||||
|
||||
# Group by source
|
||||
by_source = defaultdict(list)
|
||||
for ctrl in nist_controls:
|
||||
by_source[ctrl[2]].append(ctrl)
|
||||
|
||||
print(f"\n Bestehende NIST Controls in DB:")
|
||||
for src in sorted(by_source.keys()):
|
||||
ctrls = by_source[src]
|
||||
active = sum(1 for c in ctrls if c[5] not in ('duplicate', 'too_close'))
|
||||
with_article = sum(1 for c in ctrls if c[3])
|
||||
print(f" {src:40s}: {len(ctrls):4d} total, {active:4d} active, {with_article:4d} mit article")
|
||||
|
||||
# For SP 800-53: which control families do we have?
|
||||
sp853_existing = [c for c in nist_controls if 'SP 800-53' in (c[2] or '')]
|
||||
existing_families = set()
|
||||
existing_articles = set()
|
||||
for ctrl in sp853_existing:
|
||||
article = ctrl[3] or ""
|
||||
if article:
|
||||
# Extract family prefix (e.g., "AC-6" → "AC")
|
||||
m = re.match(r'([A-Z]{2})-', article)
|
||||
if m:
|
||||
existing_families.add(m.group(1))
|
||||
existing_articles.add(article)
|
||||
|
||||
print(f"\n SP 800-53 in DB:")
|
||||
print(f" Total: {len(sp853_existing)}")
|
||||
print(f" Families covered: {len(existing_families)}")
|
||||
print(f" Unique articles: {len(existing_articles)}")
|
||||
print(f" Families: {', '.join(sorted(existing_families))}")
|
||||
|
||||
# Compare: which OSCAL controls are NOT in our DB?
|
||||
oscal_controls = {} # id → (label, title, statement)
|
||||
for fam in families:
|
||||
for ctrl in fam.get("controls", []):
|
||||
props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
|
||||
if props.get("status") == "withdrawn":
|
||||
continue
|
||||
label = props.get("label", ctrl["id"].upper())
|
||||
statement = ""
|
||||
guidance = ""
|
||||
for part in ctrl.get("parts", []):
|
||||
if part.get("name") == "statement":
|
||||
statement = part.get("prose", "")
|
||||
# Also check sub-items
|
||||
for sub in part.get("parts", []):
|
||||
statement += " " + sub.get("prose", "")
|
||||
elif part.get("name") == "guidance":
|
||||
guidance = part.get("prose", "")
|
||||
|
||||
oscal_controls[label] = (ctrl["title"], statement[:500], guidance[:500])
|
||||
|
||||
# Enhancements
|
||||
for enh in ctrl.get("controls", []):
|
||||
enh_props = {p["name"]: p.get("value", "") for p in enh.get("props", [])}
|
||||
if enh_props.get("status") == "withdrawn":
|
||||
continue
|
||||
enh_label = enh_props.get("label", enh["id"].upper())
|
||||
enh_statement = ""
|
||||
enh_guidance = ""
|
||||
for part in enh.get("parts", []):
|
||||
if part.get("name") == "statement":
|
||||
enh_statement = part.get("prose", "")
|
||||
for sub in part.get("parts", []):
|
||||
enh_statement += " " + sub.get("prose", "")
|
||||
elif part.get("name") == "guidance":
|
||||
enh_guidance = part.get("prose", "")
|
||||
oscal_controls[enh_label] = (enh["title"], enh_statement[:500], enh_guidance[:500])
|
||||
|
||||
print(f"\n OSCAL SP 800-53 aktive Controls: {len(oscal_controls)}")
|
||||
|
||||
# Find missing: in OSCAL but not in DB
|
||||
missing = []
|
||||
covered = []
|
||||
for label in sorted(oscal_controls.keys()):
|
||||
if label in existing_articles:
|
||||
covered.append(label)
|
||||
else:
|
||||
missing.append(label)
|
||||
|
||||
print(f" In DB vorhanden: {len(covered)}")
|
||||
print(f" FEHLEND in DB: {len(missing)}")
|
||||
|
||||
# Missing by family
|
||||
missing_by_fam = defaultdict(list)
|
||||
for label in missing:
|
||||
fam = label.split("-")[0]
|
||||
missing_by_fam[fam].append(label)
|
||||
|
||||
print(f"\n Fehlende Controls nach Family:")
|
||||
for fam in sorted(missing_by_fam.keys()):
|
||||
ctrls = missing_by_fam[fam]
|
||||
examples = ", ".join(ctrls[:5])
|
||||
more = f" +{len(ctrls)-5}" if len(ctrls) > 5 else ""
|
||||
print(f" {fam:4s}: {len(ctrls):3d} fehlend ({examples}{more})")
|
||||
|
||||
# Also check CSF 2.0
|
||||
print(f"\n{'='*70}")
|
||||
print("NIST CSF 2.0 — Vergleich mit DB")
|
||||
print("=" * 70)
|
||||
|
||||
cur.execute("""
|
||||
SELECT count(*), count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' LIKE 'NIST Cybersecurity%%'
|
||||
""")
|
||||
csf_row = cur.fetchone()
|
||||
print(f" CSF Controls in DB: {csf_row[0]} total, {csf_row[1]} active")
|
||||
|
||||
csf_subcats = 0
|
||||
csf_ids = []
|
||||
for grp in csf_groups:
|
||||
for cat in grp.get("groups", []):
|
||||
for subcat in cat.get("controls", []):
|
||||
csf_subcats += 1
|
||||
props = {p["name"]: p.get("value", "") for p in subcat.get("props", [])}
|
||||
csf_ids.append(props.get("label", subcat["id"]))
|
||||
|
||||
print(f" CSF 2.0 OSCAL Subcategories: {csf_subcats}")
|
||||
print(f" Beispiele: {', '.join(csf_ids[:10])}")
|
||||
|
||||
# ── Summary / Potential ──
|
||||
print(f"\n{'='*70}")
|
||||
print("POTENTIAL: Was OSCAL uns bringt")
|
||||
print("=" * 70)
|
||||
print(f"""
|
||||
SP 800-53 Rev 5:
|
||||
- {len(missing)} neue Controls möglich (aktuell {len(covered)} in DB)
|
||||
- Jeder Control hat: Statement + Guidance + Assessment-Methoden
|
||||
- Cross-References zwischen Controls (für Mapping)
|
||||
- Maschinenlesbare Parameter (ODP)
|
||||
- Public Domain — keine Lizenzprobleme
|
||||
|
||||
CSF 2.0:
|
||||
- {csf_subcats} Subcategories als Compliance-Controls
|
||||
- 6 Functions (Govern, Identify, Protect, Detect, Respond, Recover)
|
||||
- Direkte Mappings zu SP 800-53 Controls
|
||||
|
||||
Nächste Schritte:
|
||||
1. Fehlende SP 800-53 Controls importieren ({len(missing)} Controls)
|
||||
2. Statement-Text als source_original_text verwenden
|
||||
3. article_type='control', article=Label (z.B. 'AC-6')
|
||||
4. CSF 2.0 als eigene Regulation importieren
|
||||
5. Cross-References als Grundlage für Control-Mappings nutzen
|
||||
""")
|
||||
|
||||
conn.close()
|
||||
289
scripts/qa/oscal_import.py
Normal file
289
scripts/qa/oscal_import.py
Normal file
@@ -0,0 +1,289 @@
|
||||
"""Import 776 missing NIST SP 800-53 Rev 5 controls from OSCAL into canonical_controls."""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import uuid
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
# Directory holding the downloaded NIST OSCAL JSON catalogs.
OSCAL_DIR = os.path.expanduser("~/rag-ingestion/nist-oscal")

# Load the SP 800-53 Rev 5 catalog; everything of interest lives under
# the top-level "catalog" key of the OSCAL document.
with open(os.path.join(OSCAL_DIR, "sp800-53-rev5-catalog.json")) as f:
    sp853 = json.load(f)["catalog"]
|
||||
|
||||
# ── Extract all OSCAL controls ──
|
||||
def extract_controls(catalog):
    """Extract all active controls (and their enhancements) from an OSCAL catalog.

    Walks catalog -> groups (families) -> controls -> nested enhancement
    controls, delegating per-node parsing to extract_single(). Withdrawn
    entries (extract_single returns None for them) are skipped.

    Fix: the original computed an unused local (the upper-cased family id);
    dead code removed.
    """
    controls = []
    for fam in catalog.get("groups", []):
        fam_title = fam.get("title", "")
        for ctrl in fam.get("controls", []):
            # Base control plus its enhancements (nested one level down).
            for node in (ctrl, *ctrl.get("controls", [])):
                result = extract_single(node, fam_title)
                if result:
                    controls.append(result)
    return controls
|
||||
|
||||
def _part_label(part):
    """Return the 'label' prop value of an OSCAL part, or '' when absent."""
    for prop in part.get("props", []):
        if prop["name"] == "label":
            return prop.get("value", "")
    return ""


def _flatten_statement_parts(parts, depth, out):
    """Recursively append sub-item prose to *out*, one line per item.

    Labeled items render as '<label> <prose>'; unlabeled items render as
    bare prose (and are skipped when empty). Each nesting level below the
    first is indented two additional spaces. The original implementation
    hard-coded exactly two levels via copy-paste and silently dropped
    anything deeper; recursion handles arbitrary OSCAL nesting.
    """
    indent = "  " * depth
    for part in parts:
        prose = part.get("prose", "")
        label = _part_label(part)
        if label:
            out.append(f"\n{indent}{label} {prose}")
        elif prose:
            out.append(f"\n{indent}{prose}")
        _flatten_statement_parts(part.get("parts", []), depth + 1, out)


def _extract_params(ctrl):
    """Extract ODP parameter descriptors (id, label, guidelines, choices)."""
    params = []
    for p in ctrl.get("params", []):
        guidelines = "".join(g.get("prose", "") for g in p.get("guidelines", []))
        choices = list(p.get("select", {}).get("choice", [])) if "select" in p else []
        params.append({
            "id": p.get("id", ""),
            "label": p.get("label", ""),
            "guidelines": guidelines,
            "choices": choices,
        })
    return params


def extract_single(ctrl, family_title):
    """Extract one OSCAL control or enhancement into a flat dict.

    Returns None for withdrawn controls. The resulting dict carries the
    human-readable label (e.g. 'AC-6(1)'), the requirement statement with
    nested sub-items flattened onto indented lines, the optional guidance
    prose, related-control ids from 'related' links, and ODP parameters.
    """
    props = {p["name"]: p.get("value", "") for p in ctrl.get("props", [])}
    if props.get("status") == "withdrawn":
        return None

    label = props.get("label", ctrl["id"].upper())

    # Statement prose plus recursively flattened sub-items; guidance is the
    # prose of the (last) part named "guidance".
    statement_pieces = []
    guidance = ""
    for part in ctrl.get("parts", []):
        name = part.get("name")
        if name == "statement":
            statement_pieces = [part.get("prose", "")]
            _flatten_statement_parts(part.get("parts", []), 0, statement_pieces)
        elif name == "guidance":
            guidance = part.get("prose", "")

    # Cross-references to other controls (hrefs like '#ac-2').
    related = [l["href"].lstrip("#") for l in ctrl.get("links", []) if l.get("rel") == "related"]

    return {
        "label": label,
        "title": ctrl.get("title", ""),
        "family": family_title,
        "statement": "".join(statement_pieces).strip(),
        "guidance": guidance.strip(),
        "related": related,
        "params": _extract_params(ctrl),
        "is_enhancement": "(" in label,  # enhancements carry a '(n)' suffix
    }
|
||||
|
||||
all_oscal = extract_controls(sp853)
|
||||
print(f"Total OSCAL active controls: {len(all_oscal)}")
|
||||
|
||||
# ── Normalize label for comparison ──
|
||||
def normalize_label(label):
    """Canonicalize a control label for comparison.

    Drops zero padding inside '-NN' and '(NN)' segments ('AC-01(03)' ->
    'AC-1(3)') and upper-cases the result.
    """
    for pattern, repl in ((r'-0+(\d)', r'-\1'), (r'\(0+(\d+)\)', r'(\1)')):
        label = re.sub(pattern, repl, label)
    return label.upper()
|
||||
|
||||
# ── DB connection ──
|
||||
# DATABASE_URL is required; the KeyError on a missing variable fails fast.
db_url = os.environ['DATABASE_URL']
parsed = urllib.parse.urlparse(db_url)
conn = psycopg2.connect(
    host=parsed.hostname, port=parsed.port or 5432,  # default Postgres port
    user=parsed.username, password=parsed.password,
    dbname=parsed.path.lstrip('/'),
    # Resolve unqualified table names against the compliance schema first.
    options="-c search_path=compliance,public"
)
cur = conn.cursor()
|
||||
|
||||
# Get existing labels
|
||||
cur.execute("""
|
||||
SELECT DISTINCT source_citation->>'article' as article
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
|
||||
AND source_citation->>'article' IS NOT NULL
|
||||
""")
|
||||
existing_labels = set(normalize_label(r[0]) for r in cur.fetchall())
|
||||
print(f"Existing DB labels (normalized): {len(existing_labels)}")
|
||||
|
||||
# Get highest control_id numbers per prefix
|
||||
cur.execute("""
|
||||
SELECT control_id FROM compliance.canonical_controls
|
||||
WHERE control_id ~ '^[A-Z]+-[0-9]+$'
|
||||
ORDER BY control_id
|
||||
""")
|
||||
existing_ids = set(r[0] for r in cur.fetchall())
|
||||
|
||||
# Find next available ID per prefix
|
||||
def next_control_id(prefix, existing):
    """Return the highest numeric suffix among ids of the form '<prefix>-<digits>'.

    Despite the name, this returns the current maximum (0 when no id with
    this prefix exists); callers increment the counter before minting a
    new control_id. The original docstring wrongly claimed it returned the
    next available id.
    """
    pattern = re.compile(rf'^{prefix}-(\d+)$')
    matches = (pattern.match(eid) for eid in existing)
    return max((int(m.group(1)) for m in matches if m), default=0)
|
||||
|
||||
# Map NIST families to our control_id prefixes
|
||||
FAMILY_PREFIX = {
|
||||
"Access Control": "ACC",
|
||||
"Awareness and Training": "GOV",
|
||||
"Audit and Accountability": "LOG",
|
||||
"Assessment, Authorization, and Monitoring": "GOV",
|
||||
"Configuration Management": "COMP",
|
||||
"Contingency Planning": "INC",
|
||||
"Identification and Authentication": "AUTH",
|
||||
"Incident Response": "INC",
|
||||
"Maintenance": "COMP",
|
||||
"Media Protection": "DATA",
|
||||
"Physical and Environmental Protection": "SEC",
|
||||
"Planning": "GOV",
|
||||
"Program Management": "GOV",
|
||||
"Personnel Security": "GOV",
|
||||
"Personally Identifiable Information Processing and Transparency": "DATA",
|
||||
"Risk Assessment": "GOV",
|
||||
"System and Services Acquisition": "COMP",
|
||||
"System and Communications Protection": "NET",
|
||||
"System and Information Integrity": "SEC",
|
||||
"Supply Chain Risk Management": "COMP",
|
||||
}
|
||||
|
||||
# Track next IDs
|
||||
prefix_counters = {}
|
||||
for prefix in set(FAMILY_PREFIX.values()):
|
||||
prefix_counters[prefix] = next_control_id(prefix, existing_ids)
|
||||
print(f"Starting counters: {prefix_counters}")
|
||||
|
||||
# ── Filter to only new controls ──
# Keep only OSCAL controls whose normalized label is not already in the DB.
to_import = []
for ctrl in all_oscal:
    norm = normalize_label(ctrl["label"])
    if norm not in existing_labels:
        to_import.append(ctrl)

print(f"\nControls to import: {len(to_import)}")

# ── Import ──
imported = 0
for ctrl in to_import:
    # Mint the next sequential control_id inside the mapped prefix family;
    # unmapped families fall back to the COMP prefix.
    prefix = FAMILY_PREFIX.get(ctrl["family"], "COMP")
    prefix_counters[prefix] += 1
    control_id = f"{prefix}-{prefix_counters[prefix]:04d}"

    # Build title: "NIST {label}: {title}"
    title = f"NIST {ctrl['label']}: {ctrl['title']}"

    # source_original_text = statement (the official requirement text);
    # fall back to truncated guidance, then to the bare title.
    source_text = ctrl["statement"]
    if not source_text:
        source_text = ctrl["guidance"][:500] if ctrl["guidance"] else ctrl["title"]

    # objective = guidance text
    objective = ctrl["guidance"][:2000] if ctrl["guidance"] else ""

    # source_citation: provenance payload stored as JSONB.
    citation = {
        "source": "NIST SP 800-53 Rev. 5",
        "article": ctrl["label"],
        "article_type": "control",
        "source_type": "standard",
        "oscal_import": True,
    }
    if ctrl["related"]:
        citation["related_controls"] = ctrl["related"][:20]  # cap payload size
    if ctrl["params"]:
        citation["parameters"] = [{"id": p["id"], "label": p["label"]} for p in ctrl["params"][:10]]

    # NOTE(review): loop-invariant constant re-assigned every iteration;
    # presumably the target framework's UUID — could live at module level.
    FRAMEWORK_ID = '14b1bdd2-abc7-4a43-adae-14471ee5c7cf'
    new_id = str(uuid.uuid4())
    cur.execute("""
        INSERT INTO compliance.canonical_controls
        (id, framework_id, control_id, title, objective, rationale,
         severity, source_original_text,
         source_citation, pipeline_version, release_state,
         generation_strategy, category)
        VALUES (%s, %s, %s, %s, %s, '', 'medium', %s, %s, 4, 'draft', 'oscal_import', %s)
    """, (
        new_id,
        FRAMEWORK_ID,
        control_id,
        title[:500],          # column-size guards on all free text
        objective[:5000],
        source_text[:10000],
        json.dumps(citation, ensure_ascii=False),
        ctrl["family"],
    ))
    imported += 1
|
||||
|
||||
conn.commit()
|
||||
print(f"\nImported: {imported} new controls")
|
||||
|
||||
# ── Verify ──
|
||||
cur.execute("""
|
||||
SELECT count(*),
|
||||
count(*) FILTER (WHERE release_state NOT IN ('duplicate', 'too_close'))
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' = 'NIST SP 800-53 Rev. 5'
|
||||
""")
|
||||
total, active = cur.fetchone()
|
||||
print(f"\nSP 800-53 after import: {total} total, {active} active")
|
||||
|
||||
cur.execute("""
|
||||
SELECT release_state, count(*)
|
||||
FROM compliance.canonical_controls
|
||||
GROUP BY release_state
|
||||
ORDER BY count(*) DESC
|
||||
""")
|
||||
print(f"\nDB release_state gesamt:")
|
||||
for row in cur.fetchall():
|
||||
print(f" {row[0]:15s}: {row[1]:5d}")
|
||||
|
||||
cur.execute("""
|
||||
SELECT count(*)
|
||||
FROM compliance.canonical_controls
|
||||
WHERE release_state NOT IN ('duplicate', 'too_close')
|
||||
""")
|
||||
print(f"\nAktive Controls gesamt: {cur.fetchone()[0]}")
|
||||
|
||||
# ── Import stats by family ──
|
||||
fam_counts = {}
|
||||
for ctrl in to_import:
|
||||
fam = ctrl["family"]
|
||||
fam_counts[fam] = fam_counts.get(fam, 0) + 1
|
||||
|
||||
print(f"\nImportiert nach Family:")
|
||||
for fam in sorted(fam_counts.keys()):
|
||||
print(f" {fam[:45]:45s}: {fam_counts[fam]:3d}")
|
||||
|
||||
conn.close()
|
||||
274
scripts/qa/owasp_cleanup.py
Normal file
274
scripts/qa/owasp_cleanup.py
Normal file
@@ -0,0 +1,274 @@
|
||||
"""OWASP Cleanup:
|
||||
1. Mark 324 OWASP Top 10 multilingual controls as 'duplicate'
|
||||
2. Fix 47 wrong source attributions (found in different OWASP PDF)
|
||||
"""
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import unicodedata
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
print("ERROR: PyMuPDF not installed")
|
||||
exit(1)
|
||||
|
||||
PDF_DIR = os.path.expanduser("~/rag-ingestion/pdfs")
|
||||
|
||||
# Single-character cleanups applied in one pass: soft hyphens and
# zero-width spaces removed, NBSP -> space, ligatures expanded, curly
# quotes/dashes/bullets mapped to their ASCII counterparts.
_TRANSLATE = str.maketrans({
    '\u00ad': '', '\u200b': '', '\u00a0': ' ',
    '\ufb01': 'fi', '\ufb02': 'fl',
    '\ufb00': 'ff', '\ufb03': 'ffi', '\ufb04': 'ffl',
    '\u2019': "'", '\u2018': "'",
    '\u201c': '"', '\u201d': '"',
    '\u2013': '-', '\u2014': '-',
    '\u2022': '-', '\u00b7': '-',
})


def normalize(s):
    """Normalize extracted PDF text for substring matching.

    Applies the character map above, strips remaining control characters,
    NFC-normalizes, and collapses all whitespace runs to single spaces.
    """
    s = s.translate(_TRANSLATE)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
|
||||
|
||||
# Load OWASP PDFs
|
||||
OWASP_PDFS = {
|
||||
"OWASP Top 10 (2021)": "owasp_top10_2021.pdf",
|
||||
"OWASP ASVS 4.0": "owasp_asvs_4_0.pdf",
|
||||
"OWASP SAMM 2.0": "owasp_samm_2_0.pdf",
|
||||
"OWASP API Security Top 10 (2023)": "owasp_api_top10_2023.pdf",
|
||||
"OWASP MASVS 2.0": "owasp_masvs_2_0.pdf",
|
||||
}
|
||||
|
||||
pdf_norms = {}
|
||||
for name, filename in OWASP_PDFS.items():
|
||||
path = os.path.join(PDF_DIR, filename)
|
||||
if not os.path.exists(path):
|
||||
continue
|
||||
doc = fitz.open(path)
|
||||
text = ""
|
||||
for page in doc:
|
||||
text += page.get_text() + "\n"
|
||||
doc.close()
|
||||
pdf_norms[name] = normalize(text)
|
||||
|
||||
def build_owasp_index(text_norm, source_name):
    """Build a position index of label anchors in a normalized OWASP PDF text.

    Returns a list of (position, label, type) tuples sorted by position,
    deduplicated on label (first occurrence wins). The anchor pattern is
    chosen from the source name.

    Fix: the original tested `"ASVS" in source_name` before MASVS, but
    "ASVS" is a substring of "MASVS", so 'OWASP MASVS 2.0' was indexed
    with the ASVS 'V1.2.3' pattern and the MASVS branch was unreachable.
    MASVS is now checked first.
    """
    if "MASVS" in source_name:
        pattern, typ = r'(MASVS-[A-Z]+-\d+)', "requirement"
    elif "ASVS" in source_name:
        pattern, typ = r'(V\d+\.\d+(?:\.\d+)?)\b', "requirement"
    elif "API" in source_name:
        pattern, typ = r'(API\d+:\d{4})', "category"
    elif "Top 10" in source_name:
        pattern, typ = r'(A\d{2}:\d{4})', "category"
    else:
        # No known anchor pattern for this source (e.g. SAMM PDFs).
        return []

    anchors = sorted(
        ((m.start(), m.group(1), typ) for m in re.finditer(pattern, text_norm)),
        key=lambda a: a[0],
    )
    seen = set()
    unique = []
    for pos, label, anchor_type in anchors:
        if label not in seen:
            seen.add(label)
            unique.append((pos, label, anchor_type))
    return unique
|
||||
|
||||
pdf_indexes = {}
|
||||
for name, norm in pdf_norms.items():
|
||||
pdf_indexes[name] = build_owasp_index(norm, name)
|
||||
|
||||
def find_in_pdf(orig_text, source_name):
    """Find control text in a specific PDF. Returns (label, type) or None.

    Probes several snippets of the normalized control text — varying start
    offset and length — against the normalized PDF text, then attributes
    the first hit to the nearest index anchor at or before the hit.
    """
    haystack = pdf_norms.get(source_name)
    if not haystack:
        return None
    needle_full = normalize(orig_text)
    if len(needle_full) < 20:
        return None
    anchors = pdf_indexes.get(source_name, [])
    for frac in (0.25, 0.1, 0.5, 0.0, 0.75):
        begin = max(0, int(len(needle_full) * frac))
        for size in (80, 60, 40, 30, 20):
            snippet = needle_full[begin:begin + size]
            if len(snippet) < 15:
                continue
            hit = haystack.find(snippet)
            if hit < 0:
                continue
            # Attribute to the closest preceding (or equal) anchor.
            for a_pos, a_label, a_type in reversed(anchors):
                if a_pos <= hit:
                    return (a_label, a_type)
            return ("Unknown", "unknown")
    return None
|
||||
|
||||
# DB
|
||||
db_url = os.environ['DATABASE_URL']
|
||||
parsed = urllib.parse.urlparse(db_url)
|
||||
conn = psycopg2.connect(
|
||||
host=parsed.hostname, port=parsed.port or 5432,
|
||||
user=parsed.username, password=parsed.password,
|
||||
dbname=parsed.path.lstrip('/'),
|
||||
options="-c search_path=compliance,public"
|
||||
)
|
||||
cur = conn.cursor()
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
# STEP 1: Mark OWASP Top 10 multilingual controls as duplicate
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
print("=" * 60)
|
||||
print("STEP 1: OWASP Top 10 — multilingual controls → duplicate")
|
||||
print("=" * 60)
|
||||
|
||||
cur.execute("""
|
||||
SELECT id, control_id, title, source_original_text, release_state
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
|
||||
AND source_citation->>'article_type' IS NULL
|
||||
AND source_original_text IS NOT NULL
|
||||
AND release_state NOT IN ('duplicate', 'too_close')
|
||||
ORDER BY control_id
|
||||
""")
|
||||
top10_unmatched = cur.fetchall()
|
||||
print(f" Unmatched active OWASP Top 10: {len(top10_unmatched)}")
|
||||
|
||||
# Separate: found in other OWASP PDF vs not found anywhere
|
||||
to_mark_dup = []
|
||||
to_fix_source = []
|
||||
|
||||
for ctrl in top10_unmatched:
|
||||
uid, cid, title, text, state = ctrl
|
||||
|
||||
# Check if found in another OWASP PDF
|
||||
found_in = None
|
||||
found_result = None
|
||||
for other_src in OWASP_PDFS:
|
||||
if other_src == 'OWASP Top 10 (2021)':
|
||||
continue
|
||||
result = find_in_pdf(text, other_src)
|
||||
if result:
|
||||
found_in = other_src
|
||||
found_result = result
|
||||
break
|
||||
|
||||
if found_in:
|
||||
to_fix_source.append((uid, cid, found_in, found_result[0], found_result[1]))
|
||||
else:
|
||||
to_mark_dup.append((uid, cid))
|
||||
|
||||
print(f" → Not found in any PDF (multilingual): {len(to_mark_dup)} → mark as duplicate")
|
||||
print(f" → Found in other OWASP PDF: {len(to_fix_source)} → fix source attribution")
|
||||
|
||||
# Mark as duplicate
|
||||
dup_marked = 0
|
||||
for uid, cid in to_mark_dup:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET release_state = 'duplicate'
|
||||
WHERE id = %s AND release_state NOT IN ('duplicate', 'too_close')
|
||||
""", (uid,))
|
||||
if cur.rowcount > 0:
|
||||
dup_marked += 1
|
||||
|
||||
print(f" Marked as duplicate: {dup_marked}")
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
# STEP 2: Fix wrong source attributions across ALL OWASP sources
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
print(f"\n{'='*60}")
|
||||
print("STEP 2: Fix wrong OWASP source attributions")
|
||||
print("=" * 60)
|
||||
|
||||
all_fixes = list(to_fix_source) # Start with Top 10 fixes
|
||||
|
||||
# Also check ASVS, SAMM, MASVS
|
||||
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP API Security Top 10 (2023)', 'OWASP MASVS 2.0']:
|
||||
cur.execute("""
|
||||
SELECT id, control_id, title, source_original_text
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' = %s
|
||||
AND source_citation->>'article_type' IS NULL
|
||||
AND source_original_text IS NOT NULL
|
||||
AND release_state NOT IN ('duplicate', 'too_close')
|
||||
""", (source,))
|
||||
controls = cur.fetchall()
|
||||
|
||||
for ctrl in controls:
|
||||
uid, cid, title, text = ctrl
|
||||
# Try own PDF first
|
||||
result = find_in_pdf(text, source)
|
||||
if result:
|
||||
# Found in own PDF! Update article info
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = source_citation ||
|
||||
jsonb_build_object('article', %s, 'article_type', %s)
|
||||
WHERE id = %s
|
||||
AND (source_citation->>'article' IS DISTINCT FROM %s
|
||||
OR source_citation->>'article_type' IS DISTINCT FROM %s)
|
||||
""", (result[0], result[1], uid, result[0], result[1]))
|
||||
continue
|
||||
|
||||
# Try other OWASP PDFs
|
||||
for other_src in OWASP_PDFS:
|
||||
if other_src == source:
|
||||
continue
|
||||
result = find_in_pdf(text, other_src)
|
||||
if result:
|
||||
all_fixes.append((uid, cid, other_src, result[0], result[1]))
|
||||
break
|
||||
|
||||
print(f" Total wrong-source controls found: {len(all_fixes)}")
|
||||
|
||||
# Apply source fixes
|
||||
fixed = 0
|
||||
for uid, cid, correct_source, label, typ in all_fixes:
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = source_citation ||
|
||||
jsonb_build_object('source', %s, 'article', %s, 'article_type', %s)
|
||||
WHERE id = %s
|
||||
""", (correct_source, label, typ, uid,))
|
||||
if cur.rowcount > 0:
|
||||
fixed += 1
|
||||
print(f" {cid:10s} → {correct_source} / {label} [{typ}]")
|
||||
|
||||
print(f" Fixed: {fixed} controls")
|
||||
|
||||
conn.commit()
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
# SUMMARY
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
print(f"\n{'='*60}")
|
||||
print("ZUSAMMENFASSUNG")
|
||||
print("=" * 60)
|
||||
print(f" OWASP Top 10 multilingual → duplicate: {dup_marked}")
|
||||
print(f" Wrong source attribution → fixed: {fixed}")
|
||||
|
||||
# Final counts
|
||||
cur.execute("""
|
||||
SELECT release_state, count(*)
|
||||
FROM compliance.canonical_controls
|
||||
GROUP BY release_state
|
||||
ORDER BY count(*) DESC
|
||||
""")
|
||||
print(f"\n DB release_state nach Cleanup:")
|
||||
for row in cur.fetchall():
|
||||
print(f" {row[0]:15s}: {row[1]:5d}")
|
||||
|
||||
cur.execute("""
|
||||
SELECT count(*)
|
||||
FROM compliance.canonical_controls
|
||||
WHERE release_state NOT IN ('duplicate', 'too_close')
|
||||
""")
|
||||
active = cur.fetchone()[0]
|
||||
print(f"\n Aktive Controls: {active}")
|
||||
|
||||
conn.close()
|
||||
316
scripts/qa/owasp_github_match.py
Normal file
316
scripts/qa/owasp_github_match.py
Normal file
@@ -0,0 +1,316 @@
|
||||
"""Match unmatched OWASP ASVS/SAMM/MASVS controls against GitHub Markdown sources."""
|
||||
import os
|
||||
import re
|
||||
import unicodedata
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
GITHUB_DIR = Path(os.path.expanduser("~/rag-ingestion/owasp-github"))
|
||||
|
||||
# Ordered single-character substitutions: soft hyphens / zero-width spaces
# removed, NBSP -> space, ligatures expanded, curly quotes, dashes and
# bullets mapped to ASCII.
_CHAR_FIXES = (
    ('\u00ad', ''), ('\xad', ''),
    ('\u200b', ''), ('\u00a0', ' '),
    ('\ufb01', 'fi'), ('\ufb02', 'fl'),
    ('\ufb00', 'ff'), ('\ufb03', 'ffi'), ('\ufb04', 'ffl'),
    ('\u2019', "'"), ('\u2018', "'"),
    ('\u201c', '"'), ('\u201d', '"'),
    ('\u2013', '-'), ('\u2014', '-'),
    ('\u2022', '-'), ('\u00b7', '-'),
)


def normalize(s):
    """Normalize Markdown/PDF-derived text for substring matching.

    Applies the substitution table, strips remaining control characters,
    NFC-normalizes, and collapses all whitespace runs to single spaces.
    """
    for old, new in _CHAR_FIXES:
        s = s.replace(old, new)
    s = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f]', '', s)
    s = unicodedata.normalize('NFC', s)
    return re.sub(r'\s+', ' ', s).strip()
|
||||
|
||||
# ── Load Markdown sources ──
|
||||
def load_markdown_dir(path, pattern="*.md"):
    """Load all files matching *pattern* directly under *path*.

    Returns a dict mapping file name -> file text (the original docstring
    wrongly promised a combined text as well). Decoding problems are
    replaced rather than raised; unreadable files are skipped.

    Fix: the original used a bare `except:` which also swallows
    KeyboardInterrupt/SystemExit — narrowed to OSError.
    """
    texts = {}
    for f in sorted(path.glob(pattern)):
        try:
            texts[f.name] = f.read_text(encoding='utf-8', errors='replace')
        except OSError:
            # Unreadable file (permissions, transient FS error): skip it.
            continue
    return texts
|
||||
|
||||
# ASVS 4.0 — V-files contain requirements
|
||||
asvs_dir = GITHUB_DIR / "ASVS" / "4.0" / "en"
|
||||
asvs_files = load_markdown_dir(asvs_dir)
|
||||
asvs_full = "\n".join(asvs_files.values())
|
||||
asvs_norm = normalize(asvs_full)
|
||||
print(f"ASVS 4.0 Markdown: {len(asvs_files)} files, {len(asvs_full):,} chars")
|
||||
|
||||
# SAMM core — YAML + Markdown
|
||||
samm_dir = GITHUB_DIR / "samm-core"
|
||||
samm_texts = {}
|
||||
for f in samm_dir.rglob("*.yml"):
|
||||
try:
|
||||
samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
|
||||
except:
|
||||
pass
|
||||
for f in samm_dir.rglob("*.md"):
|
||||
try:
|
||||
samm_texts[str(f.relative_to(samm_dir))] = f.read_text(encoding='utf-8', errors='replace')
|
||||
except:
|
||||
pass
|
||||
samm_full = "\n".join(samm_texts.values())
|
||||
samm_norm = normalize(samm_full)
|
||||
print(f"SAMM 2.0 source: {len(samm_texts)} files, {len(samm_full):,} chars")
|
||||
|
||||
# MASVS — control markdown files
|
||||
masvs_dir = GITHUB_DIR / "masvs"
|
||||
masvs_files = {}
|
||||
for f in masvs_dir.rglob("*.md"):
|
||||
try:
|
||||
masvs_files[str(f.relative_to(masvs_dir))] = f.read_text(encoding='utf-8', errors='replace')
|
||||
except:
|
||||
pass
|
||||
masvs_full = "\n".join(masvs_files.values())
|
||||
masvs_norm = normalize(masvs_full)
|
||||
print(f"MASVS 2.0 source: {len(masvs_files)} files, {len(masvs_full):,} chars")
|
||||
|
||||
# API Security
|
||||
api_dir = GITHUB_DIR / "api-security"
|
||||
api_files = {}
|
||||
for f in api_dir.rglob("*.md"):
|
||||
try:
|
||||
api_files[str(f.relative_to(api_dir))] = f.read_text(encoding='utf-8', errors='replace')
|
||||
except:
|
||||
pass
|
||||
api_full = "\n".join(api_files.values())
|
||||
api_norm = normalize(api_full)
|
||||
print(f"API Security source: {len(api_files)} files, {len(api_full):,} chars")
|
||||
|
||||
# Source → (normalized_text, index_builder)
|
||||
SOURCE_GITHUB = {
|
||||
"OWASP ASVS 4.0": asvs_norm,
|
||||
"OWASP SAMM 2.0": samm_norm,
|
||||
"OWASP MASVS 2.0": masvs_norm,
|
||||
"OWASP API Security Top 10 (2023)": api_norm,
|
||||
}
|
||||
|
||||
# Build indexes for each source
|
||||
def build_asvs_index(text):
    """Index ASVS requirement ids (e.g. 'V1.2.3') by first occurrence position."""
    hits = sorted((m.start(), m.group(1), "requirement")
                  for m in re.finditer(r'(V\d+\.\d+(?:\.\d+)?)\b', text))
    seen = set()
    result = []
    for pos, label, typ in hits:
        if label not in seen:
            seen.add(label)
            result.append((pos, label, typ))
    return result
|
||||
|
||||
def build_samm_index(text):
    """Index SAMM anchors: numbered sections plus known practice names.

    Returns (position, label, type) tuples sorted by position and
    deduplicated on label (first occurrence wins).
    """
    section_pattern = r'(?:^|\s)(\d+\.\d+(?:\.\d+)?)\s+[A-Z]'
    practice_pattern = (r'((?:Strategy|Education|Policy|Threat|Security Requirements|Secure Architecture|'
                        r'Secure Build|Secure Deployment|Defect Management|Environment Management|'
                        r'Incident Management|Requirements Testing|Security Testing|'
                        r'Design Review|Implementation Review|Operations Management)'
                        r'[^.\n]{0,30})')
    hits = [(m.start(), f"Section {m.group(1)}", "section")
            for m in re.finditer(section_pattern, text)]
    hits.extend((m.start(), m.group(1)[:50], "section")
                for m in re.finditer(practice_pattern, text))
    hits.sort(key=lambda h: h[0])
    seen = set()
    result = []
    for pos, label, typ in hits:
        if label not in seen:
            seen.add(label)
            result.append((pos, label, typ))
    return result
|
||||
|
||||
def build_masvs_index(text):
    """Index MASVS requirement ids (e.g. 'MASVS-AUTH-1') by first occurrence."""
    hits = sorted((m.start(), m.group(1), "requirement")
                  for m in re.finditer(r'(MASVS-[A-Z]+-\d+)', text))
    seen = set()
    result = []
    for pos, label, typ in hits:
        if label not in seen:
            seen.add(label)
            result.append((pos, label, typ))
    return result
|
||||
|
||||
def build_api_index(text):
    """Index API Security Top 10 ids (e.g. 'API1:2023') by first occurrence."""
    hits = sorted((m.start(), m.group(1), "category")
                  for m in re.finditer(r'(API\d+:\d{4})', text))
    seen = set()
    result = []
    for pos, label, typ in hits:
        if label not in seen:
            seen.add(label)
            result.append((pos, label, typ))
    return result
|
||||
|
||||
SOURCE_INDEX_BUILDERS = {
|
||||
"OWASP ASVS 4.0": build_asvs_index,
|
||||
"OWASP SAMM 2.0": build_samm_index,
|
||||
"OWASP MASVS 2.0": build_masvs_index,
|
||||
"OWASP API Security Top 10 (2023)": build_api_index,
|
||||
}
|
||||
|
||||
# Build all indexes on normalized text
|
||||
source_indexes = {}
|
||||
for name, norm_text in SOURCE_GITHUB.items():
|
||||
builder = SOURCE_INDEX_BUILDERS[name]
|
||||
idx = builder(norm_text)
|
||||
source_indexes[name] = idx
|
||||
print(f" {name}: {len(idx)} index entries")
|
||||
|
||||
def find_text(orig_text, source_name):
    """Find control text in GitHub source. Returns (label, type) or None.

    Tries several snippets of the normalized control text (varying start
    fraction and length) against the source's normalized text; the first
    hit is attributed to the nearest anchor at or before the hit position.
    """
    haystack = SOURCE_GITHUB.get(source_name)
    if not haystack:
        return None
    anchors = source_indexes.get(source_name, [])
    needle = normalize(orig_text)
    if len(needle) < 20:
        return None

    for fraction in (0.25, 0.1, 0.5, 0.0, 0.75):
        begin = max(0, int(len(needle) * fraction))
        for size in (80, 60, 40, 30, 20):
            snippet = needle[begin:begin + size]
            if len(snippet) < 15:
                continue
            hit = haystack.find(snippet)
            if hit < 0:
                continue
            for a_pos, a_label, a_type in reversed(anchors):
                if a_pos <= hit:
                    return (a_label, a_type)
            return ("Unknown", "unknown")
    return None
|
||||
|
||||
def find_in_any_github(orig_text, exclude_source=None):
    """Probe every GitHub-derived source except *exclude_source*.

    Returns (source_name, label, type) for the first source that matches,
    or None when no source contains the text.
    """
    for source_name in SOURCE_GITHUB:
        if source_name == exclude_source:
            continue
        hit = find_text(orig_text, source_name)
        if hit is not None:
            return (source_name, hit[0], hit[1])
    return None
|
||||
|
||||
# ── DB ──
|
||||
db_url = os.environ['DATABASE_URL']
|
||||
parsed = urllib.parse.urlparse(db_url)
|
||||
conn = psycopg2.connect(
|
||||
host=parsed.hostname, port=parsed.port or 5432,
|
||||
user=parsed.username, password=parsed.password,
|
||||
dbname=parsed.path.lstrip('/'),
|
||||
options="-c search_path=compliance,public"
|
||||
)
|
||||
cur = conn.cursor()
|
||||
|
||||
# ── Process each OWASP source ──
|
||||
total_matched = 0
|
||||
total_cross = 0
|
||||
total_not_found = 0
|
||||
all_updates = []
|
||||
|
||||
for source in ['OWASP ASVS 4.0', 'OWASP SAMM 2.0', 'OWASP MASVS 2.0', 'OWASP API Security Top 10 (2023)']:
|
||||
cur.execute("""
|
||||
SELECT id, control_id, title, source_original_text, release_state
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' = %s
|
||||
AND source_citation->>'article_type' IS NULL
|
||||
AND source_original_text IS NOT NULL
|
||||
AND release_state NOT IN ('duplicate', 'too_close')
|
||||
ORDER BY control_id
|
||||
""", (source,))
|
||||
controls = cur.fetchall()
|
||||
|
||||
if not controls:
|
||||
continue
|
||||
|
||||
print(f"\n{'='*60}")
|
||||
print(f"{source} — {len(controls)} unmatched active")
|
||||
print(f"{'='*60}")
|
||||
|
||||
matched = 0
|
||||
cross_matched = 0
|
||||
not_found = 0
|
||||
|
||||
for ctrl in controls:
|
||||
uid, cid, title, text, state = ctrl
|
||||
|
||||
# Try own GitHub source
|
||||
result = find_text(text, source)
|
||||
if result:
|
||||
matched += 1
|
||||
total_matched += 1
|
||||
all_updates.append((uid, cid, source, result[0], result[1]))
|
||||
print(f" {cid:10s} → {result[0]:30s} [{result[1]}]")
|
||||
continue
|
||||
|
||||
# Try other GitHub sources
|
||||
cross = find_in_any_github(text, exclude_source=source)
|
||||
if cross:
|
||||
cross_matched += 1
|
||||
total_cross += 1
|
||||
all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
|
||||
print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}] (CROSS)")
|
||||
continue
|
||||
|
||||
not_found += 1
|
||||
total_not_found += 1
|
||||
|
||||
print(f"\n Own source matched: {matched}")
|
||||
print(f" Cross-source: {cross_matched}")
|
||||
print(f" Not found: {not_found}")
|
||||
|
||||
# ── Also try OWASP Top 10 remaining unmatched (34 active left after dup marking) ──
|
||||
cur.execute("""
|
||||
SELECT id, control_id, title, source_original_text, release_state
|
||||
FROM compliance.canonical_controls
|
||||
WHERE source_citation->>'source' = 'OWASP Top 10 (2021)'
|
||||
AND source_citation->>'article_type' IS NULL
|
||||
AND source_original_text IS NOT NULL
|
||||
AND release_state NOT IN ('duplicate', 'too_close')
|
||||
ORDER BY control_id
|
||||
""")
|
||||
top10_remaining = cur.fetchall()
|
||||
if top10_remaining:
|
||||
print(f"\n{'='*60}")
|
||||
print(f"OWASP Top 10 (2021) — {len(top10_remaining)} remaining unmatched active")
|
||||
print(f"{'='*60}")
|
||||
for ctrl in top10_remaining:
|
||||
uid, cid, title, text, state = ctrl
|
||||
cross = find_in_any_github(text)
|
||||
if cross:
|
||||
total_cross += 1
|
||||
all_updates.append((uid, cid, cross[0], cross[1], cross[2]))
|
||||
print(f" {cid:10s} → [{cross[0]}] {cross[1]:20s} [{cross[2]}]")
|
||||
else:
|
||||
total_not_found += 1
|
||||
|
||||
# ── Summary ──
|
||||
print(f"\n{'='*60}")
|
||||
print(f"ZUSAMMENFASSUNG")
|
||||
print(f"{'='*60}")
|
||||
print(f" Matched in eigener GitHub-Quelle: {total_matched}")
|
||||
print(f" Cross-source matched: {total_cross}")
|
||||
print(f" Nicht gefunden: {total_not_found}")
|
||||
print(f" Total Updates: {len(all_updates)}")
|
||||
|
||||
# ── Apply updates ──
|
||||
if all_updates:
|
||||
print(f"\nApplying {len(all_updates)} updates to DB...")
|
||||
applied = 0
|
||||
for uid, cid, correct_source, label, typ in all_updates:
|
||||
# Update article + article_type, and fix source if cross-matched
|
||||
cur.execute("""
|
||||
UPDATE compliance.canonical_controls
|
||||
SET source_citation = source_citation ||
|
||||
jsonb_build_object('article', %s, 'article_type', %s)
|
||||
WHERE id = %s
|
||||
AND (source_citation->>'article' IS DISTINCT FROM %s
|
||||
OR source_citation->>'article_type' IS DISTINCT FROM %s)
|
||||
""", (label, typ, uid, label, typ))
|
||||
if cur.rowcount > 0:
|
||||
applied += 1
|
||||
|
||||
conn.commit()
|
||||
print(f" Applied: {applied} controls updated")
|
||||
|
||||
# Type distribution
|
||||
type_counts = {}
|
||||
for _, _, _, _, typ in all_updates:
|
||||
type_counts[typ] = type_counts.get(typ, 0) + 1
|
||||
print(f"\n Article type distribution:")
|
||||
for t, c in sorted(type_counts.items(), key=lambda x: -x[1]):
|
||||
print(f" {t:12s}: {c:5d}")
|
||||
|
||||
conn.close()
|
||||
357
scripts/qa/phase5_normalize_and_cleanup.py
Normal file
357
scripts/qa/phase5_normalize_and_cleanup.py
Normal file
@@ -0,0 +1,357 @@
|
||||
"""Phase 5: Source Normalization + Duplicate Hard Delete.
|
||||
|
||||
Steps:
|
||||
1. OSCAL controls: add source_regulation to generation_metadata
|
||||
2. Fix 20 v3 controls with NULL source (tag as manually_reviewed)
|
||||
3. Fix empty-string source (DATA-631 → Telekommunikationsgesetz Oesterreich)
|
||||
4. Fix OWASP cross-source misattributions (regulation_code vs actual source)
|
||||
5. Hard delete duplicate/too_close controls (3,301 controls, 0 FK refs)
|
||||
6. Clean up canonical_processed_chunks generated_control_ids
|
||||
|
||||
Usage:
|
||||
export DATABASE_URL='postgresql://...'
|
||||
python3 scripts/qa/phase5_normalize_and_cleanup.py [--dry-run] [--step N]
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
|
||||
DRY_RUN = "--dry-run" in sys.argv
|
||||
STEP_ONLY = None
|
||||
for arg in sys.argv:
|
||||
if arg.startswith("--step"):
|
||||
idx = sys.argv.index(arg)
|
||||
if idx + 1 < len(sys.argv):
|
||||
STEP_ONLY = int(sys.argv[idx + 1])
|
||||
|
||||
db_url = os.environ['DATABASE_URL']
|
||||
parsed = urllib.parse.urlparse(db_url)
|
||||
conn = psycopg2.connect(
|
||||
host=parsed.hostname, port=parsed.port or 5432,
|
||||
user=parsed.username, password=parsed.password,
|
||||
dbname=parsed.path.lstrip('/'),
|
||||
options="-c search_path=compliance,public"
|
||||
)
|
||||
cur = conn.cursor()
|
||||
|
||||
def should_run(step):
    """Return True when *step* is selected: no --step filter, or an exact match."""
    if STEP_ONLY is None:
        return True
    return step == STEP_ONLY
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 1: OSCAL controls — add source_regulation to generation_metadata
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(1):
    # ── Step 1 ──────────────────────────────────────────────────────
    # OSCAL-imported controls lack a source_regulation marker in their
    # generation_metadata; backfill it so per-regulation queries see them.
    banner = "=" * 70
    print(banner)
    print("STEP 1: OSCAL controls — source_regulation in generation_metadata")
    print(banner)

    # Shared predicate: OSCAL imports whose source_regulation is absent/empty.
    missing_filter = """
        WHERE generation_strategy = 'oscal_import'
          AND (generation_metadata->>'source_regulation' IS NULL
               OR generation_metadata->>'source_regulation' = '')
    """
    cur.execute("SELECT count(*) FROM compliance.canonical_controls " + missing_filter)
    count = cur.fetchone()[0]
    print(f" OSCAL controls without source_regulation: {count}")

    if count > 0:
        if DRY_RUN:
            print(f" [DRY RUN] Would update {count} controls")
        else:
            # All OSCAL imports here originate from NIST SP 800-53r5.
            cur.execute(
                "UPDATE compliance.canonical_controls "
                "SET generation_metadata = COALESCE(generation_metadata, '{}'::jsonb) "
                """|| '{"source_regulation": "nist_sp800_53r5"}'::jsonb """
                + missing_filter
            )
            print(f" Updated: {cur.rowcount}")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 2: v3 controls with NULL source — tag source as best guess
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(2):
    # ── Step 2 ──────────────────────────────────────────────────────
    # v3/document_grouped controls with no source citation at all: flag
    # them for manual review instead of guessing a regulation.
    print("=" * 70)
    print("STEP 2: Fix v3 controls with NULL source")
    print("=" * 70)

    cur.execute("""
        SELECT id, control_id, title
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' IS NULL
          AND pipeline_version = 3
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    v3_null = cur.fetchall()
    print(f" v3 controls with NULL source: {len(v3_null)}")

    if v3_null:
        if DRY_RUN:
            print(f" [DRY RUN] Would mark {len(v3_null)} as needs_review")
        else:
            # Same per-row UPDATE as before, issued via executemany.
            cur.executemany("""
                UPDATE compliance.canonical_controls
                SET release_state = 'needs_review',
                    generation_metadata = COALESCE(generation_metadata, '{}'::jsonb)
                        || '{"missing_source": true}'::jsonb
                WHERE id = %s
            """, [(row_uuid,) for row_uuid, _cid, _title in v3_null])
            print(f" Marked {len(v3_null)} as needs_review with missing_source flag")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 3: Fix empty-string source (DATA-631)
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(3):
    # ── Step 3 ──────────────────────────────────────────────────────
    # A handful of controls carry an empty-string source; derive a real
    # value from the regulation code (at_tkg is the only known case).
    print("=" * 70)
    print("STEP 3: Fix empty-string source")
    print("=" * 70)

    cur.execute("""
        SELECT id, control_id, title,
               generation_metadata->>'source_regulation' as reg
        FROM compliance.canonical_controls
        WHERE source_citation->>'source' = ''
          AND release_state NOT IN ('duplicate', 'too_close')
    """)
    empty_src = cur.fetchall()
    print(f" Controls with empty source: {len(empty_src)}")

    for row_uuid, control_id, title, reg in empty_src:
        print(f" {control_id} | reg={reg} | {title[:60]}")
        new_source = (
            'Telekommunikationsgesetz Oesterreich'
            if reg == 'at_tkg'
            else f"Unbekannt ({reg})"
        )
        if DRY_RUN:
            print(f" [DRY RUN] Would set source='{new_source}'")
        else:
            # json.dumps produces a quoted JSON string for the ::jsonb cast.
            cur.execute("""
                UPDATE compliance.canonical_controls
                SET source_citation = jsonb_set(
                    source_citation, '{source}', %s::jsonb
                )
                WHERE id = %s
            """, (json.dumps(new_source), row_uuid))
            print(f" Set source='{new_source}'")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 4: Fix OWASP cross-source misattributions
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(4):
    # ── Step 4 ──────────────────────────────────────────────────────
    # Some OWASP controls are filed under the wrong regulation code.
    # The source string in source_citation is authoritative, so move the
    # regulation code to whatever matches that source.
    print("=" * 70)
    print("STEP 4: Fix OWASP cross-source misattributions")
    print("=" * 70)

    OWASP_REG_TO_SOURCE = {
        'owasp_top10_2021': 'OWASP Top 10 (2021)',
        'owasp_asvs': 'OWASP ASVS 4.0',
        'owasp_masvs': 'OWASP MASVS 2.0',
        'owasp_samm': 'OWASP SAMM 2.0',
        'owasp_api_top10_2023': 'OWASP API Security Top 10 (2023)',
    }
    # Reverse lookup: actual source string -> correct regulation code.
    SOURCE_TO_REG = {source: reg for reg, source in OWASP_REG_TO_SOURCE.items()}

    total_fixed = 0
    for reg_code, expected_source in OWASP_REG_TO_SOURCE.items():
        cur.execute("""
            SELECT id, control_id, source_citation->>'source' as src
            FROM compliance.canonical_controls
            WHERE generation_metadata->>'source_regulation' = %s
              AND source_citation->>'source' <> %s
              AND release_state NOT IN ('duplicate', 'too_close')
        """, (reg_code, expected_source))
        mismatches = cur.fetchall()
        if not mismatches:
            continue

        print(f"\n {reg_code} → {len(mismatches)} Mismatches:")
        for row_uuid, control_id, actual_source in mismatches:
            correct_reg = SOURCE_TO_REG.get(actual_source)
            if correct_reg is None:
                print(f" {control_id} | {actual_source} → no mapping found")
                continue
            print(f" {control_id} | {actual_source} → reg={correct_reg}")
            if not DRY_RUN:
                cur.execute("""
                    UPDATE compliance.canonical_controls
                    SET generation_metadata = jsonb_set(
                        generation_metadata, '{source_regulation}', %s::jsonb
                    )
                    WHERE id = %s
                """, (json.dumps(correct_reg), row_uuid))
            total_fixed += 1

    if DRY_RUN:
        print(f"\n [DRY RUN] Would fix {total_fixed} misattributions")
    else:
        print(f"\n Fixed: {total_fixed} misattributions")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 5: Hard delete duplicate/too_close controls
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(5):
    # ── Step 5 ──────────────────────────────────────────────────────
    # Hard-delete controls already classified as duplicate/too_close.
    # Refuse to run while any other table still points at them.
    print("=" * 70)
    print("STEP 5: Hard delete duplicate/too_close controls")
    print("=" * 70)

    referencing = [
        ('canonical_control_mappings', 'control_id'),
        ('obligation_extractions', 'control_uuid'),
        ('crosswalk_matrix', 'master_control_uuid'),
        ('obligation_candidates', 'parent_control_uuid'),
    ]
    for table, col in referencing:
        # table/col come from the hardcoded list above, never user input,
        # so interpolating them into the SQL is safe.
        cur.execute(f"""
            SELECT count(*)
            FROM compliance.{table} t
            JOIN compliance.canonical_controls cc ON cc.id = t.{col}
            WHERE cc.release_state IN ('duplicate', 'too_close')
        """)
        fk_count = cur.fetchone()[0]
        if fk_count > 0:
            print(f" WARNING: {table}.{col} has {fk_count} refs to dup/too_close!")
            print(" ABORTING Step 5 — clean FK refs first!")
            sys.exit(1)
        print(f" {table}.{col}: 0 refs ✓")

    # parent_control_uuid self-references inside canonical_controls.
    cur.execute("""
        SELECT count(*)
        FROM compliance.canonical_controls child
        JOIN compliance.canonical_controls parent ON parent.id = child.parent_control_uuid
        WHERE parent.release_state IN ('duplicate', 'too_close')
    """)
    self_refs = cur.fetchone()[0]
    if self_refs > 0:
        print(f" WARNING: {self_refs} child controls reference dup/too_close parents!")
        print(" ABORTING Step 5!")
        sys.exit(1)
    print(" Self-references: 0 ✓")

    cur.execute("""
        SELECT release_state, count(*)
        FROM compliance.canonical_controls
        WHERE release_state IN ('duplicate', 'too_close')
        GROUP BY 1
    """)
    state_counts = cur.fetchall()
    for state, cnt in state_counts:
        print(f"\n {state}: {cnt}")

    total = sum(cnt for _state, cnt in state_counts)
    print(f"\n TOTAL to delete: {total}")

    if DRY_RUN:
        print(f" [DRY RUN] Would delete {total} controls")
    else:
        cur.execute("""
            DELETE FROM compliance.canonical_controls
            WHERE release_state IN ('duplicate', 'too_close')
        """)
        print(f" Deleted: {cur.rowcount} controls")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Step 6: Clean up canonical_processed_chunks generated_control_ids
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
if should_run(6):
    # ── Step 6 ──────────────────────────────────────────────────────
    # Chunks remember which controls they generated; after Step 5's hard
    # deletes those JSON ID lists may reference rows that no longer exist.
    print("=" * 70)
    print("STEP 6: Clean up processed chunks (remove deleted control IDs)")
    print("=" * 70)

    if DRY_RUN and should_run(5):
        # Dangling IDs only appear after Step 5 actually deletes; in a dry
        # run nothing was removed, so there is nothing to measure.
        print(" [DRY RUN] Skipping — depends on Step 5 deletion")
    else:
        cur.execute("""
            SELECT id, generated_control_ids
            FROM compliance.canonical_processed_chunks
            WHERE generated_control_ids IS NOT NULL
              AND generated_control_ids <> '[]'::jsonb
        """)
        chunks = cur.fetchall()
        print(f" Chunks with generated_control_ids: {len(chunks)}")

        # Surviving control UUIDs, as text, for membership tests below.
        cur.execute("SELECT id::text FROM compliance.canonical_controls")
        existing_ids = {r[0] for r in cur.fetchall()}
        print(f" Existing controls: {len(existing_ids)}")

        cleaned = 0
        for chunk_id, control_ids in chunks:
            # jsonb may arrive as a parsed list or as its string form.
            if isinstance(control_ids, str):
                control_ids = json.loads(control_ids)
            if not isinstance(control_ids, list):
                continue
            valid_ids = [cid for cid in control_ids if cid in existing_ids]
            if len(valid_ids) == len(control_ids):
                continue
            cleaned += 1
            # BUGFIX: the UPDATE previously ran even under --dry-run (it was
            # only saved by the final commit being skipped); guard it so a
            # dry run issues no writes at all.
            if not DRY_RUN:
                cur.execute("""
                    UPDATE compliance.canonical_processed_chunks
                    SET generated_control_ids = %s::jsonb
                    WHERE id = %s
                """, (json.dumps(valid_ids), chunk_id))

        if DRY_RUN:
            print(f" [DRY RUN] Chunks that would be cleaned: {cleaned}")
        else:
            print(f" Chunks cleaned: {cleaned}")
    print()
|
||||
|
||||
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# Final summary
|
||||
# ══════════════════════════════════════════════════════════════════
|
||||
# ── Final summary ───────────────────────────────────────────────────
# Commit (unless dry-running) and report the resulting state distribution.
if not DRY_RUN:
    conn.commit()
    headline = "COMMITTED. Final state:"
else:
    headline = "[DRY RUN] No changes committed. Current state:"
print("=" * 70)
print(headline)
print("=" * 70)

cur.execute("""
    SELECT release_state, count(*)
    FROM compliance.canonical_controls
    GROUP BY 1
    ORDER BY count(*) DESC
""")
state_rows = cur.fetchall()
total = sum(cnt for _state, cnt in state_rows)
active = sum(cnt for state, cnt in state_rows if state not in ('duplicate', 'too_close'))
for state, cnt in state_rows:
    print(f" {state:15s}: {cnt:5d}")

print(f"\n TOTAL: {total}")
print(f" AKTIV: {active}")

conn.close()
|
||||
655
scripts/qa/phase74_generate_gap_controls.py
Normal file
655
scripts/qa/phase74_generate_gap_controls.py
Normal file
@@ -0,0 +1,655 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Phase 7.4: Generate new controls for gap articles via Anthropic Claude Sonnet.
|
||||
|
||||
Reads gap_analysis_results.json, extracts article text from PDFs,
|
||||
calls Claude Sonnet to generate controls, inserts into DB.
|
||||
|
||||
Usage:
|
||||
python3 phase74_generate_gap_controls.py --dry-run # show what would be generated
|
||||
python3 phase74_generate_gap_controls.py # generate and insert
|
||||
python3 phase74_generate_gap_controls.py --source "DSGVO" # filter by source
|
||||
python3 phase74_generate_gap_controls.py --resume # skip already-generated articles
|
||||
"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
import hashlib
|
||||
import argparse
|
||||
import psycopg2
|
||||
import urllib.parse
|
||||
import requests
|
||||
from pathlib import Path
|
||||
from collections import Counter
|
||||
|
||||
sys.path.insert(0, os.path.dirname(__file__))
|
||||
from pdf_qa_all import (
|
||||
SOURCE_FILE_MAP, read_file, classify_doc, normalize,
|
||||
build_eu_article_index, build_de_law_index, build_nist_index,
|
||||
build_owasp_index, build_generic_index, MAX_ARTICLES,
|
||||
)
|
||||
|
||||
# ── Config ──────────────────────────────────────────────────────────
|
||||
ANTHROPIC_URL = "https://api.anthropic.com/v1/messages"
|
||||
ANTHROPIC_MODEL = os.environ.get("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
|
||||
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||
PIPELINE_VERSION = 5
|
||||
GAP_RESULTS_FILE = "/tmp/gap_analysis_results.json"
|
||||
PDF_DIR = Path(os.path.expanduser("~/rag-ingestion/pdfs"))
|
||||
|
||||
try:
|
||||
import fitz
|
||||
except ImportError:
|
||||
fitz = None
|
||||
|
||||
# ── Source name → regulation_code reverse map ────────────────────────
|
||||
# Built from REGULATION_LICENSE_MAP in control_generator.py
|
||||
SOURCE_TO_REGCODE = {
|
||||
"DSGVO (EU) 2016/679": "eu_2016_679",
|
||||
"KI-Verordnung (EU) 2024/1689": "eu_2024_1689",
|
||||
"NIS2-Richtlinie (EU) 2022/2555": "eu_2022_2555",
|
||||
"Cyber Resilience Act (CRA)": "eu_2024_2847",
|
||||
"Maschinenverordnung (EU) 2023/1230": "eu_2023_1230",
|
||||
"EU Blue Guide 2022": "eu_blue_guide_2022",
|
||||
"Markets in Crypto-Assets (MiCA)": "mica",
|
||||
"Batterieverordnung (EU) 2023/1542": "eu_2023_1542",
|
||||
"AML-Verordnung": "amlr",
|
||||
"Data Governance Act (DGA)": "dga",
|
||||
"Data Act": "data_act",
|
||||
"GPSR (EU) 2023/988": "gpsr",
|
||||
"IFRS-Übernahmeverordnung": "ifrs",
|
||||
"NIST SP 800-53 Rev. 5": "nist_sp800_53r5",
|
||||
"NIST SP 800-207 (Zero Trust)": "nist_sp800_207",
|
||||
"NIST SP 800-63-3": "nist_sp800_63_3",
|
||||
"NIST AI Risk Management Framework": "nist_ai_rmf",
|
||||
"NIST SP 800-218 (SSDF)": "nist_sp_800_218",
|
||||
"NIST Cybersecurity Framework 2.0": "nist_csf_2_0",
|
||||
"OWASP Top 10 (2021)": "owasp_top10",
|
||||
"OWASP ASVS 4.0": "owasp_asvs",
|
||||
"OWASP SAMM 2.0": "owasp_samm",
|
||||
"OWASP API Security Top 10 (2023)": "owasp_api_top10",
|
||||
"OWASP MASVS 2.0": "owasp_masvs",
|
||||
"ENISA ICS/SCADA Dependencies": "enisa_ics_scada",
|
||||
"ENISA Supply Chain Good Practices": "enisa_supply_chain",
|
||||
"CISA Secure by Design": "cisa_sbd",
|
||||
"Bundesdatenschutzgesetz (BDSG)": "bdsg",
|
||||
"Gewerbeordnung (GewO)": "gewo",
|
||||
"Handelsgesetzbuch (HGB)": "hgb",
|
||||
"Abgabenordnung (AO)": "ao",
|
||||
"OECD KI-Empfehlung": "oecd_ai_principles",
|
||||
}
|
||||
|
||||
# License info per regulation code (from REGULATION_LICENSE_MAP)
|
||||
LICENSE_MAP = {
|
||||
"eu_2016_679": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2024_1689": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2022_2555": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2024_2847": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2023_1230": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_blue_guide_2022": {"license": "EU_PUBLIC", "rule": 1, "source_type": "guideline"},
|
||||
"mica": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"eu_2023_1542": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"amlr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"dga": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"data_act": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"gpsr": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"ifrs": {"license": "EU_LAW", "rule": 1, "source_type": "law"},
|
||||
"nist_sp800_53r5": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_sp800_207": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_sp800_63_3": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_ai_rmf": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "source_type": "standard"},
|
||||
"owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"owasp_samm": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"owasp_api_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "source_type": "standard"},
|
||||
"enisa_ics_scada": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
|
||||
"enisa_supply_chain": {"license": "CC-BY-4.0", "rule": 2, "source_type": "guideline"},
|
||||
"cisa_sbd": {"license": "US_GOV_PUBLIC", "rule": 1, "source_type": "guideline"},
|
||||
"bdsg": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
|
||||
"gewo": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
|
||||
"hgb": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
|
||||
"ao": {"license": "DE_LAW", "rule": 1, "source_type": "law"},
|
||||
"oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "source_type": "standard"},
|
||||
}
|
||||
|
||||
# Domain detection keywords
|
||||
DOMAIN_KEYWORDS = {
|
||||
"AUTH": ["authentifizierung", "anmeldung", "login", "passwort", "identit", "identity", "credential"],
|
||||
"CRYP": ["verschlüsselung", "kryptogra", "encrypt", "cipher", "hash", "tls", "ssl", "signatur"],
|
||||
"NET": ["netzwerk", "network", "firewall", "router", "dns", "ip-adress"],
|
||||
"DATA": ["daten", "data", "personenbezogen", "datenschutz", "privacy", "gdpr", "dsgvo", "verarbeitung"],
|
||||
"LOG": ["protokoll", "logging", "audit", "nachvollzieh", "aufzeichn"],
|
||||
"ACC": ["zugriff", "access", "berechtigung", "autorisierung", "authorization", "rolle"],
|
||||
"SEC": ["sicherheit", "security", "schutz", "protect", "schwachstell", "vulnerab"],
|
||||
"INC": ["vorfall", "incident", "breach", "meldung", "reaktion", "response", "notfall"],
|
||||
"AI": ["künstliche intelligenz", "ki-system", "ai system", "machine learning", "algorithm", "hochrisiko-ki"],
|
||||
"COMP": ["compliance", "konformität", "audit", "zertifizierung", "regulier", "vorschrift"],
|
||||
"GOV": ["behörde", "aufsicht", "governance", "marktüberwachung", "authority"],
|
||||
"FIN": ["finanz", "zahlungs", "payment", "crypto", "krypto-", "geldwäsche", "aml"],
|
||||
"ENV": ["umwelt", "environment", "batterie", "recycling", "entsorgu", "nachhaltig"],
|
||||
}
|
||||
|
||||
# ── Prompt (same as control_generator.py) ────────────────────────────
|
||||
|
||||
SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
|
||||
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
|
||||
Antworte NUR mit validem JSON. Bei mehreren Controls antworte mit einem JSON-Array."""
|
||||
|
||||
APPLICABILITY_PROMPT = """- applicable_industries: Liste der Branchen fuer die dieses Control relevant ist.
|
||||
Verwende ["all"] wenn der Control branchenuebergreifend gilt.
|
||||
Moegliche Werte: "all", "Technologie / IT", "IT Dienstleistungen", "E-Commerce / Handel",
|
||||
"Finanzdienstleistungen", "Versicherungen", "Gesundheitswesen", "Pharma", "Bildung",
|
||||
"Beratung / Consulting", "Marketing / Agentur", "Produktion / Industrie",
|
||||
"Logistik / Transport", "Immobilien", "Bau", "Energie", "Automobil",
|
||||
"Luft- / Raumfahrt", "Maschinenbau", "Anlagenbau", "Automatisierung", "Robotik",
|
||||
"Messtechnik", "Agrar", "Chemie", "Minen / Bergbau", "Telekommunikation",
|
||||
"Medien / Verlage", "Gastronomie / Hotellerie", "Recht / Kanzlei",
|
||||
"Oeffentlicher Dienst", "Verteidigung / Ruestung", "Wasser- / Abwasserwirtschaft",
|
||||
"Lebensmittel", "Digitale Infrastruktur", "Weltraum", "Post / Kurierdienste",
|
||||
"Abfallwirtschaft", "Forschung"
|
||||
- applicable_company_size: Ab welcher Unternehmensgroesse gilt dieses Control?
|
||||
Verwende ["all"] wenn keine Groessenbeschraenkung.
|
||||
Moegliche Werte: "all", "micro", "small", "medium", "large", "enterprise"
|
||||
- scope_conditions: null wenn keine besonderen Bedingungen, sonst:
|
||||
{"requires_any": ["signal"], "description": "Erklaerung"}
|
||||
Moegliche Signale: "uses_ai", "third_country_transfer", "processes_health_data",
|
||||
"processes_minors_data", "automated_decisions", "employee_monitoring",
|
||||
"video_surveillance", "financial_data", "is_kritis_operator", "payment_services" """
|
||||
|
||||
CATEGORY_LIST = [
|
||||
"Datenschutz-Grundlagen", "Betroffenenrechte", "Technische Massnahmen",
|
||||
"Organisatorische Massnahmen", "Auftragsverarbeitung", "Datentransfer",
|
||||
"Risikomanagement", "Incident Response", "KI-Regulierung", "Cybersicherheit",
|
||||
"Zugriffskontrolle", "Kryptographie", "Netzwerksicherheit", "Compliance-Management",
|
||||
"Produktsicherheit", "Marktüberwachung", "Supply Chain Security",
|
||||
"Finanzregulierung", "Arbeitsrecht", "Gewerberecht", "Handelsrecht",
|
||||
"Umwelt / Nachhaltigkeit", "Dokumentation", "Schulung / Awareness",
|
||||
]
|
||||
CATEGORY_LIST_STR = ", ".join(f'"{c}"' for c in CATEGORY_LIST)
|
||||
|
||||
|
||||
def build_prompt(source_name, article_label, article_text, license_type):
    """Assemble the (German) control-generation user prompt for one article.

    The article text is capped at 3000 characters; CATEGORY_LIST_STR and
    APPLICABILITY_PROMPT module constants are spliced into the template.
    """
    excerpt = article_text[:3000]
    prompt = f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_type}).

WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.

Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
- category: Inhaltliche Kategorie. Moegliche Werte: {CATEGORY_LIST_STR}
- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer")
- source_article: Artikel-/Paragraphen-Referenz (z.B. "Artikel 10", "§ 42")
- source_paragraph: Absatz-Referenz (z.B. "Absatz 5", "Nr. 2")
{APPLICABILITY_PROMPT}

Text: {excerpt}
Quelle: {source_name}, {article_label}"""
    return prompt
|
||||
|
||||
|
||||
# ── PDF article extraction ───────────────────────────────────────────
|
||||
|
||||
def extract_article_text(pdf_file, article_label, doc_type, full_text=None):
    """Extract the text of a specific article from a PDF.

    When *full_text* is supplied the PDF is not read; otherwise the file is
    loaded via read_file. Returns "" when the document or article is missing.
    """
    if full_text is None:
        full_text = read_file(pdf_file)
    if not full_text:
        return ""

    def _numbered_section(head_tmpl):
        # Slice from this article's heading to the next numbered heading
        # (or at most 5000 chars), then cap the stripped result at 3000.
        num_match = re.search(r'\d+', article_label)
        if not num_match:
            return ""
        num = int(num_match.group())
        head = re.search(head_tmpl.format(num), full_text)
        if not head:
            return ""
        begin = head.start()
        nxt = re.search(head_tmpl.format(num + 1), full_text)
        end = nxt.start() if nxt else min(begin + 5000, len(full_text))
        return full_text[begin:end].strip()[:3000]

    if doc_type == "eu_regulation":
        return _numbered_section(r'\nArtikel\s+{}\s*\n')
    if doc_type == "de_law":
        return _numbered_section(r'\n§\s+{}\b')

    if doc_type == "nist":
        anchor = re.search(rf'(?:^|\n)\s*{re.escape(article_label)}\b', full_text)
    else:
        # Generic / OWASP / ENISA: the label may sit anywhere on its line.
        anchor = re.search(rf'(?:^|\n).*{re.escape(article_label)}\b', full_text)
    if not anchor:
        return ""
    begin = anchor.start()
    return full_text[begin:begin + 3000].strip()
|
||||
|
||||
|
||||
# ── Anthropic API ────────────────────────────────────────────────────
|
||||
|
||||
def call_anthropic(prompt, system_prompt):
    """Call Anthropic API. Returns (parsed_data, raw_text, usage, error).

    On any failure (HTTP error, network error, malformed response) the
    error slot carries a message and the other slots are empty.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": prompt}],
    }

    try:
        resp = requests.post(ANTHROPIC_URL, headers=request_headers, json=body, timeout=120)
    except Exception as e:
        return None, "", {}, str(e)
    if resp.status_code != 200:
        return None, "", {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    try:
        data = resp.json()
        content = data["content"][0]["text"] if data.get("content") else ""
        usage = data.get("usage", {})
        return parse_json(content), content, usage, None
    except Exception as e:
        return None, "", {}, str(e)
|
||||
|
||||
|
||||
def parse_json(text):
    """Parse JSON from an LLM response, tolerating markdown code fences.

    Returns the parsed object (first element when the payload is a JSON
    array), or None when nothing parseable is found.
    """
    text = text.strip()
    if text.startswith("```"):
        # Drop the opening fence line and, when present, the closing one.
        fence_lines = text.split("\n")
        body = fence_lines[1:]
        if fence_lines[-1].strip().startswith("```"):
            body = fence_lines[1:-1]
        text = "\n".join(body).strip()

    try:
        payload = json.loads(text)
    except json.JSONDecodeError:
        # Fall back to the widest {...} span embedded in surrounding noise.
        embedded = re.search(r'\{[\s\S]*\}', text)
        if not embedded:
            return None
        try:
            return json.loads(embedded.group())
        except json.JSONDecodeError:
            return None
    if isinstance(payload, list):
        return payload[0] if payload else None
    return payload
|
||||
|
||||
|
||||
# ── Domain detection ─────────────────────────────────────────────────
|
||||
|
||||
def detect_domain(text):
    """Return the DOMAIN_KEYWORDS domain with the most keyword hits.

    Falls back to "SEC" when no keyword matches; ties resolve to the
    first domain in DOMAIN_KEYWORDS insertion order.
    """
    haystack = text.lower()
    best, best_score = "SEC", 0
    for domain, keywords in DOMAIN_KEYWORDS.items():
        hits = sum(kw in haystack for kw in keywords)
        if hits > best_score:
            best, best_score = domain, hits
    return best
|
||||
|
||||
|
||||
# ── Control ID generation ────────────────────────────────────────────
|
||||
|
||||
def generate_control_id(domain, cur):
    """Generate the next available control_id for a domain prefix.

    Takes MAX over the numeric suffix (not string ordering) so e.g.
    'COMP-99' vs 'COMP-1000' cannot yield a stale maximum.

    FIX: the suffix is now zero-padded to three digits so follow-up IDs
    ('AUTH-042') are formatted consistently with the 'AUTH-001' seed;
    previously only the first ID was padded. Suffixes >= 1000 are
    unaffected by :03d.

    Args:
        domain: domain code; its first four chars, uppercased, become the prefix.
        cur: an open DB-API cursor (psycopg2-style, %s placeholders).

    Returns:
        The next control id, e.g. "AUTH-001".
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    next_num = (row[0] + 1) if row and row[0] is not None else 1
    return f"{prefix}-{next_num:03d}"
|
||||
|
||||
|
||||
# ── Main ─────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Phase 7.4 entry point: generate canonical controls for gap articles.

    Reads the gap-analysis result file, then for every gap article of every
    source: extracts the article text from the cached source document, asks
    the Anthropic API for a draft control, and inserts it into
    ``compliance.canonical_controls``.

    Flags:
        --dry-run   show what would be generated, no API/DB writes
        --source    substring filter on the source name
        --resume    skip articles that already have controls
        --results   path to gap_analysis_results.json
    """
    parser = argparse.ArgumentParser(description="Phase 7.4: Generate controls for gap articles")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be generated")
    parser.add_argument("--source", type=str, help="Filter by source name substring")
    parser.add_argument("--resume", action="store_true", help="Skip articles that already have controls")
    parser.add_argument("--results", default=GAP_RESULTS_FILE, help="Path to gap_analysis_results.json")
    args = parser.parse_args()

    if not ANTHROPIC_API_KEY:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    # Load gap results
    with open(args.results) as f:
        gaps = json.load(f)
    total_gaps = sum(len(g["gap_articles"]) for g in gaps)
    print(f"Loaded {len(gaps)} sources with {total_gaps} gap articles")

    if args.source:
        gaps = [g for g in gaps if args.source.lower() in g["source"].lower()]
        total_gaps = sum(len(g["gap_articles"]) for g in gaps)
        print(f"Filtered to {len(gaps)} sources, {total_gaps} gaps")

    # DB connection with keepalive + reconnect helper
    db_url = os.environ['DATABASE_URL']
    parsed = urllib.parse.urlparse(db_url)

    def connect_db():
        """Create DB connection with TCP keepalive."""
        c = psycopg2.connect(
            host=parsed.hostname, port=parsed.port or 5432,
            user=parsed.username, password=parsed.password,
            dbname=parsed.path.lstrip('/'),
            options="-c search_path=compliance,public",
            keepalives=1, keepalives_idle=30,
            keepalives_interval=10, keepalives_count=5,
        )
        return c, c.cursor()

    conn, cur = connect_db()

    def ensure_db():
        """Reconnect if the connection is dead; return True when reconnected."""
        nonlocal conn, cur
        try:
            cur.execute("SELECT 1")
        except Exception:
            print(" [RECONNECT] DB connection lost, reconnecting...")
            try:
                conn.close()
            except Exception:
                pass
            conn, cur = connect_db()
            return True
        return False

    # Get framework UUID
    cur.execute("SELECT id FROM compliance.canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
    fw_row = cur.fetchone()
    if not fw_row:
        print("ERROR: Framework bp_security_v1 not found")
        sys.exit(1)
    framework_uuid = fw_row[0]

    # If resuming, load existing (source, article) pairs so we can skip them
    existing_articles = {}
    if args.resume:
        cur.execute("""
            SELECT source_citation->>'source', source_citation->>'article'
            FROM compliance.canonical_controls
            WHERE source_citation->>'article' IS NOT NULL
        """)
        for src, art in cur.fetchall():
            existing_articles.setdefault(src, set()).add(art)
        print(f"Resume mode: {sum(len(v) for v in existing_articles.values())} existing article-control pairs")

    # Stats
    stats = Counter()
    total_input_tokens = 0
    total_output_tokens = 0
    generated_ids = []
    errors = []
    t_start = time.time()

    # Pre-read PDFs (cache full text per source)
    pdf_cache = {}

    # Process sources with the most gaps first
    for gap_source in sorted(gaps, key=lambda g: -len(g["gap_articles"])):
        source_name = gap_source["source"]
        gap_articles = gap_source["gap_articles"]
        filename = SOURCE_FILE_MAP.get(source_name)
        reg_code = SOURCE_TO_REGCODE.get(source_name, "unknown")
        license_info = LICENSE_MAP.get(reg_code, {"license": "UNKNOWN", "rule": 1, "source_type": "unknown"})
        doc_type = classify_doc(source_name)

        if not filename:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        # Read PDF once per source
        if source_name not in pdf_cache:
            pdf_cache[source_name] = read_file(filename)
        full_text = pdf_cache[source_name]
        if not full_text:
            stats["skipped_no_pdf"] += len(gap_articles)
            continue

        print(f"\n{'='*70}")
        print(f"{source_name} — {len(gap_articles)} gaps (rule {license_info['rule']}, {doc_type})")
        print(f"{'='*70}")

        for gap in gap_articles:
            article_label = gap["label"]
            article_type = gap["type"]

            # Skip if already has controls (resume mode)
            if args.resume and article_label in existing_articles.get(source_name, set()):
                stats["skipped_exists"] += 1
                continue

            # Skip non-substantive NIST sections (intro chapters)
            if doc_type == "nist" and article_type == "section":
                section_match = re.match(r'Section (\d+)', article_label)
                if section_match and int(section_match.group(1)) <= 3:
                    stats["skipped_intro"] += 1
                    continue

            # Extract article text
            article_text = extract_article_text(filename, article_label, doc_type, full_text)
            if not article_text or len(article_text) < 30:
                stats["skipped_short_text"] += 1
                # BUG FIX: article_text may be None here, and len(None)
                # raised TypeError inside the SKIP log line.
                text_len = len(article_text) if article_text else 0
                print(f" SKIP {article_label}: text too short ({text_len} chars)")
                continue

            if args.dry_run:
                print(f" [DRY] {article_label} ({len(article_text)} chars)")
                stats["would_generate"] += 1
                continue

            # Call Anthropic
            prompt = build_prompt(source_name, article_label, article_text, license_info["license"])
            data, raw, usage, error = call_anthropic(prompt, SYSTEM_PROMPT)

            total_input_tokens += usage.get("input_tokens", 0)
            total_output_tokens += usage.get("output_tokens", 0)

            if error:
                stats["api_error"] += 1
                errors.append(f"{source_name} {article_label}: {error}")
                print(f" ERROR {article_label}: {error}")
                time.sleep(5)  # back off after an API error
                continue

            if not data:
                stats["parse_error"] += 1
                print(f" PARSE ERROR {article_label}")
                continue

            # Ensure DB is alive before writing (the LLM call may take long)
            ensure_db()

            # Build control fields — defensively coerce everything the
            # model returned, since the JSON shape is not guaranteed.
            title = str(data.get("title", ""))[:200]
            objective = str(data.get("objective", ""))
            rationale = str(data.get("rationale", ""))
            domain = str(data.get("domain", detect_domain(article_text))).upper()[:4]
            if not domain or len(domain) < 2:
                domain = detect_domain(article_text)

            control_id = generate_control_id(domain, cur)
            severity = str(data.get("severity", "medium")).lower()
            if severity not in ("low", "medium", "high", "critical"):
                severity = "medium"

            requirements = data.get("requirements", [])
            if not isinstance(requirements, list):
                requirements = [str(requirements)]
            test_procedure = data.get("test_procedure", [])
            if not isinstance(test_procedure, list):
                test_procedure = [str(test_procedure)]
            evidence = data.get("evidence", [])
            if not isinstance(evidence, list):
                evidence = [str(evidence)]
            tags = data.get("tags", [])
            if not isinstance(tags, list):
                tags = []
            target_audience = data.get("target_audience", [])
            if not isinstance(target_audience, list):
                target_audience = []
            applicable_industries = data.get("applicable_industries", ["all"])
            if not isinstance(applicable_industries, list):
                applicable_industries = ["all"]
            applicable_company_size = data.get("applicable_company_size", ["all"])
            if not isinstance(applicable_company_size, list):
                applicable_company_size = ["all"]
            scope_conditions = data.get("scope_conditions")

            source_citation = {
                "source": source_name,
                "article": data.get("source_article", article_label),
                "paragraph": data.get("source_paragraph", ""),
                "article_type": article_type,
                "license": license_info["license"],
                "source_type": license_info["source_type"],
            }

            generation_metadata = {
                "processing_path": "phase74_gap_fill",
                "license_rule": license_info["rule"],
                "source_regulation": reg_code,
                "source_article": article_label,
                "gap_fill": True,
            }

            category = str(data.get("category", "")) or None

            # Insert into DB
            try:
                cur.execute("""
                    INSERT INTO compliance.canonical_controls (
                        framework_id, control_id, title, objective, rationale,
                        scope, requirements, test_procedure, evidence,
                        severity, risk_score, implementation_effort,
                        open_anchors, release_state, tags,
                        license_rule, source_original_text, source_citation,
                        customer_visible, generation_metadata,
                        verification_method, category, generation_strategy,
                        target_audience, pipeline_version,
                        applicable_industries, applicable_company_size, scope_conditions
                    ) VALUES (
                        %s, %s, %s, %s, %s,
                        %s, %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s,
                        %s, %s,
                        %s, %s, %s
                    )
                    ON CONFLICT (framework_id, control_id) DO NOTHING
                    RETURNING id
                """, (
                    framework_uuid, control_id, title, objective, rationale,
                    json.dumps({}), json.dumps(requirements), json.dumps(test_procedure), json.dumps(evidence),
                    severity, 5, "m",
                    json.dumps([]), "draft", json.dumps(tags),
                    license_info["rule"], article_text, json.dumps(source_citation),
                    True, json.dumps(generation_metadata),
                    "document", category, "phase74_gap_fill",
                    json.dumps(target_audience), PIPELINE_VERSION,
                    json.dumps(applicable_industries), json.dumps(applicable_company_size),
                    json.dumps(scope_conditions) if scope_conditions else None,
                ))
                conn.commit()
                # psycopg2 buffers the RETURNING result client-side, so
                # fetching after commit is safe; no row means the ON
                # CONFLICT clause suppressed the insert.
                row = cur.fetchone()
                if row:
                    generated_ids.append(str(row[0]))
                    stats["generated"] += 1
                    print(f" OK {control_id}: {title[:60]}")
                else:
                    stats["conflict"] += 1
                    print(f" CONFLICT {control_id} (already exists)")
            except Exception as e:
                conn.rollback()
                stats["db_error"] += 1
                errors.append(f"DB {control_id}: {str(e)[:100]}")
                print(f" DB ERROR {control_id}: {str(e)[:100]}")

            # Rate limit: ~0.5s between calls
            time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed = time.time() - t_start
    # Pricing assumption: $3 / $15 per million input/output tokens.
    cost = (total_input_tokens * 3 + total_output_tokens * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print(f"PHASE 7.4 — {'DRY-RUN' if args.dry_run else 'ERGEBNIS'}")
    print(f"{'='*70}")
    print(f" Laufzeit: {elapsed/60:.1f} min")
    print(f" API-Kosten: ${cost:.2f}")
    print(f" Input Tokens: {total_input_tokens:,}")
    print(f" Output Tokens: {total_output_tokens:,}")
    print()
    for key in sorted(stats.keys()):
        print(f" {key:<25s}: {stats[key]:5d}")
    print()

    if generated_ids:
        print(f" Neue Control-IDs: {len(generated_ids)}")
        # Save generated IDs
        with open("/tmp/phase74_generated_ids.json", 'w') as f:
            json.dump(generated_ids, f)
        print(" IDs gespeichert: /tmp/phase74_generated_ids.json")

    if errors:
        print(f"\n Fehler ({len(errors)}):")
        for e in errors[:20]:
            print(f" {e}")
        if len(errors) > 20:
            print(f" ... und {len(errors)-20} weitere")

    conn.close()


if __name__ == "__main__":
    main()
|
||||
218
scripts/qa/run_job.sh
Executable file
218
scripts/qa/run_job.sh
Executable file
@@ -0,0 +1,218 @@
|
||||
#!/usr/bin/env bash
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
# Robust job runner for QA scripts on Mac Mini
|
||||
#
|
||||
# Usage:
|
||||
# ./run_job.sh <script.py> [args...] # start job
|
||||
# ./run_job.sh --status # show running jobs
|
||||
# ./run_job.sh --kill <script.py> # kill a running job
|
||||
# ./run_job.sh --log <script.py> # tail log
|
||||
#
|
||||
# Features:
|
||||
# - Loads .env automatically (COMPLIANCE_DATABASE_URL → DATABASE_URL)
|
||||
# - PID-file prevents duplicate runs
|
||||
# - Unbuffered Python output
|
||||
# - Structured log files in /tmp/qa_jobs/
|
||||
# ─────────────────────────────────────────────────────────────
|
||||
set -euo pipefail
|
||||
|
||||
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
||||
PROJECT_DIR="$(cd "$SCRIPT_DIR/../.." && pwd)"
|
||||
JOB_DIR="/tmp/qa_jobs"
|
||||
mkdir -p "$JOB_DIR"
|
||||
|
||||
# ── Load .env ────────────────────────────────────────────────
load_env() {
    # Auto-export every variable defined in the project .env, if present.
    local env_path="$PROJECT_DIR/.env"
    if [[ -f "$env_path" ]]; then
        set -a
        # shellcheck disable=SC1090
        source "$env_path"
        set +a
    fi
    # Fall back to COMPLIANCE_DATABASE_URL when DATABASE_URL is unset.
    if [[ -z "${DATABASE_URL:-}" && -n "${COMPLIANCE_DATABASE_URL:-}" ]]; then
        export DATABASE_URL="$COMPLIANCE_DATABASE_URL"
    fi
}
|
||||
|
||||
# ── Job name from script path ─────────────────────────────────
# A job is identified by the script's basename without ".py"; the PID
# and log files derived from it both live under $JOB_DIR.
job_name() {
    basename "$1" .py
}

pid_file() {
    printf '%s/%s.pid\n' "$JOB_DIR" "$(job_name "$1")"
}

log_file() {
    printf '%s/%s.log\n' "$JOB_DIR" "$(job_name "$1")"
}
|
||||
|
||||
# ── Status ────────────────────────────────────────────────────
# Print every job that has a PID file; stale PID files (dead process)
# are reported as STOPPED and removed.
show_status() {
    echo "═══════════════════════════════════════════════════════"
    echo "QA Job Status ($(date '+%Y-%m-%d %H:%M:%S'))"
    echo "═══════════════════════════════════════════════════════"
    local found=0
    for pidfile in "$JOB_DIR"/*.pid; do
        [[ -f "$pidfile" ]] || continue
        found=1
        local name
        name=$(basename "$pidfile" .pid)
        local pid
        pid=$(cat "$pidfile")
        local logf="$JOB_DIR/$name.log"

        if kill -0 "$pid" 2>/dev/null; then
            local lines
            lines=$(wc -l < "$logf" 2>/dev/null) || lines=0
            # BUG FIX: `grep -c` prints "0" AND exits 1 when no line
            # matches, so `$(grep -c ... || echo 0)` produced the
            # two-line value "0<newline>0". Capture first, then default
            # only when the capture is empty (log file missing).
            local errors
            errors=$(grep -c "ERROR" "$logf" 2>/dev/null) || true
            [[ -n "$errors" ]] || errors=0
            local last_line
            last_line=$(tail -1 "$logf" 2>/dev/null || echo "(empty)")
            echo " ● $name (PID $pid) — RUNNING"
            echo " Log: $logf ($lines lines, $errors errors)"
            echo " Last: $last_line"
        else
            echo " ○ $name (PID $pid) — STOPPED"
            echo " Log: $logf"
            rm -f "$pidfile"
        fi
        echo ""
    done
    if [[ $found -eq 0 ]]; then
        echo " No jobs running."
    fi
}
|
||||
|
||||
# ── Kill ──────────────────────────────────────────────────────
# Terminate a running job (if alive) and remove its PID file.
kill_job() {
    local target="$1" job pidf pid
    job=$(job_name "$target")
    pidf=$(pid_file "$target")
    [[ -f "$pidf" ]] || { echo "No PID file for $job"; return 1; }
    pid=$(<"$pidf")
    if kill -0 "$pid" 2>/dev/null; then
        kill "$pid"
        echo "Killed $job (PID $pid)"
    else
        echo "Process $pid already stopped"
    fi
    rm -f "$pidf"
}
|
||||
|
||||
# ── Tail log ──────────────────────────────────────────────────
# Show the last 50 lines of a job's log file.
tail_log() {
    local lf
    lf=$(log_file "$1")
    [[ -f "$lf" ]] || { echo "No log file: $lf"; return 1; }
    tail -50 "$lf"
}
|
||||
|
||||
# ── Start job ─────────────────────────────────────────────────
# Launch a Python script as a background job: resolve its path, refuse
# duplicate runs via the PID file, load .env, start it unbuffered under
# nohup, and verify it survived the first few seconds.
start_job() {
    local script="$1"
    shift
    local args=("$@")

    # Resolve script path (as given, or relative to this directory)
    local script_path="$script"
    if [[ ! -f "$script_path" ]]; then
        script_path="$SCRIPT_DIR/$script"
    fi
    if [[ ! -f "$script_path" ]]; then
        echo "ERROR: Script not found: $script"
        return 1
    fi

    local name
    name=$(job_name "$script")
    local pf
    pf=$(pid_file "$script")
    local lf
    lf=$(log_file "$script")

    # Check for already-running instance
    if [[ -f "$pf" ]]; then
        local existing_pid
        existing_pid=$(cat "$pf")
        if kill -0 "$existing_pid" 2>/dev/null; then
            echo "ERROR: $name already running (PID $existing_pid)"
            echo "Use: $0 --kill $script"
            return 1
        fi
        rm -f "$pf"
    fi

    # Load environment
    load_env

    # Verify required env vars
    if [[ -z "${DATABASE_URL:-}" ]]; then
        echo "ERROR: DATABASE_URL not set (checked .env)"
        return 1
    fi

    # Start
    echo "Starting $name..."
    echo " Script: $script_path"
    echo " Args: ${args[*]:-none}"
    echo " Log: $lf"

    # BUG FIX: under `set -u` on older bash (macOS ships 3.2), expanding
    # an EMPTY array as "${args[@]}" aborts with "unbound variable".
    # The ${args[@]+...} guard expands to nothing when the array is empty.
    nohup python3 -u "$script_path" ${args[@]+"${args[@]}"} > "$lf" 2>&1 &
    local pid=$!
    echo "$pid" > "$pf"

    echo " PID: $pid"
    echo ""

    # Wait a moment and check it started OK
    sleep 3
    if ! kill -0 "$pid" 2>/dev/null; then
        echo "ERROR: Process died immediately. Log output:"
        cat "$lf"
        rm -f "$pf"
        return 1
    fi

    local lines
    lines=$(wc -l < "$lf" 2>/dev/null || echo 0)
    echo "Running OK ($lines log lines so far)"
    echo "Monitor with: $0 --status"
    echo "Tail log: $0 --log $script"
}
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────
# CLI dispatch: option flags run management commands; any other first
# argument is treated as a script path/name and handed to start_job.
case "${1:-}" in
    --status|-s)
        show_status
        ;;
    --kill|-k)
        # A second argument (the script) is mandatory for --kill.
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --kill <script.py>"; exit 1; }
        kill_job "$2"
        ;;
    --log|-l)
        # A second argument (the script) is mandatory for --log.
        [[ -n "${2:-}" ]] || { echo "Usage: $0 --log <script.py>"; exit 1; }
        tail_log "$2"
        ;;
    --help|-h|"")
        # No args also lands here, printing usage.
        echo "Usage:"
        echo " $0 <script.py> [args...] Start a QA job"
        echo " $0 --status Show running jobs"
        echo " $0 --kill <script.py> Kill a running job"
        echo " $0 --log <script.py> Tail job log"
        ;;
    *)
        start_job "$@"
        ;;
esac
|
||||
307
scripts/qa/sync_db.py
Normal file
307
scripts/qa/sync_db.py
Normal file
@@ -0,0 +1,307 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Sync canonical control tables between production and local DB.
|
||||
|
||||
Modes:
|
||||
--pull Production → Local (initial sync, full table copy)
|
||||
--push Local → Production (incremental, only new obligation_candidates)
|
||||
--loop Run --push every N minutes (default 60)
|
||||
|
||||
Usage:
|
||||
python3 sync_db.py --pull # Full sync production → local
|
||||
python3 sync_db.py --push # Push new obligations to production
|
||||
python3 sync_db.py --loop 60 # Push every 60 minutes
|
||||
python3 sync_db.py --pull --tables canonical_controls # Only one table
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
import io
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import psycopg2.extensions
|
||||
|
||||
# Register JSON adapter so dicts are automatically converted to JSONB
|
||||
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)
|
||||
|
||||
# ── DB Config ────────────────────────────────────────────────────────
|
||||
|
||||
PROD_URL = os.environ.get(
|
||||
"PROD_DATABASE_URL",
|
||||
"postgresql://postgres:GmyFD3wnU1NrKBdpU1nwLdE8MLts0A0eez8L5XXdvUCe05lWnWfVp3C6JJ8Yrmt2"
|
||||
"@46.225.100.82:54321/postgres?sslmode=require",
|
||||
)
|
||||
LOCAL_URL = os.environ.get(
|
||||
"LOCAL_DATABASE_URL",
|
||||
"postgresql://breakpilot:breakpilot123@localhost:5432/breakpilot_db",
|
||||
)
|
||||
|
||||
SCHEMA = "compliance"
|
||||
|
||||
# Tables to sync (production → local)
|
||||
SYNC_TABLES = [
|
||||
"canonical_control_frameworks",
|
||||
"canonical_control_licenses",
|
||||
"canonical_control_sources",
|
||||
"canonical_control_categories",
|
||||
"canonical_blocked_sources",
|
||||
"canonical_controls",
|
||||
"canonical_control_mappings",
|
||||
"canonical_processed_chunks",
|
||||
"canonical_generation_jobs",
|
||||
"control_patterns",
|
||||
"crosswalk_matrix",
|
||||
"obligation_extractions",
|
||||
"obligation_candidates",
|
||||
]
|
||||
|
||||
|
||||
def connect(url, label="DB"):
    """Open a psycopg2 connection to *url* with TCP keepalives enabled.

    The URL's query string is consulted for ``sslmode`` (default
    ``prefer``); the search path is pinned to the compliance schema.
    Autocommit is disabled — callers commit explicitly.
    """
    u = urllib.parse.urlparse(url)
    query = dict(urllib.parse.parse_qsl(u.query))
    port = u.port or 5432
    conn = psycopg2.connect(
        host=u.hostname,
        port=port,
        user=u.username,
        password=u.password,
        dbname=u.path.lstrip("/"),
        sslmode=query.get("sslmode", "prefer"),
        options=f"-c search_path={SCHEMA},public",
        keepalives=1,
        keepalives_idle=30,
        keepalives_interval=10,
        keepalives_count=5,
    )
    conn.autocommit = False
    print(f" Connected to {label} ({u.hostname}:{port})")
    return conn
|
||||
|
||||
|
||||
def get_columns(cur, table):
    """Return the column names of *table* (schema ``compliance``) in
    ordinal order.

    Uses a parameterized query — the previous f-string interpolation of
    the table name into SQL was an injection hazard and broke on names
    containing quotes.
    """
    cur.execute(
        """
        SELECT column_name FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    return [r[0] for r in cur.fetchall()]
|
||||
|
||||
|
||||
def pull_table(prod_conn, local_conn, table):
    """Copy an entire table from production to local via SELECT + INSERT.

    The local table is dropped and recreated with a simplified schema
    (no constraints, no defaults — data only), then all production rows
    are bulk-inserted. Returns the number of rows copied, or 0 when the
    table does not exist on production.
    """
    prod_cur = prod_conn.cursor()
    local_cur = local_conn.cursor()

    # Check table exists on production (parameterized query)
    prod_cur.execute(
        "SELECT 1 FROM pg_tables WHERE schemaname = %s AND tablename = %s",
        (SCHEMA, table),
    )
    if not prod_cur.fetchone():
        print(f" SKIP {table} — not found on production")
        return 0

    # Drop local table (identifiers come from the fixed SYNC_TABLES list)
    local_cur.execute(f"DROP TABLE IF EXISTS {SCHEMA}.{table} CASCADE")
    local_conn.commit()

    # Build simple CREATE TABLE (no constraints, no defaults — just for data)
    prod_cur.execute(
        """
        SELECT column_name, data_type, udt_name, character_maximum_length
        FROM information_schema.columns
        WHERE table_schema = %s AND table_name = %s
        ORDER BY ordinal_position
        """,
        (SCHEMA, table),
    )
    col_defs = prod_cur.fetchall()

    parts = []
    col_names = []
    jsonb_cols = set()
    for name, dtype, udt, max_len in col_defs:
        col_names.append(name)
        if dtype == "ARRAY":
            # information_schema reports array element types as "_<base>"
            type_map = {
                "_text": "text[]", "_varchar": "varchar[]",
                "_int4": "integer[]", "_uuid": "uuid[]",
                "_jsonb": "jsonb[]", "_float8": "float8[]",
            }
            sql_type = type_map.get(udt, f"{udt.lstrip('_')}[]")
        elif dtype == "jsonb" or (dtype == "USER-DEFINED" and udt == "jsonb"):
            sql_type = "jsonb"
            jsonb_cols.add(name)
        elif dtype == "USER-DEFINED":
            sql_type = udt
        elif max_len:
            # BUG FIX: was hard-coded "(73,254)" — an invalid modifier for
            # most types and wrong for all. Use the column's real length.
            sql_type = f"{dtype}({max_len})"
        else:
            sql_type = dtype
        parts.append(f'"{name}" {sql_type}')

    ddl = f"CREATE TABLE {SCHEMA}.{table} ({', '.join(parts)})"
    local_cur.execute(ddl)
    local_conn.commit()

    # Fetch all rows from production
    col_list = ", ".join(f'"{c}"' for c in col_names)
    prod_cur.execute(f"SELECT {col_list} FROM {SCHEMA}.{table}")
    rows = prod_cur.fetchall()

    if rows:
        # Wrap dict/list values in Json so psycopg2 can adapt JSONB columns
        adapted_rows = []
        for row in rows:
            adapted = []
            for i, val in enumerate(row):
                if col_names[i] in jsonb_cols and isinstance(val, (dict, list)):
                    adapted.append(psycopg2.extras.Json(val))
                else:
                    adapted.append(val)
            adapted_rows.append(tuple(adapted))

        placeholders = ", ".join(["%s"] * len(col_names))
        insert_sql = f'INSERT INTO {SCHEMA}.{table} ({col_list}) VALUES ({placeholders})'
        psycopg2.extras.execute_batch(local_cur, insert_sql, adapted_rows, page_size=500)
        local_conn.commit()

    print(f" {table}: {len(rows)} rows")
    return len(rows)
|
||||
|
||||
|
||||
def pull(tables=None):
    """Full sync: production → local."""
    print("\n=== PULL: Production → Local ===\n")

    prod_conn = connect(PROD_URL, "Production")
    local_conn = connect(LOCAL_URL, "Local")

    # Make sure the target schema exists locally before copying
    local_cur = local_conn.cursor()
    local_cur.execute(f"CREATE SCHEMA IF NOT EXISTS {SCHEMA}")
    local_conn.commit()

    total = 0
    for table in (tables or SYNC_TABLES):
        try:
            total += pull_table(prod_conn, local_conn, table)
        except Exception as exc:
            print(f" ERROR {table}: {exc}")
            # Roll back both sides so the next table starts clean
            local_conn.rollback()
            prod_conn.rollback()

    print(f"\n Total: {total} rows synced")
    prod_conn.close()
    local_conn.close()
|
||||
|
||||
|
||||
def push():
    """Incremental push: new obligation_candidates local → production.

    Uses ``candidate_id`` as the unique key: IDs present locally but not
    on production are copied over with ``ON CONFLICT DO NOTHING``.
    Returns the number of rows pushed.
    """
    print(f"\n=== PUSH: Local → Production ({time.strftime('%H:%M:%S')}) ===\n")

    local_conn = connect(LOCAL_URL, "Local")
    prod_conn = connect(PROD_URL, "Production")

    local_cur = local_conn.cursor()
    prod_cur = prod_conn.cursor()

    # Find obligation_candidates in local that don't exist in production
    local_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    local_ids = {r[0] for r in local_cur.fetchall()}

    if not local_ids:
        print(" No obligation_candidates in local DB")
        local_conn.close()
        prod_conn.close()
        return 0

    # Check which already exist on production
    prod_cur.execute(f"""
        SELECT candidate_id FROM {SCHEMA}.obligation_candidates
    """)
    prod_ids = {r[0] for r in prod_cur.fetchall()}

    new_ids = local_ids - prod_ids
    if not new_ids:
        print(f" All {len(local_ids)} obligations already on production")
        local_conn.close()
        prod_conn.close()
        return 0

    print(f" {len(new_ids)} new obligations to push (local: {len(local_ids)}, prod: {len(prod_ids)})")

    # Get columns (quoted to survive reserved words / mixed case)
    columns = get_columns(local_cur, "obligation_candidates")
    col_list = ", ".join(f'"{c}"' for c in columns)
    placeholders = ", ".join(["%s"] * len(columns))

    # Fetch new rows from local.
    # BUG FIX: IDs were previously spliced into the SQL text via
    # f"'{id}'" string interpolation — broken quoting / injection risk.
    # Pass them as a single array parameter instead.
    local_cur.execute(
        f"""
        SELECT {col_list} FROM {SCHEMA}.obligation_candidates
        WHERE candidate_id = ANY(%s)
        """,
        (list(new_ids),),
    )
    rows = local_cur.fetchall()

    # Insert into production
    insert_sql = f"INSERT INTO {SCHEMA}.obligation_candidates ({col_list}) VALUES ({placeholders}) ON CONFLICT DO NOTHING"
    psycopg2.extras.execute_batch(prod_cur, insert_sql, rows, page_size=100)
    prod_conn.commit()

    print(f" Pushed {len(rows)} obligations to production")

    local_conn.close()
    prod_conn.close()
    return len(rows)
|
||||
|
||||
|
||||
def loop(interval_min):
    """Run push every N minutes."""
    print(f"\n=== SYNC LOOP — Push every {interval_min} min ===")
    print(f" Started at {time.strftime('%Y-%m-%d %H:%M:%S')}")
    print(" Press Ctrl+C to stop\n")

    while True:
        try:
            if push():
                print(f" Next sync in {interval_min} min...")
        except Exception as exc:
            # Keep the loop alive across transient DB/network failures
            print(f" SYNC ERROR: {exc}")
        time.sleep(interval_min * 60)
|
||||
|
||||
|
||||
def main():
    """CLI entry point: dispatch --pull / --push / --loop modes."""
    parser = argparse.ArgumentParser(description="Sync canonical control tables")
    parser.add_argument("--pull", action="store_true", help="Production → Local (full copy)")
    parser.add_argument("--push", action="store_true", help="Local → Production (new obligations)")
    parser.add_argument("--loop", type=int, metavar="MIN", help="Push every N minutes")
    parser.add_argument("--tables", nargs="+", help="Only sync specific tables (with --pull)")
    args = parser.parse_args()

    # No mode selected → just show help
    if not (args.pull or args.push or args.loop):
        parser.print_help()
        return

    if args.pull:
        pull(args.tables)
    if args.push:
        push()
    if args.loop:
        loop(args.loop)


if __name__ == "__main__":
    main()
|
||||
470
scripts/qa/test_pass0a.py
Normal file
470
scripts/qa/test_pass0a.py
Normal file
@@ -0,0 +1,470 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Test Pass 0a (Obligation Extraction) on 5-10 controls.
|
||||
|
||||
Standalone script — no SQLAlchemy dependency. Uses psycopg2 + requests.
|
||||
Copies prompts and quality gate from decomposition_pass.py.
|
||||
|
||||
Usage:
|
||||
python3 test_pass0a.py # 10 controls, Anthropic
|
||||
python3 test_pass0a.py --limit 5 # 5 controls
|
||||
python3 test_pass0a.py --source "DSGVO" # filter by source
|
||||
python3 test_pass0a.py --dry-run # show controls, no LLM call
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
|
||||
import psycopg2
|
||||
import requests
|
||||
|
||||
# ── Config ────────────────────────────────────────────────────────────
|
||||
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
|
||||
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")
|
||||
ANTHROPIC_API_URL = "https://api.anthropic.com/v1"
|
||||
|
||||
# ── Prompts (from decomposition_pass.py) ──────────────────────────────
|
||||
|
||||
SYSTEM_PROMPT = """\
|
||||
Du bist ein Rechts-Compliance-Experte. Du zerlegst Compliance-Controls \
|
||||
in einzelne atomare Pflichten.
|
||||
|
||||
REGELN (STRIKT EINHALTEN):
|
||||
1. Nur normative Aussagen extrahieren — erkennbar an: müssen, haben \
|
||||
sicherzustellen, sind verpflichtet, ist zu dokumentieren, ist zu melden, \
|
||||
ist zu testen, shall, must, required.
|
||||
2. Jede Pflicht hat genau EIN Hauptverb / eine Handlung.
|
||||
3. Testpflichten SEPARAT von operativen Pflichten (is_test_obligation=true).
|
||||
4. Meldepflichten SEPARAT (is_reporting_obligation=true).
|
||||
5. NICHT auf Evidence-Ebene zerlegen (z.B. "DR-Plan vorhanden" ist KEIN \
|
||||
eigenes Control, sondern Evidence).
|
||||
6. Begründungen, Erläuterungen und Erwägungsgründe sind KEINE Pflichten \
|
||||
— NICHT extrahieren.
|
||||
|
||||
Antworte NUR mit einem JSON-Array. Keine Erklärungen."""
|
||||
|
||||
|
||||
def build_prompt(title, objective, requirements, test_procedure, source_ref):
    """Build the German user prompt for Pass 0a obligation extraction.

    Interpolates the control's fields into a fixed template that asks the
    model to return obligations as a JSON array (schema shown inline).
    The template text is part of the model contract — do not translate.
    """
    return f"""\
Analysiere das folgende Control und extrahiere alle einzelnen normativen \
Pflichten als JSON-Array.

CONTROL:
Titel: {title}
Ziel: {objective}
Anforderungen: {requirements}
Prüfverfahren: {test_procedure}
Quellreferenz: {source_ref}

Antworte als JSON-Array:
[
  {{
    "obligation_text": "Kurze, präzise Formulierung der Pflicht",
    "action": "Hauptverb/Handlung",
    "object": "Gegenstand der Pflicht",
    "condition": "Auslöser/Bedingung oder null",
    "normative_strength": "must",
    "is_test_obligation": false,
    "is_reporting_obligation": false
  }}
]"""
|
||||
|
||||
|
||||
# ── Quality Gate — 3-Tier Classification (from decomposition_pass.py) ──
|
||||
|
||||
# Tier 1: Pflicht (mandatory)
|
||||
_PFLICHT_RE = re.compile(
|
||||
r"\bmüssen\b|\bmuss\b|\bhat\s+sicherzustellen\b|\bhaben\s+sicherzustellen\b"
|
||||
r"|\bsind\s+verpflichtet\b|\bist\s+verpflichtet\b"
|
||||
r"|\bist\s+zu\s+\w+en\b|\bsind\s+zu\s+\w+en\b"
|
||||
r"|\bhat\s+zu\s+\w+en\b|\bhaben\s+zu\s+\w+en\b"
|
||||
r"|\bist\s+\w+zu\w+en\b|\bsind\s+\w+zu\w+en\b"
|
||||
r"|\bist\s+\w+\s+zu\s+\w+en\b|\bsind\s+\w+\s+zu\s+\w+en\b"
|
||||
r"|\bhat\s+\w+\s+zu\s+\w+en\b|\bhaben\s+\w+\s+zu\s+\w+en\b"
|
||||
r"|\bshall\b|\bmust\b|\brequired\b"
|
||||
r"|\b\w+zuteilen\b|\b\w+zuwenden\b|\b\w+zustellen\b|\b\w+zulegen\b"
|
||||
r"|\b\w+zunehmen\b|\b\w+zuführen\b|\b\w+zuhalten\b|\b\w+zusetzen\b"
|
||||
r"|\b\w+zuweisen\b|\b\w+zuordnen\b|\b\w+zufügen\b|\b\w+zugeben\b"
|
||||
r"|\bist\b.{1,80}\bzu\s+\w+en\b|\bsind\b.{1,80}\bzu\s+\w+en\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# Tier 2: Empfehlung (recommendation)
|
||||
_EMPFEHLUNG_RE = re.compile(
|
||||
r"\bsoll\b|\bsollen\b|\bsollte\b|\bsollten\b"
|
||||
r"|\bgewährleisten\b|\bsicherstellen\b"
|
||||
r"|\bshould\b|\bensure\b|\brecommend\w*\b"
|
||||
r"|\bnachweisen\b|\beinhalten\b|\bunterlassen\b|\bwahren\b"
|
||||
r"|\bdokumentieren\b|\bimplementieren\b|\büberprüfen\b|\büberwachen\b"
|
||||
r"|\bprüfen,\s+ob\b|\bkontrollieren,\s+ob\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# Tier 3: Kann (optional/permissive)
|
||||
_KANN_RE = re.compile(
|
||||
r"\bkann\b|\bkönnen\b|\bdarf\b|\bdürfen\b|\bmay\b|\boptional\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# Union (backward compat)
|
||||
_NORMATIVE_RE = re.compile(
|
||||
_PFLICHT_RE.pattern + "|" + _EMPFEHLUNG_RE.pattern + "|" + _KANN_RE.pattern,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_RATIONALE_RE = re.compile(
|
||||
r"\bda\s+|\bweil\b|\bgrund\b|\berwägung|\bbecause\b|\breason\b|\brationale\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_TEST_RE = re.compile(
|
||||
r"\btesten\b|\btest\b|\bprüfung\b|\bprüfen\b|\bgetestet\b|\bwirksamkeit\b"
|
||||
r"|\baudit\b|\bregelmäßig\b.*\b(prüf|test|kontroll)|\beffectiveness\b|\bverif",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
_REPORTING_RE = re.compile(
|
||||
r"\bmelden\b|\bmeldung\b|\bunterricht|\binformieren\b|\bbenachricht"
|
||||
r"|\bnotif|\breport\b|\bbehörd",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
|
||||
|
||||
def classify_obligation_type(txt):
    """Classify text into an obligation tier.

    Precedence is pflicht > empfehlung > kann; text matching none of the
    tier patterns defaults to "empfehlung".
    """
    for pattern, tier in (
        (_PFLICHT_RE, "pflicht"),
        (_EMPFEHLUNG_RE, "empfehlung"),
        (_KANN_RE, "kann"),
    ):
        if pattern.search(txt):
            return tier
    return "empfehlung"
|
||||
|
||||
|
||||
def quality_gate(obl_text, parent_uuid):
    """Validate and classify a single obligation candidate.

    Runs heuristic checks over *obl_text*, classifies its obligation tier,
    and derives a weighted confidence score.

    Returns:
        (flags_dict, passed_bool, confidence, obligation_type)
    """
    stripped = obl_text.strip()
    tier = classify_obligation_type(obl_text)

    # Compound wording ("... und ... müssen ...") suggests more than one action.
    compound_re = re.compile(
        r"\b(und|sowie|als auch)\b.*\b(müssen|sicherstellen|implementieren"
        r"|dokumentieren|melden|testen|prüfen|überwachen|gewährleisten)\b",
        re.IGNORECASE,
    )
    # Text that opens by naming an artifact is an evidence description,
    # not an obligation.
    artifact_re = re.compile(
        r"^(Nachweis|Dokumentation|Screenshot|Protokoll|Bericht|Zertifikat)",
        re.IGNORECASE,
    )

    flags = {
        "has_normative_signal": bool(_NORMATIVE_RE.search(obl_text)),
        "obligation_type": tier,
        "single_action": compound_re.search(obl_text) is None,
        "not_rationale": (
            len(_NORMATIVE_RE.findall(obl_text)) >= len(_RATIONALE_RE.findall(obl_text))
        ),
        "not_evidence_only": artifact_re.match(stripped) is None,
        "min_length": len(stripped) >= 20,
        "has_parent_link": bool(parent_uuid),
    }

    # Weighted confidence over the boolean checks that came out true.
    weights = {
        "has_normative_signal": 0.25, "single_action": 0.20,
        "not_rationale": 0.20, "not_evidence_only": 0.15,
        "min_length": 0.10, "has_parent_link": 0.05,
    }
    confidence = 0
    for name, ok in flags.items():
        if ok and name in weights:
            confidence += weights[name]
    # Small bonus for a hard obligation, capped at 1.0.
    if tier == "pflicht":
        confidence = min(confidence + 0.05, 1.0)

    # Only hard structural failures reject a candidate; a missing normative
    # signal is informational and no longer critical.
    passed = all(
        flags.get(name, False)
        for name in ("not_evidence_only", "min_length", "has_parent_link")
    )

    return flags, passed, confidence, tier
|
||||
|
||||
|
||||
# ── JSON parsing ──────────────────────────────────────────────────────
|
||||
|
||||
def parse_json_array(text):
    """Parse *text* as a JSON array, tolerating surrounding noise.

    A top-level JSON object is wrapped in a one-element list.  If the full
    string is not valid JSON, the outermost bracketed span is retried.
    Returns [] when nothing usable is found.
    """
    def _try_load(candidate):
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None

    parsed = _try_load(text)
    if isinstance(parsed, list):
        return parsed
    if isinstance(parsed, dict):
        return [parsed]

    bracketed = re.search(r"\[[\s\S]*\]", text)
    if bracketed is not None:
        parsed = _try_load(bracketed.group())
        if isinstance(parsed, list):
            return parsed
    return []
|
||||
|
||||
|
||||
# ── API call ──────────────────────────────────────────────────────────
|
||||
|
||||
def call_anthropic(prompt):
    """POST *prompt* to the Anthropic messages endpoint.

    Returns (text, usage, error): on success error is None; on a non-200
    response text is None and error describes the failure.  The system
    prompt carries an ephemeral cache_control marker so repeated calls can
    reuse the cached prefix.
    """
    request_headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    body = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 8192,
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post(f"{ANTHROPIC_API_URL}/messages", headers=request_headers, json=body, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    blocks = data.get("content", [])
    # An empty content list yields an empty reply rather than an IndexError.
    reply_text = blocks[0].get("text", "") if blocks else ""
    return reply_text, data.get("usage", {}), None
|
||||
|
||||
|
||||
# ── Format helpers ────────────────────────────────────────────────────
|
||||
|
||||
def fmt_json(val):
    """Render a JSON-ish value as display text.

    None becomes "", JSON-encoded strings are decoded first, lists become a
    bulleted multi-line string, and anything else is str()-ified.  Strings
    that are not valid JSON are returned unchanged.
    """
    if val is None:
        return ""
    decoded = val
    if isinstance(decoded, str):
        try:
            decoded = json.loads(decoded)
        except (json.JSONDecodeError, TypeError):
            return decoded
    if not isinstance(decoded, list):
        return str(decoded)
    return "\n".join(f" - {item}" for item in decoded)
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────
|
||||
|
||||
def main():
    """Sample canonical controls, extract obligations via the LLM, and report.

    Workflow: select up to --limit released root controls from Postgres,
    send each to the Anthropic API (unless --dry-run), parse the returned
    obligation list, classify each obligation with quality_gate(), print a
    per-control report plus a cost summary, and dump all results as JSON
    to /tmp for later analysis.
    """
    parser = argparse.ArgumentParser(description="Test Pass 0a on small sample")
    parser.add_argument("--limit", type=int, default=10)
    parser.add_argument("--source", type=str)
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    # DATABASE_URL is mandatory — failing with KeyError here is intended.
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()

    # Select a diverse sample: released root controls with substantive text.
    query = """
        SELECT id, control_id, title, objective, requirements,
               test_procedure, source_citation, category
        FROM compliance.canonical_controls
        WHERE release_state NOT IN ('deprecated', 'duplicate', 'too_close')
          AND parent_control_uuid IS NULL
          AND title IS NOT NULL AND objective IS NOT NULL
          AND length(coalesce(objective,'') || coalesce(requirements::text,'')) > 100
    """
    params = []
    if args.source:
        query += " AND source_citation->>'source' ILIKE %s"
        params.append(f"%{args.source}%")

    query += " ORDER BY source_citation->>'source', random()"
    # Fix: bind LIMIT as a parameter instead of f-string interpolation
    # (argparse guarantees an int, but parameterized SQL is the safe style).
    query += " LIMIT %s"
    params.append(args.limit)

    cur.execute(query, params)
    controls = cur.fetchall()

    if not controls:
        print("No controls found.")
        return

    print(f"{'='*70}")
    print(f"Pass 0a Test — {len(controls)} Controls")
    print(f"Model: {ANTHROPIC_MODEL}")
    print(f"{'='*70}")

    total_in = total_out = total_obls = 0
    type_counts = {"pflicht": 0, "empfehlung": 0, "kann": 0}
    total_rejected = 0  # only evidence-only / too-short / no-parent
    all_results = []
    t_start = time.time()

    for i, row in enumerate(controls, 1):
        ctrl_uuid, ctrl_id, title, objective, reqs, test_proc, src_cit, category = row

        req_str = fmt_json(reqs)
        test_str = fmt_json(test_proc)
        source_str = ""
        if src_cit:
            # source_citation may arrive as a dict (jsonb) or a JSON string.
            sc = src_cit if isinstance(src_cit, dict) else json.loads(src_cit)
            source_str = f"{sc.get('source', '')} {sc.get('article', '')}"

        print(f"\n{'─'*70}")
        print(f"[{i}/{len(controls)}] {ctrl_id}: {title}")
        print(f" Source: {source_str} | Category: {category or 'N/A'}")
        print(f" Objective: {(objective or '')[:200]}")

        if args.dry_run:
            print(" [DRY RUN]")
            continue

        prompt = build_prompt(title or "", objective or "", req_str, test_str, source_str)

        t0 = time.time()
        response_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0

        if error:
            print(f" ERROR: {error}")
            continue

        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        cached = usage.get("cache_read_input_tokens", 0)
        total_in += in_tok
        total_out += out_tok

        obligations = parse_json_array(response_text)
        total_obls += len(obligations)

        print(f" API: {elapsed:.1f}s | {in_tok} in / {out_tok} out"
              f"{f' ({cached} cached)' if cached else ''}"
              f" | {len(obligations)} obligation(s)")

        for j, obl in enumerate(obligations, 1):
            obl_text = obl.get("obligation_text", "")
            action = obl.get("action", "")
            obj = obl.get("object", "")
            condition = obl.get("condition")
            strength = obl.get("normative_strength", "must")
            is_test = bool(obl.get("is_test_obligation", False))
            is_report = bool(obl.get("is_reporting_obligation", False))

            # Heuristic fallback when the LLM missed the test/reporting flags.
            if not is_test and _TEST_RE.search(obl_text):
                is_test = True
            if not is_report and _REPORTING_RE.search(obl_text):
                is_report = True

            flags, passed, conf, obl_type = quality_gate(obl_text, str(ctrl_uuid))
            if passed:
                type_counts[obl_type] = type_counts.get(obl_type, 0) + 1
            else:
                total_rejected += 1

            tag = ""
            if is_test:
                tag = " [TEST]"
            elif is_report:
                tag = " [MELDEPFLICHT]"

            # Display the classified tier instead of a bare PASS/REJECT.
            type_label = {"pflicht": "PFLICHT", "empfehlung": "EMPFEHLUNG", "kann": "KANN"}
            if not passed:
                status = "REJECT"
            else:
                status = type_label.get(obl_type, "EMPFEHLUNG")

            failed = [k for k, v in flags.items()
                      if isinstance(v, bool) and not v]

            print(f"\n {j}. [{status}] conf={conf:.0%}{tag} strength={strength}")
            print(f" {obl_text}")
            print(f" Handlung: {action} | Gegenstand: {obj}")
            if condition:
                print(f" Bedingung: {condition}")
            if not passed:
                print(f" Abgelehnt: {', '.join(failed)}")

            all_results.append({
                "control_id": ctrl_id,
                "obligation_text": obl_text,
                "obligation_type": obl_type if passed else "rejected",
                "action": action,
                "object": obj,
                "condition": condition,
                "confidence": round(conf, 2),
                "is_test": is_test,
                "is_reporting": is_report,
                "passed": passed,
                "flags": dict(flags),
            })

        # Gentle rate limiting between API calls.
        time.sleep(0.5)

    # ── Summary ──────────────────────────────────────────────────────
    elapsed_total = time.time() - t_start
    # Pricing constants: $3 / 1M input tokens, $15 / 1M output tokens.
    cost = (total_in * 3 + total_out * 15) / 1_000_000

    print(f"\n\n{'='*70}")
    print("ZUSAMMENFASSUNG — 3-Tier-Klassifizierung")
    print(f"{'='*70}")
    print(f" Controls: {len(controls)}")
    print(f" Obligations: {total_obls} ({total_obls/max(len(controls),1):.1f} pro Control)")
    print(" ── Klassifizierung ──")
    print(f" Pflicht: {type_counts['pflicht']}"
          f" ({type_counts['pflicht']*100/max(total_obls,1):.0f}%)")
    print(f" Empfehlung: {type_counts['empfehlung']}"
          f" ({type_counts['empfehlung']*100/max(total_obls,1):.0f}%)")
    print(f" Kann: {type_counts['kann']}"
          f" ({type_counts['kann']*100/max(total_obls,1):.0f}%)")
    print(f" Rejected: {total_rejected}"
          f" ({total_rejected*100/max(total_obls,1):.0f}%)"
          f" (nur evidence-only/zu kurz/kein parent)")
    print(" ── Kosten ──")
    print(f" Laufzeit: {elapsed_total:.1f}s")
    print(f" Tokens: {total_in:,} in / {total_out:,} out")
    print(f" Kosten: ${cost:.4f}")

    # Extrapolate the sample's cost/volume to a full corpus of 6,000 controls.
    if len(controls) > 0 and not args.dry_run and total_obls > 0:
        n = 6000
        factor = n / len(controls)
        print(f"\n --- Hochrechnung auf {n:,} Controls ---")
        print(f" Tokens: {int(total_in * factor):,} in / {int(total_out * factor):,} out")
        print(f" Kosten: ${cost * factor:.2f}")
        print(f" Laufzeit: {elapsed_total * factor / 3600:.1f}h")
        print(f" Obligations: ~{int(total_obls / len(controls) * n):,}")
        pf = int(type_counts['pflicht'] * factor)
        ef = int(type_counts['empfehlung'] * factor)
        kf = int(type_counts['kann'] * factor)
        print(f" Pflicht: ~{pf:,}")
        print(f" Empfehlung: ~{ef:,}")
        print(f" Kann: ~{kf:,}")

    # Save results JSON for later analysis (consumed by the Pass 0b preview).
    if all_results:
        out_path = f"/tmp/pass0a_results_{len(controls)}controls.json"
        with open(out_path, "w") as f:
            json.dump(all_results, f, ensure_ascii=False, indent=2)
        print(f"\n Ergebnisse gespeichert: {out_path}")

    cur.close()
    conn.close()
|
||||
|
||||
|
||||
# Allow importing this module without side effects; run only as a script.
if __name__ == "__main__":
    main()
|
||||
308
scripts/qa/test_pass0b_preview.py
Normal file
308
scripts/qa/test_pass0b_preview.py
Normal file
@@ -0,0 +1,308 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Preview Pass 0b: Turn obligation candidates into atomic controls.
|
||||
|
||||
Picks a few obligations from Pass 0a results, calls LLM to compose
|
||||
atomic controls, and writes them to canonical_controls with parent_control_uuid.
|
||||
|
||||
Usage:
|
||||
python3 test_pass0b_preview.py --input /tmp/pass0a_results_60controls.json --limit 3
|
||||
"""
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
import urllib.parse
|
||||
|
||||
import psycopg2
|
||||
import psycopg2.extras
|
||||
import requests
|
||||
|
||||
# psycopg2 does not adapt plain dicts by default; register the Json adapter
# so dict parameters are serialized as JSON automatically.
psycopg2.extensions.register_adapter(dict, psycopg2.extras.Json)

# API key is required unless --dry-run is given; the model can be overridden
# via the DECOMPOSITION_LLM_MODEL environment variable.
ANTHROPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.environ.get("DECOMPOSITION_LLM_MODEL", "claude-sonnet-4-6")

# Static system prompt (German; sent with prompt caching by call_anthropic).
SYSTEM_PROMPT = """\
Du bist ein Security-Compliance-Experte. Du erstellst aus einer einzelnen \
normativen Pflicht ein praxisorientiertes, atomares Security Control.

Das Control muss UMSETZBAR sein — keine Gesetzesparaphrase.
Antworte NUR als JSON. Keine Erklärungen."""
|
||||
|
||||
|
||||
def build_pass0b_prompt(obl_text, action, obj, parent_title, category, source_ref):
    """Build the German user prompt for Pass 0b control composition.

    Asks the LLM to turn one obligation (text + action + object) into a
    single atomic control, given context from the originating control.
    The reply schema is spelled out literally in the prompt; the {{ }}
    pairs escape literal braces inside the f-string.

    Args:
        obl_text: The normative obligation sentence.
        action: Extracted action phrase.
        obj: Extracted object of the action.
        parent_title: Title of the originating control.
        category: Category of the originating control.
        source_ref: Source citation string for the originating control.

    Returns:
        The formatted prompt string.
    """
    return f"""\
Erstelle aus der folgenden Pflicht ein atomares Control.

PFLICHT: {obl_text}
HANDLUNG: {action}
GEGENSTAND: {obj}

KONTEXT (Ursprungs-Control):
Titel: {parent_title}
Kategorie: {category}
Quellreferenz: {source_ref}

Antworte als JSON:
{{
"title": "Kurzer Titel (max 80 Zeichen, deutsch)",
"objective": "Was muss erreicht werden? (1-2 Sätze)",
"requirements": ["Konkrete Anforderung 1", "Anforderung 2"],
"test_procedure": ["Prüfschritt 1", "Prüfschritt 2"],
"evidence": ["Nachweis 1", "Nachweis 2"],
"severity": "critical|high|medium|low",
"category": "security|privacy|governance|operations|finance|reporting"
}}"""
|
||||
|
||||
|
||||
def call_anthropic(prompt):
    """POST *prompt* to the Anthropic messages API.

    Returns (text, usage, error): error is None on success; on an HTTP
    failure text is None and error holds a short description.

    Fix: an empty "content" array in the response used to raise IndexError
    via data.get("content", [{}])[0]; it now yields an empty text, matching
    the Pass 0a script's guard.
    """
    headers = {
        "x-api-key": ANTHROPIC_API_KEY,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": ANTHROPIC_MODEL,
        "max_tokens": 4096,
        # cache_control lets the API reuse the static system prompt across calls.
        "system": [{"type": "text", "text": SYSTEM_PROMPT, "cache_control": {"type": "ephemeral"}}],
        "messages": [{"role": "user", "content": prompt}],
    }
    resp = requests.post("https://api.anthropic.com/v1/messages", headers=headers, json=payload, timeout=120)
    if resp.status_code != 200:
        return None, {}, f"HTTP {resp.status_code}: {resp.text[:200]}"
    data = resp.json()
    content = data.get("content", [])
    text = content[0].get("text", "") if content else ""
    return text, data.get("usage", {}), None
|
||||
|
||||
|
||||
def parse_json_object(text):
    """Extract a JSON object from an LLM reply.

    Tries the whole string first, then falls back to the first-to-last
    brace span (LLMs often wrap JSON in prose).  Returns the parsed dict,
    or None when no JSON object can be recovered.

    Fix: a reply that parses to a non-dict JSON value (e.g. "42" or a bare
    list) used to be returned as-is, crashing callers that invoke .get()
    on it; such inputs now fall through to the brace-span fallback and
    yield None if no object is found.
    """
    candidates = [text]
    match = re.search(r"\{[\s\S]*\}", text)
    if match:
        candidates.append(match.group())
    for candidate in candidates:
        try:
            result = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(result, dict):
            return result
    return None
|
||||
|
||||
|
||||
def generate_control_id(domain, cur):
    """Allocate the next sequential control_id for *domain*.

    Uses up to the first four characters of *domain*, upper-cased, as the
    prefix and scans existing "<PREFIX>-<number>" ids for the highest
    numeric suffix.

    Fix: the incremented id is now zero-padded to three digits
    ("DSGV-008"), matching the "PREFIX-001" format used when no prior id
    exists; previously this branch produced unpadded ids like "DSGV-8".

    Args:
        domain: Domain name whose prefix seeds the id.
        cur: Open database cursor (execute/fetchone).

    Returns:
        The next free control id, e.g. "DSGV-008".
    """
    prefix = domain.upper()[:4]
    cur.execute("""
        SELECT MAX(CAST(SPLIT_PART(control_id, '-', 2) AS INTEGER))
        FROM compliance.canonical_controls
        WHERE control_id LIKE %s
          AND SPLIT_PART(control_id, '-', 2) ~ '^[0-9]+$'
    """, (f"{prefix}-%",))
    row = cur.fetchone()
    next_num = row[0] + 1 if row and row[0] is not None else 1
    return f"{prefix}-{next_num:03d}"
|
||||
|
||||
|
||||
def main():
    """Preview Pass 0b end-to-end on a handful of obligations.

    Reads Pass 0a results from --input, picks up to --limit passed
    obligations (preferring one of each obligation type), asks the LLM to
    compose an atomic control for each, prints it for review, and inserts
    it into canonical_controls as a draft linked to its parent control.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--input", default="/tmp/pass0a_results_60controls.json")
    parser.add_argument("--limit", type=int, default=3, help="Number of obligations to process")
    parser.add_argument("--control", type=str, help="Pick obligations from this control_id")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    if not ANTHROPIC_API_KEY and not args.dry_run:
        print("ERROR: Set ANTHROPIC_API_KEY")
        sys.exit(1)

    # Load the Pass 0a output (a list of obligation dicts).
    with open(args.input) as f:
        obligations = json.load(f)

    # Keep only obligations that passed the Pass 0a quality gate.
    obligations = [o for o in obligations if o.get("passed", False)]

    if args.control:
        obligations = [o for o in obligations if o["control_id"] == args.control]

    # Pick a diverse sample: first one obligation per type ...
    picked = []
    seen_types = set()
    for o in obligations:
        otype = o["obligation_type"]
        if otype not in seen_types and len(picked) < args.limit:
            picked.append(o)
            seen_types.add(otype)
    # ... then top up to --limit with whatever remains.
    for o in obligations:
        if o not in picked and len(picked) < args.limit:
            picked.append(o)

    if not picked:
        print("No obligations found.")
        return

    # Connect to Postgres; search_path puts the compliance schema first.
    db_url = os.environ["DATABASE_URL"]
    p = urllib.parse.urlparse(db_url)
    conn = psycopg2.connect(
        host=p.hostname, port=p.port or 5432,
        user=p.username, password=p.password,
        dbname=p.path.lstrip("/"),
        options="-c search_path=compliance,public",
    )
    cur = conn.cursor()

    # Fetch the parent controls so generated controls can reference them.
    ctrl_ids = list(set(o["control_id"] for o in picked))
    cur.execute("""
        SELECT control_id, id, title, category, source_citation
        FROM compliance.canonical_controls
        WHERE control_id = ANY(%s)
    """, (ctrl_ids,))
    ctrl_map = {}
    for row in cur.fetchall():
        # source_citation may be a jsonb dict, a JSON string, or NULL.
        sc = row[4] if isinstance(row[4], dict) else (json.loads(row[4]) if row[4] else {})
        # Derive domain prefix from control_id (e.g. "DSGV" from "DSGV-001")
        prefix = row[0].split("-")[0] if "-" in row[0] else "COMP"
        ctrl_map[row[0]] = {
            "uuid": str(row[1]), "title": row[2], "category": row[3] or "",
            "source_ref": f"{sc.get('source', '')} {sc.get('article', '')}",
            "domain": prefix,
        }

    print("=" * 70)
    print(f"Pass 0b Preview — {len(picked)} Obligations → Atomic Controls")
    print("=" * 70)

    created = []
    for i, obl in enumerate(picked, 1):
        ctrl = ctrl_map.get(obl["control_id"], {})
        print(f"\n{'─'*70}")
        print(f"[{i}/{len(picked)}] {obl['control_id']}: [{obl['obligation_type'].upper()}]")
        print(f" Obligation: {obl['obligation_text'][:120]}")
        print(f" Parent: {ctrl.get('title', 'N/A')}")

        if args.dry_run:
            print(" [DRY RUN]")
            continue

        prompt = build_pass0b_prompt(
            obl["obligation_text"], obl["action"], obl["object"],
            ctrl.get("title", ""), ctrl.get("category", ""),
            ctrl.get("source_ref", ""),
        )

        t0 = time.time()
        resp_text, usage, error = call_anthropic(prompt)
        elapsed = time.time() - t0

        if error:
            print(f" ERROR: {error}")
            continue

        result = parse_json_object(resp_text)
        if not result:
            print(f" PARSE ERROR: {resp_text[:200]}")
            continue

        in_tok = usage.get("input_tokens", 0)
        out_tok = usage.get("output_tokens", 0)
        print(f" LLM: {elapsed:.1f}s | {in_tok} in / {out_tok} out")

        # Allocate the next sequential control_id within the parent's domain.
        domain = ctrl.get("domain", "COMP")
        new_control_id = generate_control_id(domain, cur)

        # Print the composed control for manual review.
        print(f"\n === ATOMIC CONTROL: {new_control_id} ===")
        print(f" Titel: {result.get('title', 'N/A')}")
        print(f" Ziel: {result.get('objective', 'N/A')}")
        print(f" Typ: {obl['obligation_type']}")
        reqs = result.get("requirements", [])
        if reqs:
            print(f" Anforderungen:")
            for r in reqs:
                print(f" - {r}")
        tests = result.get("test_procedure", [])
        if tests:
            print(f" Pruefverfahren:")
            for t in tests:
                print(f" - {t}")
        evidence = result.get("evidence", [])
        if evidence:
            print(f" Nachweise:")
            for e in evidence:
                print(f" - {e}")
        print(f" Severity: {result.get('severity', 'medium')}")
        print(f" Category: {result.get('category', 'governance')}")

        # Persist as a draft control linked to its parent via parent_control_uuid.
        new_uuid = str(uuid.uuid4())
        parent_uuid = ctrl.get("uuid")
        source_cit = {}
        if ctrl.get("source_ref"):
            # source_ref is "<source> <article>"; split back into its parts.
            parts = ctrl["source_ref"].strip().split(" ", 1)
            source_cit = {"source": parts[0], "article": parts[1] if len(parts) > 1 else ""}

        cur.execute("""
            INSERT INTO compliance.canonical_controls (
                id, control_id, title, objective, requirements, test_procedure,
                evidence, severity, category, release_state,
                source_citation, generation_metadata, generation_strategy,
                pipeline_version, parent_control_uuid, framework_id
            ) VALUES (
                %s, %s, %s, %s, %s, %s,
                %s, %s, %s, %s,
                %s, %s, %s,
                %s, %s,
                (SELECT id FROM compliance.canonical_control_frameworks LIMIT 1)
            )
        """, (
            new_uuid, new_control_id,
            result.get("title", ""),
            result.get("objective", ""),
            json.dumps(result.get("requirements", []), ensure_ascii=False),
            json.dumps(result.get("test_procedure", []), ensure_ascii=False),
            json.dumps(result.get("evidence", []), ensure_ascii=False),
            result.get("severity", "medium"),
            result.get("category", "governance"),
            "draft",
            psycopg2.extras.Json(source_cit),
            psycopg2.extras.Json({
                "obligation_type": obl["obligation_type"],
                "obligation_text": obl["obligation_text"],
                "pass0b_model": ANTHROPIC_MODEL,
                "decomposition_method": "pass0b_preview",
            }),
            "pass0b_atomic",
            6,  # pipeline_version
            parent_uuid,
        ))
        conn.commit()

        created.append({
            "control_id": new_control_id,
            "title": result.get("title", ""),
            "obligation_type": obl["obligation_type"],
            "parent_control_id": obl["control_id"],
        })
        print(f" ✓ Geschrieben: {new_control_id} (parent: {obl['control_id']})")

        # Gentle rate limiting between API calls.
        time.sleep(0.5)

    if created:
        print(f"\n{'='*70}")
        print(f"ERGEBNIS: {len(created)} atomare Controls erstellt")
        print(f"{'='*70}")
        for c in created:
            print(f" {c['control_id']}: {c['title']} [{c['obligation_type']}] (von {c['parent_control_id']})")

    conn.close()
|
||||
|
||||
|
||||
# Allow importing this module without side effects; run only as a script.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user