feat(control-generator): 7-stage pipeline for RAG→LLM→Controls generation
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Successful in 45s
CI/CD / test-python-document-crawler (push) Has been cancelled
CI/CD / test-python-dsms-gateway (push) Has been cancelled
CI/CD / validate-canonical-controls (push) Has been cancelled
CI/CD / deploy-hetzner (push) Has been cancelled
CI/CD / test-python-backend-compliance (push) Has been cancelled

Implements the Control Generator Pipeline that systematically generates
canonical security controls from 150k+ RAG chunks across all compliance
collections (BSI, NIST, OWASP, ENISA, EU laws, German laws).

Three license rules enforced throughout:
- Rule 1 (free_use): Laws/Public Domain — original text preserved
- Rule 2 (citation_required): CC-BY/CC-BY-SA — text with citation
- Rule 3 (restricted): BSI/ISO — full reformulation, no source traces

New files:
- Migration 046: job tracking, chunk tracking, blocked sources tables
- control_generator.py: 7-stage pipeline (scan→classify→structure/reform→harmonize→anchor→store→mark)
- anchor_finder.py: RAG + DuckDuckGo open-source reference search
- control_generator_routes.py: REST API (generate, review, stats, blocked-sources)
- test_control_generator.py: license mapping, rule enforcement, anchor filtering tests

Modified:
- __init__.py: register control_generator_router
- route.ts: proxy generator/review/stats endpoints
- page.tsx: Generator modal, stats panel, state filter, review queue, license badges

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-03-13 09:03:37 +01:00
parent c87f07c99a
commit de19ef0684
8 changed files with 2404 additions and 9 deletions

View File

@@ -52,6 +52,45 @@ export async function GET(request: NextRequest) {
backendPath = '/api/compliance/v1/canonical/licenses'
break
// Generator endpoints
case 'generate-jobs':
backendPath = '/api/compliance/v1/canonical/generate/jobs'
break
case 'generate-status': {
const jobId = searchParams.get('jobId')
if (!jobId) {
return NextResponse.json({ error: 'Missing jobId' }, { status: 400 })
}
backendPath = `/api/compliance/v1/canonical/generate/status/${encodeURIComponent(jobId)}`
break
}
case 'review-queue': {
const state = searchParams.get('release_state') || 'needs_review'
backendPath = `/api/compliance/v1/canonical/generate/review-queue?release_state=${encodeURIComponent(state)}`
break
}
case 'processed-stats':
backendPath = '/api/compliance/v1/canonical/generate/processed-stats'
break
case 'blocked-sources':
backendPath = '/api/compliance/v1/canonical/blocked-sources'
break
case 'controls-customer': {
const custSeverity = searchParams.get('severity')
const custDomain = searchParams.get('domain')
const custParams = new URLSearchParams()
if (custSeverity) custParams.set('severity', custSeverity)
if (custDomain) custParams.set('domain', custDomain)
const custQs = custParams.toString()
backendPath = `/api/compliance/v1/canonical/controls-customer${custQs ? `?${custQs}` : ''}`
break
}
default:
return NextResponse.json({ error: `Unknown endpoint: ${endpoint}` }, { status: 400 })
}
@@ -95,6 +134,16 @@ export async function POST(request: NextRequest) {
if (endpoint === 'create-control') {
backendPath = '/api/compliance/v1/canonical/controls'
} else if (endpoint === 'generate') {
backendPath = '/api/compliance/v1/canonical/generate'
} else if (endpoint === 'review') {
const controlId = searchParams.get('id')
if (!controlId) {
return NextResponse.json({ error: 'Missing control id' }, { status: 400 })
}
backendPath = `/api/compliance/v1/canonical/generate/review/${encodeURIComponent(controlId)}`
} else if (endpoint === 'blocked-sources-cleanup') {
backendPath = '/api/compliance/v1/canonical/blocked-sources/cleanup'
} else if (endpoint === 'similarity-check') {
const controlId = searchParams.get('id')
if (!controlId) {

View File

@@ -5,6 +5,7 @@ import {
Shield, Search, ChevronRight, ArrowLeft, ExternalLink,
Filter, AlertTriangle, CheckCircle2, Info, Lock,
FileText, BookOpen, Scale, Plus, Pencil, Trash2, Save, X,
Zap, BarChart3, Eye, RefreshCw, Clock,
} from 'lucide-react'
// =============================================================================
@@ -44,6 +45,11 @@ interface CanonicalControl {
open_anchors: OpenAnchor[]
release_state: string
tags: string[]
license_rule?: number | null
source_original_text?: string | null
source_citation?: Record<string, string> | null
customer_visible?: boolean
generation_metadata?: Record<string, unknown> | null
created_at: string
updated_at: string
}
@@ -116,14 +122,34 @@ function StateBadge({ state }: { state: string }) {
review: 'bg-blue-100 text-blue-700',
approved: 'bg-green-100 text-green-700',
deprecated: 'bg-red-100 text-red-600',
needs_review: 'bg-yellow-100 text-yellow-800',
too_close: 'bg-red-100 text-red-700',
duplicate: 'bg-orange-100 text-orange-700',
}
const labels: Record<string, string> = {
needs_review: 'Review noetig',
too_close: 'Zu aehnlich',
duplicate: 'Duplikat',
}
return (
<span className={`inline-flex items-center px-2 py-0.5 rounded text-xs font-medium ${config[state] || config.draft}`}>
{state}
{labels[state] || state}
</span>
)
}
/**
 * Small pill that labels the license rule attached to a control.
 *
 * Renders nothing when `rule` is falsy (null, undefined, 0) or is not one of
 * the rule numbers listed in the lookup table below.
 */
function LicenseRuleBadge({ rule }: { rule: number | null | undefined }) {
  if (!rule) return null

  // Colour classes and label per rule number.
  const variants: Record<number, { bg: string; label: string }> = {
    1: { bg: 'bg-green-100 text-green-700', label: 'Free Use' },
    2: { bg: 'bg-blue-100 text-blue-700', label: 'Zitation' },
    3: { bg: 'bg-amber-100 text-amber-700', label: 'Reformuliert' },
  }

  const variant = variants[rule]
  if (!variant) return null

  const { bg, label } = variant
  return (
    <span className={`inline-flex items-center px-2 py-0.5 rounded text-xs font-medium ${bg}`}>
      {label}
    </span>
  )
}
/**
 * Returns the part of a control id that precedes the first `-`.
 *
 * An id without any `-` is returned unchanged; an id that starts with `-`
 * yields the empty string.
 */
function getDomain(controlId: string): string {
  const dashAt = controlId.indexOf('-')
  return dashAt === -1 ? controlId : controlId.slice(0, dashAt)
}
@@ -419,6 +445,17 @@ export default function ControlLibraryPage() {
const [mode, setMode] = useState<'list' | 'detail' | 'create' | 'edit'>('list')
const [saving, setSaving] = useState(false)
// Generator state
const [showGenerator, setShowGenerator] = useState(false)
const [generating, setGenerating] = useState(false)
const [genResult, setGenResult] = useState<Record<string, unknown> | null>(null)
const [genDomain, setGenDomain] = useState('')
const [genMaxControls, setGenMaxControls] = useState(10)
const [genDryRun, setGenDryRun] = useState(true)
const [stateFilter, setStateFilter] = useState<string>('')
const [processedStats, setProcessedStats] = useState<Array<Record<string, unknown>>>([])
const [showStats, setShowStats] = useState(false)
// Load data
const loadData = useCallback(async () => {
try {
@@ -450,6 +487,7 @@ export default function ControlLibraryPage() {
return controls.filter(c => {
if (severityFilter && c.severity !== severityFilter) return false
if (domainFilter && getDomain(c.control_id) !== domainFilter) return false
if (stateFilter && c.release_state !== stateFilter) return false
if (searchQuery) {
const q = searchQuery.toLowerCase()
return (
@@ -461,7 +499,7 @@ export default function ControlLibraryPage() {
}
return true
})
}, [controls, severityFilter, domainFilter, searchQuery])
}, [controls, severityFilter, domainFilter, stateFilter, searchQuery])
// CRUD handlers
const handleCreate = async (data: typeof EMPTY_CONTROL) => {
@@ -526,6 +564,63 @@ export default function ControlLibraryPage() {
}
}
// Generator handlers
/**
 * Starts a generation run through the proxy route using the modal's current
 * settings (domain, max controls, dry-run flag) and stores the outcome in
 * `genResult`.
 *
 * The control list is reloaded only after a real (non dry-run) run.
 * `generating` is always reset, regardless of how the request ends.
 */
const handleGenerate = async () => {
  setGenerating(true)
  setGenResult(null)
  try {
    const res = await fetch(`${BACKEND_URL}?endpoint=generate`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        domain: genDomain || null,
        max_controls: genMaxControls,
        dry_run: genDryRun,
        skip_web_search: false,
      }),
    })
    if (!res.ok) {
      // An error body is not guaranteed to be JSON (e.g. an HTML gateway page).
      // Parse defensively so an HTTP failure is not reported as a network error.
      let err: Record<string, unknown> = {}
      try {
        const parsed: unknown = await res.json()
        if (parsed && typeof parsed === 'object') {
          err = parsed as Record<string, unknown>
        }
      } catch {
        // Body was empty or not JSON; fall through to the generic message.
      }
      // `detail` is the key FastAPI uses for HTTPException payloads.
      const detail = err.error || err.details || err.detail
      setGenResult({
        status: 'error',
        message: typeof detail === 'string' && detail ? detail : `Fehler (HTTP ${res.status})`,
      })
      return
    }
    const data = await res.json()
    setGenResult(data)
    if (!genDryRun) {
      await loadData()
    }
  } catch {
    setGenResult({ status: 'error', message: 'Netzwerkfehler' })
  } finally {
    setGenerating(false)
  }
}
/**
 * Fetches the processing statistics through the proxy route and stores them in
 * `processedStats`. Best effort: a non-OK response or any thrown error leaves
 * the current state untouched.
 */
const loadProcessedStats = async () => {
  try {
    const res = await fetch(`${BACKEND_URL}?endpoint=processed-stats`)
    if (!res.ok) return
    const { stats } = await res.json()
    setProcessedStats(stats || [])
  } catch { /* ignore */ }
}
/**
 * Submits a review decision for a control through the proxy route.
 *
 * On success the control list is reloaded and the view returns to the list.
 * On failure the current view is kept and the problem is logged to the console.
 *
 * @param controlId - The control's `control_id`, sent as the `id` query parameter.
 * @param action - Review action sent in the JSON body (the UI uses 'approve' and 'reject').
 */
const handleReview = async (controlId: string, action: string) => {
  try {
    // Encode the id so characters such as '&', '#' or '+' cannot alter the query string.
    const res = await fetch(`${BACKEND_URL}?endpoint=review&id=${encodeURIComponent(controlId)}`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({ action }),
    })
    if (!res.ok) {
      console.error(`Review '${action}' for control ${controlId} failed with HTTP ${res.status}`)
      return
    }
    await loadData()
    setSelectedControl(null)
    setMode('list')
  } catch (e: unknown) {
    console.error('Review request failed', e)
  }
}
if (loading) {
return (
<div className="flex items-center justify-center h-96">
@@ -748,6 +843,98 @@ export default function ControlLibraryPage() {
</div>
</section>
)}
{/* License & Citation Info */}
{ctrl.license_rule && (
<section className="bg-blue-50 border border-blue-200 rounded-lg p-4">
<div className="flex items-center gap-2 mb-2">
<Scale className="w-4 h-4 text-blue-700" />
<h3 className="text-sm font-semibold text-blue-900">Lizenzinformationen</h3>
<LicenseRuleBadge rule={ctrl.license_rule} />
</div>
{ctrl.source_citation && (
<div className="text-xs text-blue-800 space-y-1">
<p><span className="font-medium">Quelle:</span> {ctrl.source_citation.source}</p>
{ctrl.source_citation.license && <p><span className="font-medium">Lizenz:</span> {ctrl.source_citation.license}</p>}
{ctrl.source_citation.license_notice && <p><span className="font-medium">Hinweis:</span> {ctrl.source_citation.license_notice}</p>}
{ctrl.source_citation.url && (
<a href={ctrl.source_citation.url} target="_blank" rel="noopener noreferrer" className="flex items-center gap-1 text-blue-600 hover:text-blue-800">
<ExternalLink className="w-3 h-3" /> Originalquelle
</a>
)}
</div>
)}
{ctrl.source_original_text && (
<details className="mt-2">
<summary className="text-xs text-blue-600 cursor-pointer hover:text-blue-800">Originaltext anzeigen</summary>
<p className="mt-1 text-xs text-gray-700 bg-white rounded p-2 border border-blue-100 max-h-40 overflow-y-auto">{ctrl.source_original_text}</p>
</details>
)}
{ctrl.license_rule === 3 && (
<p className="text-xs text-amber-700 mt-2 flex items-center gap-1">
<Lock className="w-3 h-3" />
Eigenstaendig formuliert keine Originalquelle gespeichert
</p>
)}
</section>
)}
{/* Generation Metadata (internal) */}
{ctrl.generation_metadata && (
<section className="bg-gray-50 border border-gray-200 rounded-lg p-4">
<div className="flex items-center gap-2 mb-2">
<Clock className="w-4 h-4 text-gray-500" />
<h3 className="text-sm font-semibold text-gray-700">Generierungsdetails (intern)</h3>
</div>
<div className="text-xs text-gray-600 space-y-1">
<p>Pfad: {String(ctrl.generation_metadata.processing_path || '-')}</p>
{ctrl.generation_metadata.similarity_status && (
<p className="text-red-600">Similarity: {String(ctrl.generation_metadata.similarity_status)}</p>
)}
{Array.isArray(ctrl.generation_metadata.similar_controls) && (
<div>
<p className="font-medium">Aehnliche Controls:</p>
{(ctrl.generation_metadata.similar_controls as Array<Record<string, unknown>>).map((s, i) => (
<p key={i} className="ml-2">{String(s.control_id)} {String(s.title)} ({String(s.similarity)})</p>
))}
</div>
)}
</div>
</section>
)}
{/* Review Actions */}
{['needs_review', 'too_close', 'duplicate'].includes(ctrl.release_state) && (
<section className="bg-yellow-50 border border-yellow-200 rounded-lg p-4">
<div className="flex items-center gap-2 mb-3">
<Eye className="w-4 h-4 text-yellow-700" />
<h3 className="text-sm font-semibold text-yellow-900">Review erforderlich</h3>
</div>
<div className="flex items-center gap-2">
<button
onClick={() => handleReview(ctrl.control_id, 'approve')}
className="px-3 py-1.5 text-sm text-white bg-green-600 rounded-lg hover:bg-green-700"
>
<CheckCircle2 className="w-3.5 h-3.5 inline mr-1" />
Akzeptieren
</button>
<button
onClick={() => handleReview(ctrl.control_id, 'reject')}
className="px-3 py-1.5 text-sm text-white bg-red-600 rounded-lg hover:bg-red-700"
>
<Trash2 className="w-3.5 h-3.5 inline mr-1" />
Ablehnen
</button>
<button
onClick={() => setMode('edit')}
className="px-3 py-1.5 text-sm text-gray-600 border border-gray-300 rounded-lg hover:bg-gray-50"
>
<Pencil className="w-3.5 h-3.5 inline mr-1" />
Ueberarbeiten
</button>
</div>
</section>
)}
</div>
</div>
)
@@ -772,13 +959,29 @@ export default function ControlLibraryPage() {
</p>
</div>
</div>
<button
onClick={() => setMode('create')}
className="flex items-center gap-1.5 px-3 py-2 text-sm text-white bg-purple-600 rounded-lg hover:bg-purple-700"
>
<Plus className="w-4 h-4" />
Neues Control
</button>
<div className="flex items-center gap-2">
<button
onClick={() => { setShowStats(!showStats); if (!showStats) loadProcessedStats() }}
className="flex items-center gap-1.5 px-3 py-2 text-sm text-gray-600 border border-gray-300 rounded-lg hover:bg-gray-50"
>
<BarChart3 className="w-4 h-4" />
Stats
</button>
<button
onClick={() => setShowGenerator(true)}
className="flex items-center gap-1.5 px-3 py-2 text-sm text-white bg-amber-600 rounded-lg hover:bg-amber-700"
>
<Zap className="w-4 h-4" />
Generator
</button>
<button
onClick={() => setMode('create')}
className="flex items-center gap-1.5 px-3 py-2 text-sm text-white bg-purple-600 rounded-lg hover:bg-purple-700"
>
<Plus className="w-4 h-4" />
Neues Control
</button>
</div>
</div>
{/* Frameworks */}
@@ -829,9 +1032,131 @@ export default function ControlLibraryPage() {
<option key={d} value={d}>{d}</option>
))}
</select>
<select
value={stateFilter}
onChange={e => setStateFilter(e.target.value)}
className="text-sm border border-gray-300 rounded-lg px-3 py-2 focus:outline-none focus:ring-2 focus:ring-purple-500"
>
<option value="">Alle Status</option>
<option value="draft">Draft</option>
<option value="approved">Approved</option>
<option value="needs_review">Review noetig</option>
<option value="too_close">Zu aehnlich</option>
<option value="duplicate">Duplikat</option>
</select>
</div>
{/* Processing Stats */}
{showStats && processedStats.length > 0 && (
<div className="mt-3 p-3 bg-gray-50 rounded-lg">
<h4 className="text-xs font-semibold text-gray-700 mb-2">Verarbeitungsfortschritt</h4>
<div className="grid grid-cols-3 gap-3">
{processedStats.map((s, i) => (
<div key={i} className="text-xs">
<span className="font-medium text-gray-700">{String(s.collection)}</span>
<div className="flex gap-2 mt-1 text-gray-500">
<span>{String(s.processed_chunks)} verarbeitet</span>
<span>{String(s.direct_adopted)} direkt</span>
<span>{String(s.llm_reformed)} reformuliert</span>
</div>
</div>
))}
</div>
</div>
)}
</div>
{/* Generator Modal */}
{showGenerator && (
<div className="fixed inset-0 z-50 flex items-center justify-center bg-black/40">
<div className="bg-white rounded-xl shadow-xl w-full max-w-lg p-6 mx-4">
<div className="flex items-center justify-between mb-4">
<div className="flex items-center gap-2">
<Zap className="w-5 h-5 text-amber-600" />
<h2 className="text-lg font-semibold text-gray-900">Control Generator</h2>
</div>
<button onClick={() => { setShowGenerator(false); setGenResult(null) }} className="text-gray-400 hover:text-gray-600">
<X className="w-5 h-5" />
</button>
</div>
<div className="space-y-4">
<div>
<label className="block text-xs font-medium text-gray-600 mb-1">Domain (optional)</label>
<select value={genDomain} onChange={e => setGenDomain(e.target.value)} className="w-full px-3 py-2 text-sm border border-gray-300 rounded-lg">
<option value="">Alle Domains</option>
<option value="AUTH">AUTH Authentifizierung</option>
<option value="CRYPT">CRYPT Kryptographie</option>
<option value="NET">NET Netzwerk</option>
<option value="DATA">DATA Datenschutz</option>
<option value="LOG">LOG Logging</option>
<option value="ACC">ACC Zugriffskontrolle</option>
<option value="SEC">SEC Sicherheit</option>
<option value="INC">INC Incident Response</option>
<option value="AI">AI Kuenstliche Intelligenz</option>
<option value="COMP">COMP Compliance</option>
</select>
</div>
<div>
<label className="block text-xs font-medium text-gray-600 mb-1">Max. Controls: {genMaxControls}</label>
<input
type="range" min="1" max="100" step="1"
value={genMaxControls}
onChange={e => setGenMaxControls(parseInt(e.target.value))}
className="w-full"
/>
</div>
<div className="flex items-center gap-2">
<input
type="checkbox"
id="dryRun"
checked={genDryRun}
onChange={e => setGenDryRun(e.target.checked)}
className="rounded border-gray-300"
/>
<label htmlFor="dryRun" className="text-sm text-gray-700">Dry Run (Vorschau ohne Speicherung)</label>
</div>
<button
onClick={handleGenerate}
disabled={generating}
className="w-full py-2 text-sm text-white bg-amber-600 rounded-lg hover:bg-amber-700 disabled:opacity-50 flex items-center justify-center gap-2"
>
{generating ? (
<><RefreshCw className="w-4 h-4 animate-spin" /> Generiere...</>
) : (
<><Zap className="w-4 h-4" /> Generierung starten</>
)}
</button>
{/* Results */}
{genResult && (
<div className={`p-4 rounded-lg text-sm ${genResult.status === 'error' ? 'bg-red-50 text-red-800' : 'bg-green-50 text-green-800'}`}>
<p className="font-medium mb-1">{String(genResult.message || genResult.status)}</p>
{genResult.status !== 'error' && (
<div className="grid grid-cols-2 gap-1 text-xs mt-2">
<span>Chunks gescannt: {String(genResult.total_chunks_scanned)}</span>
<span>Controls generiert: {String(genResult.controls_generated)}</span>
<span>Verifiziert: {String(genResult.controls_verified)}</span>
<span>Review noetig: {String(genResult.controls_needs_review)}</span>
<span>Zu aehnlich: {String(genResult.controls_too_close)}</span>
<span>Duplikate: {String(genResult.controls_duplicates_found)}</span>
</div>
)}
{Array.isArray(genResult.errors) && (genResult.errors as string[]).length > 0 && (
<div className="mt-2 text-xs text-red-600">
{(genResult.errors as string[]).slice(0, 3).map((e, i) => <p key={i}>{e}</p>)}
</div>
)}
</div>
)}
</div>
</div>
</div>
)}
{/* Control List */}
<div className="flex-1 overflow-y-auto p-6">
<div className="space-y-3">
@@ -847,6 +1172,7 @@ export default function ControlLibraryPage() {
<span className="text-xs font-mono text-purple-600 bg-purple-50 px-1.5 py-0.5 rounded">{ctrl.control_id}</span>
<SeverityBadge severity={ctrl.severity} />
<StateBadge state={ctrl.release_state} />
<LicenseRuleBadge rule={ctrl.license_rule} />
{ctrl.risk_score !== null && (
<span className="text-xs text-gray-400">Score: {ctrl.risk_score}</span>
)}

View File

@@ -34,6 +34,7 @@ from .generation_routes import router as generation_router
from .project_routes import router as project_router
from .wiki_routes import router as wiki_router
from .canonical_control_routes import router as canonical_control_router
from .control_generator_routes import router as control_generator_router
# Include sub-routers
router.include_router(audit_router)
@@ -69,6 +70,7 @@ router.include_router(generation_router)
router.include_router(project_router)
router.include_router(wiki_router)
router.include_router(canonical_control_router)
router.include_router(control_generator_router)
__all__ = [
"router",
@@ -104,4 +106,5 @@ __all__ = [
"project_router",
"wiki_router",
"canonical_control_router",
"control_generator_router",
]

View File

@@ -0,0 +1,433 @@
"""
FastAPI routes for the Control Generator Pipeline.
Endpoints:
POST /v1/canonical/generate — Start generation run
GET /v1/canonical/generate/status/{job_id} — Job status
GET /v1/canonical/generate/jobs — All jobs
GET /v1/canonical/generate/review-queue — Controls needing review
POST /v1/canonical/generate/review/{control_id} — Complete review
GET /v1/canonical/generate/processed-stats — Processing stats per collection
GET /v1/canonical/blocked-sources — Blocked sources list
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
"""
from __future__ import annotations
import json
import logging
from typing import Optional
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal
from compliance.services.control_generator import (
ControlGeneratorPipeline,
GeneratorConfig,
ALL_COLLECTIONS,
)
from compliance.services.rag_client import get_rag_client
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/v1/canonical", tags=["control-generator"])
# =============================================================================
# REQUEST / RESPONSE MODELS
# =============================================================================
class GenerateRequest(BaseModel):
    # Request body of POST /v1/canonical/generate.
    # start_generation copies every field one-to-one into GeneratorConfig.

    # Optional selectors; None is passed through to GeneratorConfig unchanged.
    domain: Optional[str] = None
    collections: Optional[list[str]] = None

    # Run sizing.
    max_controls: int = 50
    batch_size: int = 5

    # Behaviour switches.
    skip_web_search: bool = False
    dry_run: bool = False
class GenerateResponse(BaseModel):
    # Response body of POST /v1/canonical/generate.
    # start_generation fills every field from the pipeline result object.

    # Identifier and final status reported by the pipeline run.
    job_id: str
    status: str
    # Human-readable summary built by start_generation.
    message: str

    # Counters copied from the pipeline result.
    total_chunks_scanned: int = 0
    controls_generated: int = 0
    controls_verified: int = 0
    controls_needs_review: int = 0
    controls_too_close: int = 0
    controls_duplicates_found: int = 0

    # Errors collected by the pipeline run.
    errors: list = []
    # start_generation only fills this for dry runs; otherwise it stays empty.
    controls: list = []
class ReviewRequest(BaseModel):
    # Request body of POST /v1/canonical/generate/review/{control_id}.

    # One of "approve", "reject", "needs_rework"; anything else is rejected by review_control.
    action: str
    # Target state override; review_control only consults it for the "approve" action.
    release_state: Optional[str] = None
    # NOTE(review): accepted but not read by review_control in this file.
    notes: Optional[str] = None
class ProcessedStats(BaseModel):
    # Per-collection processing statistics.
    # Field names match the keys of the dicts returned by get_processed_stats.
    # NOTE(review): no endpoint in this file declares it as a response_model.

    collection: str
    # get_processed_stats currently always reports 0 for this value.
    total_chunks_estimated: int
    processed_chunks: int
    # get_processed_stats currently always reports 0 for this value.
    pending_chunks: int
    # Counts per processing_path: 'structured', 'llm_reform' and 'skipped'.
    direct_adopted: int
    llm_reformed: int
    skipped: int
class BlockedSourceResponse(BaseModel):
    # One blocked source; fields match the columns selected by list_blocked_sources.
    # NOTE(review): no endpoint in this file declares it as a response_model.

    id: str
    regulation_code: str
    document_title: str
    reason: str
    # start_cleanup moves this from 'pending' to 'marked_for_deletion'.
    deletion_status: str
    qdrant_collection: Optional[str] = None
    # Timestamp as a string (list_blocked_sources applies str() to it).
    marked_at: str
# =============================================================================
# ENDPOINTS
# =============================================================================
@router.post("/generate", response_model=GenerateResponse)
async def start_generation(req: GenerateRequest):
    """Start a control generation run."""
    # Translate the API payload one-to-one into the pipeline configuration.
    config = GeneratorConfig(
        collections=req.collections,
        domain=req.domain,
        batch_size=req.batch_size,
        max_controls=req.max_controls,
        skip_web_search=req.skip_web_search,
        dry_run=req.dry_run,
    )
    db = SessionLocal()
    try:
        pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
        # The run is awaited inside the request; the response is sent once it has finished.
        result = await pipeline.run(config)
        return GenerateResponse(
            job_id=result.job_id,
            status=result.status,
            message=f"Generated {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
            total_chunks_scanned=result.total_chunks_scanned,
            controls_generated=result.controls_generated,
            controls_verified=result.controls_verified,
            controls_needs_review=result.controls_needs_review,
            controls_too_close=result.controls_too_close,
            controls_duplicates_found=result.controls_duplicates_found,
            errors=result.errors,
            # Generated controls are only echoed back for dry runs.
            controls=result.controls if req.dry_run else [],
        )
    except HTTPException:
        # Keep a deliberate HTTP error as-is instead of rewrapping it as a 500.
        raise
    except Exception as e:
        # logger.exception records the traceback, which logger.error dropped.
        logger.exception("Generation failed: %s", e)
        raise HTTPException(status_code=500, detail=str(e)) from e
    finally:
        db.close()
@router.get("/generate/status/{job_id}")
async def get_job_status(job_id: str):
    """Get status of a generation job."""
    import uuid  # local import: only needed to validate the path parameter

    # Reject malformed ids up front so they surface as a 400 instead of a database error.
    try:
        job_uuid = uuid.UUID(job_id)
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid job id") from None

    db = SessionLocal()
    try:
        # CAST(... AS uuid) instead of ':id::uuid': SQLAlchemy's text() does not
        # treat a name that is directly followed by '::' as the bind parameter ':id'.
        result = db.execute(
            text("SELECT * FROM canonical_generation_jobs WHERE id = CAST(:id AS uuid)"),
            {"id": str(job_uuid)},
        )
        row = result.fetchone()
        if not row:
            raise HTTPException(status_code=404, detail="Job not found")
        cols = result.keys()
        job = dict(zip(cols, row))
        # Serialize datetime fields
        for key in ("started_at", "completed_at", "created_at"):
            if job.get(key):
                job[key] = str(job[key])
        job["id"] = str(job["id"])
        return job
    finally:
        db.close()
@router.get("/generate/jobs")
async def list_jobs(
    limit: int = Query(20, ge=1, le=100),
    offset: int = Query(0, ge=0),
):
    """List all generation jobs."""
    db = SessionLocal()
    try:
        # Newest jobs first, paged by limit/offset.
        result = db.execute(
            text("""
                SELECT id, status, total_chunks_scanned, controls_generated,
                controls_verified, controls_needs_review, controls_too_close,
                controls_duplicates_found, created_at, completed_at
                FROM canonical_generation_jobs
                ORDER BY created_at DESC
                LIMIT :limit OFFSET :offset
            """),
            {"limit": limit, "offset": offset},
        )
        columns = result.keys()

        def _as_json(row):
            # Stringify the id and any timestamp that is set.
            job = dict(zip(columns, row))
            job["id"] = str(job["id"])
            for ts_field in ("created_at", "completed_at"):
                if job.get(ts_field):
                    job[ts_field] = str(job[ts_field])
            return job

        jobs = [_as_json(row) for row in result]
        # "total" is the number of rows in this page, not the overall job count.
        return {"jobs": jobs, "total": len(jobs)}
    finally:
        db.close()
@router.get("/generate/review-queue")
async def get_review_queue(
    release_state: str = Query("needs_review", regex="^(needs_review|too_close|duplicate)$"),
    limit: int = Query(50, ge=1, le=200),
):
    """Get controls that need manual review."""
    json_columns = ("generation_metadata", "open_anchors", "tags")

    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                SELECT c.id, c.control_id, c.title, c.objective, c.severity,
                c.release_state, c.license_rule, c.customer_visible,
                c.generation_metadata, c.open_anchors, c.tags,
                c.created_at
                FROM canonical_controls c
                WHERE c.release_state = :state
                ORDER BY c.created_at DESC
                LIMIT :limit
            """),
            {"state": release_state, "limit": limit},
        )
        columns = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(columns, row))
            ctrl["id"] = str(ctrl["id"])
            ctrl["created_at"] = str(ctrl["created_at"])
            # Decode JSON columns that arrive as text; values that fail to parse stay as-is.
            for column in json_columns:
                raw = ctrl.get(column)
                if not isinstance(raw, str):
                    continue
                try:
                    ctrl[column] = json.loads(raw)
                except (json.JSONDecodeError, TypeError):
                    pass
            controls.append(ctrl)
        return {"controls": controls, "total": len(controls)}
    finally:
        db.close()
@router.post("/generate/review/{control_id}")
async def review_control(control_id: str, req: ReviewRequest):
    """Complete review of a generated control."""
    reviewable_states = ("needs_review", "too_close", "duplicate")
    known_states = ("draft", "review", "approved", "deprecated", "needs_review", "too_close", "duplicate")

    db = SessionLocal()
    try:
        # The control must exist and currently sit in one of the review states.
        lookup = db.execute(
            text("SELECT id, release_state FROM canonical_controls WHERE control_id = :cid"),
            {"cid": control_id},
        )
        found = lookup.fetchone()
        if not found:
            raise HTTPException(status_code=404, detail="Control not found")

        current_state = found[1]
        if current_state not in reviewable_states:
            raise HTTPException(status_code=400, detail=f"Control is in state '{current_state}', not reviewable")

        # Action -> resulting state. Only "approve" honours the caller's override.
        transitions = {
            "approve": req.release_state or "draft",
            "reject": "deprecated",
            "needs_rework": "needs_review",
        }
        if req.action not in transitions:
            raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")

        new_state = transitions[req.action]
        if new_state not in known_states:
            raise HTTPException(status_code=400, detail=f"Invalid release_state: {new_state}")

        db.execute(
            text("""
                UPDATE canonical_controls
                SET release_state = :state, updated_at = NOW()
                WHERE control_id = :cid
            """),
            {"state": new_state, "cid": control_id},
        )
        db.commit()
        return {"control_id": control_id, "release_state": new_state, "action": req.action}
    finally:
        db.close()
@router.get("/generate/processed-stats")
async def get_processed_stats():
    """Get processing statistics per collection."""
    db = SessionLocal()
    try:
        # One row per collection, with chunk counts split by processing path.
        result = db.execute(
            text("""
                SELECT
                collection,
                COUNT(*) as processed_chunks,
                COUNT(*) FILTER (WHERE processing_path = 'structured') as direct_adopted,
                COUNT(*) FILTER (WHERE processing_path = 'llm_reform') as llm_reformed,
                COUNT(*) FILTER (WHERE processing_path = 'skipped') as skipped
                FROM canonical_processed_chunks
                GROUP BY collection
                ORDER BY collection
            """)
        )
        columns = result.keys()
        stats = [
            {
                **dict(zip(columns, row)),
                "total_chunks_estimated": 0,  # Would need Qdrant API to get total
                "pending_chunks": 0,
            }
            for row in result
        ]
        return {"stats": stats}
    finally:
        db.close()
# =============================================================================
# BLOCKED SOURCES
# =============================================================================
@router.get("/blocked-sources")
async def list_blocked_sources():
    """List all blocked (Rule 3) sources."""
    db = SessionLocal()
    try:
        result = db.execute(
            text("""
                SELECT id, regulation_code, document_title, reason,
                deletion_status, qdrant_collection, marked_at
                FROM canonical_blocked_sources
                ORDER BY marked_at DESC
            """)
        )
        columns = result.keys()

        def _as_json(row):
            # Stringify the id and the timestamp; other columns pass through.
            src = dict(zip(columns, row))
            src["id"] = str(src["id"])
            src["marked_at"] = str(src["marked_at"])
            return src

        return {"sources": [_as_json(row) for row in result]}
    finally:
        db.close()
@router.post("/blocked-sources/cleanup")
async def start_cleanup():
    """Start cleanup workflow for blocked sources.
    This marks all pending blocked sources for deletion.
    Actual RAG chunk deletion and file removal is a separate manual step.
    """
    db = SessionLocal()
    try:
        # Flip every 'pending' row and collect the affected regulation codes via RETURNING.
        updated = db.execute(
            text("""
                UPDATE canonical_blocked_sources
                SET deletion_status = 'marked_for_deletion'
                WHERE deletion_status = 'pending'
                RETURNING regulation_code
            """)
        )
        regulation_codes = []
        for row in updated:
            regulation_codes.append(row[0])
        db.commit()

        response = {
            "status": "marked_for_deletion",
            "marked_count": len(regulation_codes),
            "regulation_codes": regulation_codes,
            "message": "Sources marked for deletion. Run manual cleanup to remove RAG chunks and files.",
        }
        return response
    finally:
        db.close()
# =============================================================================
# CUSTOMER VIEW FILTER
# =============================================================================
@router.get("/controls-customer")
async def get_controls_customer_view(
    severity: str | None = Query(None),
    domain: str | None = Query(None),
):
    """Get controls filtered for customer visibility.
    Rule 3 controls have source_citation and source_original_text hidden.
    generation_metadata is NEVER shown to customers.
    """
    db = SessionLocal()
    try:
        # generation_metadata is deliberately absent from the column list.
        query = """
            SELECT c.id, c.control_id, c.title, c.objective, c.rationale,
            c.scope, c.requirements, c.test_procedure, c.evidence,
            c.severity, c.risk_score, c.implementation_effort,
            c.open_anchors, c.release_state, c.tags,
            c.license_rule, c.customer_visible,
            c.source_original_text, c.source_citation,
            c.created_at, c.updated_at
            FROM canonical_controls c
            WHERE c.release_state IN ('draft', 'approved')
        """
        params: dict = {}
        if severity:
            query += " AND c.severity = :severity"
            params["severity"] = severity
        if domain:
            # The filter matches the upper-cased domain as a "<DOMAIN>-" prefix of control_id.
            query += " AND c.control_id LIKE :domain"
            params["domain"] = f"{domain.upper()}-%"
        query += " ORDER BY c.control_id"
        result = db.execute(text(query), params)
        controls = []
        cols = result.keys()
        for row in result:
            ctrl = dict(zip(cols, row))
            ctrl["id"] = str(ctrl["id"])
            for key in ("created_at", "updated_at"):
                if ctrl.get(key):
                    ctrl[key] = str(ctrl[key])
            # Parse JSON fields
            for jf in ("scope", "requirements", "test_procedure", "evidence",
                       "open_anchors", "tags", "source_citation"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        pass
            # Customer visibility rules:
            # - NEVER show generation_metadata
            # - Rule 3: NEVER show source_citation or source_original_text
            ctrl.pop("generation_metadata", None)
            # Enforce the Rule 3 guarantee on license_rule itself as well, so a row
            # with customer_visible set by mistake cannot expose its source text.
            hide_source = (not ctrl.get("customer_visible", True)) or ctrl.get("license_rule") == 3
            if hide_source:
                ctrl["source_citation"] = None
                ctrl["source_original_text"] = None
            controls.append(ctrl)
        return {"controls": controls, "total": len(controls)}
    finally:
        db.close()

View File

@@ -0,0 +1,188 @@
"""
Anchor Finder — finds open-source references (OWASP, NIST, ENISA) for controls.
Two-stage search:
Stage A: RAG-internal search for open-source chunks matching the control topic
Stage B: Web search via DuckDuckGo Instant Answer API (no API key needed)
Only open-source references (Rule 1+2) are accepted as anchors.
"""
from __future__ import annotations
import logging
from dataclasses import dataclass
import httpx
from .rag_client import ComplianceRAGClient, get_rag_client
from .control_generator import (
GeneratedControl,
REGULATION_LICENSE_MAP,
_RULE2_PREFIXES,
_RULE3_PREFIXES,
_classify_regulation,
)
logger = logging.getLogger(__name__)
# Regulation codes that are safe to reference as open anchors (Rule 1+2)
_OPEN_SOURCE_RULES = {1, 2}
@dataclass
class OpenAnchor:
    # One open-source reference attached to a generated control.

    # Display name of the referenced framework or regulation.
    framework: str
    # Reference within the framework; AnchorFinder fills it from a search hit's
    # article or category and it may be an empty string.
    ref: str
    # Link to the reference.
    url: str
class AnchorFinder:
    """Finds open-source references to anchor generated controls.

    Stage A searches the RAG store for chunks from Rule 1/2 sources; Stage B
    queries the DuckDuckGo Instant Answer API and keeps only hits whose host
    belongs to a known open framework site.
    """

    # (registrable domain, framework label) pairs; a host matches when it is the
    # domain itself or one of its subdomains. Order mirrors the lookup priority.
    _TRUSTED_DOMAINS = (
        ("owasp.org", "OWASP"),
        ("nist.gov", "NIST"),
        ("enisa.europa.eu", "ENISA"),
        ("cisa.gov", "CISA"),
        ("eur-lex.europa.eu", "EU Law"),
    )

    def __init__(self, rag_client: ComplianceRAGClient | None = None):
        """Use the given RAG client, or the shared one from get_rag_client()."""
        self.rag = rag_client or get_rag_client()

    async def find_anchors(
        self,
        control: GeneratedControl,
        skip_web: bool = False,
        min_anchors: int = 2,
    ) -> list[OpenAnchor]:
        """Find open-source anchors for a control.

        Args:
            control: The control to find references for.
            skip_web: When True, never fall back to the web search.
            min_anchors: The web search runs only if the RAG search found
                fewer anchors than this.

        Returns:
            RAG anchors first, followed by web anchors that are not already
            present (compared by framework + ref).
        """
        # Stage A: RAG-internal search
        anchors = await self._search_rag_for_open_anchors(control)
        # Stage B: Web search if not enough anchors
        if len(anchors) < min_anchors and not skip_web:
            seen_keys = {(a.framework, a.ref) for a in anchors}
            for candidate in await self._search_web(control):
                key = (candidate.framework, candidate.ref)
                if key in seen_keys:
                    continue
                # Record the key so two identical web hits are not both kept.
                seen_keys.add(key)
                anchors.append(candidate)
        return anchors

    async def _search_rag_for_open_anchors(self, control: GeneratedControl) -> list[OpenAnchor]:
        """Search RAG for chunks from open sources matching the control topic.

        Returns at most 5 anchors, one per (regulation_code, ref) pair.
        """
        # Build search query from control title + first 3 tags
        tags_str = " ".join(control.tags[:3]) if control.tags else ""
        query = f"{control.title} {tags_str}".strip()
        results = await self.rag.search(
            query=query,
            collection="bp_compliance_ce",
            top_k=15,
        )
        anchors: list[OpenAnchor] = []
        seen: set[str] = set()
        for r in results:
            if not r.regulation_code:
                continue
            # Only accept open-source references
            license_info = _classify_regulation(r.regulation_code)
            if license_info.get("rule") not in _OPEN_SOURCE_RULES:
                continue
            # Build reference key for dedup
            ref = r.article or r.category or ""
            key = f"{r.regulation_code}:{ref}"
            if key in seen:
                continue
            seen.add(key)
            framework_name = license_info.get("name", r.regulation_name or r.regulation_short or r.regulation_code)
            url = r.source_url or self._build_reference_url(r.regulation_code, ref)
            anchors.append(OpenAnchor(
                framework=framework_name,
                ref=ref,
                url=url,
            ))
            if len(anchors) >= 5:
                break
        return anchors

    async def _search_web(self, control: GeneratedControl) -> list[OpenAnchor]:
        """Search DuckDuckGo Instant Answer API for open references.

        Best effort: any failure is logged and yields an empty list. Returns
        at most 3 anchors taken from the first 10 related topics.
        """
        keywords = f"{control.title} security control OWASP NIST"
        try:
            async with httpx.AsyncClient(timeout=10.0) as client:
                resp = await client.get(
                    "https://api.duckduckgo.com/",
                    params={
                        "q": keywords,
                        "format": "json",
                        "no_html": "1",
                        "skip_disambig": "1",
                    },
                )
                if resp.status_code != 200:
                    return []
                data = resp.json()
                anchors: list[OpenAnchor] = []
                # Parse RelatedTopics
                for topic in data.get("RelatedTopics", [])[:10]:
                    url = topic.get("FirstURL", "")
                    text = topic.get("Text", "")
                    if not url:
                        continue
                    # Only accept known open-source domains
                    framework = self._identify_framework_from_url(url)
                    if framework:
                        anchors.append(OpenAnchor(
                            framework=framework,
                            ref=text[:100] if text else url,
                            url=url,
                        ))
                    if len(anchors) >= 3:
                        break
                return anchors
        except Exception as e:
            logger.warning("Web anchor search failed: %s", e)
            return []

    @staticmethod
    def _identify_framework_from_url(url: str) -> str | None:
        """Return the framework label if the URL's host is a trusted open site.

        The decision is made on the parsed hostname only, so a trusted domain
        appearing in the path, query string or as part of another host name
        (e.g. "notowasp.org") is not accepted.
        """
        from urllib.parse import urlparse

        candidate = (url or "").strip()
        if not candidate:
            return None
        # urlparse only fills in the host when a scheme or "//" is present.
        if "://" not in candidate:
            candidate = f"//{candidate}"
        try:
            host = (urlparse(candidate).hostname or "").lower().rstrip(".")
        except ValueError:
            return None
        if not host:
            return None
        for domain, framework in AnchorFinder._TRUSTED_DOMAINS:
            if host == domain or host.endswith(f".{domain}"):
                return framework
        return None

    @staticmethod
    def _build_reference_url(regulation_code: str, ref: str) -> str:
        """Build a generic landing-page URL for known frameworks ("" if unknown).

        `ref` is accepted for interface symmetry but is not used.
        """
        code = regulation_code.lower()
        if code.startswith("owasp"):
            return "https://owasp.org/www-project-application-security-verification-standard/"
        if code.startswith("nist"):
            return "https://csrc.nist.gov/publications"
        if code.startswith("enisa"):
            return "https://www.enisa.europa.eu/publications"
        if code.startswith("eu_"):
            return "https://eur-lex.europa.eu/"
        if code == "cisa_secure_by_design":
            return "https://www.cisa.gov/securebydesign"
        return ""

View File

@@ -0,0 +1,951 @@
"""
Control Generator Pipeline — RAG → License → Structure/Reform → Harmonize → Anchor → Store.
7-stage pipeline that generates canonical security controls from RAG chunks:
1. RAG SCAN — Load unprocessed chunks (or new document versions)
2. LICENSE CLASSIFY — Determine which of 3 license rules applies
3a. STRUCTURE — Rule 1+2: Structure original text into control format
3b. LLM REFORM — Rule 3: Fully reformulate (no original text, no source names)
4. HARMONIZE — Check against existing controls for duplicates
5. ANCHOR SEARCH — Find open-source references (OWASP, NIST, ENISA)
6. STORE — Persist to DB with correct visibility flags
7. MARK PROCESSED — Mark RAG chunks as processed (with version tracking)
Three License Rules:
Rule 1 (free_use): Laws, Public Domain — original text allowed
Rule 2 (citation_required): CC-BY, CC-BY-SA — original text with citation
Rule 3 (restricted): BSI, ISO — full reformulation, no source names
"""
from __future__ import annotations
import hashlib
import json
import logging
import os
import re
import uuid
from dataclasses import dataclass, field, asdict
from datetime import datetime, timezone
from typing import Optional
import httpx
from pydantic import BaseModel
from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
from .similarity_detector import check_similarity, SimilarityReport
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Base URL of the Go SDK that fronts the LLM; overridable via the SDK_URL env var.
SDK_URL = os.getenv("SDK_URL", "http://ai-compliance-sdk:8090")
# Chat endpoint that _llm_chat posts to.
LLM_CHAT_URL = f"{SDK_URL}/sdk/v1/llm/chat"
# Base URL of the embedding service; _get_embedding posts to "<EMBEDDING_URL>/embed".
EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
# Model name sent with every chat request.
LLM_MODEL = os.getenv("CONTROL_GEN_LLM_MODEL", "qwen3:30b-a3b")
# Timeout passed to the HTTP client for chat requests.
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "120"))
HARMONIZATION_THRESHOLD = 0.85  # Cosine similarity above this = duplicate
# Collections scanned when GeneratorConfig.collections is not set.
ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_recht",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
]
# ---------------------------------------------------------------------------
# License Mapping (3-Rule System)
# ---------------------------------------------------------------------------
# Exact-match lookup from lower-case regulation_code to license info.
# Each value carries "license", "rule" (1, 2 or 3) and "name"; Rule 2 entries
# additionally carry an "attribution" string used as the license notice.
REGULATION_LICENSE_MAP: dict[str, dict] = {
    # RULE 1: FREE USE — Laws, Public Domain
    "eu_2016_679": {"license": "EU_LAW", "rule": 1, "name": "DSGVO"},
    "eu_2024_1689": {"license": "EU_LAW", "rule": 1, "name": "AI Act"},
    "eu_2022_2555": {"license": "EU_LAW", "rule": 1, "name": "NIS2"},
    "eu_2024_2847": {"license": "EU_LAW", "rule": 1, "name": "CRA"},
    "eu_2023_1230": {"license": "EU_LAW", "rule": 1, "name": "Maschinenverordnung"},
    "nist_sp_800_53": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SP 800-53"},
    "nist_sp_800_63b": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SP 800-63B"},
    "nist_csf_2_0": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST CSF 2.0"},
    "nist_sp_800_218": {"license": "NIST_PUBLIC_DOMAIN", "rule": 1, "name": "NIST SSDF"},
    "cisa_secure_by_design": {"license": "US_GOV_PUBLIC", "rule": 1, "name": "CISA Secure by Design"},
    "bdsg": {"license": "DE_LAW", "rule": 1, "name": "BDSG"},
    "ttdsg": {"license": "DE_LAW", "rule": 1, "name": "TTDSG"},
    "tkg": {"license": "DE_LAW", "rule": 1, "name": "TKG"},
    # RULE 2: CITATION REQUIRED — CC-BY, CC-BY-SA
    "owasp_asvs": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP ASVS",
                   "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_masvs": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP MASVS",
                    "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "owasp_top10": {"license": "CC-BY-SA-4.0", "rule": 2, "name": "OWASP Top 10",
                    "attribution": "OWASP Foundation, CC BY-SA 4.0"},
    "oecd_ai_principles": {"license": "OECD_PUBLIC", "rule": 2, "name": "OECD AI Principles",
                           "attribution": "OECD"},
    # RULE 3: RESTRICTED — Full reformulation required
    # Names stored as INTERNAL_ONLY — never exposed to customers
    # (Rule 3 has no exact entries here; it is resolved via _RULE3_PREFIXES
    # or as the default for unknown codes.)
}
# Prefix-based matching for wildcard entries
# Codes starting with one of these are classified as Rule 3 (restricted).
_RULE3_PREFIXES = ["bsi_", "iso_", "etsi_"]
# Codes starting with one of these are classified as Rule 2 (citation required).
_RULE2_PREFIXES = ["enisa_"]
def _classify_regulation(regulation_code: str) -> dict:
"""Determine license rule for a regulation_code."""
code = regulation_code.lower().strip()
# Exact match first
if code in REGULATION_LICENSE_MAP:
return REGULATION_LICENSE_MAP[code]
# Prefix match for Rule 2
for prefix in _RULE2_PREFIXES:
if code.startswith(prefix):
return {"license": "CC-BY-4.0", "rule": 2, "name": "ENISA",
"attribution": "ENISA, CC BY 4.0"}
# Prefix match for Rule 3
for prefix in _RULE3_PREFIXES:
if code.startswith(prefix):
return {"license": f"{prefix.rstrip('_').upper()}_RESTRICTED", "rule": 3,
"name": "INTERNAL_ONLY"}
# Unknown → treat as restricted (safe default)
logger.warning("Unknown regulation_code %r — defaulting to Rule 3 (restricted)", code)
return {"license": "UNKNOWN", "rule": 3, "name": "INTERNAL_ONLY"}
# ---------------------------------------------------------------------------
# Domain detection from content
# ---------------------------------------------------------------------------
# Domain code → lower-case keywords (English and German). _detect_domain
# counts how many keywords of each domain occur as substrings of the text.
# Note that "audit" is listed under both LOG and COMP.
DOMAIN_KEYWORDS = {
    "AUTH": ["authentication", "login", "password", "credential", "mfa", "2fa",
             "session", "token", "oauth", "identity", "authentifizierung", "anmeldung"],
    "CRYPT": ["encryption", "cryptography", "tls", "ssl", "certificate", "hashing",
              "aes", "rsa", "verschlüsselung", "kryptographie", "zertifikat"],
    "NET": ["network", "firewall", "dns", "vpn", "proxy", "segmentation",
            "netzwerk", "routing", "port", "intrusion"],
    "DATA": ["data protection", "privacy", "personal data", "datenschutz",
             "personenbezogen", "dsgvo", "gdpr", "löschung", "verarbeitung"],
    "LOG": ["logging", "monitoring", "audit", "siem", "alert", "anomaly",
            "protokollierung", "überwachung"],
    "ACC": ["access control", "authorization", "rbac", "permission", "privilege",
            "zugriffskontrolle", "berechtigung", "autorisierung"],
    "SEC": ["vulnerability", "patch", "update", "hardening", "configuration",
            "schwachstelle", "härtung", "konfiguration"],
    "INC": ["incident", "response", "breach", "recovery", "backup",
            "vorfall", "wiederherstellung", "notfall"],
    "AI": ["artificial intelligence", "machine learning", "model", "bias",
           "ki", "künstliche intelligenz", "algorithmus", "training"],
    "COMP": ["compliance", "audit", "regulation", "standard", "certification",
             "konformität", "prüfung", "zertifizierung"],
}
def _detect_domain(text: str) -> str:
"""Detect the most likely domain from text content."""
text_lower = text.lower()
scores: dict[str, int] = {}
for domain, keywords in DOMAIN_KEYWORDS.items():
scores[domain] = sum(1 for kw in keywords if kw in text_lower)
if not scores or max(scores.values()) == 0:
return "SEC" # Default
return max(scores, key=scores.get)
# ---------------------------------------------------------------------------
# Data Models
# ---------------------------------------------------------------------------
class GeneratorConfig(BaseModel):
    """Options for a single pipeline run."""
    # RAG collections to scan; None falls back to ALL_COLLECTIONS.
    collections: list[str] | None = None
    # Optional domain code: adds a keyword query to the scan and is used as the
    # domain for reformulated (Rule 3) controls.
    domain: str | None = None
    # NOTE(review): not read anywhere in the visible part of this file — confirm usage.
    batch_size: int = 5
    # run() stops after this many controls; the scan returns up to 3x this many chunks.
    max_controls: int = 50
    # Drop chunks whose hash is already recorded in canonical_processed_chunks.
    skip_processed: bool = True
    # NOTE(review): presumably forwarded to the anchor search — usage is outside this view.
    skip_web_search: bool = False
    # When True, run() does not store generated controls.
    dry_run: bool = False
@dataclass
class GeneratedControl:
    """In-memory representation of one generated control before it is stored."""
    # Assigned in _store_control when still empty.
    control_id: str = ""
    title: str = ""
    objective: str = ""
    rationale: str = ""
    scope: dict = field(default_factory=dict)
    requirements: list = field(default_factory=list)
    test_procedure: list = field(default_factory=list)
    evidence: list = field(default_factory=list)
    # One of low/medium/high/critical (enforced in _build_control_from_json).
    severity: str = "medium"
    # Clamped to 0.0–10.0 in _build_control_from_json.
    risk_score: float = 5.0
    # One of s/m/l/xl (enforced in _build_control_from_json).
    implementation_effort: str = "m"
    open_anchors: list = field(default_factory=list)
    # run() distinguishes "too_close", "duplicate" and "needs_review"; anything else counts as verified.
    release_state: str = "draft"
    tags: list = field(default_factory=list)
    # 3-rule fields
    # 1 = free use, 2 = citation required, 3 = restricted; None until a stage sets it.
    license_rule: int | None = None
    # Original chunk text; set for Rule 1/2, always None for Rule 3.
    source_original_text: str | None = None
    # Citation dict (source, license, url, optionally license_notice); None for Rule 3.
    source_citation: dict | None = None
    # Set to False for Rule 3 controls.
    customer_visible: bool = True
    # Internal processing details (processing_path, license_rule, ...).
    generation_metadata: dict = field(default_factory=dict)
@dataclass
class GeneratorResult:
    """Summary of one pipeline run; also written to canonical_generation_jobs."""
    job_id: str = ""
    # "completed" or "failed" (set by run()).
    status: str = "completed"
    # Number of chunks returned by the RAG scan after deduplication/filtering.
    total_chunks_scanned: int = 0
    controls_generated: int = 0
    # Controls whose release_state is none of too_close/duplicate/needs_review.
    controls_verified: int = 0
    controls_needs_review: int = 0
    controls_too_close: int = 0
    controls_duplicates_found: int = 0
    # Human-readable error messages; only the last 50 are persisted with the job.
    errors: list = field(default_factory=list)
    # Generated controls as plain dicts (dataclasses.asdict).
    controls: list = field(default_factory=list)
# ---------------------------------------------------------------------------
# LLM Client (via Go SDK)
# ---------------------------------------------------------------------------
async def _llm_chat(prompt: str, system_prompt: str | None = None) -> str:
"""Call the Go SDK LLM chat endpoint."""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
payload = {
"model": LLM_MODEL,
"messages": messages,
"stream": False,
}
try:
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
resp = await client.post(LLM_CHAT_URL, json=payload)
if resp.status_code != 200:
logger.error("LLM chat failed %d: %s", resp.status_code, resp.text[:300])
return ""
data = resp.json()
# Go SDK returns {message: {content: "..."}} or {response: "..."}
msg = data.get("message", {})
if isinstance(msg, dict):
return msg.get("content", "")
return data.get("response", str(msg))
except Exception as e:
logger.error("LLM chat request failed: %s", e)
return ""
async def _get_embedding(text: str) -> list[float]:
"""Get embedding vector for text via embedding service."""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
resp = await client.post(
f"{EMBEDDING_URL}/embed",
json={"texts": [text]},
)
resp.raise_for_status()
embeddings = resp.json().get("embeddings", [])
return embeddings[0] if embeddings else []
except Exception:
return []
def _cosine_sim(a: list[float], b: list[float]) -> float:
"""Compute cosine similarity between two vectors."""
if not a or not b or len(a) != len(b):
return 0.0
dot = sum(x * y for x, y in zip(a, b))
norm_a = sum(x * x for x in a) ** 0.5
norm_b = sum(x * x for x in b) ** 0.5
if norm_a == 0 or norm_b == 0:
return 0.0
return dot / (norm_a * norm_b)
# ---------------------------------------------------------------------------
# JSON Parsing Helper
# ---------------------------------------------------------------------------
def _parse_llm_json(raw: str) -> dict:
"""Extract JSON from LLM response (handles markdown fences)."""
# Try extracting from ```json ... ``` blocks
match = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", raw, re.DOTALL)
text = match.group(1) if match else raw
# Try parsing directly
try:
return json.loads(text)
except json.JSONDecodeError:
pass
# Try finding first { ... } block
brace_match = re.search(r"\{.*\}", text, re.DOTALL)
if brace_match:
try:
return json.loads(brace_match.group(0))
except json.JSONDecodeError:
pass
logger.warning("Failed to parse LLM JSON response")
return {}
# ---------------------------------------------------------------------------
# Pipeline
# ---------------------------------------------------------------------------
# System prompt for the Rule 3 (restricted) path: instructs the model to write
# in its own words, not to copy sentences, not to name the source, and to
# answer with JSON only.
REFORM_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Deine Aufgabe ist es, eigenständige
Security Controls zu formulieren. Du formulierst IMMER in eigenen Worten.
KOPIERE KEINE Sätze aus dem Quelltext. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quelle. Keine proprietären Bezeichner.
Antworte NUR mit validem JSON."""
# System prompt for the Rule 1/2 paths: instructs the model to structure the
# given text as a practical security control and to answer with JSON only.
STRUCTURE_SYSTEM_PROMPT = """Du bist ein Security-Compliance-Experte. Strukturiere den gegebenen Text
als praxisorientiertes Security Control. Erstelle eine verständliche, umsetzbare Formulierung.
Antworte NUR mit validem JSON."""
class ControlGeneratorPipeline:
"""Orchestrates the 7-stage control generation pipeline."""
def __init__(self, db: Session, rag_client: ComplianceRAGClient | None = None):
self.db = db
self.rag = rag_client or get_rag_client()
self._existing_controls: list[dict] | None = None
self._existing_embeddings: dict[str, list[float]] = {}
# ── Stage 1: RAG Scan ──────────────────────────────────────────────
async def _scan_rag(self, config: GeneratorConfig) -> list[RAGSearchResult]:
"""Load unprocessed chunks from RAG collections."""
collections = config.collections or ALL_COLLECTIONS
all_results: list[RAGSearchResult] = []
queries = [
"security requirement control measure",
"Sicherheitsanforderung Maßnahme Prüfaspekt",
"compliance requirement audit criterion",
"data protection privacy obligation",
"access control authentication authorization",
]
if config.domain:
domain_kw = DOMAIN_KEYWORDS.get(config.domain, [])
if domain_kw:
queries.append(" ".join(domain_kw[:5]))
for collection in collections:
for query in queries:
results = await self.rag.search(
query=query,
collection=collection,
top_k=20,
)
all_results.extend(results)
# Deduplicate by text hash
seen_hashes: set[str] = set()
unique: list[RAGSearchResult] = []
for r in all_results:
h = hashlib.sha256(r.text.encode()).hexdigest()
if h not in seen_hashes:
seen_hashes.add(h)
unique.append(r)
# Filter out already-processed chunks
if config.skip_processed and unique:
hashes = [hashlib.sha256(r.text.encode()).hexdigest() for r in unique]
processed = self._get_processed_hashes(hashes)
unique = [r for r, h in zip(unique, hashes) if h not in processed]
logger.info("RAG scan: %d unique chunks (%d after filtering processed)",
len(seen_hashes), len(unique))
return unique[:config.max_controls * 3] # Over-fetch to account for duplicates
def _get_processed_hashes(self, hashes: list[str]) -> set[str]:
"""Check which chunk hashes are already processed."""
if not hashes:
return set()
try:
result = self.db.execute(
text("SELECT chunk_hash FROM canonical_processed_chunks WHERE chunk_hash = ANY(:hashes)"),
{"hashes": hashes},
)
return {row[0] for row in result}
except Exception as e:
logger.warning("Error checking processed chunks: %s", e)
return set()
# ── Stage 2: License Classification ────────────────────────────────
def _classify_license(self, chunk: RAGSearchResult) -> dict:
"""Determine which license rule applies to this chunk."""
return _classify_regulation(chunk.regulation_code)
# ── Stage 3a: Structure (Rule 1 — Free Use) ───────────────────────
async def _structure_free_use(self, chunk: RAGSearchResult, license_info: dict) -> GeneratedControl:
"""Structure a freely usable text into control format."""
source_name = license_info.get("name", chunk.regulation_name)
prompt = f"""Strukturiere den folgenden Gesetzestext als Security/Compliance Control.
Du DARFST den Originaltext verwenden (Quelle: {source_name}, {license_info.get('license', '')}).
WICHTIG: Erstelle eine verständliche, praxisorientierte Formulierung.
Der Originaltext wird separat gespeichert — deine Formulierung soll klar und umsetzbar sein.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
Text: {chunk.text[:2000]}
Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
data = _parse_llm_json(raw)
if not data:
return self._fallback_control(chunk)
domain = _detect_domain(chunk.text)
control = self._build_control_from_json(data, domain)
control.license_rule = 1
control.source_original_text = chunk.text
control.source_citation = {
"source": f"{chunk.regulation_name} {chunk.article or ''}".strip(),
"license": license_info.get("license", ""),
"url": chunk.source_url or "",
}
control.customer_visible = True
control.generation_metadata = {
"processing_path": "structured",
"license_rule": 1,
"source_regulation": chunk.regulation_code,
"source_article": chunk.article,
}
return control
# ── Stage 3b: Structure with Citation (Rule 2) ────────────────────
async def _structure_with_citation(self, chunk: RAGSearchResult, license_info: dict) -> GeneratedControl:
"""Structure text that requires citation."""
source_name = license_info.get("name", chunk.regulation_name)
attribution = license_info.get("attribution", "")
prompt = f"""Strukturiere den folgenden Text als Security Control.
Quelle: {source_name} ({license_info.get('license', '')}) — Zitation erforderlich.
Du darfst den Text übernehmen oder verständlicher umformulieren.
Die Quelle wird automatisch zitiert — fokussiere dich auf Klarheit.
Gib JSON zurück mit diesen Feldern:
- title: Kurzer prägnanter Titel (max 100 Zeichen)
- objective: Was soll erreicht werden? (1-3 Sätze)
- rationale: Warum ist das wichtig? (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags
Text: {chunk.text[:2000]}
Quelle: {chunk.regulation_name}, {chunk.article}"""
raw = await _llm_chat(prompt, STRUCTURE_SYSTEM_PROMPT)
data = _parse_llm_json(raw)
if not data:
return self._fallback_control(chunk)
domain = _detect_domain(chunk.text)
control = self._build_control_from_json(data, domain)
control.license_rule = 2
control.source_original_text = chunk.text
control.source_citation = {
"source": f"{chunk.regulation_name} {chunk.article or ''}".strip(),
"license": license_info.get("license", ""),
"license_notice": attribution,
"url": chunk.source_url or "",
}
control.customer_visible = True
control.generation_metadata = {
"processing_path": "structured",
"license_rule": 2,
"source_regulation": chunk.regulation_code,
"source_article": chunk.article,
}
return control
# ── Stage 3c: LLM Reformulation (Rule 3 — Restricted) ─────────────
async def _llm_reformulate(self, chunk: RAGSearchResult, config: GeneratorConfig) -> GeneratedControl:
"""Fully reformulate — NO original text, NO source names."""
domain = config.domain or _detect_domain(chunk.text)
prompt = f"""Analysiere den folgenden Prüfaspekt und formuliere ein EIGENSTÄNDIGES Security Control.
KOPIERE KEINE Sätze. Verwende eigene Begriffe und Struktur.
NENNE NICHT die Quelle. Keine proprietären Bezeichner (kein O.Auth_*, TR-03161, BSI-TR etc.).
Aspekt (nur zur Analyse, NICHT kopieren, NICHT referenzieren):
---
{chunk.text[:1500]}
---
Domain: {domain}
Gib JSON zurück mit diesen Feldern:
- title: Kurzer eigenständiger Titel (max 100 Zeichen)
- objective: Eigenständige Formulierung des Ziels (1-3 Sätze)
- rationale: Eigenständige Begründung (1-2 Sätze)
- requirements: Liste von konkreten Anforderungen (Strings, eigene Worte)
- test_procedure: Liste von Prüfschritten (Strings)
- evidence: Liste von Nachweisdokumenten (Strings)
- severity: low/medium/high/critical
- tags: Liste von Tags (eigene Begriffe)"""
raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
data = _parse_llm_json(raw)
if not data:
return self._fallback_control(chunk)
control = self._build_control_from_json(data, domain)
control.license_rule = 3
control.source_original_text = None # NEVER store original
control.source_citation = None # NEVER cite source
control.customer_visible = False # Only our formulation
# generation_metadata: NO source names, NO original texts
control.generation_metadata = {
"processing_path": "llm_reform",
"license_rule": 3,
}
return control
# ── Stage 4: Harmonization ─────────────────────────────────────────
async def _check_harmonization(self, new_control: GeneratedControl) -> list | None:
"""Check if a new control duplicates existing ones via embedding similarity."""
existing = self._load_existing_controls()
if not existing:
return None
new_text = f"{new_control.title} {new_control.objective}"
new_emb = await _get_embedding(new_text)
if not new_emb:
return None
similar = []
for ex in existing:
ex_key = ex.get("control_id", "")
ex_text = f"{ex.get('title', '')} {ex.get('objective', '')}"
# Get or compute embedding for existing control
if ex_key not in self._existing_embeddings:
emb = await _get_embedding(ex_text)
self._existing_embeddings[ex_key] = emb
ex_emb = self._existing_embeddings.get(ex_key, [])
if not ex_emb:
continue
cosine = _cosine_sim(new_emb, ex_emb)
if cosine > HARMONIZATION_THRESHOLD:
similar.append({
"control_id": ex.get("control_id", ""),
"title": ex.get("title", ""),
"similarity": round(cosine, 3),
})
return similar if similar else None
def _load_existing_controls(self) -> list[dict]:
"""Load existing controls from DB (cached per pipeline run)."""
if self._existing_controls is not None:
return self._existing_controls
try:
result = self.db.execute(
text("SELECT control_id, title, objective FROM canonical_controls WHERE release_state != 'deprecated'")
)
self._existing_controls = [
{"control_id": r[0], "title": r[1], "objective": r[2]}
for r in result
]
except Exception as e:
logger.warning("Error loading existing controls: %s", e)
self._existing_controls = []
return self._existing_controls
# ── Helpers ────────────────────────────────────────────────────────
def _build_control_from_json(self, data: dict, domain: str) -> GeneratedControl:
"""Build a GeneratedControl from parsed LLM JSON."""
severity = data.get("severity", "medium")
if severity not in ("low", "medium", "high", "critical"):
severity = "medium"
tags = data.get("tags", [])
if isinstance(tags, str):
tags = [t.strip() for t in tags.split(",")]
return GeneratedControl(
title=str(data.get("title", "Untitled Control"))[:255],
objective=str(data.get("objective", "")),
rationale=str(data.get("rationale", "")),
scope=data.get("scope", {}),
requirements=data.get("requirements", []) if isinstance(data.get("requirements"), list) else [],
test_procedure=data.get("test_procedure", []) if isinstance(data.get("test_procedure"), list) else [],
evidence=data.get("evidence", []) if isinstance(data.get("evidence"), list) else [],
severity=severity,
risk_score=min(10.0, max(0.0, float(data.get("risk_score", 5.0)))),
implementation_effort=data.get("implementation_effort", "m") if data.get("implementation_effort") in ("s", "m", "l", "xl") else "m",
tags=tags[:20],
)
def _fallback_control(self, chunk: RAGSearchResult) -> GeneratedControl:
"""Create a minimal control when LLM parsing fails."""
domain = _detect_domain(chunk.text)
return GeneratedControl(
title=f"Control from {chunk.regulation_code} {chunk.article or ''}".strip()[:255],
objective=chunk.text[:500] if chunk.text else "Needs manual review",
rationale="Auto-generated — LLM parsing failed, manual review required.",
severity="medium",
release_state="needs_review",
tags=[domain.lower()],
)
def _generate_control_id(self, domain: str, db: Session) -> str:
"""Generate next sequential control ID like AUTH-011."""
prefix = domain.upper()[:4]
try:
result = db.execute(
text("SELECT control_id FROM canonical_controls WHERE control_id LIKE :prefix ORDER BY control_id DESC LIMIT 1"),
{"prefix": f"{prefix}-%"},
)
row = result.fetchone()
if row:
last_num = int(row[0].split("-")[-1])
return f"{prefix}-{last_num + 1:03d}"
except Exception:
pass
return f"{prefix}-001"
# ── Pipeline Orchestration ─────────────────────────────────────────
def _create_job(self, config: GeneratorConfig) -> str:
"""Create a generation job record."""
try:
result = self.db.execute(
text("""
INSERT INTO canonical_generation_jobs (status, config)
VALUES ('running', :config)
RETURNING id
"""),
{"config": json.dumps(config.model_dump())},
)
self.db.commit()
row = result.fetchone()
return str(row[0]) if row else str(uuid.uuid4())
except Exception as e:
logger.error("Failed to create job: %s", e)
return str(uuid.uuid4())
def _update_job(self, job_id: str, result: GeneratorResult):
"""Update job with final stats."""
try:
self.db.execute(
text("""
UPDATE canonical_generation_jobs
SET status = :status,
total_chunks_scanned = :scanned,
controls_generated = :generated,
controls_verified = :verified,
controls_needs_review = :needs_review,
controls_too_close = :too_close,
controls_duplicates_found = :duplicates,
errors = :errors,
completed_at = NOW()
WHERE id = :job_id::uuid
"""),
{
"job_id": job_id,
"status": result.status,
"scanned": result.total_chunks_scanned,
"generated": result.controls_generated,
"verified": result.controls_verified,
"needs_review": result.controls_needs_review,
"too_close": result.controls_too_close,
"duplicates": result.controls_duplicates_found,
"errors": json.dumps(result.errors[-50:]),
},
)
self.db.commit()
except Exception as e:
logger.error("Failed to update job: %s", e)
def _store_control(self, control: GeneratedControl, job_id: str) -> str | None:
"""Persist a generated control to DB. Returns the control UUID or None."""
try:
# Get framework UUID
fw_result = self.db.execute(
text("SELECT id FROM canonical_control_frameworks WHERE framework_id = 'bp_security_v1' LIMIT 1")
)
fw_row = fw_result.fetchone()
if not fw_row:
logger.error("Framework bp_security_v1 not found")
return None
framework_uuid = fw_row[0]
# Generate control_id if not set
if not control.control_id:
domain = _detect_domain(control.objective) if control.objective else "SEC"
control.control_id = self._generate_control_id(domain, self.db)
result = self.db.execute(
text("""
INSERT INTO canonical_controls (
framework_id, control_id, title, objective, rationale,
scope, requirements, test_procedure, evidence,
severity, risk_score, implementation_effort,
open_anchors, release_state, tags,
license_rule, source_original_text, source_citation,
customer_visible, generation_metadata
) VALUES (
:framework_id, :control_id, :title, :objective, :rationale,
:scope, :requirements, :test_procedure, :evidence,
:severity, :risk_score, :implementation_effort,
:open_anchors, :release_state, :tags,
:license_rule, :source_original_text, :source_citation,
:customer_visible, :generation_metadata
)
ON CONFLICT (framework_id, control_id) DO NOTHING
RETURNING id
"""),
{
"framework_id": framework_uuid,
"control_id": control.control_id,
"title": control.title,
"objective": control.objective,
"rationale": control.rationale,
"scope": json.dumps(control.scope),
"requirements": json.dumps(control.requirements),
"test_procedure": json.dumps(control.test_procedure),
"evidence": json.dumps(control.evidence),
"severity": control.severity,
"risk_score": control.risk_score,
"implementation_effort": control.implementation_effort,
"open_anchors": json.dumps(control.open_anchors),
"release_state": control.release_state,
"tags": json.dumps(control.tags),
"license_rule": control.license_rule,
"source_original_text": control.source_original_text,
"source_citation": json.dumps(control.source_citation) if control.source_citation else None,
"customer_visible": control.customer_visible,
"generation_metadata": json.dumps(control.generation_metadata) if control.generation_metadata else None,
},
)
self.db.commit()
row = result.fetchone()
return str(row[0]) if row else None
except Exception as e:
logger.error("Failed to store control %s: %s", control.control_id, e)
self.db.rollback()
return None
def _mark_chunk_processed(
self,
chunk: RAGSearchResult,
license_info: dict,
processing_path: str,
control_ids: list[str],
job_id: str,
):
"""Mark a RAG chunk as processed (Stage 7)."""
chunk_hash = hashlib.sha256(chunk.text.encode()).hexdigest()
try:
self.db.execute(
text("""
INSERT INTO canonical_processed_chunks (
chunk_hash, collection, regulation_code,
document_version, source_license, license_rule,
processing_path, generated_control_ids, job_id
) VALUES (
:hash, :collection, :regulation_code,
:doc_version, :license, :rule,
:path, :control_ids, :job_id::uuid
)
ON CONFLICT (chunk_hash, collection, document_version) DO NOTHING
"""),
{
"hash": chunk_hash,
"collection": "bp_compliance_ce", # Default, we don't track collection per result
"regulation_code": chunk.regulation_code,
"doc_version": "1.0",
"license": license_info.get("license", ""),
"rule": license_info.get("rule", 3),
"path": processing_path,
"control_ids": json.dumps(control_ids),
"job_id": job_id,
},
)
self.db.commit()
except Exception as e:
logger.warning("Failed to mark chunk processed: %s", e)
# ── Main Pipeline ──────────────────────────────────────────────────
async def run(self, config: GeneratorConfig) -> GeneratorResult:
"""Execute the full 7-stage pipeline.

Creates a job record, scans RAG for candidate chunks, turns each chunk
into a control via ``_process_single_chunk``, stores the control and marks
the chunk as processed (both skipped when ``config.dry_run`` is set), and
finally writes the accumulated result back to the job.

Args:
config: Generator settings. This method reads ``max_controls`` and
``dry_run`` itself and hands the config on to the scan and
per-chunk steps.

Returns:
A ``GeneratorResult`` carrying the job id, the per-state counters, the
generated controls as dicts, collected error messages and a final
``status`` of ``"completed"`` or ``"failed"``.
"""
result = GeneratorResult()
# Create job
# This happens before the try block, so a failure to create the job
# propagates to the caller instead of being recorded as a failed run.
job_id = self._create_job(config)
result.job_id = job_id
try:
# Stage 1: RAG Scan
chunks = await self._scan_rag(config)
result.total_chunks_scanned = len(chunks)
# Nothing to do: an empty scan still counts as a completed run.
if not chunks:
result.status = "completed"
self._update_job(job_id, result)
return result
# Process chunks
controls_count = 0
for chunk in chunks:
# Stop once the configured cap is reached.
if controls_count >= config.max_controls:
break
# Errors are handled per chunk: they are logged, appended to
# result.errors, and the loop moves on to the next chunk.
try:
control = await self._process_single_chunk(chunk, config, job_id)
# None means the chunk yielded no usable control.
if control is None:
continue
# Count by state
# Any state other than the three named ones (e.g. the "draft" state set
# by _process_single_chunk) is counted as verified.
if control.release_state == "too_close":
result.controls_too_close += 1
elif control.release_state == "duplicate":
result.controls_duplicates_found += 1
elif control.release_state == "needs_review":
result.controls_needs_review += 1
else:
result.controls_verified += 1
# Store (unless dry run)
if not config.dry_run:
ctrl_uuid = self._store_control(control, job_id)
# The chunk is only marked as processed when the store returned an id.
if ctrl_uuid:
# Stage 7: Mark chunk processed
# The license is classified again here because the per-chunk step
# does not return its classification.
license_info = self._classify_license(chunk)
path = "llm_reform" if license_info["rule"] == 3 else "structured"
self._mark_chunk_processed(chunk, license_info, path, [ctrl_uuid], job_id)
# NOTE(review): confirm whether the three updates below are meant to run
# for dry runs and failed stores as well; their nesting relative to the
# store step decides what max_controls actually limits.
result.controls_generated += 1
result.controls.append(asdict(control))
controls_count += 1
# Add to existing controls for harmonization of next chunks
# Skipped while the existing-controls cache is still None.
if self._existing_controls is not None:
self._existing_controls.append({
"control_id": control.control_id,
"title": control.title,
"objective": control.objective,
})
except Exception as e:
error_msg = f"Error processing chunk {chunk.regulation_code}/{chunk.article}: {e}"
logger.error(error_msg)
result.errors.append(error_msg)
result.status = "completed"
# Anything that escapes the per-chunk handling (e.g. the scan itself)
# marks the whole run as failed.
except Exception as e:
result.status = "failed"
result.errors.append(str(e))
logger.error("Pipeline failed: %s", e)
# Persist the final counters and status on both the success and failure path.
self._update_job(job_id, result)
return result
async def _process_single_chunk(
self,
chunk: RAGSearchResult,
config: GeneratorConfig,
job_id: str,
) -> GeneratedControl | None:
"""Process a single chunk through stages 2-5.

Classifies the chunk's license, builds a control via the rule-specific
path, checks it against existing controls and searches for open anchors.

Args:
chunk: The RAG result to turn into a control.
config: Generator settings; ``domain`` and ``skip_web_search`` are read
here and the config is passed to the reformulation step.
job_id: Id of the current job, recorded in the control's metadata.

Returns:
The control, or ``None`` when the generated control has no title or no
objective. The returned control's ``release_state`` is ``"too_close"``
(similarity check failed on the reformulation path), ``"duplicate"``
(harmonization found similar controls), ``"draft"`` or
``"needs_review"``.
"""
# Stage 2: License classification
license_info = self._classify_license(chunk)
# Stage 3: Structure or Reform based on rule
if license_info["rule"] == 1:
control = await self._structure_free_use(chunk, license_info)
elif license_info["rule"] == 2:
control = await self._structure_with_citation(chunk, license_info)
else:
# Every rule other than 1 or 2 goes through reformulation.
control = await self._llm_reformulate(chunk, config)
# Too-Close-Check for Rule 3
# Compares the source text with the generated objective + rationale.
similarity = await check_similarity(chunk.text, f"{control.objective} {control.rationale}")
if similarity.status == "FAIL":
control.release_state = "too_close"
control.generation_metadata["similarity_status"] = "FAIL"
control.generation_metadata["similarity_scores"] = {
"token_overlap": similarity.token_overlap,
"ngram_jaccard": similarity.ngram_jaccard,
"lcs_ratio": similarity.lcs_ratio,
}
# NOTE(review): this early return happens before control_id and the
# job_id metadata are assigned at the end of the method — confirm the
# caller can store a control without them.
return control
# A control without title or objective is treated as unusable.
if not control.title or not control.objective:
return None
# Stage 4: Harmonization
duplicates = await self._check_harmonization(control)
if duplicates:
control.release_state = "duplicate"
control.generation_metadata["similar_controls"] = duplicates
# NOTE(review): same as above — returned before control_id/job_id are set.
return control
# Stage 5: Anchor Search (imported from anchor_finder)
# Best-effort: a failure is logged and open_anchors keeps its prior value.
try:
from .anchor_finder import AnchorFinder
finder = AnchorFinder(self.rag)
anchors = await finder.find_anchors(control, skip_web=config.skip_web_search)
# Dataclass anchors are converted to dicts; anything else is kept as returned.
control.open_anchors = [asdict(a) if hasattr(a, '__dataclass_fields__') else a for a in anchors]
except Exception as e:
logger.warning("Anchor search failed: %s", e)
# Determine release state
# Rule 1/2 controls are always drafts; other controls are drafts only when
# at least one open anchor was found, otherwise they need review.
if control.license_rule in (1, 2):
control.release_state = "draft"
elif control.open_anchors:
control.release_state = "draft"
else:
control.release_state = "needs_review"
# Generate control_id
# An explicit config.domain wins; otherwise the domain is detected from the objective.
domain = config.domain or _detect_domain(control.objective)
control.control_id = self._generate_control_id(domain, self.db)
# Store job_id in metadata
control.generation_metadata["job_id"] = job_id
return control

View File

@@ -0,0 +1,103 @@
-- Migration 046: Control Generator Pipeline
-- Adds job tracking, chunk tracking, blocked sources, and extends canonical_controls
-- for the 3-license-rule system (free_use, citation_required, restricted).
BEGIN;
-- =============================================================================
-- 1. Job-Tracking for Generator Runs
-- =============================================================================
-- One row per generator run.
-- NOTE(review): presumably written by the pipeline's job create/update steps — confirm.
CREATE TABLE IF NOT EXISTS canonical_generation_jobs (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Lifecycle state of the run; only the five values listed in the CHECK are accepted.
status VARCHAR(20) DEFAULT 'pending'
CHECK (status IN ('pending', 'running', 'completed', 'failed', 'cancelled')),
-- Generator settings for this run as JSON; required.
config JSONB NOT NULL,
-- Counters, all starting at 0. Their names match the fields the pipeline
-- accumulates on its result object while processing chunks.
total_chunks_scanned INTEGER DEFAULT 0,
controls_generated INTEGER DEFAULT 0,
controls_verified INTEGER DEFAULT 0,
controls_needs_review INTEGER DEFAULT 0,
controls_too_close INTEGER DEFAULT 0,
controls_duplicates_found INTEGER DEFAULT 0,
-- Collected error messages; defaults to an empty JSON array.
errors JSONB DEFAULT '[]',
-- started_at / completed_at have no default and stay NULL until set by the writer.
started_at TIMESTAMPTZ,
completed_at TIMESTAMPTZ,
-- Row creation time.
created_at TIMESTAMPTZ DEFAULT NOW()
);
-- =============================================================================
-- 2. Tracking which RAG chunks have been processed
-- =============================================================================
-- One row per RAG chunk that has already been run through the generator.
CREATE TABLE IF NOT EXISTS canonical_processed_chunks (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- Hex digest of the chunk text; 64 characters fits a SHA-256 hex digest,
-- which is what the pipeline writes.
chunk_hash VARCHAR(64) NOT NULL,
collection VARCHAR(100) NOT NULL,
regulation_code VARCHAR(100),
document_version VARCHAR(50),
source_license VARCHAR(50),
-- 1 = free_use, 2 = citation_required, 3 = restricted.
license_rule INTEGER NOT NULL
CHECK (license_rule IN (1, 2, 3)),
-- How the chunk was handled; restricted to the three values in the CHECK.
processing_path VARCHAR(20) NOT NULL
CHECK (processing_path IN ('structured', 'llm_reform', 'skipped')),
-- JSON array of the control ids generated from this chunk; empty by default.
generated_control_ids JSONB DEFAULT '[]',
-- Generator run that processed the chunk.
job_id UUID REFERENCES canonical_generation_jobs(id),
processed_at TIMESTAMPTZ DEFAULT NOW(),
-- Deduplication key; the pipeline's INSERT uses it as its ON CONFLICT target.
-- NOTE(review): document_version is nullable, and PostgreSQL treats NULLs as
-- distinct in UNIQUE constraints by default, so rows with a NULL
-- document_version never conflict — confirm every writer sets it.
UNIQUE (chunk_hash, collection, document_version)
);
-- Lookup indexes for filtering by collection, regulation and job.
CREATE INDEX IF NOT EXISTS idx_cpc_collection ON canonical_processed_chunks(collection);
CREATE INDEX IF NOT EXISTS idx_cpc_regulation ON canonical_processed_chunks(regulation_code);
CREATE INDEX IF NOT EXISTS idx_cpc_job ON canonical_processed_chunks(job_id);
-- =============================================================================
-- 3. Blocked Sources (Rule 3 documents to be deleted after generation)
-- =============================================================================
-- Source documents that are blocked from use and tracked for deletion.
CREATE TABLE IF NOT EXISTS canonical_blocked_sources (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- At most one row per regulation code.
regulation_code VARCHAR(100) NOT NULL UNIQUE,
document_title VARCHAR(500) NOT NULL,
-- Default reason is German for: "Not commercially usable — must not be processed with AI".
reason VARCHAR(500) DEFAULT 'Kommerziell nicht nutzbar — darf nicht mit KI verarbeitet werden',
-- Deletion workflow state; only the three values in the CHECK are accepted.
deletion_status VARCHAR(20) DEFAULT 'pending'
CHECK (deletion_status IN ('pending', 'marked_for_deletion', 'deleted')),
-- NOTE(review): presumably the Qdrant collection holding the document's vectors — confirm.
qdrant_collection VARCHAR(100),
original_file_path TEXT,
-- marked_at defaults to insert time; deleted_at has no default and stays NULL until set.
marked_at TIMESTAMPTZ DEFAULT NOW(),
deleted_at TIMESTAMPTZ
);
-- =============================================================================
-- 4. Extend canonical_controls: release_state + 3-rule columns
-- =============================================================================
-- Widen the release_state CHECK constraint (a CHECK, not a PostgreSQL enum type)
-- to include the generator states needs_review, too_close and duplicate.
-- Dropping with IF EXISTS before re-adding keeps this step re-runnable.
-- NOTE(review): assumes the existing constraint carries the name
-- canonical_controls_release_state_check — confirm against the migration that created it.
ALTER TABLE canonical_controls DROP CONSTRAINT IF EXISTS canonical_controls_release_state_check;
ALTER TABLE canonical_controls ADD CONSTRAINT canonical_controls_release_state_check
CHECK (release_state IN ('draft', 'review', 'approved', 'deprecated', 'needs_review', 'too_close', 'duplicate'));
-- License rule: 1 = free_use, 2 = citation_required, 3 = restricted
-- No CHECK is attached here; NULL is the default for rows that predate this migration.
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
license_rule INTEGER DEFAULT NULL;
-- Original text from source (Rule 1+2 only; Rule 3 = always NULL)
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
source_original_text TEXT DEFAULT NULL;
-- Citation info (Rule 1+2 only; Rule 3 = always NULL)
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
source_citation JSONB DEFAULT NULL;
-- Whether source info may be shown to customers; existing rows get true.
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
customer_visible BOOLEAN DEFAULT true;
-- Generation metadata (internal only, never shown to customers)
ALTER TABLE canonical_controls ADD COLUMN IF NOT EXISTS
generation_metadata JSONB DEFAULT NULL;
-- Index for filtering by license rule and customer visibility
CREATE INDEX IF NOT EXISTS idx_canonical_controls_license_rule ON canonical_controls(license_rule);
CREATE INDEX IF NOT EXISTS idx_canonical_controls_customer_visible ON canonical_controls(customer_visible);
COMMIT;

View File

@@ -0,0 +1,342 @@
"""Tests for Control Generator Pipeline."""
import json
import pytest
from unittest.mock import AsyncMock, MagicMock, patch
from compliance.services.control_generator import (
_classify_regulation,
_detect_domain,
_parse_llm_json,
GeneratorConfig,
GeneratedControl,
ControlGeneratorPipeline,
REGULATION_LICENSE_MAP,
)
from compliance.services.anchor_finder import AnchorFinder, OpenAnchor
from compliance.services.rag_client import RAGSearchResult
# =============================================================================
# License Mapping Tests
# =============================================================================
class TestLicenseMapping:
    """Checks that regulation codes resolve to the expected license rule (1, 2 or 3)."""

    def test_rule1_eu_law(self):
        """The EU regulation code is Rule 1 and resolves to the name DSGVO."""
        classification = _classify_regulation("eu_2016_679")
        assert classification["rule"] == 1
        assert classification["name"] == "DSGVO"

    def test_rule1_nist(self):
        """A NIST publication is Rule 1 and keeps NIST in its name."""
        classification = _classify_regulation("nist_sp_800_53")
        assert classification["rule"] == 1
        assert "NIST" in classification["name"]

    def test_rule1_german_law(self):
        """A German law code is Rule 1 and resolves to its short name."""
        classification = _classify_regulation("bdsg")
        assert classification["rule"] == 1
        assert classification["name"] == "BDSG"

    def test_rule2_owasp(self):
        """OWASP is Rule 2 and carries an attribution entry."""
        classification = _classify_regulation("owasp_asvs")
        assert classification["rule"] == 2
        assert "OWASP" in classification["name"]
        assert "attribution" in classification

    def test_rule2_enisa_prefix(self):
        """Codes starting with the ENISA prefix are Rule 2."""
        classification = _classify_regulation("enisa_iot_security")
        assert classification["rule"] == 2
        assert "ENISA" in classification["name"]

    def test_rule3_bsi_prefix(self):
        """BSI codes are Rule 3 and are reported under the internal-only name."""
        classification = _classify_regulation("bsi_tr03161")
        assert classification["rule"] == 3
        assert classification["name"] == "INTERNAL_ONLY"

    def test_rule3_iso_prefix(self):
        """ISO codes are Rule 3."""
        assert _classify_regulation("iso_27001")["rule"] == 3

    def test_rule3_etsi_prefix(self):
        """ETSI codes are Rule 3."""
        assert _classify_regulation("etsi_en_303_645")["rule"] == 3

    def test_unknown_defaults_to_rule3(self):
        """An unmapped code falls back to Rule 3."""
        assert _classify_regulation("some_unknown_source")["rule"] == 3

    def test_case_insensitive(self):
        """Upper-case codes classify like their lower-case form."""
        assert _classify_regulation("EU_2016_679")["rule"] == 1

    def test_all_mapped_regulations_have_valid_rules(self):
        """Every entry of the license map uses rule 1, 2 or 3."""
        for code, entry in REGULATION_LICENSE_MAP.items():
            rule = entry["rule"]
            assert rule in (1, 2, 3), f"{code} has invalid rule {rule}"

    def test_rule3_never_exposes_names(self):
        """Rule 3 prefixes never leak a source name."""
        for code in ("bsi_test", "iso_test", "etsi_test"):
            name = _classify_regulation(code)["name"]
            assert name == "INTERNAL_ONLY", f"{code} exposes name: {name}"
# =============================================================================
# Domain Detection Tests
# =============================================================================
class TestDomainDetection:
    """Checks the domain code derived from free text by ``_detect_domain``."""

    def test_auth_domain(self):
        """Authentication wording maps to AUTH."""
        detected = _detect_domain("Multi-factor authentication and password policy")
        assert detected == "AUTH"

    def test_crypto_domain(self):
        """Encryption and certificate wording maps to CRYPT."""
        detected = _detect_domain("TLS 1.3 encryption and certificate management")
        assert detected == "CRYPT"

    def test_network_domain(self):
        """Firewall and segmentation wording maps to NET."""
        detected = _detect_domain("Firewall rules and network segmentation")
        assert detected == "NET"

    def test_data_domain(self):
        """German data-protection wording maps to DATA."""
        detected = _detect_domain("DSGVO personenbezogene Daten Datenschutz")
        assert detected == "DATA"

    def test_default_domain(self):
        """Text without any known keyword falls back to SEC."""
        detected = _detect_domain("random unrelated text xyz")
        assert detected == "SEC"
# =============================================================================
# JSON Parsing Tests
# =============================================================================
class TestJsonParsing:
    """Checks how ``_parse_llm_json`` copes with the shapes an LLM reply can take."""

    def test_parse_plain_json(self):
        """A bare JSON object is parsed as-is."""
        parsed = _parse_llm_json('{"title": "Test", "objective": "Test obj"}')
        assert parsed["title"] == "Test"

    def test_parse_markdown_fenced_json(self):
        """A JSON object inside a markdown code fence is extracted."""
        parsed = _parse_llm_json('```json\n{"title": "Test"}\n```')
        assert parsed["title"] == "Test"

    def test_parse_json_with_preamble(self):
        """Leading prose before the JSON object is ignored."""
        parsed = _parse_llm_json('Here is the result:\n{"title": "Test"}')
        assert parsed["title"] == "Test"

    def test_parse_invalid_json(self):
        """Input without any JSON yields an empty dict."""
        parsed = _parse_llm_json("not json at all")
        assert parsed == {}
# =============================================================================
# GeneratedControl Rule Tests
# =============================================================================
class TestGeneratedControlRules:
    """Pins the field combinations expected for each of the three license rules."""

    def test_rule1_has_original_text(self):
        """A Rule 1 control carries original text, a citation and is customer-visible."""
        control = GeneratedControl(license_rule=1)
        control.source_original_text = "Original EU law text"
        control.source_citation = {"source": "DSGVO Art. 35", "license": "EU_LAW"}
        control.customer_visible = True

        assert control.source_original_text is not None
        assert control.source_citation is not None
        assert control.customer_visible is True

    def test_rule2_has_citation(self):
        """A Rule 2 control carries a citation naming its CC license."""
        control = GeneratedControl(license_rule=2)
        control.source_citation = {"source": "OWASP ASVS V2.1", "license": "CC-BY-SA-4.0"}
        control.customer_visible = True

        assert control.source_citation is not None
        assert "CC-BY-SA" in control.source_citation["license"]

    def test_rule3_no_original_no_citation(self):
        """A Rule 3 control has no original text, no citation and no source names."""
        control = GeneratedControl(license_rule=3)
        control.source_original_text = None
        control.source_citation = None
        control.customer_visible = False
        control.generation_metadata = {"processing_path": "llm_reform", "license_rule": 3}

        assert control.source_original_text is None
        assert control.source_citation is None
        assert control.customer_visible is False

        # generation_metadata must NOT contain source names
        serialized = json.dumps(control.generation_metadata)
        lowered = serialized.lower()
        assert "bsi" not in lowered
        assert "iso" not in lowered
        assert "TR-03161" not in serialized
# =============================================================================
# Anchor Finder Tests
# =============================================================================
class TestAnchorFinder:
    """Anchor lookup: license filtering of RAG hits and URL-based framework detection."""

    @pytest.mark.asyncio
    async def test_rag_anchor_search_filters_restricted(self):
        """Only Rule 1+2 sources are returned as anchors."""
        owasp_hit = RAGSearchResult(
            text="OWASP requirement",
            regulation_code="owasp_asvs",
            regulation_name="OWASP ASVS",
            regulation_short="OWASP",
            category="requirement",
            article="V2.1.1",
            paragraph="",
            source_url="https://owasp.org",
            score=0.9,
        )
        bsi_hit = RAGSearchResult(
            text="BSI requirement",
            regulation_code="bsi_tr03161",
            regulation_name="BSI TR-03161",
            regulation_short="BSI",
            category="requirement",
            article="O.Auth_1",
            paragraph="",
            source_url="",
            score=0.85,
        )
        rag = AsyncMock()
        rag.search.return_value = [owasp_hit, bsi_hit]

        finder = AnchorFinder(rag_client=rag)
        control = GeneratedControl(title="Test Auth Control", tags=["auth"])
        found = await finder.find_anchors(control, skip_web=True)

        # Only OWASP should be returned (Rule 2), BSI should be filtered out (Rule 3)
        assert len(found) == 1
        assert found[0].framework == "OWASP ASVS"

    @pytest.mark.asyncio
    async def test_web_search_identifies_frameworks(self):
        """Known framework hosts are recognised; unknown hosts yield None."""
        finder = AnchorFinder()
        known_hosts = (
            ("https://owasp.org/asvs", "OWASP"),
            ("https://csrc.nist.gov/sp800-53", "NIST"),
            ("https://www.enisa.europa.eu/pub", "ENISA"),
        )
        for url, framework in known_hosts:
            assert finder._identify_framework_from_url(url) == framework
        assert finder._identify_framework_from_url("https://random-site.com") is None
# =============================================================================
# Pipeline Integration Tests (Mocked)
# =============================================================================
class TestPipelineMocked:
    """Tests for the pipeline with mocked DB and external services."""

    def _make_chunk(self, regulation_code: str = "owasp_asvs", article: str = "V2.1.1"):
        """Build a RAG result with OWASP-flavoured defaults; code and article are overridable."""
        return RAGSearchResult(
            text="Applications must implement multi-factor authentication.",
            regulation_code=regulation_code,
            regulation_name="OWASP ASVS",
            regulation_short="OWASP",
            category="requirement",
            article=article,
            paragraph="",
            source_url="https://owasp.org",
            score=0.9,
        )

    @pytest.mark.asyncio
    async def test_rule1_processing_path(self):
        """An EU-law chunk is classified as Rule 1 by the pipeline."""
        chunk = self._make_chunk(regulation_code="eu_2016_679", article="Art. 35")
        chunk.text = "Die Datenschutz-Folgenabschaetzung ist durchzufuehren."
        chunk.regulation_name = "DSGVO"

        mock_db = MagicMock()
        mock_db.execute.return_value.fetchone.return_value = None
        pipeline = ControlGeneratorPipeline(db=mock_db)

        license_info = pipeline._classify_license(chunk)
        assert license_info["rule"] == 1

    @pytest.mark.asyncio
    async def test_rule3_processing_blocks_source_info(self):
        """Rule 3 must never store original text or source names."""
        mock_db = MagicMock()
        mock_rag = AsyncMock()
        pipeline = ControlGeneratorPipeline(db=mock_db, rag_client=mock_rag)

        # Simulate LLM response
        llm_response = json.dumps({
            "title": "Secure Password Storage",
            "objective": "Passwords must be hashed with modern algorithms.",
            "rationale": "Prevents credential theft.",
            "requirements": ["Use bcrypt or argon2"],
            "test_procedure": ["Verify hash algorithm"],
            "evidence": ["Config review"],
            "severity": "high",
            "tags": ["auth", "password"],
        })

        with patch("compliance.services.control_generator._llm_chat", return_value=llm_response):
            chunk = self._make_chunk(regulation_code="bsi_tr03161", article="O.Auth_1")
            config = GeneratorConfig(max_controls=1)
            control = await pipeline._llm_reformulate(chunk, config)

        assert control.license_rule == 3
        assert control.source_original_text is None
        assert control.source_citation is None
        assert control.customer_visible is False

        # Verify no BSI references in metadata
        metadata_str = json.dumps(control.generation_metadata)
        assert "bsi" not in metadata_str.lower()
        assert "BSI" not in metadata_str
        assert "TR-03161" not in metadata_str

    @pytest.mark.asyncio
    async def test_chunk_hash_deduplication(self):
        """Same chunk text produces same hash — no double processing.

        Exercises the pipeline's own bookkeeping instead of hashing a literal
        twice: two calls for identical text must write the same SHA-256 digest,
        which is what the table's unique key deduplicates on.
        """
        import hashlib

        mock_db = MagicMock()
        pipeline = ControlGeneratorPipeline(db=mock_db)
        # Ignore anything the constructor may have executed on the session.
        mock_db.reset_mock()

        license_info = {"license": "CC-BY-SA-4.0", "rule": 2}
        for _ in range(2):
            pipeline._mark_chunk_processed(
                self._make_chunk(), license_info, "structured", ["ctrl-1"], "job-1"
            )

        # execute() is called as execute(statement, params); pick the params dict.
        recorded = [call.args[1]["hash"] for call in mock_db.execute.call_args_list]
        expected = hashlib.sha256(self._make_chunk().text.encode()).hexdigest()
        assert recorded == [expected, expected]

    def test_config_defaults(self):
        """A default config caps at 50 controls, batches by 5, skips processed chunks and stores results."""
        config = GeneratorConfig()
        assert config.max_controls == 50
        assert config.batch_size == 5
        assert config.skip_processed is True
        assert config.dry_run is False

    @pytest.mark.asyncio
    async def test_structure_free_use_produces_citation(self):
        """Rule 1 structuring includes source citation."""
        mock_db = MagicMock()
        pipeline = ControlGeneratorPipeline(db=mock_db)

        llm_response = json.dumps({
            "title": "DSFA Pflicht",
            "objective": "DSFA bei hohem Risiko durchfuehren.",
            "rationale": "Gesetzliche Pflicht nach DSGVO.",
            "requirements": ["DSFA durchfuehren"],
            "test_procedure": ["DSFA Bericht pruefen"],
            "evidence": ["DSFA Dokumentation"],
            "severity": "high",
            "tags": ["dsfa", "dsgvo"],
        })

        chunk = self._make_chunk(regulation_code="eu_2016_679", article="Art. 35")
        chunk.text = "Art. 35 DSGVO: Datenschutz-Folgenabschaetzung"
        chunk.regulation_name = "DSGVO"
        license_info = _classify_regulation("eu_2016_679")

        with patch("compliance.services.control_generator._llm_chat", return_value=llm_response):
            control = await pipeline._structure_free_use(chunk, license_info)

        assert control.license_rule == 1
        assert control.source_original_text is not None
        assert control.source_citation is not None
        assert "DSGVO" in control.source_citation["source"]
        assert control.customer_visible is True