fix: Restore all files lost during destructive rebase

A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.

This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).

Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-02-09 09:51:32 +01:00
parent f7487ee240
commit bfdaf63ba9
2009 changed files with 749983 additions and 1731 deletions

View File

@@ -0,0 +1,17 @@
"""
Compliance Services Module.
Contains business logic services for the compliance module:
- PDF extraction from BSI-TR and EU regulations
- LLM-based requirement interpretation
- Export generation
"""
from .pdf_extractor import BSIPDFExtractor, BSIAspect, EURegulationExtractor, EUArticle
__all__ = [
"BSIPDFExtractor",
"BSIAspect",
"EURegulationExtractor",
"EUArticle",
]

View File

@@ -0,0 +1,500 @@
"""
AI Compliance Assistant for Breakpilot.
Provides AI-powered features for:
- Requirement interpretation (translating legal text to technical guidance)
- Control suggestions (recommending controls for requirements)
- Risk assessment (evaluating compliance risks)
- Gap analysis (identifying missing controls)
"""
import json
import logging
import re
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from enum import Enum
from .llm_provider import LLMProvider, get_shared_provider, LLMResponse
logger = logging.getLogger(__name__)
class InterpretationSection(str, Enum):
    """Sections in a requirement interpretation.

    Mixes in ``str`` so members are also plain strings; the values match
    the JSON keys requested from the LLM in ``INTERPRETATION_PROMPT``.
    """
    SUMMARY = "summary"
    APPLICABILITY = "applicability"
    TECHNICAL_MEASURES = "technical_measures"
    AFFECTED_MODULES = "affected_modules"
    RISK_LEVEL = "risk_level"
    IMPLEMENTATION_HINTS = "implementation_hints"
@dataclass
class RequirementInterpretation:
    """AI-generated interpretation of a regulatory requirement.

    Returned by ``AIComplianceAssistant.interpret_requirement``. On failure
    an instance with empty fields, ``confidence_score=0.0`` and a non-None
    ``error`` is returned instead of raising.
    """
    requirement_id: str
    summary: str                    # short 2-3 sentence overview
    applicability: str              # how the rule applies to Breakpilot
    technical_measures: List[str]   # concrete technical measures
    affected_modules: List[str]     # impacted Breakpilot modules/services
    risk_level: str  # low, medium, high, critical
    implementation_hints: List[str]
    confidence_score: float  # 0.0 - 1.0
    raw_response: Optional[str] = None  # unparsed LLM output, for debugging
    error: Optional[str] = None         # set when the LLM call or parse failed
@dataclass
class ControlSuggestion:
    """AI-suggested control for a requirement.

    Produced by ``AIComplianceAssistant.suggest_controls`` from the
    ``controls`` array of the LLM's JSON answer.
    """
    control_id: str  # Suggested ID like "PRIV-XXX"
    domain: str  # Control domain (priv, sdlc, iam, etc.)
    title: str
    description: str
    pass_criteria: str             # measurable acceptance criteria
    implementation_guidance: str   # how to implement the control
    is_automated: bool             # whether the check can be automated
    automation_tool: Optional[str] = None  # tool name, if automated
    priority: str = "medium"  # low, medium, high, critical
    confidence_score: float = 0.0  # heuristic confidence, 0.0 - 1.0
@dataclass
class RiskAssessment:
    """AI-generated risk assessment for a module.

    Returned by ``AIComplianceAssistant.assess_module_risk``; on failure
    ``overall_risk`` is "unknown" and ``confidence_score`` is 0.0.
    """
    module_name: str
    overall_risk: str  # low, medium, high, critical
    risk_factors: List[Dict[str, Any]]  # dicts with factor/severity/likelihood
    recommendations: List[str]          # mitigation recommendations
    compliance_gaps: List[str]          # identified compliance gaps
    confidence_score: float = 0.0
@dataclass
class GapAnalysis:
    """Gap analysis result for requirement-control mapping.

    Returned by ``AIComplianceAssistant.analyze_gap``; on failure
    ``coverage_level`` is "unknown" and the lists are empty.
    """
    requirement_id: str
    requirement_title: str
    coverage_level: str  # full, partial, none
    existing_controls: List[str]   # control IDs already mapped
    missing_coverage: List[str]    # aspects not yet covered
    suggested_actions: List[str]   # recommended follow-up actions
class AIComplianceAssistant:
    """
    AI-powered compliance assistant using LLM providers.

    Supports both Claude API and self-hosted LLMs through the
    abstracted LLMProvider interface.

    Design notes:
    - Every public method is async and delegates to ``self.llm.complete``.
    - Responses are expected to be JSON; ``_parse_json_response`` tolerates
      markdown code fences and surrounding prose.
    - Failures are logged and converted into "empty" result objects (or an
      empty list) rather than raised, so bulk processing never aborts.
    """

    # System prompts for different tasks
    SYSTEM_PROMPT_BASE = """Du bist ein Compliance-Experte für die Breakpilot Bildungsplattform.
Breakpilot ist ein EdTech SaaS-System mit folgenden Eigenschaften:
- KI-gestützte Klausurkorrektur und Feedback
- Videokonferenzen (Jitsi) und Chat (Matrix)
- Schulverwaltung mit Noten und Zeugnissen
- Consent-Management und DSGVO-Compliance
- Self-Hosted in Deutschland
Du analysierst regulatorische Anforderungen und gibst konkrete technische Empfehlungen."""

    INTERPRETATION_PROMPT = """Analysiere folgende regulatorische Anforderung für Breakpilot:
Verordnung: {regulation_name} ({regulation_code})
Artikel: {article}
Titel: {title}
Originaltext: {requirement_text}
Erstelle eine strukturierte Analyse im JSON-Format:
{{
"summary": "Kurze Zusammenfassung in 2-3 Sätzen",
"applicability": "Erklärung wie dies auf Breakpilot anwendbar ist",
"technical_measures": ["Liste konkreter technischer Maßnahmen"],
"affected_modules": ["Liste betroffener Breakpilot-Module (z.B. consent-service, klausur-service, matrix-synapse)"],
"risk_level": "low|medium|high|critical",
"implementation_hints": ["Konkrete Implementierungshinweise"]
}}
Gib NUR das JSON zurück, keine zusätzlichen Erklärungen."""

    CONTROL_SUGGESTION_PROMPT = """Basierend auf folgender Anforderung, schlage passende Controls vor:
Verordnung: {regulation_name}
Anforderung: {requirement_title}
Beschreibung: {requirement_text}
Betroffene Module: {affected_modules}
Schlage 1-3 Controls im JSON-Format vor:
{{
"controls": [
{{
"control_id": "DOMAIN-XXX",
"domain": "priv|iam|sdlc|crypto|ops|ai|cra|gov|aud",
"title": "Kurzer Titel",
"description": "Beschreibung des Controls",
"pass_criteria": "Messbare Erfolgskriterien",
"implementation_guidance": "Wie implementieren",
"is_automated": true|false,
"automation_tool": "Tool-Name oder null",
"priority": "low|medium|high|critical"
}}
]
}}
Domains:
- priv: Datenschutz & Privacy (DSGVO)
- iam: Identity & Access Management
- sdlc: Secure Development Lifecycle
- crypto: Kryptografie
- ops: Betrieb & Monitoring
- ai: KI-spezifisch (AI Act)
- cra: Cyber Resilience Act
- gov: Governance
- aud: Audit & Nachvollziehbarkeit
Gib NUR das JSON zurück."""

    RISK_ASSESSMENT_PROMPT = """Bewerte das Compliance-Risiko für folgendes Breakpilot-Modul:
Modul: {module_name}
Typ: {service_type}
Beschreibung: {description}
Verarbeitet PII: {processes_pii}
KI-Komponenten: {ai_components}
Kritikalität: {criticality}
Daten-Kategorien: {data_categories}
Zugeordnete Verordnungen: {regulations}
Erstelle eine Risikobewertung im JSON-Format:
{{
"overall_risk": "low|medium|high|critical",
"risk_factors": [
{{"factor": "Beschreibung", "severity": "low|medium|high", "likelihood": "low|medium|high"}}
],
"recommendations": ["Empfehlungen zur Risikominderung"],
"compliance_gaps": ["Identifizierte Compliance-Lücken"]
}}
Gib NUR das JSON zurück."""

    GAP_ANALYSIS_PROMPT = """Analysiere die Control-Abdeckung für folgende Anforderung:
Anforderung: {requirement_title}
Verordnung: {regulation_code}
Beschreibung: {requirement_text}
Existierende Controls:
{existing_controls}
Bewerte die Abdeckung und identifiziere Lücken im JSON-Format:
{{
"coverage_level": "full|partial|none",
"covered_aspects": ["Was ist bereits abgedeckt"],
"missing_coverage": ["Was fehlt noch"],
"suggested_actions": ["Empfohlene Maßnahmen"]
}}
Gib NUR das JSON zurück."""

    def __init__(self, llm_provider: Optional[LLMProvider] = None):
        """Initialize the assistant with an LLM provider.

        Args:
            llm_provider: Explicit provider to use; when omitted, the
                shared provider from ``get_shared_provider()`` is used.
        """
        self.llm = llm_provider or get_shared_provider()

    async def interpret_requirement(
        self,
        requirement_id: str,
        article: str,
        title: str,
        requirement_text: str,
        regulation_code: str,
        regulation_name: str
    ) -> RequirementInterpretation:
        """
        Generate an interpretation for a regulatory requirement.

        Translates legal text into practical technical guidance
        for the Breakpilot development team.

        Returns:
            A populated RequirementInterpretation; on any LLM or parsing
            failure, an empty one with ``error`` set (never raises).
        """
        prompt = self.INTERPRETATION_PROMPT.format(
            regulation_name=regulation_name,
            regulation_code=regulation_code,
            article=article,
            title=title,
            requirement_text=requirement_text or "Kein Text verfügbar"
        )
        try:
            # Low temperature: we want deterministic, structured output.
            response = await self.llm.complete(
                prompt=prompt,
                system_prompt=self.SYSTEM_PROMPT_BASE,
                max_tokens=2000,
                temperature=0.3
            )
            # Parse JSON response; missing keys fall back to safe defaults.
            data = self._parse_json_response(response.content)
            return RequirementInterpretation(
                requirement_id=requirement_id,
                summary=data.get("summary", ""),
                applicability=data.get("applicability", ""),
                technical_measures=data.get("technical_measures", []),
                affected_modules=data.get("affected_modules", []),
                risk_level=data.get("risk_level", "medium"),
                implementation_hints=data.get("implementation_hints", []),
                confidence_score=0.85,  # Based on model quality
                raw_response=response.content
            )
        except Exception as e:
            logger.error(f"Failed to interpret requirement {requirement_id}: {e}")
            return RequirementInterpretation(
                requirement_id=requirement_id,
                summary="",
                applicability="",
                technical_measures=[],
                affected_modules=[],
                risk_level="medium",
                implementation_hints=[],
                confidence_score=0.0,
                error=str(e)
            )

    async def suggest_controls(
        self,
        requirement_title: str,
        requirement_text: str,
        regulation_name: str,
        affected_modules: List[str]
    ) -> List[ControlSuggestion]:
        """
        Suggest controls for a given requirement.

        Returns a list of control suggestions with implementation guidance.
        On failure an empty list is returned (never raises).
        """
        prompt = self.CONTROL_SUGGESTION_PROMPT.format(
            regulation_name=regulation_name,
            requirement_title=requirement_title,
            requirement_text=requirement_text or "Keine Beschreibung",
            affected_modules=", ".join(affected_modules) if affected_modules else "Alle Module"
        )
        try:
            response = await self.llm.complete(
                prompt=prompt,
                system_prompt=self.SYSTEM_PROMPT_BASE,
                max_tokens=2000,
                temperature=0.4
            )
            data = self._parse_json_response(response.content)
            controls = data.get("controls", [])
            # Skip malformed (non-dict) entries so one bad item does not
            # discard the whole batch.
            return [
                ControlSuggestion(
                    control_id=c.get("control_id", "NEW-001"),
                    domain=c.get("domain", "gov"),
                    title=c.get("title", ""),
                    description=c.get("description", ""),
                    pass_criteria=c.get("pass_criteria", ""),
                    implementation_guidance=c.get("implementation_guidance", ""),
                    is_automated=c.get("is_automated", False),
                    automation_tool=c.get("automation_tool"),
                    priority=c.get("priority", "medium"),
                    confidence_score=0.75
                )
                for c in controls
                if isinstance(c, dict)
            ]
        except Exception as e:
            logger.error(f"Failed to suggest controls: {e}")
            return []

    async def assess_module_risk(
        self,
        module_name: str,
        service_type: str,
        description: str,
        processes_pii: bool,
        ai_components: bool,
        criticality: str,
        data_categories: List[str],
        regulations: List[Dict[str, str]]
    ) -> RiskAssessment:
        """
        Assess the compliance risk for a service module.

        Args:
            regulations: Dicts with a required "code" key and an optional
                "relevance" key (defaults to "medium" in the prompt).

        Returns:
            A RiskAssessment; on failure ``overall_risk`` is "unknown".
        """
        prompt = self.RISK_ASSESSMENT_PROMPT.format(
            module_name=module_name,
            service_type=service_type,
            description=description or "Keine Beschreibung",
            processes_pii="Ja" if processes_pii else "Nein",
            ai_components="Ja" if ai_components else "Nein",
            criticality=criticality,
            data_categories=", ".join(data_categories) if data_categories else "Keine",
            regulations=", ".join([f"{r['code']} ({r.get('relevance', 'medium')})" for r in regulations]) if regulations else "Keine"
        )
        try:
            response = await self.llm.complete(
                prompt=prompt,
                system_prompt=self.SYSTEM_PROMPT_BASE,
                max_tokens=1500,
                temperature=0.3
            )
            data = self._parse_json_response(response.content)
            return RiskAssessment(
                module_name=module_name,
                overall_risk=data.get("overall_risk", "medium"),
                risk_factors=data.get("risk_factors", []),
                recommendations=data.get("recommendations", []),
                compliance_gaps=data.get("compliance_gaps", []),
                confidence_score=0.8
            )
        except Exception as e:
            logger.error(f"Failed to assess risk for {module_name}: {e}")
            return RiskAssessment(
                module_name=module_name,
                overall_risk="unknown",
                risk_factors=[],
                recommendations=[],
                compliance_gaps=[],
                confidence_score=0.0
            )

    async def analyze_gap(
        self,
        requirement_id: str,
        requirement_title: str,
        requirement_text: str,
        regulation_code: str,
        existing_controls: List[Dict[str, str]]
    ) -> GapAnalysis:
        """
        Analyze gaps between requirements and existing controls.

        Args:
            existing_controls: Dicts that may carry "control_id", "title"
                and "status" keys; missing keys are rendered as "N/A".

        Returns:
            A GapAnalysis; on failure ``coverage_level`` is "unknown".
        """
        # Render the mapped controls as a bullet list for the prompt.
        controls_text = "\n".join([
            f"- {c.get('control_id', 'N/A')}: {c.get('title', 'N/A')} - {c.get('status', 'N/A')}"
            for c in existing_controls
        ]) if existing_controls else "Keine Controls zugeordnet"
        prompt = self.GAP_ANALYSIS_PROMPT.format(
            requirement_title=requirement_title,
            regulation_code=regulation_code,
            requirement_text=requirement_text or "Keine Beschreibung",
            existing_controls=controls_text
        )
        try:
            response = await self.llm.complete(
                prompt=prompt,
                system_prompt=self.SYSTEM_PROMPT_BASE,
                max_tokens=1500,
                temperature=0.3
            )
            data = self._parse_json_response(response.content)
            return GapAnalysis(
                requirement_id=requirement_id,
                requirement_title=requirement_title,
                coverage_level=data.get("coverage_level", "none"),
                existing_controls=[c.get("control_id", "") for c in existing_controls],
                missing_coverage=data.get("missing_coverage", []),
                suggested_actions=data.get("suggested_actions", [])
            )
        except Exception as e:
            logger.error(f"Failed to analyze gap for {requirement_id}: {e}")
            return GapAnalysis(
                requirement_id=requirement_id,
                requirement_title=requirement_title,
                coverage_level="unknown",
                existing_controls=[],
                missing_coverage=[],
                suggested_actions=[]
            )

    async def batch_interpret_requirements(
        self,
        requirements: List[Dict[str, Any]],
        rate_limit: float = 1.0
    ) -> List[RequirementInterpretation]:
        """
        Process multiple requirements with rate limiting.

        Useful for bulk processing of regulations.

        Args:
            requirements: Dicts with id/article/title/requirement_text/
                regulation_code/regulation_name keys (missing keys -> "").
            rate_limit: Seconds to sleep between consecutive LLM calls.
        """
        # Hoisted out of the loop: the original re-imported asyncio on
        # every iteration. Kept method-local to preserve the module's
        # import surface.
        import asyncio

        results = []
        for i, req in enumerate(requirements):
            if i > 0:
                # Simple rate limiting between provider calls.
                await asyncio.sleep(rate_limit)
            result = await self.interpret_requirement(
                requirement_id=req.get("id", str(i)),
                article=req.get("article", ""),
                title=req.get("title", ""),
                requirement_text=req.get("requirement_text", ""),
                regulation_code=req.get("regulation_code", ""),
                regulation_name=req.get("regulation_name", "")
            )
            results.append(result)
            logger.info(f"Processed requirement {i+1}/{len(requirements)}: {req.get('title', 'N/A')}")
        return results

    def _parse_json_response(self, content: str) -> Dict[str, Any]:
        """
        Parse JSON from LLM response, handling common formatting issues.

        Strips markdown code fences and extracts the outermost ``{...}``
        span before parsing. Returns ``{}`` when no valid JSON is found,
        so callers can rely on ``dict.get`` defaults.
        """
        # Try to extract JSON from the response
        content = content.strip()
        # Remove markdown code blocks if present
        if content.startswith("```json"):
            content = content[7:]
        elif content.startswith("```"):
            content = content[3:]
        if content.endswith("```"):
            content = content[:-3]
        content = content.strip()
        # Find JSON object in the response (greedy: outermost braces).
        json_match = re.search(r'\{[\s\S]*\}', content)
        if json_match:
            content = json_match.group(0)
        try:
            return json.loads(content)
        except json.JSONDecodeError as e:
            logger.warning(f"Failed to parse JSON response: {e}")
            logger.debug(f"Raw content: {content[:500]}")
            return {}
# Singleton instance, created lazily by get_ai_assistant() and cleared by
# reset_ai_assistant(). Shared process-wide.
_assistant_instance: Optional[AIComplianceAssistant] = None
def get_ai_assistant() -> AIComplianceAssistant:
    """Return the shared AI compliance assistant, creating it on first use."""
    global _assistant_instance
    instance = _assistant_instance
    if instance is None:
        instance = AIComplianceAssistant()
        _assistant_instance = instance
    return instance
def reset_ai_assistant():
    """Reset the shared assistant instance (useful for testing).

    The next call to get_ai_assistant() constructs a fresh instance.
    """
    global _assistant_instance
    _assistant_instance = None

View File

@@ -0,0 +1,880 @@
"""
Audit Session PDF Report Generator.
Sprint 3 Phase 4: Generates PDF reports for completed audit sessions.
Features:
- Cover page with audit session metadata
- Executive summary with traffic light status
- Statistics pie chart (compliant/non-compliant/pending)
- Detailed checklist with sign-off status
- Digital signature verification
- Appendix with non-compliant items
Uses reportlab for PDF generation (lightweight, no external dependencies).
"""
import io
import logging
from datetime import datetime
from typing import Dict, List, Any, Optional, Tuple
from uuid import uuid4
import hashlib
from sqlalchemy.orm import Session, selectinload
from reportlab.lib import colors
from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import mm, cm
from reportlab.lib.enums import TA_CENTER, TA_LEFT, TA_RIGHT, TA_JUSTIFY
from reportlab.platypus import (
SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle,
PageBreak, Image, ListFlowable, ListItem, KeepTogether,
HRFlowable
)
from reportlab.graphics.shapes import Drawing, Rect, String
from reportlab.graphics.charts.piecharts import Pie
from ..db.models import (
AuditSessionDB, AuditSignOffDB, AuditResultEnum, AuditSessionStatusEnum,
RequirementDB, RegulationDB
)
logger = logging.getLogger(__name__)
# =============================================================================
# Color Definitions
# =============================================================================
# Report-wide palette (reportlab color objects), keyed by semantic role.
COLORS = {
    'primary': colors.HexColor('#1a365d'),    # Dark blue
    'secondary': colors.HexColor('#2c5282'),  # Medium blue
    'accent': colors.HexColor('#3182ce'),     # Light blue
    'success': colors.HexColor('#38a169'),    # Green
    'warning': colors.HexColor('#d69e2e'),    # Yellow/Orange
    'danger': colors.HexColor('#e53e3e'),     # Red
    'muted': colors.HexColor('#718096'),      # Gray
    'light': colors.HexColor('#f7fafc'),      # Light gray
    'white': colors.white,
    'black': colors.black,
}

# Fill color per audit result status string
# (keys presumably mirror AuditResultEnum values — verify against db.models).
RESULT_COLORS = {
    'compliant': COLORS['success'],
    'compliant_notes': colors.HexColor('#68d391'),  # Light green
    'non_compliant': COLORS['danger'],
    'not_applicable': COLORS['muted'],
    'pending': COLORS['warning'],
}
# =============================================================================
# Custom Styles
# =============================================================================
def get_custom_styles() -> Dict[str, ParagraphStyle]:
    """Create custom paragraph styles for the audit report.

    Returns a dict keyed by the short names the section builders use
    ('Title', 'Heading1', 'Normal', ...); each value is a ParagraphStyle
    derived from the matching reportlab sample style.
    """
    sheet = getSampleStyleSheet()
    # (dict key, style name, parent sample-sheet entry, overrides)
    specs = [
        ('Title', 'AuditTitle', 'Title', dict(
            fontSize=24, textColor=COLORS['primary'],
            spaceAfter=12*mm, alignment=TA_CENTER)),
        ('Subtitle', 'AuditSubtitle', 'Normal', dict(
            fontSize=14, textColor=COLORS['secondary'],
            spaceAfter=6*mm, alignment=TA_CENTER)),
        ('Heading1', 'AuditH1', 'Heading1', dict(
            fontSize=18, textColor=COLORS['primary'],
            spaceBefore=12*mm, spaceAfter=6*mm, borderPadding=3*mm)),
        ('Heading2', 'AuditH2', 'Heading2', dict(
            fontSize=14, textColor=COLORS['secondary'],
            spaceBefore=8*mm, spaceAfter=4*mm)),
        ('Heading3', 'AuditH3', 'Heading3', dict(
            fontSize=12, textColor=COLORS['accent'],
            spaceBefore=6*mm, spaceAfter=3*mm)),
        ('Normal', 'AuditNormal', 'Normal', dict(
            fontSize=10, textColor=COLORS['black'],
            spaceAfter=3*mm, alignment=TA_JUSTIFY)),
        ('Small', 'AuditSmall', 'Normal', dict(
            fontSize=8, textColor=COLORS['muted'], spaceAfter=2*mm)),
        ('Footer', 'AuditFooter', 'Normal', dict(
            fontSize=8, textColor=COLORS['muted'], alignment=TA_CENTER)),
        ('Success', 'AuditSuccess', 'Normal', dict(
            fontSize=10, textColor=COLORS['success'])),
        ('Warning', 'AuditWarning', 'Normal', dict(
            fontSize=10, textColor=COLORS['warning'])),
        ('Danger', 'AuditDanger', 'Normal', dict(
            fontSize=10, textColor=COLORS['danger'])),
    ]
    return {
        key: ParagraphStyle(style_name, parent=sheet[parent_key], **overrides)
        for key, style_name, parent_key, overrides in specs
    }
# =============================================================================
# PDF Generator Class
# =============================================================================
class AuditPDFGenerator:
"""Generates PDF reports for audit sessions."""
def __init__(self, db: Session):
self.db = db
self.styles = get_custom_styles()
self.page_width, self.page_height = A4
self.margin = 20 * mm
    def generate(
        self,
        session_id: str,
        language: str = 'de',
        include_signatures: bool = True,
    ) -> Tuple[bytes, str]:
        """
        Generate a PDF report for an audit session.

        Args:
            session_id: The audit session ID
            language: Report language ('de' or 'en')
            include_signatures: Whether to include digital signature info

        Returns:
            Tuple of (PDF bytes, filename)

        Raises:
            ValueError: If no audit session with ``session_id`` exists.
        """
        # Load session with all related data
        session = self._load_session(session_id)
        if not session:
            raise ValueError(f"Audit session {session_id} not found")
        # Load all sign-offs; map requirement_id -> sign-off for fast
        # lookup while rendering the checklist.
        signoffs = self._load_signoffs(session_id)
        signoff_map = {s.requirement_id: s for s in signoffs}
        # Load requirements for this session
        requirements = self._load_requirements(session)
        # Calculate statistics
        stats = self._calculate_statistics(session, signoffs)
        # Generate PDF into an in-memory buffer (no temp files).
        buffer = io.BytesIO()
        doc = SimpleDocTemplate(
            buffer,
            pagesize=A4,
            leftMargin=self.margin,
            rightMargin=self.margin,
            topMargin=self.margin,
            bottomMargin=self.margin,
        )
        # Build story (content): ordered flowables reportlab lays out.
        story = []
        # 1. Cover page
        story.extend(self._build_cover_page(session, language))
        story.append(PageBreak())
        # 2. Executive summary
        story.extend(self._build_executive_summary(session, stats, language))
        story.append(PageBreak())
        # 3. Statistics overview
        story.extend(self._build_statistics_section(stats, language))
        # 4. Detailed checklist
        story.extend(self._build_checklist_section(
            session, requirements, signoff_map, language
        ))
        # 5. Non-compliant items appendix (if any)
        non_compliant = [s for s in signoffs if s.result == AuditResultEnum.NON_COMPLIANT]
        if non_compliant:
            story.append(PageBreak())
            story.extend(self._build_non_compliant_appendix(
                non_compliant, requirements, language
            ))
        # 6. Signature verification (if requested); only sign-offs that
        # carry a signature hash are listed.
        if include_signatures:
            signed_items = [s for s in signoffs if s.signature_hash]
            if signed_items:
                story.append(PageBreak())
                story.extend(self._build_signature_section(signed_items, language))
        # Build the PDF
        doc.build(story)
        # Generate filename, e.g. "audit_report_Q1_Audit_20260209.pdf".
        date_str = datetime.utcnow().strftime('%Y%m%d')
        filename = f"audit_report_{session.name.replace(' ', '_')}_{date_str}.pdf"
        return buffer.getvalue(), filename
def _load_session(self, session_id: str) -> Optional[AuditSessionDB]:
"""Load an audit session by ID."""
return self.db.query(AuditSessionDB).filter(
AuditSessionDB.id == session_id
).first()
def _load_signoffs(self, session_id: str) -> List[AuditSignOffDB]:
"""Load all sign-offs for a session."""
return (
self.db.query(AuditSignOffDB)
.filter(AuditSignOffDB.session_id == session_id)
.all()
)
def _load_requirements(self, session: AuditSessionDB) -> List[RequirementDB]:
"""Load requirements for a session based on filters."""
query = self.db.query(RequirementDB).join(RegulationDB)
if session.regulation_ids:
query = query.filter(RegulationDB.code.in_(session.regulation_ids))
return query.order_by(RegulationDB.code, RequirementDB.article).all()
def _calculate_statistics(
self,
session: AuditSessionDB,
signoffs: List[AuditSignOffDB],
) -> Dict[str, Any]:
"""Calculate audit statistics."""
total = session.total_items
completed = len(signoffs)
compliant = sum(1 for s in signoffs if s.result == AuditResultEnum.COMPLIANT)
compliant_notes = sum(1 for s in signoffs if s.result == AuditResultEnum.COMPLIANT_WITH_NOTES)
non_compliant = sum(1 for s in signoffs if s.result == AuditResultEnum.NON_COMPLIANT)
not_applicable = sum(1 for s in signoffs if s.result == AuditResultEnum.NOT_APPLICABLE)
pending = total - completed
# Calculate compliance rate (excluding N/A and pending)
applicable = compliant + compliant_notes + non_compliant
compliance_rate = ((compliant + compliant_notes) / applicable * 100) if applicable > 0 else 0
return {
'total': total,
'completed': completed,
'pending': pending,
'compliant': compliant,
'compliant_notes': compliant_notes,
'non_compliant': non_compliant,
'not_applicable': not_applicable,
'completion_percentage': round((completed / total * 100) if total > 0 else 0, 1),
'compliance_rate': round(compliance_rate, 1),
'traffic_light': self._determine_traffic_light(compliance_rate, pending, total),
}
def _determine_traffic_light(
self,
compliance_rate: float,
pending: int,
total: int,
) -> str:
"""Determine traffic light status."""
pending_ratio = pending / total if total > 0 else 0
if pending_ratio > 0.3:
return 'yellow' # Too many pending items
elif compliance_rate >= 90:
return 'green'
elif compliance_rate >= 70:
return 'yellow'
else:
return 'red'
# =========================================================================
# Build Page Sections
# =========================================================================
    def _build_cover_page(
        self,
        session: AuditSessionDB,
        language: str,
    ) -> List:
        """Build the cover page.

        Returns a list of reportlab flowables: title, session name, a
        horizontal rule, a metadata table, optional description, and the
        generation timestamp. Label text follows ``language`` ('de'/'en';
        anything else falls back to German).
        """
        story = []
        # Title
        title = 'AUDIT-BERICHT' if language == 'de' else 'AUDIT REPORT'
        story.append(Spacer(1, 30*mm))
        story.append(Paragraph(title, self.styles['Title']))
        # Session name
        story.append(Paragraph(session.name, self.styles['Subtitle']))
        story.append(Spacer(1, 15*mm))
        # Horizontal rule
        story.append(HRFlowable(
            width="80%",
            thickness=1,
            color=COLORS['accent'],
            spaceAfter=15*mm,
        ))
        # Metadata table labels, per language
        labels = {
            'de': {
                'auditor': 'Auditor',
                'organization': 'Organisation',
                'status': 'Status',
                'created': 'Erstellt am',
                'started': 'Gestartet am',
                'completed': 'Abgeschlossen am',
                'regulations': 'Verordnungen',
            },
            'en': {
                'auditor': 'Auditor',
                'organization': 'Organization',
                'status': 'Status',
                'created': 'Created',
                'started': 'Started',
                'completed': 'Completed',
                'regulations': 'Regulations',
            },
        }
        # Unknown languages fall back to the German label set.
        l = labels.get(language, labels['de'])
        # Human-readable session status per language.
        status_map = {
            'draft': 'Entwurf' if language == 'de' else 'Draft',
            'in_progress': 'In Bearbeitung' if language == 'de' else 'In Progress',
            'completed': 'Abgeschlossen' if language == 'de' else 'Completed',
            'archived': 'Archiviert' if language == 'de' else 'Archived',
        }
        # Missing timestamps render as '-'; empty regulation filter as 'Alle'.
        data = [
            [l['auditor'], session.auditor_name],
            [l['organization'], session.auditor_organization or '-'],
            [l['status'], status_map.get(session.status.value, session.status.value)],
            [l['created'], session.created_at.strftime('%d.%m.%Y %H:%M') if session.created_at else '-'],
            [l['started'], session.started_at.strftime('%d.%m.%Y %H:%M') if session.started_at else '-'],
            [l['completed'], session.completed_at.strftime('%d.%m.%Y %H:%M') if session.completed_at else '-'],
            [l['regulations'], ', '.join(session.regulation_ids) if session.regulation_ids else 'Alle'],
        ]
        table = Table(data, colWidths=[50*mm, 100*mm])
        table.setStyle(TableStyle([
            ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
            ('FONTNAME', (1, 0), (1, -1), 'Helvetica'),
            ('FONTSIZE', (0, 0), (-1, -1), 11),
            ('TEXTCOLOR', (0, 0), (0, -1), COLORS['secondary']),
            ('TEXTCOLOR', (1, 0), (1, -1), COLORS['black']),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 8),
            ('TOPPADDING', (0, 0), (-1, -1), 8),
            ('ALIGN', (0, 0), (0, -1), 'RIGHT'),
            ('ALIGN', (1, 0), (1, -1), 'LEFT'),
            ('VALIGN', (0, 0), (-1, -1), 'MIDDLE'),
        ]))
        story.append(table)
        story.append(Spacer(1, 20*mm))
        # Description if available
        if session.description:
            desc_label = 'Beschreibung' if language == 'de' else 'Description'
            story.append(Paragraph(f"<b>{desc_label}:</b>", self.styles['Normal']))
            story.append(Paragraph(session.description, self.styles['Normal']))
        # Generation timestamp
        story.append(Spacer(1, 30*mm))
        gen_label = 'Generiert am' if language == 'de' else 'Generated on'
        story.append(Paragraph(
            f"{gen_label}: {datetime.utcnow().strftime('%d.%m.%Y %H:%M')} UTC",
            self.styles['Footer']
        ))
        return story
    def _build_executive_summary(
        self,
        session: AuditSessionDB,
        stats: Dict[str, Any],
        language: str,
    ) -> List:
        """Build the executive summary section.

        Renders the traffic-light banner, a key-metrics table, and the
        generated key findings. ``stats`` is the dict produced by
        ``_calculate_statistics``.
        """
        story = []
        title = 'ZUSAMMENFASSUNG' if language == 'de' else 'EXECUTIVE SUMMARY'
        story.append(Paragraph(title, self.styles['Heading1']))
        # Traffic light status: colored banner with a per-language label.
        traffic_light = stats['traffic_light']
        tl_colors = {
            'green': COLORS['success'],
            'yellow': COLORS['warning'],
            'red': COLORS['danger'],
        }
        tl_labels = {
            'de': {'green': 'GUT', 'yellow': 'AUFMERKSAMKEIT', 'red': 'KRITISCH'},
            'en': {'green': 'GOOD', 'yellow': 'ATTENTION', 'red': 'CRITICAL'},
        }
        # Create traffic light indicator (single-cell colored table).
        # NOTE(review): tl_labels[language] raises KeyError for languages
        # other than 'de'/'en' — confirm callers only pass these two.
        tl_table = Table(
            [[tl_labels[language][traffic_light]]],
            colWidths=[60*mm],
            rowHeights=[15*mm],
        )
        tl_table.setStyle(TableStyle([
            ('BACKGROUND', (0, 0), (0, 0), tl_colors[traffic_light]),
            ('TEXTCOLOR', (0, 0), (0, 0), COLORS['white']),
            ('FONTNAME', (0, 0), (0, 0), 'Helvetica-Bold'),
            ('FONTSIZE', (0, 0), (0, 0), 16),
            ('ALIGN', (0, 0), (0, 0), 'CENTER'),
            ('VALIGN', (0, 0), (0, 0), 'MIDDLE'),
            ('ROUNDEDCORNERS', [3, 3, 3, 3]),
        ]))
        story.append(tl_table)
        story.append(Spacer(1, 10*mm))
        # Key metrics table labels, per language.
        labels = {
            'de': {
                'completion': 'Abschlussrate',
                'compliance': 'Konformitaetsrate',
                'total': 'Gesamtanforderungen',
                'non_compliant': 'Nicht konform',
                'pending': 'Ausstehend',
            },
            'en': {
                'completion': 'Completion Rate',
                'compliance': 'Compliance Rate',
                'total': 'Total Requirements',
                'non_compliant': 'Non-Compliant',
                'pending': 'Pending',
            },
        }
        # Unknown languages fall back to German labels.
        l = labels.get(language, labels['de'])
        metrics_data = [
            [l['completion'], f"{stats['completion_percentage']}%"],
            [l['compliance'], f"{stats['compliance_rate']}%"],
            [l['total'], str(stats['total'])],
            [l['non_compliant'], str(stats['non_compliant'])],
            [l['pending'], str(stats['pending'])],
        ]
        metrics_table = Table(metrics_data, colWidths=[60*mm, 40*mm])
        metrics_table.setStyle(TableStyle([
            ('FONTNAME', (0, 0), (0, -1), 'Helvetica-Bold'),
            ('FONTNAME', (1, 0), (1, -1), 'Helvetica'),
            ('FONTSIZE', (0, 0), (-1, -1), 12),
            ('TEXTCOLOR', (0, 0), (0, -1), COLORS['secondary']),
            ('TEXTCOLOR', (1, 0), (1, -1), COLORS['black']),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 6),
            ('TOPPADDING', (0, 0), (-1, -1), 6),
            ('ALIGN', (0, 0), (0, -1), 'LEFT'),
            ('ALIGN', (1, 0), (1, -1), 'RIGHT'),
            ('LINEABOVE', (0, 0), (-1, 0), 1, COLORS['light']),
            ('LINEBELOW', (0, -1), (-1, -1), 1, COLORS['light']),
        ]))
        story.append(metrics_table)
        story.append(Spacer(1, 10*mm))
        # Key findings, one paragraph each.
        findings_title = 'Wichtige Erkenntnisse' if language == 'de' else 'Key Findings'
        story.append(Paragraph(f"<b>{findings_title}:</b>", self.styles['Heading3']))
        findings = self._generate_findings(stats, language)
        for finding in findings:
            story.append(Paragraph(f"{finding}", self.styles['Normal']))
        return story
def _generate_findings(self, stats: Dict[str, Any], language: str) -> List[str]:
"""Generate key findings based on statistics."""
findings = []
if language == 'de':
if stats['non_compliant'] > 0:
findings.append(
f"{stats['non_compliant']} Anforderungen sind nicht konform und "
f"erfordern Massnahmen."
)
if stats['pending'] > 0:
findings.append(
f"{stats['pending']} Anforderungen wurden noch nicht geprueft."
)
if stats['compliance_rate'] >= 90:
findings.append(
"Hohe Konformitaetsrate erreicht. Weiter so!"
)
elif stats['compliance_rate'] < 70:
findings.append(
"Konformitaetsrate unter 70%. Priorisierte Massnahmen erforderlich."
)
if stats['compliant_notes'] > 0:
findings.append(
f"{stats['compliant_notes']} Anforderungen sind konform mit Anmerkungen. "
f"Verbesserungspotenzial identifiziert."
)
if not findings:
findings.append("Audit vollstaendig abgeschlossen ohne kritische Befunde.")
else:
if stats['non_compliant'] > 0:
findings.append(
f"{stats['non_compliant']} requirements are non-compliant and "
f"require action."
)
if stats['pending'] > 0:
findings.append(
f"{stats['pending']} requirements have not been reviewed yet."
)
if stats['compliance_rate'] >= 90:
findings.append(
"High compliance rate achieved. Keep up the good work!"
)
elif stats['compliance_rate'] < 70:
findings.append(
"Compliance rate below 70%. Prioritized actions required."
)
if stats['compliant_notes'] > 0:
findings.append(
f"{stats['compliant_notes']} requirements are compliant with notes. "
f"Improvement potential identified."
)
if not findings:
findings.append("Audit completed without critical findings.")
return findings
def _build_statistics_section(
    self,
    stats: Dict[str, Any],
    language: str,
) -> List:
    """Build the statistics overview section with pie chart.

    Renders a section heading, a pie chart of sign-off results (zero-valued
    slices are omitted), and a legend table with absolute counts and
    percentages.

    Args:
        stats: Aggregated counters; reads 'compliant', 'compliant_notes',
            'non_compliant', 'not_applicable', 'pending' and 'total'.
        language: 'de' for German labels; any other value yields English.

    Returns:
        List of layout elements (flowables) for the PDF story.
    """
    story = []
    title = 'STATISTIK-UEBERSICHT' if language == 'de' else 'STATISTICS OVERVIEW'
    story.append(Paragraph(title, self.styles['Heading1']))
    # Create pie chart
    drawing = Drawing(200, 200)
    pie = Pie()
    pie.x = 50
    pie.y = 25
    pie.width = 100
    pie.height = 100
    # Data for pie chart; order must match the label and color lists below.
    data = [
        stats['compliant'],
        stats['compliant_notes'],
        stats['non_compliant'],
        stats['not_applicable'],
        stats['pending'],
    ]
    # Only include non-zero values
    labels_de = ['Konform', 'Konform (Anm.)', 'Nicht konform', 'N/A', 'Ausstehend']
    labels_en = ['Compliant', 'Compliant (Notes)', 'Non-Compliant', 'N/A', 'Pending']
    labels = labels_de if language == 'de' else labels_en
    pie_colors = [
        COLORS['success'],
        colors.HexColor('#68d391'),
        COLORS['danger'],
        COLORS['muted'],
        COLORS['warning'],
    ]
    # Filter out zero values so the chart shows no empty slices/labels.
    filtered_data = []
    filtered_labels = []
    filtered_colors = []
    for i, val in enumerate(data):
        if val > 0:
            filtered_data.append(val)
            filtered_labels.append(labels[i])
            filtered_colors.append(pie_colors[i])
    if filtered_data:
        pie.data = filtered_data
        pie.labels = filtered_labels
        pie.slices.strokeWidth = 0.5
        for i, col in enumerate(filtered_colors):
            pie.slices[i].fillColor = col
        drawing.add(pie)
        story.append(drawing)
    else:
        # All counters are zero: show a placeholder instead of an empty chart.
        no_data = 'Keine Daten verfuegbar' if language == 'de' else 'No data available'
        story.append(Paragraph(no_data, self.styles['Normal']))
    story.append(Spacer(1, 10*mm))
    # Legend table (label, count, percentage) — same zero filtering as the pie.
    legend_data = []
    for i, label in enumerate(labels):
        if data[i] > 0:
            count = data[i]
            # Guard against division by zero when no requirement exists.
            pct = round(count / stats['total'] * 100, 1) if stats['total'] > 0 else 0
            legend_data.append([label, str(count), f"{pct}%"])
    if legend_data:
        header = ['Status', 'Anzahl', '%'] if language == 'de' else ['Status', 'Count', '%']
        legend_table = Table([header] + legend_data, colWidths=[50*mm, 25*mm, 25*mm])
        legend_table.setStyle(TableStyle([
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
            ('FONTSIZE', (0, 0), (-1, -1), 10),
            ('BACKGROUND', (0, 0), (-1, 0), COLORS['light']),
            ('ALIGN', (1, 0), (-1, -1), 'RIGHT'),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 5),
            ('TOPPADDING', (0, 0), (-1, -1), 5),
            ('GRID', (0, 0), (-1, -1), 0.5, COLORS['muted']),
        ]))
        story.append(legend_table)
    return story
def _build_checklist_section(
    self,
    session: AuditSessionDB,
    requirements: List[RequirementDB],
    signoff_map: Dict[str, AuditSignOffDB],
    language: str,
) -> List:
    """Build the detailed checklist section.

    Starts on a fresh page, groups requirements by regulation code and
    renders one result table per regulation. The result column is
    color-coded according to the sign-off outcome.

    Args:
        session: Audit session being reported. NOTE(review): currently
            unused in this method — kept for signature symmetry.
        requirements: All requirements to list.
        signoff_map: Mapping requirement id -> sign-off record; requirements
            without an entry are shown as 'pending'.
        language: 'de' or 'en'; any other value falls back to German labels.

    Returns:
        List of layout elements (flowables) for the PDF story.
    """
    story = []
    story.append(PageBreak())
    title = 'PRUEFUNGSCHECKLISTE' if language == 'de' else 'AUDIT CHECKLIST'
    story.append(Paragraph(title, self.styles['Heading1']))
    # Group by regulation; requirements without a regulation go to 'OTHER'.
    by_regulation = {}
    for req in requirements:
        reg_code = req.regulation.code if req.regulation else 'OTHER'
        if reg_code not in by_regulation:
            by_regulation[reg_code] = []
        by_regulation[reg_code].append(req)
    # Localized labels for each sign-off result value.
    result_labels = {
        'de': {
            'compliant': 'Konform',
            'compliant_notes': 'Konform (Anm.)',
            'non_compliant': 'Nicht konform',
            'not_applicable': 'N/A',
            'pending': 'Ausstehend',
        },
        'en': {
            'compliant': 'Compliant',
            'compliant_notes': 'Compliant (Notes)',
            'non_compliant': 'Non-Compliant',
            'not_applicable': 'N/A',
            'pending': 'Pending',
        },
    }
    labels = result_labels.get(language, result_labels['de'])
    for reg_code, reqs in sorted(by_regulation.items()):
        story.append(Paragraph(reg_code, self.styles['Heading2']))
        # Build table data
        header = ['Art.', 'Titel', 'Ergebnis', 'Signiert'] if language == 'de' else \
                 ['Art.', 'Title', 'Result', 'Signed']
        table_data = [header]
        for req in reqs:
            signoff = signoff_map.get(req.id)
            result = signoff.result.value if signoff else 'pending'
            result_label = labels.get(result, result)
            # NOTE(review): the German value is computed first and then
            # recomputed for English — redundant but harmless.
            signed = 'Ja' if (signoff and signoff.signature_hash) else '-'
            if language == 'en':
                signed = 'Yes' if (signoff and signoff.signature_hash) else '-'
            # Truncate title if too long
            title_text = req.title[:50] + '...' if len(req.title) > 50 else req.title
            table_data.append([
                req.article or '-',
                title_text,
                result_label,
                signed,
            ])
        table = Table(table_data, colWidths=[20*mm, 80*mm, 35*mm, 20*mm])
        # Style rows based on result
        style_commands = [
            ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
            ('FONTNAME', (0, 1), (-1, -1), 'Helvetica'),
            ('FONTSIZE', (0, 0), (-1, -1), 9),
            ('BACKGROUND', (0, 0), (-1, 0), COLORS['light']),
            ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
            ('ALIGN', (2, 0), (3, -1), 'CENTER'),
            ('BOTTOMPADDING', (0, 0), (-1, -1), 4),
            ('TOPPADDING', (0, 0), (-1, -1), 4),
            ('GRID', (0, 0), (-1, -1), 0.5, COLORS['muted']),
            ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ]
        # Color code results; row 0 is the header, so data rows start at 1.
        for i, req in enumerate(reqs, start=1):
            signoff = signoff_map.get(req.id)
            if signoff:
                result = signoff.result.value
                if result == 'compliant':
                    style_commands.append(('TEXTCOLOR', (2, i), (2, i), COLORS['success']))
                elif result == 'compliant_notes':
                    style_commands.append(('TEXTCOLOR', (2, i), (2, i), colors.HexColor('#2f855a')))
                elif result == 'non_compliant':
                    style_commands.append(('TEXTCOLOR', (2, i), (2, i), COLORS['danger']))
                else:
                    style_commands.append(('TEXTCOLOR', (2, i), (2, i), COLORS['warning']))
        table.setStyle(TableStyle(style_commands))
        story.append(table)
        story.append(Spacer(1, 5*mm))
    return story
def _build_non_compliant_appendix(
    self,
    non_compliant: List[AuditSignOffDB],
    requirements: List[RequirementDB],
    language: str,
) -> List:
    """Build appendix with non-compliant items detail.

    For every non-compliant sign-off, renders the regulation code, article,
    title, a truncated description (max 500 chars) and any auditor notes.
    Sign-offs whose requirement cannot be resolved are skipped silently.

    Args:
        non_compliant: Sign-off records with a non-compliant result.
        requirements: Requirement pool used to resolve requirement ids.
        language: 'de' for German headings, otherwise English.

    Returns:
        List of layout elements (flowables) for the PDF story.
    """
    story = []
    title = 'ANHANG: NICHT KONFORME ANFORDERUNGEN' if language == 'de' else \
            'APPENDIX: NON-COMPLIANT REQUIREMENTS'
    story.append(Paragraph(title, self.styles['Heading1']))
    # Index requirements by id for O(1) resolution per sign-off.
    req_map = {r.id: r for r in requirements}
    for i, signoff in enumerate(non_compliant, start=1):
        req = req_map.get(signoff.requirement_id)
        if not req:
            continue
        # Requirement header
        story.append(Paragraph(
            f"<b>{i}. {req.regulation.code if req.regulation else ''} {req.article}</b>",
            self.styles['Heading3']
        ))
        story.append(Paragraph(f"<b>{req.title}</b>", self.styles['Normal']))
        if req.description:
            # Keep the appendix compact: cap descriptions at 500 characters.
            desc = req.description[:500] + '...' if len(req.description) > 500 else req.description
            story.append(Paragraph(desc, self.styles['Small']))
        # Notes from auditor
        if signoff.notes:
            notes_label = 'Auditor-Anmerkungen' if language == 'de' else 'Auditor Notes'
            story.append(Paragraph(f"<b>{notes_label}:</b>", self.styles['Normal']))
            story.append(Paragraph(signoff.notes, self.styles['Normal']))
        story.append(Spacer(1, 5*mm))
    return story
def _build_signature_section(
    self,
    signed_items: List[AuditSignOffDB],
    language: str,
) -> List:
    """Build section with digital signature verification.

    Renders an explanatory paragraph followed by a table of signed audit
    items (truncated requirement id, signer, date, truncated SHA-256 hash).
    Only the first 50 items are listed to keep the report size bounded.

    Args:
        signed_items: Sign-off records that carry a signature.
        language: 'de' for German text, otherwise English.

    Returns:
        List of layout elements (flowables) for the PDF story.
    """
    story = []
    title = 'DIGITALE SIGNATUREN' if language == 'de' else 'DIGITAL SIGNATURES'
    story.append(Paragraph(title, self.styles['Heading1']))
    explanation = (
        'Die folgenden Pruefpunkte wurden digital signiert. '
        'Die SHA-256 Hashes dienen als unveraenderlicher Nachweis des Pruefergebnisses.'
    ) if language == 'de' else (
        'The following audit items have been digitally signed. '
        'The SHA-256 hashes serve as immutable proof of the audit result.'
    )
    story.append(Paragraph(explanation, self.styles['Normal']))
    story.append(Spacer(1, 5*mm))
    header = ['Anforderung', 'Signiert von', 'Datum', 'SHA-256 (gekuerzt)'] if language == 'de' else \
             ['Requirement', 'Signed by', 'Date', 'SHA-256 (truncated)']
    table_data = [header]
    # Ids and hashes are truncated for display; the full hash stays in the DB.
    for item in signed_items[:50]:  # Limit to 50 entries
        table_data.append([
            item.requirement_id[:8] + '...',
            item.signed_by or '-',
            item.signed_at.strftime('%d.%m.%Y') if item.signed_at else '-',
            item.signature_hash[:16] + '...' if item.signature_hash else '-',
        ])
    table = Table(table_data, colWidths=[35*mm, 40*mm, 30*mm, 50*mm])
    table.setStyle(TableStyle([
        ('FONTNAME', (0, 0), (-1, 0), 'Helvetica-Bold'),
        ('FONTNAME', (0, 1), (-1, -1), 'Courier'),
        ('FONTSIZE', (0, 0), (-1, -1), 8),
        ('BACKGROUND', (0, 0), (-1, 0), COLORS['light']),
        ('ALIGN', (0, 0), (-1, -1), 'LEFT'),
        ('BOTTOMPADDING', (0, 0), (-1, -1), 3),
        ('TOPPADDING', (0, 0), (-1, -1), 3),
        ('GRID', (0, 0), (-1, -1), 0.5, COLORS['muted']),
    ]))
    story.append(table)
    return story

View File

@@ -0,0 +1,383 @@
"""
Automatic Risk Update Service for Compliance Framework.
This service processes CI/CD security scan results and automatically:
1. Updates Control status based on scan findings
2. Adjusts Risk levels when critical CVEs are found
3. Creates Evidence records from scan reports
4. Generates alerts for significant findings
Sprint 6: CI/CD Evidence Collection (2026-01-18)
"""
import logging
from datetime import datetime
from typing import Dict, List, Optional, Any
from dataclasses import dataclass
from enum import Enum
from sqlalchemy.orm import Session
from ..db.models import (
ControlDB, ControlStatusEnum,
EvidenceDB, EvidenceStatusEnum,
RiskDB, RiskLevelEnum,
)
from ..db.repository import ControlRepository, EvidenceRepository, RiskRepository
logger = logging.getLogger(__name__)
class ScanType(str, Enum):
    """Types of CI/CD security scans.

    Inherits from ``str`` so members serialize directly in JSON payloads.
    """

    SAST = "sast"  # Static Application Security Testing
    DEPENDENCY = "dependency"  # Dependency/CVE scanning
    SECRET = "secret"  # Secret detection
    CONTAINER = "container"  # Container image scanning
    SBOM = "sbom"  # Software Bill of Materials
class FindingSeverity(str, Enum):
    """Severity levels for security findings.

    Ordered from most to least severe; inherits from ``str`` so members
    serialize directly in JSON payloads.
    """

    CRITICAL = "critical"
    HIGH = "high"
    MEDIUM = "medium"
    LOW = "low"
    INFO = "info"
@dataclass
class ScanResult:
    """Represents a CI/CD scan result.

    One instance corresponds to a single tool run against one commit,
    already mapped to the compliance Control it provides evidence for.
    """

    scan_type: ScanType  # category of scan (SAST, dependency, secret, ...)
    tool: str  # name of the scanning tool that produced the result
    timestamp: datetime  # when the scan ran
    commit_sha: str  # commit that was scanned
    branch: str  # branch that was scanned
    control_id: str  # Mapped Control ID (e.g., SDLC-001)
    findings: Dict[str, int]  # severity name -> count, e.g. {"critical": 0, "high": 2, ...}
    raw_report: Optional[Dict] = None  # full tool report payload, if retained
    ci_job_id: Optional[str] = None  # CI job identifier for traceability
@dataclass
class RiskUpdateResult:
    """Result of an automatic risk update.

    Returned by ``AutoRiskUpdater.process_scan_result`` to summarize every
    side effect of processing one scan.
    """

    control_id: str  # the Control the scan was mapped to
    control_updated: bool  # True if the Control's status changed
    old_status: Optional[str]  # status before processing (None if Control missing)
    new_status: Optional[str]  # status after processing (None if Control missing)
    evidence_created: bool  # True when an Evidence record was written
    evidence_id: Optional[str]  # id of the created Evidence record, if any
    risks_affected: List[str]  # risk_ids of Risks that were updated
    alerts_generated: List[str]  # human-readable alert messages
    message: str  # summary of the processing outcome
# Mapping from Control IDs to scan types.
# Control IDs not listed here fall back to SAST (see
# AutoRiskUpdater.process_evidence_collect_request).
CONTROL_SCAN_MAPPING: Dict[str, ScanType] = {
    "SDLC-001": ScanType.SAST,  # SAST Scanning
    "SDLC-002": ScanType.DEPENDENCY,  # Dependency Scanning
    "SDLC-003": ScanType.SECRET,  # Secret Detection
    "SDLC-006": ScanType.CONTAINER,  # Container Scanning
    "CRA-001": ScanType.SBOM,  # SBOM Generation
}
class AutoRiskUpdater:
    """
    Automatically updates Controls and Risks based on CI/CD scan results.

    Flow:
    1. Receive scan result from CI/CD pipeline
    2. Determine Control status based on findings
    3. Create Evidence record
    4. Update linked Risks if necessary
    5. Generate alerts for critical findings

    All changes made while processing one scan are committed in a single
    transaction at the end of :meth:`process_scan_result`.
    """

    def __init__(self, db: Session):
        """Bind the updater and its repositories to one SQLAlchemy session.

        Args:
            db: Session used for all reads, writes and the final commit.
        """
        self.db = db
        self.control_repo = ControlRepository(db)
        self.evidence_repo = EvidenceRepository(db)
        self.risk_repo = RiskRepository(db)

    def process_scan_result(self, scan_result: ScanResult) -> RiskUpdateResult:
        """
        Process a CI/CD scan result and update Compliance status.

        Args:
            scan_result: The scan result from CI/CD pipeline.

        Returns:
            RiskUpdateResult with details of all updates made. When the
            referenced Control does not exist, nothing is written and the
            result carries an explanatory message.
        """
        logger.info(f"Processing {scan_result.scan_type.value} scan for control {scan_result.control_id}")
        # Find the Control; without it there is nothing to attach evidence to.
        control = self.control_repo.get_by_control_id(scan_result.control_id)
        if not control:
            logger.warning(f"Control {scan_result.control_id} not found")
            return RiskUpdateResult(
                control_id=scan_result.control_id,
                control_updated=False,
                old_status=None,
                new_status=None,
                evidence_created=False,
                evidence_id=None,
                risks_affected=[],
                alerts_generated=[],
                message=f"Control {scan_result.control_id} not found"
            )
        old_status = control.status.value if control.status else "unknown"
        # Determine new Control status based on findings
        new_status = self._determine_control_status(scan_result.findings)
        # Update Control status only when it actually changed.
        control_updated = False
        if new_status != old_status:
            control.status = ControlStatusEnum(new_status)
            control.status_notes = self._generate_status_notes(scan_result)
            control.updated_at = datetime.utcnow()
            control_updated = True
            logger.info(f"Control {scan_result.control_id} status changed: {old_status} -> {new_status}")
        # Create Evidence record — always, even when the status is unchanged.
        evidence = self._create_evidence(control, scan_result)
        # Update linked Risks
        risks_affected = self._update_linked_risks(control, new_status, scan_result.findings)
        # Generate alerts for critical findings
        alerts = self._generate_alerts(scan_result, new_status)
        # Commit all changes atomically.
        self.db.commit()
        return RiskUpdateResult(
            control_id=scan_result.control_id,
            control_updated=control_updated,
            old_status=old_status,
            new_status=new_status,
            evidence_created=True,
            evidence_id=evidence.id,
            risks_affected=risks_affected,
            alerts_generated=alerts,
            message=f"Processed {scan_result.scan_type.value} scan successfully"
        )

    def _determine_control_status(self, findings: Dict[str, int]) -> str:
        """
        Determine Control status based on security findings.

        Rules (evaluated in order):
        - Any CRITICAL findings -> fail
        - >5 HIGH findings -> fail
        - 1-5 HIGH findings -> partial
        - >10 MEDIUM findings -> partial
        - Otherwise (only few MEDIUM / LOW / no findings) -> pass
        """
        critical = findings.get("critical", 0)
        high = findings.get("high", 0)
        medium = findings.get("medium", 0)
        if critical > 0:
            return ControlStatusEnum.FAIL.value
        elif high > 5:
            return ControlStatusEnum.FAIL.value
        elif high > 0:
            return ControlStatusEnum.PARTIAL.value
        elif medium > 10:
            return ControlStatusEnum.PARTIAL.value
        else:
            return ControlStatusEnum.PASS.value

    def _generate_status_notes(self, scan_result: ScanResult) -> str:
        """Generate human-readable status notes from scan result.

        Only CRITICAL/HIGH/MEDIUM counts are mentioned; LOW and INFO
        findings are deliberately omitted from the summary.
        """
        findings = scan_result.findings
        parts = []
        if findings.get("critical", 0) > 0:
            parts.append(f"{findings['critical']} CRITICAL")
        if findings.get("high", 0) > 0:
            parts.append(f"{findings['high']} HIGH")
        if findings.get("medium", 0) > 0:
            parts.append(f"{findings['medium']} MEDIUM")
        if parts:
            findings_str = ", ".join(parts)
            return f"Auto-updated from {scan_result.tool} scan ({scan_result.timestamp.strftime('%Y-%m-%d %H:%M')}): {findings_str} findings"
        else:
            return f"Auto-updated from {scan_result.tool} scan ({scan_result.timestamp.strftime('%Y-%m-%d %H:%M')}): No significant findings"

    def _create_evidence(self, control: ControlDB, scan_result: ScanResult) -> EvidenceDB:
        """Create an Evidence record from the scan result.

        The record is added to the session but not committed here; the
        caller commits the whole batch.
        """
        from uuid import uuid4
        evidence = EvidenceDB(
            id=str(uuid4()),
            control_id=control.id,
            evidence_type=f"{scan_result.scan_type.value}_report",
            title=f"{scan_result.tool} Scan - {scan_result.timestamp.strftime('%Y-%m-%d')}",
            description=self._generate_status_notes(scan_result),
            source="ci_pipeline",
            ci_job_id=scan_result.ci_job_id,
            status=EvidenceStatusEnum.VALID,
            valid_from=datetime.utcnow(),
            collected_at=scan_result.timestamp,
        )
        self.db.add(evidence)
        logger.info(f"Created evidence {evidence.id} for control {control.control_id}")
        return evidence

    def _update_linked_risks(
        self,
        control: ControlDB,
        new_status: str,
        findings: Dict[str, int]
    ) -> List[str]:
        """
        Update Risks that are mitigated by this Control.

        When a Control fails:
        - Increase residual risk of linked Risks (critical findings bump the
          residual likelihood, capped at 5)
        - Update risk status to "open" if it was "mitigated"
        When a Control passes:
        - Mark the risk "mitigated" again, but only when ALL of its
          mitigating controls pass

        Returns:
            risk_ids of all Risks that were modified.
        """
        affected_risks = []
        # Find all Risks that list this Control as a mitigating control.
        all_risks = self.risk_repo.get_all()
        for risk in all_risks:
            if not risk.mitigating_controls:
                continue
            mitigating_ids = risk.mitigating_controls
            if control.control_id not in mitigating_ids:
                continue
            # This Risk is linked to the affected Control.
            risk_updated = False
            if new_status == ControlStatusEnum.FAIL.value:
                # Control failed - increase risk
                if risk.status == "mitigated":
                    risk.status = "open"
                    risk_updated = True
                # Increase residual likelihood if critical findings
                if findings.get("critical", 0) > 0:
                    old_likelihood = risk.residual_likelihood or risk.likelihood
                    risk.residual_likelihood = min(5, old_likelihood + 1)
                    risk.residual_risk = RiskDB.calculate_risk_level(
                        risk.residual_likelihood,
                        risk.residual_impact or risk.impact
                    )
                    risk_updated = True
            elif new_status == ControlStatusEnum.PASS.value:
                # Control passed - potentially reduce risk
                if risk.status == "open":
                    # Check if all mitigating controls are passing
                    all_passing = True
                    for ctrl_id in mitigating_ids:
                        other_ctrl = self.control_repo.get_by_control_id(ctrl_id)
                        if other_ctrl and other_ctrl.status != ControlStatusEnum.PASS:
                            all_passing = False
                            break
                    if all_passing:
                        risk.status = "mitigated"
                        risk_updated = True
            if risk_updated:
                risk.last_assessed_at = datetime.utcnow()
                risk.updated_at = datetime.utcnow()
                affected_risks.append(risk.risk_id)
                logger.info(f"Updated risk {risk.risk_id} due to control {control.control_id} status change")
        return affected_risks

    def _generate_alerts(self, scan_result: ScanResult, new_status: str) -> List[str]:
        """
        Generate alerts for significant findings.

        Alert conditions:
        - Any CRITICAL findings
        - Control status changed to FAIL
        - >10 HIGH findings in one scan

        Alerts are returned to the caller and mirrored to the log at
        WARNING level.
        """
        alerts = []
        findings = scan_result.findings
        if findings.get("critical", 0) > 0:
            alert_msg = f"CRITICAL: {findings['critical']} critical vulnerabilities found in {scan_result.tool} scan"
            alerts.append(alert_msg)
            logger.warning(alert_msg)
        if new_status == ControlStatusEnum.FAIL.value:
            alert_msg = f"Control {scan_result.control_id} status changed to FAIL"
            alerts.append(alert_msg)
            logger.warning(alert_msg)
        if findings.get("high", 0) > 10:
            alert_msg = f"HIGH: {findings['high']} high-severity findings in {scan_result.tool} scan"
            alerts.append(alert_msg)
            logger.warning(alert_msg)
        return alerts

    def process_evidence_collect_request(
        self,
        tool: str,
        control_id: str,
        evidence_type: str,
        timestamp: str,
        commit_sha: str,
        ci_job_id: Optional[str] = None,
        findings: Optional[Dict[str, int]] = None,
        **kwargs
    ) -> RiskUpdateResult:
        """
        Process an evidence collection request from CI/CD.

        This is the main entry point for the /evidence/collect API endpoint.

        Args:
            tool: Name of the scanning tool.
            control_id: Control the evidence belongs to (e.g. "SDLC-001").
            evidence_type: Free-form evidence type label from the caller
                (recorded on the Evidence record; the scan type itself is
                derived from ``control_id``).
            timestamp: ISO-8601 timestamp of the scan; falls back to "now"
                when unparseable.
            commit_sha: Commit that was scanned.
            ci_job_id: Optional CI job identifier for traceability.
            findings: Optional severity -> count mapping; defaults to all
                zeros.
            **kwargs: Extra fields; only "branch" is read.
        """
        # Parse timestamp, tolerating a trailing 'Z' and bad input.
        try:
            ts = datetime.fromisoformat(timestamp.replace('Z', '+00:00'))
        except (ValueError, AttributeError):
            ts = datetime.utcnow()
        # Determine scan type from the control-ID mapping; unknown controls
        # default to SAST.
        scan_type = CONTROL_SCAN_MAPPING.get(control_id, ScanType.SAST)
        # Create ScanResult
        scan_result = ScanResult(
            scan_type=scan_type,
            tool=tool,
            timestamp=ts,
            commit_sha=commit_sha,
            branch=kwargs.get("branch", "unknown"),
            control_id=control_id,
            findings=findings or {"critical": 0, "high": 0, "medium": 0, "low": 0},
            ci_job_id=ci_job_id,
        )
        return self.process_scan_result(scan_result)
def create_auto_risk_updater(db: Session) -> AutoRiskUpdater:
    """Build an AutoRiskUpdater bound to the given database session."""
    updater = AutoRiskUpdater(db)
    return updater

View File

@@ -0,0 +1,616 @@
"""
Audit Export Generator.
Generates ZIP packages for external auditors containing:
- Regulations & Requirements
- Control Catalogue with status
- Evidence artifacts
- Risk register
- Summary reports
"""
import hashlib
import json
import logging
import os
import shutil
import tempfile
import zipfile
from datetime import datetime, date
from pathlib import Path
from typing import Dict, List, Optional, Any
from sqlalchemy.orm import Session
from ..db.models import (
RegulationDB,
RequirementDB,
ControlDB,
ControlMappingDB,
EvidenceDB,
RiskDB,
AuditExportDB,
ExportStatusEnum,
ControlStatusEnum,
)
logger = logging.getLogger(__name__)
class AuditExportGenerator:
"""Generates audit export packages."""
def __init__(self, db: Session, export_dir: str = "/tmp/compliance_exports"):
    """Initialize the generator.

    Args:
        db: SQLAlchemy session used for all queries.
        export_dir: Directory where generated ZIP files are stored;
            created (including parents) if it does not exist.
    """
    self.db = db
    self.export_dir = Path(export_dir)
    self.export_dir.mkdir(parents=True, exist_ok=True)
def create_export(
    self,
    requested_by: str,
    export_type: str = "full",
    included_regulations: Optional[List[str]] = None,
    included_domains: Optional[List[str]] = None,
    date_range_start: Optional[date] = None,
    date_range_end: Optional[date] = None,
) -> AuditExportDB:
    """
    Create a new audit export.

    A tracking record is created first (status GENERATING); on success it
    is updated with the file path, hash, size and summary statistics, on
    failure it is marked FAILED and the original exception is re-raised.

    Args:
        requested_by: User requesting the export
        export_type: "full", "controls_only", "evidence_only"
        included_regulations: Filter by regulation codes
        included_domains: Filter by control domains
        date_range_start: Evidence collected after this date
        date_range_end: Evidence collected before this date

    Returns:
        AuditExportDB record

    Raises:
        Exception: whatever ZIP generation raised, after the export record
            has been marked FAILED and committed.
    """
    # Create export record up front so failures are visible in the DB.
    export_record = AuditExportDB(
        export_type=export_type,
        export_name=f"Breakpilot Compliance Export {datetime.now().strftime('%Y-%m-%d %H:%M')}",
        included_regulations=included_regulations,
        included_domains=included_domains,
        date_range_start=date_range_start,
        date_range_end=date_range_end,
        requested_by=requested_by,
        status=ExportStatusEnum.GENERATING,
    )
    self.db.add(export_record)
    self.db.flush()
    try:
        # Generate the export
        file_path, file_hash, file_size = self._generate_zip(
            export_record.id,
            export_type,
            included_regulations,
            included_domains,
            date_range_start,
            date_range_end,
        )
        # Update record with results
        export_record.file_path = str(file_path)
        export_record.file_hash = file_hash
        export_record.file_size_bytes = file_size
        export_record.status = ExportStatusEnum.COMPLETED
        export_record.completed_at = datetime.utcnow()
        # Calculate statistics
        stats = self._calculate_statistics(
            included_regulations, included_domains
        )
        export_record.total_controls = stats["total_controls"]
        export_record.total_evidence = stats["total_evidence"]
        export_record.compliance_score = stats["compliance_score"]
        self.db.commit()
        logger.info(f"Export completed: {file_path}")
        return export_record
    except Exception as e:
        # Persist the failure state before propagating the error.
        export_record.status = ExportStatusEnum.FAILED
        export_record.error_message = str(e)
        self.db.commit()
        logger.error(f"Export failed: {e}")
        raise
def _generate_zip(
    self,
    export_id: str,
    export_type: str,
    included_regulations: Optional[List[str]],
    included_domains: Optional[List[str]],
    date_range_start: Optional[date],
    date_range_end: Optional[date],
) -> tuple:
    """Generate the actual ZIP file.

    Assembles the package layout in a temporary directory, then zips it
    into ``self.export_dir``. Which sections are included depends on
    ``export_type`` ("full", "controls_only", "evidence_only").

    Returns:
        Tuple ``(zip_path, file_hash, file_size_bytes)``. The hash is
        computed by ``self._calculate_file_hash`` (defined elsewhere in
        this file; presumably SHA-256 since ``hashlib`` is imported —
        TODO confirm).
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
    zip_filename = f"audit_export_{timestamp}.zip"
    zip_path = self.export_dir / zip_filename
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_path = Path(temp_dir)
        # Create directory structure
        (temp_path / "regulations").mkdir()
        (temp_path / "controls").mkdir()
        (temp_path / "evidence").mkdir()
        (temp_path / "risks").mkdir()
        # Generate content based on export type
        if export_type in ["full", "controls_only"]:
            self._export_regulations(temp_path / "regulations", included_regulations)
            self._export_controls(temp_path / "controls", included_domains)
        if export_type in ["full", "evidence_only"]:
            self._export_evidence(
                temp_path / "evidence",
                included_domains,
                date_range_start,
                date_range_end,
            )
        if export_type == "full":
            self._export_risks(temp_path / "risks")
        # Generate summary
        self._export_summary(
            temp_path,
            export_type,
            included_regulations,
            included_domains,
        )
        # Generate README
        self._export_readme(temp_path)
        # Generate index.html for navigation
        self._export_index_html(temp_path)
        # Create ZIP with paths stored relative to the package root.
        with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
            for file_path in temp_path.rglob("*"):
                if file_path.is_file():
                    arcname = file_path.relative_to(temp_path)
                    zf.write(file_path, arcname)
    # Calculate hash
    file_hash = self._calculate_file_hash(zip_path)
    file_size = zip_path.stat().st_size
    return zip_path, file_hash, file_size
def _export_regulations(
    self, output_dir: Path, included_regulations: Optional[List[str]]
) -> None:
    """Export regulations to JSON files.

    Writes one ``<code>.json`` file per active regulation, each including
    its requirements. Only regulations whose code appears in
    ``included_regulations`` are exported when that filter is given.
    """
    query = self.db.query(RegulationDB).filter(RegulationDB.is_active == True)
    if included_regulations:
        query = query.filter(RegulationDB.code.in_(included_regulations))
    regulations = query.all()
    for reg in regulations:
        # Get requirements for this regulation
        requirements = self.db.query(RequirementDB).filter(
            RequirementDB.regulation_id == reg.id
        ).all()
        data = {
            "code": reg.code,
            "name": reg.name,
            "full_name": reg.full_name,
            "type": reg.regulation_type.value if reg.regulation_type else None,
            "source_url": reg.source_url,
            "effective_date": reg.effective_date.isoformat() if reg.effective_date else None,
            "description": reg.description,
            "requirements": [
                {
                    "article": r.article,
                    "paragraph": r.paragraph,
                    "title": r.title,
                    "description": r.description,
                    "is_applicable": r.is_applicable,
                    "breakpilot_interpretation": r.breakpilot_interpretation,
                }
                for r in requirements
            ],
        }
        file_path = output_dir / f"{reg.code.lower()}.json"
        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)
def _export_controls(
    self, output_dir: Path, included_domains: Optional[List[str]]
) -> None:
    """Export controls to JSON and generate summary.

    Writes ``control_catalogue.json`` (all controls with their requirement
    mappings) and ``domain_summary.json`` (pass/partial/fail counts per
    domain). When ``included_domains`` is given, only controls in those
    domains are exported.
    """
    query = self.db.query(ControlDB)
    if included_domains:
        from ..db.models import ControlDomainEnum
        domain_enums = [ControlDomainEnum(d) for d in included_domains]
        query = query.filter(ControlDB.domain.in_(domain_enums))
    controls = query.order_by(ControlDB.control_id).all()
    controls_data = []
    for ctrl in controls:
        # Get mappings
        mappings = self.db.query(ControlMappingDB).filter(
            ControlMappingDB.control_id == ctrl.id
        ).all()
        # Resolve requirement references for each mapping.
        # NOTE(review): Query.get() is legacy API in SQLAlchemy 1.4+ —
        # consider Session.get() when upgrading.
        requirement_refs = []
        for m in mappings:
            req = self.db.query(RequirementDB).get(m.requirement_id)
            if req:
                reg = self.db.query(RegulationDB).get(req.regulation_id)
                requirement_refs.append({
                    "regulation": reg.code if reg else None,
                    "article": req.article,
                    "paragraph": req.paragraph,
                    "coverage": m.coverage_level,
                })
        ctrl_data = {
            "control_id": ctrl.control_id,
            "domain": ctrl.domain.value if ctrl.domain else None,
            "type": ctrl.control_type.value if ctrl.control_type else None,
            "title": ctrl.title,
            "description": ctrl.description,
            "pass_criteria": ctrl.pass_criteria,
            "status": ctrl.status.value if ctrl.status else None,
            "is_automated": ctrl.is_automated,
            "automation_tool": ctrl.automation_tool,
            "owner": ctrl.owner,
            "last_reviewed": ctrl.last_reviewed_at.isoformat() if ctrl.last_reviewed_at else None,
            "code_reference": ctrl.code_reference,
            "mapped_requirements": requirement_refs,
        }
        controls_data.append(ctrl_data)
    # Write full catalogue
    with open(output_dir / "control_catalogue.json", "w", encoding="utf-8") as f:
        json.dump(controls_data, f, indent=2, ensure_ascii=False)
    # Write summary by domain; only pass/partial/fail statuses are counted.
    domain_summary = {}
    for ctrl in controls_data:
        domain = ctrl["domain"]
        if domain not in domain_summary:
            domain_summary[domain] = {"total": 0, "pass": 0, "partial": 0, "fail": 0}
        domain_summary[domain]["total"] += 1
        status = ctrl["status"]
        if status in domain_summary[domain]:
            domain_summary[domain][status] += 1
    with open(output_dir / "domain_summary.json", "w", encoding="utf-8") as f:
        json.dump(domain_summary, f, indent=2, ensure_ascii=False)
def _export_evidence(
    self,
    output_dir: Path,
    included_domains: Optional[List[str]],
    date_range_start: Optional[date],
    date_range_end: Optional[date],
) -> None:
    """Export evidence metadata and files.

    Writes ``evidence_index.json`` with the metadata of every matching
    evidence record, and copies each existing artifact file into a
    subdirectory named after its evidence type. Records can be filtered
    by control domain and by collection date range (inclusive).
    """
    query = self.db.query(EvidenceDB)
    # Date filters span whole days: start at 00:00:00, end at 23:59:59.999999.
    if date_range_start:
        query = query.filter(EvidenceDB.collected_at >= datetime.combine(date_range_start, datetime.min.time()))
    if date_range_end:
        query = query.filter(EvidenceDB.collected_at <= datetime.combine(date_range_end, datetime.max.time()))
    if included_domains:
        from ..db.models import ControlDomainEnum
        domain_enums = [ControlDomainEnum(d) for d in included_domains]
        query = query.join(ControlDB).filter(ControlDB.domain.in_(domain_enums))
    evidence_list = query.all()
    evidence_data = []
    for ev in evidence_list:
        ctrl = self.db.query(ControlDB).get(ev.control_id)
        ev_data = {
            "id": ev.id,
            "control_id": ctrl.control_id if ctrl else None,
            "evidence_type": ev.evidence_type,
            "title": ev.title,
            "description": ev.description,
            "artifact_path": ev.artifact_path,
            "artifact_url": ev.artifact_url,
            "artifact_hash": ev.artifact_hash,
            "status": ev.status.value if ev.status else None,
            "valid_from": ev.valid_from.isoformat() if ev.valid_from else None,
            "valid_until": ev.valid_until.isoformat() if ev.valid_until else None,
            "collected_at": ev.collected_at.isoformat() if ev.collected_at else None,
            "source": ev.source,
        }
        evidence_data.append(ev_data)
        # Copy evidence files if they exist; missing artifacts are skipped
        # silently (the metadata entry is still written).
        if ev.artifact_path and os.path.exists(ev.artifact_path):
            evidence_subdir = output_dir / ev.evidence_type
            evidence_subdir.mkdir(exist_ok=True)
            filename = os.path.basename(ev.artifact_path)
            shutil.copy2(ev.artifact_path, evidence_subdir / filename)
    with open(output_dir / "evidence_index.json", "w", encoding="utf-8") as f:
        json.dump(evidence_data, f, indent=2, ensure_ascii=False)
def _export_risks(self, output_dir: Path) -> None:
    """Export risk register.

    Writes ``risk_register.json`` with all risks ordered by risk_id,
    including inherent and residual risk assessments.
    """
    risks = self.db.query(RiskDB).order_by(RiskDB.risk_id).all()
    risks_data = []
    for risk in risks:
        risk_data = {
            "risk_id": risk.risk_id,
            "title": risk.title,
            "description": risk.description,
            "category": risk.category,
            "likelihood": risk.likelihood,
            "impact": risk.impact,
            "inherent_risk": risk.inherent_risk.value if risk.inherent_risk else None,
            "mitigating_controls": risk.mitigating_controls,
            "residual_likelihood": risk.residual_likelihood,
            "residual_impact": risk.residual_impact,
            "residual_risk": risk.residual_risk.value if risk.residual_risk else None,
            "owner": risk.owner,
            "status": risk.status,
            "treatment_plan": risk.treatment_plan,
        }
        risks_data.append(risk_data)
    with open(output_dir / "risk_register.json", "w", encoding="utf-8") as f:
        json.dump(risks_data, f, indent=2, ensure_ascii=False)
def _export_summary(
    self,
    output_dir: Path,
    export_type: str,
    included_regulations: Optional[List[str]],
    included_domains: Optional[List[str]],
) -> None:
    """Generate summary.json with overall statistics.

    Statistics come from ``self._calculate_statistics`` (defined elsewhere
    in this file); the summary also records the export timestamp, type and
    the filters that were applied.
    """
    stats = self._calculate_statistics(included_regulations, included_domains)
    summary = {
        "export_date": datetime.now().isoformat(),
        "export_type": export_type,
        "filters": {
            "regulations": included_regulations,
            "domains": included_domains,
        },
        "statistics": stats,
        "organization": "Breakpilot",
        "version": "1.0.0",
    }
    with open(output_dir / "summary.json", "w", encoding="utf-8") as f:
        json.dump(summary, f, indent=2, ensure_ascii=False)
def _export_readme(self, output_dir: Path) -> None:
    """Write the German-language README.md that explains the package layout
    and usage to external auditors."""
    generated_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    content = """# Breakpilot Compliance Export
Dieses Paket enthält die Compliance-Dokumentation von Breakpilot.
## Struktur
```
├── summary.json # Zusammenfassung und Statistiken
├── index.html # HTML-Navigation (im Browser öffnen)
├── regulations/ # Verordnungen und Anforderungen
│ ├── gdpr.json
│ ├── aiact.json
│ └── ...
├── controls/ # Control Catalogue
│ ├── control_catalogue.json
│ └── domain_summary.json
├── evidence/ # Nachweise
│ ├── evidence_index.json
│ └── [evidence_type]/
└── risks/ # Risikoregister
└── risk_register.json
```
## Verwendung
1. **HTML-Navigation**: Öffnen Sie `index.html` im Browser für eine visuelle Übersicht.
2. **JSON-Dateien**: Maschinenlesbare Daten für Import in GRC-Tools.
3. **Nachweis-Dateien**: Originale Scan-Reports und Konfigurationen.
## Kontakt
Bei Fragen wenden Sie sich an das Breakpilot Security Team.
---
Generiert am: """ + generated_at
    (output_dir / "README.md").write_text(content, encoding="utf-8")
def _export_index_html(self, output_dir: Path) -> None:
    """Write a self-contained ``index.html`` for browser-based navigation.

    The page is static HTML/CSS with a small inline script that, when the
    export is served or opened with fetch access, loads ``summary.json`` and
    ``controls/domain_summary.json`` to populate the stat tiles and the
    per-domain list. The regulations list is hard-coded to the three JSON
    files the export always ships (GDPR, AI Act, CRA). Failed fetches are
    swallowed so the page still renders when opened from the filesystem.
    """
    html = """<!DOCTYPE html>
<html lang="de">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>Breakpilot Compliance Export</title>
<style>
body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 1200px; margin: 0 auto; padding: 2rem; background: #f5f5f5; }
h1 { color: #1a1a1a; border-bottom: 3px solid #0066cc; padding-bottom: 1rem; }
h2 { color: #333; margin-top: 2rem; }
.card { background: white; border-radius: 8px; padding: 1.5rem; margin: 1rem 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
.stats { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; }
.stat { background: linear-gradient(135deg, #0066cc, #004499); color: white; padding: 1.5rem; border-radius: 8px; text-align: center; }
.stat-value { font-size: 2.5rem; font-weight: bold; }
.stat-label { opacity: 0.9; margin-top: 0.5rem; }
ul { list-style: none; padding: 0; }
li { padding: 0.75rem; border-bottom: 1px solid #eee; }
li:last-child { border-bottom: none; }
a { color: #0066cc; text-decoration: none; }
a:hover { text-decoration: underline; }
.footer { margin-top: 3rem; padding-top: 1rem; border-top: 1px solid #ddd; color: #666; font-size: 0.9rem; }
</style>
</head>
<body>
<h1>Breakpilot Compliance Export</h1>
<div class="stats">
<div class="stat">
<div class="stat-value" id="score">--%</div>
<div class="stat-label">Compliance Score</div>
</div>
<div class="stat">
<div class="stat-value" id="controls">--</div>
<div class="stat-label">Controls</div>
</div>
<div class="stat">
<div class="stat-value" id="evidence">--</div>
<div class="stat-label">Evidence Items</div>
</div>
<div class="stat">
<div class="stat-value" id="regulations">--</div>
<div class="stat-label">Regulations</div>
</div>
</div>
<div class="card">
<h2>Regulations & Requirements</h2>
<ul id="regulations-list">
<li>Loading...</li>
</ul>
</div>
<div class="card">
<h2>Controls by Domain</h2>
<ul id="domains-list">
<li>Loading...</li>
</ul>
</div>
<div class="card">
<h2>Export Contents</h2>
<ul>
<li><a href="summary.json">summary.json</a> - Export metadata and statistics</li>
<li><a href="controls/control_catalogue.json">controls/control_catalogue.json</a> - Full control catalogue</li>
<li><a href="evidence/evidence_index.json">evidence/evidence_index.json</a> - Evidence index</li>
<li><a href="risks/risk_register.json">risks/risk_register.json</a> - Risk register</li>
</ul>
</div>
<div class="footer">
<p>Generated by Breakpilot Compliance Framework</p>
</div>
<script>
// Load summary and populate stats
fetch('summary.json')
.then(r => r.json())
.then(data => {
document.getElementById('score').textContent = (data.statistics.compliance_score || 0).toFixed(0) + '%';
document.getElementById('controls').textContent = data.statistics.total_controls || 0;
document.getElementById('evidence').textContent = data.statistics.total_evidence || 0;
document.getElementById('regulations').textContent = data.statistics.total_regulations || 0;
})
.catch(() => console.log('Could not load summary'));
// Load regulations list
const regsDir = 'regulations/';
document.getElementById('regulations-list').innerHTML =
'<li><a href="regulations/gdpr.json">GDPR</a> - Datenschutz-Grundverordnung</li>' +
'<li><a href="regulations/aiact.json">AI Act</a> - KI-Verordnung</li>' +
'<li><a href="regulations/cra.json">CRA</a> - Cyber Resilience Act</li>';
// Load domain summary
fetch('controls/domain_summary.json')
.then(r => r.json())
.then(data => {
const list = document.getElementById('domains-list');
list.innerHTML = Object.entries(data).map(([domain, stats]) =>
`<li><strong>${domain.toUpperCase()}</strong>: ${stats.pass || 0}/${stats.total} controls passing</li>`
).join('');
})
.catch(() => console.log('Could not load domain summary'));
</script>
</body>
</html>"""
    with open(output_dir / "index.html", "w", encoding="utf-8") as f:
        f.write(html)
def _calculate_statistics(
    self,
    included_regulations: Optional[List[str]],
    included_domains: Optional[List[str]],
) -> Dict[str, Any]:
    """Compute headline compliance statistics for the export summary.

    Args:
        included_regulations: Regulation codes to restrict the count to;
            ``None`` counts every active regulation.
        included_domains: Control-domain names to restrict control counts to;
            ``None`` counts controls in all domains.

    Returns:
        Dict with regulation/control/evidence totals and ``compliance_score``,
        a percentage in which a PASS control counts fully and a PARTIAL
        control counts half.
    """
    # Count regulations. `== True` is kept deliberately: this is a SQLAlchemy
    # column expression, not a Python boolean comparison.
    reg_query = self.db.query(RegulationDB).filter(RegulationDB.is_active == True)
    if included_regulations:
        reg_query = reg_query.filter(RegulationDB.code.in_(included_regulations))
    total_regulations = reg_query.count()
    # Count controls, optionally narrowed to the requested domains.
    ctrl_query = self.db.query(ControlDB)
    if included_domains:
        # Imported locally to avoid a module-level import cycle — TODO confirm.
        from ..db.models import ControlDomainEnum
        domain_enums = [ControlDomainEnum(d) for d in included_domains]
        ctrl_query = ctrl_query.filter(ControlDB.domain.in_(domain_enums))
    total_controls = ctrl_query.count()
    passing_controls = ctrl_query.filter(ControlDB.status == ControlStatusEnum.PASS).count()
    partial_controls = ctrl_query.filter(ControlDB.status == ControlStatusEnum.PARTIAL).count()
    # Evidence is counted globally — it is not filtered by regulation/domain.
    total_evidence = self.db.query(EvidenceDB).count()
    # Weighted score; guard against division by zero when no controls exist.
    if total_controls > 0:
        score = ((passing_controls + partial_controls * 0.5) / total_controls) * 100
    else:
        score = 0
    return {
        "total_regulations": total_regulations,
        "total_controls": total_controls,
        "passing_controls": passing_controls,
        "partial_controls": partial_controls,
        "total_evidence": total_evidence,
        "compliance_score": round(score, 1),
    }
def _calculate_file_hash(self, file_path: Path) -> str:
"""Calculate SHA-256 hash of file."""
sha256 = hashlib.sha256()
with open(file_path, "rb") as f:
for chunk in iter(lambda: f.read(8192), b""):
sha256.update(chunk)
return sha256.hexdigest()
def get_export_status(self, export_id: str) -> Optional[AuditExportDB]:
    """Return the export row for *export_id*, or ``None`` if it does not exist.

    NOTE(review): ``Query.get()`` is the legacy SQLAlchemy API (deprecated in
    1.4+); presumably this project pins an older SQLAlchemy — confirm before
    migrating to ``Session.get()``.
    """
    return self.db.query(AuditExportDB).get(export_id)
def list_exports(
    self, limit: int = 20, offset: int = 0
) -> List[AuditExportDB]:
    """Return recent exports, newest first, with offset/limit paging.

    Args:
        limit: Maximum number of rows to return (default 20).
        offset: Number of rows to skip, for pagination.
    """
    newest_first = self.db.query(AuditExportDB).order_by(
        AuditExportDB.requested_at.desc()
    )
    return newest_first.offset(offset).limit(limit).all()

View File

@@ -0,0 +1,622 @@
"""
LLM Provider Abstraction for Compliance AI Features.
Supports:
- Anthropic Claude API (default)
- Self-Hosted LLMs (Ollama, vLLM, LocalAI, etc.)
- HashiCorp Vault integration for secure API key storage
Configuration via environment variables:
- COMPLIANCE_LLM_PROVIDER: "anthropic" or "self_hosted"
- ANTHROPIC_API_KEY: API key for Claude (or loaded from Vault)
- ANTHROPIC_MODEL: Model name (default: claude-sonnet-4-20250514)
- SELF_HOSTED_LLM_URL: Base URL for self-hosted LLM
- SELF_HOSTED_LLM_MODEL: Model name for self-hosted
- SELF_HOSTED_LLM_KEY: Optional API key for self-hosted
Vault Configuration:
- VAULT_ADDR: Vault server address (e.g., http://vault:8200)
- VAULT_TOKEN: Vault authentication token
- USE_VAULT_SECRETS: Set to "true" to enable Vault integration
- VAULT_SECRET_PATH: Path to secrets (default: secret/breakpilot/api_keys)
"""
import asyncio
import logging
import os
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional

import httpx
logger = logging.getLogger(__name__)
# =============================================================================
# Vault Integration
# =============================================================================
class VaultClient:
    """
    HashiCorp Vault client for retrieving secrets.

    Supports the KV v2 secrets engine. Successfully fetched secrets are
    cached in-process for ``_cache_ttl`` seconds so rotated secrets are
    eventually picked up without hammering Vault on every call.
    """

    def __init__(
        self,
        addr: Optional[str] = None,
        token: Optional[str] = None
    ):
        """Create a client for *addr*, falling back to VAULT_ADDR / VAULT_TOKEN env vars."""
        self.addr = addr or os.getenv("VAULT_ADDR", "http://localhost:8200")
        self.token = token or os.getenv("VAULT_TOKEN")
        # Maps "path:key" -> (secret value, time.monotonic() when stored).
        self._cache: Dict[str, Any] = {}
        self._cache_ttl = 300  # 5 minutes cache

    def _get_headers(self) -> Dict[str, str]:
        """Get request headers with Vault token."""
        headers = {"Content-Type": "application/json"}
        if self.token:
            headers["X-Vault-Token"] = self.token
        return headers

    def get_secret(self, path: str, key: str = "value") -> Optional[str]:
        """
        Get a secret from Vault KV v2.

        Args:
            path: Secret path (e.g., "breakpilot/api_keys/anthropic")
            key: Key within the secret data (default: "value")

        Returns:
            Secret value or None if not found
        """
        cache_key = f"{path}:{key}"
        # Serve from cache while the entry is younger than the TTL.
        # (Fix: _cache_ttl was previously declared but never enforced, so
        # cached secrets lived for the process lifetime.)
        cached = self._cache.get(cache_key)
        if cached is not None:
            value, stored_at = cached
            if time.monotonic() - stored_at < self._cache_ttl:
                return value
            del self._cache[cache_key]
        try:
            # KV v2 exposes secret payloads under /v1/secret/data/<path>.
            full_path = f"{self.addr}/v1/secret/data/{path}"
            response = httpx.get(
                full_path,
                headers=self._get_headers(),
                timeout=10.0
            )
            if response.status_code == 200:
                data = response.json()
                # KV v2 nests the payload one level deeper: data.data.
                secret_data = data.get("data", {}).get("data", {})
                secret_value = secret_data.get(key)
                if secret_value:
                    self._cache[cache_key] = (secret_value, time.monotonic())
                    logger.info(f"Successfully loaded secret from Vault: {path}")
                    return secret_value
            elif response.status_code == 404:
                logger.warning(f"Secret not found in Vault: {path}")
            else:
                logger.error(f"Vault error {response.status_code}: {response.text}")
        except httpx.RequestError as e:
            logger.error(f"Failed to connect to Vault at {self.addr}: {e}")
        except Exception as e:
            logger.error(f"Error retrieving secret from Vault: {e}")
        return None

    def get_anthropic_key(self) -> Optional[str]:
        """Get the Anthropic API key from Vault (path overridable via VAULT_ANTHROPIC_PATH)."""
        path = os.getenv("VAULT_ANTHROPIC_PATH", "breakpilot/api_keys/anthropic")
        return self.get_secret(path, "value")

    def is_available(self) -> bool:
        """Check if Vault is reachable and authenticated.

        Vault's health endpoint reports standby/DR/sealed states via the
        non-200 codes below; all of them still mean "a Vault is there".
        """
        try:
            response = httpx.get(
                f"{self.addr}/v1/sys/health",
                headers=self._get_headers(),
                timeout=5.0
            )
            return response.status_code in (200, 429, 472, 473, 501, 503)
        except Exception:
            return False
# Singleton Vault client
_vault_client: Optional[VaultClient] = None


def get_vault_client() -> VaultClient:
    """Return the process-wide VaultClient, constructing it lazily on first use."""
    global _vault_client
    if _vault_client is not None:
        return _vault_client
    _vault_client = VaultClient()
    return _vault_client
def get_secret_from_vault_or_env(
    vault_path: str,
    env_var: str,
    vault_key: str = "value"
) -> Optional[str]:
    """
    Resolve a secret, preferring Vault and falling back to an environment variable.

    Args:
        vault_path: Path in Vault (e.g., "breakpilot/api_keys/anthropic")
        env_var: Environment variable name used as the fallback
        vault_key: Key within the Vault secret data

    Returns:
        The secret value, or None when neither source provides one.
    """
    vault_enabled = os.getenv("USE_VAULT_SECRETS", "").lower() in ("true", "1", "yes")
    if vault_enabled:
        value = get_vault_client().get_secret(vault_path, vault_key)
        if value:
            return value
        logger.info(f"Vault secret not found, falling back to env: {env_var}")
    return os.getenv(env_var)
class LLMProviderType(str, Enum):
    """Supported LLM provider types.

    Inherits from ``str`` so members compare/serialize as plain strings
    (e.g. when read from the COMPLIANCE_LLM_PROVIDER env var).
    """
    ANTHROPIC = "anthropic"      # Hosted Claude API
    SELF_HOSTED = "self_hosted"  # Ollama / vLLM / LocalAI, etc.
    MOCK = "mock"  # For testing
@dataclass
class LLMResponse:
    """Standard response envelope returned by every LLM provider."""
    content: str  # Generated text (may be an "Error: ..." string in batch mode)
    model: str  # Model that produced the output
    provider: str  # Provider name: "anthropic", "self_hosted", or "mock"
    usage: Optional[Dict[str, int]] = None  # Token usage, if the backend reports it
    raw_response: Optional[Dict[str, Any]] = None  # Full decoded JSON payload, for debugging
@dataclass
class LLMConfig:
    """Configuration for an LLM provider (usually built by get_llm_config())."""
    provider_type: LLMProviderType  # Which backend to use
    api_key: Optional[str] = None  # Credential; optional for self-hosted/mock
    model: str = "claude-sonnet-4-20250514"  # Default model name
    base_url: Optional[str] = None  # Required for self-hosted providers
    max_tokens: int = 4096  # Default completion length cap
    temperature: float = 0.3  # Default sampling temperature
    timeout: float = 60.0  # HTTP timeout in seconds
class LLMProvider(ABC):
    """Abstract base class for LLM providers.

    Concrete providers implement single-prompt and batched completion on
    top of a shared :class:`LLMConfig`.
    """

    def __init__(self, config: LLMConfig):
        # Subclasses read model/credentials/limits from this config.
        self.config = config

    @abstractmethod
    async def complete(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None
    ) -> LLMResponse:
        """Generate a completion for the given prompt.

        Per-call *max_tokens*/*temperature* override the config defaults.
        """
        pass

    @abstractmethod
    async def batch_complete(
        self,
        prompts: List[str],
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = None,
        rate_limit: float = 1.0
    ) -> List[LLMResponse]:
        """Generate completions for multiple prompts with rate limiting.

        *rate_limit* is the pause in seconds between consecutive calls.
        """
        pass

    @property
    @abstractmethod
    def provider_name(self) -> str:
        """Return the provider name reported in LLMResponse.provider."""
        pass
class AnthropicProvider(LLMProvider):
    """Claude API provider backed by Anthropic's official Messages API."""

    ANTHROPIC_API_URL = "https://api.anthropic.com/v1/messages"

    def __init__(self, config: LLMConfig):
        """Validate credentials and remember key/model.

        Raises:
            ValueError: if the config carries no API key.
        """
        super().__init__(config)
        if not config.api_key:
            raise ValueError("Anthropic API key is required")
        self.api_key = config.api_key
        self.model = config.model or "claude-sonnet-4-20250514"

    @property
    def provider_name(self) -> str:
        """Identifier reported in LLMResponse.provider."""
        return "anthropic"

    async def complete(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None
    ) -> LLMResponse:
        """Send a single user message to Claude and return the parsed reply."""
        request_headers = {
            "x-api-key": self.api_key,
            "anthropic-version": "2023-06-01",
            "content-type": "application/json"
        }
        body: Dict[str, Any] = {
            "model": self.model,
            "max_tokens": max_tokens or self.config.max_tokens,
            "messages": [{"role": "user", "content": prompt}]
        }
        if system_prompt:
            body["system"] = system_prompt
        # A per-call temperature wins; otherwise fall back to the configured one.
        effective_temp = temperature if temperature is not None else self.config.temperature
        if effective_temp is not None:
            body["temperature"] = effective_temp
        async with httpx.AsyncClient(timeout=self.config.timeout) as client:
            try:
                response = await client.post(
                    self.ANTHROPIC_API_URL,
                    headers=request_headers,
                    json=body
                )
                response.raise_for_status()
                data = response.json()
                # The Messages API returns a list of content blocks; take the
                # first block's text (empty string when nothing came back).
                blocks = data.get("content")
                text = blocks[0].get("text", "") if blocks else ""
                return LLMResponse(
                    content=text,
                    model=self.model,
                    provider=self.provider_name,
                    usage=data.get("usage"),
                    raw_response=data
                )
            except httpx.HTTPStatusError as e:
                logger.error(f"Anthropic API error: {e.response.status_code} - {e.response.text}")
                raise
            except Exception as e:
                logger.error(f"Anthropic API request failed: {e}")
                raise

    async def batch_complete(
        self,
        prompts: List[str],
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = None,
        rate_limit: float = 1.0
    ) -> List[LLMResponse]:
        """Run prompts sequentially, sleeping *rate_limit* seconds between calls.

        A failed prompt yields an error-text LLMResponse instead of aborting
        the whole batch.
        """
        responses: List[LLMResponse] = []
        for index, current_prompt in enumerate(prompts):
            if index:
                await asyncio.sleep(rate_limit)
            try:
                responses.append(await self.complete(
                    prompt=current_prompt,
                    system_prompt=system_prompt,
                    max_tokens=max_tokens
                ))
            except Exception as e:
                logger.error(f"Failed to process prompt {index}: {e}")
                responses.append(LLMResponse(
                    content=f"Error: {str(e)}",
                    model=self.model,
                    provider=self.provider_name
                ))
        return responses
class SelfHostedProvider(LLMProvider):
    """Self-hosted LLM provider supporting Ollama, vLLM, LocalAI, etc."""

    def __init__(self, config: LLMConfig):
        """Validate and normalize the endpoint configuration.

        Raises:
            ValueError: if no base URL is configured.
        """
        super().__init__(config)
        if not config.base_url:
            raise ValueError("Base URL is required for self-hosted provider")
        self.base_url = config.base_url.rstrip("/")
        self.model = config.model
        self.api_key = config.api_key

    @property
    def provider_name(self) -> str:
        """Identifier reported in LLMResponse.provider."""
        return "self_hosted"

    def _detect_api_format(self) -> str:
        """Guess the wire protocol from the base URL; Ollama is the default."""
        url = self.base_url
        lowered = url.lower()
        if "11434" in url or "ollama" in lowered:
            return "ollama"
        if "openai" in lowered or "v1" in url:
            return "openai"
        return "ollama"  # Default to Ollama format

    async def complete(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None
    ) -> LLMResponse:
        """Generate a completion via the configured self-hosted endpoint."""
        api_format = self._detect_api_format()
        request_headers = {"content-type": "application/json"}
        if self.api_key:
            request_headers["Authorization"] = f"Bearer {self.api_key}"
        if api_format == "ollama":
            # Ollama's native /api/generate endpoint takes one flat prompt,
            # so a system prompt is prepended to the user prompt.
            endpoint = f"{self.base_url}/api/generate"
            merged_prompt = f"{system_prompt}\n\n{prompt}" if system_prompt else prompt
            options: Dict[str, Any] = {}
            if max_tokens:
                options["num_predict"] = max_tokens
            if temperature is not None:
                options["temperature"] = temperature
            body: Dict[str, Any] = {
                "model": self.model,
                "prompt": merged_prompt,
                "stream": False,
                "options": options
            }
        else:
            # OpenAI-compatible chat endpoint (vLLM, LocalAI, etc.).
            endpoint = f"{self.base_url}/v1/chat/completions"
            chat: List[Dict[str, str]] = []
            if system_prompt:
                chat.append({"role": "system", "content": system_prompt})
            chat.append({"role": "user", "content": prompt})
            body = {
                "model": self.model,
                "messages": chat,
                "max_tokens": max_tokens or self.config.max_tokens,
                "temperature": temperature if temperature is not None else self.config.temperature
            }
        async with httpx.AsyncClient(timeout=self.config.timeout) as client:
            try:
                response = await client.post(endpoint, headers=request_headers, json=body)
                response.raise_for_status()
                data = response.json()
                # Response shape differs per protocol.
                if api_format == "ollama":
                    text = data.get("response", "")
                else:
                    text = data.get("choices", [{}])[0].get("message", {}).get("content", "")
                return LLMResponse(
                    content=text,
                    model=self.model,
                    provider=self.provider_name,
                    usage=data.get("usage"),
                    raw_response=data
                )
            except httpx.HTTPStatusError as e:
                logger.error(f"Self-hosted LLM error: {e.response.status_code} - {e.response.text}")
                raise
            except Exception as e:
                logger.error(f"Self-hosted LLM request failed: {e}")
                raise

    async def batch_complete(
        self,
        prompts: List[str],
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = None,
        rate_limit: float = 0.5  # Self-hosted can be faster
    ) -> List[LLMResponse]:
        """Run prompts sequentially with *rate_limit* seconds between calls.

        Failures are converted to error-text LLMResponse objects so one bad
        prompt does not abort the batch.
        """
        responses: List[LLMResponse] = []
        for index, current_prompt in enumerate(prompts):
            if index:
                await asyncio.sleep(rate_limit)
            try:
                responses.append(await self.complete(
                    prompt=current_prompt,
                    system_prompt=system_prompt,
                    max_tokens=max_tokens
                ))
            except Exception as e:
                logger.error(f"Failed to process prompt {index}: {e}")
                responses.append(LLMResponse(
                    content=f"Error: {str(e)}",
                    model=self.model,
                    provider=self.provider_name
                ))
        return responses
class MockProvider(LLMProvider):
    """Mock provider for testing without actual API calls."""

    def __init__(self, config: LLMConfig):
        super().__init__(config)
        self.responses: List[str] = []  # Canned replies, cycled through
        self.call_count = 0  # Number of complete() invocations so far

    @property
    def provider_name(self) -> str:
        """Identifier reported in LLMResponse.provider."""
        return "mock"

    def set_responses(self, responses: List[str]):
        """Install canned responses for testing and reset the call counter."""
        self.responses = responses
        self.call_count = 0

    async def complete(
        self,
        prompt: str,
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = None,
        temperature: Optional[float] = None
    ) -> LLMResponse:
        """Return the next canned response, or a deterministic echo of the prompt."""
        if self.responses:
            reply = self.responses[self.call_count % len(self.responses)]
        else:
            reply = f"Mock response for: {prompt[:50]}..."
        self.call_count += 1
        return LLMResponse(
            content=reply,
            model="mock-model",
            provider=self.provider_name,
            usage={"input_tokens": len(prompt), "output_tokens": len(reply)}
        )

    async def batch_complete(
        self,
        prompts: List[str],
        system_prompt: Optional[str] = None,
        max_tokens: Optional[int] = None,
        rate_limit: float = 0.0
    ) -> List[LLMResponse]:
        """Complete every prompt in order; mocks apply no rate limiting."""
        collected: List[LLMResponse] = []
        for single_prompt in prompts:
            collected.append(await self.complete(single_prompt, system_prompt, max_tokens))
        return collected
def get_llm_config() -> LLMConfig:
    """
    Build an LLMConfig from environment variables (API key optionally from Vault).

    API-key resolution order for hosted providers:
    1. Vault (when USE_VAULT_SECRETS=true and the secret is available)
    2. Environment variable fallback

    An unrecognized COMPLIANCE_LLM_PROVIDER value falls back to the mock
    provider rather than failing.
    """
    raw_provider = os.getenv("COMPLIANCE_LLM_PROVIDER", "anthropic")
    try:
        provider_type = LLMProviderType(raw_provider)
    except ValueError:
        logger.warning(f"Unknown LLM provider: {raw_provider}, falling back to mock")
        provider_type = LLMProviderType.MOCK
    # Resolve credentials and the default model for the chosen provider.
    api_key: Optional[str] = None
    model = "mock-model"
    if provider_type == LLMProviderType.ANTHROPIC:
        api_key = get_secret_from_vault_or_env(
            vault_path="breakpilot/api_keys/anthropic",
            env_var="ANTHROPIC_API_KEY"
        )
        model = os.getenv("ANTHROPIC_MODEL", "claude-sonnet-4-20250514")
    elif provider_type == LLMProviderType.SELF_HOSTED:
        api_key = get_secret_from_vault_or_env(
            vault_path="breakpilot/api_keys/self_hosted_llm",
            env_var="SELF_HOSTED_LLM_KEY"
        )
        model = os.getenv("SELF_HOSTED_LLM_MODEL", "qwen2.5:14b")
    return LLMConfig(
        provider_type=provider_type,
        api_key=api_key,
        model=model,
        base_url=os.getenv("SELF_HOSTED_LLM_URL"),
        max_tokens=int(os.getenv("COMPLIANCE_LLM_MAX_TOKENS", "4096")),
        temperature=float(os.getenv("COMPLIANCE_LLM_TEMPERATURE", "0.3")),
        timeout=float(os.getenv("COMPLIANCE_LLM_TIMEOUT", "60.0"))
    )
def get_llm_provider(config: Optional[LLMConfig] = None) -> LLMProvider:
    """
    Factory returning the provider implied by *config* (or the env-derived one).

    Missing credentials (Anthropic key) or endpoint (self-hosted URL) degrade
    gracefully to a MockProvider instead of raising.

    Usage:
        provider = get_llm_provider()
        response = await provider.complete("Analyze this requirement...")
    """
    cfg = config if config is not None else get_llm_config()
    if cfg.provider_type == LLMProviderType.ANTHROPIC:
        if cfg.api_key:
            return AnthropicProvider(cfg)
        logger.warning("No Anthropic API key found, using mock provider")
        return MockProvider(cfg)
    if cfg.provider_type == LLMProviderType.SELF_HOSTED:
        if cfg.base_url:
            return SelfHostedProvider(cfg)
        logger.warning("No self-hosted LLM URL found, using mock provider")
        return MockProvider(cfg)
    if cfg.provider_type == LLMProviderType.MOCK:
        return MockProvider(cfg)
    raise ValueError(f"Unsupported LLM provider type: {cfg.provider_type}")
# Singleton instance for reuse
_provider_instance: Optional[LLMProvider] = None


def get_shared_provider() -> LLMProvider:
    """Return the lazily-created, process-wide LLM provider instance."""
    global _provider_instance
    if _provider_instance is not None:
        return _provider_instance
    _provider_instance = get_llm_provider()
    return _provider_instance


def reset_shared_provider():
    """Drop the cached provider so the next call rebuilds it (useful for testing)."""
    global _provider_instance
    _provider_instance = None

View File

@@ -0,0 +1,602 @@
"""
PDF Extractor for BSI-TR-03161 and EU Regulation Documents.
This module extracts Pruefaspekte (test aspects) from BSI Technical Guidelines
and Articles from EU regulations in PDF format.
"""
import re
import logging
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Any
from pathlib import Path
from enum import Enum
try:
import fitz # PyMuPDF
except ImportError:
fitz = None
logging.warning("PyMuPDF not installed. PDF extraction will not work.")
class RequirementLevel(str, Enum):
    """BSI requirement levels (German: Anforderungsstufen).

    Inherits from ``str`` so members serialize as their German keywords.
    """
    MUSS = "MUSS"  # MUST - mandatory
    SOLL = "SOLL"  # SHOULD - recommended
    KANN = "KANN"  # MAY - optional
    DARF_NICHT = "DARF NICHT"  # MUST NOT - prohibited
class AspectCategory(str, Enum):
    """Categories for BSI-TR Pruefaspekte.

    Coarse security domains used to group extracted requirements; assignment
    is by aspect-ID prefix or keyword scoring (see BSIPDFExtractor).
    """
    AUTHENTICATION = "authentication"
    SESSION_MANAGEMENT = "session_management"
    CRYPTOGRAPHY = "cryptography"
    INPUT_VALIDATION = "input_validation"
    SQL_INJECTION = "sql_injection"
    XSS_PREVENTION = "xss_prevention"
    CSRF_PROTECTION = "csrf_protection"
    LOGGING_AUDIT = "logging_audit"
    ERROR_HANDLING = "error_handling"
    NETWORK_SECURITY = "network_security"
    SECURE_STORAGE = "secure_storage"
    PRIVACY = "privacy"
    ACCESS_CONTROL = "access_control"
    DATA_PROTECTION = "data_protection"
    KEY_MANAGEMENT = "key_management"
    SECURE_COMMUNICATION = "secure_communication"
    UPDATE_MECHANISM = "update_mechanism"
    GENERAL = "general"  # Fallback when no keywords match
    TEST_ASPECT = "test_aspect"  # T.* aspect IDs
@dataclass
class BSIAspect:
    """A single extracted BSI-TR Pruefaspekt (test aspect).

    One instance corresponds to one requirement occurrence found in a PDF;
    duplicates are handled later by the extractor's post-processing.
    """
    aspect_id: str  # e.g., "O.Auth_1", "T.Sess_2"
    title: str  # Short title
    full_text: str  # Complete requirement text
    category: AspectCategory  # Categorization
    page_number: int  # PDF page where found (1-based)
    section: str  # Chapter/section number
    requirement_level: RequirementLevel  # MUSS/SOLL/KANN
    source_document: str  # e.g., "BSI-TR-03161-2"
    context_before: str = ""  # Text before the aspect
    context_after: str = ""  # Text after the aspect
    related_aspects: List[str] = field(default_factory=list)  # Related aspect IDs
    keywords: List[str] = field(default_factory=list)  # Extracted keywords
@dataclass
class EUArticle:
    """A single extracted EU regulation article."""
    article_number: str  # e.g., "Art. 32", "Artikel 5"
    title: str  # Article title
    full_text: str  # Complete article text
    paragraphs: List[str]  # Individual paragraphs
    page_number: int  # PDF page
    regulation_name: str  # e.g., "DSGVO", "AI Act"
    recitals: List[str] = field(default_factory=list)  # Related recitals
    keywords: List[str] = field(default_factory=list)  # Extracted keywords
class BSIPDFExtractor:
"""
Extracts Pruefaspekte from BSI-TR-03161 PDF documents.
The BSI-TR-03161 series contains security requirements for mobile applications:
- Part 1: General security requirements
- Part 2: Web application security (OAuth, Sessions, Input validation, etc.)
- Part 3: Backend/server security
Each document contains hundreds of Pruefaspekte (test aspects) that need to
be extracted, categorized, and stored for compliance tracking.
"""
# Regex patterns for BSI-TR aspect identification.
PATTERNS = {
    # Primary aspect ID patterns (e.g., O.Auth_1, T.Network_2)
    'aspect_id': r'(O\.[A-Za-z]+_\d+|T\.[A-Za-z]+_\d+)',
    # Alternative section-based pattern (e.g., "Pruefaspekt 4.2.1")
    'section_aspect': r'(?:Prüfaspekt|Pruefaspekt|Anforderung)\s+(\d+\.\d+(?:\.\d+)?)',
    # Section number pattern (2- or 3-level, e.g. "4.2" / "4.2.1")
    'section': r'(\d+\.\d+(?:\.\d+)?)',
    # Requirement level pattern (German normative keywords, both cases)
    'requirement': r'\b(MUSS|SOLL|KANN|DARF\s+NICHT|muss|soll|kann|darf\s+nicht)\b',
    # Table header pattern for Pruefaspekte tables
    'table_header': r'(?:Prüfaspekt|Bezeichnung|ID|Anforderung)',
}
# Category mapping based on aspect ID prefix (matched case-insensitively;
# first matching prefix in dict order wins, so "T." catches all T.* IDs).
CATEGORY_MAP = {
    'O.Auth': AspectCategory.AUTHENTICATION,
    'O.Sess': AspectCategory.SESSION_MANAGEMENT,
    'O.Cryp': AspectCategory.CRYPTOGRAPHY,
    'O.Crypto': AspectCategory.CRYPTOGRAPHY,
    'O.Input': AspectCategory.INPUT_VALIDATION,
    'O.SQL': AspectCategory.SQL_INJECTION,
    'O.XSS': AspectCategory.XSS_PREVENTION,
    'O.CSRF': AspectCategory.CSRF_PROTECTION,
    'O.Log': AspectCategory.LOGGING_AUDIT,
    'O.Audit': AspectCategory.LOGGING_AUDIT,
    'O.Err': AspectCategory.ERROR_HANDLING,
    'O.Error': AspectCategory.ERROR_HANDLING,
    'O.Net': AspectCategory.NETWORK_SECURITY,
    'O.Network': AspectCategory.NETWORK_SECURITY,
    'O.Store': AspectCategory.SECURE_STORAGE,
    'O.Storage': AspectCategory.SECURE_STORAGE,
    'O.Priv': AspectCategory.PRIVACY,
    'O.Privacy': AspectCategory.PRIVACY,
    'O.Data': AspectCategory.DATA_PROTECTION,
    'O.Access': AspectCategory.ACCESS_CONTROL,
    'O.Key': AspectCategory.KEY_MANAGEMENT,
    'O.Comm': AspectCategory.SECURE_COMMUNICATION,
    'O.TLS': AspectCategory.SECURE_COMMUNICATION,
    'O.Update': AspectCategory.UPDATE_MECHANISM,
    'T.': AspectCategory.TEST_ASPECT,
}
# Keywords for category detection when the aspect ID is ambiguous;
# matched as lowercase substrings, one point per keyword hit.
CATEGORY_KEYWORDS = {
    AspectCategory.AUTHENTICATION: [
        'authentifizierung', 'authentication', 'login', 'anmeldung',
        'passwort', 'password', 'credential', 'oauth', 'oidc', 'token',
        'bearer', 'jwt', 'session', 'multi-faktor', 'mfa', '2fa'
    ],
    AspectCategory.SESSION_MANAGEMENT: [
        'session', 'sitzung', 'cookie', 'timeout', 'ablauf',
        'session-id', 'sessionid', 'logout', 'abmeldung'
    ],
    AspectCategory.CRYPTOGRAPHY: [
        'verschlüsselung', 'encryption', 'kryptograph', 'cryptograph',
        'aes', 'rsa', 'hash', 'signatur', 'signature', 'zertifikat',
        'certificate', 'tls', 'ssl', 'hmac', 'pbkdf', 'argon'
    ],
    AspectCategory.INPUT_VALIDATION: [
        'eingabevalidierung', 'input validation', 'validierung',
        'eingabeprüfung', 'sanitiz', 'whitelist', 'blacklist',
        'filter', 'escape', 'encoding'
    ],
    AspectCategory.SQL_INJECTION: [
        'sql injection', 'sql-injection', 'prepared statement',
        'parameterisiert', 'parameterized', 'orm', 'database'
    ],
    AspectCategory.XSS_PREVENTION: [
        'xss', 'cross-site scripting', 'script injection',
        'html encoding', 'output encoding', 'csp', 'content-security'
    ],
    AspectCategory.CSRF_PROTECTION: [
        'csrf', 'cross-site request', 'token', 'anti-csrf',
        'state parameter', 'same-site', 'samesite'
    ],
    AspectCategory.LOGGING_AUDIT: [
        'logging', 'protokollierung', 'audit', 'nachvollziehbar',
        'traceability', 'log', 'event', 'monitoring'
    ],
    AspectCategory.ERROR_HANDLING: [
        'fehlerbehandlung', 'error handling', 'exception',
        'fehlermeldung', 'error message', 'stack trace'
    ],
}
def __init__(self, logger: Optional[logging.Logger] = None):
    """Initialize the PDF extractor.

    Args:
        logger: Optional logger; defaults to this module's logger.

    Raises:
        ImportError: if PyMuPDF (fitz) is not installed — all extraction
            goes through it, so failing early is deliberate.
    """
    self.logger = logger or logging.getLogger(__name__)
    if fitz is None:
        raise ImportError(
            "PyMuPDF is required for PDF extraction. "
            "Install it with: pip install PyMuPDF"
        )
def extract_from_file(self, pdf_path: str, source_name: Optional[str] = None) -> List[BSIAspect]:
    """
    Extract all Pruefaspekte from a BSI-TR PDF file.

    Args:
        pdf_path: Path to the PDF file
        source_name: Optional source document name (defaults to the file stem)

    Returns:
        List of extracted BSIAspect objects, deduplicated and enriched.

    Raises:
        FileNotFoundError: if *pdf_path* does not exist.
    """
    path = Path(pdf_path)
    if not path.exists():
        raise FileNotFoundError(f"PDF file not found: {pdf_path}")
    source = source_name or path.stem
    self.logger.info(f"Extracting aspects from: {source}")
    doc = fitz.open(pdf_path)
    aspects = []
    # Scan page by page; reported page numbers are 1-based.
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text()
        # Extract aspects from this page
        page_aspects = self._extract_aspects_from_text(
            text=text,
            page_num=page_num + 1,
            source_document=source
        )
        aspects.extend(page_aspects)
    doc.close()
    # Post-process: deduplicate and enrich (helper implementations below).
    aspects = self._deduplicate_aspects(aspects)
    aspects = self._enrich_aspects(aspects)
    self.logger.info(f"Extracted {len(aspects)} unique aspects from {source}")
    return aspects
def extract_all_documents(self, docs_dir: str) -> Dict[str, List[BSIAspect]]:
    """
    Extract aspects from every BSI-TR-03161 PDF found in *docs_dir*.

    Files that fail to parse are logged and skipped rather than aborting
    the whole run.

    Args:
        docs_dir: Directory containing BSI-TR PDF files

    Returns:
        Mapping of PDF stem name to its extracted aspects.
    """
    base = Path(docs_dir)
    results: Dict[str, List[BSIAspect]] = {}
    # Both upper- and lower-case filename variants are accepted.
    for pattern in ("BSI-TR-03161*.pdf", "bsi-tr-03161*.pdf"):
        for pdf_file in base.glob(pattern):
            try:
                results[pdf_file.stem] = self.extract_from_file(str(pdf_file))
            except Exception as e:
                self.logger.error(f"Failed to extract from {pdf_file}: {e}")
    return results
def _extract_aspects_from_text(
    self,
    text: str,
    page_num: int,
    source_document: str
) -> List[BSIAspect]:
    """Extract all Pruefaspekte from one page's text.

    Two passes are made: first over explicit aspect IDs (O.*/T.*), then
    over section-based "Prüfaspekt N.N[.N]" headings that were not already
    covered by an ID-based hit on the same section.

    Args:
        text: Plain text of a single PDF page.
        page_num: 1-based page number recorded on each aspect.
        source_document: Document name recorded on each aspect.
    """
    aspects = []
    # Pass 1: find all O.*/T.* aspect IDs on this page.
    for match in re.finditer(self.PATTERNS['aspect_id'], text, re.IGNORECASE):
        aspect_id = match.group(1).upper()
        # Slice a context window around the match: up to 200 chars before
        # (for section numbers) and 1000 after (for the requirement text).
        start = max(0, match.start() - 200)
        end = min(len(text), match.end() + 1000)
        context = text[start:end]
        # Determine category from aspect ID (keyword fallback inside).
        category = self._determine_category(aspect_id, context)
        # Extract requirement level (MUSS/SOLL/KANN/DARF NICHT)
        req_level = self._extract_requirement_level(context)
        # Extract title (text immediately after aspect ID)
        title = self._extract_title(context, aspect_id)
        # Extract section number
        section = self._extract_section(context)
        # Extract full requirement text
        full_text = self._extract_full_text(context, aspect_id)
        aspects.append(BSIAspect(
            aspect_id=aspect_id,
            title=title,
            full_text=full_text,
            category=category,
            page_number=page_num,
            section=section,
            requirement_level=req_level,
            source_document=source_document,
            # Keep short snippets of surrounding text for later review.
            context_before=text[start:match.start()].strip()[-100:],
            context_after=text[match.end():end].strip()[:200],
        ))
    # Pass 2: section-based aspects ("Prüfaspekt 4.2.1") without an O./T. ID.
    for match in re.finditer(self.PATTERNS['section_aspect'], text, re.IGNORECASE):
        section_id = match.group(1)
        # Synthesize a stable ID from the section number.
        aspect_id = f"SEC_{section_id.replace('.', '_')}"
        # Skip if pass 1 already produced an aspect for this section.
        if any(a.section == section_id for a in aspects):
            continue
        start = max(0, match.start() - 100)
        end = min(len(text), match.end() + 800)
        context = text[start:end]
        # No ID prefix to go on — classify purely by keywords.
        category = self._determine_category_from_keywords(context)
        req_level = self._extract_requirement_level(context)
        aspects.append(BSIAspect(
            aspect_id=aspect_id,
            title=f"Prüfaspekt {section_id}",
            full_text=context.strip(),
            category=category,
            page_number=page_num,
            section=section_id,
            requirement_level=req_level,
            source_document=source_document,
        ))
    return aspects
def _determine_category(self, aspect_id: str, context: str) -> AspectCategory:
    """Map an aspect to a category by its ID prefix, else by context keywords."""
    normalized = aspect_id.upper()
    # ID prefix is the most reliable signal; first matching prefix wins.
    for prefix, category in self.CATEGORY_MAP.items():
        if normalized.startswith(prefix.upper()):
            return category
    # No known prefix — fall back to keyword scoring over the context.
    return self._determine_category_from_keywords(context)
def _determine_category_from_keywords(self, text: str) -> AspectCategory:
"""Determine category based on keywords in the text."""
text_lower = text.lower()
category_scores = {}
for category, keywords in self.CATEGORY_KEYWORDS.items():
score = sum(1 for kw in keywords if kw in text_lower)
if score > 0:
category_scores[category] = score
if category_scores:
return max(category_scores, key=category_scores.get)
return AspectCategory.GENERAL
def _extract_requirement_level(self, text: str) -> RequirementLevel:
"""Extract the requirement level from text."""
match = re.search(self.PATTERNS['requirement'], text, re.IGNORECASE)
if match:
level = match.group(1).upper()
if 'DARF' in level and 'NICHT' in level:
return RequirementLevel.DARF_NICHT
elif level == 'MUSS':
return RequirementLevel.MUSS
elif level == 'SOLL':
return RequirementLevel.SOLL
elif level == 'KANN':
return RequirementLevel.KANN
return RequirementLevel.SOLL # Default
def _extract_title(self, context: str, aspect_id: str) -> str:
"""Extract the title/short description of an aspect."""
# Look for text immediately after the aspect ID
pattern = rf'{re.escape(aspect_id)}\s*[:\-]?\s*([^\n]+)'
match = re.search(pattern, context, re.IGNORECASE)
if match:
title = match.group(1).strip()
# Clean up the title
title = re.sub(r'\s+', ' ', title)
# Truncate if too long
if len(title) > 200:
title = title[:197] + "..."
return title
return aspect_id
def _extract_section(self, context: str) -> str:
"""Extract the section number from context."""
match = re.search(self.PATTERNS['section'], context)
return match.group(1) if match else ""
def _extract_full_text(self, context: str, aspect_id: str) -> str:
"""Extract the complete requirement text."""
# Find the aspect ID and get text until the next aspect or section
pattern = rf'{re.escape(aspect_id)}[^\n]*\n(.*?)(?=\n\s*(?:O\.[A-Z]|T\.[A-Z]|\d+\.\d+\s|\Z))'
match = re.search(pattern, context, re.IGNORECASE | re.DOTALL)
if match:
full_text = match.group(0).strip()
else:
# Fall back to context
full_text = context.strip()
# Clean up
full_text = re.sub(r'\s+', ' ', full_text)
return full_text
def _deduplicate_aspects(self, aspects: List[BSIAspect]) -> List[BSIAspect]:
"""Remove duplicate aspects, keeping the one with more context."""
seen = {}
for aspect in aspects:
key = aspect.aspect_id
if key not in seen:
seen[key] = aspect
else:
# Keep the one with longer full_text
if len(aspect.full_text) > len(seen[key].full_text):
seen[key] = aspect
return list(seen.values())
def _enrich_aspects(self, aspects: List[BSIAspect]) -> List[BSIAspect]:
"""Enrich aspects with additional metadata."""
aspect_ids = {a.aspect_id for a in aspects}
for aspect in aspects:
# Find related aspects mentioned in the full text
for other_id in aspect_ids:
if other_id != aspect.aspect_id and other_id in aspect.full_text:
aspect.related_aspects.append(other_id)
# Extract keywords based on category
aspect.keywords = self._extract_keywords(aspect)
return aspects
def _extract_keywords(self, aspect: BSIAspect) -> List[str]:
"""Extract relevant keywords from an aspect."""
keywords = []
text_lower = aspect.full_text.lower()
# Add keywords based on category
if aspect.category in self.CATEGORY_KEYWORDS:
for kw in self.CATEGORY_KEYWORDS[aspect.category]:
if kw in text_lower:
keywords.append(kw)
return list(set(keywords))[:10] # Limit to 10 keywords
def get_statistics(self, aspects: List[BSIAspect]) -> Dict[str, Any]:
"""Get statistics about extracted aspects."""
stats = {
"total_aspects": len(aspects),
"by_category": {},
"by_requirement_level": {},
"by_source": {},
"unique_sections": set(),
}
for aspect in aspects:
# By category
cat = aspect.category.value
stats["by_category"][cat] = stats["by_category"].get(cat, 0) + 1
# By requirement level
level = aspect.requirement_level.value
stats["by_requirement_level"][level] = stats["by_requirement_level"].get(level, 0) + 1
# By source
src = aspect.source_document
stats["by_source"][src] = stats["by_source"].get(src, 0) + 1
# Unique sections
if aspect.section:
stats["unique_sections"].add(aspect.section)
stats["unique_sections"] = len(stats["unique_sections"])
return stats
class EURegulationExtractor:
    """
    Extracts Articles from EU Regulation PDF documents.
    Handles documents like GDPR, AI Act, CRA, etc. in their official formats.

    NOTE(review): articles are detected page by page, so an article that
    spans a page break may be truncated at the page boundary — confirm
    against a real regulation PDF.
    """
    # Regexes for structural markers in regulation text (German + English).
    PATTERNS = {
        'article_de': r'Artikel\s+(\d+)',
        'article_en': r'Article\s+(\d+)',
        'paragraph': r'\((\d+)\)',
        'recital': r'Erwägungsgrund\s+(\d+)|Recital\s+(\d+)',
    }

    def __init__(self, logger: Optional[logging.Logger] = None):
        # Use the caller-supplied logger or fall back to this module's logger.
        self.logger = logger or logging.getLogger(__name__)

    def extract_from_file(
        self,
        pdf_path: str,
        regulation_name: str,
        language: str = "de"
    ) -> List[EUArticle]:
        """Extract all articles from an EU regulation PDF.

        Args:
            pdf_path: Filesystem path to the regulation PDF.
            regulation_name: Name recorded on every extracted EUArticle.
            language: 'de' selects the German article-heading pattern,
                anything else the English one.

        Returns:
            Deduplicated list of EUArticle objects; per article number the
            variant with the longest text wins.

        Raises:
            ImportError: If PyMuPDF (fitz) is not installed.
            FileNotFoundError: If pdf_path does not exist.
        """
        if fitz is None:
            raise ImportError("PyMuPDF is required for PDF extraction.")
        path = Path(pdf_path)
        if not path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")
        doc = fitz.open(pdf_path)
        articles = []
        article_pattern = (
            self.PATTERNS['article_de'] if language == "de"
            else self.PATTERNS['article_en']
        )
        for page_num in range(len(doc)):
            page = doc[page_num]
            text = page.get_text()
            # Find article starts
            for match in re.finditer(article_pattern, text):
                article_num = match.group(1)
                # Extract article content
                start = match.start()
                # Article body runs until the next article heading on this
                # page, or to the end of the page text.
                next_match = re.search(article_pattern, text[match.end():])
                end = match.end() + next_match.start() if next_match else len(text)
                article_text = text[start:end].strip()
                # Extract paragraphs
                paragraphs = self._extract_paragraphs(article_text)
                # Extract title
                title = self._extract_article_title(article_text, article_num)
                articles.append(EUArticle(
                    article_number=f"Art. {article_num}",
                    title=title,
                    full_text=article_text,
                    paragraphs=paragraphs,
                    page_number=page_num + 1,  # 1-based for human-facing output
                    regulation_name=regulation_name,
                ))
        doc.close()
        return self._deduplicate_articles(articles)

    def _extract_paragraphs(self, text: str) -> List[str]:
        """Extract numbered paragraphs from article text.

        Splits on "(n)" markers; each paragraph spans from its marker to
        the next marker (or to the end of the text).
        """
        paragraphs = []
        matches = list(re.finditer(self.PATTERNS['paragraph'], text))
        for i, match in enumerate(matches):
            start = match.start()
            end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
            para_text = text[start:end].strip()
            if para_text:
                paragraphs.append(para_text)
        return paragraphs

    def _extract_article_title(self, text: str, article_num: str) -> str:
        """Extract the title of an article.

        NOTE(review): only matches the German heading "Artikel N"; for
        English documents this always falls back to the generic label —
        confirm whether an English variant is needed.
        """
        # Look for title after "Artikel X"
        pattern = rf'Artikel\s+{article_num}\s*\n\s*([^\n]+)'
        match = re.search(pattern, text)
        if match:
            return match.group(1).strip()
        return f"Artikel {article_num}"

    def _deduplicate_articles(self, articles: List[EUArticle]) -> List[EUArticle]:
        """Remove duplicate articles, keeping the longest text per number."""
        seen = {}
        for article in articles:
            key = article.article_number
            if key not in seen:
                seen[key] = article
            else:
                # Keep the variant with more extracted text.
                if len(article.full_text) > len(seen[key].full_text):
                    seen[key] = article
        return list(seen.values())

View File

@@ -0,0 +1,876 @@
"""
Compliance Regulation Scraper Service.
Extracts requirements and audit aspects from:
- EU-Lex regulations (GDPR, AI Act, CRA, NIS2, etc.)
- BSI Technical Guidelines (TR-03161)
- German laws (TDDDG, etc.)
Similar pattern to edu-search and zeugnisse-crawler.
"""
import logging
import re
import asyncio
from datetime import datetime
from typing import Dict, List, Any, Optional
from enum import Enum
import hashlib
import httpx
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from ..db.models import (
RegulationDB,
RequirementDB,
RegulationTypeEnum,
)
from ..db.repository import (
RegulationRepository,
RequirementRepository,
)
logger = logging.getLogger(__name__)
class SourceType(str, Enum):
    """Where a regulation's text is fetched from."""
    EUR_LEX = "eur_lex"                          # EUR-Lex HTML pages
    BSI_PDF = "bsi_pdf"                          # local BSI Technical Guideline PDFs
    GESETZE_IM_INTERNET = "gesetze_im_internet"  # German federal law portal
    MANUAL = "manual"                            # hand-entered requirements
class ScraperStatus(str, Enum):
    """Lifecycle state of the scraper, reported via get_status()."""
    IDLE = "idle"
    RUNNING = "running"
    COMPLETED = "completed"
    ERROR = "error"
class RegulationScraperService:
    """
    Scrapes and extracts requirements from regulatory sources.

    Supported sources:
    - EUR-Lex: https://eur-lex.europa.eu/eli/reg/{year}/{number}/oj/eng
    - BSI: Local PDF parsing
    - Gesetze-im-Internet: German law portal

    The service is stateful: status, current_source, last_error and stats
    reflect the most recent run and are exposed via get_status().
    """
    # EUR-Lex patterns for article extraction (raw-text fallback when the
    # structured HTML parse yields nothing).
    ARTICLE_PATTERN = re.compile(
        r'Article\s+(\d+[a-z]?)\s*\n\s*(.+?)(?=\nArticle\s+\d|$)',
        re.DOTALL | re.IGNORECASE
    )
    # BSI TR pattern for test aspects ("O.<Name>" identifiers).
    BSI_ASPECT_PATTERN = re.compile(
        r'(O\.[A-Za-z_]+[\d]*)\s+(.+?)(?=\nO\.|$)',
        re.DOTALL
    )
    # Known regulation URLs - All 19 regulations from seed data.
    # Keys are the regulation codes used throughout the DB layer.
    KNOWN_SOURCES = {
        # A. Data protection & data transfer
        "GDPR": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32016R0679",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        "EPRIVACY": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32002L0058",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_DIRECTIVE,
        },
        "SCC": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32021D0914",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        "DPF": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32023D1795",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        # B. AI regulation
        "AIACT": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:L_202401689",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        # C. Cybersecurity & product safety
        "CRA": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=OJ:L_202402847",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        "NIS2": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32022L2555",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_DIRECTIVE,
        },
        "EUCSA": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32019R0881",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        # D. Data economy & interoperability
        "DATAACT": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32023R2854",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        "DGA": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32022R0868",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        # E. Platform obligations
        "DSA": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32022R2065",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        # F. Accessibility
        "EAA": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32019L0882",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_DIRECTIVE,
        },
        # G. IP & copyright
        "DSM": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32019L0790",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_DIRECTIVE,
        },
        # H. Product liability
        "PLD": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32024L2853",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_DIRECTIVE,
        },
        "GPSR": {
            "url": "https://eur-lex.europa.eu/legal-content/EN/TXT/HTML/?uri=CELEX:32023R0988",
            "type": SourceType.EUR_LEX,
            "regulation_type": RegulationTypeEnum.EU_REGULATION,
        },
        # I. BSI standards (Germany) — parsed from local PDFs, not fetched.
        "BSI-TR-03161-1": {
            "url": "/docs/BSI-TR-03161-1.pdf",
            "type": SourceType.BSI_PDF,
            "regulation_type": RegulationTypeEnum.BSI_STANDARD,
        },
        "BSI-TR-03161-2": {
            "url": "/docs/BSI-TR-03161-2.pdf",
            "type": SourceType.BSI_PDF,
            "regulation_type": RegulationTypeEnum.BSI_STANDARD,
        },
        "BSI-TR-03161-3": {
            "url": "/docs/BSI-TR-03161-3.pdf",
            "type": SourceType.BSI_PDF,
            "regulation_type": RegulationTypeEnum.BSI_STANDARD,
        },
    }
def __init__(self, db: Session):
self.db = db
self.reg_repo = RegulationRepository(db)
self.req_repo = RequirementRepository(db)
self.status = ScraperStatus.IDLE
self.current_source: Optional[str] = None
self.last_error: Optional[str] = None
self.stats = {
"sources_processed": 0,
"requirements_extracted": 0,
"errors": 0,
"last_run": None,
}
async def get_status(self) -> Dict[str, Any]:
"""Get current scraper status."""
return {
"status": self.status.value,
"current_source": self.current_source,
"last_error": self.last_error,
"stats": self.stats,
"known_sources": list(self.KNOWN_SOURCES.keys()),
}
    async def scrape_all(self) -> Dict[str, Any]:
        """Scrape all known regulation sources.

        Sources that already have requirements are skipped (use
        scrape_single(force=True) to re-scrape one). A failure on one
        source does not abort the run; it is recorded in the result and
        in self.stats.

        Returns:
            Dict with "success", "failed" and "skipped" lists, one entry
            per source code.
        """
        self.status = ScraperStatus.RUNNING
        # NOTE(review): datetime.utcnow() is naive and deprecated in 3.12 —
        # consider datetime.now(timezone.utc) in a follow-up.
        self.stats["last_run"] = datetime.utcnow().isoformat()
        results = {
            "success": [],
            "failed": [],
            "skipped": [],
        }
        for code, source_info in self.KNOWN_SOURCES.items():
            try:
                self.current_source = code
                # Check if already scraped recently
                existing = self.reg_repo.get_by_code(code)
                if existing and existing.requirements:
                    results["skipped"].append({
                        "code": code,
                        "reason": "already_has_requirements",
                        "requirement_count": len(existing.requirements),
                    })
                    continue
                # Scrape based on source type
                if source_info["type"] == SourceType.EUR_LEX:
                    count = await self._scrape_eurlex(code, source_info)
                elif source_info["type"] == SourceType.BSI_PDF:
                    count = await self._scrape_bsi_pdf(code, source_info)
                else:
                    results["skipped"].append({
                        "code": code,
                        "reason": "unknown_source_type",
                    })
                    continue
                results["success"].append({
                    "code": code,
                    "requirements_extracted": count,
                })
                self.stats["sources_processed"] += 1
                self.stats["requirements_extracted"] += count
            except Exception as e:
                # Record and continue with the next source.
                logger.error(f"Error scraping {code}: {e}")
                results["failed"].append({
                    "code": code,
                    "error": str(e),
                })
                self.stats["errors"] += 1
                self.last_error = str(e)
        self.status = ScraperStatus.COMPLETED
        self.current_source = None
        return results
    async def scrape_single(self, code: str, force: bool = False) -> Dict[str, Any]:
        """Scrape a single regulation source.

        Args:
            code: Key into KNOWN_SOURCES.
            force: When True, delete any existing requirements for the
                regulation and re-scrape from the source.

        Returns:
            Status dict with "code", "status" and either a skip reason or
            the number of requirements extracted.

        Raises:
            ValueError: Unknown code or unsupported source type.
            Exception: Any scraping error is recorded in last_error and
                re-raised to the caller.
        """
        if code not in self.KNOWN_SOURCES:
            raise ValueError(f"Unknown regulation code: {code}")
        source_info = self.KNOWN_SOURCES[code]
        self.status = ScraperStatus.RUNNING
        self.current_source = code
        try:
            # Check existing
            existing = self.reg_repo.get_by_code(code)
            if existing and existing.requirements and not force:
                self.status = ScraperStatus.COMPLETED
                return {
                    "code": code,
                    "status": "skipped",
                    "reason": "already_has_requirements",
                    "requirement_count": len(existing.requirements),
                }
            # Delete existing requirements if force
            if existing and force:
                for req in existing.requirements:
                    self.db.delete(req)
                self.db.commit()
            # Scrape
            if source_info["type"] == SourceType.EUR_LEX:
                count = await self._scrape_eurlex(code, source_info)
            elif source_info["type"] == SourceType.BSI_PDF:
                count = await self._scrape_bsi_pdf(code, source_info)
            else:
                raise ValueError(f"Unknown source type: {source_info['type']}")
            self.status = ScraperStatus.COMPLETED
            return {
                "code": code,
                "status": "success",
                "requirements_extracted": count,
            }
        except Exception as e:
            self.status = ScraperStatus.ERROR
            self.last_error = str(e)
            raise
        finally:
            # Always clear the in-progress marker, even on error.
            self.current_source = None
    async def _scrape_eurlex(self, code: str, source_info: Dict) -> int:
        """Scrape an EUR-Lex regulation page into RequirementDB rows.

        Extraction strategy, in order:
          1. Parse structured article elements from the HTML.
          2. If none found, regex-scan the raw page text (max 50 articles).
          3. If still empty (e.g. the request was WAF-blocked), fall back
             to built-in seed requirements.

        Returns:
            Number of RequirementDB rows created (existing articles are
            never duplicated).
        """
        url = source_info["url"]
        logger.info(f"Scraping EUR-Lex: {code} from {url}")
        async with httpx.AsyncClient(timeout=60.0) as client:
            response = await client.get(url, follow_redirects=True)
            response.raise_for_status()
            html = response.text
        soup = BeautifulSoup(html, 'html.parser')
        # Get or create regulation
        regulation = self.reg_repo.get_by_code(code)
        if not regulation:
            regulation = RegulationDB(
                code=code,
                name=code,
                regulation_type=source_info["regulation_type"],
                source_url=url,
                is_active=True,
            )
            self.db.add(regulation)
            self.db.commit()
            self.db.refresh(regulation)
        # Extract articles
        requirements_created = 0
        # Find all article elements (EUR-Lex structure varies between the
        # older CELEX layout and the newer ELI layout).
        articles = soup.find_all('div', class_='eli-subdivision')
        if not articles:
            articles = soup.find_all('p', class_='oj-ti-art')
        for article_elem in articles:
            try:
                # Extract article number and title
                article_id = article_elem.get('id', '')
                if not article_id:
                    title_elem = article_elem.find(['span', 'p'], class_=['oj-ti-art', 'eli-title'])
                    if title_elem:
                        text = title_elem.get_text(strip=True)
                        match = re.search(r'Article\s+(\d+[a-z]?)', text, re.IGNORECASE)
                        if match:
                            article_id = f"art_{match.group(1)}"
                if not article_id:
                    continue
                # Extract article text
                article_text = article_elem.get_text(separator='\n', strip=True)
                # Parse article number and title from the first few lines:
                # heading, optional subtitle, body start.
                lines = article_text.split('\n')
                article_num = None
                title = None
                for line in lines[:3]:
                    art_match = re.search(r'Article\s+(\d+[a-z]?)', line, re.IGNORECASE)
                    if art_match:
                        article_num = f"Art. {art_match.group(1)}"
                    elif not article_num:
                        continue
                    elif not title and len(line) > 3 and not line.startswith('Article'):
                        title = line[:200]
                        break
                if not article_num:
                    continue
                # Check if requirement already exists
                existing = self.db.query(RequirementDB).filter(
                    RequirementDB.regulation_id == regulation.id,
                    RequirementDB.article == article_num
                ).first()
                if existing:
                    continue
                # Create requirement
                requirement = RequirementDB(
                    regulation_id=regulation.id,
                    article=article_num,
                    title=title or f"{code} {article_num}",
                    requirement_text=article_text[:5000],  # Limit length
                    source_section=article_id,
                    is_applicable=True,
                    priority=2,
                )
                self.db.add(requirement)
                requirements_created += 1
            except Exception as e:
                # One malformed article must not abort the whole page.
                logger.warning(f"Error parsing article in {code}: {e}")
                continue
        # Alternative: extract from raw text with regex
        if requirements_created == 0:
            text = soup.get_text()
            matches = self.ARTICLE_PATTERN.findall(text)
            for art_num, art_text in matches[:50]:  # Limit to 50 articles
                article_num = f"Art. {art_num}"
                existing = self.db.query(RequirementDB).filter(
                    RequirementDB.regulation_id == regulation.id,
                    RequirementDB.article == article_num
                ).first()
                if existing:
                    continue
                # Extract first line as title
                lines = art_text.strip().split('\n')
                title = lines[0][:200] if lines else f"{code} {article_num}"
                requirement = RequirementDB(
                    regulation_id=regulation.id,
                    article=article_num,
                    title=title,
                    requirement_text=art_text[:5000],
                    is_applicable=True,
                    priority=2,
                )
                self.db.add(requirement)
                requirements_created += 1
        # Fallback: If scraping failed (e.g. WAF protection), use seed requirements
        if requirements_created == 0:
            logger.info(f"Scraping returned 0 results for {code}, using seed requirements")
            seed_reqs = self._get_eurlex_seed_requirements(code)
            for seed in seed_reqs:
                existing = self.db.query(RequirementDB).filter(
                    RequirementDB.regulation_id == regulation.id,
                    RequirementDB.article == seed["article"]
                ).first()
                if existing:
                    continue
                requirement = RequirementDB(
                    regulation_id=regulation.id,
                    article=seed["article"],
                    title=seed["title"],
                    description=seed.get("description"),
                    requirement_text=seed.get("requirement_text"),
                    is_applicable=True,
                    priority=seed.get("priority", 2),
                )
                self.db.add(requirement)
                requirements_created += 1
        # Single commit for all rows created above.
        self.db.commit()
        logger.info(f"Extracted {requirements_created} requirements from {code}")
        return requirements_created
def _get_eurlex_seed_requirements(self, code: str) -> List[Dict[str, Any]]:
"""
Returns seed requirements for EUR-Lex regulations when scraping fails.
These are the key articles relevant for Breakpilot compliance.
"""
if code == "NIS2":
return [
{"article": "Art. 6", "title": "Risikobewertung", "description": "Risikobewertung fuer Cybersicherheit", "requirement_text": "Einrichtungen muessen eine Risikobewertung fuer Cybersicherheit durchfuehren.", "priority": 1},
{"article": "Art. 7", "title": "Nationale Cybersicherheitsstrategie", "description": "Umsetzung nationaler Vorgaben", "requirement_text": "Einhaltung der nationalen Cybersicherheitsstrategie.", "priority": 2},
{"article": "Art. 20", "title": "Governance", "description": "Leitungsorgane muessen Cybersicherheit beaufsichtigen", "requirement_text": "Leitungsorgane muessen Cybersicherheitsmassnahmen genehmigen und deren Umsetzung beaufsichtigen.", "priority": 1},
{"article": "Art. 21", "title": "Risikomanagementmassnahmen", "description": "Technische und organisatorische Massnahmen", "requirement_text": "Geeignete und verhaeltnismaessige technische, operative und organisatorische Massnahmen zur Beherrschung von Cybersicherheitsrisiken.", "priority": 1},
{"article": "Art. 21(2)(a)", "title": "Risikoanalyse und Sicherheitskonzepte", "description": "Konzepte fuer Risikoanalyse", "requirement_text": "Konzepte fuer die Risikoanalyse und Sicherheit von Informationssystemen.", "priority": 1},
{"article": "Art. 21(2)(b)", "title": "Bewertung von Sicherheitsvorfaellen", "description": "Incident Handling", "requirement_text": "Bewertung der Wirksamkeit von Risikomanagementmassnahmen.", "priority": 1},
{"article": "Art. 21(2)(c)", "title": "Business Continuity", "description": "Betriebskontinuitaet sicherstellen", "requirement_text": "Aufrechterhaltung des Betriebs, Backup-Management und Krisenmanagement.", "priority": 1},
{"article": "Art. 21(2)(d)", "title": "Lieferkettensicherheit", "description": "Sicherheit in der Lieferkette", "requirement_text": "Sicherheit der Lieferkette einschliesslich Beziehungen zu Lieferanten.", "priority": 1},
{"article": "Art. 21(2)(e)", "title": "Sicherheit bei Entwicklung", "description": "Sichere Entwicklung", "requirement_text": "Sicherheit bei Erwerb, Entwicklung und Wartung von Systemen.", "priority": 1},
{"article": "Art. 21(2)(f)", "title": "Schwachstellenmanagement", "description": "Umgang mit Schwachstellen", "requirement_text": "Konzepte zur Bewertung der Wirksamkeit von Massnahmen.", "priority": 1},
{"article": "Art. 21(2)(g)", "title": "Cyberhygiene und Schulungen", "description": "Grundlegende Cyberhygiene-Praktiken", "requirement_text": "Grundlegende Cyberhygiene-Praktiken und Schulungen.", "priority": 1},
{"article": "Art. 21(2)(h)", "title": "Kryptografie", "description": "Einsatz von Verschluesselung", "requirement_text": "Konzepte und Verfahren fuer Kryptografie und Verschluesselung.", "priority": 1},
{"article": "Art. 21(2)(i)", "title": "Personalsicherheit", "description": "HR-Security", "requirement_text": "Sicherheit des Personals, Zugangskontrollen und Asset-Management.", "priority": 1},
{"article": "Art. 21(2)(j)", "title": "MFA und sichere Authentifizierung", "description": "Multi-Faktor-Authentifizierung", "requirement_text": "Multi-Faktor-Authentifizierung und sichere Kommunikation.", "priority": 1},
{"article": "Art. 23", "title": "Meldepflichten", "description": "Meldung von Sicherheitsvorfaellen", "requirement_text": "Erhebliche Sicherheitsvorfaelle muessen den zustaendigen Behoerden gemeldet werden.", "priority": 1},
{"article": "Art. 24", "title": "Europaeische Schwachstellendatenbank", "description": "CVE-Datenbank nutzen", "requirement_text": "Nutzung der europaeischen Schwachstellendatenbank.", "priority": 2},
]
elif code == "DATAACT":
return [
{"article": "Art. 3", "title": "Datenzugang fuer Nutzer", "description": "Nutzer koennen auf ihre Daten zugreifen", "requirement_text": "Daten, die durch Nutzung vernetzter Produkte generiert werden, muessen dem Nutzer zugaenglich gemacht werden.", "priority": 1},
{"article": "Art. 4", "title": "Recht auf Datenzugang", "description": "Unentgeltlicher Zugang", "requirement_text": "Nutzer haben das Recht auf unentgeltlichen Zugang zu ihren Daten.", "priority": 1},
{"article": "Art. 5", "title": "Recht auf Datenweitergabe", "description": "Daten an Dritte weitergeben", "requirement_text": "Nutzer koennen verlangen, dass Daten an Dritte weitergegeben werden.", "priority": 1},
{"article": "Art. 6", "title": "Pflichten des Dateninhabers", "description": "Daten zeitnah bereitstellen", "requirement_text": "Dateninhaber muessen Daten unverzueglich und in geeignetem Format bereitstellen.", "priority": 1},
{"article": "Art. 8", "title": "Faire Vertragsbedingungen", "description": "Keine unfairen Klauseln", "requirement_text": "Vertragsbedingungen fuer Datenzugang muessen fair und nicht-diskriminierend sein.", "priority": 2},
{"article": "Art. 14", "title": "Cloud-Switching", "description": "Wechsel zwischen Cloud-Anbietern", "requirement_text": "Unterstuetzung beim Wechsel zwischen Cloud-Diensten und Datenportabilitaet.", "priority": 1},
{"article": "Art. 23", "title": "Technische Schutzmassnahmen", "description": "Schutz nicht-personenbezogener Daten", "requirement_text": "Angemessene technische Schutzmassnahmen fuer nicht-personenbezogene Daten.", "priority": 1},
{"article": "Art. 25", "title": "Geschaeftsgeheimnisse", "description": "Schutz von Geschaeftsgeheimnissen", "requirement_text": "Massnahmen zum Schutz von Geschaeftsgeheimnissen bei Datenzugang.", "priority": 2},
]
elif code == "DGA":
return [
{"article": "Art. 5", "title": "Bedingungen fuer Weiterverwendung", "description": "Weiterverwendung oeffentlicher Daten", "requirement_text": "Bedingungen fuer die Weiterverwendung geschuetzter Daten oeffentlicher Stellen.", "priority": 2},
{"article": "Art. 7", "title": "Technische Anforderungen", "description": "Sichere Verarbeitungsumgebungen", "requirement_text": "Sichere Verarbeitungsumgebungen fuer Zugang zu geschuetzten Daten.", "priority": 1},
{"article": "Art. 10", "title": "Datenvermittlungsdienste", "description": "Registrierung von Vermittlungsdiensten", "requirement_text": "Datenvermittlungsdienste muessen registriert und reguliert werden.", "priority": 2},
{"article": "Art. 12", "title": "Bedingungen fuer Datenvermittlung", "description": "Neutralitaet wahren", "requirement_text": "Datenvermittler muessen neutral handeln und duerfen Daten nicht fuer eigene Zwecke nutzen.", "priority": 1},
{"article": "Art. 16", "title": "Datenaltruismus", "description": "Freiwillige Datenspende", "requirement_text": "Registrierung als Organisation fuer Datenaltruismus moeglich.", "priority": 3},
{"article": "Art. 21", "title": "Einwilligungsformular", "description": "Europaeisches Einwilligungsformular", "requirement_text": "Verwendung des europaeischen Einwilligungsformulars fuer Datenaltruismus.", "priority": 3},
]
elif code == "DSA":
return [
{"article": "Art. 6", "title": "Haftungsausschluss Hosting", "description": "Bedingungen fuer Haftungsausschluss", "requirement_text": "Hosting-Dienste haften nicht, wenn sie keine Kenntnis von rechtswidrigen Inhalten haben.", "priority": 1},
{"article": "Art. 11", "title": "Kontaktstelle", "description": "Behoerdenkontakt", "requirement_text": "Anbieter muessen eine Kontaktstelle fuer Behoerden benennen.", "priority": 2},
{"article": "Art. 12", "title": "Rechtsvertreter", "description": "Vertreter in der EU", "requirement_text": "Nicht-EU-Anbieter muessen einen Rechtsvertreter in der EU benennen.", "priority": 2},
{"article": "Art. 13", "title": "AGB-Transparenz", "description": "Transparente Nutzungsbedingungen", "requirement_text": "AGB muessen klar, verstaendlich und leicht zugaenglich sein.", "priority": 1},
{"article": "Art. 14", "title": "Transparenzberichte", "description": "Jaehrliche Berichte", "requirement_text": "Jaehrliche Transparenzberichte ueber Content-Moderation veroeffentlichen.", "priority": 2},
{"article": "Art. 16", "title": "Melde- und Abhilfeverfahren", "description": "Notice and Action", "requirement_text": "Leicht zugaengliches System fuer Meldung rechtswidriger Inhalte.", "priority": 1},
{"article": "Art. 17", "title": "Begruendungspflicht", "description": "Entscheidungen begruenden", "requirement_text": "Nutzer muessen ueber Content-Moderation-Entscheidungen informiert werden.", "priority": 1},
{"article": "Art. 20", "title": "Internes Beschwerdemanagement", "description": "Beschwerden bearbeiten", "requirement_text": "Internes System zur Bearbeitung von Beschwerden ueber Content-Moderation.", "priority": 1},
{"article": "Art. 26", "title": "Werbetransparenz", "description": "Werbung kennzeichnen", "requirement_text": "Online-Werbung muss klar als solche erkennbar sein.", "priority": 1},
{"article": "Art. 27", "title": "Empfehlungssysteme", "description": "Algorithmen erklaeren", "requirement_text": "Transparenz ueber Parameter von Empfehlungsalgorithmen.", "priority": 2},
]
elif code == "EUCSA":
return [
{"article": "Art. 46", "title": "Cybersicherheitszertifizierung", "description": "EU-Zertifizierungsrahmen", "requirement_text": "Freiwillige europaeische Zertifizierung fuer IKT-Produkte und -Dienste.", "priority": 2},
{"article": "Art. 51", "title": "Sicherheitsziele", "description": "Ziele der Zertifizierung", "requirement_text": "Schutz von Daten vor unbefugtem Zugriff, Manipulation und Zerstoerung.", "priority": 1},
{"article": "Art. 52", "title": "Vertrauenswuerdigkeitsstufen", "description": "Basic, Substantial, High", "requirement_text": "Drei Stufen: Basic, Substantial, High - je nach Risiko.", "priority": 1},
{"article": "Art. 54", "title": "Konformitaetsbewertung", "description": "Selbstbewertung oder Drittbewertung", "requirement_text": "Je nach Stufe Selbstbewertung oder unabhaengige Bewertung.", "priority": 2},
{"article": "Art. 56", "title": "Zertifizierungsstellen", "description": "Akkreditierte Stellen", "requirement_text": "Zertifizierung durch akkreditierte Konformitaetsbewertungsstellen.", "priority": 2},
]
elif code == "EAA":
return [
{"article": "Art. 3", "title": "Barrierefreiheitsanforderungen", "description": "Produkte barrierefrei gestalten", "requirement_text": "Produkte und Dienstleistungen muessen die Barrierefreiheitsanforderungen erfuellen.", "priority": 1},
{"article": "Art. 4", "title": "Bestehende Rechtsvorschriften", "description": "Verhaeltnis zu anderen Vorschriften", "requirement_text": "Ergaenzung zu bestehenden Barrierefreiheitsvorschriften.", "priority": 3},
{"article": "Art. 13", "title": "Konformitaetsvermutung", "description": "Harmonisierte Normen", "requirement_text": "Konformitaet bei Einhaltung harmonisierter Normen vermutet.", "priority": 2},
{"article": "Art. 14", "title": "Gemeinsame technische Spezifikationen", "description": "Falls keine Normen existieren", "requirement_text": "EU-Kommission kann technische Spezifikationen festlegen.", "priority": 3},
{"article": "Anhang I", "title": "Barrierefreiheitsanforderungen fuer Produkte", "description": "WCAG-konforme Webseiten", "requirement_text": "Webseiten, Apps und E-Books muessen WCAG 2.1 Level AA erfuellen.", "priority": 1},
]
elif code == "DSM":
return [
{"article": "Art. 3", "title": "Text and Data Mining (Forschung)", "description": "TDM fuer Forschung erlaubt", "requirement_text": "Text- und Data-Mining fuer wissenschaftliche Forschung ist erlaubt.", "priority": 2},
{"article": "Art. 4", "title": "Text and Data Mining (Allgemein)", "description": "TDM-Ausnahme", "requirement_text": "TDM erlaubt, wenn Rechteinhaber nicht widersprochen haben.", "priority": 1},
{"article": "Art. 15", "title": "Leistungsschutzrecht Presse", "description": "Verguetung fuer Presseverleger", "requirement_text": "Online-Nutzung von Presseveroeffentlichungen erfordert Lizenz.", "priority": 2},
{"article": "Art. 17", "title": "Upload-Filter", "description": "Plattformhaftung fuer Uploads", "requirement_text": "Plattformen haften fuer urheberrechtsverletzende Uploads ihrer Nutzer.", "priority": 1},
{"article": "Art. 17(7)", "title": "Overblocking verhindern", "description": "Legitime Nutzung schuetzen", "requirement_text": "Massnahmen duerfen nicht zu ungerechtfertigter Sperrung fuehren.", "priority": 1},
]
elif code == "PLD":
return [
{"article": "Art. 4", "title": "Produktbegriff", "description": "Software als Produkt", "requirement_text": "Software gilt als Produkt im Sinne der Produkthaftung.", "priority": 1},
{"article": "Art. 6", "title": "Fehlerhaftes Produkt", "description": "Definition Produktfehler", "requirement_text": "Ein Produkt ist fehlerhaft, wenn es nicht die erwartete Sicherheit bietet.", "priority": 1},
{"article": "Art. 7", "title": "KI-Systeme", "description": "Haftung fuer KI", "requirement_text": "Haftung gilt auch fuer durch KI verursachte Schaeden.", "priority": 1},
{"article": "Art. 9", "title": "Haftung des Herstellers", "description": "Verschuldensunabhaengige Haftung", "requirement_text": "Hersteller haften verschuldensunabhaengig fuer Produktfehler.", "priority": 1},
{"article": "Art. 10", "title": "Softwareaktualisierungen", "description": "Pflicht zu Updates", "requirement_text": "Fehlende Sicherheitsupdates koennen Haftung begruenden.", "priority": 1},
]
elif code == "GPSR":
return [
{"article": "Art. 5", "title": "Allgemeine Sicherheitsanforderung", "description": "Produkte muessen sicher sein", "requirement_text": "Nur sichere Produkte duerfen in Verkehr gebracht werden.", "priority": 1},
{"article": "Art. 8", "title": "Pflichten der Hersteller", "description": "Sicherheitsbewertung durchfuehren", "requirement_text": "Hersteller muessen Risikoanalyse und Sicherheitsbewertung durchfuehren.", "priority": 1},
{"article": "Art. 9", "title": "Technische Dokumentation", "description": "Dokumentationspflicht", "requirement_text": "Technische Dokumentation zur Konformitaet erstellen und aufbewahren.", "priority": 1},
{"article": "Art. 10", "title": "EU-Konformitaetserklaerung", "description": "CE-Kennzeichnung", "requirement_text": "Konformitaetserklaerung und CE-Kennzeichnung erforderlich.", "priority": 1},
{"article": "Art. 14", "title": "Produktrueckrufe", "description": "Rueckrufverfahren", "requirement_text": "Bei Sicherheitsrisiken muessen Produkte zurueckgerufen werden.", "priority": 1},
]
elif code == "CRA":
return [
{"article": "Art. 5", "title": "Wesentliche Anforderungen", "description": "Cybersicherheit bei Entwurf", "requirement_text": "Produkte muessen so entworfen werden, dass sie ein angemessenes Cybersicherheitsniveau gewaehrleisten.", "priority": 1},
{"article": "Art. 6", "title": "Sicherheitsupdates", "description": "Updates bereitstellen", "requirement_text": "Hersteller muessen Sicherheitsupdates fuer die erwartete Produktlebensdauer bereitstellen.", "priority": 1},
{"article": "Art. 10", "title": "Schwachstellenbehandlung", "description": "Vulnerability Handling", "requirement_text": "Hersteller muessen ein koordiniertes Schwachstellenmanagement implementieren.", "priority": 1},
{"article": "Art. 11", "title": "Meldepflicht", "description": "Schwachstellen melden", "requirement_text": "Aktiv ausgenutzte Schwachstellen muessen innerhalb von 24 Stunden gemeldet werden.", "priority": 1},
{"article": "Art. 13", "title": "SBOM", "description": "Software Bill of Materials", "requirement_text": "Eine SBOM muss fuer das Produkt erstellt und gepflegt werden.", "priority": 1},
{"article": "Art. 15", "title": "Support-Zeitraum", "description": "Mindest-Support-Dauer", "requirement_text": "Mindestens 5 Jahre Support oder erwartete Produktlebensdauer.", "priority": 1},
{"article": "Anhang I.1", "title": "Sichere Standardkonfiguration", "description": "Secure by Default", "requirement_text": "Produkte muessen mit sicheren Standardeinstellungen ausgeliefert werden.", "priority": 1},
{"article": "Anhang I.2", "title": "Schutz vor unbefugtem Zugriff", "description": "Access Control", "requirement_text": "Mechanismen zum Schutz vor unbefugtem Zugriff implementieren.", "priority": 1},
{"article": "Anhang I.3", "title": "Datenintegritaet", "description": "Integritaetsschutz", "requirement_text": "Schutz der Integritaet von Daten und Konfiguration.", "priority": 1},
{"article": "Anhang I.4", "title": "Verfuegbarkeit", "description": "Resilienz", "requirement_text": "Schutz vor DoS-Angriffen und Sicherstellung der Verfuegbarkeit.", "priority": 1},
]
elif code == "EPRIVACY":
return [
{"article": "Art. 5", "title": "Vertraulichkeit der Kommunikation", "description": "Kommunikation schuetzen", "requirement_text": "Vertraulichkeit der Kommunikation und Verkehrsdaten gewaehrleisten.", "priority": 1},
{"article": "Art. 6", "title": "Verkehrsdaten", "description": "Umgang mit Verkehrsdaten", "requirement_text": "Verkehrsdaten muessen nach Abschluss geloescht oder anonymisiert werden.", "priority": 1},
{"article": "Art. 9", "title": "Standortdaten", "description": "Nur mit Einwilligung", "requirement_text": "Standortdaten nur mit ausdruecklicher Einwilligung verarbeiten.", "priority": 1},
{"article": "Art. 13", "title": "Unerbetene Nachrichten", "description": "Opt-in fuer Marketing", "requirement_text": "Direktwerbung per E-Mail nur mit vorheriger Einwilligung.", "priority": 1},
]
elif code == "SCC":
return [
{"article": "Klausel 8", "title": "Datenschutzgarantien", "description": "Garantien dokumentieren", "requirement_text": "Datenimporteur muss angemessene Datenschutzgarantien gewaehrleisten.", "priority": 1},
{"article": "Klausel 10", "title": "Betroffenenrechte", "description": "Rechte durchsetzen", "requirement_text": "Betroffene koennen ihre Rechte auch gegenueber Datenimporteur geltend machen.", "priority": 1},
{"article": "Klausel 14", "title": "Lokale Rechtsvorschriften", "description": "Rechtslage pruefen", "requirement_text": "Parteien muessen pruefen, ob lokale Gesetze die Einhaltung verhindern.", "priority": 1},
{"article": "Klausel 15", "title": "Behoerdenzugriff", "description": "Transparenz bei Anfragen", "requirement_text": "Datenimporteur muss ueber Behoerdenanfragen informieren.", "priority": 1},
]
elif code == "DPF":
return [
{"article": "Prinzip 1", "title": "Notice", "description": "Informationspflicht", "requirement_text": "Betroffene muessen ueber Datenverarbeitung informiert werden.", "priority": 1},
{"article": "Prinzip 2", "title": "Choice", "description": "Wahlmoeglichkeit", "requirement_text": "Betroffene muessen der Weitergabe widersprechen koennen.", "priority": 1},
{"article": "Prinzip 4", "title": "Security", "description": "Sicherheitsmassnahmen", "requirement_text": "Angemessene Sicherheitsmassnahmen zum Schutz der Daten.", "priority": 1},
{"article": "Prinzip 5", "title": "Data Integrity", "description": "Datenintegritaet", "requirement_text": "Daten muessen richtig, vollstaendig und aktuell sein.", "priority": 1},
{"article": "Prinzip 6", "title": "Access", "description": "Auskunftsrecht", "requirement_text": "Betroffene haben Recht auf Zugang zu ihren Daten.", "priority": 1},
]
return []
async def _scrape_bsi_pdf(self, code: str, source_info: Dict) -> int:
    """
    Seed requirements for a BSI Technical Guideline.

    Full PDF parsing would require PyMuPDF or pdfplumber; this placeholder
    instead persists the curated test aspects from ``_get_bsi_aspects`` as
    requirement rows, creating the parent regulation row on first run.

    Args:
        code: BSI TR code, e.g. ``"BSI-TR-03161-1"``.
        source_info: Source metadata; ``regulation_type`` and ``url`` are read.

    Returns:
        Number of newly created requirement rows (existing ones are skipped).
    """
    logger.info(f"Processing BSI TR: {code}")

    # Make sure the regulation row exists before attaching requirements to it.
    regulation = self.reg_repo.get_by_code(code)
    if not regulation:
        regulation = RegulationDB(
            code=code,
            name=f"BSI {code}",
            full_name=f"BSI Technical Guideline {code}",
            regulation_type=source_info["regulation_type"],
            local_pdf_path=source_info["url"],
            is_active=True,
        )
        self.db.add(regulation)
        self.db.commit()
        self.db.refresh(regulation)

    created = 0
    for aspect in self._get_bsi_aspects(code):
        # Idempotency: an aspect already stored for this regulation is skipped.
        duplicate = self.db.query(RequirementDB).filter(
            RequirementDB.regulation_id == regulation.id,
            RequirementDB.article == aspect["id"]
        ).first()
        if duplicate:
            continue
        self.db.add(RequirementDB(
            regulation_id=regulation.id,
            article=aspect["id"],
            title=aspect["title"],
            description=aspect.get("description"),
            requirement_text=aspect.get("requirement_text"),
            breakpilot_interpretation=aspect.get("interpretation"),
            is_applicable=aspect.get("is_applicable", True),
            priority=aspect.get("priority", 2),
            source_page=aspect.get("page"),
            source_section=aspect.get("section"),
        ))
        created += 1
    # Single commit for the whole batch of new requirements.
    self.db.commit()

    logger.info(f"Created {created} BSI requirements from {code}")
    return created
def _get_bsi_aspects(self, code: str) -> List[Dict[str, Any]]:
    """
    Return the curated BSI TR-03161 test aspects (Pruefaspekte) for *code*.

    These are the actual test aspects from BSI TR-03161:
    - Part 1: General requirements (~45 aspects)
    - Part 2: Web applications (~40 aspects)
    - Part 3: Backend systems (~35 aspects)
    Total: ~120 test aspects

    Each aspect dict carries ``id`` (aspect identifier, stored as the
    requirement's article), ``title``, ``description``, ``requirement_text``,
    ``priority`` and ``section``; unknown codes yield an empty list.
    """
    if code == "BSI-TR-03161-1":
        # Part 1: General requirements (Allgemeine Anforderungen)
        return [
            # Purpose limitation & data minimization
            {"id": "O.Purp_1", "title": "Zweckbindung", "description": "Anwendungszweck klar definiert", "requirement_text": "Die Anwendung muss einen klar definierten und dokumentierten Zweck haben.", "priority": 1, "section": "4.1"},
            {"id": "O.Purp_2", "title": "Zweckdokumentation", "description": "Zweck fuer Nutzer einsehbar", "requirement_text": "Der Zweck muss fuer Nutzer transparent und einsehbar dokumentiert sein.", "priority": 2, "section": "4.1"},
            {"id": "O.Data_1", "title": "Datenminimierung", "description": "Nur notwendige Daten erheben", "requirement_text": "Es duerfen nur die fuer den definierten Zweck erforderlichen Daten erhoben werden.", "priority": 1, "section": "4.2"},
            {"id": "O.Data_2", "title": "Datenerforderlichkeit", "description": "Erforderlichkeit pruefen", "requirement_text": "Vor jeder Datenerhebung muss die Erforderlichkeit geprueft und dokumentiert werden.", "priority": 1, "section": "4.2"},
            {"id": "O.Data_3", "title": "Datenkategorien", "description": "Datenkategorien klassifizieren", "requirement_text": "Alle verarbeiteten Datenkategorien muessen klassifiziert und dokumentiert sein.", "priority": 2, "section": "4.2"},
            {"id": "O.Data_4", "title": "Besondere Kategorien", "description": "Art. 9 DSGVO Daten identifizieren", "requirement_text": "Besondere Kategorien personenbezogener Daten (Art. 9 DSGVO) muessen identifiziert und besonders geschuetzt werden.", "priority": 1, "section": "4.2"},
            # Authentication
            {"id": "O.Auth_1", "title": "Authentifizierungsmechanismus", "description": "Sichere Authentifizierung", "requirement_text": "Die Anwendung muss sichere Authentifizierungsmechanismen implementieren.", "priority": 1, "section": "4.3"},
            {"id": "O.Auth_2", "title": "Passwortrichtlinie", "description": "Starke Passwoerter erzwingen", "requirement_text": "Passwortrichtlinien muessen Mindestlaenge (12 Zeichen), Komplexitaet und Historie durchsetzen.", "priority": 1, "section": "4.3"},
            {"id": "O.Auth_3", "title": "Passwort-Hashing", "description": "Sichere Hash-Algorithmen", "requirement_text": "Passwoerter muessen mit aktuellen Algorithmen (bcrypt, Argon2) gehasht werden.", "priority": 1, "section": "4.3"},
            {"id": "O.Auth_4", "title": "Multi-Faktor-Authentifizierung", "description": "MFA fuer sensitive Bereiche", "requirement_text": "Fuer administrative und sensitive Funktionen muss MFA verfuegbar sein.", "priority": 1, "section": "4.3"},
            {"id": "O.Auth_5", "title": "Brute-Force-Schutz", "description": "Rate Limiting bei Login", "requirement_text": "Nach mehreren fehlgeschlagenen Anmeldeversuchen muss Account-Lockout oder Rate-Limiting greifen.", "priority": 1, "section": "4.3"},
            {"id": "O.Auth_6", "title": "Sichere Passwort-Wiederherstellung", "description": "Reset-Prozess absichern", "requirement_text": "Der Passwort-Reset-Prozess muss gegen Enumeration und Manipulation geschuetzt sein.", "priority": 1, "section": "4.3"},
            # Authorization
            {"id": "O.Authz_1", "title": "Zugriffskontrolle", "description": "Rollenbasierte Zugriffskontrolle", "requirement_text": "Ein rollenbasiertes Zugriffskonzept (RBAC) muss implementiert sein.", "priority": 1, "section": "4.4"},
            {"id": "O.Authz_2", "title": "Least Privilege", "description": "Minimale Rechte", "requirement_text": "Benutzer sollen nur die minimal notwendigen Berechtigungen erhalten.", "priority": 1, "section": "4.4"},
            {"id": "O.Authz_3", "title": "Rechtetrennung", "description": "Funktionale Trennung", "requirement_text": "Administrative und operative Rollen muessen getrennt sein.", "priority": 1, "section": "4.4"},
            {"id": "O.Authz_4", "title": "Autorisierungspruefung", "description": "Serverseitige Pruefung", "requirement_text": "Jede Ressource muss serverseitig auf Zugriffsberechtigung geprueft werden.", "priority": 1, "section": "4.4"},
            # Cryptography
            {"id": "O.Cryp_1", "title": "TLS-Verschluesselung", "description": "TLS 1.2+ fuer Transport", "requirement_text": "Alle Daten muessen bei der Uebertragung mit TLS 1.2 oder hoeher verschluesselt werden.", "priority": 1, "section": "4.5"},
            {"id": "O.Cryp_2", "title": "Verschluesselung at Rest", "description": "Sensible Daten verschluesseln", "requirement_text": "Sensible Daten muessen bei der Speicherung verschluesselt werden (AES-256 oder vergleichbar).", "priority": 1, "section": "4.5"},
            {"id": "O.Cryp_3", "title": "HSTS", "description": "HTTP Strict Transport Security", "requirement_text": "HSTS-Header muessen gesetzt sein um HTTPS zu erzwingen.", "priority": 1, "section": "4.5"},
            {"id": "O.Cryp_4", "title": "Zertifikatvalidierung", "description": "Zertifikate pruefen", "requirement_text": "TLS-Zertifikate muessen vollstaendig validiert werden (Chain, Revocation, Hostname).", "priority": 1, "section": "4.5"},
            {"id": "O.Cryp_5", "title": "Key Management", "description": "Sichere Schluesselverwaltung", "requirement_text": "Kryptographische Schluessel muessen sicher generiert, gespeichert und rotiert werden.", "priority": 1, "section": "4.5"},
            {"id": "O.Cryp_6", "title": "Aktuelle Algorithmen", "description": "Keine veralteten Algorithmen", "requirement_text": "Es duerfen nur aktuelle, als sicher geltende kryptographische Algorithmen verwendet werden.", "priority": 1, "section": "4.5"},
            # Data protection
            {"id": "O.Priv_1", "title": "Datenschutzerklaerung", "description": "Transparente Information", "requirement_text": "Eine vollstaendige Datenschutzerklaerung muss vor Nutzung einsehbar sein.", "priority": 1, "section": "4.6"},
            {"id": "O.Priv_2", "title": "Einwilligung", "description": "Wirksame Einwilligung", "requirement_text": "Einwilligungen muessen freiwillig, informiert, spezifisch und dokumentiert sein.", "priority": 1, "section": "4.6"},
            {"id": "O.Priv_3", "title": "Betroffenenrechte", "description": "Auskunft, Loeschung, etc.", "requirement_text": "Technische Prozesse fuer Betroffenenrechte (Art. 15-21 DSGVO) muessen implementiert sein.", "priority": 1, "section": "4.6"},
            {"id": "O.Priv_4", "title": "Loeschkonzept", "description": "Aufbewahrungsfristen", "requirement_text": "Ein dokumentiertes Loeschkonzept mit definierten Aufbewahrungsfristen muss umgesetzt sein.", "priority": 1, "section": "4.6"},
            {"id": "O.Priv_5", "title": "Datenschutz durch Technik", "description": "Privacy by Design", "requirement_text": "Datenschutz muss bereits bei der Entwicklung beruecksichtigt werden (Art. 25 DSGVO).", "priority": 1, "section": "4.6"},
            # Logging & audit
            {"id": "O.Log_1", "title": "Security Logging", "description": "Sicherheitsereignisse protokollieren", "requirement_text": "Sicherheitsrelevante Ereignisse (Login, Fehler, Zugriffsverletzungen) muessen protokolliert werden.", "priority": 1, "section": "4.7"},
            {"id": "O.Log_2", "title": "Audit Trail", "description": "Nachvollziehbarkeit", "requirement_text": "Aenderungen an personenbezogenen Daten muessen nachvollziehbar protokolliert werden.", "priority": 1, "section": "4.7"},
            {"id": "O.Log_3", "title": "Log-Integritaet", "description": "Logs vor Manipulation schuetzen", "requirement_text": "Logs muessen vor unbefugter Aenderung oder Loeschung geschuetzt sein.", "priority": 2, "section": "4.7"},
            {"id": "O.Log_4", "title": "Keine PII in Logs", "description": "Keine personenbezogenen Daten loggen", "requirement_text": "Logs duerfen keine personenbezogenen Daten im Klartext enthalten.", "priority": 1, "section": "4.7"},
            # Software development
            {"id": "O.Dev_1", "title": "Secure SDLC", "description": "Sicherer Entwicklungsprozess", "requirement_text": "Ein dokumentierter sicherer Entwicklungsprozess (Secure SDLC) muss etabliert sein.", "priority": 1, "section": "4.8"},
            {"id": "O.Dev_2", "title": "Code Review", "description": "Sicherheits-Review von Code", "requirement_text": "Sicherheitsrelevanter Code muss vor Release einem Review unterzogen werden.", "priority": 2, "section": "4.8"},
            {"id": "O.Dev_3", "title": "Dependency Management", "description": "Abhaengigkeiten pruefen", "requirement_text": "Externe Abhaengigkeiten muessen auf bekannte Schwachstellen geprueft werden.", "priority": 1, "section": "4.8"},
            {"id": "O.Dev_4", "title": "Penetration Testing", "description": "Regelmaessige Sicherheitstests", "requirement_text": "Regelmaessige Penetrationstests oder Schwachstellenscans muessen durchgefuehrt werden.", "priority": 2, "section": "4.8"},
            # Operations
            {"id": "O.Ops_1", "title": "Patch Management", "description": "Zeitnahes Patchen", "requirement_text": "Sicherheitspatches muessen zeitnah (kritisch: 24-72h) eingespielt werden.", "priority": 1, "section": "4.9"},
            {"id": "O.Ops_2", "title": "Backup", "description": "Regelmaessige Datensicherung", "requirement_text": "Regelmaessige, getestete Backups muessen vorhanden sein.", "priority": 1, "section": "4.9"},
            {"id": "O.Ops_3", "title": "Incident Response", "description": "Vorfallsmanagement", "requirement_text": "Ein dokumentierter Incident-Response-Prozess muss etabliert sein.", "priority": 1, "section": "4.9"},
            {"id": "O.Ops_4", "title": "Monitoring", "description": "Systemueberwachung", "requirement_text": "Kritische Systeme und Dienste muessen kontinuierlich ueberwacht werden.", "priority": 2, "section": "4.9"},
            # Documentation
            {"id": "O.Doc_1", "title": "Technische Dokumentation", "description": "Systemarchitektur dokumentiert", "requirement_text": "Die Systemarchitektur und Datenflüsse muessen dokumentiert sein.", "priority": 2, "section": "4.10"},
            {"id": "O.Doc_2", "title": "Verarbeitungsverzeichnis", "description": "Art. 30 DSGVO", "requirement_text": "Ein vollstaendiges Verzeichnis von Verarbeitungstaetigkeiten muss gefuehrt werden.", "priority": 1, "section": "4.10"},
            {"id": "O.Doc_3", "title": "TOMs", "description": "Technisch-organisatorische Massnahmen", "requirement_text": "Technisch-organisatorische Massnahmen (Art. 32 DSGVO) muessen dokumentiert sein.", "priority": 1, "section": "4.10"},
        ]
    elif code == "BSI-TR-03161-2":
        # Part 2: Web applications (Web-Anwendungen)
        return [
            # Session management
            {"id": "O.Sess_1", "title": "Session-Timeout", "description": "Automatische Sitzungsbeendigung", "requirement_text": "Sessions muessen nach Inaktivitaet automatisch beendet werden (max. 30 Min).", "priority": 1, "section": "5.1"},
            {"id": "O.Sess_2", "title": "Session-ID Sicherheit", "description": "Sichere Session-IDs", "requirement_text": "Session-IDs muessen kryptographisch sicher generiert werden (min. 128 Bit Entropie).", "priority": 1, "section": "5.1"},
            {"id": "O.Sess_3", "title": "Session-Regeneration", "description": "ID nach Login erneuern", "requirement_text": "Nach erfolgreicher Authentifizierung muss eine neue Session-ID generiert werden.", "priority": 1, "section": "5.1"},
            {"id": "O.Sess_4", "title": "Secure Cookie Flags", "description": "HttpOnly, Secure, SameSite", "requirement_text": "Session-Cookies muessen mit Secure, HttpOnly und SameSite-Flags gesetzt werden.", "priority": 1, "section": "5.1"},
            {"id": "O.Sess_5", "title": "Session-Binding", "description": "Session an Client binden", "requirement_text": "Sessions sollten an Client-Eigenschaften (User-Agent, IP) gebunden werden.", "priority": 2, "section": "5.1"},
            {"id": "O.Sess_6", "title": "Logout-Funktionalitaet", "description": "Vollstaendiges Logout", "requirement_text": "Beim Logout muss die Session vollstaendig invalidiert werden.", "priority": 1, "section": "5.1"},
            # Input validation
            {"id": "O.Input_1", "title": "Serverseitige Validierung", "description": "Alle Eingaben serverseitig pruefen", "requirement_text": "Alle Benutzereingaben muessen serverseitig validiert werden.", "priority": 1, "section": "5.2"},
            {"id": "O.Input_2", "title": "Whitelist-Validierung", "description": "Erlaubte Zeichen definieren", "requirement_text": "Eingabevalidierung sollte auf Whitelist-Basis erfolgen.", "priority": 1, "section": "5.2"},
            {"id": "O.Input_3", "title": "Encoding", "description": "Korrekte Zeichenkodierung", "requirement_text": "Einheitliche Zeichenkodierung (UTF-8) muss durchgesetzt werden.", "priority": 2, "section": "5.2"},
            {"id": "O.Input_4", "title": "Datei-Upload Validierung", "description": "Uploads pruefen", "requirement_text": "Datei-Uploads muessen auf Typ, Groesse und Inhalt validiert werden.", "priority": 1, "section": "5.2"},
            # Injection protection
            {"id": "O.SQL_1", "title": "SQL-Injection Schutz", "description": "Prepared Statements", "requirement_text": "SQL-Anfragen muessen parametrisiert sein (Prepared Statements).", "priority": 1, "section": "5.3"},
            {"id": "O.SQL_2", "title": "ORM Nutzung", "description": "Abstraktionsschicht nutzen", "requirement_text": "Es sollte ein ORM oder Query Builder verwendet werden.", "priority": 2, "section": "5.3"},
            {"id": "O.Cmd_1", "title": "Command Injection Schutz", "description": "Keine Shell-Befehle mit Eingaben", "requirement_text": "Benutzereingaben duerfen nicht in Shell-Befehlen verwendet werden.", "priority": 1, "section": "5.3"},
            {"id": "O.LDAP_1", "title": "LDAP Injection Schutz", "description": "LDAP-Queries absichern", "requirement_text": "LDAP-Queries muessen gegen Injection geschuetzt sein.", "priority": 1, "section": "5.3"},
            {"id": "O.XML_1", "title": "XML Injection Schutz", "description": "XXE verhindern", "requirement_text": "XML-Parser muessen gegen XXE-Angriffe konfiguriert sein.", "priority": 1, "section": "5.3"},
            # XSS protection
            {"id": "O.XSS_1", "title": "Output Encoding", "description": "Kontextabhaengiges Escaping", "requirement_text": "Ausgaben muessen kontextabhaengig (HTML, JS, CSS, URL) escaped werden.", "priority": 1, "section": "5.4"},
            {"id": "O.XSS_2", "title": "Content Security Policy", "description": "CSP-Header setzen", "requirement_text": "Ein restriktiver Content-Security-Policy-Header muss gesetzt sein.", "priority": 1, "section": "5.4"},
            {"id": "O.XSS_3", "title": "DOM-basiertes XSS", "description": "DOM-Manipulation absichern", "requirement_text": "JavaScript-DOM-Manipulationen muessen sicher implementiert sein.", "priority": 1, "section": "5.4"},
            {"id": "O.XSS_4", "title": "Template-Engine Escaping", "description": "Auto-Escaping aktivieren", "requirement_text": "Template-Engines muessen mit aktiviertem Auto-Escaping verwendet werden.", "priority": 1, "section": "5.4"},
            # CSRF protection
            {"id": "O.CSRF_1", "title": "Anti-CSRF Token", "description": "Token bei State-Changes", "requirement_text": "Zustandsaendernde Anfragen muessen mit Anti-CSRF-Token geschuetzt sein.", "priority": 1, "section": "5.5"},
            {"id": "O.CSRF_2", "title": "SameSite Cookie", "description": "SameSite-Attribut setzen", "requirement_text": "Cookies sollten das SameSite-Attribut (Strict oder Lax) haben.", "priority": 1, "section": "5.5"},
            {"id": "O.CSRF_3", "title": "Referer-Pruefung", "description": "Origin validieren", "requirement_text": "Bei kritischen Aktionen sollte der Origin/Referer-Header geprueft werden.", "priority": 2, "section": "5.5"},
            # Security headers
            {"id": "O.Head_1", "title": "X-Content-Type-Options", "description": "nosniff setzen", "requirement_text": "Der X-Content-Type-Options: nosniff Header muss gesetzt sein.", "priority": 1, "section": "5.6"},
            {"id": "O.Head_2", "title": "X-Frame-Options", "description": "Clickjacking-Schutz", "requirement_text": "X-Frame-Options oder CSP frame-ancestors muss Clickjacking verhindern.", "priority": 1, "section": "5.6"},
            {"id": "O.Head_3", "title": "X-XSS-Protection", "description": "Browser XSS-Filter", "requirement_text": "X-XSS-Protection sollte aktiviert sein (oder CSP nutzen).", "priority": 2, "section": "5.6"},
            {"id": "O.Head_4", "title": "Referrer-Policy", "description": "Referrer einschraenken", "requirement_text": "Eine restriktive Referrer-Policy sollte gesetzt sein.", "priority": 2, "section": "5.6"},
            {"id": "O.Head_5", "title": "Permissions-Policy", "description": "Browser-Features einschraenken", "requirement_text": "Nicht benoetigte Browser-APIs sollten per Permissions-Policy deaktiviert werden.", "priority": 3, "section": "5.6"},
            # Error handling
            {"id": "O.Err_1", "title": "Generische Fehlermeldungen", "description": "Keine technischen Details", "requirement_text": "Fehlermeldungen an Benutzer duerfen keine technischen Details enthalten.", "priority": 1, "section": "5.7"},
            {"id": "O.Err_2", "title": "Custom Error Pages", "description": "Eigene Fehlerseiten", "requirement_text": "Standard-Fehlerseiten des Servers muessen durch eigene ersetzt werden.", "priority": 2, "section": "5.7"},
            {"id": "O.Err_3", "title": "Exception Handling", "description": "Alle Exceptions abfangen", "requirement_text": "Unbehandelte Exceptions muessen abgefangen und geloggt werden.", "priority": 1, "section": "5.7"},
            # API security
            {"id": "O.API_1", "title": "API-Authentifizierung", "description": "API-Keys oder OAuth", "requirement_text": "APIs muessen authentifiziert werden (API-Keys, OAuth, JWT).", "priority": 1, "section": "5.8"},
            {"id": "O.API_2", "title": "Rate Limiting", "description": "Anfragen begrenzen", "requirement_text": "APIs muessen Rate-Limiting implementieren.", "priority": 1, "section": "5.8"},
            {"id": "O.API_3", "title": "Input-Validierung API", "description": "Request-Body validieren", "requirement_text": "API-Request-Bodies muessen gegen ein Schema validiert werden.", "priority": 1, "section": "5.8"},
            {"id": "O.API_4", "title": "Versionierung", "description": "API-Versionen", "requirement_text": "APIs sollten versioniert sein um Breaking Changes zu vermeiden.", "priority": 3, "section": "5.8"},
            # Client-side security
            {"id": "O.JS_1", "title": "JavaScript Sicherheit", "description": "Sichere JS-Praktiken", "requirement_text": "JavaScript muss sicher implementiert sein (kein eval, innerHTML mit Vorsicht).", "priority": 1, "section": "5.9"},
            {"id": "O.JS_2", "title": "Third-Party Scripts", "description": "Externe Scripts absichern", "requirement_text": "Third-Party Scripts muessen mit SRI oder CSP abgesichert werden.", "priority": 1, "section": "5.9"},
            {"id": "O.Store_1", "title": "Lokale Speicherung", "description": "LocalStorage sicher nutzen", "requirement_text": "Sensible Daten duerfen nicht im LocalStorage/SessionStorage gespeichert werden.", "priority": 1, "section": "5.9"},
        ]
    elif code == "BSI-TR-03161-3":
        # Part 3: Backend systems (Hintergrundsysteme)
        return [
            # System architecture
            {"id": "O.Arch_1", "title": "Defense in Depth", "description": "Mehrschichtige Sicherheit", "requirement_text": "Eine mehrschichtige Sicherheitsarchitektur (Defense in Depth) muss implementiert sein.", "priority": 1, "section": "6.1"},
            {"id": "O.Arch_2", "title": "Segmentierung", "description": "Netzwerksegmentierung", "requirement_text": "Das Netzwerk muss segmentiert sein (DMZ, interne Zonen).", "priority": 1, "section": "6.1"},
            {"id": "O.Arch_3", "title": "Microservices Isolation", "description": "Services isolieren", "requirement_text": "Microservices sollten minimal gekoppelt und isoliert sein.", "priority": 2, "section": "6.1"},
            {"id": "O.Arch_4", "title": "Zero Trust", "description": "Kein implizites Vertrauen", "requirement_text": "Interne Kommunikation sollte nach Zero-Trust-Prinzipien abgesichert sein.", "priority": 2, "section": "6.1"},
            # Data storage
            {"id": "O.DB_1", "title": "Datenbank-Sicherheit", "description": "DB abhaerten", "requirement_text": "Datenbanken muessen gehaertet sein (keine Default-Credentials, minimale Rechte).", "priority": 1, "section": "6.2"},
            {"id": "O.DB_2", "title": "Verschluesselung in DB", "description": "Sensible Felder verschluesseln", "requirement_text": "Sensible Daten sollten in der Datenbank verschluesselt gespeichert werden.", "priority": 1, "section": "6.2"},
            {"id": "O.DB_3", "title": "Backup-Verschluesselung", "description": "Backups verschluesseln", "requirement_text": "Datenbank-Backups muessen verschluesselt sein.", "priority": 1, "section": "6.2"},
            {"id": "O.DB_4", "title": "Zugriffskontrolle DB", "description": "DB-Zugriffe beschraenken", "requirement_text": "Der Datenbankzugriff muss auf notwendige Dienste beschraenkt sein.", "priority": 1, "section": "6.2"},
            {"id": "O.Store_2", "title": "Dateispeicher-Sicherheit", "description": "Uploads isolieren", "requirement_text": "Hochgeladene Dateien muessen isoliert und mit Malware-Scanning verarbeitet werden.", "priority": 1, "section": "6.2"},
            # Containers & infrastructure
            {"id": "O.Cont_1", "title": "Container-Sicherheit", "description": "Images scannen", "requirement_text": "Container-Images muessen auf Schwachstellen gescannt werden.", "priority": 1, "section": "6.3"},
            {"id": "O.Cont_2", "title": "Rootless Container", "description": "Nicht als Root laufen", "requirement_text": "Container sollten nicht als Root-User ausgefuehrt werden.", "priority": 1, "section": "6.3"},
            {"id": "O.Cont_3", "title": "Image-Herkunft", "description": "Vertrauenswuerdige Images", "requirement_text": "Es duerfen nur Images aus vertrauenswuerdigen Quellen verwendet werden.", "priority": 1, "section": "6.3"},
            {"id": "O.Cont_4", "title": "Read-Only Filesystem", "description": "Unveraenderliches Dateisystem", "requirement_text": "Container sollten mit Read-Only Root-Filesystem laufen wo moeglich.", "priority": 2, "section": "6.3"},
            {"id": "O.Cont_5", "title": "Resource Limits", "description": "CPU/Memory begrenzen", "requirement_text": "Container muessen Resource-Limits (CPU, Memory) konfiguriert haben.", "priority": 2, "section": "6.3"},
            # Secrets management
            {"id": "O.Sec_1", "title": "Secrets Management", "description": "Zentrale Secrets-Verwaltung", "requirement_text": "Sensible Konfiguration (Passwoerter, Keys) muss zentral und sicher verwaltet werden.", "priority": 1, "section": "6.4"},
            {"id": "O.Sec_2", "title": "Keine Hardcoded Secrets", "description": "Secrets nicht im Code", "requirement_text": "Secrets duerfen nicht im Quellcode oder in Git-Repositories stehen.", "priority": 1, "section": "6.4"},
            {"id": "O.Sec_3", "title": "Secret Rotation", "description": "Regelmaessige Rotation", "requirement_text": "Secrets und API-Keys sollten regelmaessig rotiert werden.", "priority": 2, "section": "6.4"},
            {"id": "O.Sec_4", "title": "Vault Integration", "description": "Secrets-Vault nutzen", "requirement_text": "Ein Secrets-Management-System (HashiCorp Vault o.ae.) sollte verwendet werden.", "priority": 2, "section": "6.4"},
            # Communication
            {"id": "O.Comm_1", "title": "Service-to-Service TLS", "description": "Interne Verschluesselung", "requirement_text": "Auch interne Service-Kommunikation sollte verschluesselt sein (mTLS).", "priority": 2, "section": "6.5"},
            {"id": "O.Comm_2", "title": "Message Queue Sicherheit", "description": "Queue-Zugriff absichern", "requirement_text": "Message Queues muessen authentifiziert und autorisiert werden.", "priority": 1, "section": "6.5"},
            {"id": "O.Comm_3", "title": "API Gateway", "description": "Zentraler Zugangspunkt", "requirement_text": "Ein API Gateway sollte als zentraler Zugangspunkt dienen.", "priority": 2, "section": "6.5"},
            # Monitoring & logging
            {"id": "O.Mon_1", "title": "Zentrale Logs", "description": "Log-Aggregation", "requirement_text": "Logs aller Services muessen zentral aggregiert werden.", "priority": 1, "section": "6.6"},
            {"id": "O.Mon_2", "title": "Security Monitoring", "description": "Anomalie-Erkennung", "requirement_text": "Sicherheitsrelevante Ereignisse muessen ueberwacht und alarmiert werden.", "priority": 1, "section": "6.6"},
            {"id": "O.Mon_3", "title": "Metriken", "description": "Performance-Monitoring", "requirement_text": "System-Metriken (CPU, Memory, Latenz) sollten erfasst und ueberwacht werden.", "priority": 2, "section": "6.6"},
            {"id": "O.Mon_4", "title": "Alerting", "description": "Alarmierung konfigurieren", "requirement_text": "Kritische Schwellwerte muessen definiert und alarmiert werden.", "priority": 1, "section": "6.6"},
            # CI/CD security
            {"id": "O.CI_1", "title": "Pipeline-Sicherheit", "description": "CI/CD absichern", "requirement_text": "CI/CD-Pipelines muessen abgesichert sein (Secrets, Zugriffsrechte).", "priority": 1, "section": "6.7"},
            {"id": "O.CI_2", "title": "SAST/DAST", "description": "Automatisierte Security-Tests", "requirement_text": "Statische und dynamische Sicherheitsanalysen sollten in die Pipeline integriert sein.", "priority": 2, "section": "6.7"},
            {"id": "O.CI_3", "title": "Dependency Scanning", "description": "Abhaengigkeiten pruefen", "requirement_text": "Abhaengigkeiten muessen automatisiert auf Schwachstellen geprueft werden.", "priority": 1, "section": "6.7"},
            {"id": "O.CI_4", "title": "SBOM", "description": "Software Bill of Materials", "requirement_text": "Ein SBOM (Software Bill of Materials) sollte generiert und gepflegt werden.", "priority": 2, "section": "6.7"},
            # Disaster recovery
            {"id": "O.DR_1", "title": "Backup-Strategie", "description": "3-2-1 Backup-Regel", "requirement_text": "Backups sollten der 3-2-1-Regel folgen (3 Kopien, 2 Medien, 1 offsite).", "priority": 1, "section": "6.8"},
            {"id": "O.DR_2", "title": "Recovery-Tests", "description": "Restore regelmaessig testen", "requirement_text": "Die Wiederherstellung aus Backups muss regelmaessig getestet werden.", "priority": 1, "section": "6.8"},
            {"id": "O.DR_3", "title": "RTO/RPO", "description": "Recovery-Ziele definieren", "requirement_text": "Recovery Time Objective (RTO) und Recovery Point Objective (RPO) muessen definiert sein.", "priority": 2, "section": "6.8"},
        ]
    # Codes other than the three TR-03161 parts have no seeded aspects.
    return []
def get_known_sources(self) -> List[Dict[str, Any]]:
    """Return metadata for every known regulation source.

    For each entry in ``KNOWN_SOURCES`` the database is consulted to
    report whether the regulation has already been ingested and, if so,
    how many requirements are stored for it.
    """
    result: List[Dict[str, Any]] = []
    for code, info in self.KNOWN_SOURCES.items():
        regulation = self.reg_repo.get_by_code(code)
        if regulation is None:
            requirement_count = 0
        else:
            requirement_count = (
                self.db.query(RequirementDB)
                .filter(RequirementDB.regulation_id == regulation.id)
                .count()
            )
        result.append(
            {
                "code": code,
                "url": info["url"],
                "source_type": info["type"].value,
                "regulation_type": info["regulation_type"].value,
                "has_data": regulation is not None,
                "requirement_count": requirement_count,
            }
        )
    return result

View File

@@ -0,0 +1,442 @@
"""
Compliance Report Generator Service.
Generates periodic compliance reports (weekly, monthly, quarterly, yearly).
Reports include:
- Compliance score trends
- Control status summary
- Risk assessment summary
- Evidence coverage
- Action items / recommendations
"""
import logging
from datetime import datetime, date, timedelta
from typing import Dict, List, Any, Optional
from enum import Enum
from sqlalchemy.orm import Session
from sqlalchemy import func
from ..db.models import (
RegulationDB,
RequirementDB,
ControlDB,
ControlMappingDB,
EvidenceDB,
RiskDB,
AuditExportDB,
ControlStatusEnum,
RiskLevelEnum,
EvidenceStatusEnum,
)
from ..db.repository import (
RegulationRepository,
ControlRepository,
EvidenceRepository,
RiskRepository,
)
logger = logging.getLogger(__name__)
class ReportPeriod(str, Enum):
    """Supported reporting periods for compliance report generation.

    Inherits from ``str`` so values serialize directly in JSON payloads
    and compare equal to their plain string form.
    """

    WEEKLY = "weekly"
    MONTHLY = "monthly"
    QUARTERLY = "quarterly"
    YEARLY = "yearly"
class ComplianceReportGenerator:
    """Generates compliance reports for different time periods.

    Aggregates the current control, risk, evidence, and regulation state
    from the repository layer into a plain report dictionary that the API
    layer can serialize or render. All sections reflect the state at call
    time; true historical trend data is not yet stored (see
    :meth:`_generate_trends_placeholder`).
    """

    def __init__(self, db: Session):
        self.db = db
        self.reg_repo = RegulationRepository(db)
        self.ctrl_repo = ControlRepository(db)
        self.evidence_repo = EvidenceRepository(db)
        self.risk_repo = RiskRepository(db)

    def generate_report(
        self,
        period: ReportPeriod,
        as_of_date: Optional[date] = None,
    ) -> Dict[str, Any]:
        """
        Generate a compliance report for the specified period.

        Args:
            period: Report period (weekly, monthly, quarterly, yearly)
            as_of_date: Report date (defaults to today)

        Returns:
            Complete report dictionary with metadata, summary, score,
            coverage, controls, risks, evidence, action items, and trends.
        """
        if as_of_date is None:
            as_of_date = date.today()
        # Calculate the reporting window from the period length.
        date_range = self._get_date_range(period, as_of_date)
        report = {
            "report_metadata": {
                # NOTE(review): utcnow() returns a naive timestamp (no
                # offset in isoformat); kept for output compatibility.
                "generated_at": datetime.utcnow().isoformat(),
                "period": period.value,
                "as_of_date": as_of_date.isoformat(),
                "date_range_start": date_range["start"].isoformat(),
                "date_range_end": date_range["end"].isoformat(),
                "report_title": self._get_report_title(period, as_of_date),
            },
            "executive_summary": self._generate_executive_summary(),
            "compliance_score": self._generate_compliance_score_section(),
            "regulations_coverage": self._generate_regulations_coverage(),
            "controls_summary": self._generate_controls_summary(),
            "risks_summary": self._generate_risks_summary(),
            "evidence_summary": self._generate_evidence_summary(),
            "action_items": self._generate_action_items(),
            "trends": self._generate_trends_placeholder(period),
        }
        return report

    def _get_date_range(self, period: ReportPeriod, as_of: date) -> Dict[str, date]:
        """Calculate the start/end dates for the reporting period.

        Uses fixed-length lookback windows (7/30/90/365 days) rather than
        calendar-aligned periods; falls back to 30 days for unknown values.
        """
        if period == ReportPeriod.WEEKLY:
            # Last 7 days
            start = as_of - timedelta(days=7)
        elif period == ReportPeriod.MONTHLY:
            # Last 30 days
            start = as_of - timedelta(days=30)
        elif period == ReportPeriod.QUARTERLY:
            # Last 90 days
            start = as_of - timedelta(days=90)
        elif period == ReportPeriod.YEARLY:
            # Last 365 days
            start = as_of - timedelta(days=365)
        else:
            # Defensive default for unexpected period values.
            start = as_of - timedelta(days=30)
        return {"start": start, "end": as_of}

    def _get_report_title(self, period: ReportPeriod, as_of: date) -> str:
        """Generate a localized (German) report title based on period."""
        titles = {
            ReportPeriod.WEEKLY: f"Woechentlicher Compliance-Report KW{as_of.isocalendar()[1]} {as_of.year}",
            ReportPeriod.MONTHLY: f"Monatlicher Compliance-Report {as_of.strftime('%B %Y')}",
            ReportPeriod.QUARTERLY: f"Quartals-Compliance-Report Q{(as_of.month - 1) // 3 + 1}/{as_of.year}",
            ReportPeriod.YEARLY: f"Jaehrlicher Compliance-Report {as_of.year}",
        }
        return titles.get(period, f"Compliance Report {as_of.isoformat()}")

    def _generate_executive_summary(self) -> Dict[str, Any]:
        """Generate the executive summary section.

        Maps the overall compliance score onto a traffic-light status
        (>=80 GREEN, >=60 YELLOW, else RED) and counts critical/high risks.
        """
        stats = self.ctrl_repo.get_statistics()
        risk_matrix = self.risk_repo.get_matrix_data()
        total_controls = stats.get("total", 0)
        score = stats.get("compliance_score", 0)
        # Determine overall status from the score thresholds.
        if score >= 80:
            status = "GREEN"
            status_text = "Guter Compliance-Stand"
        elif score >= 60:
            status = "YELLOW"
            status_text = "Verbesserungsbedarf"
        else:
            status = "RED"
            status_text = "Kritischer Handlungsbedarf"
        high_critical_risks = (
            risk_matrix["by_level"].get("critical", 0) +
            risk_matrix["by_level"].get("high", 0)
        )
        return {
            "overall_status": status,
            "status_text": status_text,
            "compliance_score": score,
            "total_controls": total_controls,
            "high_critical_risks": high_critical_risks,
            "key_findings": self._generate_key_findings(stats, risk_matrix),
        }

    def _generate_key_findings(
        self,
        ctrl_stats: Dict[str, Any],
        risk_matrix: Dict[str, Any],
    ) -> List[str]:
        """Generate key findings (German text) for the executive summary."""
        findings = []
        # Control status findings
        by_status = ctrl_stats.get("by_status", {})
        failed = by_status.get("fail", 0)
        planned = by_status.get("planned", 0)
        if failed > 0:
            findings.append(f"{failed} Controls im Status 'Fail' - sofortige Massnahmen erforderlich")
        if planned > 5:
            findings.append(f"{planned} Controls noch nicht implementiert")
        # Risk findings
        critical = risk_matrix["by_level"].get("critical", 0)
        high = risk_matrix["by_level"].get("high", 0)
        if critical > 0:
            findings.append(f"{critical} kritische Risiken identifiziert - Eskalation empfohlen")
        if high > 3:
            findings.append(f"{high} hohe Risiken - priorisierte Behandlung erforderlich")
        if not findings:
            findings.append("Keine kritischen Befunde - Compliance-Status stabil")
        return findings

    def _generate_compliance_score_section(self) -> Dict[str, Any]:
        """Generate the compliance score section with per-domain breakdown.

        A domain score counts passing controls fully and partial controls
        at half weight: (pass + 0.5 * partial) / total * 100.
        """
        stats = self.ctrl_repo.get_statistics()
        domain_scores = {}
        controls = self.ctrl_repo.get_all()
        # Tally pass/partial counts per control domain.
        domain_stats = {}
        for ctrl in controls:
            domain = ctrl.domain.value if ctrl.domain else "unknown"
            if domain not in domain_stats:
                domain_stats[domain] = {"total": 0, "pass": 0, "partial": 0}
            domain_stats[domain]["total"] += 1
            if ctrl.status == ControlStatusEnum.PASS:
                domain_stats[domain]["pass"] += 1
            elif ctrl.status == ControlStatusEnum.PARTIAL:
                domain_stats[domain]["partial"] += 1
        for domain, ds in domain_stats.items():
            if ds["total"] > 0:
                score = ((ds["pass"] + ds["partial"] * 0.5) / ds["total"]) * 100
                domain_scores[domain] = round(score, 1)
            else:
                # Defensive: entries are only created on first increment,
                # so total should never be 0 here.
                domain_scores[domain] = 0
        return {
            "overall_score": stats.get("compliance_score", 0),
            "by_domain": domain_scores,
            "domain_labels": {
                "gov": "Governance",
                "priv": "Datenschutz",
                "iam": "Identity & Access",
                "crypto": "Kryptografie",
                "sdlc": "Secure Development",
                "ops": "Operations",
                "ai": "KI-spezifisch",
                "cra": "Supply Chain",
                "aud": "Audit",
            },
        }

    def _generate_regulations_coverage(self) -> Dict[str, Any]:
        """Generate the regulations coverage section.

        A regulation counts as "covered" when at least one control is
        mapped to any of its requirements.
        """
        regulations = self.reg_repo.get_all()
        coverage = []
        for reg in regulations:
            # Count requirements for this regulation
            req_count = self.db.query(func.count(RequirementDB.id)).filter(
                RequirementDB.regulation_id == reg.id
            ).scalar() or 0
            # Count mapped controls
            mapped_controls = self.db.query(func.count(ControlMappingDB.id)).join(
                RequirementDB
            ).filter(
                RequirementDB.regulation_id == reg.id
            ).scalar() or 0
            coverage.append({
                "code": reg.code,
                "name": reg.name,
                "requirements": req_count,
                "mapped_controls": mapped_controls,
                "coverage_status": "covered" if mapped_controls > 0 else "pending",
            })
        return {
            "total_regulations": len(regulations),
            "covered_regulations": len([c for c in coverage if c["coverage_status"] == "covered"]),
            "details": coverage,
        }

    def _generate_controls_summary(self) -> Dict[str, Any]:
        """Generate the controls summary section, including the top 10
        controls that are overdue for review."""
        stats = self.ctrl_repo.get_statistics()
        due_for_review = self.ctrl_repo.get_due_for_review()
        return {
            "total": stats.get("total", 0),
            "by_status": stats.get("by_status", {}),
            "by_domain": stats.get("by_domain", {}),
            "due_for_review": len(due_for_review),
            "review_items": [
                {
                    "control_id": c.control_id,
                    "title": c.title,
                    "last_reviewed": c.last_reviewed_at.isoformat() if c.last_reviewed_at else None,
                }
                for c in due_for_review[:10]  # Top 10
            ],
        }

    def _generate_risks_summary(self) -> Dict[str, Any]:
        """Generate the risks summary section with category grouping and
        a list of critical/high inherent risks."""
        matrix = self.risk_repo.get_matrix_data()
        risks = self.risk_repo.get_all()
        # Group by category
        by_category = {}
        for risk in risks:
            cat = risk.category or "other"
            if cat not in by_category:
                by_category[cat] = 0
            by_category[cat] += 1
        # High priority risks
        high_priority = [
            {
                "risk_id": r.risk_id,
                "title": r.title,
                "inherent_risk": r.inherent_risk.value if r.inherent_risk else None,
                "owner": r.owner,
                "status": r.status,
            }
            for r in risks
            if r.inherent_risk in [RiskLevelEnum.CRITICAL, RiskLevelEnum.HIGH]
        ]
        return {
            "total_risks": matrix["total_risks"],
            "by_level": matrix["by_level"],
            "by_category": by_category,
            "high_priority_risks": high_priority,
            "risk_matrix": matrix["matrix"],
        }

    def _generate_evidence_summary(self) -> Dict[str, Any]:
        """Generate the evidence summary section.

        Lists (up to 20) controls that have no evidence attached. Only the
        first 100 evidence records are examined.
        """
        stats = self.evidence_repo.get_statistics()
        all_evidence = self.evidence_repo.get_all(limit=100)
        # Find controls without evidence.
        # NOTE(review): this issues one query per evidence record (N+1);
        # acceptable at limit=100 but a join would scale better.
        controls = self.ctrl_repo.get_all()
        controls_with_evidence = set()
        for evidence in all_evidence:
            control = self.db.query(ControlDB).filter(
                ControlDB.id == evidence.control_id
            ).first()
            if control:
                controls_with_evidence.add(control.control_id)
        controls_without_evidence = [
            c.control_id for c in controls
            if c.control_id not in controls_with_evidence
        ]
        return {
            "total_evidence": stats.get("total", 0),
            "by_type": stats.get("by_type", {}),
            "by_status": stats.get("by_status", {}),
            "coverage_percent": stats.get("coverage_percent", 0),
            "controls_without_evidence": controls_without_evidence[:20],  # Top 20
        }

    def _generate_action_items(self) -> List[Dict[str, Any]]:
        """Generate action items from failed controls (due in 7 days),
        open high/critical risks (14 days), and overdue reviews (30 days)."""
        action_items = []
        # Check for failed controls (top 5)
        failed_controls = self.ctrl_repo.get_all(status=ControlStatusEnum.FAIL)
        for ctrl in failed_controls[:5]:
            action_items.append({
                "priority": "high",
                "category": "control_remediation",
                "title": f"Control {ctrl.control_id} beheben",
                "description": f"Control '{ctrl.title}' ist im Status 'Fail'. Sofortige Massnahmen erforderlich.",
                "owner": ctrl.owner,
                "due_date": (date.today() + timedelta(days=7)).isoformat(),
            })
        # Check for critical/high risks still open (top 5)
        critical_risks = self.risk_repo.get_all(min_risk_level=RiskLevelEnum.HIGH)
        for risk in critical_risks[:5]:
            if risk.status == "open":
                action_items.append({
                    "priority": "high" if risk.inherent_risk == RiskLevelEnum.CRITICAL else "medium",
                    "category": "risk_treatment",
                    "title": f"Risiko {risk.risk_id} behandeln",
                    "description": f"Risiko '{risk.title}' hat Status 'open' und Level '{risk.inherent_risk.value}'.",
                    "owner": risk.owner,
                    "due_date": (date.today() + timedelta(days=14)).isoformat(),
                })
        # Check for overdue reviews (single aggregate item above threshold)
        due_for_review = self.ctrl_repo.get_due_for_review()
        if len(due_for_review) > 5:
            action_items.append({
                "priority": "medium",
                "category": "review",
                "title": f"{len(due_for_review)} Control-Reviews ueberfaellig",
                "description": "Mehrere Controls muessen reviewed werden.",
                "owner": "Compliance Officer",
                "due_date": (date.today() + timedelta(days=30)).isoformat(),
            })
        return action_items

    def _generate_trends_placeholder(self, period: ReportPeriod) -> Dict[str, Any]:
        """
        Generate trends section.

        Note: Full trend analysis requires historical data storage.
        This is a placeholder for future implementation.
        """
        return {
            "note": "Trend-Analyse erfordert historische Daten. Feature in Entwicklung.",
            "period": period.value,
            "compliance_score_trend": "stable",  # Placeholder
            "risk_trend": "stable",  # Placeholder
            "recommendations": [
                "Historische Score-Snapshots aktivieren fuer Trend-Analyse",
                "Regelmaessige Report-Generierung einrichten",
            ],
        }

    def generate_summary_report(self) -> Dict[str, Any]:
        """Generate a quick summary report (for dashboard widgets)."""
        stats = self.ctrl_repo.get_statistics()
        risk_matrix = self.risk_repo.get_matrix_data()
        evidence_stats = self.evidence_repo.get_statistics()
        return {
            "generated_at": datetime.utcnow().isoformat(),
            "compliance_score": stats.get("compliance_score", 0),
            "controls": {
                "total": stats.get("total", 0),
                "passing": stats.get("by_status", {}).get("pass", 0),
                "failing": stats.get("by_status", {}).get("fail", 0),
            },
            "risks": {
                "total": risk_matrix["total_risks"],
                "critical": risk_matrix["by_level"].get("critical", 0),
                "high": risk_matrix["by_level"].get("high", 0),
            },
            "evidence": {
                "total": evidence_stats.get("total", 0),
                "coverage": evidence_stats.get("coverage_percent", 0),
            },
        }

View File

@@ -0,0 +1,488 @@
"""
Compliance Seeder Service.
Seeds the database with initial regulations, controls, and requirements.
"""
import logging
from typing import Dict, List, Optional
from datetime import datetime
from sqlalchemy.orm import Session
from ..db.models import (
RegulationDB,
RequirementDB,
ControlDB,
ControlMappingDB,
RiskDB,
ServiceModuleDB,
ModuleRegulationMappingDB,
StatementOfApplicabilityDB,
RegulationTypeEnum,
ControlTypeEnum,
ControlDomainEnum,
ControlStatusEnum,
RiskLevelEnum,
ServiceTypeEnum,
RelevanceLevelEnum,
)
from ..data.regulations import REGULATIONS_SEED
from ..data.controls import CONTROLS_SEED
from ..data.requirements import REQUIREMENTS_SEED
from ..data.risks import RISKS_SEED
from ..data.service_modules import BREAKPILOT_SERVICES
from ..data.iso27001_annex_a import ISO27001_ANNEX_A_CONTROLS
logger = logging.getLogger(__name__)
class ComplianceSeeder:
    """Seeds the compliance database with initial data.

    Seeding is idempotent: every step checks for existing rows by natural
    key before inserting, so running it repeatedly only adds what is
    missing. Order matters — regulations must exist before requirements
    and module mappings can resolve their foreign keys.
    """

    def __init__(self, db: Session):
        self.db = db
        # Caches of natural key -> DB primary key, filled during seeding
        # so later steps can resolve foreign keys without extra queries.
        self._regulation_map: Dict[str, str] = {}  # code -> id
        self._module_map: Dict[str, str] = {}  # name -> id

    def seed_all(self, force: bool = False) -> Dict[str, int]:
        """
        Seed all compliance data.

        Args:
            force: If True, re-seed even if data exists

        Returns:
            Dictionary with counts of seeded items per category.

        Raises:
            Exception: re-raised after rollback if any seeding step fails.
        """
        results = {
            "regulations": 0,
            "controls": 0,
            "requirements": 0,
            "mappings": 0,
            "risks": 0,
            "service_modules": 0,
            "module_regulation_mappings": 0,
            "soa_entries": 0,
        }
        # Check if already seeded (regulation count is used as the marker)
        existing_regulations = self.db.query(RegulationDB).count()
        if existing_regulations > 0 and not force:
            logger.info(f"Database already has {existing_regulations} regulations, skipping seed")
            return results
        try:
            # Seed in order (regulations first, then controls, then requirements, then risks, then service modules)
            results["regulations"] = self._seed_regulations()
            results["controls"] = self._seed_controls()
            results["requirements"] = self._seed_requirements()
            results["mappings"] = self._seed_default_mappings()
            results["risks"] = self._seed_risks()
            results["service_modules"] = self._seed_service_modules()
            results["module_regulation_mappings"] = self._seed_module_regulation_mappings()
            results["soa_entries"] = self._seed_soa()
            # Single commit: all-or-nothing seeding.
            self.db.commit()
            logger.info(f"Seeding completed: {results}")
            return results
        except Exception as e:
            self.db.rollback()
            logger.error(f"Seeding failed: {e}")
            raise

    def _seed_regulations(self) -> int:
        """Seed regulations from REGULATIONS_SEED.

        Also populates ``_regulation_map`` for both new and pre-existing
        rows. Returns the number of newly inserted regulations.
        """
        count = 0
        for reg_data in REGULATIONS_SEED:
            # Check if regulation already exists
            existing = self.db.query(RegulationDB).filter(
                RegulationDB.code == reg_data["code"]
            ).first()
            if existing:
                self._regulation_map[reg_data["code"]] = existing.id
                continue
            regulation = RegulationDB(
                code=reg_data["code"],
                name=reg_data["name"],
                full_name=reg_data.get("full_name"),
                regulation_type=RegulationTypeEnum(reg_data["regulation_type"]),
                source_url=reg_data.get("source_url"),
                local_pdf_path=reg_data.get("local_pdf_path"),
                effective_date=reg_data.get("effective_date"),
                description=reg_data.get("description"),
                is_active=reg_data.get("is_active", True),
            )
            self.db.add(regulation)
            self.db.flush()  # Get the ID without committing
            self._regulation_map[reg_data["code"]] = regulation.id
            count += 1
        return count

    def _seed_controls(self) -> int:
        """Seed controls from CONTROLS_SEED.

        All new controls start in status PLANNED. Returns the number of
        newly inserted controls.
        """
        count = 0
        for ctrl_data in CONTROLS_SEED:
            # Check if control already exists
            existing = self.db.query(ControlDB).filter(
                ControlDB.control_id == ctrl_data["control_id"]
            ).first()
            if existing:
                continue
            control = ControlDB(
                control_id=ctrl_data["control_id"],
                domain=ControlDomainEnum(ctrl_data["domain"]),
                control_type=ControlTypeEnum(ctrl_data["control_type"]),
                title=ctrl_data["title"],
                description=ctrl_data.get("description"),
                pass_criteria=ctrl_data["pass_criteria"],
                implementation_guidance=ctrl_data.get("implementation_guidance"),
                code_reference=ctrl_data.get("code_reference"),
                is_automated=ctrl_data.get("is_automated", False),
                automation_tool=ctrl_data.get("automation_tool"),
                owner=ctrl_data.get("owner"),
                review_frequency_days=ctrl_data.get("review_frequency_days", 90),
                status=ControlStatusEnum.PLANNED,  # All start as planned
            )
            self.db.add(control)
            count += 1
        return count

    def _seed_requirements(self) -> int:
        """Seed requirements from REQUIREMENTS_SEED.

        Requirements whose regulation cannot be resolved (neither cached
        nor in the DB) are skipped with a warning. Duplicate detection is
        by (regulation, article, paragraph).
        """
        count = 0
        for req_data in REQUIREMENTS_SEED:
            # Get regulation ID (from cache first, then DB)
            regulation_code = req_data["regulation_code"]
            regulation_id = self._regulation_map.get(regulation_code)
            if not regulation_id:
                # Try to find in database
                regulation = self.db.query(RegulationDB).filter(
                    RegulationDB.code == regulation_code
                ).first()
                if regulation:
                    regulation_id = regulation.id
                    self._regulation_map[regulation_code] = regulation_id
                else:
                    logger.warning(f"Regulation {regulation_code} not found, skipping requirement")
                    continue
            # Check if requirement already exists
            existing = self.db.query(RequirementDB).filter(
                RequirementDB.regulation_id == regulation_id,
                RequirementDB.article == req_data["article"],
                RequirementDB.paragraph == req_data.get("paragraph"),
            ).first()
            if existing:
                continue
            requirement = RequirementDB(
                regulation_id=regulation_id,
                article=req_data["article"],
                paragraph=req_data.get("paragraph"),
                title=req_data["title"],
                description=req_data.get("description"),
                requirement_text=req_data.get("requirement_text"),
                breakpilot_interpretation=req_data.get("breakpilot_interpretation"),
                is_applicable=req_data.get("is_applicable", True),
                applicability_reason=req_data.get("applicability_reason"),
                priority=req_data.get("priority", 2),
            )
            self.db.add(requirement)
            count += 1
        return count

    def _seed_default_mappings(self) -> int:
        """Create default mappings between requirements and controls.

        Mapping rules are (regulation code, article prefix, control ids);
        requirements are matched by LIKE on the article prefix, so one
        rule can fan out to several requirements. Coverage level is
        always recorded as "full".
        """
        # Define default mappings based on domain/regulation relationships
        mapping_rules = [
            # GDPR Privacy mappings
            ("GDPR", "Art. 5", ["PRIV-001", "PRIV-003", "PRIV-006", "PRIV-007"]),
            ("GDPR", "Art. 25", ["PRIV-003", "PRIV-007"]),
            ("GDPR", "Art. 28", ["PRIV-005"]),
            ("GDPR", "Art. 30", ["PRIV-001"]),
            ("GDPR", "Art. 32", ["CRYPTO-001", "CRYPTO-002", "CRYPTO-003", "IAM-001", "OPS-002"]),
            ("GDPR", "Art. 35", ["PRIV-002", "AI-005"]),
            # AI Act mappings
            ("AIACT", "Art. 9", ["AI-001", "AI-004", "AI-005"]),
            ("AIACT", "Art. 13", ["AI-002", "AI-003"]),
            ("AIACT", "Art. 14", ["AI-003"]),
            ("AIACT", "Art. 15", ["AI-004", "SDLC-001", "SDLC-002"]),
            ("AIACT", "Art. 50", ["AI-002"]),
            # CRA mappings
            ("CRA", "Art. 10", ["SDLC-001", "SDLC-002", "SDLC-006"]),
            ("CRA", "Art. 11", ["GOV-005", "OPS-003"]),
            ("CRA", "Art. 13", ["CRA-001", "SDLC-005"]),
            ("CRA", "Art. 14", ["CRA-003", "OPS-004"]),
            ("CRA", "Art. 15", ["CRA-004"]),
            # BSI-TR mappings
            ("BSI-TR-03161-1", "O.Arch_1", ["GOV-001", "GOV-002", "GOV-004"]),
            ("BSI-TR-03161-1", "O.Auth_1", ["IAM-001", "IAM-002", "IAM-004"]),
            ("BSI-TR-03161-1", "O.Cryp_1", ["CRYPTO-001", "CRYPTO-002", "CRYPTO-003", "CRYPTO-004"]),
            ("BSI-TR-03161-1", "O.Data_1", ["CRYPTO-001", "CRYPTO-002", "PRIV-007"]),
            ("BSI-TR-03161-2", "O.Auth_2", ["IAM-004"]),
            ("BSI-TR-03161-2", "O.Source_1", ["SDLC-001", "SDLC-004"]),
            ("BSI-TR-03161-3", "O.Back_1", ["CRYPTO-002"]),
            ("BSI-TR-03161-3", "O.Ops_1", ["OPS-001", "OPS-002", "OPS-005"]),
        ]
        count = 0
        for reg_code, article_prefix, control_ids in mapping_rules:
            # Find requirements matching this regulation and article prefix
            requirements = self.db.query(RequirementDB).join(RegulationDB).filter(
                RegulationDB.code == reg_code,
                RequirementDB.article.like(f"{article_prefix}%"),
            ).all()
            for req in requirements:
                for control_id in control_ids:
                    # Find control; silently skip if not seeded
                    control = self.db.query(ControlDB).filter(
                        ControlDB.control_id == control_id
                    ).first()
                    if not control:
                        continue
                    # Check if mapping exists
                    existing = self.db.query(ControlMappingDB).filter(
                        ControlMappingDB.requirement_id == req.id,
                        ControlMappingDB.control_id == control.id,
                    ).first()
                    if existing:
                        continue
                    mapping = ControlMappingDB(
                        requirement_id=req.id,
                        control_id=control.id,
                        coverage_level="full",
                    )
                    self.db.add(mapping)
                    count += 1
        return count

    def seed_regulations_only(self) -> int:
        """Seed only regulations (useful for incremental updates)."""
        count = self._seed_regulations()
        self.db.commit()
        return count

    def seed_controls_only(self) -> int:
        """Seed only controls (useful for incremental updates)."""
        count = self._seed_controls()
        self.db.commit()
        return count

    def _seed_risks(self) -> int:
        """Seed risks from RISKS_SEED.

        The inherent risk level is derived from likelihood x impact via
        ``RiskDB.calculate_risk_level``; all new risks start as "open".
        """
        count = 0
        for risk_data in RISKS_SEED:
            # Check if risk already exists
            existing = self.db.query(RiskDB).filter(
                RiskDB.risk_id == risk_data["risk_id"]
            ).first()
            if existing:
                continue
            # Calculate inherent risk level
            inherent_risk = RiskDB.calculate_risk_level(
                risk_data["likelihood"],
                risk_data["impact"]
            )
            risk = RiskDB(
                risk_id=risk_data["risk_id"],
                title=risk_data["title"],
                description=risk_data.get("description"),
                category=risk_data["category"],
                likelihood=risk_data["likelihood"],
                impact=risk_data["impact"],
                inherent_risk=inherent_risk,
                mitigating_controls=risk_data.get("mitigating_controls", []),
                owner=risk_data.get("owner"),
                treatment_plan=risk_data.get("treatment_plan"),
                status="open",
            )
            self.db.add(risk)
            count += 1
        return count

    def seed_risks_only(self) -> int:
        """Seed only risks (useful for incremental updates)."""
        count = self._seed_risks()
        self.db.commit()
        return count

    def _seed_service_modules(self) -> int:
        """Seed service modules from BREAKPILOT_SERVICES.

        Also populates ``_module_map`` for both new and pre-existing
        rows. Returns the number of newly inserted modules.
        """
        count = 0
        for service_data in BREAKPILOT_SERVICES:
            # Check if service already exists
            existing = self.db.query(ServiceModuleDB).filter(
                ServiceModuleDB.name == service_data["name"]
            ).first()
            if existing:
                self._module_map[service_data["name"]] = existing.id
                continue
            module = ServiceModuleDB(
                name=service_data["name"],
                display_name=service_data["display_name"],
                description=service_data.get("description"),
                service_type=ServiceTypeEnum(service_data["service_type"]),
                port=service_data.get("port"),
                technology_stack=service_data.get("technology_stack", []),
                repository_path=service_data.get("repository_path"),
                docker_image=service_data.get("docker_image"),
                data_categories=service_data.get("data_categories", []),
                processes_pii=service_data.get("processes_pii", False),
                processes_health_data=service_data.get("processes_health_data", False),
                ai_components=service_data.get("ai_components", False),
                is_active=True,
                criticality=service_data.get("criticality", "medium"),
                owner_team=service_data.get("owner_team"),
            )
            self.db.add(module)
            self.db.flush()  # Get the ID without committing
            self._module_map[service_data["name"]] = module.id
            count += 1
        return count

    def _seed_module_regulation_mappings(self) -> int:
        """Create mappings between service modules and regulations.

        Mappings whose module or regulation cannot be resolved are
        skipped with a warning. Relevance comes from the seed data's
        "relevance" field.
        """
        count = 0
        for service_data in BREAKPILOT_SERVICES:
            # Get module ID (from cache first, then DB)
            module_id = self._module_map.get(service_data["name"])
            if not module_id:
                # Try to find in database
                module = self.db.query(ServiceModuleDB).filter(
                    ServiceModuleDB.name == service_data["name"]
                ).first()
                if module:
                    module_id = module.id
                    self._module_map[service_data["name"]] = module_id
                else:
                    logger.warning(f"Module {service_data['name']} not found, skipping regulation mappings")
                    continue
            # Process regulation mappings
            regulations = service_data.get("regulations", [])
            for reg_mapping in regulations:
                # Find regulation by code (cache first, then DB)
                regulation_code = reg_mapping["code"]
                regulation_id = self._regulation_map.get(regulation_code)
                if not regulation_id:
                    regulation = self.db.query(RegulationDB).filter(
                        RegulationDB.code == regulation_code
                    ).first()
                    if regulation:
                        regulation_id = regulation.id
                        self._regulation_map[regulation_code] = regulation_id
                    else:
                        logger.warning(f"Regulation {regulation_code} not found, skipping mapping for {service_data['name']}")
                        continue
                # Check if mapping exists
                existing = self.db.query(ModuleRegulationMappingDB).filter(
                    ModuleRegulationMappingDB.module_id == module_id,
                    ModuleRegulationMappingDB.regulation_id == regulation_id,
                ).first()
                if existing:
                    continue
                mapping = ModuleRegulationMappingDB(
                    module_id=module_id,
                    regulation_id=regulation_id,
                    relevance_level=RelevanceLevelEnum(reg_mapping["relevance"]),
                    notes=reg_mapping.get("notes"),
                )
                self.db.add(mapping)
                count += 1
        return count

    def seed_service_modules_only(self) -> int:
        """Seed only service modules (useful for incremental updates).

        Regulations are seeded first if the cache is empty, because the
        module-regulation mappings need regulation IDs. Returns the
        combined count of new modules and new mappings.
        """
        results = {
            "service_modules": 0,
            "module_regulation_mappings": 0,
        }
        # Ensure regulations are loaded first
        if not self._regulation_map:
            self._seed_regulations()
        results["service_modules"] = self._seed_service_modules()
        results["module_regulation_mappings"] = self._seed_module_regulation_mappings()
        self.db.commit()
        logger.info(f"Service modules seeding completed: {results}")
        return results["service_modules"] + results["module_regulation_mappings"]

    def _seed_soa(self) -> int:
        """
        Seed Statement of Applicability (SoA) entries from ISO 27001:2022 Annex A.

        Creates SoA entries for all 93 Annex A controls.
        This is MANDATORY for ISO 27001 certification.
        """
        count = 0
        for annex_control in ISO27001_ANNEX_A_CONTROLS:
            control_id = annex_control["control_id"]
            # Check if SoA entry already exists
            existing = self.db.query(StatementOfApplicabilityDB).filter(
                StatementOfApplicabilityDB.annex_a_control == control_id
            ).first()
            if existing:
                continue
            # Create SoA entry; all entries start in "planned" status and
            # applicability defaults to True unless the seed says otherwise.
            soa_entry = StatementOfApplicabilityDB(
                annex_a_control=control_id,
                annex_a_title=annex_control["title"],
                annex_a_category=annex_control["category"],
                is_applicable=annex_control.get("default_applicable", True),
                applicability_justification=annex_control.get("description", ""),
                implementation_status="planned",
                implementation_notes=annex_control.get("implementation_guidance", ""),
                breakpilot_control_ids=annex_control.get("breakpilot_controls", []),
                evidence_description="",
                risk_assessment_notes="",
            )
            self.db.add(soa_entry)
            count += 1
        logger.info(f"Seeded {count} SoA entries from ISO 27001:2022 Annex A")
        return count

    def seed_soa_only(self) -> int:
        """
        Seed only SoA entries (useful for incremental updates).

        Creates all 93 ISO 27001:2022 Annex A control entries in the SoA.
        """
        count = self._seed_soa()
        self.db.commit()
        logger.info(f"SoA seeding completed: {count} entries")
        return count