breakpilot-compliance/backend-compliance/compliance/services/export_generator.py
Sharang Parnerkar 3320ef94fc refactor: phase 0 guardrails + phase 1 step 2 (models.py split)
Squash of branch refactor/phase0-guardrails-and-models-split — 4 commits,
81 files, 173/173 pytest green, OpenAPI contract preserved (360 paths /
484 operations).

## Phase 0 — Architecture guardrails

Three defense-in-depth layers to keep the architecture rules enforced
regardless of who opens Claude Code in this repo:

  1. .claude/settings.json PreToolUse hook on Write/Edit blocks any file
     that would exceed the 500-line hard cap. Auto-loads in every Claude
     session in this repo.
  2. scripts/githooks/pre-commit (install via scripts/install-hooks.sh)
     enforces the LOC cap locally, freezes migrations/ without
     [migration-approved], and protects guardrail files without
     [guardrail-change].
  3. .gitea/workflows/ci.yaml gains loc-budget + guardrail-integrity +
     sbom-scan (syft+grype) jobs, adds mypy --strict for the new Python
     packages (compliance/{services,repositories,domain,schemas}), and
     tsc --noEmit for admin-compliance + developer-portal.

Per-language conventions documented in AGENTS.python.md, AGENTS.go.md,
AGENTS.typescript.md at the repo root — layering, tooling, and explicit
"what you may NOT do" lists. Root CLAUDE.md is prepended with the six
non-negotiable rules. Each of the 10 services gets a README.md.

scripts/check-loc.sh enforces soft 300 / hard 500 and surfaces the
current baseline of 205 hard + 161 soft violations so Phases 1-4 can
drain it incrementally. CI gates only CHANGED files in PRs so the
legacy baseline does not block unrelated work.
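The gating rule can be sketched in a few lines of Python (a simplification; the real check lives in scripts/check-loc.sh and the CI job, and the function names here are hypothetical):

```python
from pathlib import Path

SOFT_CAP = 300  # warn
HARD_CAP = 500  # block

def classify(path: str) -> str:
    """Bucket a file by its line count: 'hard', 'soft', or 'ok'."""
    loc = len(Path(path).read_text(encoding="utf-8", errors="ignore").splitlines())
    if loc > HARD_CAP:
        return "hard"
    if loc > SOFT_CAP:
        return "soft"
    return "ok"

def gate_pr(changed_files: list[str]) -> list[str]:
    """CI fails only on CHANGED files over the hard cap, so the legacy
    baseline (205 hard violations) never blocks unrelated PRs."""
    return [p for p in changed_files if classify(p) == "hard"]
```

Gating only the changed-file set is what lets Phases 1-4 drain the baseline incrementally instead of forcing a big-bang cleanup.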

## Deprecation sweep

47 files touched. Pydantic V1 regex= -> pattern= (2 sites), class Config ->
ConfigDict in source_policy_router.py (schemas.py intentionally skipped;
it is the Phase 1 Step 3 split target). datetime.utcnow() ->
datetime.now(timezone.utc) everywhere, including SQLAlchemy default=
callables. All DB columns already declare timezone=True, so this is a
latent-bug fix on the Python side, not a schema change.

DeprecationWarning count dropped from 158 to 35.
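The utcnow() half of the sweep reduces to one replacement pattern; a minimal sketch (the Pydantic changes appear only as comments, since schemas.py is deferred to Step 3):

```python
from datetime import datetime, timezone

# Pydantic V1 -> V2, mechanically:
#   Field(..., regex=r"...")        ->  Field(..., pattern=r"...")
#   class Config: orm_mode = True   ->  model_config = ConfigDict(from_attributes=True)

def utcnow_aware() -> datetime:
    """Replacement for the deprecated datetime.utcnow().

    utcnow() returns a NAIVE datetime; compared against timezone=True
    columns that is a latent bug. This callable slots into a SQLAlchemy
    default= unchanged."""
    return datetime.now(timezone.utc)
```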

## Phase 1 Step 1 — Contract test harness

tests/contracts/test_openapi_baseline.py diffs the live FastAPI /openapi.json
against tests/contracts/openapi.baseline.json on every test run. Fails on
removed paths, removed status codes, or new required request body fields.
Regenerate only via tests/contracts/regenerate_baseline.py after a
consumer-updated contract change. This is the safety harness for all
subsequent refactor commits.
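A minimal sketch of the diff the harness performs (illustrative only; the real test also checks for new required request-body fields):

```python
def contract_breaks(baseline: dict, live: dict) -> list[str]:
    """Compare two OpenAPI documents and return breaking changes only.

    Additions are allowed; removed paths or response status codes fail."""
    breaks = []
    for path, ops in baseline.get("paths", {}).items():
        live_ops = live.get("paths", {}).get(path)
        if live_ops is None:
            breaks.append(f"removed path: {path}")
            continue
        for method, op in ops.items():
            live_responses = live_ops.get(method, {}).get("responses", {})
            for status in op.get("responses", {}):
                if status not in live_responses:
                    breaks.append(f"removed status: {method.upper()} {path} {status}")
    return breaks
```

An empty return list means the refactor preserved the contract; anything else fails the test run.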

## Phase 1 Step 2 — models.py split (1466 -> 85 LOC shim)

compliance/db/models.py is decomposed into seven sibling aggregate modules
following the existing repo pattern (dsr_models.py, vvt_models.py, ...):

  regulation_models.py       (134) — Regulation, Requirement
  control_models.py          (279) — Control, Mapping, Evidence, Risk
  ai_system_models.py        (141) — AISystem, AuditExport
  service_module_models.py   (176) — ServiceModule, ModuleRegulation, ModuleRisk
  audit_session_models.py    (177) — AuditSession, AuditSignOff
  isms_governance_models.py  (323) — ISMSScope, Context, Policy, Objective, SoA
  isms_audit_models.py       (468) — Finding, CAPA, MgmtReview, InternalAudit,
                                     AuditTrail, Readiness

models.py becomes an 85-line re-export shim in dependency order so
existing imports continue to work unchanged. Schema is byte-identical:
__tablename__, column definitions, relationship strings, back_populates,
cascade directives all preserved.
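In shape, the shim is just ordered re-exports (an abridged sketch, not the real 85-line file; only the module names come from the table above, and the DB-suffixed class names are assumptions based on the imports used elsewhere in this repo):

```python
# compliance/db/models.py -- backward-compatible re-export shim (sketch)
# Dependency order matters: relationship() strings in later modules
# resolve against classes the earlier modules must have registered.
from .regulation_models import RegulationDB, RequirementDB        # noqa: F401
from .control_models import (                                     # noqa: F401
    ControlDB, ControlMappingDB, EvidenceDB, RiskDB,
)
from .ai_system_models import AISystemDB, AuditExportDB           # noqa: F401
# ... remaining aggregate modules in the same order as the table above
```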

All new sibling files are under the 500-line hard cap; largest is
isms_audit_models.py at 468. No file in compliance/db/ now exceeds
the hard cap.

## Phase 1 Step 3 — infrastructure only

backend-compliance/compliance/{schemas,domain,repositories}/ packages
are created as landing zones with docstrings. compliance/domain/
exports DomainError / NotFoundError / ConflictError / ValidationError /
PermissionError — the base classes services will use to raise
domain-level errors instead of HTTPException.
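The hierarchy is small enough to sketch in full (only the five class names come from the commit; the bodies and the example service function are assumptions):

```python
class DomainError(Exception):
    """Base class for service-layer failures; routers translate these to
    HTTP responses, so services never import HTTPException."""

class NotFoundError(DomainError): ...
class ConflictError(DomainError): ...
class ValidationError(DomainError): ...
class PermissionError(DomainError): ...  # deliberately shadows the builtin

def get_control(catalogue: dict, control_id: str):
    """Hypothetical service call: raises a domain error, not an HTTP one."""
    try:
        return catalogue[control_id]
    except KeyError:
        raise NotFoundError(f"control {control_id} not found") from None
```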

PHASE1_RUNBOOK.md at backend-compliance/PHASE1_RUNBOOK.md documents
the nine-step execution plan for Phase 1: snapshot baseline,
characterization tests, split models.py (this commit), split schemas.py
(next), extract services, extract repositories, mypy --strict, coverage.

## Verification

  backend-compliance/.venv-phase1: uv python install 3.12 + pip install -r requirements.txt
  PYTHONPATH=. pytest compliance/tests/ tests/contracts/
  -> 173 passed, 0 failed, 35 warnings, OpenAPI 360/484 unchanged

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-04-07 13:18:29 +02:00


"""
Audit Export Generator.
Generates ZIP packages for external auditors containing:
- Regulations & Requirements
- Control Catalogue with status
- Evidence artifacts
- Risk register
- Summary reports
"""
import hashlib
import json
import logging
import os
import shutil
import tempfile
import zipfile
from datetime import datetime, date, timezone
from pathlib import Path
from typing import Dict, List, Optional, Any
from sqlalchemy.orm import Session
from ..db.models import (
RegulationDB,
RequirementDB,
ControlDB,
ControlMappingDB,
EvidenceDB,
RiskDB,
AuditExportDB,
ExportStatusEnum,
ControlStatusEnum,
)
logger = logging.getLogger(__name__)
class AuditExportGenerator:
    """Generates audit export packages."""

    def __init__(self, db: Session, export_dir: str = "/tmp/compliance_exports"):
        self.db = db
        self.export_dir = Path(export_dir)
        self.export_dir.mkdir(parents=True, exist_ok=True)

    def create_export(
        self,
        requested_by: str,
        export_type: str = "full",
        included_regulations: Optional[List[str]] = None,
        included_domains: Optional[List[str]] = None,
        date_range_start: Optional[date] = None,
        date_range_end: Optional[date] = None,
    ) -> AuditExportDB:
        """
        Create a new audit export.

        Args:
            requested_by: User requesting the export
            export_type: "full", "controls_only", "evidence_only"
            included_regulations: Filter by regulation codes
            included_domains: Filter by control domains
            date_range_start: Evidence collected after this date
            date_range_end: Evidence collected before this date

        Returns:
            AuditExportDB record
        """
        # Create export record
        export_record = AuditExportDB(
            export_type=export_type,
            export_name=f"Breakpilot Compliance Export {datetime.now().strftime('%Y-%m-%d %H:%M')}",
            included_regulations=included_regulations,
            included_domains=included_domains,
            date_range_start=date_range_start,
            date_range_end=date_range_end,
            requested_by=requested_by,
            status=ExportStatusEnum.GENERATING,
        )
        self.db.add(export_record)
        self.db.flush()

        try:
            # Generate the export
            file_path, file_hash, file_size = self._generate_zip(
                export_record.id,
                export_type,
                included_regulations,
                included_domains,
                date_range_start,
                date_range_end,
            )

            # Update record with results
            export_record.file_path = str(file_path)
            export_record.file_hash = file_hash
            export_record.file_size_bytes = file_size
            export_record.status = ExportStatusEnum.COMPLETED
            export_record.completed_at = datetime.now(timezone.utc)

            # Calculate statistics
            stats = self._calculate_statistics(
                included_regulations, included_domains
            )
            export_record.total_controls = stats["total_controls"]
            export_record.total_evidence = stats["total_evidence"]
            export_record.compliance_score = stats["compliance_score"]

            self.db.commit()
            logger.info(f"Export completed: {file_path}")
            return export_record
        except Exception as e:
            export_record.status = ExportStatusEnum.FAILED
            export_record.error_message = str(e)
            self.db.commit()
            logger.error(f"Export failed: {e}")
            raise

    def _generate_zip(
        self,
        export_id: str,
        export_type: str,
        included_regulations: Optional[List[str]],
        included_domains: Optional[List[str]],
        date_range_start: Optional[date],
        date_range_end: Optional[date],
    ) -> tuple:
        """Generate the actual ZIP file."""
        timestamp = datetime.now().strftime("%Y-%m-%d_%H%M%S")
        zip_filename = f"audit_export_{timestamp}.zip"
        zip_path = self.export_dir / zip_filename

        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = Path(temp_dir)

            # Create directory structure
            (temp_path / "regulations").mkdir()
            (temp_path / "controls").mkdir()
            (temp_path / "evidence").mkdir()
            (temp_path / "risks").mkdir()

            # Generate content based on export type
            if export_type in ["full", "controls_only"]:
                self._export_regulations(temp_path / "regulations", included_regulations)
                self._export_controls(temp_path / "controls", included_domains)
            if export_type in ["full", "evidence_only"]:
                self._export_evidence(
                    temp_path / "evidence",
                    included_domains,
                    date_range_start,
                    date_range_end,
                )
            if export_type == "full":
                self._export_risks(temp_path / "risks")

            # Generate summary
            self._export_summary(
                temp_path,
                export_type,
                included_regulations,
                included_domains,
            )

            # Generate README
            self._export_readme(temp_path)

            # Generate index.html for navigation
            self._export_index_html(temp_path)

            # Create ZIP
            with zipfile.ZipFile(zip_path, "w", zipfile.ZIP_DEFLATED) as zf:
                for file_path in temp_path.rglob("*"):
                    if file_path.is_file():
                        arcname = file_path.relative_to(temp_path)
                        zf.write(file_path, arcname)

        # Calculate hash
        file_hash = self._calculate_file_hash(zip_path)
        file_size = zip_path.stat().st_size
        return zip_path, file_hash, file_size

    def _export_regulations(
        self, output_dir: Path, included_regulations: Optional[List[str]]
    ) -> None:
        """Export regulations to JSON files."""
        query = self.db.query(RegulationDB).filter(RegulationDB.is_active)
        if included_regulations:
            query = query.filter(RegulationDB.code.in_(included_regulations))
        regulations = query.all()

        for reg in regulations:
            # Get requirements for this regulation
            requirements = self.db.query(RequirementDB).filter(
                RequirementDB.regulation_id == reg.id
            ).all()

            data = {
                "code": reg.code,
                "name": reg.name,
                "full_name": reg.full_name,
                "type": reg.regulation_type.value if reg.regulation_type else None,
                "source_url": reg.source_url,
                "effective_date": reg.effective_date.isoformat() if reg.effective_date else None,
                "description": reg.description,
                "requirements": [
                    {
                        "article": r.article,
                        "paragraph": r.paragraph,
                        "title": r.title,
                        "description": r.description,
                        "is_applicable": r.is_applicable,
                        "breakpilot_interpretation": r.breakpilot_interpretation,
                    }
                    for r in requirements
                ],
            }

            file_path = output_dir / f"{reg.code.lower()}.json"
            with open(file_path, "w", encoding="utf-8") as f:
                json.dump(data, f, indent=2, ensure_ascii=False)

    def _export_controls(
        self, output_dir: Path, included_domains: Optional[List[str]]
    ) -> None:
        """Export controls to JSON and generate summary."""
        query = self.db.query(ControlDB)
        if included_domains:
            from ..db.models import ControlDomainEnum
            domain_enums = [ControlDomainEnum(d) for d in included_domains]
            query = query.filter(ControlDB.domain.in_(domain_enums))
        controls = query.order_by(ControlDB.control_id).all()

        controls_data = []
        for ctrl in controls:
            # Get mappings
            mappings = self.db.query(ControlMappingDB).filter(
                ControlMappingDB.control_id == ctrl.id
            ).all()

            # Get requirement references
            requirement_refs = []
            for m in mappings:
                req = self.db.query(RequirementDB).get(m.requirement_id)
                if req:
                    reg = self.db.query(RegulationDB).get(req.regulation_id)
                    requirement_refs.append({
                        "regulation": reg.code if reg else None,
                        "article": req.article,
                        "paragraph": req.paragraph,
                        "coverage": m.coverage_level,
                    })

            ctrl_data = {
                "control_id": ctrl.control_id,
                "domain": ctrl.domain.value if ctrl.domain else None,
                "type": ctrl.control_type.value if ctrl.control_type else None,
                "title": ctrl.title,
                "description": ctrl.description,
                "pass_criteria": ctrl.pass_criteria,
                "status": ctrl.status.value if ctrl.status else None,
                "is_automated": ctrl.is_automated,
                "automation_tool": ctrl.automation_tool,
                "owner": ctrl.owner,
                "last_reviewed": ctrl.last_reviewed_at.isoformat() if ctrl.last_reviewed_at else None,
                "code_reference": ctrl.code_reference,
                "mapped_requirements": requirement_refs,
            }
            controls_data.append(ctrl_data)

        # Write full catalogue
        with open(output_dir / "control_catalogue.json", "w", encoding="utf-8") as f:
            json.dump(controls_data, f, indent=2, ensure_ascii=False)

        # Write summary by domain
        domain_summary = {}
        for ctrl in controls_data:
            domain = ctrl["domain"]
            if domain not in domain_summary:
                domain_summary[domain] = {"total": 0, "pass": 0, "partial": 0, "fail": 0}
            domain_summary[domain]["total"] += 1
            status = ctrl["status"]
            if status in domain_summary[domain]:
                domain_summary[domain][status] += 1

        with open(output_dir / "domain_summary.json", "w", encoding="utf-8") as f:
            json.dump(domain_summary, f, indent=2, ensure_ascii=False)

    def _export_evidence(
        self,
        output_dir: Path,
        included_domains: Optional[List[str]],
        date_range_start: Optional[date],
        date_range_end: Optional[date],
    ) -> None:
        """Export evidence metadata and files."""
        query = self.db.query(EvidenceDB)
        if date_range_start:
            query = query.filter(
                EvidenceDB.collected_at >= datetime.combine(date_range_start, datetime.min.time())
            )
        if date_range_end:
            query = query.filter(
                EvidenceDB.collected_at <= datetime.combine(date_range_end, datetime.max.time())
            )
        if included_domains:
            from ..db.models import ControlDomainEnum
            domain_enums = [ControlDomainEnum(d) for d in included_domains]
            query = query.join(ControlDB).filter(ControlDB.domain.in_(domain_enums))
        evidence_list = query.all()

        evidence_data = []
        for ev in evidence_list:
            ctrl = self.db.query(ControlDB).get(ev.control_id)
            ev_data = {
                "id": ev.id,
                "control_id": ctrl.control_id if ctrl else None,
                "evidence_type": ev.evidence_type,
                "title": ev.title,
                "description": ev.description,
                "artifact_path": ev.artifact_path,
                "artifact_url": ev.artifact_url,
                "artifact_hash": ev.artifact_hash,
                "status": ev.status.value if ev.status else None,
                "valid_from": ev.valid_from.isoformat() if ev.valid_from else None,
                "valid_until": ev.valid_until.isoformat() if ev.valid_until else None,
                "collected_at": ev.collected_at.isoformat() if ev.collected_at else None,
                "source": ev.source,
            }
            evidence_data.append(ev_data)

            # Copy evidence files if they exist
            if ev.artifact_path and os.path.exists(ev.artifact_path):
                evidence_subdir = output_dir / ev.evidence_type
                evidence_subdir.mkdir(exist_ok=True)
                filename = os.path.basename(ev.artifact_path)
                shutil.copy2(ev.artifact_path, evidence_subdir / filename)

        with open(output_dir / "evidence_index.json", "w", encoding="utf-8") as f:
            json.dump(evidence_data, f, indent=2, ensure_ascii=False)

    def _export_risks(self, output_dir: Path) -> None:
        """Export risk register."""
        risks = self.db.query(RiskDB).order_by(RiskDB.risk_id).all()
        risks_data = []
        for risk in risks:
            risk_data = {
                "risk_id": risk.risk_id,
                "title": risk.title,
                "description": risk.description,
                "category": risk.category,
                "likelihood": risk.likelihood,
                "impact": risk.impact,
                "inherent_risk": risk.inherent_risk.value if risk.inherent_risk else None,
                "mitigating_controls": risk.mitigating_controls,
                "residual_likelihood": risk.residual_likelihood,
                "residual_impact": risk.residual_impact,
                "residual_risk": risk.residual_risk.value if risk.residual_risk else None,
                "owner": risk.owner,
                "status": risk.status,
                "treatment_plan": risk.treatment_plan,
            }
            risks_data.append(risk_data)

        with open(output_dir / "risk_register.json", "w", encoding="utf-8") as f:
            json.dump(risks_data, f, indent=2, ensure_ascii=False)

    def _export_summary(
        self,
        output_dir: Path,
        export_type: str,
        included_regulations: Optional[List[str]],
        included_domains: Optional[List[str]],
    ) -> None:
        """Generate summary.json with overall statistics."""
        stats = self._calculate_statistics(included_regulations, included_domains)
        summary = {
            "export_date": datetime.now().isoformat(),
            "export_type": export_type,
            "filters": {
                "regulations": included_regulations,
                "domains": included_domains,
            },
            "statistics": stats,
            "organization": "Breakpilot",
            "version": "1.0.0",
        }
        with open(output_dir / "summary.json", "w", encoding="utf-8") as f:
            json.dump(summary, f, indent=2, ensure_ascii=False)

    def _export_readme(self, output_dir: Path) -> None:
        """Generate README.md for auditors."""
        readme = """# Breakpilot Compliance Export

Dieses Paket enthält die Compliance-Dokumentation von Breakpilot.

## Struktur

```
├── summary.json        # Zusammenfassung und Statistiken
├── index.html          # HTML-Navigation (im Browser öffnen)
├── regulations/        # Verordnungen und Anforderungen
│   ├── gdpr.json
│   ├── aiact.json
│   └── ...
├── controls/           # Control Catalogue
│   ├── control_catalogue.json
│   └── domain_summary.json
├── evidence/           # Nachweise
│   ├── evidence_index.json
│   └── [evidence_type]/
└── risks/              # Risikoregister
    └── risk_register.json
```

## Verwendung

1. **HTML-Navigation**: Öffnen Sie `index.html` im Browser für eine visuelle Übersicht.
2. **JSON-Dateien**: Maschinenlesbare Daten für Import in GRC-Tools.
3. **Nachweis-Dateien**: Originale Scan-Reports und Konfigurationen.

## Kontakt

Bei Fragen wenden Sie sich an das Breakpilot Security Team.

---
Generiert am: """ + datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        with open(output_dir / "README.md", "w", encoding="utf-8") as f:
            f.write(readme)

    def _export_index_html(self, output_dir: Path) -> None:
        """Generate index.html for browser navigation."""
        html = """<!DOCTYPE html>
<html lang="de">
<head>
  <meta charset="UTF-8">
  <meta name="viewport" content="width=device-width, initial-scale=1.0">
  <title>Breakpilot Compliance Export</title>
  <style>
    body { font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif; max-width: 1200px; margin: 0 auto; padding: 2rem; background: #f5f5f5; }
    h1 { color: #1a1a1a; border-bottom: 3px solid #0066cc; padding-bottom: 1rem; }
    h2 { color: #333; margin-top: 2rem; }
    .card { background: white; border-radius: 8px; padding: 1.5rem; margin: 1rem 0; box-shadow: 0 2px 4px rgba(0,0,0,0.1); }
    .stats { display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 1rem; }
    .stat { background: linear-gradient(135deg, #0066cc, #004499); color: white; padding: 1.5rem; border-radius: 8px; text-align: center; }
    .stat-value { font-size: 2.5rem; font-weight: bold; }
    .stat-label { opacity: 0.9; margin-top: 0.5rem; }
    ul { list-style: none; padding: 0; }
    li { padding: 0.75rem; border-bottom: 1px solid #eee; }
    li:last-child { border-bottom: none; }
    a { color: #0066cc; text-decoration: none; }
    a:hover { text-decoration: underline; }
    .footer { margin-top: 3rem; padding-top: 1rem; border-top: 1px solid #ddd; color: #666; font-size: 0.9rem; }
  </style>
</head>
<body>
  <h1>Breakpilot Compliance Export</h1>
  <div class="stats">
    <div class="stat">
      <div class="stat-value" id="score">--%</div>
      <div class="stat-label">Compliance Score</div>
    </div>
    <div class="stat">
      <div class="stat-value" id="controls">--</div>
      <div class="stat-label">Controls</div>
    </div>
    <div class="stat">
      <div class="stat-value" id="evidence">--</div>
      <div class="stat-label">Evidence Items</div>
    </div>
    <div class="stat">
      <div class="stat-value" id="regulations">--</div>
      <div class="stat-label">Regulations</div>
    </div>
  </div>
  <div class="card">
    <h2>Regulations & Requirements</h2>
    <ul id="regulations-list">
      <li>Loading...</li>
    </ul>
  </div>
  <div class="card">
    <h2>Controls by Domain</h2>
    <ul id="domains-list">
      <li>Loading...</li>
    </ul>
  </div>
  <div class="card">
    <h2>Export Contents</h2>
    <ul>
      <li><a href="summary.json">summary.json</a> - Export metadata and statistics</li>
      <li><a href="controls/control_catalogue.json">controls/control_catalogue.json</a> - Full control catalogue</li>
      <li><a href="evidence/evidence_index.json">evidence/evidence_index.json</a> - Evidence index</li>
      <li><a href="risks/risk_register.json">risks/risk_register.json</a> - Risk register</li>
    </ul>
  </div>
  <div class="footer">
    <p>Generated by Breakpilot Compliance Framework</p>
  </div>
  <script>
    // Load summary and populate stats
    fetch('summary.json')
      .then(r => r.json())
      .then(data => {
        document.getElementById('score').textContent = (data.statistics.compliance_score || 0).toFixed(0) + '%';
        document.getElementById('controls').textContent = data.statistics.total_controls || 0;
        document.getElementById('evidence').textContent = data.statistics.total_evidence || 0;
        document.getElementById('regulations').textContent = data.statistics.total_regulations || 0;
      })
      .catch(() => console.log('Could not load summary'));

    // Regulations list (static entries)
    document.getElementById('regulations-list').innerHTML =
      '<li><a href="regulations/gdpr.json">GDPR</a> - Datenschutz-Grundverordnung</li>' +
      '<li><a href="regulations/aiact.json">AI Act</a> - KI-Verordnung</li>' +
      '<li><a href="regulations/cra.json">CRA</a> - Cyber Resilience Act</li>';

    // Load domain summary
    fetch('controls/domain_summary.json')
      .then(r => r.json())
      .then(data => {
        const list = document.getElementById('domains-list');
        list.innerHTML = Object.entries(data).map(([domain, stats]) =>
          `<li><strong>${domain.toUpperCase()}</strong>: ${stats.pass || 0}/${stats.total} controls passing</li>`
        ).join('');
      })
      .catch(() => console.log('Could not load domain summary'));
  </script>
</body>
</html>"""
        with open(output_dir / "index.html", "w", encoding="utf-8") as f:
            f.write(html)

    def _calculate_statistics(
        self,
        included_regulations: Optional[List[str]],
        included_domains: Optional[List[str]],
    ) -> Dict[str, Any]:
        """Calculate compliance statistics."""
        # Count regulations
        reg_query = self.db.query(RegulationDB).filter(RegulationDB.is_active)
        if included_regulations:
            reg_query = reg_query.filter(RegulationDB.code.in_(included_regulations))
        total_regulations = reg_query.count()

        # Count controls
        ctrl_query = self.db.query(ControlDB)
        if included_domains:
            from ..db.models import ControlDomainEnum
            domain_enums = [ControlDomainEnum(d) for d in included_domains]
            ctrl_query = ctrl_query.filter(ControlDB.domain.in_(domain_enums))
        total_controls = ctrl_query.count()
        passing_controls = ctrl_query.filter(ControlDB.status == ControlStatusEnum.PASS).count()
        partial_controls = ctrl_query.filter(ControlDB.status == ControlStatusEnum.PARTIAL).count()

        # Count evidence
        total_evidence = self.db.query(EvidenceDB).count()

        # Calculate compliance score
        if total_controls > 0:
            score = ((passing_controls + partial_controls * 0.5) / total_controls) * 100
        else:
            score = 0

        return {
            "total_regulations": total_regulations,
            "total_controls": total_controls,
            "passing_controls": passing_controls,
            "partial_controls": partial_controls,
            "total_evidence": total_evidence,
            "compliance_score": round(score, 1),
        }

    def _calculate_file_hash(self, file_path: Path) -> str:
        """Calculate SHA-256 hash of file."""
        sha256 = hashlib.sha256()
        with open(file_path, "rb") as f:
            for chunk in iter(lambda: f.read(8192), b""):
                sha256.update(chunk)
        return sha256.hexdigest()

    def get_export_status(self, export_id: str) -> Optional[AuditExportDB]:
        """Get status of an export."""
        return self.db.query(AuditExportDB).get(export_id)

    def list_exports(
        self, limit: int = 20, offset: int = 0
    ) -> List[AuditExportDB]:
        """List recent exports."""
        return (
            self.db.query(AuditExportDB)
            .order_by(AuditExportDB.requested_at.desc())
            .offset(offset)
            .limit(limit)
            .all()
        )