feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped

Interactive Training Videos (CP-TRAIN):
- DB migration 022: training_checkpoints + checkpoint_progress tables
- NarratorScript generation via Anthropic (AI Teacher persona, German)
- TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg)
- 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress
- InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking)
- Learner portal integration with automatic completion on all checkpoints passed
- 30 new tests (handler validation + grading logic + manifest/progress + seek protection)

Training Blocks:
- Block generator, block store, block config CRUD + preview/generate endpoints
- Migration 021: training_blocks schema

Control Generator + Canonical Library:
- Control generator routes + service enhancements
- Canonical control library helpers, sidebar entry
- Citation backfill service + tests
- CE libraries data (hazard, protection, evidence, lifecycle, components)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
Benjamin Admin
2026-03-16 21:41:48 +01:00
parent d2133dbfa2
commit 4f6bc8f6f6
50 changed files with 17299 additions and 198 deletions

File diff suppressed because it is too large


@@ -0,0 +1,976 @@
"""
FastAPI routes for the Control Generator Pipeline.
Endpoints:
POST /v1/canonical/generate — Start generation run
GET /v1/canonical/generate/status/{job_id} — Job status
GET /v1/canonical/generate/jobs — All jobs
GET /v1/canonical/generate/review-queue — Controls needing review
POST /v1/canonical/generate/review/{control_id} — Complete review
GET /v1/canonical/generate/processed-stats — Processing stats per collection
GET /v1/canonical/blocked-sources — Blocked sources list
POST /v1/canonical/blocked-sources/cleanup — Start cleanup workflow
"""
import asyncio
import json
import logging
from typing import Optional, List
from fastapi import APIRouter, HTTPException, Query
from pydantic import BaseModel
from sqlalchemy import text
from database import SessionLocal
from compliance.services.control_generator import (
ControlGeneratorPipeline,
GeneratorConfig,
ALL_COLLECTIONS,
VALID_CATEGORIES,
VALID_DOMAINS,
_detect_category,
_detect_domain,
_llm_local,
_parse_llm_json,
CATEGORY_LIST_STR,
)
from compliance.services.citation_backfill import CitationBackfill, BackfillResult
from compliance.services.rag_client import get_rag_client
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/v1/canonical", tags=["control-generator"])
# =============================================================================
# REQUEST / RESPONSE MODELS
# =============================================================================
class GenerateRequest(BaseModel):
domain: Optional[str] = None
collections: Optional[List[str]] = None
max_controls: int = 50
max_chunks: int = 1000 # Default: process max 1000 chunks per job (respects document boundaries)
batch_size: int = 5
skip_web_search: bool = False
dry_run: bool = False
class GenerateResponse(BaseModel):
job_id: str
status: str
message: str
total_chunks_scanned: int = 0
controls_generated: int = 0
controls_verified: int = 0
controls_needs_review: int = 0
controls_too_close: int = 0
controls_duplicates_found: int = 0
controls_qa_fixed: int = 0
errors: list = []
controls: list = []
class ReviewRequest(BaseModel):
action: str # "approve", "reject", "needs_rework"
release_state: Optional[str] = None # Override release_state
notes: Optional[str] = None
class ProcessedStats(BaseModel):
collection: str
total_chunks_estimated: int
processed_chunks: int
pending_chunks: int
direct_adopted: int
llm_reformed: int
skipped: int
class BlockedSourceResponse(BaseModel):
id: str
regulation_code: str
document_title: str
reason: str
deletion_status: str
qdrant_collection: Optional[str] = None
marked_at: str
# =============================================================================
# ENDPOINTS
# =============================================================================
async def _run_pipeline_background(config: GeneratorConfig, job_id: str):
"""Run the pipeline in the background. Uses its own DB session."""
db = SessionLocal()
try:
config.existing_job_id = job_id
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
result = await pipeline.run(config)
logger.info(
"Background generation job %s completed: %d controls from %d chunks",
job_id, result.controls_generated, result.total_chunks_scanned,
)
except Exception as e:
logger.error("Background generation job %s failed: %s", job_id, e)
# Update job as failed
try:
db.execute(
text("""
UPDATE canonical_generation_jobs
SET status = 'failed', errors = :errors, completed_at = NOW()
WHERE id = CAST(:job_id AS uuid)
"""),
{"job_id": job_id, "errors": json.dumps([str(e)])},
)
db.commit()
except Exception:
pass
finally:
db.close()
@router.post("/generate", response_model=GenerateResponse)
async def start_generation(req: GenerateRequest):
"""Start a control generation run (runs in background).
Returns immediately with job_id. Use GET /generate/status/{job_id} to poll progress.
"""
config = GeneratorConfig(
collections=req.collections,
domain=req.domain,
batch_size=req.batch_size,
max_controls=req.max_controls,
max_chunks=req.max_chunks,
skip_web_search=req.skip_web_search,
dry_run=req.dry_run,
)
if req.dry_run:
# Dry run: execute synchronously and return controls
db = SessionLocal()
try:
pipeline = ControlGeneratorPipeline(db=db, rag_client=get_rag_client())
result = await pipeline.run(config)
return GenerateResponse(
job_id=result.job_id,
status=result.status,
message=f"Dry run: {result.controls_generated} controls from {result.total_chunks_scanned} chunks",
total_chunks_scanned=result.total_chunks_scanned,
controls_generated=result.controls_generated,
controls_verified=result.controls_verified,
controls_needs_review=result.controls_needs_review,
controls_too_close=result.controls_too_close,
controls_duplicates_found=result.controls_duplicates_found,
errors=result.errors,
controls=result.controls,
)
except Exception as e:
logger.error("Dry run failed: %s", e)
raise HTTPException(status_code=500, detail=str(e))
finally:
db.close()
# Create job record first so we can return the ID
db = SessionLocal()
try:
result = db.execute(
text("""
INSERT INTO canonical_generation_jobs (status, config)
VALUES ('running', :config)
RETURNING id
"""),
{"config": json.dumps(config.model_dump())},
)
db.commit()
row = result.fetchone()
job_id = str(row[0]) if row else None
except Exception as e:
logger.error("Failed to create job: %s", e)
raise HTTPException(status_code=500, detail=f"Failed to create job: {e}")
finally:
db.close()
if not job_id:
raise HTTPException(status_code=500, detail="Failed to create job record")
# Launch pipeline in background
asyncio.create_task(_run_pipeline_background(config, job_id))
return GenerateResponse(
job_id=job_id,
status="running",
message="Generation started in background. Poll /generate/status/{job_id} for progress.",
)
@router.get("/generate/status/{job_id}")
async def get_job_status(job_id: str):
"""Get status of a generation job."""
db = SessionLocal()
try:
result = db.execute(
text("SELECT * FROM canonical_generation_jobs WHERE id = CAST(:id AS uuid)"),
{"id": job_id},
)
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Job not found")
cols = result.keys()
job = dict(zip(cols, row))
# Serialize datetime fields
for key in ("started_at", "completed_at", "created_at"):
if job.get(key):
job[key] = str(job[key])
job["id"] = str(job["id"])
return job
finally:
db.close()
@router.get("/generate/jobs")
async def list_jobs(
limit: int = Query(20, ge=1, le=100),
offset: int = Query(0, ge=0),
):
"""List all generation jobs."""
db = SessionLocal()
try:
result = db.execute(
text("""
SELECT id, status, total_chunks_scanned, controls_generated,
controls_verified, controls_needs_review, controls_too_close,
controls_duplicates_found, created_at, completed_at
FROM canonical_generation_jobs
ORDER BY created_at DESC
LIMIT :limit OFFSET :offset
"""),
{"limit": limit, "offset": offset},
)
jobs = []
cols = result.keys()
for row in result:
job = dict(zip(cols, row))
job["id"] = str(job["id"])
for key in ("created_at", "completed_at"):
if job.get(key):
job[key] = str(job[key])
jobs.append(job)
return {"jobs": jobs, "total": len(jobs)}
finally:
db.close()
@router.get("/generate/review-queue")
async def get_review_queue(
release_state: str = Query("needs_review", regex="^(needs_review|too_close|duplicate)$"),
limit: int = Query(50, ge=1, le=200),
):
"""Get controls that need manual review."""
db = SessionLocal()
try:
result = db.execute(
text("""
SELECT c.id, c.control_id, c.title, c.objective, c.severity,
c.release_state, c.license_rule, c.customer_visible,
c.generation_metadata, c.open_anchors, c.tags,
c.created_at
FROM canonical_controls c
WHERE c.release_state = :state
ORDER BY c.created_at DESC
LIMIT :limit
"""),
{"state": release_state, "limit": limit},
)
controls = []
cols = result.keys()
for row in result:
ctrl = dict(zip(cols, row))
ctrl["id"] = str(ctrl["id"])
ctrl["created_at"] = str(ctrl["created_at"])
# Parse JSON fields
for jf in ("generation_metadata", "open_anchors", "tags"):
if isinstance(ctrl.get(jf), str):
try:
ctrl[jf] = json.loads(ctrl[jf])
except (json.JSONDecodeError, TypeError):
pass
controls.append(ctrl)
return {"controls": controls, "total": len(controls)}
finally:
db.close()
@router.post("/generate/review/{control_id}")
async def review_control(control_id: str, req: ReviewRequest):
"""Complete review of a generated control."""
db = SessionLocal()
try:
# Validate control exists and is in reviewable state
result = db.execute(
text("SELECT id, release_state FROM canonical_controls WHERE control_id = :cid"),
{"cid": control_id},
)
row = result.fetchone()
if not row:
raise HTTPException(status_code=404, detail="Control not found")
current_state = row[1]
if current_state not in ("needs_review", "too_close", "duplicate"):
raise HTTPException(status_code=400, detail=f"Control is in state '{current_state}', not reviewable")
# Determine new state
if req.action == "approve":
new_state = req.release_state or "draft"
elif req.action == "reject":
new_state = "deprecated"
elif req.action == "needs_rework":
new_state = "needs_review"
else:
raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")
if new_state not in ("draft", "review", "approved", "deprecated", "needs_review", "too_close", "duplicate"):
raise HTTPException(status_code=400, detail=f"Invalid release_state: {new_state}")
db.execute(
text("""
UPDATE canonical_controls
SET release_state = :state, updated_at = NOW()
WHERE control_id = :cid
"""),
{"state": new_state, "cid": control_id},
)
db.commit()
return {"control_id": control_id, "release_state": new_state, "action": req.action}
finally:
db.close()
class BulkReviewRequest(BaseModel):
release_state: str # Filter: which controls to bulk-review
action: str # "approve" or "reject"
new_state: Optional[str] = None # Override target state
@router.post("/generate/bulk-review")
async def bulk_review(req: BulkReviewRequest):
"""Bulk review all controls matching a release_state filter.
Example: reject all needs_review → sets them to deprecated.
"""
if req.release_state not in ("needs_review", "too_close", "duplicate"):
raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")
if req.action == "approve":
target = req.new_state or "draft"
elif req.action == "reject":
target = "deprecated"
else:
raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")
if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")
db = SessionLocal()
try:
result = db.execute(
text("""
UPDATE canonical_controls
SET release_state = :target, updated_at = NOW()
WHERE release_state = :source
RETURNING control_id
"""),
{"source": req.release_state, "target": target},
)
affected = [row[0] for row in result]
db.commit()
return {
"action": req.action,
"source_state": req.release_state,
"target_state": target,
"affected_count": len(affected),
}
finally:
db.close()
class QAReclassifyRequest(BaseModel):
limit: int = 100 # How many controls to reclassify per run
dry_run: bool = True # Preview only by default
filter_category: Optional[str] = None # Only reclassify controls of this category
filter_domain_prefix: Optional[str] = None # Only reclassify controls with this prefix
@router.post("/generate/qa-reclassify")
async def qa_reclassify(req: QAReclassifyRequest):
"""Run QA reclassification on existing controls using local LLM.
Finds controls where keyword-detection disagrees with current category/domain,
then uses Ollama to determine the correct classification.
"""
db = SessionLocal()
try:
# Load controls to check
where_clauses = ["release_state NOT IN ('deprecated')"]
params = {"limit": req.limit}
if req.filter_category:
where_clauses.append("category = :cat")
params["cat"] = req.filter_category
if req.filter_domain_prefix:
where_clauses.append("control_id LIKE :prefix")
params["prefix"] = f"{req.filter_domain_prefix}-%"
where_sql = " AND ".join(where_clauses)
rows = db.execute(
text(f"""
SELECT id, control_id, title, objective, category,
COALESCE(requirements::text, '[]') as requirements,
COALESCE(source_original_text, '') as source_text
FROM canonical_controls
WHERE {where_sql}
ORDER BY created_at DESC
LIMIT :limit
"""),
params,
).fetchall()
results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}
for row in rows:
results["checked"] += 1
control_id = row[1]
title = row[2]
objective = row[3] or ""
current_category = row[4]
source_text = row[6] or objective
# Keyword detection on source text
kw_category = _detect_category(source_text) or _detect_category(objective)
kw_domain = _detect_domain(source_text)
current_prefix = control_id.split("-")[0] if "-" in control_id else ""
# Skip if keyword detection agrees with current classification
if kw_category == current_category and kw_domain == current_prefix:
continue
results["mismatches"] += 1
# Ask Ollama to arbitrate
try:
reqs_text = ""
try:
reqs = json.loads(row[5])
if isinstance(reqs, list):
reqs_text = ", ".join(str(r) for r in reqs[:3])
except Exception:
pass
prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.
Titel: {title[:100]}
Ziel: {objective[:200]}
Anforderungen: {reqs_text[:200]}
Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
Keyword-Erkennung: domain={kw_domain}, category={kw_category}
Welche Zuordnung ist korrekt? Antworte NUR als JSON:
{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}
Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
Kategorien: {CATEGORY_LIST_STR}"""
raw = await _llm_local(prompt)
data = _parse_llm_json(raw)
if not data:
continue
qa_domain = data.get("domain", "").upper()
qa_category = data.get("category", "")
reason = data.get("reason", "")
fix_entry = {
"control_id": control_id,
"title": title[:80],
"old_category": current_category,
"old_domain": current_prefix,
"new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
"new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
"reason": reason,
}
category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category
if category_changed and not req.dry_run:
db.execute(
text("""
UPDATE canonical_controls
SET category = :category, updated_at = NOW()
WHERE id = :id
"""),
{"id": row[0], "category": qa_category},
)
fix_entry["applied"] = True
else:
fix_entry["applied"] = False
results["fixes"].append(fix_entry)
except Exception as e:
results["errors"].append({"control_id": control_id, "error": str(e)})
if not req.dry_run:
db.commit()
return results
finally:
db.close()
@router.get("/generate/processed-stats")
async def get_processed_stats():
"""Get processing statistics per collection."""
db = SessionLocal()
try:
result = db.execute(
text("""
SELECT
collection,
COUNT(*) as processed_chunks,
COUNT(*) FILTER (WHERE processing_path = 'structured') as direct_adopted,
COUNT(*) FILTER (WHERE processing_path = 'llm_reform') as llm_reformed,
COUNT(*) FILTER (WHERE processing_path = 'skipped') as skipped
FROM canonical_processed_chunks
GROUP BY collection
ORDER BY collection
""")
)
stats = []
cols = result.keys()
for row in result:
stat = dict(zip(cols, row))
stat["total_chunks_estimated"] = 0 # Would need Qdrant API to get total
stat["pending_chunks"] = 0
stats.append(stat)
return {"stats": stats}
finally:
db.close()
# =============================================================================
# BLOCKED SOURCES
# =============================================================================
@router.get("/blocked-sources")
async def list_blocked_sources():
"""List all blocked (Rule 3) sources."""
db = SessionLocal()
try:
result = db.execute(
text("""
SELECT id, regulation_code, document_title, reason,
deletion_status, qdrant_collection, marked_at
FROM canonical_blocked_sources
ORDER BY marked_at DESC
""")
)
sources = []
cols = result.keys()
for row in result:
src = dict(zip(cols, row))
src["id"] = str(src["id"])
src["marked_at"] = str(src["marked_at"])
sources.append(src)
return {"sources": sources}
finally:
db.close()
@router.post("/blocked-sources/cleanup")
async def start_cleanup():
"""Start cleanup workflow for blocked sources.
This marks all pending blocked sources for deletion.
Actual RAG chunk deletion and file removal is a separate manual step.
"""
db = SessionLocal()
try:
result = db.execute(
text("""
UPDATE canonical_blocked_sources
SET deletion_status = 'marked_for_deletion'
WHERE deletion_status = 'pending'
RETURNING regulation_code
""")
)
marked = [row[0] for row in result]
db.commit()
return {
"status": "marked_for_deletion",
"marked_count": len(marked),
"regulation_codes": marked,
"message": "Sources marked for deletion. Run manual cleanup to remove RAG chunks and files.",
}
finally:
db.close()
# =============================================================================
# CUSTOMER VIEW FILTER
# =============================================================================
@router.get("/controls-customer")
async def get_controls_customer_view(
severity: Optional[str] = Query(None),
domain: Optional[str] = Query(None),
):
"""Get controls filtered for customer visibility.
Rule 3 controls have source_citation and source_original_text hidden.
generation_metadata is NEVER shown to customers.
"""
db = SessionLocal()
try:
query = """
SELECT c.id, c.control_id, c.title, c.objective, c.rationale,
c.scope, c.requirements, c.test_procedure, c.evidence,
c.severity, c.risk_score, c.implementation_effort,
c.open_anchors, c.release_state, c.tags,
c.license_rule, c.customer_visible,
c.source_original_text, c.source_citation,
c.created_at, c.updated_at
FROM canonical_controls c
WHERE c.release_state IN ('draft', 'approved')
"""
params: dict = {}
if severity:
query += " AND c.severity = :severity"
params["severity"] = severity
if domain:
query += " AND c.control_id LIKE :domain"
params["domain"] = f"{domain.upper()}-%"
query += " ORDER BY c.control_id"
result = db.execute(text(query), params)
controls = []
cols = result.keys()
for row in result:
ctrl = dict(zip(cols, row))
ctrl["id"] = str(ctrl["id"])
for key in ("created_at", "updated_at"):
if ctrl.get(key):
ctrl[key] = str(ctrl[key])
# Parse JSON fields
for jf in ("scope", "requirements", "test_procedure", "evidence",
"open_anchors", "tags", "source_citation"):
if isinstance(ctrl.get(jf), str):
try:
ctrl[jf] = json.loads(ctrl[jf])
except (json.JSONDecodeError, TypeError):
pass
# Customer visibility rules:
# - NEVER show generation_metadata
# - Rule 3: NEVER show source_citation or source_original_text
ctrl.pop("generation_metadata", None)
if not ctrl.get("customer_visible", True):
ctrl["source_citation"] = None
ctrl["source_original_text"] = None
controls.append(ctrl)
return {"controls": controls, "total": len(controls)}
finally:
db.close()
# =============================================================================
# CITATION BACKFILL
# =============================================================================
class BackfillRequest(BaseModel):
dry_run: bool = True # Default to dry_run for safety
limit: int = 0 # 0 = all controls
class BackfillResponse(BaseModel):
status: str
total_controls: int = 0
matched_hash: int = 0
matched_regex: int = 0
matched_llm: int = 0
unmatched: int = 0
updated: int = 0
errors: list = []
_backfill_status: dict = {}
async def _run_backfill_background(dry_run: bool, limit: int, backfill_id: str):
"""Run backfill in background with own DB session."""
db = SessionLocal()
try:
backfill = CitationBackfill(db=db, rag_client=get_rag_client())
result = await backfill.run(dry_run=dry_run, limit=limit)
_backfill_status[backfill_id] = {
"status": "completed",
"total_controls": result.total_controls,
"matched_hash": result.matched_hash,
"matched_regex": result.matched_regex,
"matched_llm": result.matched_llm,
"unmatched": result.unmatched,
"updated": result.updated,
"errors": result.errors[:50],
}
logger.info("Backfill %s completed: %d updated", backfill_id, result.updated)
except Exception as e:
logger.error("Backfill %s failed: %s", backfill_id, e)
_backfill_status[backfill_id] = {"status": "failed", "errors": [str(e)]}
finally:
db.close()
@router.post("/generate/backfill-citations", response_model=BackfillResponse)
async def start_backfill(req: BackfillRequest):
"""Backfill article/paragraph into existing control source_citations.
Uses 3-tier matching: hash lookup → regex parse → Ollama LLM.
Default is dry_run=True (preview only, no DB changes).
"""
import uuid
backfill_id = str(uuid.uuid4())[:8]
_backfill_status[backfill_id] = {"status": "running"}
# Always run in background (RAG index build takes minutes)
asyncio.create_task(_run_backfill_background(req.dry_run, req.limit, backfill_id))
return BackfillResponse(
status=f"running (id={backfill_id})",
)
@router.get("/generate/backfill-status/{backfill_id}")
async def get_backfill_status(backfill_id: str):
"""Get status of a backfill job."""
status = _backfill_status.get(backfill_id)
if not status:
raise HTTPException(status_code=404, detail="Backfill job not found")
return status
# =============================================================================
# DOMAIN + TARGET AUDIENCE BACKFILL
# =============================================================================
class DomainBackfillRequest(BaseModel):
dry_run: bool = True
job_id: Optional[str] = None # Only backfill controls from this job
limit: int = 0 # 0 = all
_domain_backfill_status: dict = {}
async def _run_domain_backfill(req: DomainBackfillRequest, backfill_id: str):
"""Backfill domain, category, and target_audience for existing controls using Anthropic."""
import os
import httpx
ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
if not ANTHROPIC_API_KEY:
_domain_backfill_status[backfill_id] = {
"status": "failed", "error": "ANTHROPIC_API_KEY not set"
}
return
db = SessionLocal()
try:
# Find controls needing backfill
where_clauses = ["(target_audience IS NULL OR target_audience = '[]' OR target_audience = 'null')"]
params: dict = {}
if req.job_id:
where_clauses.append("generation_metadata->>'job_id' = :job_id")
params["job_id"] = req.job_id
query = f"""
SELECT id, control_id, title, objective, category, source_original_text, tags
FROM canonical_controls
WHERE {' AND '.join(where_clauses)}
ORDER BY control_id
"""
if req.limit > 0:
query += f" LIMIT {req.limit}"
result = db.execute(text(query), params)
controls = [dict(zip(result.keys(), row)) for row in result]
total = len(controls)
updated = 0
errors = []
_domain_backfill_status[backfill_id] = {
"status": "running", "total": total, "updated": 0, "errors": []
}
# Process in batches of 10
BATCH_SIZE = 10
for batch_start in range(0, total, BATCH_SIZE):
batch = controls[batch_start:batch_start + BATCH_SIZE]
entries = []
for idx, ctrl in enumerate(batch):
text_for_analysis = ctrl.get("objective") or ctrl.get("title") or ""
original = ctrl.get("source_original_text") or ""
if original:
text_for_analysis += f"\n\nQuelltext-Auszug: {original[:500]}"
entries.append(
f"--- CONTROL {idx + 1}: {ctrl['control_id']} ---\n"
f"Titel: {ctrl.get('title', '')}\n"
f"Objective: {text_for_analysis[:800]}\n"
f"Tags: {json.dumps(ctrl.get('tags', []))}"
)
prompt = f"""Analysiere die folgenden {len(batch)} Controls und bestimme fuer jedes:
1. domain: Das Fachgebiet (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
2. category: Die Kategorie (encryption, authentication, network, data_protection, logging, incident, continuity, compliance, supply_chain, physical, personnel, application, system, risk, governance, hardware, identity, public_administration, labor_law, finance, trade_regulation, environmental, health)
3. target_audience: Liste der Zielgruppen (moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
Antworte mit einem JSON-Array mit {len(batch)} Objekten. Jedes Objekt hat:
- control_index: 1-basierter Index
- domain: Fachgebiet-Kuerzel
- category: Kategorie
- target_audience: Liste der Zielgruppen
{"".join(entries)}"""
try:
headers = {
"x-api-key": ANTHROPIC_API_KEY,
"anthropic-version": "2023-06-01",
"content-type": "application/json",
}
payload = {
"model": ANTHROPIC_MODEL,
"max_tokens": 4096,
"system": "Du bist ein Compliance-Experte. Klassifiziere Controls nach Fachgebiet und Zielgruppe. Antworte NUR mit validem JSON.",
"messages": [{"role": "user", "content": prompt}],
}
async with httpx.AsyncClient(timeout=60.0) as client:
resp = await client.post(
"https://api.anthropic.com/v1/messages",
headers=headers,
json=payload,
)
if resp.status_code != 200:
errors.append(f"Anthropic API {resp.status_code} at batch {batch_start}")
continue
raw = resp.json().get("content", [{}])[0].get("text", "")
# Parse response
import re
bracket_match = re.search(r"\[.*\]", raw, re.DOTALL)
if not bracket_match:
errors.append(f"No JSON array in response at batch {batch_start}")
continue
results_list = json.loads(bracket_match.group(0))
for item in results_list:
idx = item.get("control_index", 0) - 1
if idx < 0 or idx >= len(batch):
continue
ctrl = batch[idx]
ctrl_id = str(ctrl["id"])
new_domain = item.get("domain", "")
new_category = item.get("category", "")
new_audience = item.get("target_audience", [])
if not isinstance(new_audience, list):
new_audience = []
# Build new control_id from domain if domain changed
old_prefix = ctrl["control_id"].split("-")[0] if ctrl["control_id"] else ""
new_prefix = new_domain.upper()[:4] if new_domain else old_prefix
if not req.dry_run:
update_parts = []
update_params: dict = {"ctrl_id": ctrl_id}
if new_category:
update_parts.append("category = :category")
update_params["category"] = new_category
if new_audience:
update_parts.append("target_audience = :target_audience")
update_params["target_audience"] = json.dumps(new_audience)
# Note: We do NOT rename control_ids here — that would
# break references and cause unique constraint violations.
if update_parts:
update_parts.append("updated_at = NOW()")
db.execute(
text(f"UPDATE canonical_controls SET {', '.join(update_parts)} WHERE id = CAST(:ctrl_id AS uuid)"),
update_params,
)
updated += 1
if not req.dry_run:
db.commit()
except Exception as e:
errors.append(f"Batch {batch_start}: {str(e)}")
db.rollback()
_domain_backfill_status[backfill_id] = {
"status": "running", "total": total, "updated": updated,
"progress": f"{min(batch_start + BATCH_SIZE, total)}/{total}",
"errors": errors[-10:],
}
_domain_backfill_status[backfill_id] = {
"status": "completed", "total": total, "updated": updated,
"errors": errors[-50:],
}
logger.info("Domain backfill %s completed: %d/%d updated", backfill_id, updated, total)
except Exception as e:
logger.error("Domain backfill %s failed: %s", backfill_id, e)
_domain_backfill_status[backfill_id] = {"status": "failed", "error": str(e)}
finally:
db.close()
@router.post("/generate/backfill-domain")
async def start_domain_backfill(req: DomainBackfillRequest):
"""Backfill domain, category, and target_audience for controls using Anthropic API.
Finds controls where target_audience is NULL and enriches them.
Default is dry_run=True (preview only).
"""
import uuid
backfill_id = str(uuid.uuid4())[:8]
_domain_backfill_status[backfill_id] = {"status": "starting"}
asyncio.create_task(_run_domain_backfill(req, backfill_id))
return {"status": "running", "backfill_id": backfill_id,
"message": f"Domain backfill started. Poll /generate/domain-backfill-status/{backfill_id}"}
@router.get("/generate/domain-backfill-status/{backfill_id}")
async def get_domain_backfill_status(backfill_id: str):
"""Get status of a domain backfill job."""
status = _domain_backfill_status.get(backfill_id)
if not status:
raise HTTPException(status_code=404, detail="Domain backfill job not found")
return status


@@ -37,13 +37,21 @@ Wir benoetigen ein System, um aus verschiedenen Security-Guidelines **eigenstaen
| Domain | Name | Description |
|--------|------|-------------|
| AUTH | Identity & Access Management | Authentication, MFA, token management |
| NET | Network & Transport Security | TLS, certificates, network hardening |
| SUP | Software Supply Chain | Signing, SBOM, dependency scanning |
| LOG | Security Operations & Logging | Privacy-aware logging, SIEM |
| WEB | Web Application Security | Admin flows, account recovery |
| DATA | Data Governance & Classification | Data classification, protection measures |
| CRYP | Cryptographic Operations | Key management, rotation, HSM |
| REL | Release & Change Governance | Change impact assessment, security review |
| ACC | Access Control | Access control, permissions |
| SEC | IT Security | Vulnerabilities, hardening, configuration |
| INC | Incident Management | Incident management, recovery |
| AI | Artificial Intelligence | AI compliance, bias, transparency |
| COMP | Compliance | Conformity, audit, certification |
| GOV | Government & Public Administration | Public authorities, administration, oversight |
| LAB | Labor Law | Employment law, occupational safety, works constitution |
| FIN | Financial Regulation | Financial regulation, accounting, BaFin |
| TRD | Trade Regulation | Trade, commercial law, product safety |
| ENV | Environmental | Environmental protection, sustainability, emissions |
| HLT | Health | Health, medical devices, hygiene |
!!! warning "No BSI nomenclature"
    The domains deliberately do NOT use BSI identifiers (O.Auth_*, O.Netz_*).
@@ -123,6 +131,8 @@ erDiagram
| `GET` | `/v1/canonical/generate/processed-stats` | Verarbeitungsstatistik pro Collection |
| `GET` | `/v1/canonical/generate/review-queue` | Controls zur Pruefung |
| `POST` | `/v1/canonical/generate/review/{control_id}` | Review abschliessen |
| `POST` | `/v1/canonical/generate/bulk-review` | Bulk-Review (approve/reject nach State) |
| `POST` | `/v1/canonical/generate/qa-reclassify` | QA-Reklassifizierung bestehender Controls |
| `GET` | `/v1/canonical/blocked-sources` | Gesperrte Quellen (Rule 3) |
| `POST` | `/v1/canonical/blocked-sources/cleanup` | Cleanup-Workflow starten |
@@ -231,25 +241,28 @@ Der Validator (`scripts/validate-controls.py`) prueft bei jedem Commit:
## Control Generator Pipeline
Automatische Generierung von Controls aus dem gesamten RAG-Korpus (~105.000 Chunks aus Gesetzen, Verordnungen und Standards).
Aktueller Stand: **~4.738 Controls** generiert.
### 9-Stufen-Pipeline
```mermaid
flowchart TD
A[1. RAG Scroll] -->|max_chunks| B[2. Prefilter - Lokales LLM]
B -->|Irrelevant| C[Als processed markieren]
B -->|Relevant| D[3. License Classify]
D -->|Batch sammeln| E[4. Batch Processing - 5 Chunks/API-Call]
E -->|Rule 1/2| F[4a. Structure Batch - Anthropic]
E -->|Rule 3| G[4b. Reform Batch - Anthropic]
F --> QA[5. QA Validation - Lokales LLM]
G --> QA
QA -->|Mismatch| QAF[Auto-Fix Category/Domain]
QA -->|OK| H[6. Harmonization - Embeddings]
QAF --> H
H -->|Duplikat| I[Als Duplikat speichern]
H -->|Neu| J[7. Anchor Search]
J --> K[8. Store Control]
K --> L[9. Mark Processed]
```
### Stufe 1: RAG Scroll (Vollstaendig)
@@ -343,12 +356,64 @@ except Exception as e:
Die `batch_size` ist ueber `GeneratorConfig` konfigurierbar.
Bei grosser Batch-Size steigt die Wahrscheinlichkeit fuer Parsing-Fehler.
### Stufe 5: QA Validation (Automatische Qualitaetspruefung)
Die QA-Stufe validiert die Klassifizierung jedes Controls automatisch. Sie vergleicht die LLM-Klassifizierung mit Keyword-basierter Erkennung und loest bei Abweichungen eine Arbitrierung durch das lokale Ollama-Modell aus.
#### Ablauf
1. **LLM-Category auswerten:** Der Anthropic-Prompt fragt jetzt explizit nach `category` und `domain`
2. **Keyword-Detection als Cross-Check:** `_detect_category(chunk.text)` liefert eine zweite Meinung
3. **Stimmen beide ueberein?** → Kein QA noetig (schneller Pfad)
4. **Bei Disagreement:** Lokales LLM (Ollama qwen3.5:35b-a3b) arbitriert
5. **Auto-Fix:** Bei hoher Konfidenz wird Category/Domain automatisch korrigiert
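Der Cross-Check aus Schritt 2 bis 4 laesst sich als stark vereinfachte Python-Skizze zusammenfassen. Die Keyword-Listen sind hier nur illustrative Annahmen (nicht der Produktionscode), die Funktionsnamen sind an `_detect_category()` aus der Pipeline angelehnt:

```python
from typing import Optional

# Illustrative Annahme: stark verkuerzte Keyword-Listen pro Kategorie.
CATEGORY_KEYWORDS = {
    "finance": ["ifrs", "rechnungslegung", "finanzinstrument", "bafin"],
    "authentication": ["mfa", "login", "passwort", "authentisierung"],
}


def detect_category(text: str) -> Optional[str]:
    """Keyword-basierte Zweitmeinung zur LLM-Klassifizierung."""
    lowered = text.lower()
    for category, keywords in CATEGORY_KEYWORDS.items():
        if any(kw in lowered for kw in keywords):
            return category
    return None


def needs_qa(llm_category: str, chunk_text: str) -> bool:
    """True, wenn LLM- und Keyword-Klassifizierung widersprechen
    und das lokale Modell arbitrieren muss (Schritt 4)."""
    keyword_category = detect_category(chunk_text)
    return keyword_category is not None and keyword_category != llm_category
```

Stimmen beide Verfahren ueberein (oder liefert die Keyword-Erkennung kein Ergebnis), wird der schnelle Pfad ohne QA-Aufruf genommen.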
#### Beispiel
```
Control: "Offenlegung von Risikokonzentrationen bei Finanzinstrumenten"
LLM sagt: domain=AUTH, category=authentication
Keyword sagt: domain=FIN, category=finance
→ QA via Ollama: domain=FIN, category=finance (Grund: IFRS-Thema)
→ Auto-Fix: AUTH-315 → FIN-xxx
```
#### QA-Metriken in generation_metadata
```json
{
"qa_category_fix": {"from": "authentication", "to": "finance", "reason": "IFRS-Thema"},
"qa_domain_fix": {"from": "AUTH", "to": "FIN", "reason": "Finanzregulierung"}
}
```
#### QA-Reklassifizierung bestehender Controls
Fuer bereits generierte Controls gibt es den Backfill-Endpoint:
```bash
# Dry Run: Welche AUTH-Controls sind falsch klassifiziert?
curl -X POST https://macmini:8002/api/compliance/v1/canonical/generate/qa-reclassify \
-H 'Content-Type: application/json' \
-d '{"limit": 50, "dry_run": true, "filter_domain_prefix": "AUTH"}'
# Korrekturen anwenden:
curl -X POST https://macmini:8002/api/compliance/v1/canonical/generate/qa-reclassify \
-H 'Content-Type: application/json' \
-d '{"limit": 50, "dry_run": false, "filter_domain_prefix": "AUTH"}'
```
!!! info "Performance"
Die QA-Stufe nutzt das lokale Ollama-Modell (kostenlos, ~2s/Control).
Sie wird nur bei Disagreement zwischen LLM und Keyword getriggert (~10-15% der Controls),
sodass der Overhead minimal bleibt.
### Stufe 6: Harmonisierung (Embedding-basiert)
Prueft per bge-m3 Embeddings (Cosine Similarity > 0.85), ob ein aehnliches Control existiert.
Embeddings werden in Batches vorgeladen (32 Texte/Request) fuer maximale Performance.
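Der Duplikat-Check laesst sich konzeptionell so skizzieren (vereinfachte Python-Skizze mit Toy-Vektoren; im Dienst kommen echte bge-m3-Embeddings und vorgeladene Batches zum Einsatz):

```python
import math
from typing import List

SIMILARITY_THRESHOLD = 0.85  # Schwelle wie oben dokumentiert


def cosine_similarity(a: List[float], b: List[float]) -> float:
    """Cosine Similarity zweier Embedding-Vektoren."""
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(y * y for y in b))
    return dot / norm if norm else 0.0


def is_duplicate(candidate: List[float], existing: List[List[float]]) -> bool:
    """True, wenn ein bestehendes Control-Embedding die Schwelle ueberschreitet."""
    return any(
        cosine_similarity(candidate, emb) > SIMILARITY_THRESHOLD
        for emb in existing
    )
```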
### Stufe 7-9: Anchor Search, Store, Mark Processed
- **Anchor Search:** Findet Open-Source-Referenzen (OWASP, NIST, ENISA)
- **Store:** Persistiert Control mit `verification_method` und `category`
@@ -358,6 +423,12 @@ Embeddings werden in Batches vorgeladen (32 Texte/Request) fuer maximale Perform
Bei der Generierung werden automatisch zugewiesen:
**Category** wird seit 2026-03-16 **dreigleisig** bestimmt:
1. **LLM-Klassifikation (primaer):** Anthropic liefert `category` im JSON-Response
2. **Keyword-Detection (fallback):** Falls LLM keine Category liefert, greift `_detect_category()`
3. **QA-Arbitrierung (bei Mismatch):** Lokales LLM entscheidet bei Widerspruch
**Verification Method** (Nachweis-Methode):
| Methode | Beschreibung |
@@ -367,10 +438,33 @@ Bei der Generierung werden automatisch zugewiesen:
| `tool` | Tool-basierte Pruefung |
| `hybrid` | Kombination mehrerer Methoden |
**Category** (22 thematische Kategorien):
| Kategorie | Beschreibung |
|-----------|-------------|
| `encryption` | Verschluesselung, Kryptographie |
| `authentication` | Authentifizierung, Login, MFA |
| `network` | Netzwerk, Firewall, VPN |
| `data_protection` | Datenschutz, DSGVO |
| `logging` | Protokollierung, Monitoring |
| `incident` | Vorfallmanagement |
| `continuity` | Business Continuity, Backup |
| `compliance` | Konformitaet, Audit, Zertifizierung |
| `supply_chain` | Lieferkette, Dienstleister |
| `physical` | Physische Sicherheit |
| `personnel` | Schulung, Mitarbeiter |
| `application` | Software, Code Review, API |
| `system` | Haertung, Patch, Konfiguration |
| `risk` | Risikobewertung, -management |
| `governance` | Sicherheitsorganisation, Richtlinien |
| `hardware` | Hardware, Firmware, TPM |
| `identity` | IAM, SSO, Verzeichnisdienste |
| `public_administration` | Behoerden, Verwaltung |
| `labor_law` | Arbeitsrecht, Arbeitsschutz |
| `finance` | Finanzregulierung, Rechnungslegung |
| `trade_regulation` | Gewerbe, Handelsrecht |
| `environmental` | Umweltschutz, Nachhaltigkeit |
| `health` | Gesundheit, Medizinprodukte |
### Konfiguration
@@ -379,7 +473,7 @@ system, risk, governance, hardware, identity
| `ANTHROPIC_API_KEY` | — | API-Key fuer Anthropic Claude |
| `CONTROL_GEN_ANTHROPIC_MODEL` | `claude-sonnet-4-6` | Anthropic-Modell fuer Formulierung |
| `OLLAMA_URL` | `http://host.docker.internal:11434` | Lokaler Ollama-Server (Vorfilter) |
| `CONTROL_GEN_OLLAMA_MODEL` | `qwen3.5:35b-a3b` | Lokales LLM fuer Vorfilter + QA |
| `CONTROL_GEN_LLM_TIMEOUT` | `180` | Timeout in Sekunden (erhoeht fuer Batch-Calls) |
**Pipeline-Konfiguration (via `GeneratorConfig`):**
@@ -388,6 +482,7 @@ system, risk, governance, hardware, identity
|-----------|---------|-------------|
| `batch_size` | `5` | Chunks pro Anthropic-API-Call |
| `max_controls` | `0` | Limit (0 = alle Chunks verarbeiten) |
| `max_chunks` | `1000` | Max Chunks pro Job (respektiert Dokumentgrenzen) |
| `skip_processed` | `true` | Bereits verarbeitete Chunks ueberspringen |
| `dry_run` | `false` | Trockenlauf ohne DB-Schreibzugriffe |
| `skip_web_search` | `false` | Web-Suche fuer Anchor-Finder ueberspringen |
@@ -423,12 +518,16 @@ curl https://macmini:8002/api/compliance/v1/canonical/generate/jobs \
| Collection | Inhalte | Erwartete Regel |
|-----------|---------|----------------|
| `bp_compliance_gesetze` | Deutsche Gesetze (BDSG, TTDSG, TKG etc.) | Rule 1 |
| `bp_compliance_datenschutz` | Datenschutz-Leitlinien + EU-Verordnungen | Rule 1/2 |
| `bp_compliance_ce` | CE/Sicherheitsstandards | Rule 1/2/3 |
| `bp_dsfa_corpus` | DSFA-Korpus | Rule 1/2 |
| `bp_legal_templates` | Rechtsvorlagen | Rule 1 |
!!! warning "bp_compliance_recht entfernt (2026-03-16)"
Die Collection `bp_compliance_recht` wurde geloescht, da sie mit `bp_compliance_datenschutz`
ueberlappte (~20.000 Duplikat-Chunks). Alle relevanten EU-Verordnungen sind in den anderen
Collections enthalten.
---
## Processed Chunks Tracking
@@ -514,10 +613,10 @@ curl -s https://macmini:8002/api/compliance/v1/canonical/generate/processed-stat
| Metrik | Wert |
|--------|------|
| RAG-Chunks gesamt | ~105.000 (nach Dedup 2026-03-16) |
| Verarbeitete Chunks | ~105.000 |
| Generierte Controls | **~4.738** |
| Konversionsrate | ~4,5% (nur sicherheitsrelevante Chunks erzeugen Controls) |
!!! info "Warum so wenige Controls?"
Die meisten RAG-Chunks sind Definitionen, Begriffsbestimmungen, Inhaltsverzeichnisse oder

@@ -1,22 +1,27 @@
# Training Engine (CP-TRAIN)
KI-generierte Schulungsinhalte, Rollenmatrix, Quiz-Engine, Zertifikate und Training Blocks fuer Compliance-Schulungen.
**Prefix:** `CP-TRAIN` · **seq:** 4800 · **Frontend:** `https://macmini:3007/sdk/training`
**Learner-Portal:** `https://macmini:3007/sdk/training/learner`
**Service:** `ai-compliance-sdk` (Go/Gin, Port 8093)
**Proxy:** `/api/sdk/v1/training/[[...path]]` → `ai-compliance-sdk:8090/sdk/v1/training/...`
**Developer Portal:** `https://macmini:3006/api/training`
---
## Features
- Rollenbasierte Schulungsmatrix (CTM) — 10 Rollen (R1–R10)
- KI-generierte Schulungsinhalte (Text, Audio, Video via TTS-Service)
- Quiz-Engine mit automatischer Auswertung und Bestehensgrenze
- Deadline-Tracking und 4-stufige Eskalation (7/14/30/45 Tage)
- Aufgabenzuweisung (Assignments) mit Fortschrittstracking
- PDF-Zertifikate nach erfolgreichem Abschluss
- Training Blocks — automatische Modul-Erstellung aus Canonical Controls
- Learner-Portal fuer Mitarbeiter (Schulung absolvieren, Quiz, Zertifikat)
- Unveraenderliches Audit-Log fuer Compliance-Nachweise
- Bulk-Content-Generierung fuer alle Module auf einmal
---
@@ -25,13 +30,15 @@ KI-generierte Schulungsinhalte, Rollenmatrix, Quiz-Engine und Zertifikatsverwalt
| Artikel | Bezug |
|---------|-------|
| Art. 39 Abs. 1b DSGVO | DSB-Aufgabe: Sensibilisierung und Schulung |
| Art. 5 AI Act | Schulungspflicht fuer verbotene KI-Praktiken |
| Art. 4 Abs. 2 AI Act | Schulung fuer KI-Alphabetisierung |
---
## Tabs / Ansichten
### Admin-Frontend (`/sdk/training`)
| Tab | Inhalt |
|-----|--------|
| `overview` | Statistiken, Deadline-Warnung, Eskalation-Check |
@@ -39,7 +46,23 @@ KI-generierte Schulungsinhalte, Rollenmatrix, Quiz-Engine und Zertifikatsverwalt
| `matrix` | Rollen-Modul-Matrix — wer muss welche Schulung absolvieren |
| `assignments` | Zugewiesene Schulungen mit Fortschritt und Deadline |
| `content` | KI-generierter Inhalt pro Modul + Audio/Video-Player |
| `audit` | Unveraenderliches Audit-Log aller Schulungsaktivitaeten |
### Learner-Portal (`/sdk/training/learner`)
| Tab | Inhalt |
|-----|--------|
| Meine Schulungen | Zuweisungsliste mit Status-Badges, Fortschrittsbalken, Deadline |
| Schulungsinhalt | Modul-Content (Markdown), AudioPlayer, VideoPlayer |
| Quiz | Fragen mit Antwortauswahl, Timer, Ergebnis-Anzeige |
| Zertifikate | Grid mit abgeschlossenen Schulungen, PDF-Download |
**Learner-Workflow:**
1. Mitarbeiter sieht seine Zuweisungen
2. Klickt "Schulung starten" → Content lesen, Audio/Video anhoeren
3. "Quiz starten" → Fragen beantworten
4. Bei Bestehen: "Zertifikat generieren" → PDF-Download
---
@@ -49,65 +72,122 @@ KI-generierte Schulungsinhalte, Rollenmatrix, Quiz-Engine und Zertifikatsverwalt
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sdk/v1/training/modules` | Alle Module (`regulation_area`, `frequency_type`, `search` Filter) |
| `GET` | `/sdk/v1/training/modules/:id` | Modul-Detail mit Content und Quiz-Fragen |
| `POST` | `/sdk/v1/training/modules` | Modul erstellen |
| `PUT` | `/sdk/v1/training/modules/:id` | Modul aktualisieren |
| `DELETE` | `/sdk/v1/training/modules/:id` | Modul loeschen |
### Rollenmatrix (CTM)
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sdk/v1/training/matrix` | Vollstaendige Rollenmatrix |
| `GET` | `/sdk/v1/training/matrix/:role` | Matrix fuer eine Rolle |
| `POST` | `/sdk/v1/training/matrix` | Eintrag hinzufuegen |
| `DELETE` | `/sdk/v1/training/matrix/:role/:moduleId` | Eintrag entfernen |
**Rollen:** R1 Geschaeftsfuehrung, R2 IT-Leitung, R3 DSB, R4 ISB, R5 HR, R6 Einkauf, R7 Fachabteilung, R8 IT-Admin, R9 Alle Mitarbeiter, R10 Behoerden
### Assignments
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sdk/v1/training/assignments` | Alle Zuweisungen (Filter: `user_id`, `module_id`, `role`, `status`) |
| `GET` | `/sdk/v1/training/assignments/:id` | Zuweisung-Detail |
| `POST` | `/sdk/v1/training/assignments/compute` | Zuweisungen aus Matrix berechnen |
| `POST` | `/sdk/v1/training/assignments/:id/start` | Schulung starten |
| `POST` | `/sdk/v1/training/assignments/:id/progress` | Fortschritt aktualisieren |
| `POST` | `/sdk/v1/training/assignments/:id/complete` | Schulung abschliessen |
| `PUT` | `/sdk/v1/training/assignments/:id` | Deadline aktualisieren |
### KI-Content-Generierung
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sdk/v1/training/content/generate` | Inhalt KI-generieren (Markdown) |
| `POST` | `/sdk/v1/training/content/generate-quiz` | Quiz-Fragen KI-generieren |
| `GET` | `/sdk/v1/training/content/:moduleId` | Veroeffentlichten Content laden |
| `POST` | `/sdk/v1/training/content/generate-all` | Alle Module bulk-generieren |
| `POST` | `/sdk/v1/training/content/:id/publish` | Inhalt freigeben |
| `POST` | `/sdk/v1/training/content/generate-all-quiz` | Alle Quizzes bulk-generieren |
### Quiz
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sdk/v1/training/quiz/:moduleId` | Quiz-Fragen fuer ein Modul |
| `POST` | `/sdk/v1/training/quiz/:moduleId/submit` | Antworten einreichen |
| `GET` | `/sdk/v1/training/quiz/attempts/:assignmentId` | Quiz-Versuche anzeigen |
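Die Auswertung beim Submit laesst sich konzeptionell so skizzieren (Python-Skizze zur Illustration; der SDK-Service selbst ist in Go implementiert, und die Bestehensgrenze von 70 % ist hier nur eine Annahme):

```python
from typing import List, Tuple


def grade_quiz(
    answers: List[int],
    correct: List[int],
    pass_threshold: float = 0.7,  # Annahme; real pro Modul konfigurierbar
) -> Tuple[float, bool]:
    """Vergleicht gewaehlte Antwort-Indizes mit `correct_index` pro Frage
    und liefert (Score, bestanden?)."""
    if not correct:
        raise ValueError("Quiz ohne Fragen kann nicht bewertet werden")
    hits = sum(1 for a, c in zip(answers, correct) if a == c)
    score = hits / len(correct)
    return score, score >= pass_threshold
```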
### Media (Audio/Video)
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sdk/v1/training/content/:moduleId/generate-audio` | Audio generieren (Piper TTS) |
| `POST` | `/sdk/v1/training/content/:moduleId/generate-video` | Video generieren (TTS + Folien) |
| `POST` | `/sdk/v1/training/content/:moduleId/preview-script` | Video-Script Vorschau (JSON) |
| `GET` | `/sdk/v1/training/media/module/:moduleId` | Alle Medien eines Moduls |
| `GET` | `/sdk/v1/training/media/:mediaId/url` | Metadaten (Bucket, Object Key) |
| `POST` | `/sdk/v1/training/media/:mediaId/publish` | Media veroeffentlichen |
| `GET` | `/sdk/v1/training/media/:mediaId/stream` | **Media streamen** (307 → Presigned URL) |
**Media-Streaming:** Der `/stream`-Endpoint liefert einen `307 Temporary Redirect` zu einer zeitlich begrenzten Presigned URL (MinIO/S3). Browser und Player folgen dem Redirect automatisch. Der Next.js-Proxy leitet den Redirect transparent weiter.
### Deadlines & Eskalation
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sdk/v1/training/deadlines` | Bevorstehende Deadlines |
| `GET` | `/sdk/v1/training/deadlines/overdue` | Ueberfaellige Deadlines |
| `POST` | `/sdk/v1/training/escalation/check` | Eskalation ausfuehren |
**Eskalationsstufen:**
| Stufe | Tage ueberfaellig | Aktion |
|-------|-------------------|--------|
| 1 | 7 Tage | Erinnerung an Mitarbeiter |
| 2 | 14 Tage | Benachrichtigung Teamleitung |
| 3 | 30 Tage | Benachrichtigung Management |
| 4 | 45 Tage | Benachrichtigung Compliance Officer |
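Die Stufen-Auswahl aus der Tabelle laesst sich so skizzieren (Python-Skizze zur Illustration; der Eskalations-Check im SDK-Service ist in Go implementiert):

```python
from typing import Optional, Tuple

# Schwellen und Aktionen gemaess obiger Tabelle, absteigend sortiert.
ESCALATION_STAGES = [
    (45, "Benachrichtigung Compliance Officer"),
    (30, "Benachrichtigung Management"),
    (14, "Benachrichtigung Teamleitung"),
    (7, "Erinnerung an Mitarbeiter"),
]


def escalation_stage(days_overdue: int) -> Optional[Tuple[int, str]]:
    """Liefert (Stufe, Aktion) fuer die hoechste erreichte Eskalationsstufe,
    oder None, wenn noch keine Schwelle ueberschritten ist."""
    for index, (threshold, action) in enumerate(ESCALATION_STAGES):
        if days_overdue >= threshold:
            return (len(ESCALATION_STAGES) - index, action)
    return None
```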
### Zertifikate
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `POST` | `/sdk/v1/training/certificates/generate/:assignmentId` | Zertifikat generieren |
| `GET` | `/sdk/v1/training/certificates` | Alle Zertifikate des Tenants |
| `GET` | `/sdk/v1/training/certificates/:id/pdf` | Zertifikat als PDF herunterladen |
| `GET` | `/sdk/v1/training/certificates/:id/verify` | Zertifikat verifizieren |
**Voraussetzungen:** Status `completed` UND `quiz_passed = true`. Das PDF wird im Querformat (A4 Landscape) generiert und enthaelt Modul-Titel, Benutzer-Name, Datum und eine eindeutige Zertifikats-ID.
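Die Voraussetzungspruefung entspricht konzeptionell dieser Skizze (Python zur Illustration; der Service ist in Go implementiert, Feldnamen an `quiz_passed` angelehnt):

```python
from typing import List


def certificate_blockers(status: str, quiz_passed: bool) -> List[str]:
    """Leere Liste = Zertifikat kann generiert werden,
    sonst Liste der fehlenden Voraussetzungen."""
    blockers = []
    if status != "completed":
        blockers.append("Schulung nicht abgeschlossen")
    if not quiz_passed:
        blockers.append("Quiz nicht bestanden")
    return blockers
```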
### Training Blocks (Controls → Module)
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sdk/v1/training/blocks` | Alle Block-Konfigurationen |
| `POST` | `/sdk/v1/training/blocks` | Block-Konfiguration erstellen |
| `GET` | `/sdk/v1/training/blocks/:id` | Block-Konfiguration laden |
| `PUT` | `/sdk/v1/training/blocks/:id` | Block-Konfiguration aktualisieren |
| `DELETE` | `/sdk/v1/training/blocks/:id` | Block-Konfiguration loeschen |
| `POST` | `/sdk/v1/training/blocks/:id/preview` | Vorschau: Welche Controls/Module? |
| `POST` | `/sdk/v1/training/blocks/:id/generate` | Module generieren (Content + CTM) |
| `GET` | `/sdk/v1/training/blocks/:id/controls` | Verlinkte Controls anzeigen |
### Canonical Controls
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sdk/v1/training/canonical/controls` | Controls auflisten (Filter: domain, category, severity, target_audience) |
| `GET` | `/sdk/v1/training/canonical/meta` | Metadaten (Domains, Kategorien, Audiences mit Counts) |
### Statistiken & Audit
| Methode | Pfad | Beschreibung |
|---------|------|--------------|
| `GET` | `/sdk/v1/training/stats` | Ueberblick-Statistiken |
| `GET` | `/sdk/v1/training/audit-log` | Audit-Log (Filter: `action`, `entity_type`, `limit`, `offset`) |
---
@@ -118,7 +198,7 @@ seq 4700: Academy (Compliance Academy — manuelle Schulungen)
seq 4800: Training Engine (KI-generierte Schulungen, Matrix, Quiz)
```
Das Training-Modul erweitert die Academy um KI-generierte Inhalte. Waehrend die Academy einfache Schulungseinheiten verwaltet, bietet die Training Engine automatische Inhaltsgenerierung, eine Rollenmatrix und eine vollstaendige Quiz-Engine.
---
@@ -128,25 +208,39 @@ KI-generierte Inhalte werden via `compliance-tts-service` (Port 8095) in Audio u
- **Audio:** Piper TTS → MP3 (Modell: `de_DE-thorsten-high.onnx`)
- **Video:** FFmpeg → MP4 (Skript + Stimme + Untertitel)
- **Storage:** S3-kompatibles Object Storage (Hetzner, TLS)
- **Streaming:** `/media/:id/stream` → 307 Redirect zu MinIO Presigned URL
```
AudioPlayer → /sdk/v1/training/media/:mediaId/stream
VideoPlayer → /sdk/v1/training/media/:mediaId/stream
TTS Service → POST /presigned-url (returns pre-signed MinIO URL)
```
---
## Frontend
### Admin-Frontend
**URL:** `https://macmini:3007/sdk/training`
6-Tab-Oberflaeche mit:
- Statistik-Dashboard (Abschlussquote, offene Schulungen, Deadlines)
- Modul-CRUD mit Regulierungs-Badges und Loeschfunktion
- Interaktive Rollenmatrix (Checkboxen)
- Fortschritts-Balken pro Assignment
- Eingebetteter Audio/Video-Player fuer KI-generierte Inhalte
### Learner-Portal
**URL:** `https://macmini:3007/sdk/training/learner`
4-Tab-Oberflaeche fuer Mitarbeiter:
- Meine Schulungen: Zuweisungsliste mit Status, Fortschritt, Deadline
- Schulungsinhalt: Markdown-Rendering, Audio-Player, Video-Player
- Quiz: Multiple-Choice mit Timer, Ergebnis-Anzeige, Bestehens-Logik
- Zertifikate: Uebersicht abgeschlossener Schulungen, PDF-Download
---
@@ -155,11 +249,29 @@ VideoPlayer → /sdk/v1/training/modules/:id/media (video)
Die Training Engine verwendet eigene Tabellen im `compliance` Schema:
```sql
training_modules -- Schulungsmodule (title, regulation, frequency, ...)
training_matrix -- Rollen-Modul-Zuordnungen (CTM)
training_assignments -- Zuweisungen (user, module, status, progress, deadline, certificate_id)
training_content -- KI-generierter Inhalt (markdown, summary, llm_model)
training_quiz_questions -- Quiz-Fragen pro Modul (options JSONB, correct_index)
training_quiz_attempts -- Eingereichte Antworten + Score
training_media -- Audio/Video-Dateien (bucket, object_key, status)
training_audit_log -- Unveraenderliches Audit-Log
training_block_configs -- Block-Konfigurationen (Filter, Prefix, Frequenz)
training_block_controls -- Verlinkte Canonical Controls pro Block
```
---
## Tests
```bash
# Handler-Tests (47 Tests)
cd ai-compliance-sdk && go test -v ./internal/api/handlers/ -run "TestGet|TestCreate|TestUpdate|TestDelete|..."
# Escalation + Content Generator Tests (43 Tests)
cd ai-compliance-sdk && go test -v ./internal/training/ -run "TestEscalation|TestBuildContent|TestParseQuiz|..."
# Block Generator Tests (14 Tests)
cd ai-compliance-sdk && go test -v ./internal/training/ -run "TestBlock"
```