feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries

Interactive Training Videos (CP-TRAIN): - DB migration 022: training_checkpoints + checkpoint_progress tables - NarratorScript generation via Anthropic (AI Teacher persona, German) - TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg) - 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress - InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking) - Learner portal integration with automatic completion on all checkpoints passed - 30 new tests (handler validation + grading logic + manifest/progress + seek protection) Training Blocks: - Block generator, block store, block config CRUD + preview/generate endpoints - Migration 021: training_blocks schema Control Generator + Canonical Library: - Control generator routes + service enhancements - Canonical control library helpers, sidebar entry - Citation backfill service + tests - CE libraries data (hazard, protection, evidence, lifecycle, components) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-16 21:41:48 +01:00
parent d2133dbfa2
commit 4f6bc8f6f6
50 changed files with 17299 additions and 198 deletions
@@ -22,6 +22,7 @@ Endpoints:

 from __future__ import annotations

+import json
 import logging
 from typing import Any, Optional

@@ -277,8 +278,8 @@ async def list_framework_controls(
            query += " AND category = :cat"
            params["cat"] = category
        if target_audience:
-            query += " AND target_audience = :ta"
-            params["ta"] = target_audience
+            query += " AND target_audience::jsonb @> (:ta)::jsonb"
+            params["ta"] = json.dumps([target_audience])

        query += " ORDER BY control_id"
        rows = db.execute(text(query), params).fetchall()
@@ -329,8 +330,8 @@ async def list_controls(
        query += " AND category = :cat"
        params["cat"] = category
    if target_audience:
-        query += " AND target_audience = :ta"
-        params["ta"] = target_audience
+        query += " AND target_audience LIKE :ta_pattern"
+        params["ta_pattern"] = f'%"{target_audience}"%'
    if source:
        if source == "__none__":
            query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"
@@ -398,8 +399,8 @@ async def count_controls(
        query += " AND category = :cat"
        params["cat"] = category
    if target_audience:
-        query += " AND target_audience = :ta"
-        params["ta"] = target_audience
+        query += " AND target_audience LIKE :ta_pattern"
+        params["ta_pattern"] = f'%"{target_audience}"%'
    if source:
        if source == "__none__":
            query += " AND (source_citation IS NULL OR source_citation->>'source' IS NULL OR source_citation->>'source' = '')"
@@ -26,6 +26,13 @@ from compliance.services.control_generator import (
    ControlGeneratorPipeline,
    GeneratorConfig,
    ALL_COLLECTIONS,
+    VALID_CATEGORIES,
+    VALID_DOMAINS,
+    _detect_category,
+    _detect_domain,
+    _llm_local,
+    _parse_llm_json,
+    CATEGORY_LIST_STR,
 )
 from compliance.services.citation_backfill import CitationBackfill, BackfillResult
 from compliance.services.rag_client import get_rag_client
@@ -42,6 +49,7 @@ class GenerateRequest(BaseModel):
    domain: Optional[str] = None
    collections: Optional[List[str]] = None
    max_controls: int = 50
+    max_chunks: int = 1000  # Default: process max 1000 chunks per job (respects document boundaries)
    batch_size: int = 5
    skip_web_search: bool = False
    dry_run: bool = False
@@ -57,6 +65,7 @@ class GenerateResponse(BaseModel):
    controls_needs_review: int = 0
    controls_too_close: int = 0
    controls_duplicates_found: int = 0
+    controls_qa_fixed: int = 0
    errors: list = []
    controls: list = []

@@ -132,6 +141,7 @@ async def start_generation(req: GenerateRequest):
        domain=req.domain,
        batch_size=req.batch_size,
        max_controls=req.max_controls,
+        max_chunks=req.max_chunks,
        skip_web_search=req.skip_web_search,
        dry_run=req.dry_run,
    )
@@ -338,6 +348,188 @@ async def review_control(control_id: str, req: ReviewRequest):
        db.close()


+class BulkReviewRequest(BaseModel):
+    release_state: str  # Filter: which controls to bulk-review
+    action: str  # "approve" or "reject"
+    new_state: Optional[str] = None  # Override target state
+
+
+@router.post("/generate/bulk-review")
+async def bulk_review(req: BulkReviewRequest):
+    """Bulk review all controls matching a release_state filter.
+
+    Example: reject all needs_review → sets them to deprecated.
+    """
+    if req.release_state not in ("needs_review", "too_close", "duplicate"):
+        raise HTTPException(status_code=400, detail=f"Invalid filter state: {req.release_state}")
+
+    if req.action == "approve":
+        target = req.new_state or "draft"
+    elif req.action == "reject":
+        target = "deprecated"
+    else:
+        raise HTTPException(status_code=400, detail=f"Unknown action: {req.action}")
+
+    if target not in ("draft", "review", "approved", "deprecated", "needs_review"):
+        raise HTTPException(status_code=400, detail=f"Invalid target state: {target}")
+
+    db = SessionLocal()
+    try:
+        result = db.execute(
+            text("""
+                UPDATE canonical_controls
+                SET release_state = :target, updated_at = NOW()
+                WHERE release_state = :source
+                RETURNING control_id
+            """),
+            {"source": req.release_state, "target": target},
+        )
+        affected = [row[0] for row in result]
+        db.commit()
+
+        return {
+            "action": req.action,
+            "source_state": req.release_state,
+            "target_state": target,
+            "affected_count": len(affected),
+        }
+    finally:
+        db.close()
+
+
+class QAReclassifyRequest(BaseModel):
+    limit: int = 100  # How many controls to reclassify per run
+    dry_run: bool = True  # Preview only by default
+    filter_category: Optional[str] = None  # Only reclassify controls of this category
+    filter_domain_prefix: Optional[str] = None  # Only reclassify controls with this prefix
+
+
+@router.post("/generate/qa-reclassify")
+async def qa_reclassify(req: QAReclassifyRequest):
+    """Run QA reclassification on existing controls using local LLM.
+
+    Finds controls where keyword-detection disagrees with current category/domain,
+    then uses Ollama to determine the correct classification.
+    """
+    db = SessionLocal()
+    try:
+        # Load controls to check
+        where_clauses = ["release_state NOT IN ('deprecated')"]
+        params = {"limit": req.limit}
+        if req.filter_category:
+            where_clauses.append("category = :cat")
+            params["cat"] = req.filter_category
+        if req.filter_domain_prefix:
+            where_clauses.append("control_id LIKE :prefix")
+            params["prefix"] = f"{req.filter_domain_prefix}-%"
+
+        where_sql = " AND ".join(where_clauses)
+        rows = db.execute(
+            text(f"""
+                SELECT id, control_id, title, objective, category,
+                       COALESCE(requirements::text, '[]') as requirements,
+                       COALESCE(source_original_text, '') as source_text
+                FROM canonical_controls
+                WHERE {where_sql}
+                ORDER BY created_at DESC
+                LIMIT :limit
+            """),
+            params,
+        ).fetchall()
+
+        results = {"checked": 0, "mismatches": 0, "fixes": [], "errors": []}
+
+        for row in rows:
+            results["checked"] += 1
+            control_id = row[1]
+            title = row[2]
+            objective = row[3] or ""
+            current_category = row[4]
+            source_text = row[6] or objective
+
+            # Keyword detection on source text
+            kw_category = _detect_category(source_text) or _detect_category(objective)
+            kw_domain = _detect_domain(source_text)
+            current_prefix = control_id.split("-")[0] if "-" in control_id else ""
+
+            # Skip if keyword detection agrees with current classification
+            if kw_category == current_category and kw_domain == current_prefix:
+                continue
+
+            results["mismatches"] += 1
+
+            # Ask Ollama to arbitrate
+            try:
+                reqs_text = ""
+                try:
+                    reqs = json.loads(row[5])
+                    if isinstance(reqs, list):
+                        reqs_text = ", ".join(str(r) for r in reqs[:3])
+                except Exception:
+                    pass
+
+                prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.
+
+Titel: {title[:100]}
+Ziel: {objective[:200]}
+Anforderungen: {reqs_text[:200]}
+
+Aktuelle Zuordnung: domain={current_prefix}, category={current_category}
+Keyword-Erkennung: domain={kw_domain}, category={kw_category}
+
+Welche Zuordnung ist korrekt? Antworte NUR als JSON:
+{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}
+
+Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
+Kategorien: {CATEGORY_LIST_STR}"""
+
+                raw = await _llm_local(prompt)
+                data = _parse_llm_json(raw)
+                if not data:
+                    continue
+
+                qa_domain = data.get("domain", "").upper()
+                qa_category = data.get("category", "")
+                reason = data.get("reason", "")
+
+                fix_entry = {
+                    "control_id": control_id,
+                    "title": title[:80],
+                    "old_category": current_category,
+                    "old_domain": current_prefix,
+                    "new_category": qa_category if qa_category in VALID_CATEGORIES else current_category,
+                    "new_domain": qa_domain if qa_domain in VALID_DOMAINS else current_prefix,
+                    "reason": reason,
+                }
+
+                category_changed = qa_category in VALID_CATEGORIES and qa_category != current_category
+
+                if category_changed and not req.dry_run:
+                    db.execute(
+                        text("""
+                            UPDATE canonical_controls
+                            SET category = :category, updated_at = NOW()
+                            WHERE id = :id
+                        """),
+                        {"id": row[0], "category": qa_category},
+                    )
+                    fix_entry["applied"] = True
+                else:
+                    fix_entry["applied"] = False
+
+                results["fixes"].append(fix_entry)
+
+            except Exception as e:
+                results["errors"].append({"control_id": control_id, "error": str(e)})
+
+        if not req.dry_run:
+            db.commit()
+
+        return results
+    finally:
+        db.close()
+
+
@router.get("/generate/processed-stats")
 async def get_processed_stats():
    """Get processing statistics per collection."""
@@ -39,7 +39,6 @@ router = APIRouter(tags=["extraction"])

 ALL_COLLECTIONS = [
    "bp_compliance_ce",          # BSI-TR documents — primary Prüfaspekte source
-    "bp_compliance_recht",       # Legal texts (GDPR, AI Act, ...)
    "bp_compliance_gesetze",     # German laws
    "bp_compliance_datenschutz", # Data protection documents
    "bp_dsfa_corpus",            # DSFA corpus
@@ -0,0 +1,437 @@
+"""
+Citation Backfill Service — enrich existing controls with article/paragraph provenance.
+
+3-tier matching strategy:
+  Tier 1 — Hash match:  sha256(source_original_text) → RAG chunk lookup
+  Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
+  Tier 3 — Ollama LLM:  ask local LLM to identify article/paragraph from text
+"""
+
+import hashlib
+import json
+import logging
+import os
+import re
+from dataclasses import dataclass, field
+from datetime import datetime, timezone
+from typing import Optional
+
+import httpx
+from sqlalchemy import text
+from sqlalchemy.orm import Session
+
+from .rag_client import ComplianceRAGClient, RAGSearchResult
+
+logger = logging.getLogger(__name__)
+
+OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
+OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
+LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
+
+ALL_COLLECTIONS = [
+    "bp_compliance_ce",
+    "bp_compliance_gesetze",
+    "bp_compliance_datenschutz",
+    "bp_dsfa_corpus",
+    "bp_legal_templates",
+]
+
+BACKFILL_SYSTEM_PROMPT = (
+    "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
+    "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
+)
+
+# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
+_SOURCE_ARTICLE_RE = re.compile(
+    r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
+)
+
+
+@dataclass
+class MatchResult:
+    article: str
+    paragraph: str
+    method: str  # "hash", "regex", "llm"
+
+
+@dataclass
+class BackfillResult:
+    total_controls: int = 0
+    matched_hash: int = 0
+    matched_regex: int = 0
+    matched_llm: int = 0
+    unmatched: int = 0
+    updated: int = 0
+    errors: list = field(default_factory=list)
+
+
+class CitationBackfill:
+    """Backfill article/paragraph into existing control source_citations."""
+
+    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
+        self.db = db
+        self.rag = rag_client
+        self._rag_index: dict[str, RAGSearchResult] = {}
+
+    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
+        """Main entry: iterate controls missing article/paragraph, match to RAG, update."""
+        result = BackfillResult()
+
+        # Load controls needing backfill
+        controls = self._load_controls_needing_backfill(limit)
+        result.total_controls = len(controls)
+        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))
+
+        if not controls:
+            return result
+
+        # Collect hashes we need to find — only build index for controls with source text
+        needed_hashes: set[str] = set()
+        for ctrl in controls:
+            src = ctrl.get("source_original_text")
+            if src:
+                needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())
+
+        if needed_hashes:
+            # Build targeted RAG index — only scroll collections that our controls reference
+            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
+            await self._build_rag_index_targeted(controls)
+            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
+        else:
+            logger.info("No source_original_text found — skipping RAG index build")
+
+        # Process each control
+        for i, ctrl in enumerate(controls):
+            if i > 0 and i % 100 == 0:
+                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)
+
+            try:
+                match = await self._match_control(ctrl)
+                if match:
+                    if match.method == "hash":
+                        result.matched_hash += 1
+                    elif match.method == "regex":
+                        result.matched_regex += 1
+                    elif match.method == "llm":
+                        result.matched_llm += 1
+
+                    if not dry_run:
+                        self._update_control(ctrl, match)
+                        result.updated += 1
+                    else:
+                        logger.debug(
+                            "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
+                            ctrl["control_id"], match.article, match.paragraph, match.method,
+                        )
+                else:
+                    result.unmatched += 1
+
+            except Exception as e:
+                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
+                logger.error(error_msg)
+                result.errors.append(error_msg)
+
+        if not dry_run:
+            try:
+                self.db.commit()
+            except Exception as e:
+                logger.error("Backfill commit failed: %s", e)
+                result.errors.append(f"Commit failed: {e}")
+
+        logger.info(
+            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
+            result.total_controls, result.matched_hash, result.matched_regex,
+            result.matched_llm, result.unmatched, result.updated,
+        )
+        return result
+
+    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
+        """Load controls where source_citation exists but lacks separate 'article' key."""
+        query = """
+            SELECT id, control_id, source_citation, source_original_text,
+                   generation_metadata, license_rule
+            FROM canonical_controls
+            WHERE license_rule IN (1, 2)
+              AND source_citation IS NOT NULL
+              AND (
+                  source_citation->>'article' IS NULL
+                  OR source_citation->>'article' = ''
+              )
+            ORDER BY control_id
+        """
+        if limit > 0:
+            query += f" LIMIT {limit}"
+
+        result = self.db.execute(text(query))
+        cols = result.keys()
+        controls = []
+        for row in result:
+            ctrl = dict(zip(cols, row))
+            ctrl["id"] = str(ctrl["id"])
+            # Parse JSON fields
+            for jf in ("source_citation", "generation_metadata"):
+                if isinstance(ctrl.get(jf), str):
+                    try:
+                        ctrl[jf] = json.loads(ctrl[jf])
+                    except (json.JSONDecodeError, TypeError):
+                        ctrl[jf] = {}
+            controls.append(ctrl)
+        return controls
+
+    async def _build_rag_index_targeted(self, controls: list[dict]):
+        """Build RAG index by scrolling only collections relevant to our controls.
+
+        Uses regulation codes from generation_metadata to identify which collections
+        to search, falling back to all collections only if needed.
+        """
+        # Determine which collections are relevant based on regulation codes
+        regulation_to_collection = self._map_regulations_to_collections(controls)
+        collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)
+
+        logger.info("Targeted index: searching %d collections: %s",
+                     len(collections_to_search), ", ".join(collections_to_search))
+
+        for collection in collections_to_search:
+            offset = None
+            page = 0
+            seen_offsets: set[str] = set()
+            while True:
+                chunks, next_offset = await self.rag.scroll(
+                    collection=collection, offset=offset, limit=200,
+                )
+                if not chunks:
+                    break
+                for chunk in chunks:
+                    if chunk.text and len(chunk.text.strip()) >= 50:
+                        h = hashlib.sha256(chunk.text.encode()).hexdigest()
+                        self._rag_index[h] = chunk
+                page += 1
+                if page % 50 == 0:
+                    logger.info("Indexing %s: page %d (%d chunks so far)",
+                                collection, page, len(self._rag_index))
+                if not next_offset:
+                    break
+                if next_offset in seen_offsets:
+                    logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
+                    break
+                seen_offsets.add(next_offset)
+                offset = next_offset
+
+            logger.info("Indexed collection %s: %d pages", collection, page)
+
+    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
+        """Map regulation codes from controls to likely Qdrant collections."""
+        # Heuristic: regulation code prefix → collection
+        collection_map = {
+            "eu_": "bp_compliance_gesetze",
+            "dsgvo": "bp_compliance_datenschutz",
+            "bdsg": "bp_compliance_gesetze",
+            "ttdsg": "bp_compliance_gesetze",
+            "nist_": "bp_compliance_ce",
+            "owasp": "bp_compliance_ce",
+            "bsi_": "bp_compliance_ce",
+            "enisa": "bp_compliance_ce",
+            "at_": "bp_compliance_recht",
+            "fr_": "bp_compliance_recht",
+            "es_": "bp_compliance_recht",
+        }
+        result: dict[str, str] = {}
+        for ctrl in controls:
+            meta = ctrl.get("generation_metadata") or {}
+            reg = meta.get("source_regulation", "")
+            if not reg:
+                continue
+            for prefix, coll in collection_map.items():
+                if reg.startswith(prefix):
+                    result[reg] = coll
+                    break
+            else:
+                # Unknown regulation — search all
+                for coll in ALL_COLLECTIONS:
+                    result[f"_all_{coll}"] = coll
+        return result
+
+    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
+        """3-tier matching: hash → regex → LLM."""
+
+        # Tier 1: Hash match against RAG index
+        source_text = ctrl.get("source_original_text")
+        if source_text:
+            h = hashlib.sha256(source_text.encode()).hexdigest()
+            chunk = self._rag_index.get(h)
+            if chunk and (chunk.article or chunk.paragraph):
+                return MatchResult(
+                    article=chunk.article or "",
+                    paragraph=chunk.paragraph or "",
+                    method="hash",
+                )
+
+        # Tier 2: Regex parse concatenated source
+        citation = ctrl.get("source_citation") or {}
+        source_str = citation.get("source", "")
+        parsed = _parse_concatenated_source(source_str)
+        if parsed and parsed["article"]:
+            return MatchResult(
+                article=parsed["article"],
+                paragraph="",  # Regex can't extract paragraph from concatenated format
+                method="regex",
+            )
+
+        # Tier 3: Ollama LLM
+        if source_text:
+            return await self._llm_match(ctrl)
+
+        return None
+
+    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
+        """Use Ollama to identify article/paragraph from source text."""
+        citation = ctrl.get("source_citation") or {}
+        regulation_name = citation.get("source", "")
+        metadata = ctrl.get("generation_metadata") or {}
+        regulation_code = metadata.get("source_regulation", "")
+        source_text = ctrl.get("source_original_text", "")
+
+        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.
+
+Gesetz: {regulation_name} (Code: {regulation_code})
+
+Text:
+---
+{source_text[:2000]}
+---
+
+Antworte NUR mit JSON:
+{{"article": "Art. XX", "paragraph": "Abs. Y"}}
+
+Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
+Falls kein Artikel erkennbar ist, setze article auf "".
+Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
+
+        try:
+            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
+            data = _parse_json(raw)
+            if data and (data.get("article") or data.get("paragraph")):
+                return MatchResult(
+                    article=data.get("article", ""),
+                    paragraph=data.get("paragraph", ""),
+                    method="llm",
+                )
+        except Exception as e:
+            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)
+
+        return None
+
+    def _update_control(self, ctrl: dict, match: MatchResult):
+        """Update source_citation and generation_metadata in DB."""
+        citation = ctrl.get("source_citation") or {}
+
+        # Clean the source name: remove concatenated article if present
+        source_str = citation.get("source", "")
+        parsed = _parse_concatenated_source(source_str)
+        if parsed:
+            citation["source"] = parsed["name"]
+
+        # Add separate article/paragraph fields
+        citation["article"] = match.article
+        citation["paragraph"] = match.paragraph
+
+        # Update generation_metadata
+        metadata = ctrl.get("generation_metadata") or {}
+        if match.article:
+            metadata["source_article"] = match.article
+        metadata["source_paragraph"] = match.paragraph
+        metadata["backfill_method"] = match.method
+        metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()
+
+        self.db.execute(
+            text("""
+                UPDATE canonical_controls
+                SET source_citation = :citation,
+                    generation_metadata = :metadata,
+                    updated_at = NOW()
+                WHERE id = CAST(:id AS uuid)
+            """),
+            {
+                "id": ctrl["id"],
+                "citation": json.dumps(citation),
+                "metadata": json.dumps(metadata),
+            },
+        )
+
+
+def _parse_concatenated_source(source: str) -> Optional[dict]:
+    """Parse 'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}.
+
+    Also handles '§' format: 'BDSG § 42' → {name: 'BDSG', article: '§ 42'}.
+    """
+    if not source:
+        return None
+
+    # Try Art./Artikel pattern
+    m = _SOURCE_ARTICLE_RE.match(source)
+    if m:
+        return {"name": m.group(1).strip(), "article": m.group(2).strip()}
+
+    # Try § pattern
+    m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
+    if m2:
+        return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}
+
+    return None
+
+
+async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
+    """Call Ollama chat API for backfill matching."""
+    messages = []
+    if system_prompt:
+        messages.append({"role": "system", "content": system_prompt})
+    messages.append({"role": "user", "content": prompt})
+
+    payload = {
+        "model": OLLAMA_MODEL,
+        "messages": messages,
+        "stream": False,
+        "options": {"num_predict": 256},
+        "think": False,
+    }
+
+    try:
+        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
+            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
+            if resp.status_code != 200:
+                logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
+                return ""
+            data = resp.json()
+            msg = data.get("message", {})
+            if isinstance(msg, dict):
+                return msg.get("content", "")
+            return data.get("response", str(msg))
+    except Exception as e:
+        logger.error("Ollama backfill request failed: %s", e)
+        return ""
+
+
+def _parse_json(raw: str) -> Optional[dict]:
+    """Extract JSON object from LLM output."""
+    if not raw:
+        return None
+    # Try direct parse
+    try:
+        return json.loads(raw)
+    except json.JSONDecodeError:
+        pass
+    # Try extracting from markdown code block
+    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
+    if m:
+        try:
+            return json.loads(m.group(1))
+        except json.JSONDecodeError:
+            pass
+    # Try finding first { ... }
+    m = re.search(r"\{[^{}]*\}", raw)
+    if m:
+        try:
+            return json.loads(m.group(0))
+        except json.JSONDecodeError:
+            pass
+    return None
@@ -44,6 +44,7 @@ logger = logging.getLogger(__name__)

 SDK_URL = os.getenv("SDK_URL", "http://ai-compliance-sdk:8090")
 EMBEDDING_URL = os.getenv("EMBEDDING_URL", "http://embedding-service:8087")
+QDRANT_URL = os.getenv("QDRANT_URL", "http://host.docker.internal:6333")
 ANTHROPIC_API_KEY = os.getenv("ANTHROPIC_API_KEY", "")
 ANTHROPIC_MODEL = os.getenv("CONTROL_GEN_ANTHROPIC_MODEL", "claude-sonnet-4-6")
 OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
@@ -54,7 +55,6 @@ HARMONIZATION_THRESHOLD = 0.85  # Cosine similarity above this = duplicate

 ALL_COLLECTIONS = [
    "bp_compliance_ce",
-    "bp_compliance_recht",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
@@ -312,6 +312,12 @@ CATEGORY_KEYWORDS = {
               "hygiene", "infektionsschutz", "pflege"],
 }

+VALID_CATEGORIES = set(CATEGORY_KEYWORDS.keys())
+VALID_DOMAINS = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
+                 "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
+
+CATEGORY_LIST_STR = ", ".join(sorted(VALID_CATEGORIES))
+
 VERIFICATION_KEYWORDS = {
    "code_review": ["source code", "code review", "static analysis", "sast", "dast",
                    "dependency check", "quellcode", "codeanalyse", "secure coding",
@@ -373,6 +379,7 @@ class GeneratorConfig(BaseModel):
    domain: Optional[str] = None
    batch_size: int = 5
    max_controls: int = 0  # 0 = unlimited (process ALL chunks)
+    max_chunks: int = 0  # 0 = unlimited; >0 = stop after N chunks (respects document boundaries)
    skip_processed: bool = True
    skip_web_search: bool = False
    dry_run: bool = False
@@ -418,6 +425,7 @@ class GeneratorResult:
    controls_needs_review: int = 0
    controls_too_close: int = 0
    controls_duplicates_found: int = 0
+    controls_qa_fixed: int = 0
    chunks_skipped_prefilter: int = 0
    errors: list = field(default_factory=list)
    controls: list = field(default_factory=list)
@@ -713,7 +721,7 @@ class ControlGeneratorPipeline:
    async def _scan_rag(self, config: GeneratorConfig) -> list[RAGSearchResult]:
        """Scroll through ALL chunks in RAG collections.

-        Uses the scroll endpoint to iterate over every chunk (not just top-K search).
+        Uses DIRECT Qdrant scroll API (bypasses Go SDK which has offset cycling bugs).
        Filters out already-processed chunks by hash.
        """
        collections = config.collections or ALL_COLLECTIONS
@@ -734,80 +742,105 @@ class ControlGeneratorPipeline:
        seen_hashes: set[str] = set()

        for collection in collections:
-            offset = None
            page = 0
            collection_total = 0
            collection_new = 0
-            max_pages = 1000  # Safety limit: 1000 pages × 200 = 200K chunks max per collection
-            prev_chunk_count = -1  # Track stalls (same count means no progress)
-            stall_count = 0
+            qdrant_offset = None  # Qdrant uses point ID as offset

-            while page < max_pages:
-                chunks, next_offset = await self.rag.scroll(
-                    collection=collection,
-                    offset=offset,
-                    limit=200,
-                )
+            while True:
+                # Direct Qdrant scroll API — bypasses Go SDK offset cycling bug
+                try:
+                    scroll_body: dict = {
+                        "limit": 250,
+                        "with_payload": True,
+                        "with_vector": False,
+                    }
+                    if qdrant_offset is not None:
+                        scroll_body["offset"] = qdrant_offset

-                if not chunks:
+                    async with httpx.AsyncClient(timeout=30.0) as client:
+                        resp = await client.post(
+                            f"{QDRANT_URL}/collections/{collection}/points/scroll",
+                            json=scroll_body,
+                        )
+                        if resp.status_code != 200:
+                            logger.error("Qdrant scroll %s failed: %d %s", collection, resp.status_code, resp.text[:200])
+                            break
+                        data = resp.json().get("result", {})
+                        points = data.get("points", [])
+                        next_page_offset = data.get("next_page_offset")
+                except Exception as e:
+                    logger.error("Qdrant scroll error for %s: %s", collection, e)
                    break

-                collection_total += len(chunks)
+                if not points:
+                    break

-                for chunk in chunks:
-                    if not chunk.text or len(chunk.text.strip()) < 50:
-                        continue  # Skip empty/tiny chunks
+                collection_total += len(points)

-                    h = hashlib.sha256(chunk.text.encode()).hexdigest()
+                for point in points:
+                    payload = point.get("payload", {})
+                    # Different collections use different field names for text
+                    chunk_text = (payload.get("chunk_text", "")
+                                  or payload.get("content", "")
+                                  or payload.get("text", "")
+                                  or payload.get("page_content", ""))
+                    if not chunk_text or len(chunk_text.strip()) < 50:
+                        continue
+
+                    h = hashlib.sha256(chunk_text.encode()).hexdigest()

-                    # Skip duplicates (same text in multiple collections)
                    if h in seen_hashes:
                        continue
                    seen_hashes.add(h)

-                    # Skip already-processed
                    if h in processed_hashes:
                        continue

+                    # Convert Qdrant point to RAGSearchResult
+                    # Handle varying payload schemas across collections
+                    reg_code = (payload.get("regulation_id", "")
+                                or payload.get("regulation_code", "")
+                                or payload.get("source_id", "")
+                                or payload.get("source_code", ""))
+                    reg_name = (payload.get("regulation_name_de", "")
+                                or payload.get("regulation_name", "")
+                                or payload.get("source_name", "")
+                                or payload.get("guideline_name", "")
+                                or payload.get("document_title", "")
+                                or payload.get("filename", ""))
+                    reg_short = (payload.get("regulation_short", "")
+                                 or reg_code)
+                    chunk = RAGSearchResult(
+                        text=chunk_text,
+                        regulation_code=reg_code,
+                        regulation_name=reg_name,
+                        regulation_short=reg_short,
+                        category=payload.get("category", "") or payload.get("data_type", ""),
+                        article=payload.get("article", "") or payload.get("section_title", "") or payload.get("section", ""),
+                        paragraph=payload.get("paragraph", ""),
+                        source_url=payload.get("source_url", "") or payload.get("source", "") or payload.get("url", ""),
+                        score=0.0,
+                        collection=collection,
+                    )
                    all_results.append(chunk)
                    collection_new += 1

                page += 1
-                if page % 50 == 0:
+                if page % 100 == 0:
                    logger.info(
-                        "Scrolling %s: page %d, %d total chunks, %d new unprocessed",
+                        "Scrolling %s (direct Qdrant): page %d, %d total chunks, %d new unprocessed",
                        collection, page, collection_total, collection_new,
                    )

                # Stop conditions
-                if not next_offset:
-                    break
+                if next_page_offset is None:
+                    break  # Qdrant returns null when no more pages

-                # Detect stalls: if no NEW unique chunks found for several pages,
-                # we've likely cycled through all chunks in this collection.
-                # (Safer than offset dedup which breaks with mixed Qdrant ID types)
-                if collection_new == prev_chunk_count:
-                    stall_count += 1
-                    if stall_count >= 5:
-                        logger.warning(
-                            "Scroll stalled in %s at page %d — no new unique chunks for 5 pages (%d total, %d new) — stopping",
-                            collection, page, collection_total, collection_new,
-                        )
-                        break
-                else:
-                    stall_count = 0
-                prev_chunk_count = collection_new
-
-                offset = next_offset
-
-            if page >= max_pages:
-                logger.warning(
-                    "Collection %s: reached max_pages limit (%d). %d chunks scrolled.",
-                    collection, max_pages, collection_total,
-                )
+                qdrant_offset = next_page_offset

            logger.info(
-                "Collection %s: %d total chunks scrolled, %d new unprocessed",
+                "Collection %s: %d total chunks scrolled (direct Qdrant), %d new unprocessed",
                collection, collection_total, collection_new,
            )

@@ -857,6 +890,11 @@ Gib JSON zurück mit diesen Feldern:
 - evidence: Liste von Nachweisdokumenten (Strings)
 - severity: low/medium/high/critical
 - tags: Liste von Tags
+- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
+- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
+- source_article: Artikel-/Paragraphen-Referenz aus dem Text (z.B. "Artikel 10", "§ 42"). Leer lassen wenn nicht erkennbar.
+- source_paragraph: Absatz-Referenz aus dem Text (z.B. "Absatz 5", "Nr. 2"). Leer lassen wenn nicht erkennbar.

 Text: {chunk.text[:2000]}
 Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""
@@ -868,24 +906,29 @@ Quelle: {chunk.regulation_name} ({chunk.regulation_code}), {chunk.article}"""

        domain = _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
+        llm_article = str(data.get("source_article", "")).strip()
+        llm_paragraph = str(data.get("source_paragraph", "")).strip()
+        effective_article = llm_article or chunk.article or ""
+        effective_paragraph = llm_paragraph or chunk.paragraph or ""
        control.license_rule = 1
        control.source_original_text = chunk.text
        control.source_citation = {
            "source": chunk.regulation_name,
-            "article": chunk.article or "",
-            "paragraph": chunk.paragraph or "",
+            "article": effective_article,
+            "paragraph": effective_paragraph,
            "license": license_info.get("license", ""),
            "url": chunk.source_url or "",
        }
        control.customer_visible = True
        control.verification_method = _detect_verification_method(chunk.text)
-        control.category = _detect_category(chunk.text)
+        if not control.category:
+            control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "structured",
            "license_rule": 1,
            "source_regulation": chunk.regulation_code,
-            "source_article": chunk.article,
-            "source_paragraph": chunk.paragraph,
+            "source_article": effective_article,
+            "source_paragraph": effective_paragraph,
        }
        return control

@@ -910,6 +953,11 @@ Gib JSON zurück mit diesen Feldern:
 - evidence: Liste von Nachweisdokumenten (Strings)
 - severity: low/medium/high/critical
 - tags: Liste von Tags
+- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
+- target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")
+- source_article: Artikel-/Paragraphen-Referenz aus dem Text (z.B. "Artikel 10", "§ 42"). Leer lassen wenn nicht erkennbar.
+- source_paragraph: Absatz-Referenz aus dem Text (z.B. "Absatz 5", "Nr. 2"). Leer lassen wenn nicht erkennbar.

 Text: {chunk.text[:2000]}
 Quelle: {chunk.regulation_name}, {chunk.article}"""
@@ -921,25 +969,30 @@ Quelle: {chunk.regulation_name}, {chunk.article}"""

        domain = _detect_domain(chunk.text)
        control = self._build_control_from_json(data, domain)
+        llm_article = str(data.get("source_article", "")).strip()
+        llm_paragraph = str(data.get("source_paragraph", "")).strip()
+        effective_article = llm_article or chunk.article or ""
+        effective_paragraph = llm_paragraph or chunk.paragraph or ""
        control.license_rule = 2
        control.source_original_text = chunk.text
        control.source_citation = {
            "source": chunk.regulation_name,
-            "article": chunk.article or "",
-            "paragraph": chunk.paragraph or "",
+            "article": effective_article,
+            "paragraph": effective_paragraph,
            "license": license_info.get("license", ""),
            "license_notice": attribution,
            "url": chunk.source_url or "",
        }
        control.customer_visible = True
        control.verification_method = _detect_verification_method(chunk.text)
-        control.category = _detect_category(chunk.text)
+        if not control.category:
+            control.category = _detect_category(chunk.text)
        control.generation_metadata = {
            "processing_path": "structured",
            "license_rule": 2,
            "source_regulation": chunk.regulation_code,
-            "source_article": chunk.article,
-            "source_paragraph": chunk.paragraph,
+            "source_article": effective_article,
+            "source_paragraph": effective_paragraph,
        }
        return control

@@ -968,7 +1021,8 @@ Gib JSON zurück mit diesen Feldern:
 - evidence: Liste von Nachweisdokumenten (Strings)
 - severity: low/medium/high/critical
 - tags: Liste von Tags (eigene Begriffe)
- domain: Fachgebiet als Kuerzel (AUTH, CRYP, NET, DATA, LOG, ACC, SEC, INC, AI, COMP, GOV, LAB, FIN, TRD, ENV, HLT)
+- domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
 - target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "oeffentlicher_dienst")"""

        raw = await _llm_chat(prompt, REFORM_SYSTEM_PROMPT)
@@ -982,7 +1036,8 @@ Gib JSON zurück mit diesen Feldern:
        control.source_citation = None        # NEVER cite source
        control.customer_visible = False      # Only our formulation
        control.verification_method = _detect_verification_method(chunk.text)
-        control.category = _detect_category(chunk.text)
+        if not control.category:
+            control.category = _detect_category(chunk.text)
        # generation_metadata: NO source names, NO original texts
        control.generation_metadata = {
            "processing_path": "llm_reform",
@@ -1046,7 +1101,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
 - severity: low/medium/high/critical
 - tags: Liste von Tags
 - domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
 - target_audience: Liste der Zielgruppen fuer die dieses Control relevant ist. Moegliche Werte: "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "vertrieb", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst"
+- source_article: Artikel-/Paragraphen-Referenz aus dem Text extrahieren (z.B. "Artikel 10", "Art. 5", "§ 42", "Section 3"). Leer lassen wenn nicht erkennbar.
+- source_paragraph: Absatz-Referenz aus dem Text extrahieren (z.B. "Absatz 5", "Abs. 3", "Nr. 2", "(1)"). Leer lassen wenn nicht erkennbar.

 {joined}"""

@@ -1071,26 +1129,32 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
            domain = _detect_domain(chunk.text)
            control = self._build_control_from_json(data, domain)
            control.license_rule = lic["rule"]
+            # Use LLM-extracted article/paragraph, fall back to chunk metadata
+            llm_article = str(data.get("source_article", "")).strip()
+            llm_paragraph = str(data.get("source_paragraph", "")).strip()
+            effective_article = llm_article or chunk.article or ""
+            effective_paragraph = llm_paragraph or chunk.paragraph or ""
            if lic["rule"] in (1, 2):
                control.source_original_text = chunk.text
                control.source_citation = {
                    "source": chunk.regulation_name,
-                    "article": chunk.article or "",
-                    "paragraph": chunk.paragraph or "",
+                    "article": effective_article,
+                    "paragraph": effective_paragraph,
                    "license": lic.get("license", ""),
                    "license_notice": lic.get("attribution", ""),
                    "url": chunk.source_url or "",
                }
                control.customer_visible = True
            control.verification_method = _detect_verification_method(chunk.text)
-            control.category = _detect_category(chunk.text)
+            if not control.category:
+                control.category = _detect_category(chunk.text)
            same_doc = len(set(c.regulation_code for c in chunks)) == 1
            control.generation_metadata = {
                "processing_path": "structured_batch",
                "license_rule": lic["rule"],
                "source_regulation": chunk.regulation_code,
-                "source_article": chunk.article,
-                "source_paragraph": chunk.paragraph,
+                "source_article": effective_article,
+                "source_paragraph": effective_paragraph,
                "batch_size": len(chunks),
                "document_grouped": same_doc,
            }
@@ -1133,6 +1197,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
 - severity: low/medium/high/critical
 - tags: Liste von Tags (eigene Begriffe)
 - domain: Fachgebiet als Kuerzel (AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden/Verwaltung, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe/Handelsrecht, ENV=Umwelt, HLT=Gesundheit)
+- category: Inhaltliche Kategorie — MUSS zum domain passen. Moegliche Werte: {CATEGORY_LIST_STR}
 - target_audience: Liste der Zielgruppen (z.B. "unternehmen", "behoerden", "entwickler", "datenschutzbeauftragte", "geschaeftsfuehrung", "it-abteilung", "rechtsabteilung", "compliance-officer", "personalwesen", "einkauf", "produktion", "gesundheitswesen", "finanzwesen", "oeffentlicher_dienst")

 {joined}"""
@@ -1159,7 +1224,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
            control.source_citation = None
            control.customer_visible = False
            control.verification_method = _detect_verification_method(chunk.text)
-            control.category = _detect_category(chunk.text)
+            if not control.category:
+                control.category = _detect_category(chunk.text)
            control.generation_metadata = {
                "processing_path": "llm_reform_batch",
                "license_rule": 3,
@@ -1209,6 +1275,9 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
                all_controls[orig_idx] = ctrl

        # Post-process all controls: harmonization + anchor search
+        # NOTE: QA validation runs as a separate batch AFTER generation (qa-reclassify endpoint)
+        # to avoid competing with Ollama prefilter for resources.
+        qa_fixed_count = 0
        final: list[Optional[GeneratedControl]] = []
        for i in range(len(batch_items)):
            control = all_controls.get(i)
@@ -1245,7 +1314,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
            else:
                control.release_state = "needs_review"

-            # Control ID — prefer LLM-assigned domain over keyword detection
+            # Control ID — prefer QA-corrected or LLM-assigned domain over keyword detection
            domain = (control.generation_metadata.get("_effective_domain")
                      or config.domain
                      or _detect_domain(control.objective))
@@ -1254,7 +1323,9 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di

            final.append(control)

-        return final
+        if qa_fixed_count:
+            logger.info("QA validation: fixed %d/%d controls in batch", qa_fixed_count, len(final))
+        return final, qa_fixed_count

    # ── Stage 4: Harmonization ─────────────────────────────────────────

@@ -1337,11 +1408,15 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di

        # Use LLM-provided domain if available, fallback to keyword-detected domain
        llm_domain = data.get("domain")
-        valid_domains = {"AUTH", "CRYP", "NET", "DATA", "LOG", "ACC", "SEC", "INC",
-                         "AI", "COMP", "GOV", "LAB", "FIN", "TRD", "ENV", "HLT"}
-        if llm_domain and llm_domain.upper() in valid_domains:
+        if llm_domain and llm_domain.upper() in VALID_DOMAINS:
            domain = llm_domain.upper()

+        # Use LLM-provided category if available
+        llm_category = data.get("category")
+        category = None
+        if llm_category and llm_category in VALID_CATEGORIES:
+            category = llm_category
+
        # Parse target_audience from LLM response
        target_audience = data.get("target_audience")
        if isinstance(target_audience, str):
@@ -1362,6 +1437,7 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
            implementation_effort=data.get("implementation_effort", "m") if data.get("implementation_effort") in ("s", "m", "l", "xl") else "m",
            tags=tags[:20],
            target_audience=target_audience,
+            category=category,
        )
        # Store effective domain for later control_id generation
        control.generation_metadata["_effective_domain"] = domain
@@ -1395,6 +1471,79 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
            pass
        return f"{prefix}-001"

+    # ── Stage QA: Automated Quality Validation ───────────────────────
+
+    async def _qa_validate_control(
+        self, control: GeneratedControl, chunk_text: str
+    ) -> tuple[GeneratedControl, bool]:
+        """Cross-validate category/domain using keyword detection + local LLM.
+
+        Returns (control, was_fixed). Only triggers Ollama QA when the LLM
+        classification disagrees with keyword detection — keeps it fast.
+        """
+        kw_category = _detect_category(chunk_text) or _detect_category(control.objective)
+        kw_domain = _detect_domain(chunk_text)
+        llm_domain = control.generation_metadata.get("_effective_domain", "")
+
+        # If keyword and LLM agree → no QA needed
+        if control.category == kw_category and llm_domain == kw_domain:
+            return control, False
+
+        # Disagreement detected → ask local LLM to arbitrate
+        title = control.title[:100]
+        objective = control.objective[:200]
+        reqs = ", ".join(control.requirements[:3]) if control.requirements else "keine"
+        prompt = f"""Pruefe dieses Compliance-Control auf korrekte Klassifizierung.
+
+Titel: {title}
+Ziel: {objective}
+Anforderungen: {reqs}
+
+Aktuelle Zuordnung: domain={llm_domain}, category={control.category}
+Keyword-Erkennung: domain={kw_domain}, category={kw_category}
+
+Welche Zuordnung ist korrekt? Antworte NUR als JSON:
+{{"domain": "KUERZEL", "category": "kategorie_name", "reason": "kurze Begruendung"}}
+
+Domains: AUTH=Authentifizierung, CRYP=Kryptographie, NET=Netzwerk, DATA=Datenschutz, LOG=Logging, ACC=Zugriffskontrolle, SEC=IT-Sicherheit, INC=Vorfallmanagement, AI=KI, COMP=Compliance, GOV=Behoerden, LAB=Arbeitsrecht, FIN=Finanzregulierung, TRD=Gewerbe, ENV=Umwelt, HLT=Gesundheit
+Kategorien: {CATEGORY_LIST_STR}"""
+
+        try:
+            raw = await _llm_local(prompt)
+            data = _parse_llm_json(raw)
+            if not data:
+                return control, False
+
+            fixed = False
+            qa_domain = data.get("domain", "").upper()
+            qa_category = data.get("category", "")
+            reason = data.get("reason", "")
+
+            if qa_category and qa_category in VALID_CATEGORIES and qa_category != control.category:
+                old_cat = control.category
+                control.category = qa_category
+                control.generation_metadata["qa_category_fix"] = {
+                    "from": old_cat, "to": qa_category, "reason": reason,
+                }
+                logger.info("QA fix: '%s' category '%s' -> '%s' (%s)",
+                            title[:40], old_cat, qa_category, reason)
+                fixed = True
+
+            if qa_domain and qa_domain in VALID_DOMAINS and qa_domain != llm_domain:
+                control.generation_metadata["qa_domain_fix"] = {
+                    "from": llm_domain, "to": qa_domain, "reason": reason,
+                }
+                control.generation_metadata["_effective_domain"] = qa_domain
+                logger.info("QA fix: '%s' domain '%s' -> '%s' (%s)",
+                            title[:40], llm_domain, qa_domain, reason)
+                fixed = True
+
+            return control, fixed
+
+        except Exception as e:
+            logger.warning("QA validation failed for '%s': %s", title[:40], e)
+            return control, False
+
    # ── Pipeline Orchestration ─────────────────────────────────────────

    def _create_job(self, config: GeneratorConfig) -> str:
@@ -1605,10 +1754,28 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
                len(chunks), len(doc_groups),
            )

-            # Flatten back: chunks from same document are now adjacent
+            # ── Apply max_chunks limit respecting document boundaries ──
+            # Process complete documents until we exceed the limit.
+            # Never split a document across jobs.
            chunks = []
-            for group_list in doc_groups.values():
-                chunks.extend(group_list)
+            if config.max_chunks and config.max_chunks > 0:
+                for group_key, group_list in doc_groups.items():
+                    if chunks and len(chunks) + len(group_list) > config.max_chunks:
+                        # Adding this document would exceed the limit — stop here
+                        break
+                    chunks.extend(group_list)
+                logger.info(
+                    "max_chunks=%d: selected %d chunks from %d complete documents (of %d total groups)",
+                    config.max_chunks, len(chunks),
+                    len(set(c.regulation_code for c in chunks)),
+                    len(doc_groups),
+                )
+            else:
+                # No limit: flatten all groups
+                for group_list in doc_groups.values():
+                    chunks.extend(group_list)
+
+            result.total_chunks_scanned = len(chunks)

            # Process chunks — batch mode (N chunks per Anthropic API call)
            BATCH_SIZE = config.batch_size or 5
@@ -1633,7 +1800,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
                    len(batch), ", ".join(regs_in_batch),
                )
                try:
-                    batch_controls = await self._process_batch(batch, config, job_id)
+                    batch_controls, batch_qa_fixes = await self._process_batch(batch, config, job_id)
+                    result.controls_qa_fixed += batch_qa_fixes
                except Exception as e:
                    logger.error("Batch processing failed: %s — falling back to single-chunk mode", e)
                    # Fallback: process each chunk individually
@@ -1785,6 +1953,8 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
        if not control.title or not control.objective:
            return None

+        # NOTE: QA validation runs as a separate batch AFTER generation (qa-reclassify endpoint)
+
        # Stage 4: Harmonization
        duplicates = await self._check_harmonization(control)
        if duplicates:
@@ -1809,8 +1979,10 @@ Gib ein JSON-Array zurueck mit GENAU {len(chunks)} Objekten. Jedes Objekt hat di
        else:
            control.release_state = "needs_review"

-        # Generate control_id
-        domain = config.domain or _detect_domain(control.objective)
+        # Generate control_id — prefer QA-corrected or LLM-assigned domain
+        domain = (control.generation_metadata.get("_effective_domain")
+                  or config.domain
+                  or _detect_domain(control.objective))
        control.control_id = self._generate_control_id(domain, self.db)

        # Store job_id in metadata