feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Interactive Training Videos (CP-TRAIN): - DB migration 022: training_checkpoints + checkpoint_progress tables - NarratorScript generation via Anthropic (AI Teacher persona, German) - TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg) - 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress - InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking) - Learner portal integration with automatic completion on all checkpoints passed - 30 new tests (handler validation + grading logic + manifest/progress + seek protection) Training Blocks: - Block generator, block store, block config CRUD + preview/generate endpoints - Migration 021: training_blocks schema Control Generator + Canonical Library: - Control generator routes + service enhancements - Canonical control library helpers, sidebar entry - Citation backfill service + tests - CE libraries data (hazard, protection, evidence, lifecycle, components) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
437
backend-compliance/compliance/services/citation_backfill.py
Normal file
437
backend-compliance/compliance/services/citation_backfill.py
Normal file
@@ -0,0 +1,437 @@
|
||||
"""
|
||||
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
|
||||
|
||||
3-tier matching strategy:
|
||||
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
|
||||
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
|
||||
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from .rag_client import ComplianceRAGClient, RAGSearchResult
|
||||
|
||||
logger = logging.getLogger(__name__)

# Ollama endpoint and model for Tier-3 LLM matching; both overridable via env.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
# Per-request timeout in seconds for the Ollama chat call.
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

# Fallback set of RAG collections, scrolled when no targeted mapping
# can be derived from the controls' regulation codes.
ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
]

# System prompt for the Tier-3 LLM (German, runtime text — do not edit):
# instructs the model to act as a legal expert and answer with JSON only.
BACKFILL_SYSTEM_PROMPT = (
    "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
    "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)

# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
# Group 1: regulation name; group 2: the article fragment onward.
_SOURCE_ARTICLE_RE = re.compile(
    r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
|
||||
|
||||
|
||||
@dataclass
class MatchResult:
    """Outcome of a single control-to-citation match attempt."""

    # Article reference, e.g. "Art. 35" or "§ 42"; "" when not determined.
    article: str
    # Paragraph reference, e.g. "Abs. 2"; "" when not determined.
    paragraph: str
    # Which matching tier produced the result.
    method: str  # "hash", "regex", "llm"
|
||||
|
||||
|
||||
@dataclass
class BackfillResult:
    """Aggregate counters for one backfill run."""

    # Number of controls loaded as candidates for enrichment.
    total_controls: int = 0
    # Per-tier match counters (Tier 1 hash, Tier 2 regex, Tier 3 LLM).
    matched_hash: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    # Controls for which no tier produced a match.
    unmatched: int = 0
    # Rows actually written (only incremented when dry_run is False).
    updated: int = 0
    # Human-readable error messages collected per control / commit failure.
    errors: list = field(default_factory=list)
|
||||
|
||||
|
||||
class CitationBackfill:
    """Backfill article/paragraph into existing control source_citations.

    Three matching tiers are attempted per control (see module docstring):
      1. sha256(source_original_text) looked up in an index of RAG chunks,
      2. regex split of a concatenated citation source (e.g. "DSGVO Art. 35"),
      3. a local Ollama LLM asked to identify article/paragraph from the text.
    """

    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        # SQLAlchemy session — used for reads, per-control UPDATEs, and the
        # single commit at the end of run().
        self.db = db
        # Client for scrolling RAG (Qdrant) collections chunk by chunk.
        self.rag = rag_client
        # sha256(chunk text) -> chunk; populated lazily by
        # _build_rag_index_targeted() and consulted by Tier-1 matching.
        self._rag_index: dict[str, RAGSearchResult] = {}

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update.

        Args:
            dry_run: when True (default), only log what would change — no DB writes.
            limit: cap on how many controls are loaded (0 = no limit).

        Returns:
            BackfillResult with per-tier counters and collected error messages.
        """
        result = BackfillResult()

        # Load controls needing backfill
        controls = self._load_controls_needing_backfill(limit)
        result.total_controls = len(controls)
        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

        if not controls:
            return result

        # Collect hashes we need to find — only build index for controls with source text
        needed_hashes: set[str] = set()
        for ctrl in controls:
            src = ctrl.get("source_original_text")
            if src:
                needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())

        if needed_hashes:
            # Build targeted RAG index — only scroll collections that our controls reference
            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
            await self._build_rag_index_targeted(controls)
            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
        else:
            logger.info("No source_original_text found — skipping RAG index build")

        # Process each control; failures are collected per control so one bad
        # row does not abort the whole run.
        for i, ctrl in enumerate(controls):
            if i > 0 and i % 100 == 0:
                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

            try:
                match = await self._match_control(ctrl)
                if match:
                    # Count the tier that produced the match.
                    if match.method == "hash":
                        result.matched_hash += 1
                    elif match.method == "regex":
                        result.matched_regex += 1
                    elif match.method == "llm":
                        result.matched_llm += 1

                    if not dry_run:
                        self._update_control(ctrl, match)
                        result.updated += 1
                    else:
                        logger.debug(
                            "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                            ctrl["control_id"], match.article, match.paragraph, match.method,
                        )
                else:
                    result.unmatched += 1

            except Exception as e:
                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)

        # Single commit for the whole run; a failure here is reported in
        # result.errors rather than raised.
        if not dry_run:
            try:
                self.db.commit()
            except Exception as e:
                logger.error("Backfill commit failed: %s", e)
                result.errors.append(f"Commit failed: {e}")

        logger.info(
            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
            result.total_controls, result.matched_hash, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result

    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
        """Load controls where source_citation exists but lacks separate 'article' key.

        NOTE(review): license_rule IN (1, 2) — presumably the license tiers
        eligible for enrichment; confirm against the canonical_controls schema.
        """
        query = """
            SELECT id, control_id, source_citation, source_original_text,
                   generation_metadata, license_rule
            FROM canonical_controls
            WHERE license_rule IN (1, 2)
              AND source_citation IS NOT NULL
              AND (
                  source_citation->>'article' IS NULL
                  OR source_citation->>'article' = ''
              )
            ORDER BY control_id
        """
        # limit is an int parameter from our own code path, so the f-string
        # interpolation is not an injection vector here.
        if limit > 0:
            query += f" LIMIT {limit}"

        result = self.db.execute(text(query))
        cols = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(cols, row))
            # Normalize UUID primary key to str for logging / CAST in updates.
            ctrl["id"] = str(ctrl["id"])
            # Parse JSON fields — drivers may return them as str or dict;
            # unparseable values degrade to {} rather than raising.
            for jf in ("source_citation", "generation_metadata"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        ctrl[jf] = {}
            controls.append(ctrl)
        return controls

    async def _build_rag_index_targeted(self, controls: list[dict]):
        """Build RAG index by scrolling only collections relevant to our controls.

        Uses regulation codes from generation_metadata to identify which collections
        to search, falling back to all collections only if needed.
        """
        # Determine which collections are relevant based on regulation codes
        regulation_to_collection = self._map_regulations_to_collections(controls)
        # Empty mapping -> scan everything (the `or` falls back to ALL_COLLECTIONS).
        collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)

        logger.info("Targeted index: searching %d collections: %s",
                    len(collections_to_search), ", ".join(collections_to_search))

        for collection in collections_to_search:
            offset = None
            page = 0
            # Guard against a scroll cursor that repeats (infinite pagination loop).
            seen_offsets: set[str] = set()
            while True:
                chunks, next_offset = await self.rag.scroll(
                    collection=collection, offset=offset, limit=200,
                )
                if not chunks:
                    break
                for chunk in chunks:
                    # Skip near-empty chunks; hash the rest for Tier-1 lookup.
                    if chunk.text and len(chunk.text.strip()) >= 50:
                        h = hashlib.sha256(chunk.text.encode()).hexdigest()
                        self._rag_index[h] = chunk
                page += 1
                if page % 50 == 0:
                    logger.info("Indexing %s: page %d (%d chunks so far)",
                                collection, page, len(self._rag_index))
                if not next_offset:
                    break
                if next_offset in seen_offsets:
                    logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
                    break
                seen_offsets.add(next_offset)
                offset = next_offset

            logger.info("Indexed collection %s: %d pages", collection, page)

    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
        """Map regulation codes from controls to likely Qdrant collections."""
        # Heuristic: regulation code prefix → collection
        # NOTE(review): "bp_compliance_recht" is not listed in ALL_COLLECTIONS —
        # confirm it exists in Qdrant or the scroll for at_/fr_/es_ codes will
        # hit a missing collection.
        collection_map = {
            "eu_": "bp_compliance_gesetze",
            "dsgvo": "bp_compliance_datenschutz",
            "bdsg": "bp_compliance_gesetze",
            "ttdsg": "bp_compliance_gesetze",
            "nist_": "bp_compliance_ce",
            "owasp": "bp_compliance_ce",
            "bsi_": "bp_compliance_ce",
            "enisa": "bp_compliance_ce",
            "at_": "bp_compliance_recht",
            "fr_": "bp_compliance_recht",
            "es_": "bp_compliance_recht",
        }
        result: dict[str, str] = {}
        for ctrl in controls:
            meta = ctrl.get("generation_metadata") or {}
            reg = meta.get("source_regulation", "")
            if not reg:
                continue
            for prefix, coll in collection_map.items():
                if reg.startswith(prefix):
                    result[reg] = coll
                    break
            else:
                # Unknown regulation — search all
                for coll in ALL_COLLECTIONS:
                    result[f"_all_{coll}"] = coll
        return result

    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: hash → regex → LLM. Returns None when all tiers fail."""

        # Tier 1: Hash match against RAG index
        source_text = ctrl.get("source_original_text")
        if source_text:
            h = hashlib.sha256(source_text.encode()).hexdigest()
            chunk = self._rag_index.get(h)
            # Only accept the chunk when it actually carries provenance.
            if chunk and (chunk.article or chunk.paragraph):
                return MatchResult(
                    article=chunk.article or "",
                    paragraph=chunk.paragraph or "",
                    method="hash",
                )

        # Tier 2: Regex parse concatenated source
        citation = ctrl.get("source_citation") or {}
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed and parsed["article"]:
            return MatchResult(
                article=parsed["article"],
                paragraph="",  # Regex can't extract paragraph from concatenated format
                method="regex",
            )

        # Tier 3: Ollama LLM — only worth asking when we have source text.
        if source_text:
            return await self._llm_match(ctrl)

        return None

    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text.

        Returns None on LLM failure or when the model identifies neither an
        article nor a paragraph. The prompt is German runtime text.
        """
        citation = ctrl.get("source_citation") or {}
        regulation_name = citation.get("source", "")
        metadata = ctrl.get("generation_metadata") or {}
        regulation_code = metadata.get("source_regulation", "")
        source_text = ctrl.get("source_original_text", "")

        # Truncated to 2000 chars below to bound prompt size.
        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.

Gesetz: {regulation_name} (Code: {regulation_code})

Text:
---
{source_text[:2000]}
---

Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}

Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

        try:
            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
            data = _parse_json(raw)
            if data and (data.get("article") or data.get("paragraph")):
                return MatchResult(
                    article=data.get("article", ""),
                    paragraph=data.get("paragraph", ""),
                    method="llm",
                )
        except Exception as e:
            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)

        return None

    def _update_control(self, ctrl: dict, match: MatchResult):
        """Update source_citation and generation_metadata in DB.

        Executes the UPDATE on the session without committing — run() performs
        one commit for the whole batch.
        """
        citation = ctrl.get("source_citation") or {}

        # Clean the source name: remove concatenated article if present
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed:
            citation["source"] = parsed["name"]

        # Add separate article/paragraph fields
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph

        # Update generation_metadata
        metadata = ctrl.get("generation_metadata") or {}
        # Backfill provenance is recorded only when an article was found —
        # NOTE(review): guard placement reconstructed; confirm paragraph-only
        # matches should indeed skip the metadata stamp.
        if match.article:
            metadata["source_article"] = match.article
            metadata["source_paragraph"] = match.paragraph
            metadata["backfill_method"] = match.method
            metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()

        self.db.execute(
            text("""
                UPDATE canonical_controls
                SET source_citation = :citation,
                    generation_metadata = :metadata,
                    updated_at = NOW()
                WHERE id = CAST(:id AS uuid)
            """),
            {
                "id": ctrl["id"],
                "citation": json.dumps(citation),
                "metadata": json.dumps(metadata),
            },
        )
|
||||
|
||||
|
||||
def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Split a concatenated citation source into regulation name + article.

    Examples:
        'DSGVO Art. 35' -> {"name": "DSGVO", "article": "Art. 35"}
        'BDSG § 42'     -> {"name": "BDSG", "article": "§ 42"}

    Returns None for an empty string or when no article/§ part is found.
    """
    if not source:
        return None

    # Two equivalent splitting patterns, tried in order:
    # Art./Artikel (case-insensitive), then the German § notation.
    candidate_patterns = (
        re.compile(r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE),
        re.compile(r"^(.+?)\s+(§\s*\d+.*)$"),
    )
    for pattern in candidate_patterns:
        hit = pattern.match(source)
        if hit is None:
            continue
        name_part, article_part = hit.groups()
        return {"name": name_part.strip(), "article": article_part.strip()}

    return None
|
||||
|
||||
|
||||
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call Ollama chat API for backfill matching.

    Sends an optional system message plus the user prompt to OLLAMA_MODEL and
    returns the assistant's content. Any HTTP error, transport failure, or
    malformed response yields "" (logged, never raised) — callers treat an
    empty string as "no answer".
    """
    chat_messages = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    chat_messages.append({"role": "user", "content": prompt})

    request_body = {
        "model": OLLAMA_MODEL,
        "messages": chat_messages,
        "stream": False,
        "options": {"num_predict": 256},  # cap generated tokens
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if resp.status_code != 200:
                logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
                return ""
            data = resp.json()
            msg = data.get("message", {})
            # Chat-style responses carry {"message": {"content": ...}};
            # otherwise fall back to the legacy "response" field.
            if isinstance(msg, dict):
                return msg.get("content", "")
            return data.get("response", str(msg))
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""
|
||||
|
||||
|
||||
def _parse_json(raw: str) -> Optional[dict]:
    """Best-effort extraction of a JSON object from raw LLM output.

    Attempts, in order: the whole string, a ```json fenced block, and the
    first flat {...} span anywhere in the text. Returns None when nothing
    parses (or when raw is empty/falsy).
    """
    if not raw:
        return None

    def _attempt(candidate: str):
        """Parse candidate as JSON, returning None instead of raising."""
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None

    # 1) The whole string is already valid JSON.
    parsed = _attempt(raw)
    if parsed is not None:
        return parsed

    # 2) JSON wrapped in a markdown code fence.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    if fenced is not None:
        parsed = _attempt(fenced.group(1))
        if parsed is not None:
            return parsed

    # 3) First flat (non-nested) object embedded in surrounding text.
    flat = re.search(r"\{[^{}]*\}", raw)
    if flat is not None:
        parsed = _attempt(flat.group(0))
        if parsed is not None:
            return parsed

    return None
|
||||
Reference in New Issue
Block a user