feat(controls): Zitierfaehigkeit — Embedding-Re-Link + Atom-Vererbung

citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus article_label); rag_client reicht article_label durch (additiv, Default-Feld).

NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom (license_rule != 3), iterativ.

macmini-Apply verifiziert: Zitierfaehigkeit 6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-21 14:17:57 +02:00
parent ff4a743558
commit de542633e2
3 changed files with 238 additions and 111 deletions
@@ -7,7 +7,6 @@ Citation Backfill Service — enrich existing controls with article/paragraph pr
  Tier 3 — Ollama LLM:  ask local LLM to identify article/paragraph from text
 """

-import hashlib
 import json
 import logging
 import os
@@ -28,12 +27,13 @@ OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
 LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

-ALL_COLLECTIONS = [
-    "bp_compliance_ce",
+# Tier-1 semantic re-link: min cosine for a source_original_text → chunk match.
+EMBED_THRESHOLD = float(os.getenv("CITATION_EMBED_THRESHOLD", "0.80"))
+# Collections that carry re-ingested, article_label-bearing chunks.
+RELINK_COLLECTIONS = [
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
-    "bp_dsfa_corpus",
-    "bp_legal_templates",
+    "bp_compliance_ce",
 ]

 BACKFILL_SYSTEM_PROMPT = (
@@ -51,13 +51,14 @@ _SOURCE_ARTICLE_RE = re.compile(
 class MatchResult:
    article: str
    paragraph: str
-    method: str  # "hash", "regex", "llm"
+    method: str  # "embed", "regex", "llm"
+    source: str = ""  # regulation short/name (embed tier sets the cleaned source)


@dataclass
 class BackfillResult:
    total_controls: int = 0
-    matched_hash: int = 0
+    matched_embed: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    unmatched: int = 0
@@ -71,7 +72,6 @@ class CitationBackfill:
    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        self.db = db
        self.rag = rag_client
-        self._rag_index: dict[str, RAGSearchResult] = {}

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update."""
@@ -85,20 +85,10 @@ class CitationBackfill:
        if not controls:
            return result

-        # Collect hashes we need to find — only build index for controls with source text
-        needed_hashes: set[str] = set()
-        for ctrl in controls:
-            src = ctrl.get("source_original_text")
-            if src:
-                needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())
-
-        if needed_hashes:
-            # Build targeted RAG index — only scroll collections that our controls reference
-            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
-            await self._build_rag_index_targeted(controls)
-            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
-        else:
-            logger.info("No source_original_text found — skipping RAG index build")
+        # Tier-1 = per-control semantic search against the re-ingested, labeled chunks.
+        # (The old sha256(chunk.text) hash index died with re-chunking and is gone.)
+        with_source = sum(1 for c in controls if c.get("source_original_text"))
+        logger.info("Embedding-relink candidates (with source_original_text): %d", with_source)

        # Process each control
        for i, ctrl in enumerate(controls):
@@ -108,8 +98,8 @@ class CitationBackfill:
            try:
                match = await self._match_control(ctrl)
                if match:
-                    if match.method == "hash":
-                        result.matched_hash += 1
+                    if match.method == "embed":
+                        result.matched_embed += 1
                    elif match.method == "regex":
                        result.matched_regex += 1
                    elif match.method == "llm":
@@ -139,8 +129,8 @@ class CitationBackfill:
                result.errors.append(f"Commit failed: {e}")

        logger.info(
-            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
-            result.total_controls, result.matched_hash, result.matched_regex,
+            "Backfill complete: %d total, embed=%d regex=%d llm=%d unmatched=%d updated=%d",
+            result.total_controls, result.matched_embed, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result
@@ -178,93 +168,13 @@ class CitationBackfill:
            controls.append(ctrl)
        return controls

-    async def _build_rag_index_targeted(self, controls: list[dict]):
-        """Build RAG index by scrolling only collections relevant to our controls.
-
-        Uses regulation codes from generation_metadata to identify which collections
-        to search, falling back to all collections only if needed.
-        """
-        # Determine which collections are relevant based on regulation codes
-        regulation_to_collection = self._map_regulations_to_collections(controls)
-        collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)
-
-        logger.info("Targeted index: searching %d collections: %s",
-                     len(collections_to_search), ", ".join(collections_to_search))
-
-        for collection in collections_to_search:
-            offset = None
-            page = 0
-            seen_offsets: set[str] = set()
-            while True:
-                chunks, next_offset = await self.rag.scroll(
-                    collection=collection, offset=offset, limit=200,
-                )
-                if not chunks:
-                    break
-                for chunk in chunks:
-                    if chunk.text and len(chunk.text.strip()) >= 50:
-                        h = hashlib.sha256(chunk.text.encode()).hexdigest()
-                        self._rag_index[h] = chunk
-                page += 1
-                if page % 50 == 0:
-                    logger.info("Indexing %s: page %d (%d chunks so far)",
-                                collection, page, len(self._rag_index))
-                if not next_offset:
-                    break
-                if next_offset in seen_offsets:
-                    logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
-                    break
-                seen_offsets.add(next_offset)
-                offset = next_offset
-
-            logger.info("Indexed collection %s: %d pages", collection, page)
-
-    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
-        """Map regulation codes from controls to likely Qdrant collections."""
-        # Heuristic: regulation code prefix → collection
-        collection_map = {
-            "eu_": "bp_compliance_gesetze",
-            "dsgvo": "bp_compliance_datenschutz",
-            "bdsg": "bp_compliance_gesetze",
-            "ttdsg": "bp_compliance_gesetze",
-            "nist_": "bp_compliance_ce",
-            "owasp": "bp_compliance_ce",
-            "bsi_": "bp_compliance_ce",
-            "enisa": "bp_compliance_ce",
-            "at_": "bp_compliance_recht",
-            "fr_": "bp_compliance_recht",
-            "es_": "bp_compliance_recht",
-        }
-        result: dict[str, str] = {}
-        for ctrl in controls:
-            meta = ctrl.get("generation_metadata") or {}
-            reg = meta.get("source_regulation", "")
-            if not reg:
-                continue
-            for prefix, coll in collection_map.items():
-                if reg.startswith(prefix):
-                    result[reg] = coll
-                    break
-            else:
-                # Unknown regulation — search all
-                for coll in ALL_COLLECTIONS:
-                    result[f"_all_{coll}"] = coll
-        return result
-
    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: hash → regex → LLM."""

-        # Tier 1: Hash match against RAG index
-        source_text = ctrl.get("source_original_text")
-        if source_text:
-            h = hashlib.sha256(source_text.encode()).hexdigest()
-            chunk = self._rag_index.get(h)
-            if chunk and (chunk.article or chunk.paragraph):
-                return MatchResult(
-                    article=chunk.article or "",
-                    paragraph=chunk.paragraph or "",
-                    method="hash",
-                )
+        # Tier 1: Semantic search against the re-ingested, labeled chunks
+        embed = await self._embedding_match(ctrl)
+        if embed:
+            return embed

        # Tier 2: Regex parse concatenated source
        citation = ctrl.get("source_citation") or {}
@@ -278,11 +188,60 @@ class CitationBackfill:
            )

        # Tier 3: Ollama LLM
-        if source_text:
+        if ctrl.get("source_original_text"):
            return await self._llm_match(ctrl)

        return None

+    async def _embedding_match(self, ctrl: dict) -> Optional[MatchResult]:
+        """Tier 1: semantic-search source_original_text against the labeled chunks.
+
+        Only hits[0] of each searched collection competes; the best-scoring one must
+        reach EMBED_THRESHOLD and yield an article via _article_part, else None.
+        """
+        source_text = ctrl.get("source_original_text")
+        if not source_text:
+            return None
+        query = source_text.strip()[:512]  # query text is capped at 512 characters
+        best: Optional[RAGSearchResult] = None
+        for collection in self._collections_for(ctrl):
+            try:
+                hits = await self.rag.search(query, collection=collection, top_k=3)
+            except Exception as e:  # best-effort: a failed search counts as "no hits" for this collection
+                logger.debug("embed search failed (%s): %s", collection, e)  # NOTE(review): debug level hides outages — confirm
+                hits = []
+            if hits and (best is None or hits[0].score > best.score):  # NOTE(review): hits[1:] never inspected despite top_k=3 — confirm
+                best = hits[0]
+        if not best or best.score < EMBED_THRESHOLD:
+            return None
+        article = _article_part(best)
+        if not article:  # no usable article on the winning hit -> caller falls through to the next tier
+            return None
+        return MatchResult(
+            article=article,
+            paragraph=best.paragraph or "",
+            method="embed",
+            source=best.regulation_short or best.regulation_name or "",  # short name preferred, full name as fallback
+        )
+
+    def _collections_for(self, ctrl: dict) -> list[str]:
+        """Return the one collection mapped to the control's source_regulation prefix, else all RELINK_COLLECTIONS."""
+        meta = ctrl.get("generation_metadata") or {}
+        reg = (meta.get("source_regulation") or "").lower()  # lower-cased so the prefix test is case-insensitive
+        prefix_map = {
+            "eu_": "bp_compliance_gesetze", "bdsg": "bp_compliance_gesetze",
+            "de_": "bp_compliance_gesetze", "at_": "bp_compliance_gesetze",
+            "ch_": "bp_compliance_gesetze", "dsgvo": "bp_compliance_gesetze",
+            "trgs": "bp_compliance_ce", "trbs": "bp_compliance_ce", "asr": "bp_compliance_ce",
+            "nist": "bp_compliance_ce", "owasp": "bp_compliance_ce", "enisa": "bp_compliance_ce",
+            "edpb": "bp_compliance_datenschutz", "dsk": "bp_compliance_datenschutz",
+            "bfdi": "bp_compliance_datenschutz",
+        }
+        for prefix, coll in prefix_map.items():  # first matching prefix wins (dict insertion order)
+            if reg.startswith(prefix):
+                return [coll]
+        return list(RELINK_COLLECTIONS)  # missing/unmapped regulation: search everything; copy keeps the constant unshared
+
    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text."""
        citation = ctrl.get("source_citation") or {}
@@ -331,6 +290,9 @@ Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
        if parsed:
            citation["source"] = parsed["name"]

+        # Embed tier carries the cleaned regulation name → prefer it as source.
+        if match.source:
+            citation["source"] = match.source
        # Add separate article/paragraph fields
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph
@@ -359,6 +321,23 @@ Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
        )


+def _article_part(chunk: RAGSearchResult) -> str:
+    """Precise article from a chunk: article_label minus regulation_short.
+
+    'BDSG § 38' -> '§ 38'; 'Art. 39 DSGVO' -> 'Art. 39'; 'NIST SP 800-53r5 SA-12' -> 'SA-12'.
+    A label lacking regulation_short is returned whole; an empty label or remainder falls back to article ('' if unset).
+    """
+    label = (chunk.article_label or "").strip()
+    reg = (chunk.regulation_short or "").strip()
+    if label:
+        part = label
+        if reg and reg in label:
+            part = label.replace(reg, "").strip(" ,;-")  # NOTE(review): replace() removes every occurrence of reg, substrings included
+        if part and part != reg:
+            return part
+    return (chunk.article or "").strip()  # fallback: bare article field
+
+
 def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Parse 'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}.