feat(controls): Zitierfaehigkeit — Embedding-Re-Link + Atom-Vererbung

citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus article_label); rag_client reicht article_label durch (additiv, Default-Feld). NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom (license_rule != 3), iterativ. macmini-Apply verifiziert: Zitierfaehigkeit 6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-21 14:17:57 +02:00
parent ff4a743558
commit de542633e2
3 changed files with 238 additions and 111 deletions
@@ -0,0 +1,145 @@
 #!/usr/bin/env python3
 """Inherit source_citation from parent to atom controls.
 Background
 ==========
 citation_backfill.py fills source_citation on the *source-bearing* controls
 (those with source_original_text — ~2-7 %) by re-linking them to the
 re-ingested, article_label-bearing chunks. The remaining ~93 % are "atom"
 controls (decompositions) that carry a parent_control_uuid but no own citation.
 They cite the SAME norm as their parent, so the citation can be inherited —
 no re-matching needed.
 Self-written controls (license_rule = 3) are skipped (no external source).
 Runs in idempotent iterations (atom -> master -> grandmaster) and prints
 per-stage counts before any write. Safe to rerun — only fills rows whose
 source_citation lacks an 'article'.
 Usage::
    python3 scripts/atom_citation_inheritance.py --db-host 100.80.114.48 \\
        --db-password breakpilot123 --dry-run
    python3 scripts/atom_citation_inheritance.py --db-host 100.80.114.48 \\
        --db-password breakpilot123 --apply
 """
 from __future__ import annotations
 import argparse
 import os
 import sys
 DB_URL = os.getenv("DATABASE_URL", "postgresql://breakpilot:breakpilot@localhost:5432/breakpilot_db")
 # A row "needs" a citation when it has no article yet.
 _NEEDS = (
    "(cc.source_citation IS NULL "
    " OR cc.source_citation->>'article' IS NULL "
    " OR cc.source_citation->>'article' = '')"
 )
 # A parent can supply one when it carries a real article.
 _PARENT_HAS = (
    "p.source_citation IS NOT NULL "
    "AND p.source_citation->>'article' IS NOT NULL "
    "AND p.source_citation->>'article' <> ''"
 )
 SQL_REPORT = f"""
 SET search_path TO compliance, public;
 SELECT
  CASE WHEN cc.parent_control_uuid IS NULL THEN 'no_parent'
       WHEN ({_PARENT_HAS.replace('p.', 'p2.')}) THEN 'parent_has_article'
       ELSE 'parent_no_article' END AS bucket,
  COUNT(*) AS n
 FROM canonical_controls cc
 LEFT JOIN canonical_controls p2 ON cc.parent_control_uuid = p2.id
 WHERE {_NEEDS}
  AND cc.license_rule IS DISTINCT FROM 3
 GROUP BY 1 ORDER BY 2 DESC;
 """
 SQL_INHERIT = f"""
 SET search_path TO compliance, public;
 UPDATE canonical_controls cc
 SET source_citation = p.source_citation, updated_at = NOW()
 FROM canonical_controls p
 WHERE cc.parent_control_uuid = p.id
  AND {_NEEDS}
  AND {_PARENT_HAS}
  AND cc.license_rule IS DISTINCT FROM 3;
 """
 def parse_args() -> argparse.Namespace:
    p = argparse.ArgumentParser(description=__doc__)
    p.add_argument("--db-url", default=DB_URL,
                   help="Postgres URL (default: $DATABASE_URL)")
    p.add_argument("--max-iterations", type=int, default=6,
                   help="Cap on inheritance iterations to avoid loops")
    g = p.add_mutually_exclusive_group(required=True)
    g.add_argument("--dry-run", action="store_true")
    g.add_argument("--apply", action="store_true")
    return p.parse_args()
 def print_bucket(rows, label: str) -> None:
    print(f"\n## {label}")
    total = 0
    for bucket, n in rows:
        print(f"  {bucket:20}  {n:>8}")
        total += n
    print(f"  {'TOTAL':20}  {total:>8}")
 def main() -> int:
    args = parse_args()
    try:
        import psycopg2
    except ImportError:
        print("error: psycopg2 not installed", file=sys.stderr)
        return 2
    conn = psycopg2.connect(args.db_url)
    conn.autocommit = False
    cur = conn.cursor()
    print("=" * 60)
    print(" Atom citation inheritance — source_citation via parent")
    print(f" Mode: {'DRY-RUN' if args.dry_run else 'APPLY'}")
    print("=" * 60)
    cur.execute(SQL_REPORT)
    print_bucket(cur.fetchall(), "Controls without article (need citation)")
    if args.dry_run:
        cur.execute(
            "SET search_path TO compliance, public; "
            f"SELECT COUNT(*) FROM canonical_controls cc "
            f"JOIN canonical_controls p ON cc.parent_control_uuid = p.id "
            f"WHERE {_NEEDS} AND {_PARENT_HAS} AND cc.license_rule IS DISTINCT FROM 3;"
        )
        print(f"\n## First inherit-pass would fill: {cur.fetchone()[0]} rows")
        print("\nNo writes performed. Use --apply to execute.")
        conn.rollback()
        return 0
    total = 0
    for i in range(1, args.max_iterations + 1):
        cur.execute(SQL_INHERIT)
        updated = cur.rowcount
        total += updated
        print(f"\n  iteration {i}: {updated} rows inherited")
        if updated == 0:
            break
    conn.commit()
    print(f"\n✓ Total atoms inherited: {total}")
    cur.execute(SQL_REPORT)
    print_bucket(cur.fetchall(), "Remaining without article")
    return 0
 if __name__ == "__main__":
    raise SystemExit(main())
@@ -7,7 +7,6 @@ Citation Backfill Service — enrich existing controls with article/paragraph pr
  Tier 3 — Ollama LLM:  ask local LLM to identify article/paragraph from text
 """
 import hashlib
 import json
 import logging
 import os
@@ -28,12 +27,13 @@ OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
 OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
 LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
-ALL_COLLECTIONS = [
+# Tier-1 semantic re-link: min cosine for a source_original_text → chunk match.
-    "bp_compliance_ce",
+EMBED_THRESHOLD = float(os.getenv("CITATION_EMBED_THRESHOLD", "0.80"))
 # Collections that carry re-ingested, article_label-bearing chunks.
 RELINK_COLLECTIONS = [
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
-    "bp_dsfa_corpus",
+    "bp_compliance_ce",
    "bp_legal_templates",
 ]
 BACKFILL_SYSTEM_PROMPT = (
@@ -51,13 +51,14 @@ _SOURCE_ARTICLE_RE = re.compile(
 class MatchResult:
    article: str
    paragraph: str
-    method: str  # "hash", "regex", "llm"
+    method: str  # "embed", "regex", "llm"
    source: str = ""  # regulation short/name (embed tier sets the cleaned source)
@dataclass
 class BackfillResult:
    total_controls: int = 0
-    matched_hash: int = 0
+    matched_embed: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    unmatched: int = 0
@@ -71,7 +72,6 @@ class CitationBackfill:
    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        self.db = db
        self.rag = rag_client
        self._rag_index: dict[str, RAGSearchResult] = {}
    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update."""
@@ -85,20 +85,10 @@ class CitationBackfill:
        if not controls:
            return result
-        # Collect hashes we need to find — only build index for controls with source text
+        # Tier-1 = per-control semantic search against the re-ingested, labeled chunks.
-        needed_hashes: set[str] = set()
+        # (The old sha256(chunk.text) hash index died with re-chunking and is gone.)
-        for ctrl in controls:
+        with_source = sum(1 for c in controls if c.get("source_original_text"))
-            src = ctrl.get("source_original_text")
+        logger.info("Embedding-relink candidates (with source_original_text): %d", with_source)
            if src:
                needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())
        if needed_hashes:
            # Build targeted RAG index — only scroll collections that our controls reference
            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
            await self._build_rag_index_targeted(controls)
            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
        else:
            logger.info("No source_original_text found — skipping RAG index build")
        # Process each control
        for i, ctrl in enumerate(controls):
@@ -108,8 +98,8 @@ class CitationBackfill:
            try:
                match = await self._match_control(ctrl)
                if match:
-                    if match.method == "hash":
+                    if match.method == "embed":
-                        result.matched_hash += 1
+                        result.matched_embed += 1
                    elif match.method == "regex":
                        result.matched_regex += 1
                    elif match.method == "llm":
@@ -139,8 +129,8 @@ class CitationBackfill:
                result.errors.append(f"Commit failed: {e}")
        logger.info(
-            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
+            "Backfill complete: %d total, embed=%d regex=%d llm=%d unmatched=%d updated=%d",
-            result.total_controls, result.matched_hash, result.matched_regex,
+            result.total_controls, result.matched_embed, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result
@@ -178,93 +168,13 @@ class CitationBackfill:
            controls.append(ctrl)
        return controls
    async def _build_rag_index_targeted(self, controls: list[dict]):
        """Build RAG index by scrolling only collections relevant to our controls.
        Uses regulation codes from generation_metadata to identify which collections
        to search, falling back to all collections only if needed.
        """
        # Determine which collections are relevant based on regulation codes
        regulation_to_collection = self._map_regulations_to_collections(controls)
        collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)
        logger.info("Targeted index: searching %d collections: %s",
                     len(collections_to_search), ", ".join(collections_to_search))
        for collection in collections_to_search:
            offset = None
            page = 0
            seen_offsets: set[str] = set()
            while True:
                chunks, next_offset = await self.rag.scroll(
                    collection=collection, offset=offset, limit=200,
                )
                if not chunks:
                    break
                for chunk in chunks:
                    if chunk.text and len(chunk.text.strip()) >= 50:
                        h = hashlib.sha256(chunk.text.encode()).hexdigest()
                        self._rag_index[h] = chunk
                page += 1
                if page % 50 == 0:
                    logger.info("Indexing %s: page %d (%d chunks so far)",
                                collection, page, len(self._rag_index))
                if not next_offset:
                    break
                if next_offset in seen_offsets:
                    logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
                    break
                seen_offsets.add(next_offset)
                offset = next_offset
            logger.info("Indexed collection %s: %d pages", collection, page)
    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
        """Map regulation codes from controls to likely Qdrant collections."""
        # Heuristic: regulation code prefix → collection
        collection_map = {
            "eu_": "bp_compliance_gesetze",
            "dsgvo": "bp_compliance_datenschutz",
            "bdsg": "bp_compliance_gesetze",
            "ttdsg": "bp_compliance_gesetze",
            "nist_": "bp_compliance_ce",
            "owasp": "bp_compliance_ce",
            "bsi_": "bp_compliance_ce",
            "enisa": "bp_compliance_ce",
            "at_": "bp_compliance_recht",
            "fr_": "bp_compliance_recht",
            "es_": "bp_compliance_recht",
        }
        result: dict[str, str] = {}
        for ctrl in controls:
            meta = ctrl.get("generation_metadata") or {}
            reg = meta.get("source_regulation", "")
            if not reg:
                continue
            for prefix, coll in collection_map.items():
                if reg.startswith(prefix):
                    result[reg] = coll
                    break
            else:
                # Unknown regulation — search all
                for coll in ALL_COLLECTIONS:
                    result[f"_all_{coll}"] = coll
        return result
    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: hash → regex → LLM."""
-        # Tier 1: Hash match against RAG index
+        # Tier 1: Semantic search against the re-ingested, labeled chunks
-        source_text = ctrl.get("source_original_text")
+        embed = await self._embedding_match(ctrl)
-        if source_text:
+        if embed:
-            h = hashlib.sha256(source_text.encode()).hexdigest()
+            return embed
            chunk = self._rag_index.get(h)
            if chunk and (chunk.article or chunk.paragraph):
                return MatchResult(
                    article=chunk.article or "",
                    paragraph=chunk.paragraph or "",
                    method="hash",
                )
        # Tier 2: Regex parse concatenated source
        citation = ctrl.get("source_citation") or {}
@@ -278,11 +188,60 @@ class CitationBackfill:
            )
        # Tier 3: Ollama LLM
-        if source_text:
+        if ctrl.get("source_original_text"):
            return await self._llm_match(ctrl)
        return None
    async def _embedding_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Tier 1: semantic-search source_original_text against the labeled chunks.
        Takes the top hit (cosine >= EMBED_THRESHOLD) that carries a real article
        and turns its article_label into a precise citation.
        """
        source_text = ctrl.get("source_original_text")
        if not source_text:
            return None
        query = source_text.strip()[:512]
        best: Optional[RAGSearchResult] = None
        for collection in self._collections_for(ctrl):
            try:
                hits = await self.rag.search(query, collection=collection, top_k=3)
            except Exception as e:
                logger.debug("embed search failed (%s): %s", collection, e)
                hits = []
            if hits and (best is None or hits[0].score > best.score):
                best = hits[0]
        if not best or best.score < EMBED_THRESHOLD:
            return None
        article = _article_part(best)
        if not article:
            return None
        return MatchResult(
            article=article,
            paragraph=best.paragraph or "",
            method="embed",
            source=best.regulation_short or best.regulation_name or "",
        )
    def _collections_for(self, ctrl: dict) -> list[str]:
        """Likely collection(s) for a control's regulation; falls back to all three."""
        meta = ctrl.get("generation_metadata") or {}
        reg = (meta.get("source_regulation") or "").lower()
        prefix_map = {
            "eu_": "bp_compliance_gesetze", "bdsg": "bp_compliance_gesetze",
            "de_": "bp_compliance_gesetze", "at_": "bp_compliance_gesetze",
            "ch_": "bp_compliance_gesetze", "dsgvo": "bp_compliance_gesetze",
            "trgs": "bp_compliance_ce", "trbs": "bp_compliance_ce", "asr": "bp_compliance_ce",
            "nist": "bp_compliance_ce", "owasp": "bp_compliance_ce", "enisa": "bp_compliance_ce",
            "edpb": "bp_compliance_datenschutz", "dsk": "bp_compliance_datenschutz",
            "bfdi": "bp_compliance_datenschutz",
        }
        for prefix, coll in prefix_map.items():
            if reg.startswith(prefix):
                return [coll]
        return list(RELINK_COLLECTIONS)
    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text."""
        citation = ctrl.get("source_citation") or {}
@@ -331,6 +290,9 @@ Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
        if parsed:
            citation["source"] = parsed["name"]
        # Embed tier carries the cleaned regulation name → prefer it as source.
        if match.source:
            citation["source"] = match.source
        # Add separate article/paragraph fields
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph
@@ -359,6 +321,23 @@ Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
        )
 def _article_part(chunk: RAGSearchResult) -> str:
    """Precise article from a chunk: article_label minus the regulation name.
    'BDSG § 38' -> '§ 38'; 'Art. 39 DSGVO' -> 'Art. 39'; 'NIST SP 800-53r5 SA-12' -> 'SA-12'.
    Falls back to the bare article field. Returns '' if only a doc-level name is present.
    """
    label = (chunk.article_label or "").strip()
    reg = (chunk.regulation_short or "").strip()
    if label:
        part = label
        if reg and reg in label:
            part = label.replace(reg, "").strip(" ,;-")
        if part and part != reg:
            return part
    return (chunk.article or "").strip()
 def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Parse 'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}.
@@ -33,6 +33,7 @@ class RAGSearchResult:
    paragraph: str
    source_url: str
    score: float
    article_label: str = ""
    collection: str = ""
    page: Optional[int] = None
@@ -90,6 +91,7 @@ class ComplianceRAGClient:
                    regulation_short=r.get("regulation_short", ""),
                    category=r.get("category", ""),
                    article=r.get("article", ""),
                    article_label=r.get("article_label", ""),
                    paragraph=r.get("paragraph", ""),
                    source_url=r.get("source_url", ""),
                    score=r.get("score", 0.0),
@@ -171,6 +173,7 @@ class ComplianceRAGClient:
                    regulation_short=r.get("regulation_short", ""),
                    category=r.get("category", ""),
                    article=r.get("article", ""),
                    article_label=r.get("article_label", ""),
                    paragraph=r.get("paragraph", ""),
                    source_url=r.get("source_url", ""),
                    score=0.0,