""" Citation Backfill Service — enrich existing controls with article/paragraph provenance. 3-tier matching strategy: Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text """ import hashlib import json import logging import os import re from dataclasses import dataclass, field from datetime import datetime, timezone from typing import Optional import httpx from sqlalchemy import text from sqlalchemy.orm import Session from .rag_client import ComplianceRAGClient, RAGSearchResult logger = logging.getLogger(__name__) OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b") LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180")) ALL_COLLECTIONS = [ "bp_compliance_ce", "bp_compliance_gesetze", "bp_compliance_datenschutz", "bp_dsfa_corpus", "bp_legal_templates", ] BACKFILL_SYSTEM_PROMPT = ( "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext " "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON." ) # Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2" _SOURCE_ARTICLE_RE = re.compile( r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE ) @dataclass class MatchResult: article: str paragraph: str method: str # "hash", "regex", "llm" @dataclass class BackfillResult: total_controls: int = 0 matched_hash: int = 0 matched_regex: int = 0 matched_llm: int = 0 unmatched: int = 0 updated: int = 0 errors: list = field(default_factory=list) class CitationBackfill: """Backfill article/paragraph into existing control source_citations.""" def __init__(self, db: Session, rag_client: ComplianceRAGClient): self.db = db self.rag = rag_client self._rag_index: dict[str, RAGSearchResult] = {} async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult: """Main entry: iterate controls missing article/paragraph, match to RAG, update.""" result = BackfillResult() # Load controls needing backfill controls = self._load_controls_needing_backfill(limit) result.total_controls = len(controls) logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls)) if not controls: return result # Collect hashes we need to find — only build index for controls with source text needed_hashes: set[str] = set() for ctrl in controls: src = ctrl.get("source_original_text") if src: needed_hashes.add(hashlib.sha256(src.encode()).hexdigest()) if needed_hashes: # Build targeted RAG index — only scroll collections that our controls reference logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes)) await self._build_rag_index_targeted(controls) logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes)) else: logger.info("No source_original_text found — skipping RAG index build") # Process each control for i, ctrl in enumerate(controls): if i > 0 and i % 100 == 0: logger.info("Backfill progress: %d/%d processed", i, result.total_controls) try: match = await self._match_control(ctrl) if match: if match.method == "hash": result.matched_hash += 1 elif match.method == "regex": result.matched_regex += 1 elif match.method == "llm": result.matched_llm += 1 if not dry_run: self._update_control(ctrl, match) result.updated += 1 else: logger.debug( "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)", ctrl["control_id"], match.article, match.paragraph, match.method, ) else: result.unmatched += 1 except Exception as e: error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}" logger.error(error_msg) result.errors.append(error_msg) if not dry_run: try: self.db.commit() except Exception as e: logger.error("Backfill commit failed: %s", e) result.errors.append(f"Commit failed: {e}") logger.info( "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d", result.total_controls, result.matched_hash, result.matched_regex, result.matched_llm, result.unmatched, result.updated, ) return result def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]: """Load controls where source_citation exists but lacks separate 'article' key.""" query = """ SELECT id, control_id, source_citation, source_original_text, generation_metadata, license_rule FROM canonical_controls WHERE license_rule IN (1, 2) AND source_citation IS NOT NULL AND ( source_citation->>'article' IS NULL OR source_citation->>'article' = '' ) ORDER BY control_id """ if limit > 0: query += f" LIMIT {limit}" result = self.db.execute(text(query)) cols = result.keys() controls = [] for row in result: ctrl = dict(zip(cols, row)) ctrl["id"] = str(ctrl["id"]) # Parse JSON fields for jf in ("source_citation", "generation_metadata"): if isinstance(ctrl.get(jf), str): try: ctrl[jf] = json.loads(ctrl[jf]) except (json.JSONDecodeError, TypeError): ctrl[jf] = {} controls.append(ctrl) return controls async def _build_rag_index_targeted(self, controls: list[dict]): """Build RAG index by scrolling only collections relevant to our controls. Uses regulation codes from generation_metadata to identify which collections to search, falling back to all collections only if needed. """ # Determine which collections are relevant based on regulation codes regulation_to_collection = self._map_regulations_to_collections(controls) collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS) logger.info("Targeted index: searching %d collections: %s", len(collections_to_search), ", ".join(collections_to_search)) for collection in collections_to_search: offset = None page = 0 seen_offsets: set[str] = set() while True: chunks, next_offset = await self.rag.scroll( collection=collection, offset=offset, limit=200, ) if not chunks: break for chunk in chunks: if chunk.text and len(chunk.text.strip()) >= 50: h = hashlib.sha256(chunk.text.encode()).hexdigest() self._rag_index[h] = chunk page += 1 if page % 50 == 0: logger.info("Indexing %s: page %d (%d chunks so far)", collection, page, len(self._rag_index)) if not next_offset: break if next_offset in seen_offsets: logger.warning("Scroll loop in %s at page %d — stopping", collection, page) break seen_offsets.add(next_offset) offset = next_offset logger.info("Indexed collection %s: %d pages", collection, page) def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]: """Map regulation codes from controls to likely Qdrant collections.""" # Heuristic: regulation code prefix → collection collection_map = { "eu_": "bp_compliance_gesetze", "dsgvo": "bp_compliance_datenschutz", "bdsg": "bp_compliance_gesetze", "ttdsg": "bp_compliance_gesetze", "nist_": "bp_compliance_ce", "owasp": "bp_compliance_ce", "bsi_": "bp_compliance_ce", "enisa": "bp_compliance_ce", "at_": "bp_compliance_recht", "fr_": "bp_compliance_recht", "es_": "bp_compliance_recht", } result: dict[str, str] = {} for ctrl in controls: meta = ctrl.get("generation_metadata") or {} reg = meta.get("source_regulation", "") if not reg: continue for prefix, coll in collection_map.items(): if reg.startswith(prefix): result[reg] = coll break else: # Unknown regulation — search all for coll in ALL_COLLECTIONS: result[f"_all_{coll}"] = coll return result async def _match_control(self, ctrl: dict) -> Optional[MatchResult]: """3-tier matching: hash → regex → LLM.""" # Tier 1: Hash match against RAG index source_text = ctrl.get("source_original_text") if source_text: h = hashlib.sha256(source_text.encode()).hexdigest() chunk = self._rag_index.get(h) if chunk and (chunk.article or chunk.paragraph): return MatchResult( article=chunk.article or "", paragraph=chunk.paragraph or "", method="hash", ) # Tier 2: Regex parse concatenated source citation = ctrl.get("source_citation") or {} source_str = citation.get("source", "") parsed = _parse_concatenated_source(source_str) if parsed and parsed["article"]: return MatchResult( article=parsed["article"], paragraph="", # Regex can't extract paragraph from concatenated format method="regex", ) # Tier 3: Ollama LLM if source_text: return await self._llm_match(ctrl) return None async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]: """Use Ollama to identify article/paragraph from source text.""" citation = ctrl.get("source_citation") or {} regulation_name = citation.get("source", "") metadata = ctrl.get("generation_metadata") or {} regulation_code = metadata.get("source_regulation", "") source_text = ctrl.get("source_original_text", "") prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz. Gesetz: {regulation_name} (Code: {regulation_code}) Text: --- {source_text[:2000]} --- Antworte NUR mit JSON: {{"article": "Art. XX", "paragraph": "Abs. Y"}} Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "". Falls kein Artikel erkennbar ist, setze article auf "". Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX".""" try: raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT) data = _parse_json(raw) if data and (data.get("article") or data.get("paragraph")): return MatchResult( article=data.get("article", ""), paragraph=data.get("paragraph", ""), method="llm", ) except Exception as e: logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e) return None def _update_control(self, ctrl: dict, match: MatchResult): """Update source_citation and generation_metadata in DB.""" citation = ctrl.get("source_citation") or {} # Clean the source name: remove concatenated article if present source_str = citation.get("source", "") parsed = _parse_concatenated_source(source_str) if parsed: citation["source"] = parsed["name"] # Add separate article/paragraph fields citation["article"] = match.article citation["paragraph"] = match.paragraph # Update generation_metadata metadata = ctrl.get("generation_metadata") or {} if match.article: metadata["source_article"] = match.article metadata["source_paragraph"] = match.paragraph metadata["backfill_method"] = match.method metadata["backfill_at"] = datetime.now(timezone.utc).isoformat() self.db.execute( text(""" UPDATE canonical_controls SET source_citation = :citation, generation_metadata = :metadata, updated_at = NOW() WHERE id = CAST(:id AS uuid) """), { "id": ctrl["id"], "citation": json.dumps(citation), "metadata": json.dumps(metadata), }, ) def _parse_concatenated_source(source: str) -> Optional[dict]: """Parse 'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}. Also handles '§' format: 'BDSG § 42' → {name: 'BDSG', article: '§ 42'}. """ if not source: return None # Try Art./Artikel pattern m = _SOURCE_ARTICLE_RE.match(source) if m: return {"name": m.group(1).strip(), "article": m.group(2).strip()} # Try § pattern m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source) if m2: return {"name": m2.group(1).strip(), "article": m2.group(2).strip()} return None async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str: """Call Ollama chat API for backfill matching.""" messages = [] if system_prompt: messages.append({"role": "system", "content": system_prompt}) messages.append({"role": "user", "content": prompt}) payload = { "model": OLLAMA_MODEL, "messages": messages, "stream": False, "format": "json", "options": {"num_predict": 256}, "think": False, } try: async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client: resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload) if resp.status_code != 200: logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300]) return "" data = resp.json() msg = data.get("message", {}) if isinstance(msg, dict): return msg.get("content", "") return data.get("response", str(msg)) except Exception as e: logger.error("Ollama backfill request failed: %s", e) return "" def _parse_json(raw: str) -> Optional[dict]: """Extract JSON object from LLM output.""" if not raw: return None # Try direct parse try: return json.loads(raw) except json.JSONDecodeError: pass # Try extracting from markdown code block m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL) if m: try: return json.loads(m.group(1)) except json.JSONDecodeError: pass # Try finding first { ... } m = re.search(r"\{[^{}]*\}", raw) if m: try: return json.loads(m.group(0)) except json.JSONDecodeError: pass return None