Files
breakpilot-compliance/backend-compliance/compliance/services/citation_backfill.py
Benjamin Admin c52dbdb8f1
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
feat(rag): optimize RAG pipeline — JSON-Mode, CoT, Hybrid Search, Re-Ranking, Cross-Reg Dedup, chunk 1024
Phase 1 (LLM Quality):
- Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill)
- Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts

Phase 2 (Retrieval Quality):
- Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go)
- Fallback to dense-only search if Query API unavailable
- Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default)
- CPU-only PyTorch dependency to keep Docker image small

Phase 3 (Data Layer):
- Cross-regulation dedup pass (threshold 0.95) links controls across regulations
- DedupResult.link_type field distinguishes dedup_merge vs cross_regulation
- Chunk size defaults updated 512/50 → 1024/128 for new ingestions only
- Existing collections and controls are NOT affected

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-21 11:49:43 +01:00

439 lines
16 KiB
Python

"""
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
3-tier matching strategy:
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
"""
import hashlib
import json
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult
logger = logging.getLogger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
ALL_COLLECTIONS = [
"bp_compliance_ce",
"bp_compliance_gesetze",
"bp_compliance_datenschutz",
"bp_dsfa_corpus",
"bp_legal_templates",
]
BACKFILL_SYSTEM_PROMPT = (
"Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
"den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)
# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
_SOURCE_ARTICLE_RE = re.compile(
r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
@dataclass
class MatchResult:
article: str
paragraph: str
method: str # "hash", "regex", "llm"
@dataclass
class BackfillResult:
total_controls: int = 0
matched_hash: int = 0
matched_regex: int = 0
matched_llm: int = 0
unmatched: int = 0
updated: int = 0
errors: list = field(default_factory=list)
class CitationBackfill:
"""Backfill article/paragraph into existing control source_citations."""
def __init__(self, db: Session, rag_client: ComplianceRAGClient):
self.db = db
self.rag = rag_client
self._rag_index: dict[str, RAGSearchResult] = {}
async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
"""Main entry: iterate controls missing article/paragraph, match to RAG, update."""
result = BackfillResult()
# Load controls needing backfill
controls = self._load_controls_needing_backfill(limit)
result.total_controls = len(controls)
logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))
if not controls:
return result
# Collect hashes we need to find — only build index for controls with source text
needed_hashes: set[str] = set()
for ctrl in controls:
src = ctrl.get("source_original_text")
if src:
needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())
if needed_hashes:
# Build targeted RAG index — only scroll collections that our controls reference
logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
await self._build_rag_index_targeted(controls)
logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
else:
logger.info("No source_original_text found — skipping RAG index build")
# Process each control
for i, ctrl in enumerate(controls):
if i > 0 and i % 100 == 0:
logger.info("Backfill progress: %d/%d processed", i, result.total_controls)
try:
match = await self._match_control(ctrl)
if match:
if match.method == "hash":
result.matched_hash += 1
elif match.method == "regex":
result.matched_regex += 1
elif match.method == "llm":
result.matched_llm += 1
if not dry_run:
self._update_control(ctrl, match)
result.updated += 1
else:
logger.debug(
"DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
ctrl["control_id"], match.article, match.paragraph, match.method,
)
else:
result.unmatched += 1
except Exception as e:
error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
logger.error(error_msg)
result.errors.append(error_msg)
if not dry_run:
try:
self.db.commit()
except Exception as e:
logger.error("Backfill commit failed: %s", e)
result.errors.append(f"Commit failed: {e}")
logger.info(
"Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
result.total_controls, result.matched_hash, result.matched_regex,
result.matched_llm, result.unmatched, result.updated,
)
return result
def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
"""Load controls where source_citation exists but lacks separate 'article' key."""
query = """
SELECT id, control_id, source_citation, source_original_text,
generation_metadata, license_rule
FROM canonical_controls
WHERE license_rule IN (1, 2)
AND source_citation IS NOT NULL
AND (
source_citation->>'article' IS NULL
OR source_citation->>'article' = ''
)
ORDER BY control_id
"""
if limit > 0:
query += f" LIMIT {limit}"
result = self.db.execute(text(query))
cols = result.keys()
controls = []
for row in result:
ctrl = dict(zip(cols, row))
ctrl["id"] = str(ctrl["id"])
# Parse JSON fields
for jf in ("source_citation", "generation_metadata"):
if isinstance(ctrl.get(jf), str):
try:
ctrl[jf] = json.loads(ctrl[jf])
except (json.JSONDecodeError, TypeError):
ctrl[jf] = {}
controls.append(ctrl)
return controls
async def _build_rag_index_targeted(self, controls: list[dict]):
"""Build RAG index by scrolling only collections relevant to our controls.
Uses regulation codes from generation_metadata to identify which collections
to search, falling back to all collections only if needed.
"""
# Determine which collections are relevant based on regulation codes
regulation_to_collection = self._map_regulations_to_collections(controls)
collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)
logger.info("Targeted index: searching %d collections: %s",
len(collections_to_search), ", ".join(collections_to_search))
for collection in collections_to_search:
offset = None
page = 0
seen_offsets: set[str] = set()
while True:
chunks, next_offset = await self.rag.scroll(
collection=collection, offset=offset, limit=200,
)
if not chunks:
break
for chunk in chunks:
if chunk.text and len(chunk.text.strip()) >= 50:
h = hashlib.sha256(chunk.text.encode()).hexdigest()
self._rag_index[h] = chunk
page += 1
if page % 50 == 0:
logger.info("Indexing %s: page %d (%d chunks so far)",
collection, page, len(self._rag_index))
if not next_offset:
break
if next_offset in seen_offsets:
logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
break
seen_offsets.add(next_offset)
offset = next_offset
logger.info("Indexed collection %s: %d pages", collection, page)
def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
"""Map regulation codes from controls to likely Qdrant collections."""
# Heuristic: regulation code prefix → collection
collection_map = {
"eu_": "bp_compliance_gesetze",
"dsgvo": "bp_compliance_datenschutz",
"bdsg": "bp_compliance_gesetze",
"ttdsg": "bp_compliance_gesetze",
"nist_": "bp_compliance_ce",
"owasp": "bp_compliance_ce",
"bsi_": "bp_compliance_ce",
"enisa": "bp_compliance_ce",
"at_": "bp_compliance_recht",
"fr_": "bp_compliance_recht",
"es_": "bp_compliance_recht",
}
result: dict[str, str] = {}
for ctrl in controls:
meta = ctrl.get("generation_metadata") or {}
reg = meta.get("source_regulation", "")
if not reg:
continue
for prefix, coll in collection_map.items():
if reg.startswith(prefix):
result[reg] = coll
break
else:
# Unknown regulation — search all
for coll in ALL_COLLECTIONS:
result[f"_all_{coll}"] = coll
return result
async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
"""3-tier matching: hash → regex → LLM."""
# Tier 1: Hash match against RAG index
source_text = ctrl.get("source_original_text")
if source_text:
h = hashlib.sha256(source_text.encode()).hexdigest()
chunk = self._rag_index.get(h)
if chunk and (chunk.article or chunk.paragraph):
return MatchResult(
article=chunk.article or "",
paragraph=chunk.paragraph or "",
method="hash",
)
# Tier 2: Regex parse concatenated source
citation = ctrl.get("source_citation") or {}
source_str = citation.get("source", "")
parsed = _parse_concatenated_source(source_str)
if parsed and parsed["article"]:
return MatchResult(
article=parsed["article"],
paragraph="", # Regex can't extract paragraph from concatenated format
method="regex",
)
# Tier 3: Ollama LLM
if source_text:
return await self._llm_match(ctrl)
return None
async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
"""Use Ollama to identify article/paragraph from source text."""
citation = ctrl.get("source_citation") or {}
regulation_name = citation.get("source", "")
metadata = ctrl.get("generation_metadata") or {}
regulation_code = metadata.get("source_regulation", "")
source_text = ctrl.get("source_original_text", "")
prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.
Gesetz: {regulation_name} (Code: {regulation_code})
Text:
---
{source_text[:2000]}
---
Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}
Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
try:
raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
data = _parse_json(raw)
if data and (data.get("article") or data.get("paragraph")):
return MatchResult(
article=data.get("article", ""),
paragraph=data.get("paragraph", ""),
method="llm",
)
except Exception as e:
logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)
return None
def _update_control(self, ctrl: dict, match: MatchResult):
"""Update source_citation and generation_metadata in DB."""
citation = ctrl.get("source_citation") or {}
# Clean the source name: remove concatenated article if present
source_str = citation.get("source", "")
parsed = _parse_concatenated_source(source_str)
if parsed:
citation["source"] = parsed["name"]
# Add separate article/paragraph fields
citation["article"] = match.article
citation["paragraph"] = match.paragraph
# Update generation_metadata
metadata = ctrl.get("generation_metadata") or {}
if match.article:
metadata["source_article"] = match.article
metadata["source_paragraph"] = match.paragraph
metadata["backfill_method"] = match.method
metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()
self.db.execute(
text("""
UPDATE canonical_controls
SET source_citation = :citation,
generation_metadata = :metadata,
updated_at = NOW()
WHERE id = CAST(:id AS uuid)
"""),
{
"id": ctrl["id"],
"citation": json.dumps(citation),
"metadata": json.dumps(metadata),
},
)
def _parse_concatenated_source(source: str) -> Optional[dict]:
"""Parse 'DSGVO Art. 35'{name: 'DSGVO', article: 'Art. 35'}.
Also handles '§' format: 'BDSG § 42'{name: 'BDSG', article: '§ 42'}.
"""
if not source:
return None
# Try Art./Artikel pattern
m = _SOURCE_ARTICLE_RE.match(source)
if m:
return {"name": m.group(1).strip(), "article": m.group(2).strip()}
# Try § pattern
m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
if m2:
return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}
return None
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"""Call Ollama chat API for backfill matching."""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
payload = {
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"format": "json",
"options": {"num_predict": 256},
"think": False,
}
try:
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
if resp.status_code != 200:
logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
return ""
data = resp.json()
msg = data.get("message", {})
if isinstance(msg, dict):
return msg.get("content", "")
return data.get("response", str(msg))
except Exception as e:
logger.error("Ollama backfill request failed: %s", e)
return ""
def _parse_json(raw: str) -> Optional[dict]:
"""Extract JSON object from LLM output."""
if not raw:
return None
# Try direct parse
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
if m:
try:
return json.loads(m.group(1))
except json.JSONDecodeError:
pass
# Try finding first { ... }
m = re.search(r"\{[^{}]*\}", raw)
if m:
try:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
return None