Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 42s
CI/CD / test-python-backend-compliance (push) Successful in 1m38s
CI/CD / test-python-document-crawler (push) Successful in 20s
CI/CD / test-python-dsms-gateway (push) Successful in 17s
CI/CD / validate-canonical-controls (push) Successful in 10s
CI/CD / Deploy (push) Has been skipped
Phase 1 (LLM Quality): - Add format=json to all Ollama payloads (obligation_extractor, control_generator, citation_backfill) - Add Chain-of-Thought analysis steps to Pass 0a/0b system prompts Phase 2 (Retrieval Quality): - Hybrid search via Qdrant Query API with RRF fusion + automatic text index (legal_rag.go) - Fallback to dense-only search if Query API unavailable - Cross-encoder re-ranking with BGE Reranker v2 (RERANK_ENABLED=false by default) - CPU-only PyTorch dependency to keep Docker image small Phase 3 (Data Layer): - Cross-regulation dedup pass (threshold 0.95) links controls across regulations - DedupResult.link_type field distinguishes dedup_merge vs cross_regulation - Chunk size defaults updated 512/50 → 1024/128 for new ingestions only - Existing collections and controls are NOT affected Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
439 lines
16 KiB
Python
439 lines
16 KiB
Python
"""
|
|
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
|
|
|
|
3-tier matching strategy:
|
|
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
|
|
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
|
|
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
|
|
"""
|
|
|
|
import hashlib
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from .rag_client import ComplianceRAGClient, RAGSearchResult
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
|
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
|
|
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
|
|
|
|
ALL_COLLECTIONS = [
|
|
"bp_compliance_ce",
|
|
"bp_compliance_gesetze",
|
|
"bp_compliance_datenschutz",
|
|
"bp_dsfa_corpus",
|
|
"bp_legal_templates",
|
|
]
|
|
|
|
BACKFILL_SYSTEM_PROMPT = (
|
|
"Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
|
|
"den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
|
|
)
|
|
|
|
# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
|
|
_SOURCE_ARTICLE_RE = re.compile(
|
|
r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
|
|
)
|
|
|
|
|
|
@dataclass
class MatchResult:
    """Outcome of matching one control: the article/paragraph that was
    identified and which tier produced it."""

    article: str
    paragraph: str
    method: str  # one of: "hash", "regex", "llm"
|
|
|
|
|
|
@dataclass
class BackfillResult:
    """Aggregate counters for one backfill run."""

    total_controls: int = 0  # controls loaded for processing
    matched_hash: int = 0    # Tier-1 hash matches
    matched_regex: int = 0   # Tier-2 regex matches
    matched_llm: int = 0     # Tier-3 LLM matches
    unmatched: int = 0       # controls no tier could match
    updated: int = 0         # rows actually written (non-dry-run only)
    errors: list = field(default_factory=list)  # human-readable error messages
|
|
|
|
|
|
class CitationBackfill:
    """Backfill article/paragraph into existing control source_citations.

    For every control whose source_citation lacks a separate 'article' key,
    three matching tiers are tried in order: hash lookup against a pre-built
    RAG chunk index, regex parsing of the concatenated source string, and
    finally a local Ollama LLM call.
    """

    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        self.db = db
        self.rag = rag_client
        # sha256(chunk.text) -> chunk; filled by _build_rag_index_targeted().
        self._rag_index: dict[str, RAGSearchResult] = {}

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update.

        Args:
            dry_run: when True (default), match and count but write nothing.
            limit: process at most this many controls (0 = unlimited).

        Returns:
            BackfillResult with per-tier counters, update count and errors.
        """
        result = BackfillResult()

        controls = self._load_controls_needing_backfill(limit)
        result.total_controls = len(controls)
        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

        if not controls:
            return result

        # Only build the (expensive) RAG hash index when at least one control
        # actually carries a source text we could hash-match.
        needed_hashes: set[str] = {
            hashlib.sha256(src.encode()).hexdigest()
            for ctrl in controls
            if (src := ctrl.get("source_original_text"))
        }

        if needed_hashes:
            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
            await self._build_rag_index_targeted(controls)
            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
        else:
            logger.info("No source_original_text found — skipping RAG index build")

        # Process each control; one failure must not abort the whole run.
        for i, ctrl in enumerate(controls):
            if i > 0 and i % 100 == 0:
                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

            try:
                match = await self._match_control(ctrl)
                if match is None:
                    result.unmatched += 1
                    continue

                self._tally(result, match)

                if dry_run:
                    logger.debug(
                        "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                        ctrl["control_id"], match.article, match.paragraph, match.method,
                    )
                else:
                    self._update_control(ctrl, match)
                    result.updated += 1

            except Exception as e:
                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)

        # Single commit at the end — _update_control only executes, never commits.
        if not dry_run:
            try:
                self.db.commit()
            except Exception as e:
                logger.error("Backfill commit failed: %s", e)
                result.errors.append(f"Commit failed: {e}")

        logger.info(
            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
            result.total_controls, result.matched_hash, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result

    @staticmethod
    def _tally(result: BackfillResult, match: MatchResult) -> None:
        """Increment the counter for the tier that produced `match`."""
        if match.method == "hash":
            result.matched_hash += 1
        elif match.method == "regex":
            result.matched_regex += 1
        elif match.method == "llm":
            result.matched_llm += 1

    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
        """Load controls where source_citation exists but lacks a separate 'article' key.

        Args:
            limit: optional row cap (0 = no LIMIT clause).

        Returns:
            One dict per control row, with JSON columns normalized to dicts
            and the UUID primary key stringified.
        """
        query = """
            SELECT id, control_id, source_citation, source_original_text,
                   generation_metadata, license_rule
            FROM canonical_controls
            WHERE license_rule IN (1, 2)
              AND source_citation IS NOT NULL
              AND (
                  source_citation->>'article' IS NULL
                  OR source_citation->>'article' = ''
              )
            ORDER BY control_id
        """
        params: dict = {}
        if limit > 0:
            # Bind the limit instead of f-string interpolation — keeps the
            # statement injection-safe even if `limit` ever stops being an int.
            query += " LIMIT :limit"
            params["limit"] = limit

        result = self.db.execute(text(query), params)
        cols = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(cols, row))
            ctrl["id"] = str(ctrl["id"])
            # JSON(B) columns may arrive as raw strings depending on the driver.
            for jf in ("source_citation", "generation_metadata"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        ctrl[jf] = {}
            controls.append(ctrl)
        return controls

    async def _build_rag_index_targeted(self, controls: list[dict]):
        """Build the sha256 → chunk index by scrolling only relevant collections.

        Uses regulation codes from generation_metadata to identify which
        collections to scroll, falling back to all collections if no code
        maps. A failure in one collection is logged and skipped so the
        remaining collections still get indexed.
        """
        regulation_to_collection = self._map_regulations_to_collections(controls)
        collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)

        logger.info("Targeted index: searching %d collections: %s",
                    len(collections_to_search), ", ".join(collections_to_search))

        for collection in collections_to_search:
            try:
                pages = await self._index_collection(collection)
            except Exception as e:
                # A missing or unreachable collection must not abort the run.
                logger.error("Indexing collection %s failed: %s", collection, e)
                continue
            logger.info("Indexed collection %s: %d pages", collection, pages)

    async def _index_collection(self, collection: str) -> int:
        """Scroll one collection into self._rag_index; return pages consumed."""
        offset = None
        page = 0
        seen_offsets: set = set()  # cursor-loop guard; offset type is backend-defined
        while True:
            chunks, next_offset = await self.rag.scroll(
                collection=collection, offset=offset, limit=200,
            )
            if not chunks:
                break
            for chunk in chunks:
                # Very short chunks cannot be a control's full source text;
                # skip them to keep the index small.
                if chunk.text and len(chunk.text.strip()) >= 50:
                    h = hashlib.sha256(chunk.text.encode()).hexdigest()
                    self._rag_index[h] = chunk
            page += 1
            if page % 50 == 0:
                logger.info("Indexing %s: page %d (%d chunks so far)",
                            collection, page, len(self._rag_index))
            if not next_offset:
                break
            if next_offset in seen_offsets:
                logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
                break
            seen_offsets.add(next_offset)
            offset = next_offset
        return page

    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
        """Map regulation codes from controls to likely Qdrant collections.

        Returns regulation code (or an "_all_<coll>" placeholder for unknown
        codes) → collection name; callers only consume the value set.
        """
        # Heuristic: regulation code prefix → collection.
        # NOTE(review): "bp_compliance_recht" is not listed in ALL_COLLECTIONS —
        # confirm that collection actually exists in Qdrant, otherwise the
        # at_/fr_/es_ prefixes target a collection the scroll cannot serve.
        collection_map = {
            "eu_": "bp_compliance_gesetze",
            "dsgvo": "bp_compliance_datenschutz",
            "bdsg": "bp_compliance_gesetze",
            "ttdsg": "bp_compliance_gesetze",
            "nist_": "bp_compliance_ce",
            "owasp": "bp_compliance_ce",
            "bsi_": "bp_compliance_ce",
            "enisa": "bp_compliance_ce",
            "at_": "bp_compliance_recht",
            "fr_": "bp_compliance_recht",
            "es_": "bp_compliance_recht",
        }
        result: dict[str, str] = {}
        for ctrl in controls:
            meta = ctrl.get("generation_metadata") or {}
            reg = meta.get("source_regulation", "")
            if not reg:
                continue
            for prefix, coll in collection_map.items():
                if reg.startswith(prefix):
                    result[reg] = coll
                    break
            else:
                # Unknown regulation — fall back to scanning every collection.
                for coll in ALL_COLLECTIONS:
                    result[f"_all_{coll}"] = coll
        return result

    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: hash → regex → LLM. Returns None when all tiers fail."""

        # Tier 1: exact hash match against the RAG chunk index.
        source_text = ctrl.get("source_original_text")
        if source_text:
            h = hashlib.sha256(source_text.encode()).hexdigest()
            chunk = self._rag_index.get(h)
            if chunk and (chunk.article or chunk.paragraph):
                return MatchResult(
                    article=chunk.article or "",
                    paragraph=chunk.paragraph or "",
                    method="hash",
                )

        # Tier 2: parse a concatenated source like "DSGVO Art. 35".
        citation = ctrl.get("source_citation") or {}
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed and parsed["article"]:
            return MatchResult(
                article=parsed["article"],
                paragraph="",  # Regex can't extract paragraph from concatenated format
                method="regex",
            )

        # Tier 3: ask the local Ollama model — only useful with source text.
        if source_text:
            return await self._llm_match(ctrl)

        return None

    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text.

        Returns None when the LLM yields no usable fields or the call fails.
        """
        citation = ctrl.get("source_citation") or {}
        regulation_name = citation.get("source", "")
        metadata = ctrl.get("generation_metadata") or {}
        regulation_code = metadata.get("source_regulation", "")
        source_text = ctrl.get("source_original_text", "")

        # German prompt — the corpus and expected answers are German-language.
        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.

Gesetz: {regulation_name} (Code: {regulation_code})

Text:
---
{source_text[:2000]}
---

Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}

Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

        try:
            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
            data = _parse_json(raw)
            if data and (data.get("article") or data.get("paragraph")):
                return MatchResult(
                    article=data.get("article", ""),
                    paragraph=data.get("paragraph", ""),
                    method="llm",
                )
        except Exception as e:
            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)

        return None

    def _update_control(self, ctrl: dict, match: MatchResult):
        """Write article/paragraph into source_citation and generation_metadata.

        Executes the UPDATE but does not commit — run() commits once at the end.
        """
        citation = ctrl.get("source_citation") or {}

        # Strip a concatenated article from the source name if present
        # ("DSGVO Art. 35" -> "DSGVO"); the article now lives in its own field.
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed:
            citation["source"] = parsed["name"]

        # Add separate article/paragraph fields.
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph

        # Record backfill provenance in generation_metadata.
        # NOTE(review): provenance is only written when an article was found;
        # a paragraph-only match leaves metadata untouched — confirm intended.
        metadata = ctrl.get("generation_metadata") or {}
        if match.article:
            metadata["source_article"] = match.article
            metadata["source_paragraph"] = match.paragraph
            metadata["backfill_method"] = match.method
            metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()

        self.db.execute(
            text("""
                UPDATE canonical_controls
                SET source_citation = :citation,
                    generation_metadata = :metadata,
                    updated_at = NOW()
                WHERE id = CAST(:id AS uuid)
            """),
            {
                "id": ctrl["id"],
                "citation": json.dumps(citation),
                "metadata": json.dumps(metadata),
            },
        )
|
|
|
|
|
|
def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Split a concatenated citation source into name + article parts.

    'DSGVO Art. 35'  -> {'name': 'DSGVO', 'article': 'Art. 35'}
    'BDSG § 42'      -> {'name': 'BDSG', 'article': '§ 42'}

    Returns None for empty input or when no article reference is found.
    """
    if not source:
        return None

    # Try the Art./Artikel notation first, then the German '§' notation.
    patterns = (_SOURCE_ARTICLE_RE, re.compile(r"^(.+?)\s+(§\s*\d+.*)$"))
    for pattern in patterns:
        hit = pattern.match(source)
        if hit:
            return {"name": hit.group(1).strip(), "article": hit.group(2).strip()}

    return None
|
|
|
|
|
|
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call the Ollama chat API and return the raw message content.

    Returns "" on any transport, HTTP, or parsing failure — callers treat
    an empty string as "no LLM answer", so this never raises.
    """
    messages = [{"role": "user", "content": prompt}]
    if system_prompt:
        messages.insert(0, {"role": "system", "content": system_prompt})

    payload = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        # Constrain output to JSON, keep answers short, disable thinking mode.
        "format": "json",
        "options": {"num_predict": 256},
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            response = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            if response.status_code != 200:
                logger.error("Ollama backfill failed %d: %s", response.status_code, response.text[:300])
                return ""
            body = response.json()
            message = body.get("message", {})
            if isinstance(message, dict):
                return message.get("content", "")
            # Fall back to the legacy /api/generate response shape.
            return body.get("response", str(message))
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""
|
|
|
|
|
|
def _parse_json(raw: str) -> Optional[dict]:
|
|
"""Extract JSON object from LLM output."""
|
|
if not raw:
|
|
return None
|
|
# Try direct parse
|
|
try:
|
|
return json.loads(raw)
|
|
except json.JSONDecodeError:
|
|
pass
|
|
# Try extracting from markdown code block
|
|
m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
|
|
if m:
|
|
try:
|
|
return json.loads(m.group(1))
|
|
except json.JSONDecodeError:
|
|
pass
|
|
# Try finding first { ... }
|
|
m = re.search(r"\{[^{}]*\}", raw)
|
|
if m:
|
|
try:
|
|
return json.loads(m.group(0))
|
|
except json.JSONDecodeError:
|
|
pass
|
|
return None
|