Files
breakpilot-core/control-pipeline/services/citation_backfill.py
T
Benjamin Admin de542633e2 feat(controls): Zitierfaehigkeit — Embedding-Re-Link + Atom-Vererbung
citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die
re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus
article_label); rag_client reicht article_label durch (additiv, Default-Feld).
NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom
(license_rule != 3), iterativ. macmini-Apply verifiziert: Zitierfaehigkeit
6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-21 14:17:57 +02:00

418 lines
15 KiB
Python

"""
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
3-tier matching strategy:
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
"""
import json
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional
import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult
logger = logging.getLogger(__name__)
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
# Tier-1 semantic re-link: min cosine for a source_original_text → chunk match.
EMBED_THRESHOLD = float(os.getenv("CITATION_EMBED_THRESHOLD", "0.80"))
# Collections that carry re-ingested, article_label-bearing chunks.
RELINK_COLLECTIONS = [
"bp_compliance_gesetze",
"bp_compliance_datenschutz",
"bp_compliance_ce",
]
BACKFILL_SYSTEM_PROMPT = (
"Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
"den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)
# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
_SOURCE_ARTICLE_RE = re.compile(
r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
@dataclass
class MatchResult:
article: str
paragraph: str
method: str # "embed", "regex", "llm"
source: str = "" # regulation short/name (embed tier sets the cleaned source)
@dataclass
class BackfillResult:
total_controls: int = 0
matched_embed: int = 0
matched_regex: int = 0
matched_llm: int = 0
unmatched: int = 0
updated: int = 0
errors: list = field(default_factory=list)
class CitationBackfill:
"""Backfill article/paragraph into existing control source_citations."""
def __init__(self, db: Session, rag_client: ComplianceRAGClient):
self.db = db
self.rag = rag_client
async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
"""Main entry: iterate controls missing article/paragraph, match to RAG, update."""
result = BackfillResult()
# Load controls needing backfill
controls = self._load_controls_needing_backfill(limit)
result.total_controls = len(controls)
logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))
if not controls:
return result
# Tier-1 = per-control semantic search against the re-ingested, labeled chunks.
# (The old sha256(chunk.text) hash index died with re-chunking and is gone.)
with_source = sum(1 for c in controls if c.get("source_original_text"))
logger.info("Embedding-relink candidates (with source_original_text): %d", with_source)
# Process each control
for i, ctrl in enumerate(controls):
if i > 0 and i % 100 == 0:
logger.info("Backfill progress: %d/%d processed", i, result.total_controls)
try:
match = await self._match_control(ctrl)
if match:
if match.method == "embed":
result.matched_embed += 1
elif match.method == "regex":
result.matched_regex += 1
elif match.method == "llm":
result.matched_llm += 1
if not dry_run:
self._update_control(ctrl, match)
result.updated += 1
else:
logger.debug(
"DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
ctrl["control_id"], match.article, match.paragraph, match.method,
)
else:
result.unmatched += 1
except Exception as e:
error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
logger.error(error_msg)
result.errors.append(error_msg)
if not dry_run:
try:
self.db.commit()
except Exception as e:
logger.error("Backfill commit failed: %s", e)
result.errors.append(f"Commit failed: {e}")
logger.info(
"Backfill complete: %d total, embed=%d regex=%d llm=%d unmatched=%d updated=%d",
result.total_controls, result.matched_embed, result.matched_regex,
result.matched_llm, result.unmatched, result.updated,
)
return result
def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
"""Load controls where source_citation exists but lacks separate 'article' key."""
query = """
SELECT id, control_id, source_citation, source_original_text,
generation_metadata, license_rule
FROM canonical_controls
WHERE license_rule IN (1, 2)
AND source_citation IS NOT NULL
AND (
source_citation->>'article' IS NULL
OR source_citation->>'article' = ''
)
ORDER BY control_id
"""
if limit > 0:
query += f" LIMIT {limit}"
result = self.db.execute(text(query))
cols = result.keys()
controls = []
for row in result:
ctrl = dict(zip(cols, row))
ctrl["id"] = str(ctrl["id"])
# Parse JSON fields
for jf in ("source_citation", "generation_metadata"):
if isinstance(ctrl.get(jf), str):
try:
ctrl[jf] = json.loads(ctrl[jf])
except (json.JSONDecodeError, TypeError):
ctrl[jf] = {}
controls.append(ctrl)
return controls
async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
"""3-tier matching: hash → regex → LLM."""
# Tier 1: Semantic search against the re-ingested, labeled chunks
embed = await self._embedding_match(ctrl)
if embed:
return embed
# Tier 2: Regex parse concatenated source
citation = ctrl.get("source_citation") or {}
source_str = citation.get("source", "")
parsed = _parse_concatenated_source(source_str)
if parsed and parsed["article"]:
return MatchResult(
article=parsed["article"],
paragraph="", # Regex can't extract paragraph from concatenated format
method="regex",
)
# Tier 3: Ollama LLM
if ctrl.get("source_original_text"):
return await self._llm_match(ctrl)
return None
async def _embedding_match(self, ctrl: dict) -> Optional[MatchResult]:
"""Tier 1: semantic-search source_original_text against the labeled chunks.
Takes the top hit (cosine >= EMBED_THRESHOLD) that carries a real article
and turns its article_label into a precise citation.
"""
source_text = ctrl.get("source_original_text")
if not source_text:
return None
query = source_text.strip()[:512]
best: Optional[RAGSearchResult] = None
for collection in self._collections_for(ctrl):
try:
hits = await self.rag.search(query, collection=collection, top_k=3)
except Exception as e:
logger.debug("embed search failed (%s): %s", collection, e)
hits = []
if hits and (best is None or hits[0].score > best.score):
best = hits[0]
if not best or best.score < EMBED_THRESHOLD:
return None
article = _article_part(best)
if not article:
return None
return MatchResult(
article=article,
paragraph=best.paragraph or "",
method="embed",
source=best.regulation_short or best.regulation_name or "",
)
def _collections_for(self, ctrl: dict) -> list[str]:
"""Likely collection(s) for a control's regulation; falls back to all three."""
meta = ctrl.get("generation_metadata") or {}
reg = (meta.get("source_regulation") or "").lower()
prefix_map = {
"eu_": "bp_compliance_gesetze", "bdsg": "bp_compliance_gesetze",
"de_": "bp_compliance_gesetze", "at_": "bp_compliance_gesetze",
"ch_": "bp_compliance_gesetze", "dsgvo": "bp_compliance_gesetze",
"trgs": "bp_compliance_ce", "trbs": "bp_compliance_ce", "asr": "bp_compliance_ce",
"nist": "bp_compliance_ce", "owasp": "bp_compliance_ce", "enisa": "bp_compliance_ce",
"edpb": "bp_compliance_datenschutz", "dsk": "bp_compliance_datenschutz",
"bfdi": "bp_compliance_datenschutz",
}
for prefix, coll in prefix_map.items():
if reg.startswith(prefix):
return [coll]
return list(RELINK_COLLECTIONS)
async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
"""Use Ollama to identify article/paragraph from source text."""
citation = ctrl.get("source_citation") or {}
regulation_name = citation.get("source", "")
metadata = ctrl.get("generation_metadata") or {}
regulation_code = metadata.get("source_regulation", "")
source_text = ctrl.get("source_original_text", "")
prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.
Gesetz: {regulation_name} (Code: {regulation_code})
Text:
---
{source_text[:2000]}
---
Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}
Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
try:
raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
data = _parse_json(raw)
if data and (data.get("article") or data.get("paragraph")):
return MatchResult(
article=data.get("article", ""),
paragraph=data.get("paragraph", ""),
method="llm",
)
except Exception as e:
logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)
return None
def _update_control(self, ctrl: dict, match: MatchResult):
"""Update source_citation and generation_metadata in DB."""
citation = ctrl.get("source_citation") or {}
# Clean the source name: remove concatenated article if present
source_str = citation.get("source", "")
parsed = _parse_concatenated_source(source_str)
if parsed:
citation["source"] = parsed["name"]
# Embed tier carries the cleaned regulation name → prefer it as source.
if match.source:
citation["source"] = match.source
# Add separate article/paragraph fields
citation["article"] = match.article
citation["paragraph"] = match.paragraph
# Update generation_metadata
metadata = ctrl.get("generation_metadata") or {}
if match.article:
metadata["source_article"] = match.article
metadata["source_paragraph"] = match.paragraph
metadata["backfill_method"] = match.method
metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()
self.db.execute(
text("""
UPDATE canonical_controls
SET source_citation = :citation,
generation_metadata = :metadata,
updated_at = NOW()
WHERE id = CAST(:id AS uuid)
"""),
{
"id": ctrl["id"],
"citation": json.dumps(citation),
"metadata": json.dumps(metadata),
},
)
def _article_part(chunk: RAGSearchResult) -> str:
"""Precise article from a chunk: article_label minus the regulation name.
'BDSG § 38' -> '§ 38'; 'Art. 39 DSGVO' -> 'Art. 39'; 'NIST SP 800-53r5 SA-12' -> 'SA-12'.
Falls back to the bare article field. Returns '' if only a doc-level name is present.
"""
label = (chunk.article_label or "").strip()
reg = (chunk.regulation_short or "").strip()
if label:
part = label
if reg and reg in label:
part = label.replace(reg, "").strip(" ,;-")
if part and part != reg:
return part
return (chunk.article or "").strip()
def _parse_concatenated_source(source: str) -> Optional[dict]:
"""Parse 'DSGVO Art. 35'{name: 'DSGVO', article: 'Art. 35'}.
Also handles '§' format: 'BDSG § 42'{name: 'BDSG', article: '§ 42'}.
"""
if not source:
return None
# Try Art./Artikel pattern
m = _SOURCE_ARTICLE_RE.match(source)
if m:
return {"name": m.group(1).strip(), "article": m.group(2).strip()}
# Try § pattern
m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
if m2:
return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}
return None
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
"""Call Ollama chat API for backfill matching."""
messages = []
if system_prompt:
messages.append({"role": "system", "content": system_prompt})
messages.append({"role": "user", "content": prompt})
payload = {
"model": OLLAMA_MODEL,
"messages": messages,
"stream": False,
"format": "json",
"options": {"num_predict": 256},
"think": False,
}
try:
async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
if resp.status_code != 200:
logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
return ""
data = resp.json()
msg = data.get("message", {})
if isinstance(msg, dict):
return msg.get("content", "")
return data.get("response", str(msg))
except Exception as e:
logger.error("Ollama backfill request failed: %s", e)
return ""
def _parse_json(raw: str) -> Optional[dict]:
"""Extract JSON object from LLM output."""
if not raw:
return None
# Try direct parse
try:
return json.loads(raw)
except json.JSONDecodeError:
pass
# Try extracting from markdown code block
m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
if m:
try:
return json.loads(m.group(1))
except json.JSONDecodeError:
pass
# Try finding first { ... }
m = re.search(r"\{[^{}]*\}", raw)
if m:
try:
return json.loads(m.group(0))
except json.JSONDecodeError:
pass
return None