de542633e2
citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus article_label); rag_client reicht article_label durch (additiv, Default-Feld). NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom (license_rule != 3), iterativ. macmini-Apply verifiziert: Zitierfaehigkeit 6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
418 lines
15 KiB
Python
418 lines
15 KiB
Python
"""
|
|
Citation Backfill Service — enrich existing controls with article/paragraph provenance.

3-tier matching strategy:
  Tier 1 — Embedding match: semantic search of source_original_text against the labeled RAG chunks
  Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
  Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from dataclasses import dataclass, field
|
|
from datetime import datetime, timezone
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
from sqlalchemy import text
|
|
from sqlalchemy.orm import Session
|
|
|
|
from .rag_client import ComplianceRAGClient, RAGSearchResult
|
|
|
|
logger = logging.getLogger(__name__)

# Ollama endpoint, model name and request timeout; each can be overridden via
# the environment. LLM_TIMEOUT is handed to the httpx client as its timeout.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

# Tier-1 semantic re-link: min cosine for a source_original_text → chunk match.
EMBED_THRESHOLD = float(os.getenv("CITATION_EMBED_THRESHOLD", "0.80"))
# Collections that carry re-ingested, article_label-bearing chunks.
RELINK_COLLECTIONS = [
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_compliance_ce",
]

# System prompt for the Tier-3 LLM call (runtime text, intentionally German).
BACKFILL_SYSTEM_PROMPT = (
    "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
    "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)

# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
# into group(1) = regulation name and group(2) = article (plus anything after it).
_SOURCE_ARTICLE_RE = re.compile(
    r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
|
|
|
|
|
|
@dataclass
class MatchResult:
    """Article/paragraph found for one control, plus the tier that found it."""

    article: str
    paragraph: str
    method: str  # "embed", "regex", "llm"
    source: str = ""  # regulation short/name (embed tier sets the cleaned source)
|
|
|
|
|
|
@dataclass
class BackfillResult:
    """Counters and error messages collected over one backfill run."""

    total_controls: int = 0  # controls loaded for processing
    matched_embed: int = 0  # matches with method "embed"
    matched_regex: int = 0  # matches with method "regex"
    matched_llm: int = 0  # matches with method "llm"
    unmatched: int = 0  # controls for which no tier produced a match
    updated: int = 0  # controls written to the DB (stays 0 in dry-run)
    # default_factory gives every result its own list instead of a shared one.
    errors: list[str] = field(default_factory=list)
|
|
|
|
|
|
class CitationBackfill:
    """Backfill article/paragraph into existing control source_citations.

    Each control is run through up to three tiers (semantic search, regex
    parse of the concatenated source, local LLM); the first tier that yields
    a result wins. Updates are only written when ``run`` is called with
    ``dry_run=False`` and are committed once at the end of the run.
    """

    # Regulation-code prefix → collection expected to hold that regulation's chunks.
    _PREFIX_TO_COLLECTION = {
        "eu_": "bp_compliance_gesetze", "bdsg": "bp_compliance_gesetze",
        "de_": "bp_compliance_gesetze", "at_": "bp_compliance_gesetze",
        "ch_": "bp_compliance_gesetze", "dsgvo": "bp_compliance_gesetze",
        "trgs": "bp_compliance_ce", "trbs": "bp_compliance_ce", "asr": "bp_compliance_ce",
        "nist": "bp_compliance_ce", "owasp": "bp_compliance_ce", "enisa": "bp_compliance_ce",
        "edpb": "bp_compliance_datenschutz", "dsk": "bp_compliance_datenschutz",
        "bfdi": "bp_compliance_datenschutz",
    }

    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        self.db = db
        self.rag = rag_client

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update.

        Args:
            dry_run: When True (default) nothing is written; matches are only
                counted and logged at debug level.
            limit: Maximum number of controls to load; 0 means no limit.

        Returns:
            A BackfillResult with per-tier match counters, the number of
            updated controls and any error messages. Per-control failures are
            recorded in ``errors`` and do not abort the run.
        """
        result = BackfillResult()

        # Load controls needing backfill
        controls = self._load_controls_needing_backfill(limit)
        result.total_controls = len(controls)
        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

        if not controls:
            return result

        # Tier-1 = per-control semantic search against the re-ingested, labeled chunks.
        # (The old sha256(chunk.text) hash index died with re-chunking and is gone.)
        with_source = sum(1 for c in controls if c.get("source_original_text"))
        logger.info("Embedding-relink candidates (with source_original_text): %d", with_source)

        # Process each control
        for i, ctrl in enumerate(controls):
            if i > 0 and i % 100 == 0:
                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

            try:
                match = await self._match_control(ctrl)
                if match:
                    if match.method == "embed":
                        result.matched_embed += 1
                    elif match.method == "regex":
                        result.matched_regex += 1
                    elif match.method == "llm":
                        result.matched_llm += 1

                    if not dry_run:
                        self._update_control(ctrl, match)
                        result.updated += 1
                    else:
                        logger.debug(
                            "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                            ctrl["control_id"], match.article, match.paragraph, match.method,
                        )
                else:
                    result.unmatched += 1

            except Exception as e:
                # Boundary handler: one bad control must not abort the whole run.
                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)

        if not dry_run:
            try:
                self.db.commit()
            except Exception as e:
                logger.error("Backfill commit failed: %s", e)
                result.errors.append(f"Commit failed: {e}")
                # All updates share this one transaction, so a failed commit
                # means none were persisted: release the failed transaction
                # and do not report them as updated.
                try:
                    self.db.rollback()
                except Exception as rollback_err:
                    logger.error("Backfill rollback failed: %s", rollback_err)
                result.updated = 0

        logger.info(
            "Backfill complete: %d total, embed=%d regex=%d llm=%d unmatched=%d updated=%d",
            result.total_controls, result.matched_embed, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result

    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
        """Load controls where source_citation exists but lacks separate 'article' key.

        Only rows with license_rule 1 or 2 are selected, ordered by control_id.
        ``limit`` > 0 caps the row count; it is sent as a bind parameter rather
        than being spliced into the SQL string.

        Returns:
            One dict per row. ``id`` is converted to ``str``; JSON columns that
            arrive as strings are decoded (``{}`` if they cannot be parsed).
        """
        query = """
            SELECT id, control_id, source_citation, source_original_text,
                   generation_metadata, license_rule
            FROM canonical_controls
            WHERE license_rule IN (1, 2)
              AND source_citation IS NOT NULL
              AND (
                  source_citation->>'article' IS NULL
                  OR source_citation->>'article' = ''
              )
            ORDER BY control_id
        """
        params = {}
        if limit > 0:
            query += " LIMIT :limit"
            params["limit"] = int(limit)

        result = self.db.execute(text(query), params)
        cols = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(cols, row))
            ctrl["id"] = str(ctrl["id"])
            # Parse JSON fields
            for jf in ("source_citation", "generation_metadata"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        ctrl[jf] = {}
            controls.append(ctrl)
        return controls

    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: embedding search → regex → LLM.

        Returns the first tier's result, or None when no tier matches. The LLM
        tier is only tried when the control has a source_original_text.
        """
        # Tier 1: Semantic search against the re-ingested, labeled chunks
        embed = await self._embedding_match(ctrl)
        if embed:
            return embed

        # Tier 2: Regex parse concatenated source
        citation = ctrl.get("source_citation") or {}
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed and parsed["article"]:
            return MatchResult(
                article=parsed["article"],
                paragraph="",  # Regex can't extract paragraph from concatenated format
                method="regex",
            )

        # Tier 3: Ollama LLM
        if ctrl.get("source_original_text"):
            return await self._llm_match(ctrl)

        return None

    async def _embedding_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Tier 1: semantic-search source_original_text against the labeled chunks.

        Queries each candidate collection with the first 512 characters of the
        stripped text and keeps the highest-scoring first hit across them.
        Returns None if there is no text, no hit, the best score is below
        EMBED_THRESHOLD, or the best hit yields no article.
        """
        source_text = ctrl.get("source_original_text")
        if not source_text:
            return None
        query = source_text.strip()[:512]
        best: Optional[RAGSearchResult] = None
        for collection in self._collections_for(ctrl):
            try:
                hits = await self.rag.search(query, collection=collection, top_k=3)
            except Exception as e:
                # Best-effort: a failing collection is treated as "no hits".
                logger.debug("embed search failed (%s): %s", collection, e)
                hits = []
            # NOTE(review): only hits[0] is considered, which presumes the RAG
            # client returns hits ordered by descending score — confirm.
            if hits and (best is None or hits[0].score > best.score):
                best = hits[0]
        if not best or best.score < EMBED_THRESHOLD:
            return None
        article = _article_part(best)
        if not article:
            return None
        return MatchResult(
            article=article,
            paragraph=best.paragraph or "",
            method="embed",
            source=best.regulation_short or best.regulation_name or "",
        )

    def _collections_for(self, ctrl: dict) -> list[str]:
        """Likely collection(s) for a control's regulation; falls back to all three.

        Looks at the lowercased ``generation_metadata.source_regulation`` and
        returns the single collection mapped to the first matching prefix,
        otherwise a copy of RELINK_COLLECTIONS.
        """
        meta = ctrl.get("generation_metadata") or {}
        reg = (meta.get("source_regulation") or "").lower()
        for prefix, coll in self._PREFIX_TO_COLLECTION.items():
            if reg.startswith(prefix):
                return [coll]
        return list(RELINK_COLLECTIONS)

    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text.

        Returns a MatchResult with method "llm" when the model's JSON answer
        contains a non-empty article or paragraph, otherwise None. Failures
        are logged as warnings and also result in None.
        """
        citation = ctrl.get("source_citation") or {}
        regulation_name = citation.get("source", "")
        metadata = ctrl.get("generation_metadata") or {}
        regulation_code = metadata.get("source_regulation", "")
        source_text = ctrl.get("source_original_text") or ""

        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.

Gesetz: {regulation_name} (Code: {regulation_code})

Text:
---
{source_text[:2000]}
---

Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}

Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

        try:
            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
            data = _parse_json(raw)
        except Exception as e:
            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)
            return None

        if not isinstance(data, dict):
            return None

        # The model may answer with null or a bare number; normalise both
        # fields to stripped strings so no JSON null ends up in the citation.
        article = str(data.get("article") or "").strip()
        paragraph = str(data.get("paragraph") or "").strip()
        if not (article or paragraph):
            return None
        return MatchResult(article=article, paragraph=paragraph, method="llm")

    def _update_control(self, ctrl: dict, match: MatchResult) -> None:
        """Update source_citation and generation_metadata in DB.

        Issues the UPDATE on the session but does not commit; the caller is
        responsible for committing. The control's citation/metadata dicts are
        modified in place before being serialised.
        """
        citation = ctrl.get("source_citation") or {}

        # Clean the source name: remove concatenated article if present
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed:
            citation["source"] = parsed["name"]

        # Embed tier carries the cleaned regulation name → prefer it as source.
        if match.source:
            citation["source"] = match.source
        # Add separate article/paragraph fields
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph

        # Update generation_metadata
        metadata = ctrl.get("generation_metadata") or {}
        if match.article:
            metadata["source_article"] = match.article
            metadata["source_paragraph"] = match.paragraph
        metadata["backfill_method"] = match.method
        metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()

        self.db.execute(
            text("""
                UPDATE canonical_controls
                SET source_citation = :citation,
                    generation_metadata = :metadata,
                    updated_at = NOW()
                WHERE id = CAST(:id AS uuid)
            """),
            {
                "id": ctrl["id"],
                "citation": json.dumps(citation),
                "metadata": json.dumps(metadata),
            },
        )
|
|
|
|
|
|
def _article_part(chunk: RAGSearchResult) -> str:
    """Precise article from a chunk: article_label minus the regulation name.

    'BDSG § 38' -> '§ 38'; 'Art. 39 DSGVO' -> 'Art. 39'; 'NIST SP 800-53r5 SA-12' -> 'SA-12'.
    Falls back to the bare article field. Returns '' if only a doc-level name is present.
    """
    label = (chunk.article_label or "").strip()
    reg = (chunk.regulation_short or "").strip()
    if label:
        part = label
        if reg and reg in label:
            # Remove the regulation name and any separator left dangling around it.
            part = label.replace(reg, "").strip(" ,;-")
        if part and part != reg:
            return part
    return (chunk.article or "").strip()
|
|
|
|
|
|
def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Parse 'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}.

    Also handles '§' format: 'BDSG § 42' → {name: 'BDSG', article: '§ 42'}.

    Returns None for an empty source or when neither pattern matches.
    """
    if not source:
        return None

    # Try Art./Artikel pattern
    m = _SOURCE_ARTICLE_RE.match(source)
    if m:
        return {"name": m.group(1).strip(), "article": m.group(2).strip()}

    # Try § pattern
    m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
    if m2:
        return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}

    return None
|
|
|
|
|
|
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call Ollama chat API for backfill matching.

    Posts a non-streaming, JSON-format chat request to ``{OLLAMA_URL}/api/chat``.

    Returns:
        The assistant message content, or "" on a non-200 response or any
        request/decoding error (errors are logged, never raised).
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    payload = {
        "model": OLLAMA_MODEL,
        "messages": messages,
        "stream": False,
        "format": "json",
        "options": {"num_predict": 256},
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=payload)
            if resp.status_code != 200:
                logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
                return ""
            data = resp.json()
            msg = data.get("message", {})
            if isinstance(msg, dict):
                return msg.get("content", "")
            # Non-dict "message": fall back to a top-level "response" field.
            return data.get("response", str(msg))
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""
|
|
|
|
|
|
def _parse_json(raw: str) -> Optional[dict]:
    """Extract JSON object from LLM output.

    Tries, in order: parsing the whole string, the contents of a markdown
    code fence, and finally the first ``{`` position from which a complete
    JSON object can be decoded. Only ``dict`` results are accepted, so a
    top-level list, string or number never leaks out to callers.

    Returns:
        The decoded object, or None if ``raw`` is empty or holds no JSON object.
    """
    if not raw:
        return None

    # Try direct parse
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        parsed = None
    if isinstance(parsed, dict):
        return parsed

    # Try extracting from markdown code block
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    if m:
        try:
            parsed = json.loads(m.group(1))
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed

    # Scan for the first "{" that starts a decodable object. raw_decode
    # tolerates trailing text and handles nested objects, which a flat
    # brace regex cannot.
    decoder = json.JSONDecoder()
    start = raw.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(raw, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return parsed
        start = raw.find("{", start + 1)

    return None
|