feat(controls): Zitierfaehigkeit — Embedding-Re-Link + Atom-Vererbung

citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die
re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus
article_label); rag_client reicht article_label durch (additiv, Default-Feld).
NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom
(license_rule != 3), iterativ. macmini-Apply verifiziert: Zitierfaehigkeit
6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-06-21 14:17:57 +02:00
parent ff4a743558
commit de542633e2
3 changed files with 238 additions and 111 deletions
+90 -111
View File
@@ -7,7 +7,6 @@ Citation Backfill Service — enrich existing controls with article/paragraph pr
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
"""
import hashlib
import json
import logging
import os
@@ -28,12 +27,13 @@ OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
ALL_COLLECTIONS = [
"bp_compliance_ce",
# Tier-1 semantic re-link: min cosine for a source_original_text → chunk match.
EMBED_THRESHOLD = float(os.getenv("CITATION_EMBED_THRESHOLD", "0.80"))
# Collections that carry re-ingested, article_label-bearing chunks.
RELINK_COLLECTIONS = [
"bp_compliance_gesetze",
"bp_compliance_datenschutz",
"bp_dsfa_corpus",
"bp_legal_templates",
"bp_compliance_ce",
]
BACKFILL_SYSTEM_PROMPT = (
@@ -51,13 +51,14 @@ _SOURCE_ARTICLE_RE = re.compile(
class MatchResult:
article: str
paragraph: str
method: str # "hash", "regex", "llm"
method: str # "embed", "regex", "llm"
source: str = "" # regulation short/name (embed tier sets the cleaned source)
@dataclass
class BackfillResult:
total_controls: int = 0
matched_hash: int = 0
matched_embed: int = 0
matched_regex: int = 0
matched_llm: int = 0
unmatched: int = 0
@@ -71,7 +72,6 @@ class CitationBackfill:
def __init__(self, db: Session, rag_client: ComplianceRAGClient):
self.db = db
self.rag = rag_client
self._rag_index: dict[str, RAGSearchResult] = {}
async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
"""Main entry: iterate controls missing article/paragraph, match to RAG, update."""
@@ -85,20 +85,10 @@ class CitationBackfill:
if not controls:
return result
# Collect hashes we need to find — only build index for controls with source text
needed_hashes: set[str] = set()
for ctrl in controls:
src = ctrl.get("source_original_text")
if src:
needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())
if needed_hashes:
# Build targeted RAG index — only scroll collections that our controls reference
logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
await self._build_rag_index_targeted(controls)
logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
else:
logger.info("No source_original_text found — skipping RAG index build")
# Tier-1 = per-control semantic search against the re-ingested, labeled chunks.
# (The old sha256(chunk.text) hash index died with re-chunking and is gone.)
with_source = sum(1 for c in controls if c.get("source_original_text"))
logger.info("Embedding-relink candidates (with source_original_text): %d", with_source)
# Process each control
for i, ctrl in enumerate(controls):
@@ -108,8 +98,8 @@ class CitationBackfill:
try:
match = await self._match_control(ctrl)
if match:
if match.method == "hash":
result.matched_hash += 1
if match.method == "embed":
result.matched_embed += 1
elif match.method == "regex":
result.matched_regex += 1
elif match.method == "llm":
@@ -139,8 +129,8 @@ class CitationBackfill:
result.errors.append(f"Commit failed: {e}")
logger.info(
"Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
result.total_controls, result.matched_hash, result.matched_regex,
"Backfill complete: %d total, embed=%d regex=%d llm=%d unmatched=%d updated=%d",
result.total_controls, result.matched_embed, result.matched_regex,
result.matched_llm, result.unmatched, result.updated,
)
return result
@@ -178,93 +168,13 @@ class CitationBackfill:
controls.append(ctrl)
return controls
async def _build_rag_index_targeted(self, controls: list[dict]):
"""Build RAG index by scrolling only collections relevant to our controls.
Uses regulation codes from generation_metadata to identify which collections
to search, falling back to all collections only if needed.
"""
# Determine which collections are relevant based on regulation codes
regulation_to_collection = self._map_regulations_to_collections(controls)
collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)
logger.info("Targeted index: searching %d collections: %s",
len(collections_to_search), ", ".join(collections_to_search))
for collection in collections_to_search:
offset = None
page = 0
seen_offsets: set[str] = set()
while True:
chunks, next_offset = await self.rag.scroll(
collection=collection, offset=offset, limit=200,
)
if not chunks:
break
for chunk in chunks:
if chunk.text and len(chunk.text.strip()) >= 50:
h = hashlib.sha256(chunk.text.encode()).hexdigest()
self._rag_index[h] = chunk
page += 1
if page % 50 == 0:
logger.info("Indexing %s: page %d (%d chunks so far)",
collection, page, len(self._rag_index))
if not next_offset:
break
if next_offset in seen_offsets:
logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
break
seen_offsets.add(next_offset)
offset = next_offset
logger.info("Indexed collection %s: %d pages", collection, page)
def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
"""Map regulation codes from controls to likely Qdrant collections."""
# Heuristic: regulation code prefix → collection
collection_map = {
"eu_": "bp_compliance_gesetze",
"dsgvo": "bp_compliance_datenschutz",
"bdsg": "bp_compliance_gesetze",
"ttdsg": "bp_compliance_gesetze",
"nist_": "bp_compliance_ce",
"owasp": "bp_compliance_ce",
"bsi_": "bp_compliance_ce",
"enisa": "bp_compliance_ce",
"at_": "bp_compliance_recht",
"fr_": "bp_compliance_recht",
"es_": "bp_compliance_recht",
}
result: dict[str, str] = {}
for ctrl in controls:
meta = ctrl.get("generation_metadata") or {}
reg = meta.get("source_regulation", "")
if not reg:
continue
for prefix, coll in collection_map.items():
if reg.startswith(prefix):
result[reg] = coll
break
else:
# Unknown regulation — search all
for coll in ALL_COLLECTIONS:
result[f"_all_{coll}"] = coll
return result
async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
"""3-tier matching: hash → regex → LLM."""
# Tier 1: Hash match against RAG index
source_text = ctrl.get("source_original_text")
if source_text:
h = hashlib.sha256(source_text.encode()).hexdigest()
chunk = self._rag_index.get(h)
if chunk and (chunk.article or chunk.paragraph):
return MatchResult(
article=chunk.article or "",
paragraph=chunk.paragraph or "",
method="hash",
)
# Tier 1: Semantic search against the re-ingested, labeled chunks
embed = await self._embedding_match(ctrl)
if embed:
return embed
# Tier 2: Regex parse concatenated source
citation = ctrl.get("source_citation") or {}
@@ -278,11 +188,60 @@ class CitationBackfill:
)
# Tier 3: Ollama LLM
if source_text:
if ctrl.get("source_original_text"):
return await self._llm_match(ctrl)
return None
async def _embedding_match(self, ctrl: dict) -> Optional[MatchResult]:
"""Tier 1: semantic-search source_original_text against the labeled chunks.
Takes the top hit (cosine >= EMBED_THRESHOLD) that carries a real article
and turns its article_label into a precise citation.
"""
source_text = ctrl.get("source_original_text")
if not source_text:
return None
query = source_text.strip()[:512]
best: Optional[RAGSearchResult] = None
for collection in self._collections_for(ctrl):
try:
hits = await self.rag.search(query, collection=collection, top_k=3)
except Exception as e:
logger.debug("embed search failed (%s): %s", collection, e)
hits = []
if hits and (best is None or hits[0].score > best.score):
best = hits[0]
if not best or best.score < EMBED_THRESHOLD:
return None
article = _article_part(best)
if not article:
return None
return MatchResult(
article=article,
paragraph=best.paragraph or "",
method="embed",
source=best.regulation_short or best.regulation_name or "",
)
def _collections_for(self, ctrl: dict) -> list[str]:
"""Likely collection(s) for a control's regulation; falls back to all three."""
meta = ctrl.get("generation_metadata") or {}
reg = (meta.get("source_regulation") or "").lower()
prefix_map = {
"eu_": "bp_compliance_gesetze", "bdsg": "bp_compliance_gesetze",
"de_": "bp_compliance_gesetze", "at_": "bp_compliance_gesetze",
"ch_": "bp_compliance_gesetze", "dsgvo": "bp_compliance_gesetze",
"trgs": "bp_compliance_ce", "trbs": "bp_compliance_ce", "asr": "bp_compliance_ce",
"nist": "bp_compliance_ce", "owasp": "bp_compliance_ce", "enisa": "bp_compliance_ce",
"edpb": "bp_compliance_datenschutz", "dsk": "bp_compliance_datenschutz",
"bfdi": "bp_compliance_datenschutz",
}
for prefix, coll in prefix_map.items():
if reg.startswith(prefix):
return [coll]
return list(RELINK_COLLECTIONS)
async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
"""Use Ollama to identify article/paragraph from source text."""
citation = ctrl.get("source_citation") or {}
@@ -331,6 +290,9 @@ Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
if parsed:
citation["source"] = parsed["name"]
# Embed tier carries the cleaned regulation name → prefer it as source.
if match.source:
citation["source"] = match.source
# Add separate article/paragraph fields
citation["article"] = match.article
citation["paragraph"] = match.paragraph
@@ -359,6 +321,23 @@ Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
)
def _article_part(chunk: RAGSearchResult) -> str:
"""Precise article from a chunk: article_label minus the regulation name.
'BDSG § 38' -> '§ 38'; 'Art. 39 DSGVO' -> 'Art. 39'; 'NIST SP 800-53r5 SA-12' -> 'SA-12'.
Falls back to the bare article field. Returns '' if only a doc-level name is present.
"""
label = (chunk.article_label or "").strip()
reg = (chunk.regulation_short or "").strip()
if label:
part = label
if reg and reg in label:
part = label.replace(reg, "").strip(" ,;-")
if part and part != reg:
return part
return (chunk.article or "").strip()
def _parse_concatenated_source(source: str) -> Optional[dict]:
"""Parse 'DSGVO Art. 35'{name: 'DSGVO', article: 'Art. 35'}.