feat(controls): Zitierfaehigkeit — Embedding-Re-Link + Atom-Vererbung
citation_backfill Tier-1 von totem sha256-Hash auf Semantik-Suche gegen die re-ingestierten, article_label-tragenden Chunks umgestellt (Fundstelle aus article_label); rag_client reicht article_label durch (additiv, Default-Feld). NEU: scripts/atom_citation_inheritance.py vererbt source_citation parent->atom (license_rule != 3), iterativ. macmini-Apply verifiziert: Zitierfaehigkeit 6.9%->61.3% (+171.765 Atome), Stichprobe korrekt (Atom == Parent-Fundstelle). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -7,7 +7,6 @@ Citation Backfill Service — enrich existing controls with article/paragraph pr
|
||||
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
@@ -28,12 +27,13 @@ OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
|
||||
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
|
||||
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))
|
||||
|
||||
ALL_COLLECTIONS = [
|
||||
"bp_compliance_ce",
|
||||
# Tier-1 semantic re-link: min cosine for a source_original_text → chunk match.
|
||||
EMBED_THRESHOLD = float(os.getenv("CITATION_EMBED_THRESHOLD", "0.80"))
|
||||
# Collections that carry re-ingested, article_label-bearing chunks.
|
||||
RELINK_COLLECTIONS = [
|
||||
"bp_compliance_gesetze",
|
||||
"bp_compliance_datenschutz",
|
||||
"bp_dsfa_corpus",
|
||||
"bp_legal_templates",
|
||||
"bp_compliance_ce",
|
||||
]
|
||||
|
||||
BACKFILL_SYSTEM_PROMPT = (
|
||||
@@ -51,13 +51,14 @@ _SOURCE_ARTICLE_RE = re.compile(
|
||||
class MatchResult:
|
||||
article: str
|
||||
paragraph: str
|
||||
method: str # "hash", "regex", "llm"
|
||||
method: str # "embed", "regex", "llm"
|
||||
source: str = "" # regulation short/name (embed tier sets the cleaned source)
|
||||
|
||||
|
||||
@dataclass
|
||||
class BackfillResult:
|
||||
total_controls: int = 0
|
||||
matched_hash: int = 0
|
||||
matched_embed: int = 0
|
||||
matched_regex: int = 0
|
||||
matched_llm: int = 0
|
||||
unmatched: int = 0
|
||||
@@ -71,7 +72,6 @@ class CitationBackfill:
|
||||
def __init__(self, db: Session, rag_client: ComplianceRAGClient):
    """Store the collaborators and start with an empty chunk index.

    Args:
        db: Database session kept on ``self.db``.
        rag_client: RAG client kept on ``self.rag``.
    """
    self.db, self.rag = db, rag_client
    # Lookup table of RAG chunks keyed by a string; starts empty.
    self._rag_index: dict[str, RAGSearchResult] = dict()
|
||||
|
||||
async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
|
||||
"""Main entry: iterate controls missing article/paragraph, match to RAG, update."""
|
||||
@@ -85,20 +85,10 @@ class CitationBackfill:
|
||||
if not controls:
|
||||
return result
|
||||
|
||||
# Collect hashes we need to find — only build index for controls with source text
|
||||
needed_hashes: set[str] = set()
|
||||
for ctrl in controls:
|
||||
src = ctrl.get("source_original_text")
|
||||
if src:
|
||||
needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())
|
||||
|
||||
if needed_hashes:
|
||||
# Build targeted RAG index — only scroll collections that our controls reference
|
||||
logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
|
||||
await self._build_rag_index_targeted(controls)
|
||||
logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
|
||||
else:
|
||||
logger.info("No source_original_text found — skipping RAG index build")
|
||||
# Tier-1 = per-control semantic search against the re-ingested, labeled chunks.
|
||||
# (The old sha256(chunk.text) hash index died with re-chunking and is gone.)
|
||||
with_source = sum(1 for c in controls if c.get("source_original_text"))
|
||||
logger.info("Embedding-relink candidates (with source_original_text): %d", with_source)
|
||||
|
||||
# Process each control
|
||||
for i, ctrl in enumerate(controls):
|
||||
@@ -108,8 +98,8 @@ class CitationBackfill:
|
||||
try:
|
||||
match = await self._match_control(ctrl)
|
||||
if match:
|
||||
if match.method == "hash":
|
||||
result.matched_hash += 1
|
||||
if match.method == "embed":
|
||||
result.matched_embed += 1
|
||||
elif match.method == "regex":
|
||||
result.matched_regex += 1
|
||||
elif match.method == "llm":
|
||||
@@ -139,8 +129,8 @@ class CitationBackfill:
|
||||
result.errors.append(f"Commit failed: {e}")
|
||||
|
||||
logger.info(
|
||||
"Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
|
||||
result.total_controls, result.matched_hash, result.matched_regex,
|
||||
"Backfill complete: %d total, embed=%d regex=%d llm=%d unmatched=%d updated=%d",
|
||||
result.total_controls, result.matched_embed, result.matched_regex,
|
||||
result.matched_llm, result.unmatched, result.updated,
|
||||
)
|
||||
return result
|
||||
@@ -178,93 +168,13 @@ class CitationBackfill:
|
||||
controls.append(ctrl)
|
||||
return controls
|
||||
|
||||
async def _build_rag_index_targeted(self, controls: list[dict]):
    """Fill ``self._rag_index`` by scrolling the collections the controls point at.

    The collections come from ``_map_regulations_to_collections``; when that
    mapping is empty, every entry of ``ALL_COLLECTIONS`` is scrolled instead.
    Each chunk whose stripped text has at least 50 characters is stored under
    the sha256 hex digest of its (unstripped) text.

    Args:
        controls: Control dicts handed through to the regulation mapper.
    """
    mapping = self._map_regulations_to_collections(controls)
    targets = set(mapping.values())
    if not targets:
        targets = set(ALL_COLLECTIONS)

    logger.info(
        "Targeted index: searching %d collections: %s",
        len(targets),
        ", ".join(targets),
    )

    for collection in targets:
        cursor = None
        pages_read = 0
        visited: set[str] = set()

        while True:
            batch, upcoming = await self.rag.scroll(
                collection=collection, offset=cursor, limit=200,
            )
            if not batch:
                break

            for item in batch:
                body = item.text
                # Skip empty and very short chunks.
                if not body or len(body.strip()) < 50:
                    continue
                digest = hashlib.sha256(body.encode()).hexdigest()
                self._rag_index[digest] = item

            pages_read += 1
            if pages_read % 50 == 0:
                logger.info(
                    "Indexing %s: page %d (%d chunks so far)",
                    collection, pages_read, len(self._rag_index),
                )

            if not upcoming:
                break
            # A repeated offset means the scroll is cycling; bail out.
            if upcoming in visited:
                logger.warning(
                    "Scroll loop in %s at page %d — stopping", collection, pages_read,
                )
                break
            visited.add(upcoming)
            cursor = upcoming

        logger.info("Indexed collection %s: %d pages", collection, pages_read)
|
||||
|
||||
def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
    """Resolve each control's regulation code to a collection name by prefix.

    Args:
        controls: Control dicts; ``generation_metadata["source_regulation"]``
            is read from each. Controls without a regulation code are skipped.

    Returns:
        ``{regulation_code: collection}`` for codes with a known prefix. A code
        with no known prefix contributes one ``"_all_<collection>"`` entry per
        member of ``ALL_COLLECTIONS`` instead.
    """
    # Heuristic table; the first prefix that matches wins (insertion order).
    prefix_to_collection = {
        "eu_": "bp_compliance_gesetze",
        "dsgvo": "bp_compliance_datenschutz",
        "bdsg": "bp_compliance_gesetze",
        "ttdsg": "bp_compliance_gesetze",
        "nist_": "bp_compliance_ce",
        "owasp": "bp_compliance_ce",
        "bsi_": "bp_compliance_ce",
        "enisa": "bp_compliance_ce",
        "at_": "bp_compliance_recht",
        "fr_": "bp_compliance_recht",
        "es_": "bp_compliance_recht",
    }
    mapping: dict[str, str] = {}
    for control in controls:
        regulation = (control.get("generation_metadata") or {}).get("source_regulation", "")
        if not regulation:
            continue
        target = next(
            (
                coll
                for prefix, coll in prefix_to_collection.items()
                if regulation.startswith(prefix)
            ),
            None,
        )
        if target is not None:
            mapping[regulation] = target
        else:
            # Unknown regulation: mark every collection for searching.
            mapping.update({f"_all_{coll}": coll for coll in ALL_COLLECTIONS})
    return mapping
|
||||
|
||||
async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
|
||||
"""3-tier matching: hash → regex → LLM."""
|
||||
|
||||
# Tier 1: Hash match against RAG index
|
||||
source_text = ctrl.get("source_original_text")
|
||||
if source_text:
|
||||
h = hashlib.sha256(source_text.encode()).hexdigest()
|
||||
chunk = self._rag_index.get(h)
|
||||
if chunk and (chunk.article or chunk.paragraph):
|
||||
return MatchResult(
|
||||
article=chunk.article or "",
|
||||
paragraph=chunk.paragraph or "",
|
||||
method="hash",
|
||||
)
|
||||
# Tier 1: Semantic search against the re-ingested, labeled chunks
|
||||
embed = await self._embedding_match(ctrl)
|
||||
if embed:
|
||||
return embed
|
||||
|
||||
# Tier 2: Regex parse concatenated source
|
||||
citation = ctrl.get("source_citation") or {}
|
||||
@@ -278,11 +188,60 @@ class CitationBackfill:
|
||||
)
|
||||
|
||||
# Tier 3: Ollama LLM
|
||||
if source_text:
|
||||
if ctrl.get("source_original_text"):
|
||||
return await self._llm_match(ctrl)
|
||||
|
||||
return None
|
||||
|
||||
async def _embedding_match(self, ctrl: dict) -> Optional[MatchResult]:
    """Tier 1: semantic-search source_original_text against the labeled chunks.

    Every collection returned by ``_collections_for`` is queried with the
    first 512 characters of the control's source text. Among all returned
    hits, the highest-scoring one that reaches ``EMBED_THRESHOLD`` *and*
    yields a non-empty article via ``_article_part`` is used. All hits of a
    search are considered, so a top hit that carries no article no longer
    hides a lower-ranked hit that does.

    Args:
        ctrl: Control dict; ``source_original_text`` is read here.

    Returns:
        A ``MatchResult`` with ``method="embed"`` and the hit's regulation
        name as ``source``, or ``None`` when there is no source text, every
        search fails, or no hit qualifies.
    """
    source_text = ctrl.get("source_original_text")
    if not source_text:
        return None
    query = source_text.strip()[:512]
    if not query:
        # Whitespace-only source text: nothing meaningful to search for.
        return None

    best: Optional[RAGSearchResult] = None
    best_article = ""
    for collection in self._collections_for(ctrl):
        try:
            hits = await self.rag.search(query, collection=collection, top_k=3)
        except Exception as e:
            # Best-effort tier: a failing collection must not abort the
            # backfill; the caller falls through to the regex/LLM tiers.
            logger.debug("embed search failed (%s): %s", collection, e)
            continue
        for hit in hits or []:
            if hit.score < EMBED_THRESHOLD:
                continue
            # Strict comparison keeps the earliest candidate on score ties.
            if best is not None and hit.score <= best.score:
                continue
            article = _article_part(hit)
            if article:
                best, best_article = hit, article

    if best is None:
        return None
    return MatchResult(
        article=best_article,
        paragraph=best.paragraph or "",
        method="embed",
        source=best.regulation_short or best.regulation_name or "",
    )
|
||||
|
||||
def _collections_for(self, ctrl: dict) -> list[str]:
    """Return the collection(s) to search for a control's regulation.

    The regulation code from ``generation_metadata["source_regulation"]`` is
    trimmed, lower-cased and compared against a prefix table; the first
    matching prefix selects a single collection.

    Args:
        ctrl: Control dict; only ``generation_metadata`` is read.

    Returns:
        A one-element list for a known prefix, otherwise a fresh copy of
        every entry in ``RELINK_COLLECTIONS``.
    """
    meta = ctrl.get("generation_metadata") or {}
    if not isinstance(meta, dict):
        # Metadata in an unexpected shape: treat as "no regulation known".
        meta = {}
    # str() + strip() tolerate non-string or padded regulation codes.
    reg = str(meta.get("source_regulation") or "").strip().lower()
    prefix_map = {
        "eu_": "bp_compliance_gesetze", "bdsg": "bp_compliance_gesetze",
        "de_": "bp_compliance_gesetze", "at_": "bp_compliance_gesetze",
        "ch_": "bp_compliance_gesetze", "dsgvo": "bp_compliance_gesetze",
        "trgs": "bp_compliance_ce", "trbs": "bp_compliance_ce", "asr": "bp_compliance_ce",
        "nist": "bp_compliance_ce", "owasp": "bp_compliance_ce", "enisa": "bp_compliance_ce",
        "edpb": "bp_compliance_datenschutz", "dsk": "bp_compliance_datenschutz",
        "bfdi": "bp_compliance_datenschutz",
    }
    for prefix, coll in prefix_map.items():
        if reg.startswith(prefix):
            return [coll]
    return list(RELINK_COLLECTIONS)
|
||||
|
||||
async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
|
||||
"""Use Ollama to identify article/paragraph from source text."""
|
||||
citation = ctrl.get("source_citation") or {}
|
||||
@@ -331,6 +290,9 @@ Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
|
||||
if parsed:
|
||||
citation["source"] = parsed["name"]
|
||||
|
||||
# Embed tier carries the cleaned regulation name → prefer it as source.
|
||||
if match.source:
|
||||
citation["source"] = match.source
|
||||
# Add separate article/paragraph fields
|
||||
citation["article"] = match.article
|
||||
citation["paragraph"] = match.paragraph
|
||||
@@ -359,6 +321,23 @@ Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""
|
||||
)
|
||||
|
||||
|
||||
def _article_part(chunk: RAGSearchResult) -> str:
    """Precise article from a chunk: article_label minus the regulation name.

    'BDSG § 38' -> '§ 38'; 'Art. 39 DSGVO' -> 'Art. 39';
    'NIST SP 800-53r5 SA-12' -> 'SA-12'.

    The regulation short name is only removed where it stands as a token of
    its own, so a short name that is merely a prefix of another word
    ('DSG' inside 'DSGVO') leaves the label intact. After removal, runs of
    whitespace are collapsed and leading/trailing ' ,;-' are trimmed.

    Args:
        chunk: Search hit; reads ``article_label``, ``regulation_short`` and
            ``article``.

    Returns:
        The article part of the label. Falls back to the stripped ``article``
        field when the label is empty or consists only of the regulation
        name; '' if that is empty too.
    """
    label = (chunk.article_label or "").strip()
    reg = (chunk.regulation_short or "").strip()
    if label:
        part = label
        if reg:
            # Lookarounds keep the match on token boundaries.
            pattern = r"(?<!\w)" + re.escape(reg) + r"(?!\w)"
            remainder, removed = re.subn(pattern, " ", label)
            if removed:
                part = " ".join(remainder.split()).strip(" ,;-")
        if part and part != reg:
            return part
    return (chunk.article or "").strip()
|
||||
|
||||
|
||||
def _parse_concatenated_source(source: str) -> Optional[dict]:
|
||||
"""Parse 'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user