feat(training+controls): interactive video pipeline, training blocks, control generator, CE libraries
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Some checks failed
CI/CD / go-lint (push) Has been skipped
CI/CD / python-lint (push) Has been skipped
CI/CD / nodejs-lint (push) Has been skipped
CI/CD / test-go-ai-compliance (push) Failing after 37s
CI/CD / test-python-backend-compliance (push) Successful in 39s
CI/CD / test-python-document-crawler (push) Successful in 26s
CI/CD / test-python-dsms-gateway (push) Successful in 23s
CI/CD / validate-canonical-controls (push) Successful in 12s
CI/CD / Deploy (push) Has been skipped
Interactive Training Videos (CP-TRAIN): - DB migration 022: training_checkpoints + checkpoint_progress tables - NarratorScript generation via Anthropic (AI Teacher persona, German) - TTS batch synthesis + interactive video pipeline (slides + checkpoint slides + FFmpeg) - 4 new API endpoints: generate-interactive, interactive-manifest, checkpoint submit, checkpoint progress - InteractiveVideoPlayer component (HTML5 Video, quiz overlay, seek protection, progress tracking) - Learner portal integration with automatic completion on all checkpoints passed - 30 new tests (handler validation + grading logic + manifest/progress + seek protection) Training Blocks: - Block generator, block store, block config CRUD + preview/generate endpoints - Migration 021: training_blocks schema Control Generator + Canonical Library: - Control generator routes + service enhancements - Canonical control library helpers, sidebar entry - Citation backfill service + tests - CE libraries data (hazard, protection, evidence, lifecycle, components) Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
437
backend-compliance/compliance/services/citation_backfill.py
Normal file
437
backend-compliance/compliance/services/citation_backfill.py
Normal file
@@ -0,0 +1,437 @@
|
||||
"""
|
||||
Citation Backfill Service — enrich existing controls with article/paragraph provenance.
|
||||
|
||||
3-tier matching strategy:
|
||||
Tier 1 — Hash match: sha256(source_original_text) → RAG chunk lookup
|
||||
Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
|
||||
Tier 3 — Ollama LLM: ask local LLM to identify article/paragraph from text
|
||||
"""
|
||||
|
||||
import hashlib
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime, timezone
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
from sqlalchemy import text
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
from .rag_client import ComplianceRAGClient, RAGSearchResult
|
||||
|
||||
logger = logging.getLogger(__name__)

# Ollama endpoint and model for Tier-3 LLM matching; both overridable via env.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
# Per-request timeout in seconds for the Ollama chat call.
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

# Fallback set of RAG collections, scrolled when no targeted mapping
# can be derived from the controls' regulation codes.
ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
]

# System prompt for the Tier-3 LLM (German, runtime text — do not edit):
# instructs the model to act as a legal expert and answer with JSON only.
BACKFILL_SYSTEM_PROMPT = (
    "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
    "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)

# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
# Group 1: regulation name; group 2: the article fragment onward.
_SOURCE_ARTICLE_RE = re.compile(
    r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)
|
||||
|
||||
|
||||
@dataclass
class MatchResult:
    """Outcome of a single control-to-citation match attempt."""

    # Article reference, e.g. "Art. 35" or "§ 42"; "" when not determined.
    article: str
    # Paragraph reference, e.g. "Abs. 2"; "" when not determined.
    paragraph: str
    # Which matching tier produced the result.
    method: str  # "hash", "regex", "llm"
|
||||
|
||||
|
||||
@dataclass
class BackfillResult:
    """Aggregate counters for one backfill run."""

    # Number of controls loaded as candidates for enrichment.
    total_controls: int = 0
    # Per-tier match counters (Tier 1 hash, Tier 2 regex, Tier 3 LLM).
    matched_hash: int = 0
    matched_regex: int = 0
    matched_llm: int = 0
    # Controls for which no tier produced a match.
    unmatched: int = 0
    # Rows actually written (only incremented when dry_run is False).
    updated: int = 0
    # Human-readable error messages collected per control / commit failure.
    errors: list = field(default_factory=list)
|
||||
|
||||
|
||||
class CitationBackfill:
    """Backfill article/paragraph into existing control source_citations.

    Three matching tiers are attempted per control (see module docstring):
      1. sha256(source_original_text) looked up in an index of RAG chunks,
      2. regex split of a concatenated citation source (e.g. "DSGVO Art. 35"),
      3. a local Ollama LLM asked to identify article/paragraph from the text.
    """

    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        # SQLAlchemy session — used for reads, per-control UPDATEs, and the
        # single commit at the end of run().
        self.db = db
        # Client for scrolling RAG (Qdrant) collections chunk by chunk.
        self.rag = rag_client
        # sha256(chunk text) -> chunk; populated lazily by
        # _build_rag_index_targeted() and consulted by Tier-1 matching.
        self._rag_index: dict[str, RAGSearchResult] = {}

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update.

        Args:
            dry_run: when True (default), only log what would change — no DB writes.
            limit: cap on how many controls are loaded (0 = no limit).

        Returns:
            BackfillResult with per-tier counters and collected error messages.
        """
        result = BackfillResult()

        # Load controls needing backfill
        controls = self._load_controls_needing_backfill(limit)
        result.total_controls = len(controls)
        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

        if not controls:
            return result

        # Collect hashes we need to find — only build index for controls with source text
        needed_hashes: set[str] = set()
        for ctrl in controls:
            src = ctrl.get("source_original_text")
            if src:
                needed_hashes.add(hashlib.sha256(src.encode()).hexdigest())

        if needed_hashes:
            # Build targeted RAG index — only scroll collections that our controls reference
            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
            await self._build_rag_index_targeted(controls)
            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
        else:
            logger.info("No source_original_text found — skipping RAG index build")

        # Process each control; failures are collected per control so one bad
        # row does not abort the whole run.
        for i, ctrl in enumerate(controls):
            if i > 0 and i % 100 == 0:
                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

            try:
                match = await self._match_control(ctrl)
                if match:
                    # Count the tier that produced the match.
                    if match.method == "hash":
                        result.matched_hash += 1
                    elif match.method == "regex":
                        result.matched_regex += 1
                    elif match.method == "llm":
                        result.matched_llm += 1

                    if not dry_run:
                        self._update_control(ctrl, match)
                        result.updated += 1
                    else:
                        logger.debug(
                            "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                            ctrl["control_id"], match.article, match.paragraph, match.method,
                        )
                else:
                    result.unmatched += 1

            except Exception as e:
                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)

        # Single commit for the whole run; a failure here is reported in
        # result.errors rather than raised.
        if not dry_run:
            try:
                self.db.commit()
            except Exception as e:
                logger.error("Backfill commit failed: %s", e)
                result.errors.append(f"Commit failed: {e}")

        logger.info(
            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
            result.total_controls, result.matched_hash, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result

    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
        """Load controls where source_citation exists but lacks separate 'article' key.

        NOTE(review): license_rule IN (1, 2) — presumably the license tiers
        eligible for enrichment; confirm against the canonical_controls schema.
        """
        query = """
            SELECT id, control_id, source_citation, source_original_text,
                   generation_metadata, license_rule
            FROM canonical_controls
            WHERE license_rule IN (1, 2)
              AND source_citation IS NOT NULL
              AND (
                  source_citation->>'article' IS NULL
                  OR source_citation->>'article' = ''
              )
            ORDER BY control_id
        """
        # limit is an int parameter from our own code path, so the f-string
        # interpolation is not an injection vector here.
        if limit > 0:
            query += f" LIMIT {limit}"

        result = self.db.execute(text(query))
        cols = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(cols, row))
            # Normalize UUID primary key to str for logging / CAST in updates.
            ctrl["id"] = str(ctrl["id"])
            # Parse JSON fields — drivers may return them as str or dict;
            # unparseable values degrade to {} rather than raising.
            for jf in ("source_citation", "generation_metadata"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        ctrl[jf] = {}
            controls.append(ctrl)
        return controls

    async def _build_rag_index_targeted(self, controls: list[dict]):
        """Build RAG index by scrolling only collections relevant to our controls.

        Uses regulation codes from generation_metadata to identify which collections
        to search, falling back to all collections only if needed.
        """
        # Determine which collections are relevant based on regulation codes
        regulation_to_collection = self._map_regulations_to_collections(controls)
        # Empty mapping -> scan everything (the `or` falls back to ALL_COLLECTIONS).
        collections_to_search = set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)

        logger.info("Targeted index: searching %d collections: %s",
                    len(collections_to_search), ", ".join(collections_to_search))

        for collection in collections_to_search:
            offset = None
            page = 0
            # Guard against a scroll cursor that repeats (infinite pagination loop).
            seen_offsets: set[str] = set()
            while True:
                chunks, next_offset = await self.rag.scroll(
                    collection=collection, offset=offset, limit=200,
                )
                if not chunks:
                    break
                for chunk in chunks:
                    # Skip near-empty chunks; hash the rest for Tier-1 lookup.
                    if chunk.text and len(chunk.text.strip()) >= 50:
                        h = hashlib.sha256(chunk.text.encode()).hexdigest()
                        self._rag_index[h] = chunk
                page += 1
                if page % 50 == 0:
                    logger.info("Indexing %s: page %d (%d chunks so far)",
                                collection, page, len(self._rag_index))
                if not next_offset:
                    break
                if next_offset in seen_offsets:
                    logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
                    break
                seen_offsets.add(next_offset)
                offset = next_offset

            logger.info("Indexed collection %s: %d pages", collection, page)

    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
        """Map regulation codes from controls to likely Qdrant collections."""
        # Heuristic: regulation code prefix → collection
        # NOTE(review): "bp_compliance_recht" is not listed in ALL_COLLECTIONS —
        # confirm it exists in Qdrant or the scroll for at_/fr_/es_ codes will
        # hit a missing collection.
        collection_map = {
            "eu_": "bp_compliance_gesetze",
            "dsgvo": "bp_compliance_datenschutz",
            "bdsg": "bp_compliance_gesetze",
            "ttdsg": "bp_compliance_gesetze",
            "nist_": "bp_compliance_ce",
            "owasp": "bp_compliance_ce",
            "bsi_": "bp_compliance_ce",
            "enisa": "bp_compliance_ce",
            "at_": "bp_compliance_recht",
            "fr_": "bp_compliance_recht",
            "es_": "bp_compliance_recht",
        }
        result: dict[str, str] = {}
        for ctrl in controls:
            meta = ctrl.get("generation_metadata") or {}
            reg = meta.get("source_regulation", "")
            if not reg:
                continue
            for prefix, coll in collection_map.items():
                if reg.startswith(prefix):
                    result[reg] = coll
                    break
            else:
                # Unknown regulation — search all
                for coll in ALL_COLLECTIONS:
                    result[f"_all_{coll}"] = coll
        return result

    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: hash → regex → LLM. Returns None when all tiers fail."""

        # Tier 1: Hash match against RAG index
        source_text = ctrl.get("source_original_text")
        if source_text:
            h = hashlib.sha256(source_text.encode()).hexdigest()
            chunk = self._rag_index.get(h)
            # Only accept the chunk when it actually carries provenance.
            if chunk and (chunk.article or chunk.paragraph):
                return MatchResult(
                    article=chunk.article or "",
                    paragraph=chunk.paragraph or "",
                    method="hash",
                )

        # Tier 2: Regex parse concatenated source
        citation = ctrl.get("source_citation") or {}
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed and parsed["article"]:
            return MatchResult(
                article=parsed["article"],
                paragraph="",  # Regex can't extract paragraph from concatenated format
                method="regex",
            )

        # Tier 3: Ollama LLM — only worth asking when we have source text.
        if source_text:
            return await self._llm_match(ctrl)

        return None

    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text.

        Returns None on LLM failure or when the model identifies neither an
        article nor a paragraph. The prompt is German runtime text.
        """
        citation = ctrl.get("source_citation") or {}
        regulation_name = citation.get("source", "")
        metadata = ctrl.get("generation_metadata") or {}
        regulation_code = metadata.get("source_regulation", "")
        source_text = ctrl.get("source_original_text", "")

        # Truncated to 2000 chars below to bound prompt size.
        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.

Gesetz: {regulation_name} (Code: {regulation_code})

Text:
---
{source_text[:2000]}
---

Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}

Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

        try:
            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
            data = _parse_json(raw)
            if data and (data.get("article") or data.get("paragraph")):
                return MatchResult(
                    article=data.get("article", ""),
                    paragraph=data.get("paragraph", ""),
                    method="llm",
                )
        except Exception as e:
            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)

        return None

    def _update_control(self, ctrl: dict, match: MatchResult):
        """Update source_citation and generation_metadata in DB.

        Executes the UPDATE on the session without committing — run() performs
        one commit for the whole batch.
        """
        citation = ctrl.get("source_citation") or {}

        # Clean the source name: remove concatenated article if present
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed:
            citation["source"] = parsed["name"]

        # Add separate article/paragraph fields
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph

        # Update generation_metadata
        metadata = ctrl.get("generation_metadata") or {}
        # Backfill provenance is recorded only when an article was found —
        # NOTE(review): guard placement reconstructed; confirm paragraph-only
        # matches should indeed skip the metadata stamp.
        if match.article:
            metadata["source_article"] = match.article
            metadata["source_paragraph"] = match.paragraph
            metadata["backfill_method"] = match.method
            metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()

        self.db.execute(
            text("""
                UPDATE canonical_controls
                SET source_citation = :citation,
                    generation_metadata = :metadata,
                    updated_at = NOW()
                WHERE id = CAST(:id AS uuid)
            """),
            {
                "id": ctrl["id"],
                "citation": json.dumps(citation),
                "metadata": json.dumps(metadata),
            },
        )
|
||||
|
||||
|
||||
def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Split a concatenated citation source into regulation name + article.

    Examples:
        'DSGVO Art. 35' -> {"name": "DSGVO", "article": "Art. 35"}
        'BDSG § 42'     -> {"name": "BDSG", "article": "§ 42"}

    Returns None for an empty string or when no article/§ part is found.
    """
    if not source:
        return None

    # Two equivalent splitting patterns, tried in order:
    # Art./Artikel (case-insensitive), then the German § notation.
    candidate_patterns = (
        re.compile(r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE),
        re.compile(r"^(.+?)\s+(§\s*\d+.*)$"),
    )
    for pattern in candidate_patterns:
        hit = pattern.match(source)
        if hit is None:
            continue
        name_part, article_part = hit.groups()
        return {"name": name_part.strip(), "article": article_part.strip()}

    return None
|
||||
|
||||
|
||||
async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Call Ollama chat API for backfill matching.

    Sends an optional system message plus the user prompt to OLLAMA_MODEL and
    returns the assistant's content. Any HTTP error, transport failure, or
    malformed response yields "" (logged, never raised) — callers treat an
    empty string as "no answer".
    """
    chat_messages = (
        [{"role": "system", "content": system_prompt}] if system_prompt else []
    )
    chat_messages.append({"role": "user", "content": prompt})

    request_body = {
        "model": OLLAMA_MODEL,
        "messages": chat_messages,
        "stream": False,
        "options": {"num_predict": 256},  # cap generated tokens
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if resp.status_code != 200:
                logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
                return ""
            data = resp.json()
            msg = data.get("message", {})
            # Chat-style responses carry {"message": {"content": ...}};
            # otherwise fall back to the legacy "response" field.
            if isinstance(msg, dict):
                return msg.get("content", "")
            return data.get("response", str(msg))
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""
|
||||
|
||||
|
||||
def _parse_json(raw: str) -> Optional[dict]:
    """Best-effort extraction of a JSON object from raw LLM output.

    Attempts, in order: the whole string, a ```json fenced block, and the
    first flat {...} span anywhere in the text. Returns None when nothing
    parses (or when raw is empty/falsy).
    """
    if not raw:
        return None

    def _attempt(candidate: str):
        """Parse candidate as JSON, returning None instead of raising."""
        try:
            return json.loads(candidate)
        except json.JSONDecodeError:
            return None

    # 1) The whole string is already valid JSON.
    parsed = _attempt(raw)
    if parsed is not None:
        return parsed

    # 2) JSON wrapped in a markdown code fence.
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    if fenced is not None:
        parsed = _attempt(fenced.group(1))
        if parsed is not None:
            return parsed

    # 3) First flat (non-nested) object embedded in surrounding text.
    flat = re.search(r"\{[^{}]*\}", raw)
    if flat is not None:
        parsed = _attempt(flat.group(0))
        if parsed is not None:
            return parsed

    return None
|
||||
Reference in New Issue
Block a user