breakpilot-core/control-pipeline/services/citation_backfill.py

"""
Citation Backfill Service — enrich existing controls with article/paragraph provenance.

3-tier matching strategy:
  Tier 1 — Embedding:   semantic search of source_original_text → labeled RAG chunk
  Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
  Tier 3 — Ollama LLM:  ask local LLM to identify article/paragraph from text
"""

import json
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional

import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session

from .rag_client import ComplianceRAGClient, RAGSearchResult

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Base URL of the Ollama server; overridable via the OLLAMA_URL env var.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
# Model name sent in the Ollama chat payload; overridable via CONTROL_GEN_OLLAMA_MODEL.
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
# Timeout handed to the httpx client used for LLM calls; overridable via
# CONTROL_GEN_LLM_TIMEOUT (parsed as float).
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

# Tier-1 semantic re-link: min cosine for a source_original_text → chunk match.
EMBED_THRESHOLD = float(os.getenv("CITATION_EMBED_THRESHOLD", "0.80"))
# Collections that carry re-ingested, article_label-bearing chunks.
# Used as the fallback search set when a control's regulation prefix is unknown.
RELINK_COLLECTIONS = [
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_compliance_ce",
]

# System prompt for the Tier-3 LLM call (German): tells the model to act as a
# legal expert, determine the exact article and paragraph of a legal text and
# answer with valid JSON only.
BACKFILL_SYSTEM_PROMPT = (
    "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
    "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)

# Regex to split concatenated source like "DSGVO Art. 35" or "NIS2 Artikel 21 Abs. 2"
# Group 1 = regulation name, group 2 = article reference including everything
# that follows it on the line. Matching is case-insensitive.
_SOURCE_ARTICLE_RE = re.compile(
    r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)


@dataclass
class MatchResult:
    """Article/paragraph citation found for one control, plus the tier that found it."""

    # Article reference to store in the citation, e.g. "Art. 35" or "§ 42".
    article: str
    # Paragraph reference, e.g. "Abs. 2"; "" when none was determined
    # (the regex tier always leaves it empty).
    paragraph: str
    method: str  # "embed", "regex", "llm"
    source: str = ""  # regulation short/name (embed tier sets the cleaned source)


@dataclass
class BackfillResult:
    """Counters and error messages collected during one CitationBackfill.run() call."""

    total_controls: int = 0  # controls loaded as needing backfill
    matched_embed: int = 0  # matches with method == "embed"
    matched_regex: int = 0  # matches with method == "regex"
    matched_llm: int = 0  # matches with method == "llm"
    unmatched: int = 0  # controls for which no tier produced a match
    updated: int = 0  # controls for which an UPDATE was issued (never in dry-run)
    # Human-readable messages for per-control failures and a failed commit.
    errors: list[str] = field(default_factory=list)


class CitationBackfill:
    """Backfill article/paragraph into existing control source_citations."""

    def __init__(self, db: Session, rag_client: ComplianceRAGClient):
        self.db = db
        self.rag = rag_client

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Main entry: iterate controls missing article/paragraph, match to RAG, update."""
        result = BackfillResult()

        # Load controls needing backfill
        controls = self._load_controls_needing_backfill(limit)
        result.total_controls = len(controls)
        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

        if not controls:
            return result

        # Tier-1 = per-control semantic search against the re-ingested, labeled chunks.
        # (The old sha256(chunk.text) hash index died with re-chunking and is gone.)
        with_source = sum(1 for c in controls if c.get("source_original_text"))
        logger.info("Embedding-relink candidates (with source_original_text): %d", with_source)

        # Process each control
        for i, ctrl in enumerate(controls):
            if i > 0 and i % 100 == 0:
                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

            try:
                match = await self._match_control(ctrl)
                if match:
                    if match.method == "embed":
                        result.matched_embed += 1
                    elif match.method == "regex":
                        result.matched_regex += 1
                    elif match.method == "llm":
                        result.matched_llm += 1

                    if not dry_run:
                        self._update_control(ctrl, match)
                        result.updated += 1
                    else:
                        logger.debug(
                            "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                            ctrl["control_id"], match.article, match.paragraph, match.method,
                        )
                else:
                    result.unmatched += 1

            except Exception as e:
                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)

        if not dry_run:
            try:
                self.db.commit()
            except Exception as e:
                logger.error("Backfill commit failed: %s", e)
                result.errors.append(f"Commit failed: {e}")

        logger.info(
            "Backfill complete: %d total, embed=%d regex=%d llm=%d unmatched=%d updated=%d",
            result.total_controls, result.matched_embed, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result

    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
        """Load controls where source_citation exists but lacks separate 'article' key."""
        query = """
            SELECT id, control_id, source_citation, source_original_text,
                   generation_metadata, license_rule
            FROM canonical_controls
            WHERE license_rule IN (1, 2)
              AND source_citation IS NOT NULL
              AND (
                  source_citation->>'article' IS NULL
                  OR source_citation->>'article' = ''
              )
            ORDER BY control_id
        """
        if limit > 0:
            query += f" LIMIT {limit}"

        result = self.db.execute(text(query))
        cols = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(cols, row))
            ctrl["id"] = str(ctrl["id"])
            # Parse JSON fields
            for jf in ("source_citation", "generation_metadata"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        ctrl[jf] = {}
            controls.append(ctrl)
        return controls

    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """3-tier matching: hash → regex → LLM."""

        # Tier 1: Semantic search against the re-ingested, labeled chunks
        embed = await self._embedding_match(ctrl)
        if embed:
            return embed

        # Tier 2: Regex parse concatenated source
        citation = ctrl.get("source_citation") or {}
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed and parsed["article"]:
            return MatchResult(
                article=parsed["article"],
                paragraph="",  # Regex can't extract paragraph from concatenated format
                method="regex",
            )

        # Tier 3: Ollama LLM
        if ctrl.get("source_original_text"):
            return await self._llm_match(ctrl)

        return None

    async def _embedding_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Tier 1: semantic-search source_original_text against the labeled chunks.

        Takes the top hit (cosine >= EMBED_THRESHOLD) that carries a real article
        and turns its article_label into a precise citation.
        """
        source_text = ctrl.get("source_original_text")
        if not source_text:
            return None
        query = source_text.strip()[:512]
        best: Optional[RAGSearchResult] = None
        for collection in self._collections_for(ctrl):
            try:
                hits = await self.rag.search(query, collection=collection, top_k=3)
            except Exception as e:
                logger.debug("embed search failed (%s): %s", collection, e)
                hits = []
            if hits and (best is None or hits[0].score > best.score):
                best = hits[0]
        if not best or best.score < EMBED_THRESHOLD:
            return None
        article = _article_part(best)
        if not article:
            return None
        return MatchResult(
            article=article,
            paragraph=best.paragraph or "",
            method="embed",
            source=best.regulation_short or best.regulation_name or "",
        )

    def _collections_for(self, ctrl: dict) -> list[str]:
        """Likely collection(s) for a control's regulation; falls back to all three."""
        meta = ctrl.get("generation_metadata") or {}
        reg = (meta.get("source_regulation") or "").lower()
        prefix_map = {
            "eu_": "bp_compliance_gesetze", "bdsg": "bp_compliance_gesetze",
            "de_": "bp_compliance_gesetze", "at_": "bp_compliance_gesetze",
            "ch_": "bp_compliance_gesetze", "dsgvo": "bp_compliance_gesetze",
            "trgs": "bp_compliance_ce", "trbs": "bp_compliance_ce", "asr": "bp_compliance_ce",
            "nist": "bp_compliance_ce", "owasp": "bp_compliance_ce", "enisa": "bp_compliance_ce",
            "edpb": "bp_compliance_datenschutz", "dsk": "bp_compliance_datenschutz",
            "bfdi": "bp_compliance_datenschutz",
        }
        for prefix, coll in prefix_map.items():
            if reg.startswith(prefix):
                return [coll]
        return list(RELINK_COLLECTIONS)

    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text."""
        citation = ctrl.get("source_citation") or {}
        regulation_name = citation.get("source", "")
        metadata = ctrl.get("generation_metadata") or {}
        regulation_code = metadata.get("source_regulation", "")
        source_text = ctrl.get("source_original_text", "")

        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.

Gesetz: {regulation_name} (Code: {regulation_code})

Text:
---
{source_text[:2000]}
---

Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}

Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

        try:
            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
            data = _parse_json(raw)
            if data and (data.get("article") or data.get("paragraph")):
                return MatchResult(
                    article=data.get("article", ""),
                    paragraph=data.get("paragraph", ""),
                    method="llm",
                )
        except Exception as e:
            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)

        return None

    def _update_control(self, ctrl: dict, match: MatchResult):
        """Update source_citation and generation_metadata in DB."""
        citation = ctrl.get("source_citation") or {}

        # Clean the source name: remove concatenated article if present
        source_str = citation.get("source", "")
        parsed = _parse_concatenated_source(source_str)
        if parsed:
            citation["source"] = parsed["name"]

        # Embed tier carries the cleaned regulation name → prefer it as source.
        if match.source:
            citation["source"] = match.source
        # Add separate article/paragraph fields
        citation["article"] = match.article
        citation["paragraph"] = match.paragraph

        # Update generation_metadata
        metadata = ctrl.get("generation_metadata") or {}
        if match.article:
            metadata["source_article"] = match.article
        metadata["source_paragraph"] = match.paragraph
        metadata["backfill_method"] = match.method
        metadata["backfill_at"] = datetime.now(timezone.utc).isoformat()

        self.db.execute(
            text("""
                UPDATE canonical_controls
                SET source_citation = :citation,
                    generation_metadata = :metadata,
                    updated_at = NOW()
                WHERE id = CAST(:id AS uuid)
            """),
            {
                "id": ctrl["id"],
                "citation": json.dumps(citation),
                "metadata": json.dumps(metadata),
            },
        )


def _article_part(chunk: RAGSearchResult) -> str:
    """Precise article from a chunk: article_label minus the regulation name.

    'BDSG § 38' -> '§ 38'; 'Art. 39 DSGVO' -> 'Art. 39'; 'NIST SP 800-53r5 SA-12' -> 'SA-12'.

    The regulation short name is removed only where it stands as a token of
    its own, so a short name that merely occurs inside a longer word
    ('DSG' in 'DSGVO Art. 5') leaves the label intact. After a removal, inner
    whitespace is collapsed ('Art. 6 DSGVO Abs. 1' -> 'Art. 6 Abs. 1') and
    leading/trailing ' ,;-' separators are dropped.

    Falls back to the bare article field when the label is empty or consists
    of the regulation name only. Returns '' if neither yields an article.
    """
    label = (chunk.article_label or "").strip()
    reg = (chunk.regulation_short or "").strip()
    if label:
        part = label
        if reg:
            # Lookarounds instead of \b: the short name may start or end with
            # a non-word character, e.g. "(EU) 2016/679".
            pattern = rf"(?<!\w){re.escape(reg)}(?!\w)"
            stripped, removed = re.subn(pattern, " ", label)
            if removed:
                part = " ".join(stripped.split()).strip(" ,;-")
        if part and part != reg:
            return part
    return (chunk.article or "").strip()


def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Split a concatenated source string into regulation name and article.

    'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}
    'BDSG § 42'     → {name: 'BDSG', article: '§ 42'}

    The Art./Artikel form is tried first, then the '§' form. Returns None for
    an empty source or when neither form matches.
    """
    if not source:
        return None

    section_sign_pattern = r"^(.+?)\s+(§\s*\d+.*)$"
    hit = _SOURCE_ARTICLE_RE.match(source) or re.match(section_sign_pattern, source)
    if hit is None:
        return None

    name, article = hit.groups()
    return {"name": name.strip(), "article": article.strip()}


async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Send one non-streaming chat request to Ollama and return the reply text.

    Best effort: a non-200 response or any exception is logged as an error
    and reported to the caller as "".
    """
    chat = [{"role": "user", "content": prompt}]
    if system_prompt:
        # The system message, when given, precedes the user message.
        chat.insert(0, {"role": "system", "content": system_prompt})

    request_body = {
        "model": OLLAMA_MODEL,
        "messages": chat,
        "stream": False,
        "format": "json",
        "options": {"num_predict": 256},
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if resp.status_code != 200:
                logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
                return ""
            data = resp.json()
            msg = data.get("message", {})
            if not isinstance(msg, dict):
                # Non-dict "message": use a top-level "response" field if
                # present, else the stringified message.
                return data.get("response", str(msg))
            return msg.get("content", "")
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""


def _parse_json(raw: str) -> Optional[dict]:
    """Extract a JSON object from LLM output.

    Tried in order: the whole string, the content of a markdown code fence,
    then every flat ``{...}`` span in the text. Only JSON *objects* are
    accepted; valid JSON of another type (list, string, number, null) is
    treated as "no object here" and the next strategy is tried.

    Args:
        raw: Raw model output; empty or non-string input yields None.

    Returns:
        The first dict that could be decoded, or None.
    """
    if not raw or not isinstance(raw, str):
        return None

    def _load_object(candidate: str) -> Optional[dict]:
        # Decode one candidate; anything but a JSON object counts as a miss.
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            return None
        return parsed if isinstance(parsed, dict) else None

    # Try direct parse
    obj = _load_object(raw)
    if obj is not None:
        return obj

    # Try extracting from markdown code block
    fenced = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    if fenced:
        obj = _load_object(fenced.group(1))
        if obj is not None:
            return obj

    # Try every flat { ... } span, not just the first one, so a leading
    # non-JSON brace pair does not hide a valid object further on.
    for flat in re.finditer(r"\{[^{}]*\}", raw):
        obj = _load_object(flat.group(0))
        if obj is not None:
            return obj
    return None