# breakpilot-compliance/backend-compliance/compliance/services/citation_backfill.py

"""
Citation Backfill Service — enrich existing controls with article/paragraph provenance.

3-tier matching strategy:
  Tier 1 — Hash match:  sha256(source_original_text) → RAG chunk lookup
  Tier 2 — Regex parse: split concatenated "DSGVO Art. 35" → regulation + article
  Tier 3 — Ollama LLM:  ask local LLM to identify article/paragraph from text
"""

import hashlib
import json
import logging
import os
import re
from dataclasses import dataclass, field
from datetime import datetime, timezone
from typing import Optional

import httpx
from sqlalchemy import text
from sqlalchemy.orm import Session

from .rag_client import ComplianceRAGClient, RAGSearchResult

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Ollama connection settings, each overridable through an environment variable.
# OLLAMA_URL is the base URL; "/api/chat" is appended when a request is sent.
OLLAMA_URL = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
OLLAMA_MODEL = os.getenv("CONTROL_GEN_OLLAMA_MODEL", "qwen3.5:35b-a3b")
# Passed as the httpx client timeout for every LLM request.
LLM_TIMEOUT = float(os.getenv("CONTROL_GEN_LLM_TIMEOUT", "180"))

# Collections scrolled when no control could be mapped to a specific collection,
# and added wholesale for controls whose regulation code matches no known prefix.
# NOTE(review): the prefix map in _map_regulations_to_collections also targets
# "bp_compliance_recht", which is not listed here — confirm whether that is intended.
ALL_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
]

# German system prompt sent with every backfill LLM request. In English:
# "You are a legal expert. Your task is to determine the exact article and
# paragraph from a legal text. Answer ONLY with valid JSON."
BACKFILL_SYSTEM_PROMPT = (
    "Du bist ein Rechtsexperte. Deine Aufgabe ist es, aus einem Gesetzestext "
    "den genauen Artikel und Absatz zu bestimmen. Antworte NUR mit validem JSON."
)

# Regex to split a concatenated source such as "DSGVO Art. 35" or
# "NIS2 Artikel 21 Abs. 2": group 1 is the regulation name, group 2 is everything
# from "Art"/"Artikel" onwards (matched case-insensitively).
_SOURCE_ARTICLE_RE = re.compile(
    r"^(.+?)\s+(Art(?:ikel)?\.?\s*\d+.*)$", re.IGNORECASE
)


@dataclass
class MatchResult:
    """Article/paragraph provenance found for one control, plus how it was found."""

    article: str  # e.g. "Art. 35" or "§ 42"; "" when only a paragraph is known
    paragraph: str  # e.g. "Abs. 2"; "" when no paragraph could be determined
    method: str  # "hash", "regex", "llm"


@dataclass
class BackfillResult:
    """Counters and error messages collected during one backfill run."""

    total_controls: int = 0  # controls loaded as needing backfill
    matched_hash: int = 0  # matches whose method is "hash"
    matched_regex: int = 0  # matches whose method is "regex"
    matched_llm: int = 0  # matches whose method is "llm"
    unmatched: int = 0  # controls for which no tier produced a match
    updated: int = 0  # controls written back (stays 0 in a dry run)
    errors: list[str] = field(default_factory=list)  # per-control and commit error messages


class CitationBackfill:
    """Backfill article/paragraph into existing control source_citations."""

    def __init__(self, db: Session, rag_client: ComplianceRAGClient) -> None:
        """Store the collaborators and start with an empty hash index.

        Args:
            db: SQLAlchemy session used to read and update ``canonical_controls``.
            rag_client: Client whose ``scroll`` method is used to page through
                the RAG collections.
        """
        self.db = db
        self.rag = rag_client
        # sha256 hex digest of a chunk's text -> the chunk itself
        self._rag_index: dict[str, RAGSearchResult] = {}

    async def run(self, dry_run: bool = True, limit: int = 0) -> BackfillResult:
        """Backfill article/paragraph for every control that still lacks them.

        Loads the affected controls, builds the RAG hash index when at least one
        control has ``source_original_text``, runs the 3-tier matching for each
        control and, unless ``dry_run`` is set, writes the matches back and
        commits once at the end.

        Args:
            dry_run: When True (default) matches are only counted and logged;
                nothing is written to the database.
            limit: Maximum number of controls to load; values <= 0 mean no limit.

        Returns:
            A ``BackfillResult`` with per-tier match counters, the number of
            persisted updates and the error messages collected on the way.
            Per-control failures and a failed commit are recorded in
            ``errors`` instead of being raised.
        """
        result = BackfillResult()

        # Load controls needing backfill
        controls = self._load_controls_needing_backfill(limit)
        result.total_controls = len(controls)
        logger.info("Backfill: %d controls need article/paragraph enrichment", len(controls))

        if not controls:
            return result

        # Hashes of all available source texts; the RAG index is only worth
        # building when at least one control can take part in the hash tier.
        needed_hashes: set[str] = {
            hashlib.sha256(src.encode()).hexdigest()
            for src in (ctrl.get("source_original_text") for ctrl in controls)
            if src
        }

        if needed_hashes:
            # Build targeted RAG index — only scroll collections that our controls reference
            logger.info("Building targeted RAG hash index for %d source texts...", len(needed_hashes))
            await self._build_rag_index_targeted(controls)
            logger.info("RAG index built: %d chunks indexed, %d hashes needed", len(self._rag_index), len(needed_hashes))
        else:
            logger.info("No source_original_text found — skipping RAG index build")

        # Process each control
        for i, ctrl in enumerate(controls):
            if i > 0 and i % 100 == 0:
                logger.info("Backfill progress: %d/%d processed", i, result.total_controls)

            try:
                match = await self._match_control(ctrl)
                if not match:
                    result.unmatched += 1
                    continue

                if match.method == "hash":
                    result.matched_hash += 1
                elif match.method == "regex":
                    result.matched_regex += 1
                elif match.method == "llm":
                    result.matched_llm += 1

                if dry_run:
                    logger.debug(
                        "DRY RUN: Would update %s with article=%s paragraph=%s (method=%s)",
                        ctrl["control_id"], match.article, match.paragraph, match.method,
                    )
                    continue

                # Each UPDATE runs inside its own SAVEPOINT: a failing statement is
                # rolled back on its own and does not abort the surrounding
                # transaction, so earlier and later updates can still be committed.
                with self.db.begin_nested():
                    self._update_control(ctrl, match)
                result.updated += 1

            except Exception as e:
                error_msg = f"Error backfilling {ctrl.get('control_id', '?')}: {e}"
                logger.error(error_msg)
                result.errors.append(error_msg)

        if not dry_run:
            try:
                self.db.commit()
            except Exception as e:
                logger.error("Backfill commit failed: %s", e)
                result.errors.append(f"Commit failed: {e}")
                # Nothing was persisted: leave the session usable and make the
                # counter reflect what is actually in the database.
                result.updated = 0
                try:
                    self.db.rollback()
                except Exception as rollback_err:
                    logger.error("Backfill rollback failed: %s", rollback_err)

        logger.info(
            "Backfill complete: %d total, hash=%d regex=%d llm=%d unmatched=%d updated=%d",
            result.total_controls, result.matched_hash, result.matched_regex,
            result.matched_llm, result.unmatched, result.updated,
        )
        return result

    def _load_controls_needing_backfill(self, limit: int = 0) -> list[dict]:
        """Load controls whose source_citation exists but has no usable 'article'.

        Only rows with ``license_rule`` 1 or 2 are considered; rows are ordered
        by ``control_id``.

        Args:
            limit: Maximum number of rows to return; values <= 0 mean no limit.

        Returns:
            One dict per row keyed by column name. ``id`` is converted to ``str``.
            ``source_citation`` and ``generation_metadata`` are JSON-decoded when
            they arrive as strings; a string that cannot be decoded becomes ``{}``.
        """
        query = """
            SELECT id, control_id, source_citation, source_original_text,
                   generation_metadata, license_rule
            FROM canonical_controls
            WHERE license_rule IN (1, 2)
              AND source_citation IS NOT NULL
              AND (
                  source_citation->>'article' IS NULL
                  OR source_citation->>'article' = ''
              )
            ORDER BY control_id
        """
        if limit > 0:
            # Bind the limit as a parameter instead of formatting it into the SQL text.
            stmt = text(query + " LIMIT :limit").bindparams(limit=int(limit))
        else:
            stmt = text(query)

        result = self.db.execute(stmt)
        cols = result.keys()
        controls = []
        for row in result:
            ctrl = dict(zip(cols, row))
            ctrl["id"] = str(ctrl["id"])
            # JSON columns may come back as text depending on the driver/column type
            for jf in ("source_citation", "generation_metadata"):
                if isinstance(ctrl.get(jf), str):
                    try:
                        ctrl[jf] = json.loads(ctrl[jf])
                    except (json.JSONDecodeError, TypeError):
                        ctrl[jf] = {}
            controls.append(ctrl)
        return controls

    async def _build_rag_index_targeted(self, controls: list[dict]) -> None:
        """Index the RAG chunks whose text hash matches one of the controls' source texts.

        Uses regulation codes from generation_metadata to identify which collections
        to search, falling back to all collections only if needed. Each selected
        collection is scrolled page by page (200 chunks per request).

        Only chunks whose sha256 equals the hash of some control's
        ``source_original_text`` are kept in ``self._rag_index``: those are the
        only keys the hash tier ever looks up, so storing the rest of the corpus
        would cost memory without changing any match.

        Args:
            controls: Control dicts as returned by ``_load_controls_needing_backfill``.
        """
        needed_hashes = {
            hashlib.sha256(src.encode()).hexdigest()
            for src in (ctrl.get("source_original_text") for ctrl in controls)
            if src
        }
        if not needed_hashes:
            # No control can take part in hash matching — scrolling would be wasted work.
            return

        # Determine which collections are relevant based on regulation codes
        regulation_to_collection = self._map_regulations_to_collections(controls)
        # Sorted so the scroll order (and therefore which duplicate chunk wins)
        # is the same on every run.
        collections_to_search = sorted(
            set(regulation_to_collection.values()) or set(ALL_COLLECTIONS)
        )

        logger.info("Targeted index: searching %d collections: %s",
                     len(collections_to_search), ", ".join(collections_to_search))

        for collection in collections_to_search:
            offset = None
            page = 0
            seen_offsets: set[str] = set()
            while True:
                chunks, next_offset = await self.rag.scroll(
                    collection=collection, offset=offset, limit=200,
                )
                if not chunks:
                    break
                for chunk in chunks:
                    # Chunks with fewer than 50 non-blank characters are never indexed
                    if not chunk.text or len(chunk.text.strip()) < 50:
                        continue
                    h = hashlib.sha256(chunk.text.encode()).hexdigest()
                    if h in needed_hashes:
                        self._rag_index[h] = chunk
                page += 1
                if page % 50 == 0:
                    logger.info("Indexing %s: page %d (%d chunks so far)",
                                collection, page, len(self._rag_index))
                if not next_offset:
                    break
                # A repeated offset means the scroll is cycling; stop instead of looping forever
                if next_offset in seen_offsets:
                    logger.warning("Scroll loop in %s at page %d — stopping", collection, page)
                    break
                seen_offsets.add(next_offset)
                offset = next_offset

            logger.info("Indexed collection %s: %d pages", collection, page)

    def _map_regulations_to_collections(self, controls: list[dict]) -> dict[str, str]:
        """Derive the Qdrant collections worth scrolling from the controls' regulation codes.

        The regulation code is read from ``generation_metadata["source_regulation"]``
        and compared against a fixed prefix table; the first matching prefix wins.

        Returns:
            ``{regulation_code: collection}`` for every recognised code. A code
            that matches no prefix adds one ``"_all_<collection>"`` entry per
            member of ``ALL_COLLECTIONS``. Controls without a code contribute nothing.
        """
        # Heuristic prefix table; order matters because the first hit is used
        prefix_to_collection = {
            "eu_": "bp_compliance_gesetze",
            "dsgvo": "bp_compliance_datenschutz",
            "bdsg": "bp_compliance_gesetze",
            "ttdsg": "bp_compliance_gesetze",
            "nist_": "bp_compliance_ce",
            "owasp": "bp_compliance_ce",
            "bsi_": "bp_compliance_ce",
            "enisa": "bp_compliance_ce",
            "at_": "bp_compliance_recht",
            "fr_": "bp_compliance_recht",
            "es_": "bp_compliance_recht",
        }
        mapping: dict[str, str] = {}
        for control in controls:
            regulation = (control.get("generation_metadata") or {}).get("source_regulation", "")
            if not regulation:
                continue
            target = next(
                (
                    collection
                    for prefix, collection in prefix_to_collection.items()
                    if regulation.startswith(prefix)
                ),
                None,
            )
            if target is not None:
                mapping[regulation] = target
            else:
                # Unknown regulation — every collection has to be searched
                mapping.update({f"_all_{name}": name for name in ALL_COLLECTIONS})
        return mapping

    async def _match_control(self, ctrl: dict) -> Optional[MatchResult]:
        """Resolve article/paragraph for one control: hash, then regex, then LLM.

        Args:
            ctrl: Control dict; reads ``source_original_text`` and
                ``source_citation["source"]``.

        Returns:
            The first match produced by a tier, or ``None`` when no tier matched.
            The LLM tier is only attempted when the control has source text.
        """
        original_text = ctrl.get("source_original_text")

        # Tier 1: exact text hash against the RAG index; the hit only counts
        # when the indexed chunk carries an article or a paragraph.
        if original_text:
            digest = hashlib.sha256(original_text.encode()).hexdigest()
            indexed = self._rag_index.get(digest)
            if indexed and (indexed.article or indexed.paragraph):
                return MatchResult(
                    article=indexed.article or "",
                    paragraph=indexed.paragraph or "",
                    method="hash",
                )

        # Tier 2: split a concatenated source string; this tier never fills
        # the paragraph field.
        source_label = (ctrl.get("source_citation") or {}).get("source", "")
        split = _parse_concatenated_source(source_label)
        if split and split["article"]:
            return MatchResult(article=split["article"], paragraph="", method="regex")

        # Tier 3: ask the LLM, which needs the source text to work on
        if not original_text:
            return None
        return await self._llm_match(ctrl)

    async def _llm_match(self, ctrl: dict) -> Optional[MatchResult]:
        """Use Ollama to identify article/paragraph from source text.

        The first 2000 characters of ``source_original_text`` are sent together
        with the regulation name and code. The reply is only trusted when it is
        a JSON object; ``article`` and ``paragraph`` are taken over only if they
        are strings (stripped of surrounding whitespace). ``null``, numbers or
        nested values are treated as "not found" so they never end up in the
        stored citation.

        Args:
            ctrl: Control dict; reads ``source_citation``, ``generation_metadata``,
                ``source_original_text`` and ``control_id``.

        Returns:
            A ``MatchResult`` with method ``"llm"`` when the reply names an
            article or a paragraph, otherwise ``None``. Failures while calling
            or parsing are logged as warnings and also yield ``None``.
        """
        citation = ctrl.get("source_citation") or {}
        regulation_name = citation.get("source", "")
        metadata = ctrl.get("generation_metadata") or {}
        regulation_code = metadata.get("source_regulation", "")
        # `or ""` also covers a key that is present but None, which cannot be sliced
        source_text = ctrl.get("source_original_text") or ""

        prompt = f"""Analysiere den folgenden Gesetzestext und bestimme den genauen Artikel und Absatz.

Gesetz: {regulation_name} (Code: {regulation_code})

Text:
---
{source_text[:2000]}
---

Antworte NUR mit JSON:
{{"article": "Art. XX", "paragraph": "Abs. Y"}}

Falls kein spezifischer Absatz erkennbar ist, setze paragraph auf "".
Falls kein Artikel erkennbar ist, setze article auf "".
Bei deutschen Gesetzen mit § verwende: "§ XX" statt "Art. XX"."""

        try:
            raw = await _llm_ollama(prompt, BACKFILL_SYSTEM_PROMPT)
            data = _parse_json(raw)
            if not isinstance(data, dict):
                return None

            article = data.get("article")
            paragraph = data.get("paragraph")
            article = article.strip() if isinstance(article, str) else ""
            paragraph = paragraph.strip() if isinstance(paragraph, str) else ""

            if article or paragraph:
                return MatchResult(article=article, paragraph=paragraph, method="llm")
        except Exception as e:
            logger.warning("LLM match failed for %s: %s", ctrl.get("control_id"), e)

        return None

    def _update_control(self, ctrl: dict, match: MatchResult):
        """Write the matched article/paragraph back to the control's row.

        ``source_citation`` gets separate ``article`` and ``paragraph`` keys, and
        its ``source`` is reduced to the bare regulation name when it still
        contained a concatenated article. ``generation_metadata`` records the
        paragraph, the matching method and a UTC timestamp, plus the article
        when one was found. The statement is executed on ``self.db`` but not
        committed here.
        """
        citation = ctrl.get("source_citation") or {}
        metadata = ctrl.get("generation_metadata") or {}

        # Strip a concatenated article ("DSGVO Art. 35" -> "DSGVO") from the source name
        split = _parse_concatenated_source(citation.get("source", ""))
        if split:
            citation["source"] = split["name"]
        citation.update(article=match.article, paragraph=match.paragraph)

        # An empty article is not recorded in the metadata; the paragraph always is
        if match.article:
            metadata["source_article"] = match.article
        metadata.update(
            source_paragraph=match.paragraph,
            backfill_method=match.method,
            backfill_at=datetime.now(timezone.utc).isoformat(),
        )

        params = {
            "id": ctrl["id"],
            "citation": json.dumps(citation),
            "metadata": json.dumps(metadata),
        }
        self.db.execute(
            text("""
                UPDATE canonical_controls
                SET source_citation = :citation,
                    generation_metadata = :metadata,
                    updated_at = NOW()
                WHERE id = CAST(:id AS uuid)
            """),
            params,
        )


def _parse_concatenated_source(source: str) -> Optional[dict]:
    """Parse 'DSGVO Art. 35' → {name: 'DSGVO', article: 'Art. 35'}.

    Also handles '§' format: 'BDSG § 42' → {name: 'BDSG', article: '§ 42'}.
    """
    if not source:
        return None

    # Try Art./Artikel pattern
    m = _SOURCE_ARTICLE_RE.match(source)
    if m:
        return {"name": m.group(1).strip(), "article": m.group(2).strip()}

    # Try § pattern
    m2 = re.match(r"^(.+?)\s+(§\s*\d+.*)$", source)
    if m2:
        return {"name": m2.group(1).strip(), "article": m2.group(2).strip()}

    return None


async def _llm_ollama(prompt: str, system_prompt: Optional[str] = None) -> str:
    """Send one non-streaming chat request to Ollama and return the reply text.

    Args:
        prompt: User message.
        system_prompt: Optional system message placed before the user message.

    Returns:
        The ``content`` of the reply message. When ``message`` is not a dict,
        the top-level ``response`` field is returned, or the stringified message
        if that field is missing. A non-200 status or any exception is logged
        and yields ``""``.
    """
    conversation = [{"role": "user", "content": prompt}]
    if system_prompt:
        conversation.insert(0, {"role": "system", "content": system_prompt})

    request_body = {
        "model": OLLAMA_MODEL,
        "messages": conversation,
        "stream": False,
        "format": "json",
        "options": {"num_predict": 256},
        "think": False,
    }

    try:
        async with httpx.AsyncClient(timeout=LLM_TIMEOUT) as client:
            resp = await client.post(f"{OLLAMA_URL}/api/chat", json=request_body)
            if resp.status_code == 200:
                body = resp.json()
                message = body.get("message", {})
                if isinstance(message, dict):
                    return message.get("content", "")
                return body.get("response", str(message))
            logger.error("Ollama backfill failed %d: %s", resp.status_code, resp.text[:300])
            return ""
    except Exception as e:
        logger.error("Ollama backfill request failed: %s", e)
        return ""


def _parse_json(raw: str) -> Optional[dict]:
    """Extract JSON object from LLM output."""
    if not raw:
        return None
    # Try direct parse
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        pass
    # Try extracting from markdown code block
    m = re.search(r"```(?:json)?\s*(\{.*?\})\s*```", raw, re.DOTALL)
    if m:
        try:
            return json.loads(m.group(1))
        except json.JSONDecodeError:
            pass
    # Try finding first { ... }
    m = re.search(r"\{[^{}]*\}", raw)
    if m:
        try:
            return json.loads(m.group(0))
        except json.JSONDecodeError:
            pass
    return None