breakpilot-compliance/consent-tester/services/cmp_llm_fallback.py

"""
LLM-based Cookie-Policy Fallback (Phase C+D of the cascade).

When the named CMP library (Phase B) AND the generic JSON heuristic (Phase A)
both come up empty, we send the page's DOM snapshot + network log to an LLM
and ask it to find the policy. Cascade order:

    Qwen (local Ollama on Mac Mini)   ← Phase C, fast + private
        ↓ if invalid response
    OVH (managed 120B model)          ← Phase D, more capable backup

The LLM returns one of:
    {"strategy": "url",      "value": "<full-json-endpoint-url>"}
    {"strategy": "selector", "value": "<css-selector-in-current-DOM>"}
    {"strategy": "text",     "value": "<direct extracted policy text>"}

Per-domain results are cached in Valkey (7-day TTL) so we only pay the LLM
cost once per site.

Wiring: dsi_discovery.py calls LLMCascade.analyze(...) when self_wc < 300
AND cmp_capture.payloads is empty.
"""

from __future__ import annotations

import json
import logging
import os
from typing import Any, Literal

import httpx

logger = logging.getLogger(__name__)


_SYSTEM_PROMPT = (
    "Du analysierst eine Website-Inspektion, um die Cookie-Richtlinie zu "
    "finden. Wir haben einen DOM-Auszug und ein Netzwerk-Log mit JSON-"
    "Responses geladen. Gib genau EIN JSON-Objekt zurueck:\n"
    '{"strategy": "url"|"selector"|"text", "value": "..."}\n\n'
    "- strategy=url: vollstaendige JSON-URL, die die strukturierte Cookie-"
    "Policy enthaelt (z.B. CMP-Endpoints von OneTrust, Cookiebot, etc.)\n"
    "- strategy=selector: CSS-Selector im aktuellen DOM, der den eigentlichen "
    "Policy-Text enthaelt (kein Sub-iframe).\n"
    "- strategy=text: nur wenn der Policy-Text bereits im DOM steht und du "
    "ihn extrahieren kannst (max 4000 Worte).\n\n"
    "Antworte AUSSCHLIESSLICH mit JSON. Keine Erklaerung. Wenn nichts "
    "gefunden: {\"strategy\":\"none\",\"value\":\"\"}"
)


class LLMCookieExtractor:
    """One LLM endpoint (Ollama or OpenAI-compatible)."""

    def __init__(
        self,
        kind: Literal["ollama", "openai"],
        base_url: str,
        model: str,
        api_key: str = "",
        timeout: float = 60.0,
    ) -> None:
        self.kind = kind
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.timeout = timeout

    async def analyze(
        self,
        target_url: str,
        dom_snapshot: str,
        network_log: list[dict],
    ) -> dict:
        """Send DOM + network info to the LLM, parse the JSON response."""
        user_prompt = _build_user_prompt(target_url, dom_snapshot, network_log)
        try:
            content = await self._chat(_SYSTEM_PROMPT, user_prompt)
        except Exception as e:
            logger.warning("%s LLM call failed: %s", self.kind, e)
            return {}
        return _parse_json_response(content)

    async def _chat(self, system: str, user: str) -> str:
        if self.kind == "ollama":
            return await self._chat_ollama(system, user)
        return await self._chat_openai(system, user)

    async def _chat_ollama(self, system: str, user: str) -> str:
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "stream": False,
            "format": "json",
            "options": {"temperature": 0.1, "num_predict": 2000},
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(f"{self.base_url}/api/chat", json=payload)
            resp.raise_for_status()
        data = resp.json()
        return (data.get("message") or {}).get("content", "")

    async def _chat_openai(self, system: str, user: str) -> str:
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "temperature": 0.1,
            "max_tokens": 2000,
            "response_format": {"type": "json_object"},
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                f"{self.base_url}/v1/chat/completions",
                json=payload, headers=headers,
            )
            resp.raise_for_status()
        data = resp.json()
        choice = (data.get("choices") or [{}])[0]
        return (choice.get("message") or {}).get("content", "") or ""


class LLMCascade:
    """Try Qwen first (local, fast). Fall through to OVH on invalid result."""

    def __init__(
        self,
        qwen: LLMCookieExtractor | None = None,
        ovh: LLMCookieExtractor | None = None,
    ) -> None:
        self.qwen = qwen
        self.ovh = ovh

    @classmethod
    def from_env(cls) -> "LLMCascade":
        qwen = None
        ovh = None

        ollama_url = os.getenv("OLLAMA_URL", "http://bp-core-ollama:11434")
        qwen_model = os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b")
        if ollama_url and qwen_model:
            qwen = LLMCookieExtractor(
                kind="ollama", base_url=ollama_url, model=qwen_model, timeout=90.0,
            )

        ovh_url = os.getenv("OVH_LLM_URL", "").strip()
        ovh_key = os.getenv("OVH_LLM_KEY", "").strip()
        ovh_model = os.getenv("OVH_LLM_MODEL", "").strip()
        if ovh_url and ovh_model:
            ovh = LLMCookieExtractor(
                kind="openai", base_url=ovh_url, model=ovh_model,
                api_key=ovh_key, timeout=60.0,
            )

        if not qwen and not ovh:
            logger.warning("LLMCascade: neither Qwen nor OVH configured")
        return cls(qwen=qwen, ovh=ovh)

    async def analyze(
        self,
        target_url: str,
        dom_snapshot: str,
        network_log: list[dict],
    ) -> dict:
        """Return the first valid LLM analysis, or {} if all fail."""
        for tier_name, ex in (("qwen", self.qwen), ("ovh", self.ovh)):
            if ex is None:
                continue
            res = await ex.analyze(target_url, dom_snapshot, network_log)
            strategy = res.get("strategy")
            if strategy in ("url", "selector", "text") and res.get("value"):
                logger.info("LLM (%s) suggested strategy=%s", tier_name, strategy)
                res["_tier"] = tier_name
                return res
            logger.info("LLM (%s) returned no usable strategy", tier_name)
        return {}


def _build_user_prompt(
    target_url: str, dom_snapshot: str, network_log: list[dict],
) -> str:
    # Truncate to keep prompt < ~16K chars
    dom = dom_snapshot[:5000]
    log_lines = []
    for entry in network_log[:60]:
        log_lines.append(
            f"- {entry.get('status', '?')} "
            f"{entry.get('content_type', '?')[:30]} "
            f"{entry.get('size', '?')}B "
            f"{entry.get('url', '')[:150]}"
        )
    log_text = "\n".join(log_lines)
    return (
        f"Ziel-URL: {target_url}\n\n"
        f"=== DOM-Auszug (body.innerText, gekuerzt) ===\n{dom}\n\n"
        f"=== Netzwerk-Log (JSON-Responses dieser Seite) ===\n{log_text}"
    )


def _parse_json_response(content: str) -> dict:
    """LLMs sometimes wrap JSON in code-fences or add prose. Be lenient."""
    if not content:
        return {}
    # Try direct parse
    for candidate in (content, _strip_code_fence(content), _find_json_block(content)):
        if not candidate:
            continue
        try:
            obj = json.loads(candidate)
            if isinstance(obj, dict):
                return obj
        except Exception:
            continue
    return {}


def _strip_code_fence(s: str) -> str:
    s = s.strip()
    if s.startswith("```"):
        lines = s.split("\n")
        return "\n".join(lines[1:-1]) if lines[-1].strip().startswith("```") else "\n".join(lines[1:])
    return s


def _find_json_block(s: str) -> str:
    start = s.find("{")
    end = s.rfind("}")
    if start >= 0 and end > start:
        return s[start:end + 1]
    return ""


# ── Valkey cache ────────────────────────────────────────────────────

async def cache_get(netloc: str) -> dict | None:
    """Read a cached LLM hint for this netloc from Valkey, if any."""
    client = _valkey_client()
    if not client:
        return None
    try:
        raw = await client.get(_cache_key(netloc))
        return json.loads(raw) if raw else None
    except Exception as e:
        logger.debug("cache_get failed: %s", e)
        return None


async def cache_set(netloc: str, hint: dict, ttl: int = 604800) -> None:
    """Cache an LLM result for 7 days (default)."""
    client = _valkey_client()
    if not client:
        return
    try:
        await client.set(_cache_key(netloc), json.dumps(hint), ex=ttl)
    except Exception as e:
        logger.debug("cache_set failed: %s", e)


def _cache_key(netloc: str) -> str:
    return f"cmp:hint:{netloc.lower()}"


_valkey_singleton: Any = None


def _valkey_client() -> Any:
    """Lazy-init a redis.asyncio client (Valkey-compatible). Returns None if unavailable."""
    global _valkey_singleton
    if _valkey_singleton is not None:
        return _valkey_singleton
    try:
        import redis.asyncio as redis  # type: ignore[import-not-found]
        url = os.getenv("VALKEY_URL", "redis://bp-core-valkey:6379")
        _valkey_singleton = redis.from_url(
            url, decode_responses=True, socket_connect_timeout=2.0,
        )
        return _valkey_singleton
    except Exception as e:
        logger.debug("valkey client init failed: %s", e)
        return None