breakpilot-compliance/backend-compliance/compliance/services/llm_cascade.py

"""
P31 — Tiered LLM cascade with confidence scoring + Valkey cache.

Previous LLM calls (vendor_llm_extractor, mc_solution_generator):
* go straight to the local Qwen → long latency on complicated input
* fall back to OVH 120B manually on failure
* have no cache → the same input costs time over and over

This module unifies:
1. Cache lookup (md5 of system + user prompt → cached response, TTL 7d)
2. Qwen call with a short timeout (90s)
3. On failure/empty response OR confidence < threshold → OVH 120B (45s)
4. If that fails too → Anthropic Claude (last resort)
5. A response that meets the threshold is cached

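Confidence heuristic (_heuristic_confidence):
* empty response → 0.0
* no parseable JSON (even after extracting the outermost {...}) → 0.1
* JSON ok but at most 1 item for > 5000 chars of input → 0.3
* JSON ok with >= 5 items → 0.9
* other parsed JSON → 0.7 (see _heuristic_confidence for edge cases)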

Backend API: await call_with_cascade(system, user, min_confidence=0.6, max_tokens=6000)
"""

from __future__ import annotations

import hashlib
import json
import logging
import os
from typing import Any

import httpx

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# In-process cache, used when Valkey is not available.
# Maps a cache key (see _cache_key) to the cached response dict.
_LOCAL_CACHE: dict[str, dict] = {}
# Entry count at which _cache_put starts evicting from _LOCAL_CACHE.
_LOCAL_CACHE_MAX = 200


def _cache_key(system: str, user: str, model_hint: str = "") -> str:
    blob = f"{system}\n---\n{user}\n---\n{model_hint}"
    return "llm:" + hashlib.md5(blob.encode()).hexdigest()[:24]


def _cache_get(key: str) -> dict | None:
    """Look up a cached response, trying Valkey first and then the local cache.

    Args:
        key: Cache key as produced by ``_cache_key``.

    Returns:
        The cached response dict, or ``None`` on a miss. The dict is always
        safe for the caller to mutate: Valkey hits are freshly decoded and
        local hits are returned as a shallow copy.

    The lookup is best effort: any Valkey problem (client library missing,
    server unreachable, corrupt payload) is logged at debug level and the
    in-process cache is consulted instead. Nothing is raised to the caller.
    """
    try:
        import redis  # noqa: WPS433
        url = os.getenv("VALKEY_URL", "redis://bp-core-valkey:6379")
        client = redis.Redis.from_url(url, socket_timeout=2.0,
                                      decode_responses=True)
        raw = client.get(key)
        if raw:
            payload = json.loads(raw)
            # Only a JSON object is a valid cache entry; anything else would
            # break callers that index the result by string key.
            if isinstance(payload, dict):
                return payload
            logger.debug("cascade cache: ignoring non-dict payload for key=%s",
                         key)
    except Exception as exc:
        # Deliberately broad: the cache must never take the request down.
        logger.debug("cascade cache: Valkey lookup failed for key=%s: %s (%s)",
                     key, str(exc) or "(no message)", type(exc).__name__)
    local = _LOCAL_CACHE.get(key)
    # Hand out a copy so callers that annotate the result (e.g. set
    # "cached": True) do not modify the stored entry.
    return dict(local) if local is not None else None


def _cache_put(key: str, value: dict, ttl: int = 604800) -> None:
    """Store a response in Valkey, falling back to the in-process cache.

    Args:
        key: Cache key as produced by ``_cache_key``.
        value: Response dict to cache.
        ttl: Time to live in seconds for the Valkey entry (default 7 days).
            The in-process cache has no expiry.

    Payloads whose JSON form exceeds 200000 characters are not written to
    Valkey. Truncating them would store invalid JSON that can never be read
    back, so they go to the in-process cache instead. Any Valkey failure is
    logged at debug level and handled the same way; nothing is raised.
    """
    max_payload_chars = 200_000
    try:
        import redis  # noqa: WPS433
        payload = json.dumps(value)
        if len(payload) <= max_payload_chars:
            url = os.getenv("VALKEY_URL", "redis://bp-core-valkey:6379")
            client = redis.Redis.from_url(url, socket_timeout=2.0,
                                          decode_responses=True)
            client.setex(key, ttl, payload)
            return
        logger.debug(
            "cascade cache: payload for key=%s too large for Valkey "
            "(%d chars) — using local cache", key, len(payload),
        )
    except Exception as exc:
        # Deliberately broad: caching is best effort and must not fail the
        # request (missing client library, connection error, unserialisable
        # value, ...).
        logger.debug("cascade cache: Valkey write failed for key=%s: %s (%s)",
                     key, str(exc) or "(no message)", type(exc).__name__)
    # Evict the 50 oldest entries once the cache is full. Overwriting an
    # existing key does not grow the cache, so it needs no eviction.
    if key not in _LOCAL_CACHE and len(_LOCAL_CACHE) >= _LOCAL_CACHE_MAX:
        for stale_key in list(_LOCAL_CACHE)[:50]:
            _LOCAL_CACHE.pop(stale_key, None)
    # Store a copy so later mutation of the caller's dict cannot alter the
    # cached entry.
    _LOCAL_CACHE[key] = dict(value)


def _heuristic_confidence(response_text: str, input_len: int) -> float:
    if not response_text:
        return 0.0
    try:
        obj = json.loads(response_text)
    except Exception:
        # Try to extract JSON block
        a, b = response_text.find("{"), response_text.rfind("}")
        if 0 <= a < b:
            try:
                obj = json.loads(response_text[a:b + 1])
            except Exception:
                return 0.1
        else:
            return 0.1
    n_items = 0
    if isinstance(obj, dict):
        for v in obj.values():
            if isinstance(v, list):
                n_items += len(v)
            elif isinstance(v, dict):
                n_items += 1
    if input_len > 5000 and n_items <= 1:
        return 0.3
    if n_items >= 5:
        return 0.9
    return 0.7


async def _call_ollama(system: str, user: str,
                        max_tokens: int = 6000,
                        timeout: float = 90.0) -> str:
    """Tier 1: send the prompt to the Ollama chat endpoint.

    The base URL comes from ``OLLAMA_URL`` and the model from
    ``CMP_LLM_MODEL``; both have defaults. The request asks for a
    non-streamed response in JSON format.

    Args:
        system: System prompt.
        user: User prompt.
        max_tokens: Passed to Ollama as ``num_predict``.
        timeout: Timeout in seconds for the HTTP client.

    Returns:
        The message content of the reply, or ``""`` on any failure. Failures
        are logged as warnings and never raised.
    """
    model = os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b")
    base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
    request_body = {
        "model": model,
        "stream": False,
        "format": "json",
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": user},
        ],
        "options": {"temperature": 0.05, "num_predict": max_tokens},
    }
    endpoint = f"{base.rstrip('/')}/api/chat"
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(endpoint, json=request_body)
            response.raise_for_status()
        message = response.json().get("message") or {}
        return message.get("content", "") or ""
    except Exception as exc:
        # P83 follow-up: log the exception type alongside the message so
        # that exceptions with an empty message (e.g. ReadTimeout) can still
        # be diagnosed.
        reason = str(exc) or "(no message)"
        logger.warning(
            "ollama cascade tier 1 failed: %s (%s) model=%s base=%s",
            reason, type(exc).__name__, model, base,
        )
        return ""


async def _call_ovh(system: str, user: str, max_tokens: int = 6000) -> str:
    base = os.getenv("OVH_LLM_URL", "").strip()
    key = os.getenv("OVH_LLM_KEY", "").strip()
    model = os.getenv("OVH_LLM_MODEL", "").strip()
    if not base or not model:
        return ""
    headers = {"Content-Type": "application/json"}
    if key:
        headers["Authorization"] = f"Bearer {key}"
    payload = {
        "model": model, "temperature": 0.05, "max_tokens": max_tokens,
        "messages": [{"role": "system", "content": system},
                     {"role": "user", "content": user}],
        "response_format": {"type": "json_object"},
    }
    try:
        async with httpx.AsyncClient(timeout=45.0) as c:
            r = await c.post(f"{base.rstrip('/')}/v1/chat/completions",
                             json=payload, headers=headers)
            r.raise_for_status()
        choice = (r.json().get("choices") or [{}])[0]
        return (choice.get("message") or {}).get("content", "") or ""
    except Exception as e:
        logger.warning("ovh cascade tier 2 failed: %s", e)
        return ""


async def _call_anthropic(system: str, user: str,
                            max_tokens: int = 4000) -> str:
    key = os.getenv("ANTHROPIC_API_KEY", "").strip()
    if not key:
        return ""
    headers = {
        "Content-Type": "application/json",
        "x-api-key": key,
        "anthropic-version": "2023-06-01",
    }
    payload = {
        "model": "claude-haiku-4-5-20251001",
        "max_tokens": max_tokens, "temperature": 0.05,
        "system": system,
        "messages": [{"role": "user", "content": user}],
    }
    try:
        async with httpx.AsyncClient(timeout=30.0) as c:
            r = await c.post("https://api.anthropic.com/v1/messages",
                              json=payload, headers=headers)
            r.raise_for_status()
        blocks = r.json().get("content") or []
        return "".join(b.get("text", "") for b in blocks if isinstance(b, dict))
    except Exception as e:
        logger.warning("anthropic cascade tier 3 failed: %s", e)
        return ""


async def call_with_cascade(
    system: str,
    user: str,
    min_confidence: float = 0.6,
    max_tokens: int = 6000,
) -> dict:
    """Answer a prompt from the cache or from the first sufficiently confident tier.

    The tiers are tried in order: local Qwen (Ollama), OVH 120B, Anthropic
    Claude. Anthropic gets half of ``max_tokens``. A tier's response is
    accepted when it is non-empty and its heuristic confidence is at least
    ``min_confidence``; accepted responses are cached.

    Args:
        system: System prompt.
        user: User prompt. Its length feeds the confidence heuristic.
        min_confidence: Minimum confidence for a response to be accepted.
        max_tokens: Token budget passed to the tiers.

    Returns:
        ``{'text': str, 'confidence': float, 'source': str, 'cached': bool}``.
        When no tier meets the threshold, the non-empty response with the
        highest confidence is returned (earliest tier wins ties) with the
        extra key ``'below_threshold': True``; such results are not cached.
        If every tier returned nothing, ``text`` is ``""`` and
        ``confidence`` is 0.0.
    """
    key = _cache_key(system, user)
    cached = _cache_get(key)
    if isinstance(cached, dict) and cached:
        # Flag a copy rather than the cached object itself, which may be the
        # very dict held by the in-process cache.
        hit = {**cached, "cached": True}
        logger.info("cascade cache HIT key=%s len=%d", key[-12:],
                    len(hit.get("text") or ""))
        return hit

    input_len = len(user)
    # Pre-flight: warn if Tier 2 + Tier 3 are unconfigured so the operator
    # knows we are de facto running single-tier (cascade collapse).
    ovh_configured = bool(os.getenv("OVH_LLM_URL", "").strip()
                          and os.getenv("OVH_LLM_MODEL", "").strip())
    anthropic_configured = bool(os.getenv("ANTHROPIC_API_KEY", "").strip())
    if not ovh_configured and not anthropic_configured:
        logger.warning(
            "cascade: Tier 2 (OVH) AND Tier 3 (Anthropic) unconfigured — "
            "running on Tier 1 (Qwen) only. Set OVH_LLM_URL/MODEL/KEY "
            "or ANTHROPIC_API_KEY to enable fallbacks."
        )

    # (source label on success, source label on fallback, lazy tier call).
    # NOTE: tier 3 has historically been reported as "anthropic_claude" on
    # success but "anthropic" in the below-threshold result; both labels are
    # kept so existing consumers see unchanged values.
    tiers = (
        ("qwen", "qwen",
         lambda: _call_ollama(system, user, max_tokens=max_tokens)),
        ("ovh_120b", "ovh_120b",
         lambda: _call_ovh(system, user, max_tokens=max_tokens)),
        ("anthropic_claude", "anthropic",
         lambda: _call_anthropic(system, user, max_tokens=max_tokens // 2)),
    )

    attempts = []  # (confidence, fallback source label, text) per non-empty reply
    for source, fallback_source, call_tier in tiers:
        text = await call_tier()
        conf = _heuristic_confidence(text, input_len)
        if text and conf >= min_confidence:
            out = {"text": text, "confidence": conf,
                   "source": source, "cached": False}
            _cache_put(key, out)
            return out
        if text:
            attempts.append((conf, fallback_source, text))

    # Nothing met the threshold: return the best attempt anyway, keeping
    # text, confidence and source from the SAME tier. max() returns the
    # first maximal element, so ties go to the earliest tier.
    if attempts:
        best_conf, best_source, best_text = max(
            attempts, key=lambda attempt: attempt[0])
    else:
        best_conf, best_source, best_text = 0.0, "anthropic", ""
    return {"text": best_text, "confidence": best_conf,
            "source": best_source, "cached": False,
            "below_threshold": True}