""" P31 — Tiered LLM-Cascade mit Confidence + Valkey-Cache. Bisherige LLM-Calls (vendor_llm_extractor, mc_solution_generator): * gehen direkt an Qwen lokal → bei kompliziertem Input lange Latenz * fallen bei Fail manuell auf OVH 120B zurueck * Kein Cache → gleiche Eingabe kostet x-mal Zeit Diese Modul vereinheitlicht: 1. Cache-Lookup (md5(prompt) → cached response, TTL 7d) 2. Qwen-Aufruf mit kurzem Timeout (90s) 3. Wenn fail/leer ODER confidence < threshold → OVH 120B (45s) 4. Wenn auch fail → Anthropic Claude (last resort) 5. Response wird gecached confidence-Heuristik: * parsed JSON erfolgreich + non-empty → 0.8 * JSON-Parse failed → 0.0 * JSON ok aber nur 1 Item bei >5000 chars input → 0.3 Backend-API: await call_with_cascade(prompt, system_prompt, expected_min_items) """ from __future__ import annotations import hashlib import json import logging import os from typing import Any import httpx logger = logging.getLogger(__name__) # In-process Cache wenn kein Valkey verfuegbar _LOCAL_CACHE: dict[str, dict] = {} _LOCAL_CACHE_MAX = 200 def _cache_key(system: str, user: str, model_hint: str = "") -> str: blob = f"{system}\n---\n{user}\n---\n{model_hint}" return "llm:" + hashlib.md5(blob.encode()).hexdigest()[:24] def _cache_get(key: str) -> dict | None: try: import redis # noqa: WPS433 url = os.getenv("VALKEY_URL", "redis://bp-core-valkey:6379") r = redis.Redis.from_url(url, socket_timeout=2.0, decode_responses=True) v = r.get(key) if v: return json.loads(v) except Exception: pass return _LOCAL_CACHE.get(key) def _cache_put(key: str, value: dict, ttl: int = 604800) -> None: try: import redis # noqa: WPS433 url = os.getenv("VALKEY_URL", "redis://bp-core-valkey:6379") r = redis.Redis.from_url(url, socket_timeout=2.0, decode_responses=True) r.setex(key, ttl, json.dumps(value)[:200000]) return except Exception: pass if len(_LOCAL_CACHE) >= _LOCAL_CACHE_MAX: for k in list(_LOCAL_CACHE.keys())[:50]: _LOCAL_CACHE.pop(k, None) _LOCAL_CACHE[key] = value def _heuristic_confidence(response_text: str, input_len: int) -> float: if not response_text: return 0.0 try: obj = json.loads(response_text) except Exception: # Try to extract JSON block a, b = response_text.find("{"), response_text.rfind("}") if 0 <= a < b: try: obj = json.loads(response_text[a:b + 1]) except Exception: return 0.1 else: return 0.1 n_items = 0 if isinstance(obj, dict): for v in obj.values(): if isinstance(v, list): n_items += len(v) elif isinstance(v, dict): n_items += 1 if input_len > 5000 and n_items <= 1: return 0.3 if n_items >= 5: return 0.9 return 0.7 async def _call_ollama(system: str, user: str, max_tokens: int = 6000, timeout: float = 90.0) -> str: base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") model = os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b") payload = { "model": model, "stream": False, "format": "json", "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}], "options": {"temperature": 0.05, "num_predict": max_tokens}, } try: async with httpx.AsyncClient(timeout=timeout) as c: r = await c.post(f"{base.rstrip('/')}/api/chat", json=payload) r.raise_for_status() return (r.json().get("message") or {}).get("content", "") or "" except Exception as e: logger.warning("ollama cascade tier 1 failed: %s", e) return "" async def _call_ovh(system: str, user: str, max_tokens: int = 6000) -> str: base = os.getenv("OVH_LLM_URL", "").strip() key = os.getenv("OVH_LLM_KEY", "").strip() model = os.getenv("OVH_LLM_MODEL", "").strip() if not base or not model: return "" headers = {"Content-Type": "application/json"} if key: headers["Authorization"] = f"Bearer {key}" payload = { "model": model, "temperature": 0.05, "max_tokens": max_tokens, "messages": [{"role": "system", "content": system}, {"role": "user", "content": user}], "response_format": {"type": "json_object"}, } try: async with httpx.AsyncClient(timeout=45.0) as c: r = await c.post(f"{base.rstrip('/')}/v1/chat/completions", json=payload, headers=headers) r.raise_for_status() choice = (r.json().get("choices") or [{}])[0] return (choice.get("message") or {}).get("content", "") or "" except Exception as e: logger.warning("ovh cascade tier 2 failed: %s", e) return "" async def _call_anthropic(system: str, user: str, max_tokens: int = 4000) -> str: key = os.getenv("ANTHROPIC_API_KEY", "").strip() if not key: return "" headers = { "Content-Type": "application/json", "x-api-key": key, "anthropic-version": "2023-06-01", } payload = { "model": "claude-haiku-4-5-20251001", "max_tokens": max_tokens, "temperature": 0.05, "system": system, "messages": [{"role": "user", "content": user}], } try: async with httpx.AsyncClient(timeout=30.0) as c: r = await c.post("https://api.anthropic.com/v1/messages", json=payload, headers=headers) r.raise_for_status() blocks = r.json().get("content") or [] return "".join(b.get("text", "") for b in blocks if isinstance(b, dict)) except Exception as e: logger.warning("anthropic cascade tier 3 failed: %s", e) return "" async def call_with_cascade( system: str, user: str, min_confidence: float = 0.6, max_tokens: int = 6000, ) -> dict: """Returns {'text': str, 'confidence': float, 'source': str, 'cached': bool}.""" key = _cache_key(system, user) cached = _cache_get(key) if cached: cached["cached"] = True return cached input_len = len(user) # Tier 1: Qwen lokal text = await _call_ollama(system, user, max_tokens=max_tokens) conf = _heuristic_confidence(text, input_len) if text and conf >= min_confidence: out = {"text": text, "confidence": conf, "source": "qwen", "cached": False} _cache_put(key, out) return out # Tier 2: OVH 120B text2 = await _call_ovh(system, user, max_tokens=max_tokens) conf2 = _heuristic_confidence(text2, input_len) if text2 and conf2 >= min_confidence: out = {"text": text2, "confidence": conf2, "source": "ovh_120b", "cached": False} _cache_put(key, out) return out # Tier 3: Anthropic Claude (Notnagel) text3 = await _call_anthropic(system, user, max_tokens=max_tokens // 2) conf3 = _heuristic_confidence(text3, input_len) if text3 and conf3 >= min_confidence: out = {"text": text3, "confidence": conf3, "source": "anthropic_claude", "cached": False} _cache_put(key, out) return out # Nichts hat geliefert — beste Variante wenigstens zurueckgeben best_text = text or text2 or text3 or "" best_conf = max(conf, conf2, conf3) best_source = "qwen" if text else ("ovh_120b" if text2 else "anthropic") return {"text": best_text, "confidence": best_conf, "source": best_source, "cached": False, "below_threshold": True}