Files
breakpilot-compliance/backend-compliance/compliance/services/llm_cascade.py
T
Benjamin Admin 1ccfdb5d3d fix(scan): TCF SQL column + cascade diagnose-logs
VW-Scan-Befunde aus 0a8aa16e:
1. TCF lookup failed 5x mit: column 'source' does not exist. Korrekt:
   'source_name' (siehe DELETE-Query in derselben Datei). Mit dem Fix
   funktioniert das TCF-Cross-Reference fuer alle Vendors statt 0.
2. Cascade tier-1 fail loggte leere message — jetzt mit type+model+base.
3. Cascade collapse (tier 2+3 unconfigured) wird beim ersten Aufruf
   geloggt damit der Operator den ENV-Mangel sofort sieht.
4. vendor_llm_extractor loggt jetzt START + 0-vendor-Return (vorher
   silent skip — sah aus als waere er nie aufgerufen worden).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 19:00:27 +02:00

247 lines
8.6 KiB
Python

"""
P31 — Tiered LLM-Cascade mit Confidence + Valkey-Cache.
Bisherige LLM-Calls (vendor_llm_extractor, mc_solution_generator):
* gehen direkt an Qwen lokal → bei kompliziertem Input lange Latenz
* fallen bei Fail manuell auf OVH 120B zurueck
* Kein Cache → gleiche Eingabe kostet x-mal Zeit
Dieses Modul vereinheitlicht:
1. Cache-Lookup (md5(prompt) → cached response, TTL 7d)
2. Qwen-Aufruf mit kurzem Timeout (90s)
3. Wenn fail/leer ODER confidence < threshold → OVH 120B (45s)
4. Wenn auch fail → Anthropic Claude (last resort)
5. Response wird gecached
confidence-Heuristik:
* leere Antwort → 0.0
* JSON-Parse failed (auch nach Extraktion des {...}-Blocks) → 0.1
* JSON ok aber <= 1 Item bei >5000 chars input → 0.3
* JSON ok mit >= 5 Items → 0.9
* JSON ok, sonst → 0.7
Backend-API: await call_with_cascade(system, user, min_confidence, max_tokens)
"""
from __future__ import annotations
import hashlib
import json
import logging
import os
from typing import Any
import httpx
# Module-level logger shared by all cascade helpers in this file.
logger = logging.getLogger(__name__)
# In-process cache used when Valkey is not available.
# Maps a cache key (see _cache_key) to the cached response dict.
_LOCAL_CACHE: dict[str, dict] = {}
# Once the local cache holds this many entries, _cache_put drops a batch of
# entries before inserting a new one.
_LOCAL_CACHE_MAX = 200
def _cache_key(system: str, user: str, model_hint: str = "") -> str:
    """Derive the cache key for one (system, user, model_hint) request.

    The three parts are joined with a ``---`` separator line and hashed with
    MD5; the key is the first 24 hex digits behind an ``llm:`` prefix.
    """
    separator = "\n---\n"
    material = separator.join(f"{part}" for part in (system, user, model_hint))
    digest = hashlib.md5(material.encode()).hexdigest()
    return f"llm:{digest[:24]}"
def _cache_get(key: str) -> dict | None:
    """Look up a cached cascade result by key.

    Valkey is queried first. If the ``redis`` client cannot be imported, the
    server cannot be reached, the key is missing, or the stored value is not
    valid JSON, the in-process ``_LOCAL_CACHE`` is consulted instead.

    Args:
        key: Cache key as produced by ``_cache_key``.

    Returns:
        The cached value, or ``None`` on a miss. Values from the in-process
        cache are returned as a shallow copy so callers can annotate the
        result without altering the stored entry.
    """
    try:
        import redis  # noqa: WPS433

        url = os.getenv("VALKEY_URL", "redis://bp-core-valkey:6379")
        client = redis.Redis.from_url(url, socket_timeout=2.0,
                                      decode_responses=True)
        raw = client.get(key)
        if raw:
            return json.loads(raw)
    except Exception as exc:
        # Caching is best effort: never fail the LLM call because of it, but
        # leave a trace so a misconfigured VALKEY_URL is diagnosable.
        logger.debug(
            "cascade cache: valkey lookup failed: %s (%s)",
            str(exc) or "(no message)", type(exc).__name__,
        )
    entry = _LOCAL_CACHE.get(key)
    return dict(entry) if entry is not None else None
def _cache_put(key: str, value: dict, ttl: int = 604800) -> None:
    """Store a cascade result under ``key``.

    The value is written to Valkey as JSON with the given TTL. If that is not
    possible (client missing, server unreachable, value not serialisable, or
    serialised value larger than 200000 characters), the value is kept in the
    in-process ``_LOCAL_CACHE`` instead.

    Args:
        key: Cache key as produced by ``_cache_key``.
        value: Result dict to cache.
        ttl: Valkey expiry in seconds (default 604800 = 7 days). The
            in-process cache has no expiry.
    """
    max_serialized_len = 200000
    try:
        import redis  # noqa: WPS433

        serialized = json.dumps(value)
        # Truncating the JSON (as a plain slice would) yields a document that
        # can never be parsed again, so oversized values skip Valkey entirely.
        if len(serialized) <= max_serialized_len:
            url = os.getenv("VALKEY_URL", "redis://bp-core-valkey:6379")
            client = redis.Redis.from_url(url, socket_timeout=2.0,
                                          decode_responses=True)
            client.setex(key, ttl, serialized)
            return
        logger.debug(
            "cascade cache: value too large for valkey (%d chars), "
            "using in-process cache", len(serialized),
        )
    except Exception as exc:
        # Caching is best effort: fall through to the in-process cache.
        logger.debug(
            "cascade cache: valkey store failed: %s (%s)",
            str(exc) or "(no message)", type(exc).__name__,
        )
    if len(_LOCAL_CACHE) >= _LOCAL_CACHE_MAX:
        # Dicts keep insertion order, so this drops the 50 oldest entries.
        for stale_key in list(_LOCAL_CACHE)[:50]:
            _LOCAL_CACHE.pop(stale_key, None)
    # Store a copy so later mutation of the caller's dict does not leak in.
    _LOCAL_CACHE[key] = dict(value)
def _heuristic_confidence(response_text: str, input_len: int) -> float:
    """Score how trustworthy an LLM response looks, without another LLM call.

    The response is parsed as JSON; if that fails, the span from the first
    ``{`` to the last ``}`` is tried. Items are then counted: for a JSON
    object, every list value contributes its length and every dict value
    contributes 1; for a top-level JSON array, its length is used.

    Args:
        response_text: Raw text returned by the model.
        input_len: Length of the user prompt in characters.

    Returns:
        0.0 for an empty response, 0.1 if no JSON could be parsed, 0.3 if a
        long input (> 5000 chars) produced at most one item, 0.9 for five or
        more items, otherwise 0.7.
    """
    if not response_text:
        return 0.0
    try:
        parsed = json.loads(response_text)
    except Exception:
        # Models sometimes wrap the JSON in prose; try the outermost {...}.
        start, end = response_text.find("{"), response_text.rfind("}")
        if not 0 <= start < end:
            return 0.1
        try:
            parsed = json.loads(response_text[start:end + 1])
        except Exception:
            return 0.1

    if isinstance(parsed, list):
        # A bare array of results is as much "items" as {"items": [...]}.
        n_items = len(parsed)
    elif isinstance(parsed, dict):
        n_items = sum(
            len(value) if isinstance(value, list) else 1
            for value in parsed.values()
            if isinstance(value, (list, dict))
        )
    else:
        n_items = 0

    if input_len > 5000 and n_items <= 1:
        return 0.3
    if n_items >= 5:
        return 0.9
    return 0.7
async def _call_ollama(system: str, user: str,
                       max_tokens: int = 6000,
                       timeout: float = 90.0) -> str:
    """Tier 1: send the prompt to the Ollama chat endpoint.

    Base URL and model come from ``OLLAMA_URL`` and ``CMP_LLM_MODEL``.
    Returns the message content, or ``""`` on any failure or empty reply.
    """
    base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
    model = os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b")
    endpoint = base.rstrip("/") + "/api/chat"
    messages = [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
    payload = {
        "model": model,
        "stream": False,
        "format": "json",
        "messages": messages,
        "options": {"temperature": 0.05, "num_predict": max_tokens},
    }
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(endpoint, json=payload)
            response.raise_for_status()
            message = response.json().get("message") or {}
            return message.get("content", "") or ""
    except Exception as exc:
        # P83 follow-up: log type and message explicitly so exceptions with
        # an empty message (e.g. ReadTimeout) can still be diagnosed.
        reason = str(exc) or "(no message)"
        logger.warning(
            "ollama cascade tier 1 failed: %s (%s) model=%s base=%s",
            reason, type(exc).__name__, model, base,
        )
        return ""
async def _call_ovh(system: str, user: str, max_tokens: int = 6000,
                    timeout: float = 45.0) -> str:
    """Tier 2: send the prompt to an OpenAI-style chat-completions endpoint.

    Configuration is read from ``OVH_LLM_URL``, ``OVH_LLM_MODEL`` and the
    optional ``OVH_LLM_KEY`` (sent as a bearer token when set).

    Args:
        system: System prompt.
        user: User prompt.
        max_tokens: Completion token limit sent to the endpoint.
        timeout: HTTP timeout in seconds.

    Returns:
        The first choice's message content, or ``""`` if the tier is not
        configured, the request fails, or the reply is empty.
    """
    base = os.getenv("OVH_LLM_URL", "").strip()
    key = os.getenv("OVH_LLM_KEY", "").strip()
    model = os.getenv("OVH_LLM_MODEL", "").strip()
    if not base or not model:
        # Tier not configured: skip quietly, the cascade moves on.
        return ""
    headers = {"Content-Type": "application/json"}
    if key:
        headers["Authorization"] = f"Bearer {key}"
    payload = {
        "model": model, "temperature": 0.05, "max_tokens": max_tokens,
        "messages": [{"role": "system", "content": system},
                     {"role": "user", "content": user}],
        "response_format": {"type": "json_object"},
    }
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(
                f"{base.rstrip('/')}/v1/chat/completions",
                json=payload, headers=headers,
            )
            response.raise_for_status()
            choice = (response.json().get("choices") or [{}])[0]
            return (choice.get("message") or {}).get("content", "") or ""
    except Exception as exc:
        # Same diagnostics as tier 1: some exceptions (e.g. timeouts) have an
        # empty message, so log the type, model and base URL as well.
        logger.warning(
            "ovh cascade tier 2 failed: %s (%s) model=%s base=%s",
            str(exc) or "(no message)", type(exc).__name__, model, base,
        )
        return ""
async def _call_anthropic(system: str, user: str,
                          max_tokens: int = 4000,
                          timeout: float = 30.0) -> str:
    """Tier 3: send the prompt to the Anthropic Messages API.

    The API key is read from ``ANTHROPIC_API_KEY``.

    Args:
        system: System prompt.
        user: User prompt.
        max_tokens: Completion token limit sent to the API.
        timeout: HTTP timeout in seconds.

    Returns:
        The concatenated ``text`` of all content blocks in the reply, or
        ``""`` if no API key is set or the request fails.
    """
    key = os.getenv("ANTHROPIC_API_KEY", "").strip()
    if not key:
        # Tier not configured: skip quietly.
        return ""
    model = "claude-haiku-4-5-20251001"
    headers = {
        "Content-Type": "application/json",
        "x-api-key": key,
        "anthropic-version": "2023-06-01",
    }
    payload = {
        "model": model,
        "max_tokens": max_tokens, "temperature": 0.05,
        "system": system,
        "messages": [{"role": "user", "content": user}],
    }
    try:
        async with httpx.AsyncClient(timeout=timeout) as client:
            response = await client.post(
                "https://api.anthropic.com/v1/messages",
                json=payload, headers=headers,
            )
            response.raise_for_status()
            blocks = response.json().get("content") or []
            # `or ""` guards against a block carrying "text": null.
            return "".join(
                block.get("text") or ""
                for block in blocks
                if isinstance(block, dict)
            )
    except Exception as exc:
        # Same diagnostics as tier 1: some exceptions (e.g. timeouts) have an
        # empty message, so log the type and model as well.
        logger.warning(
            "anthropic cascade tier 3 failed: %s (%s) model=%s",
            str(exc) or "(no message)", type(exc).__name__, model,
        )
        return ""
async def call_with_cascade(
    system: str,
    user: str,
    min_confidence: float = 0.6,
    max_tokens: int = 6000,
) -> dict:
    """Answer a prompt via cache, then Qwen, then OVH, then Anthropic.

    The cache is checked first. Otherwise each tier is called in order; the
    first non-empty response whose heuristic confidence reaches
    ``min_confidence`` is cached and returned. If no tier qualifies, the
    non-empty response with the highest confidence is returned uncached and
    flagged with ``below_threshold``.

    Args:
        system: System prompt.
        user: User prompt; its length feeds the confidence heuristic.
        min_confidence: Minimum heuristic confidence to accept a response.
        max_tokens: Token limit for tiers 1 and 2; tier 3 gets half of it.

    Returns:
        ``{'text': str, 'confidence': float, 'source': str, 'cached': bool}``,
        plus ``'below_threshold': True`` when no tier reached the threshold.
    """
    key = _cache_key(system, user)
    cached = _cache_get(key)
    if cached:
        # Build a new dict: the cached object may be the live in-process
        # cache entry and must not be modified in place.
        hit = {**cached, "cached": True}
        logger.info("cascade cache HIT key=%s len=%d", key[-12:],
                    len(hit.get("text", "")))
        return hit

    input_len = len(user)

    # Pre-flight: warn if Tier 2 + Tier 3 are unconfigured so the operator
    # knows the cascade has de facto collapsed to a single tier.
    ovh_configured = bool(os.getenv("OVH_LLM_URL", "").strip()
                          and os.getenv("OVH_LLM_MODEL", "").strip())
    anthropic_configured = bool(os.getenv("ANTHROPIC_API_KEY", "").strip())
    if not ovh_configured and not anthropic_configured:
        logger.warning(
            "cascade: Tier 2 (OVH) AND Tier 3 (Anthropic) unconfigured — "
            "running on Tier 1 (Qwen) only. Set OVH_LLM_URL/MODEL/KEY "
            "or ANTHROPIC_API_KEY to enable fallbacks."
        )

    # (source label, tier coroutine function, token budget), in call order.
    tiers = (
        ("qwen", _call_ollama, max_tokens),
        ("ovh_120b", _call_ovh, max_tokens),
        ("anthropic_claude", _call_anthropic, max_tokens // 2),
    )
    attempts: list[tuple[float, str, str]] = []
    for source, call_tier, token_budget in tiers:
        text = await call_tier(system, user, max_tokens=token_budget)
        conf = _heuristic_confidence(text, input_len)
        if text and conf >= min_confidence:
            out = {"text": text, "confidence": conf,
                   "source": source, "cached": False}
            _cache_put(key, out)
            return out
        attempts.append((conf, text, source))

    # No tier reached the threshold: return the best attempt so that text,
    # confidence and source all describe the same response. max() keeps the
    # earliest tier on ties.
    delivered = [attempt for attempt in attempts if attempt[1]]
    if delivered:
        best_conf, best_text, best_source = max(
            delivered, key=lambda attempt: attempt[0])
    else:
        best_conf, best_text, best_source = 0.0, "", tiers[-1][0]
    # The below-threshold path has always reported tier 3 as "anthropic"
    # (not "anthropic_claude"); kept for callers that match on it.
    legacy_labels = {"anthropic_claude": "anthropic"}
    best_source = legacy_labels.get(best_source, best_source)
    return {"text": best_text, "confidence": best_conf,
            "source": best_source, "cached": False,
            "below_threshold": True}