From cf6005a47c3a3ec64bdbf9f6485c139dd9ab45c5 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Fri, 22 May 2026 09:40:11 +0200 Subject: [PATCH] perf(audit): vendor_llm_extractor + mc_solution_generator nutzen P31 LLM-Cascade MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Beide rufen jetzt llm_cascade.call_with_cascade() auf statt direkter Qwen/OVH-Aufrufe. Damit: * Cache-Hit auf identische Eingaben (Valkey, 7d TTL) → ~50ms statt 4-6min beim Re-Run derselben Cookie-Doc. * Tiered Cascade automatisch: Qwen → OVH 120B → Anthropic Claude Haiku, wenn das niedrigere Tier unter dem Confidence-Threshold liegt. * Confidence-Scoring (JSON-parse + items_per_input_size) entscheidet, ob weiter delegiert wird. Fallback auf alte _call_ollama/_call_ovh bleibt bestehen, wenn der Cascade-Aufruf scheitert. Erwartete Wirkung beim 2. VW-Lauf: ~10min statt ~25min (Cache-Hit auf identische Cookie-Doc + MC-Solutions). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../services/mc_solution_generator.py | 15 +++++++++ .../services/vendor_llm_extractor.py | 31 ++++++++++++++----- 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/backend-compliance/compliance/services/mc_solution_generator.py b/backend-compliance/compliance/services/mc_solution_generator.py index c8069438..b8db4724 100644 --- a/backend-compliance/compliance/services/mc_solution_generator.py +++ b/backend-compliance/compliance/services/mc_solution_generator.py @@ -172,6 +172,21 @@ async def generate_solution( "Liefere die Loesung als JSON." ) + # P31: tiered Cascade (Qwen → OVH → Anthropic) mit Valkey-Cache. 
+ try: + from compliance.services.llm_cascade import call_with_cascade + res = await call_with_cascade( + system=_SYSTEM_PROMPT, user=prompt, + min_confidence=0.5, max_tokens=600, + ) + parsed = _parse(res.get("text", "")) + if parsed: + _cache_put(cache_key, parsed) + return parsed + except Exception: + # fall through to legacy direct calls + pass + content = await _call_ollama(prompt) parsed = _parse(content) if not parsed: diff --git a/backend-compliance/compliance/services/vendor_llm_extractor.py b/backend-compliance/compliance/services/vendor_llm_extractor.py index 715579d3..10ecf14d 100644 --- a/backend-compliance/compliance/services/vendor_llm_extractor.py +++ b/backend-compliance/compliance/services/vendor_llm_extractor.py @@ -63,19 +63,34 @@ async def extract_vendors_via_llm( excerpt = cookie_text[:max_text_chars] user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}" - # Stage 1: local Qwen + # P31: nutze tiered LLM-Cascade mit Cache (Qwen → OVH → Anthropic). + # Re-Runs derselben Cookie-Doc landen im Valkey-Cache (7d TTL) und + # gehen in ~50ms statt 4-6min durch. Erstaufruf bleibt 4-6min lokal + # bzw ~2min auf OVH. 
+ try: + from compliance.services.llm_cascade import call_with_cascade + res = await call_with_cascade( + system=_SYSTEM_PROMPT, user=user_prompt, + min_confidence=0.6, max_tokens=16000, + ) + vendors = _parse_vendor_list(res.get("text", "")) + if vendors: + logger.info( + "LLM vendor extraction (cascade %s, conf=%.2f, cached=%s): %d vendors", + res.get("source"), res.get("confidence", 0), + res.get("cached"), len(vendors), + ) + return vendors + except Exception as e: + logger.warning("Cascade extract failed, fallback to direct Qwen: %s", e) + + # Fallback: alte direkte Logik content = await _call_ollama(user_prompt) vendors = _parse_vendor_list(content) if vendors: - logger.info("LLM vendor extraction (Qwen): %d vendors", len(vendors)) return vendors - - # Stage 2: OVH backup content = await _call_ovh(user_prompt) - vendors = _parse_vendor_list(content) - if vendors: - logger.info("LLM vendor extraction (OVH): %d vendors", len(vendors)) - return vendors + return _parse_vendor_list(content) async def _call_ollama(user_prompt: str) -> str: