""" LLM-based vendor extraction (V3 fallback). When the cookie-policy text does not come from a known CMP (so we have no structured JSON payload) we ask Qwen (local Ollama) → OVH (managed 120B) to extract a vendor list as JSON. Output is then mapped to the same VendorRecord schema used by vendor_extractor.py — so the rest of the pipeline (URL probing, scoring, VVT table) works unchanged. This bridges the long tail of cookie-policy implementations where the content sits in DOM accordions rather than a CMP JSON endpoint. """ from __future__ import annotations import json import logging import os import re from typing import Optional import httpx logger = logging.getLogger(__name__) _SYSTEM_PROMPT = ( "Du bist ein Compliance-Tester. Extrahiere aus einer deutschen " "Cookie-Richtlinie alle erwaehnten Drittanbieter (Dienste, Vendors, " "Cookie-Provider).\n\n" "Gib NUR ein JSON-Objekt zurueck:\n" '{"vendors": [\n' ' {"name": "", "country": "", ' '"purpose": "", "category": "", ' '"opt_out_url": "", ' '"privacy_policy_url": "", ' '"persistence": "", ' '"cookies": [{"name": "", "purpose": "", ' '"expiry": "", "is_third_party": true}]\n' ' }\n' "]}\n\n" "Regeln:\n" "- Wenn ein Feld nicht im Text steht: leerer String oder leere Liste.\n" "- KEINE Anbieter erfinden oder halluzinieren.\n" "- Max 80 Anbieter, max 30 Cookies pro Anbieter.\n" "- Nur reines JSON, keine Prosa, keine Code-Fences." ) async def extract_vendors_via_llm( cookie_text: str, max_text_chars: int = 50000, ) -> list[dict]: """Run the Qwen → OVH cascade. Returns vendor records (possibly empty). max_text_chars: VW-Cookie-Richtlinie hat ~60k chars mit ~100 Cookies in der Tabelle. Bei 12k waren wir auf die ersten ~5 Cookies begrenzt und haben nur 1 Vendor extrahiert. 50k deckt VW/BMW/Mercedes komplett ab und passt in Qwen3-30b-a3b (128k Context) sowie OVH 120B. """ if not cookie_text or len(cookie_text) < 500: return [] excerpt = cookie_text[:max_text_chars] user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}" # P31: nutze tiered LLM-Cascade mit Cache (Qwen → OVH → Anthropic). # Re-Runs derselben Cookie-Doc landen im Valkey-Cache (7d TTL) und # gehen in ~50ms statt 4-6min durch. Erstaufruf bleibt 4-6min lokal # bzw ~2min auf OVH. try: from compliance.services.llm_cascade import call_with_cascade res = await call_with_cascade( system=_SYSTEM_PROMPT, user=user_prompt, min_confidence=0.6, max_tokens=16000, ) vendors = _parse_vendor_list(res.get("text", "")) if vendors: logger.info( "LLM vendor extraction (cascade %s, conf=%.2f, cached=%s): %d vendors", res.get("source"), res.get("confidence", 0), res.get("cached"), len(vendors), ) return vendors except Exception as e: logger.warning("Cascade extract failed, fallback to direct Qwen: %s", e) # Fallback: alte direkte Logik content = await _call_ollama(user_prompt) vendors = _parse_vendor_list(content) if vendors: return vendors content = await _call_ovh(user_prompt) return _parse_vendor_list(content) async def _call_ollama(user_prompt: str) -> str: base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434") model = os.getenv("CMP_LLM_MODEL", os.getenv("OLLAMA_MODEL", "qwen3:30b-a3b")) payload = { "model": model, "messages": [ {"role": "system", "content": _SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}, ], "stream": False, "format": "json", # 16k tokens fuer ~80 Vendors mit je 30 Cookies. War vorher 6k → # output wurde mittendrin abgeschnitten, JSON unparseable → 0 Vendors. "options": {"temperature": 0.05, "num_predict": 16000}, } try: # Qwen 30b braucht fuer 16k output ~4-6min auf M4 Pro. async with httpx.AsyncClient(timeout=420.0) as client: resp = await client.post(f"{base.rstrip('/')}/api/chat", json=payload) resp.raise_for_status() return (resp.json().get("message") or {}).get("content", "") except Exception as e: logger.warning("Qwen vendor-extract failed: %s", e) return "" async def _call_ovh(user_prompt: str) -> str: base = os.getenv("OVH_LLM_URL", "").strip() key = os.getenv("OVH_LLM_KEY", "").strip() model = os.getenv("OVH_LLM_MODEL", "").strip() if not base or not model: return "" headers = {"Content-Type": "application/json"} if key: headers["Authorization"] = f"Bearer {key}" payload = { "model": model, "messages": [ {"role": "system", "content": _SYSTEM_PROMPT}, {"role": "user", "content": user_prompt}, ], "temperature": 0.05, "max_tokens": 16000, "response_format": {"type": "json_object"}, } try: async with httpx.AsyncClient(timeout=90.0) as client: resp = await client.post( f"{base.rstrip('/')}/v1/chat/completions", json=payload, headers=headers, ) resp.raise_for_status() choice = (resp.json().get("choices") or [{}])[0] return (choice.get("message") or {}).get("content", "") or "" except Exception as e: logger.warning("OVH vendor-extract failed: %s", e) return "" def _parse_vendor_list(content: str) -> list[dict]: """Be lenient about JSON wrappers / code-fences.""" if not content: return [] for candidate in (content, _strip_fence(content), _grab_json(content)): if not candidate: continue try: obj = json.loads(candidate) except Exception: continue if isinstance(obj, dict): vendors = obj.get("vendors") or obj.get("Vendors") if isinstance(vendors, list): return _normalize(vendors) if isinstance(obj, list): return _normalize(obj) return [] def _normalize(items: list) -> list[dict]: out: list[dict] = [] for item in items[:80]: if not isinstance(item, dict): continue name = (item.get("name") or "").strip() if not name: continue cookies_raw = item.get("cookies") or [] cookies: list[dict] = [] for c in cookies_raw[:30]: if not isinstance(c, dict): continue cookies.append({ "name": (c.get("name") or "").strip(), "purpose": (c.get("purpose") or "").strip(), "expiry": (c.get("expiry") or "").strip(), "is_third_party": bool(c.get("is_third_party", True)), }) out.append({ "name": name, "country": (item.get("country") or "").strip()[:4], "purpose": (item.get("purpose") or "").strip()[:500], "category": (item.get("category") or "").strip(), "opt_out_url": _safe_url(item.get("opt_out_url")), "privacy_policy_url": _safe_url(item.get("privacy_policy_url")), "persistence": (item.get("persistence") or "").strip()[:200], "cookies": cookies, }) return out def _safe_url(value: Optional[str]) -> str: if not value or not isinstance(value, str): return "" v = value.strip() if v.startswith(("http://", "https://")): return v[:500] return "" def _strip_fence(s: str) -> str: s = s.strip() if s.startswith("```"): lines = s.split("\n") return "\n".join(lines[1:-1]) if lines[-1].strip().startswith("```") else "\n".join(lines[1:]) return s def _grab_json(s: str) -> str: a, b = s.find("{"), s.rfind("}") if 0 <= a < b: return s[a:b + 1] a, b = s.find("["), s.rfind("]") if 0 <= a < b: return s[a:b + 1] return "" # Defensive import to make optional dependency obvious _ = re # pragma: no cover