6dad42a8c0
VW-Loop-Iteration 1: LLM cascade lieferte 14 vendors (Lucky-Hit via Direct-Fallback). VW-Loop-Iteration 2: 0 vendors — qwen2.5:14b ReadTimeout auch im 420s-Direct-Fallback (50k input + 16k output output dauert > 7min auf M4 Pro). Fix: max_text_chars 50000 → 20000. Erfasst die ersten ~3000 Worte der Cookie-Tabelle (Tabellen-Kopf komplett). Vollstaendige Tabelle wird ohnehin deterministisch von parse_flat_cookie_text geparsed. LLM ist nur fuer Vendor-Namen die NICHT in der Tabelle stehen (z.B. aus Prosa) und Inferenz-faehiger. Erwartung: 60-120s LLM-call statt Timeout, reproduzierbar 10-15 LLM- Vendors → Vendor-Normalizer-Total bleibt stabil bei 20+ statt 17. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
256 lines
9.2 KiB
Python
256 lines
9.2 KiB
Python
"""
|
|
LLM-based vendor extraction (V3 fallback).
|
|
|
|
When the cookie-policy text does not come from a known CMP (so we have no
|
|
structured JSON payload) we ask Qwen (local Ollama) → OVH (managed 120B)
|
|
to extract a vendor list as JSON. Output is then mapped to the same
|
|
VendorRecord schema used by vendor_extractor.py — so the rest of the
|
|
pipeline (URL probing, scoring, VVT table) works unchanged.
|
|
|
|
This bridges the long tail of cookie-policy implementations where the
|
|
content sits in DOM accordions rather than a CMP JSON endpoint.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
from typing import Optional
|
|
|
|
import httpx
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
# System prompt sent verbatim (in German) to every backend.
# It asks the model to list all third-party vendors mentioned in a German
# cookie policy and to answer with a single JSON object of the shape
# {"vendors": [{name, country, purpose, category, opt_out_url,
# privacy_policy_url, persistence, cookies: [{name, purpose, expiry,
# is_third_party}]}]}.
# The rules at the end tell the model to use empty strings/lists for missing
# fields, not to invent vendors, to stay within 80 vendors / 30 cookies per
# vendor, and to emit raw JSON without prose or code fences.
# The text is part of the program's behaviour: do not translate or reformat it.
_SYSTEM_PROMPT = (
    "Du bist ein Compliance-Tester. Extrahiere aus einer deutschen "
    "Cookie-Richtlinie alle erwaehnten Drittanbieter (Dienste, Vendors, "
    "Cookie-Provider).\n\n"
    "Gib NUR ein JSON-Objekt zurueck:\n"
    '{"vendors": [\n'
    ' {"name": "<Firmenname>", "country": "<DE|US|IE|...|>", '
    '"purpose": "<Kurz>", "category": "<marketing|analytics|functional|necessary>", '
    '"opt_out_url": "<vollstaendige URL oder \\"\\">", '
    '"privacy_policy_url": "<vollstaendige URL oder \\"\\">", '
    '"persistence": "<Speicherdauer in Worten>", '
    '"cookies": [{"name": "<Cookie-Name>", "purpose": "<Kurz>", '
    '"expiry": "<Dauer>", "is_third_party": true}]\n'
    ' }\n'
    "]}\n\n"
    "Regeln:\n"
    "- Wenn ein Feld nicht im Text steht: leerer String oder leere Liste.\n"
    "- KEINE Anbieter erfinden oder halluzinieren.\n"
    "- Max 80 Anbieter, max 30 Cookies pro Anbieter.\n"
    "- Nur reines JSON, keine Prosa, keine Code-Fences."
)
|
|
|
|
|
|
async def extract_vendors_via_llm(
    cookie_text: str,
    max_text_chars: int = 20000,
) -> list[dict]:
    """Extract vendor records from cookie-policy text via the LLM cascade.

    The text is truncated to ``max_text_chars`` and sent to
    ``call_with_cascade``. If the cascade raises or yields no parseable
    vendors, the direct Ollama call and then the direct OVH call are tried
    in that order.

    Args:
        cookie_text: Raw cookie-policy text. Inputs that are empty or
            shorter than 500 characters are skipped without any LLM call.
        max_text_chars: Maximum number of leading characters sent to the
            model. Lowered from 50000 to 20000 (P-fix 2026-05-22) because
            the larger input ran into read timeouts even with the 420 s
            direct fallback. Per the original note, the full cookie table
            is handled by a deterministic parser elsewhere and the LLM is
            only meant to catch vendors mentioned outside the table.

    Returns:
        Normalized vendor dicts; an empty list when nothing was extracted.
    """
    text_len = len(cookie_text or "")
    if not cookie_text or text_len < 500:
        logger.info(
            "LLM vendor extraction SKIP: cookie_text too short (%d chars, need >=500)",
            text_len,
        )
        return []

    excerpt = cookie_text[:max_text_chars]
    user_prompt = "Cookie-Richtlinie-Text:\n\n" + excerpt
    logger.info(
        "LLM vendor extraction START: input=%d chars (excerpt %d), calling cascade",
        text_len, len(excerpt),
    )

    # P31: tiered LLM cascade with cache (Qwen -> OVH -> Anthropic).
    # Per the original note, re-runs of the same cookie document are served
    # from the Valkey cache (7d TTL) in ~50ms instead of 4-6min; a first
    # call still takes 4-6min locally or ~2min on OVH.
    try:
        # Imported lazily so a missing/broken cascade module only triggers
        # the direct fallback below instead of breaking module import.
        from compliance.services.llm_cascade import call_with_cascade

        result = await call_with_cascade(
            system=_SYSTEM_PROMPT,
            user=user_prompt,
            min_confidence=0.6,
            max_tokens=16000,
        )
        raw_text = result.get("text", "")
        found = _parse_vendor_list(raw_text)
        if found:
            logger.info(
                "LLM vendor extraction OK (cascade %s, conf=%.2f, cached=%s): %d vendors",
                result.get("source"), result.get("confidence", 0),
                result.get("cached"), len(found),
            )
            return found
        # A silent miss is useless for diagnosis: log explicitly that the
        # cascade returned but produced 0 vendors (parse failure or empty
        # model output).
        logger.warning(
            "LLM vendor extraction returned 0 vendors (cascade source=%s "
            "text_len=%d below_threshold=%s) — fallback to direct calls",
            result.get("source"), len(raw_text or ""),
            result.get("below_threshold"),
        )
    except Exception as e:
        logger.warning(
            "Cascade extract failed, fallback to direct Qwen: %s (%s)",
            str(e) or "(no message)", type(e).__name__,
        )

    # Fallback: legacy direct calls, local Ollama first, then OVH.
    for backend in (_call_ollama, _call_ovh):
        found = _parse_vendor_list(await backend(user_prompt))
        if found:
            return found
    return []
|
|
|
|
|
|
async def _call_ollama(user_prompt: str) -> str:
    """Send the extraction prompt to a local Ollama server and return its reply.

    Configuration is read from the environment:
    ``OLLAMA_URL`` (default ``http://host.docker.internal:11434``) and
    ``CMP_LLM_MODEL``, falling back to ``OLLAMA_MODEL``, then to
    ``qwen3:30b-a3b``.

    Args:
        user_prompt: The user message containing the cookie-policy excerpt.

    Returns:
        The assistant message content, or ``""`` on any failure or when the
        response carries no content. Never ``None``.
    """
    base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
    model = os.getenv("CMP_LLM_MODEL", os.getenv("OLLAMA_MODEL", "qwen3:30b-a3b"))
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "stream": False, "format": "json",
        # 16k tokens for ~80 vendors with up to 30 cookies each. This used
        # to be 6k, which truncated the output mid-way and left the JSON
        # unparseable (0 vendors).
        "options": {"temperature": 0.05, "num_predict": 16000},
    }
    try:
        # Per the original note, Qwen 30b needs ~4-6min for 16k output
        # tokens on an M4 Pro, hence the long timeout.
        async with httpx.AsyncClient(timeout=420.0) as client:
            resp = await client.post(f"{base.rstrip('/')}/api/chat", json=payload)
            resp.raise_for_status()
            # `or ""` covers a JSON null content so the declared `str`
            # return type holds (same guard as in _call_ovh).
            return (resp.json().get("message") or {}).get("content", "") or ""
    except Exception as e:
        # Best-effort call: never raise, but log the exception type too,
        # because some exceptions (e.g. timeouts) carry an empty message.
        logger.warning(
            "Qwen vendor-extract failed: %s (%s)",
            str(e) or "(no message)", type(e).__name__,
        )
        return ""
|
|
|
|
|
|
async def _call_ovh(user_prompt: str) -> str:
    """Send the extraction prompt to the OVH chat-completions endpoint.

    Configuration is read from the environment: ``OVH_LLM_URL``,
    ``OVH_LLM_MODEL`` and the optional ``OVH_LLM_KEY`` (sent as a bearer
    token when set). The request goes to ``<OVH_LLM_URL>/v1/chat/completions``
    and asks for a JSON-object response.

    Args:
        user_prompt: The user message containing the cookie-policy excerpt.

    Returns:
        The content of the first choice's message, or ``""`` when the
        endpoint is not configured, the call fails, or no content is
        returned.
    """
    base = os.getenv("OVH_LLM_URL", "").strip()
    key = os.getenv("OVH_LLM_KEY", "").strip()
    model = os.getenv("OVH_LLM_MODEL", "").strip()
    if not base or not model:
        # Backend not configured: treat as "no answer", not as an error.
        return ""

    headers = {"Content-Type": "application/json"}
    if key:
        headers["Authorization"] = f"Bearer {key}"
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": _SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ],
        "temperature": 0.05, "max_tokens": 16000,
        "response_format": {"type": "json_object"},
    }
    try:
        # NOTE(review): the 90s timeout is shorter than the 420s used for
        # the local Ollama call — confirm it is long enough for a 16k-token
        # completion.
        async with httpx.AsyncClient(timeout=90.0) as client:
            resp = await client.post(
                f"{base.rstrip('/')}/v1/chat/completions",
                json=payload, headers=headers,
            )
            resp.raise_for_status()
            # Missing or empty `choices` degrades to an empty dict.
            choice = (resp.json().get("choices") or [{}])[0]
            return (choice.get("message") or {}).get("content", "") or ""
    except Exception as e:
        # Best-effort call: never raise, but log the exception type too,
        # because some exceptions (e.g. timeouts) carry an empty message.
        logger.warning(
            "OVH vendor-extract failed: %s (%s)",
            str(e) or "(no message)", type(e).__name__,
        )
        return ""
|
|
|
|
|
|
def _parse_vendor_list(content: str) -> list[dict]:
|
|
"""Be lenient about JSON wrappers / code-fences."""
|
|
if not content:
|
|
return []
|
|
for candidate in (content, _strip_fence(content), _grab_json(content)):
|
|
if not candidate:
|
|
continue
|
|
try:
|
|
obj = json.loads(candidate)
|
|
except Exception:
|
|
continue
|
|
if isinstance(obj, dict):
|
|
vendors = obj.get("vendors") or obj.get("Vendors")
|
|
if isinstance(vendors, list):
|
|
return _normalize(vendors)
|
|
if isinstance(obj, list):
|
|
return _normalize(obj)
|
|
return []
|
|
|
|
|
|
def _normalize(items: list) -> list[dict]:
|
|
out: list[dict] = []
|
|
for item in items[:80]:
|
|
if not isinstance(item, dict):
|
|
continue
|
|
name = (item.get("name") or "").strip()
|
|
if not name:
|
|
continue
|
|
cookies_raw = item.get("cookies") or []
|
|
cookies: list[dict] = []
|
|
for c in cookies_raw[:30]:
|
|
if not isinstance(c, dict):
|
|
continue
|
|
cookies.append({
|
|
"name": (c.get("name") or "").strip(),
|
|
"purpose": (c.get("purpose") or "").strip(),
|
|
"expiry": (c.get("expiry") or "").strip(),
|
|
"is_third_party": bool(c.get("is_third_party", True)),
|
|
})
|
|
out.append({
|
|
"name": name,
|
|
"country": (item.get("country") or "").strip()[:4],
|
|
"purpose": (item.get("purpose") or "").strip()[:500],
|
|
"category": (item.get("category") or "").strip(),
|
|
"opt_out_url": _safe_url(item.get("opt_out_url")),
|
|
"privacy_policy_url": _safe_url(item.get("privacy_policy_url")),
|
|
"persistence": (item.get("persistence") or "").strip()[:200],
|
|
"cookies": cookies,
|
|
})
|
|
return out
|
|
|
|
|
|
def _safe_url(value: Optional[str]) -> str:
|
|
if not value or not isinstance(value, str):
|
|
return ""
|
|
v = value.strip()
|
|
if v.startswith(("http://", "https://")):
|
|
return v[:500]
|
|
return ""
|
|
|
|
|
|
def _strip_fence(s: str) -> str:
|
|
s = s.strip()
|
|
if s.startswith("```"):
|
|
lines = s.split("\n")
|
|
return "\n".join(lines[1:-1]) if lines[-1].strip().startswith("```") else "\n".join(lines[1:])
|
|
return s
|
|
|
|
|
|
def _grab_json(s: str) -> str:
|
|
a, b = s.find("{"), s.rfind("}")
|
|
if 0 <= a < b:
|
|
return s[a:b + 1]
|
|
a, b = s.find("["), s.rfind("]")
|
|
if 0 <= a < b:
|
|
return s[a:b + 1]
|
|
return ""
|
|
|
|
|
|
# Keep the otherwise unused stdlib `re` import referenced (this module does not use regexes itself).
|
|
_ = re # pragma: no cover
|