Files
breakpilot-compliance/consent-tester/services/cmp_llm_fallback.py
T
Benjamin Admin 2400aa6a9e feat(consent-tester): Phase C+D — LLM cascade fallback (Qwen → OVH)
New module consent-tester/services/cmp_llm_fallback.py:
- LLMCookieExtractor: single-endpoint adapter (Ollama OR OpenAI-compat)
- LLMCascade: tries Qwen (local Mac Mini Ollama) first; falls through to
  OVH (managed 120B) when Qwen returns no usable strategy
- LLMCascade.from_env(): reads OLLAMA_URL/CMP_LLM_MODEL + OVH_LLM_URL/
  OVH_LLM_KEY/OVH_LLM_MODEL from environment
- LLM returns JSON {strategy: url|selector|text, value: ...}
- Valkey-backed cache per netloc (cmp:hint:<netloc>, 7-day TTL) — next run
  against the same domain skips the LLM entirely

dsi_discovery.py:
- Wired network_log collector (URL/status/content-type/size of every JSON
  response on the page) — passed to LLM prompt as observation
- After Named CMP (Phase B) + Heuristic (Phase A) both fail AND DOM
  < 300 words: invoke LLMCascade.analyze(...)
- _apply_llm_hint executes the LLM's strategy: refetch URL via Playwright
  request context, query DOM selector, or use text directly
- Cache HIT path: apply cached hint, only fall back to LLM if cache is stale

docker-compose.yml:
- consent-tester gets env vars + cmp-data volume (for Phase E)
- All LLM endpoints configurable via env, sensible defaults

consent-tester/requirements.txt:
- redis>=5.0 (asyncio client, Valkey-compatible)
- httpx>=0.27
2026-05-16 23:06:05 +02:00

289 lines
9.7 KiB
Python

"""
LLM-based Cookie-Policy Fallback (Phase C+D of the cascade).
When the named CMP library (Phase B) AND the generic JSON heuristic (Phase A)
both come up empty, we send the page's DOM snapshot + network log to an LLM
and ask it to find the policy. Cascade order:
Qwen (local Ollama on Mac Mini) ← Phase C, fast + private
↓ if invalid response
OVH (managed 120B model) ← Phase D, more capable backup
The LLM returns one of:
{"strategy": "url", "value": "<full-json-endpoint-url>"}
{"strategy": "selector", "value": "<css-selector-in-current-DOM>"}
{"strategy": "text", "value": "<direct extracted policy text>"}
Per-domain results are cached in Valkey (7-day TTL) so we only pay the LLM
cost once per site.
Wiring: dsi_discovery.py calls LLMCascade.analyze(...) when self_wc < 300
AND cmp_capture.payloads is empty.
"""
from __future__ import annotations
import json
import logging
import os
from typing import Any, Literal
import httpx
logger = logging.getLogger(__name__)
# System prompt (German) sent as the "system" message on every LLM call.
# It tells the model to answer with exactly ONE JSON object of the form
# {"strategy": "url"|"selector"|"text", "value": "..."} and to answer
# {"strategy":"none","value":""} when no cookie policy can be located.
# NOTE: this text is runtime data consumed by the model — do not translate
# or reformat it without re-validating the model's answers.
_SYSTEM_PROMPT = (
"Du analysierst eine Website-Inspektion, um die Cookie-Richtlinie zu "
"finden. Wir haben einen DOM-Auszug und ein Netzwerk-Log mit JSON-"
"Responses geladen. Gib genau EIN JSON-Objekt zurueck:\n"
'{"strategy": "url"|"selector"|"text", "value": "..."}\n\n'
"- strategy=url: vollstaendige JSON-URL, die die strukturierte Cookie-"
"Policy enthaelt (z.B. CMP-Endpoints von OneTrust, Cookiebot, etc.)\n"
"- strategy=selector: CSS-Selector im aktuellen DOM, der den eigentlichen "
"Policy-Text enthaelt (kein Sub-iframe).\n"
"- strategy=text: nur wenn der Policy-Text bereits im DOM steht und du "
"ihn extrahieren kannst (max 4000 Worte).\n\n"
"Antworte AUSSCHLIESSLICH mit JSON. Keine Erklaerung. Wenn nichts "
"gefunden: {\"strategy\":\"none\",\"value\":\"\"}"
)
class LLMCookieExtractor:
    """Adapter for one LLM endpoint (Ollama or OpenAI-compatible).

    The adapter sends a system + user message pair to the endpoint and
    returns the model's answer parsed as a JSON object.

    Args:
        kind: ``"ollama"`` selects the Ollama ``/api/chat`` protocol; any
            other value is treated as an OpenAI-compatible endpoint.
        base_url: Endpoint root. Trailing slashes are removed.
        model: Model identifier passed through to the endpoint.
        api_key: Optional bearer token (only used for OpenAI-compatible
            endpoints; omitted from the request when empty).
        timeout: HTTP timeout in seconds for a single request.
    """

    def __init__(
        self,
        kind: Literal["ollama", "openai"],
        base_url: str,
        model: str,
        api_key: str = "",
        timeout: float = 60.0,
    ) -> None:
        self.kind = kind
        # Normalised so endpoint paths can be appended with a single "/".
        self.base_url = base_url.rstrip("/")
        self.model = model
        self.api_key = api_key
        self.timeout = timeout

    async def analyze(
        self,
        target_url: str,
        dom_snapshot: str,
        network_log: list[dict],
    ) -> dict:
        """Send DOM + network info to the LLM and parse the JSON response.

        Returns:
            The parsed JSON object, or ``{}`` when the request fails or the
            answer cannot be parsed. Failures are logged, never raised.
        """
        user_prompt = _build_user_prompt(target_url, dom_snapshot, network_log)
        try:
            content = await self._chat(_SYSTEM_PROMPT, user_prompt)
        except Exception as e:
            # Best-effort boundary: a broken endpoint must not abort discovery.
            logger.warning("%s LLM call failed: %s", self.kind, e)
            return {}
        return _parse_json_response(content)

    async def _chat(self, system: str, user: str) -> str:
        """Dispatch to the protocol implementation selected by ``kind``."""
        if self.kind == "ollama":
            return await self._chat_ollama(system, user)
        return await self._chat_openai(system, user)

    async def _chat_ollama(self, system: str, user: str) -> str:
        """POST to Ollama's ``/api/chat`` and return the message content."""
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "stream": False,
            "format": "json",
            "options": {"temperature": 0.1, "num_predict": 2000},
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(f"{self.base_url}/api/chat", json=payload)
            resp.raise_for_status()
            data = resp.json()
        return self._ollama_content(data)

    async def _chat_openai(self, system: str, user: str) -> str:
        """POST to ``/v1/chat/completions`` and return the first choice's content."""
        headers = {"Content-Type": "application/json"}
        if self.api_key:
            headers["Authorization"] = f"Bearer {self.api_key}"
        payload = {
            "model": self.model,
            "messages": [
                {"role": "system", "content": system},
                {"role": "user", "content": user},
            ],
            "temperature": 0.1,
            "max_tokens": 2000,
            "response_format": {"type": "json_object"},
        }
        async with httpx.AsyncClient(timeout=self.timeout) as client:
            resp = await client.post(
                self._openai_url(), json=payload, headers=headers,
            )
            resp.raise_for_status()
            data = resp.json()
        return self._openai_content(data)

    def _openai_url(self) -> str:
        """Return the chat-completions URL without duplicating ``/v1``.

        OpenAI-compatible providers commonly publish their base URL with the
        ``/v1`` suffix already included; appending ``/v1`` again would yield
        ``/v1/v1/chat/completions``.
        """
        if self.base_url.endswith("/v1"):
            return f"{self.base_url}/chat/completions"
        return f"{self.base_url}/v1/chat/completions"

    @staticmethod
    def _ollama_content(data: Any) -> str:
        """Extract ``message.content`` from an Ollama response; ``""`` if absent."""
        message = data.get("message") if isinstance(data, dict) else None
        content = message.get("content") if isinstance(message, dict) else None
        # A null content must not leak out as None (return type is str).
        return content if isinstance(content, str) else ""

    @staticmethod
    def _openai_content(data: Any) -> str:
        """Extract ``choices[0].message.content``; ``""`` if absent or malformed."""
        choices = data.get("choices") if isinstance(data, dict) else None
        if not isinstance(choices, list) or not choices:
            return ""
        first = choices[0]
        message = first.get("message") if isinstance(first, dict) else None
        content = message.get("content") if isinstance(message, dict) else None
        return content if isinstance(content, str) else ""
class LLMCascade:
    """Two-tier LLM fallback: try Qwen first, fall through to OVH.

    Each tier is optional; a tier set to ``None`` is skipped. A tier's
    answer is accepted only when it names one of the strategies in
    ``_USABLE_STRATEGIES`` together with a non-blank string value.
    """

    # Strategies the caller knows how to execute ("none" is deliberately absent).
    _USABLE_STRATEGIES = ("url", "selector", "text")

    def __init__(
        self,
        qwen: LLMCookieExtractor | None = None,
        ovh: LLMCookieExtractor | None = None,
    ) -> None:
        self.qwen = qwen
        self.ovh = ovh

    @classmethod
    def from_env(cls) -> "LLMCascade":
        """Build a cascade from environment variables.

        Reads ``OLLAMA_URL`` / ``CMP_LLM_MODEL`` for the Qwen tier (both have
        defaults) and ``OVH_LLM_URL`` / ``OVH_LLM_KEY`` / ``OVH_LLM_MODEL``
        for the OVH tier. A tier whose URL or model is blank is left
        unconfigured; a warning is logged when neither tier is configured.
        """
        qwen = None
        ovh = None
        # Stripped like the OVH values so a whitespace-only override disables
        # the tier instead of producing an unusable endpoint.
        ollama_url = os.getenv("OLLAMA_URL", "http://bp-core-ollama:11434").strip()
        qwen_model = os.getenv("CMP_LLM_MODEL", "qwen3:30b-a3b").strip()
        if ollama_url and qwen_model:
            qwen = LLMCookieExtractor(
                kind="ollama", base_url=ollama_url, model=qwen_model, timeout=90.0,
            )
        ovh_url = os.getenv("OVH_LLM_URL", "").strip()
        ovh_key = os.getenv("OVH_LLM_KEY", "").strip()
        ovh_model = os.getenv("OVH_LLM_MODEL", "").strip()
        if ovh_url and ovh_model:
            ovh = LLMCookieExtractor(
                kind="openai", base_url=ovh_url, model=ovh_model,
                api_key=ovh_key, timeout=60.0,
            )
        if not qwen and not ovh:
            logger.warning("LLMCascade: neither Qwen nor OVH configured")
        return cls(qwen=qwen, ovh=ovh)

    async def analyze(
        self,
        target_url: str,
        dom_snapshot: str,
        network_log: list[dict],
    ) -> dict:
        """Return the first usable LLM analysis, or ``{}`` if all tiers fail.

        The returned dict is the tier's answer with an added ``"_tier"`` key
        (``"qwen"`` or ``"ovh"``) naming the tier that produced it.
        """
        for tier_name, ex in (("qwen", self.qwen), ("ovh", self.ovh)):
            if ex is None:
                continue
            try:
                res = await ex.analyze(target_url, dom_snapshot, network_log)
            except Exception as e:
                # One broken tier must not prevent the next tier from running.
                logger.warning("LLM (%s) analysis raised: %s", tier_name, e)
                continue
            if self._is_usable(res):
                logger.info(
                    "LLM (%s) suggested strategy=%s", tier_name, res["strategy"],
                )
                res["_tier"] = tier_name
                return res
            logger.info("LLM (%s) returned no usable strategy", tier_name)
        return {}

    @classmethod
    def _is_usable(cls, res: Any) -> bool:
        """Tell whether *res* names a known strategy with a non-blank string value."""
        if not isinstance(res, dict):
            return False
        value = res.get("value")
        # Blank or non-string values cannot be applied as URL/selector/text,
        # so they must not stop the cascade from trying the next tier.
        return (
            res.get("strategy") in cls._USABLE_STRATEGIES
            and isinstance(value, str)
            and bool(value.strip())
        )
def _build_user_prompt(
target_url: str, dom_snapshot: str, network_log: list[dict],
) -> str:
# Truncate to keep prompt < ~16K chars
dom = dom_snapshot[:5000]
log_lines = []
for entry in network_log[:60]:
log_lines.append(
f"- {entry.get('status', '?')} "
f"{entry.get('content_type', '?')[:30]} "
f"{entry.get('size', '?')}B "
f"{entry.get('url', '')[:150]}"
)
log_text = "\n".join(log_lines)
return (
f"Ziel-URL: {target_url}\n\n"
f"=== DOM-Auszug (body.innerText, gekuerzt) ===\n{dom}\n\n"
f"=== Netzwerk-Log (JSON-Responses dieser Seite) ===\n{log_text}"
)
def _parse_json_response(content: str) -> dict:
"""LLMs sometimes wrap JSON in code-fences or add prose. Be lenient."""
if not content:
return {}
# Try direct parse
for candidate in (content, _strip_code_fence(content), _find_json_block(content)):
if not candidate:
continue
try:
obj = json.loads(candidate)
if isinstance(obj, dict):
return obj
except Exception:
continue
return {}
def _strip_code_fence(s: str) -> str:
s = s.strip()
if s.startswith("```"):
lines = s.split("\n")
return "\n".join(lines[1:-1]) if lines[-1].strip().startswith("```") else "\n".join(lines[1:])
return s
def _find_json_block(s: str) -> str:
start = s.find("{")
end = s.rfind("}")
if start >= 0 and end > start:
return s[start:end + 1]
return ""
# ── Valkey cache ────────────────────────────────────────────────────
async def cache_get(netloc: str) -> dict | None:
    """Look up the cached LLM hint stored for *netloc*.

    Returns the JSON-decoded cache entry, or ``None`` when no client is
    available, nothing is stored under the key, or the lookup/decoding
    fails (failures are logged at debug level only).
    """
    valkey = _valkey_client()
    if not valkey:
        return None
    try:
        payload = await valkey.get(_cache_key(netloc))
        if not payload:
            return None
        return json.loads(payload)
    except Exception as e:
        # The cache is an optimisation; a miss is always an acceptable answer.
        logger.debug("cache_get failed: %s", e)
        return None
async def cache_set(netloc: str, hint: dict, ttl: int = 604800) -> None:
    """Store *hint* as JSON under the key for *netloc*.

    ``ttl`` is the expiry in seconds (default 604800 = 7 days). Does nothing
    when no client is available; storage errors are logged at debug level
    and otherwise ignored.
    """
    valkey = _valkey_client()
    if valkey:
        try:
            key = _cache_key(netloc)
            serialized = json.dumps(hint)
            await valkey.set(key, serialized, ex=ttl)
        except Exception as e:
            # Failing to cache must never break the calling discovery run.
            logger.debug("cache_set failed: %s", e)
def _cache_key(netloc: str) -> str:
return f"cmp:hint:{netloc.lower()}"
# Process-wide client instance; stays None until the first successful init.
_valkey_singleton: Any = None
def _valkey_client() -> Any:
    """Return the shared redis.asyncio client, creating it on first use.

    The connection URL comes from ``VALKEY_URL`` (default
    ``redis://bp-core-valkey:6379``). Returns ``None`` when the client
    cannot be created (e.g. the ``redis`` package is not importable); the
    failure is logged at debug level and the next call tries again.
    """
    global _valkey_singleton
    if _valkey_singleton is None:
        try:
            import redis.asyncio as redis  # type: ignore[import-not-found]
            _valkey_singleton = redis.from_url(
                os.getenv("VALKEY_URL", "redis://bp-core-valkey:6379"),
                decode_responses=True,
                socket_connect_timeout=2.0,
            )
        except Exception as e:
            logger.debug("valkey client init failed: %s", e)
            return None
    return _valkey_singleton