Files
breakpilot-compliance/backend-compliance/compliance/services/vendor_llm_extractor.py
T
Benjamin Admin cf6005a47c
CI / guardrail-integrity (push) Has been skipped
CI / detect-changes (push) Successful in 11s
CI / branch-name (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / loc-budget (push) Failing after 16s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
perf(audit): vendor_llm_extractor + mc_solution_generator nutzen P31 LLM-Cascade
Beide rufen jetzt llm_cascade.call_with_cascade() statt direkter Qwen/OVH-
Aufrufe. Damit:
* Cache-Hit auf identische Eingaben (Valkey, 7d TTL) → ~50ms statt
  4-6min beim Re-Run derselben Cookie-Doc.
* Tiered Cascade automatisch: Qwen → OVH 120B → Anthropic Claude Haiku
  wenn lower-tier under confidence-threshold.
* Confidence-Scoring (JSON-parse + items_per_input_size) entscheidet ob
  weiter delegiert wird.

Fallback auf alte _call_ollama/_call_ovh bleibt bestehen wenn der
Cascade-Aufruf scheitert.

Erwartete Wirkung beim 2. VW-Lauf: ~10min statt ~25min (Cache-Hit auf
identische Cookie-Doc + MC-Solutions).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-22 09:40:11 +02:00

234 lines
8.1 KiB
Python

"""
LLM-based vendor extraction (V3 fallback).
When the cookie-policy text does not come from a known CMP (so we have no
structured JSON payload) we ask Qwen (local Ollama) → OVH (managed 120B)
to extract a vendor list as JSON. Output is then mapped to the same
VendorRecord schema used by vendor_extractor.py — so the rest of the
pipeline (URL probing, scoring, VVT table) works unchanged.
This bridges the long tail of cookie-policy implementations where the
content sits in DOM accordions rather than a CMP JSON endpoint.
"""
from __future__ import annotations
import json
import logging
import os
import re
from typing import Optional
import httpx
logger = logging.getLogger(__name__)
_SYSTEM_PROMPT = (
"Du bist ein Compliance-Tester. Extrahiere aus einer deutschen "
"Cookie-Richtlinie alle erwaehnten Drittanbieter (Dienste, Vendors, "
"Cookie-Provider).\n\n"
"Gib NUR ein JSON-Objekt zurueck:\n"
'{"vendors": [\n'
' {"name": "<Firmenname>", "country": "<DE|US|IE|...|>", '
'"purpose": "<Kurz>", "category": "<marketing|analytics|functional|necessary>", '
'"opt_out_url": "<vollstaendige URL oder \\"\\">", '
'"privacy_policy_url": "<vollstaendige URL oder \\"\\">", '
'"persistence": "<Speicherdauer in Worten>", '
'"cookies": [{"name": "<Cookie-Name>", "purpose": "<Kurz>", '
'"expiry": "<Dauer>", "is_third_party": true}]\n'
' }\n'
"]}\n\n"
"Regeln:\n"
"- Wenn ein Feld nicht im Text steht: leerer String oder leere Liste.\n"
"- KEINE Anbieter erfinden oder halluzinieren.\n"
"- Max 80 Anbieter, max 30 Cookies pro Anbieter.\n"
"- Nur reines JSON, keine Prosa, keine Code-Fences."
)
async def extract_vendors_via_llm(
cookie_text: str,
max_text_chars: int = 50000,
) -> list[dict]:
"""Run the Qwen → OVH cascade. Returns vendor records (possibly empty).
max_text_chars: VW-Cookie-Richtlinie hat ~60k chars mit ~100 Cookies in
der Tabelle. Bei 12k waren wir auf die ersten ~5 Cookies begrenzt und
haben nur 1 Vendor extrahiert. 50k deckt VW/BMW/Mercedes komplett ab
und passt in Qwen3-30b-a3b (128k Context) sowie OVH 120B.
"""
if not cookie_text or len(cookie_text) < 500:
return []
excerpt = cookie_text[:max_text_chars]
user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}"
# P31: nutze tiered LLM-Cascade mit Cache (Qwen → OVH → Anthropic).
# Re-Runs derselben Cookie-Doc landen im Valkey-Cache (7d TTL) und
# gehen in ~50ms statt 4-6min durch. Erstaufruf bleibt 4-6min lokal
# bzw ~2min auf OVH.
try:
from compliance.services.llm_cascade import call_with_cascade
res = await call_with_cascade(
system=_SYSTEM_PROMPT, user=user_prompt,
min_confidence=0.6, max_tokens=16000,
)
vendors = _parse_vendor_list(res.get("text", ""))
if vendors:
logger.info(
"LLM vendor extraction (cascade %s, conf=%.2f, cached=%s): %d vendors",
res.get("source"), res.get("confidence", 0),
res.get("cached"), len(vendors),
)
return vendors
except Exception as e:
logger.warning("Cascade extract failed, fallback to direct Qwen: %s", e)
# Fallback: alte direkte Logik
content = await _call_ollama(user_prompt)
vendors = _parse_vendor_list(content)
if vendors:
return vendors
content = await _call_ovh(user_prompt)
return _parse_vendor_list(content)
async def _call_ollama(user_prompt: str) -> str:
base = os.getenv("OLLAMA_URL", "http://host.docker.internal:11434")
model = os.getenv("CMP_LLM_MODEL", os.getenv("OLLAMA_MODEL", "qwen3:30b-a3b"))
payload = {
"model": model,
"messages": [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
],
"stream": False, "format": "json",
# 16k tokens fuer ~80 Vendors mit je 30 Cookies. War vorher 6k →
# output wurde mittendrin abgeschnitten, JSON unparseable → 0 Vendors.
"options": {"temperature": 0.05, "num_predict": 16000},
}
try:
# Qwen 30b braucht fuer 16k output ~4-6min auf M4 Pro.
async with httpx.AsyncClient(timeout=420.0) as client:
resp = await client.post(f"{base.rstrip('/')}/api/chat", json=payload)
resp.raise_for_status()
return (resp.json().get("message") or {}).get("content", "")
except Exception as e:
logger.warning("Qwen vendor-extract failed: %s", e)
return ""
async def _call_ovh(user_prompt: str) -> str:
base = os.getenv("OVH_LLM_URL", "").strip()
key = os.getenv("OVH_LLM_KEY", "").strip()
model = os.getenv("OVH_LLM_MODEL", "").strip()
if not base or not model:
return ""
headers = {"Content-Type": "application/json"}
if key:
headers["Authorization"] = f"Bearer {key}"
payload = {
"model": model,
"messages": [
{"role": "system", "content": _SYSTEM_PROMPT},
{"role": "user", "content": user_prompt},
],
"temperature": 0.05, "max_tokens": 16000,
"response_format": {"type": "json_object"},
}
try:
async with httpx.AsyncClient(timeout=90.0) as client:
resp = await client.post(
f"{base.rstrip('/')}/v1/chat/completions",
json=payload, headers=headers,
)
resp.raise_for_status()
choice = (resp.json().get("choices") or [{}])[0]
return (choice.get("message") or {}).get("content", "") or ""
except Exception as e:
logger.warning("OVH vendor-extract failed: %s", e)
return ""
def _parse_vendor_list(content: str) -> list[dict]:
"""Be lenient about JSON wrappers / code-fences."""
if not content:
return []
for candidate in (content, _strip_fence(content), _grab_json(content)):
if not candidate:
continue
try:
obj = json.loads(candidate)
except Exception:
continue
if isinstance(obj, dict):
vendors = obj.get("vendors") or obj.get("Vendors")
if isinstance(vendors, list):
return _normalize(vendors)
if isinstance(obj, list):
return _normalize(obj)
return []
def _normalize(items: list) -> list[dict]:
out: list[dict] = []
for item in items[:80]:
if not isinstance(item, dict):
continue
name = (item.get("name") or "").strip()
if not name:
continue
cookies_raw = item.get("cookies") or []
cookies: list[dict] = []
for c in cookies_raw[:30]:
if not isinstance(c, dict):
continue
cookies.append({
"name": (c.get("name") or "").strip(),
"purpose": (c.get("purpose") or "").strip(),
"expiry": (c.get("expiry") or "").strip(),
"is_third_party": bool(c.get("is_third_party", True)),
})
out.append({
"name": name,
"country": (item.get("country") or "").strip()[:4],
"purpose": (item.get("purpose") or "").strip()[:500],
"category": (item.get("category") or "").strip(),
"opt_out_url": _safe_url(item.get("opt_out_url")),
"privacy_policy_url": _safe_url(item.get("privacy_policy_url")),
"persistence": (item.get("persistence") or "").strip()[:200],
"cookies": cookies,
})
return out
def _safe_url(value: Optional[str]) -> str:
if not value or not isinstance(value, str):
return ""
v = value.strip()
if v.startswith(("http://", "https://")):
return v[:500]
return ""
def _strip_fence(s: str) -> str:
s = s.strip()
if s.startswith("```"):
lines = s.split("\n")
return "\n".join(lines[1:-1]) if lines[-1].strip().startswith("```") else "\n".join(lines[1:])
return s
def _grab_json(s: str) -> str:
a, b = s.find("{"), s.rfind("}")
if 0 <= a < b:
return s[a:b + 1]
a, b = s.find("["), s.rfind("]")
if 0 <= a < b:
return s[a:b + 1]
return ""
# Defensive import to make optional dependency obvious
_ = re # pragma: no cover