fix(scan): TCF SQL column + cascade diagnose-logs
VW-Scan-Befunde aus 0a8aa16e:
1. TCF-Lookup schlug 5x fehl mit: column 'source' does not exist. Korrekt ist 'source_name' (siehe DELETE-Query in derselben Datei). Mit dem Fix funktioniert die TCF-Cross-Reference für alle Vendors statt für 0.
2. Cascade tier-1 fail loggte eine leere Message — jetzt mit type+model+base.
3. Cascade collapse (tier 2+3 unconfigured) wird beim ersten Aufruf geloggt, damit der Operator den ENV-Mangel sofort sieht.
4. vendor_llm_extractor loggt jetzt START + 0-vendor-Return (vorher silent skip — sah aus, als wäre er nie aufgerufen worden).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -119,7 +119,12 @@ async def _call_ollama(system: str, user: str,
|
|||||||
r.raise_for_status()
|
r.raise_for_status()
|
||||||
return (r.json().get("message") or {}).get("content", "") or ""
|
return (r.json().get("message") or {}).get("content", "") or ""
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("ollama cascade tier 1 failed: %s", e)
|
# P83-followup: explizit type+message loggen damit empty-message
|
||||||
|
# exceptions (z.B. ReadTimeout) diagnostizierbar sind.
|
||||||
|
logger.warning(
|
||||||
|
"ollama cascade tier 1 failed: %s (%s) model=%s base=%s",
|
||||||
|
str(e) or "(no message)", type(e).__name__, model, base,
|
||||||
|
)
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
@@ -190,9 +195,21 @@ async def call_with_cascade(
|
|||||||
cached = _cache_get(key)
|
cached = _cache_get(key)
|
||||||
if cached:
|
if cached:
|
||||||
cached["cached"] = True
|
cached["cached"] = True
|
||||||
|
logger.info("cascade cache HIT key=%s len=%d", key[-12:],
|
||||||
|
len(cached.get("text", "")))
|
||||||
return cached
|
return cached
|
||||||
|
|
||||||
input_len = len(user)
|
input_len = len(user)
|
||||||
|
# Pre-flight: warn if Tier 2 + Tier 3 are unconfigured so user knows
|
||||||
|
# we are de-facto running single-tier (cascade collapse).
|
||||||
|
if not (os.getenv("OVH_LLM_URL", "").strip()
|
||||||
|
and os.getenv("OVH_LLM_MODEL", "").strip()):
|
||||||
|
if not os.getenv("ANTHROPIC_API_KEY", "").strip():
|
||||||
|
logger.warning(
|
||||||
|
"cascade: Tier 2 (OVH) AND Tier 3 (Anthropic) unconfigured — "
|
||||||
|
"running on Tier 1 (Qwen) only. Set OVH_LLM_URL/MODEL/KEY "
|
||||||
|
"or ANTHROPIC_API_KEY to enable fallbacks."
|
||||||
|
)
|
||||||
# Tier 1: Qwen lokal
|
# Tier 1: Qwen lokal
|
||||||
text = await _call_ollama(system, user, max_tokens=max_tokens)
|
text = await _call_ollama(system, user, max_tokens=max_tokens)
|
||||||
conf = _heuristic_confidence(text, input_len)
|
conf = _heuristic_confidence(text, input_len)
|
||||||
|
|||||||
@@ -146,7 +146,7 @@ def lookup_tcf_authority(
|
|||||||
"""
|
"""
|
||||||
SELECT cookie_name, actual_category, vendor_name
|
SELECT cookie_name, actual_category, vendor_name
|
||||||
FROM compliance.cookie_library
|
FROM compliance.cookie_library
|
||||||
WHERE source = 'iab_tcf_v2'
|
WHERE source_name = 'iab_tcf_v2'
|
||||||
AND LOWER(vendor_name) LIKE :pat
|
AND LOWER(vendor_name) LIKE :pat
|
||||||
LIMIT 5
|
LIMIT 5
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -59,9 +59,17 @@ async def extract_vendors_via_llm(
|
|||||||
und passt in Qwen3-30b-a3b (128k Context) sowie OVH 120B.
|
und passt in Qwen3-30b-a3b (128k Context) sowie OVH 120B.
|
||||||
"""
|
"""
|
||||||
if not cookie_text or len(cookie_text) < 500:
|
if not cookie_text or len(cookie_text) < 500:
|
||||||
|
logger.info(
|
||||||
|
"LLM vendor extraction SKIP: cookie_text too short (%d chars, need >=500)",
|
||||||
|
len(cookie_text or ""),
|
||||||
|
)
|
||||||
return []
|
return []
|
||||||
excerpt = cookie_text[:max_text_chars]
|
excerpt = cookie_text[:max_text_chars]
|
||||||
user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}"
|
user_prompt = f"Cookie-Richtlinie-Text:\n\n{excerpt}"
|
||||||
|
logger.info(
|
||||||
|
"LLM vendor extraction START: input=%d chars (excerpt %d), calling cascade",
|
||||||
|
len(cookie_text), len(excerpt),
|
||||||
|
)
|
||||||
|
|
||||||
# P31: nutze tiered LLM-Cascade mit Cache (Qwen → OVH → Anthropic).
|
# P31: nutze tiered LLM-Cascade mit Cache (Qwen → OVH → Anthropic).
|
||||||
# Re-Runs derselben Cookie-Doc landen im Valkey-Cache (7d TTL) und
|
# Re-Runs derselben Cookie-Doc landen im Valkey-Cache (7d TTL) und
|
||||||
@@ -76,13 +84,24 @@ async def extract_vendors_via_llm(
|
|||||||
vendors = _parse_vendor_list(res.get("text", ""))
|
vendors = _parse_vendor_list(res.get("text", ""))
|
||||||
if vendors:
|
if vendors:
|
||||||
logger.info(
|
logger.info(
|
||||||
"LLM vendor extraction (cascade %s, conf=%.2f, cached=%s): %d vendors",
|
"LLM vendor extraction OK (cascade %s, conf=%.2f, cached=%s): %d vendors",
|
||||||
res.get("source"), res.get("confidence", 0),
|
res.get("source"), res.get("confidence", 0),
|
||||||
res.get("cached"), len(vendors),
|
res.get("cached"), len(vendors),
|
||||||
)
|
)
|
||||||
return vendors
|
return vendors
|
||||||
|
# Silent failure ist unbrauchbar fuer Diagnose: log explicit dass
|
||||||
|
# cascade returnte aber 0 Vendors (parse failed oder LLM gab nix).
|
||||||
|
logger.warning(
|
||||||
|
"LLM vendor extraction returned 0 vendors (cascade source=%s "
|
||||||
|
"text_len=%d below_threshold=%s) — fallback to direct calls",
|
||||||
|
res.get("source"), len(res.get("text", "") or ""),
|
||||||
|
res.get("below_threshold"),
|
||||||
|
)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning("Cascade extract failed, fallback to direct Qwen: %s", e)
|
logger.warning(
|
||||||
|
"Cascade extract failed, fallback to direct Qwen: %s (%s)",
|
||||||
|
str(e) or "(no message)", type(e).__name__,
|
||||||
|
)
|
||||||
|
|
||||||
# Fallback: alte direkte Logik
|
# Fallback: alte direkte Logik
|
||||||
content = await _call_ollama(user_prompt)
|
content = await _call_ollama(user_prompt)
|
||||||
|
|||||||
Reference in New Issue
Block a user