e8ff75cbfe
5 Backlog-Items aus dem Multi-Site-Briefing in einem Sprint:
1. B13 B2C-Soft-Hints — Versicherungs/Tarif/Buchungs-Marker
_B2C_WEAK erweitert um "Reiseversicherung", "Tarifrechner",
"Online-Antrag", "Flug buchen", "Stromtarif" etc.
Fängt Allianz-Reise-Chatbot (vorher False-Negative).
2. Chatbot-Policy-Discovery (chatbot_policy_discovery.py)
Probt 14 Standard-Slugs (privacypolicychatbot, chatbot-datenschutz,
ai-policy, ki-datenschutz, ...) × 5 Lang-Prefixe auf jeder
submitted Origin. Successful >300-Wort-Findings werden in
doc_texts['dse'] gemerged. Audit-Trail über
doc_entries[dse].chatbot_policy_sources.
Hebt Westfield-iAdvize-Lücke.
3. API-Response-Payload erweitert
phase_f_persist.response um extra_findings, audit_walk und
html_blocks erweitert. B-Wiring-Output (B1, B3-B18) ist nicht
mehr nur im Mail-HTML versteckt — externe Aufrufer sehen jeden
Finding. Schema additiv, legacy clients ignorieren neue Felder.
4. Plausibility-LLM Empty-Response-Fix
Resilienz-Strategie A→B→C→D:
A) format='json' (strict, default)
B) format='' (loose, _try_extract_json mit ```json-fence + prose-
wrap-Unterstützung)
C) Split-Batch-Recursion (vorhanden)
D) Give up, leeres dict (callers behandeln als skipped)
Plus _post_llm() als isolierter LLM-Call-Helper, der
Network-Errors abfängt.
5. Specialist-Agents Phase 2 LLM (MVP) — Impressum-Agent
impressum_agent_llm.py: qwen3:30b-a3b mit § 5 TMG System-Prompt,
business_scope-hints aus profile_dict. Output identisches Schema
wie pattern-agent für ein Merge ohne API-Bruch.
_b18_wiring.py orchestriert beide Agents + dedupliziert nach
field_id, rendert lila V2-Block mit KB/LLM-Tags pro Finding.
Pattern-first im Dedup (deterministisch + stable).
Tests: 107/107 grün (7 Test-Suites + chatbot-discovery + b18).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
162 lines
5.1 KiB
Python
162 lines
5.1 KiB
Python
"""Discover separate chatbot-/AI-policy pages and merge them into the
|
||
main DSE text.
|
||
|
||
Many sites publish their chatbot data-protection notice on a separate
|
||
URL (e.g. westfield.com/germany/privacypolicychatbot) that the regular
|
||
auto-discovery misses because it doesn't classify as 'dse'. As a
|
||
result, B12/B15 (chatbot-cookie classification, AI-Act legal basis)
|
||
never see the iAdvize/Vertex provider names.
|
||
|
||
Strategy:
|
||
1. From the discovered URLs derive the base host.
|
||
2. Probe a fixed list of well-known chatbot-policy paths.
|
||
3. For each 2xx-response with > 300 words, merge the text into
|
||
state['doc_texts']['dse'] with a separator.
|
||
|
||
Best-effort: a probe failure NEVER aborts the check.
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import asyncio
|
||
import logging
|
||
import re
|
||
from urllib.parse import urlparse
|
||
|
||
import httpx
|
||
|
||
logger = logging.getLogger(__name__)


# Candidate path slugs for a separate chatbot / AI privacy notice,
# ordered from most common to least common.
_CHATBOT_POLICY_SLUGS = (
    "privacypolicychatbot",
    "chatbot-datenschutz", "chatbot/datenschutz",
    "datenschutz-chatbot", "datenschutz/chatbot",
    "ai-policy", "ai-datenschutz", "ki-datenschutz",
    "privacy-chatbot", "privacy-ai",
    "datenschutz-ki", "datenschutz-assistent",
    "chatbot-privacy", "ai-privacy",
)


# Language / locale path prefixes that are tried in front of every slug.
# The empty string means "directly below the origin".
_LANG_PREFIXES = ("", "/de", "/de_DE", "/en", "/germany")
|
||
|
||
|
||
def _build_candidate_urls(
    base_origin: str,
    *,
    lang_prefixes: tuple[str, ...] | None = None,
    slugs: tuple[str, ...] | None = None,
) -> list[str]:
    """Build all (language prefix x slug) candidate URLs for one origin.

    Args:
        base_origin: Origin such as ``"https://example.com"``; a trailing
            slash is tolerated.
        lang_prefixes: Path prefixes to try (``""`` means no prefix).
            Defaults to the module-level ``_LANG_PREFIXES``.
        slugs: Path slugs to try. Defaults to the module-level
            ``_CHATBOT_POLICY_SLUGS``.

    Returns:
        De-duplicated URLs in prefix-major order (all slugs for the first
        prefix, then all slugs for the second prefix, ...).
    """
    if lang_prefixes is None:
        lang_prefixes = _LANG_PREFIXES
    if slugs is None:
        slugs = _CHATBOT_POLICY_SLUGS

    # Normalise slashes per component instead of rewriting "//" globally:
    # a global replace also collapses the "://" after the scheme and can
    # only be repaired for schemes that are special-cased afterwards.
    root = base_origin.rstrip("/")

    urls: list[str] = []
    for prefix in lang_prefixes:
        prefix_part = prefix.strip("/")
        stem = f"{root}/{prefix_part}" if prefix_part else root
        for slug in slugs:
            urls.append(f"{stem}/{slug.lstrip('/')}")

    # dict.fromkeys drops duplicates while keeping first-seen order.
    return list(dict.fromkeys(urls))
|
||
|
||
|
||
async def _probe(url: str, timeout_s: float = 4.0) -> tuple[str, str] | None:
    """Fetch *url* and return ``(url, text)`` if it yields a substantial page.

    The response body is reduced to plain text: ``<script>`` / ``<style>``
    elements, HTML comments and all remaining tags are removed, character
    references are decoded and runs of whitespace are collapsed.

    Args:
        url: Absolute URL to request (redirects are followed).
        timeout_s: Timeout passed to ``httpx.AsyncClient``.

    Returns:
        ``(url, text)`` for a 2xx response whose plain text has at least
        300 whitespace-separated words, otherwise ``None``. The returned
        URL is the requested one, not the final URL after redirects.

    Any ``Exception`` raised while fetching or cleaning is logged at debug
    level and turned into ``None`` (probing is best-effort).
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    import html

    min_words = 300
    try:
        async with httpx.AsyncClient(
            timeout=timeout_s, follow_redirects=True,
        ) as client:
            response = await client.get(url)
            # Only success responses count; an unfollowed 3xx or a 1xx
            # carries no policy text.
            if not 200 <= response.status_code < 300:
                return None
            text = re.sub(r"<script.*?</script>", " ",
                          response.text, flags=re.S | re.I)
            text = re.sub(r"<style.*?</style>", " ",
                          text, flags=re.S | re.I)
            # Comments may contain ">" and would otherwise leak through the
            # generic tag pattern below.
            text = re.sub(r"<!--.*?-->", " ", text, flags=re.S)
            text = re.sub(r"<[^>]+>", " ", text)
            # Decode entities only after tag removal so that an escaped
            # "&lt;b&gt;" is kept as literal text and not stripped as a tag.
            text = html.unescape(text)
            text = re.sub(r"\s+", " ", text).strip()
            if len(text.split()) < min_words:
                return None
            return url, text
    except Exception as exc:
        logger.debug("chatbot-policy probe failed for %s: %r", url, exc)
        return None
|
||
|
||
|
||
def _base_origins(doc_entries: list[dict]) -> list[str]:
    """Collect the distinct HTTP(S) origins of the given document entries.

    Args:
        doc_entries: Entries whose ``"url"`` key (if present) holds an
            absolute URL string.

    Returns:
        ``"scheme://netloc"`` strings in first-seen order, with the netloc
        lower-cased so that hosts differing only in case count as one
        origin. Entries that are not dicts, have no usable string URL, are
        relative, cannot be parsed, or use a scheme other than http/https
        are skipped.
    """
    origins: dict[str, None] = {}
    for entry in doc_entries or ():
        if not isinstance(entry, dict):
            continue
        raw = entry.get("url")
        if not isinstance(raw, str):
            continue
        raw = raw.strip()
        if not raw:
            continue
        try:
            parts = urlparse(raw)
        except ValueError:
            # urlparse rejects e.g. malformed IPv6 literals.
            continue
        # urlparse already lower-cases the scheme.
        if parts.scheme not in ("http", "https") or not parts.netloc:
            continue
        # Re-assigning an existing key keeps its original position, so the
        # dict doubles as an ordered set.
        origins[f"{parts.scheme}://{parts.netloc.lower()}"] = None
    return list(origins)
|
||
|
||
|
||
async def enrich_dse_with_chatbot_policies(
    state: dict,
    *,
    max_origins: int = 2,
    max_concurrency: int = 20,
) -> dict:
    """Probe known chatbot-policy paths and merge hits into the DSE text.

    Every candidate URL of the first ``max_origins`` origins found in
    ``state["doc_entries"]`` is probed. Each distinct page text that is
    returned is appended to ``state["doc_texts"]["dse"]`` behind a
    separator line naming its source URL. The first entry with
    ``doc_type == "dse"`` additionally receives the merged text under
    ``"text"`` and the source URLs under ``"chatbot_policy_sources"``.

    Args:
        state: Check state; read keys are ``"doc_entries"`` and
            ``"doc_texts"``. Mutated in place when something is merged.
        max_origins: Upper bound on the number of origins that are probed.
        max_concurrency: Upper bound on probes in flight at the same time.

    Returns:
        ``{"probed": int, "found": list[str], "merged_chars": int}`` where
        ``found`` lists the URLs whose text was actually merged.
    """
    doc_entries = state.get("doc_entries") or []
    if not doc_entries:
        return {"probed": 0, "found": [], "merged_chars": 0}

    origins = _base_origins(doc_entries)
    if not origins:
        return {"probed": 0, "found": [], "merged_chars": 0}

    # Probe the complete prefix x slug matrix of each origin. Truncating
    # the per-origin list would cut off whole language prefixes, because
    # the candidates are generated prefix by prefix.
    candidates: list[str] = []
    for origin in origins[:max(0, max_origins)]:
        candidates.extend(_build_candidate_urls(origin))

    if not candidates:
        return {"probed": 0, "found": [], "merged_chars": 0}

    # Bound the fan-out so one check does not open every connection at once.
    gate = asyncio.Semaphore(max(1, max_concurrency))

    async def _bounded_probe(candidate: str):
        async with gate:
            return await _probe(candidate)

    results = await asyncio.gather(
        *(_bounded_probe(u) for u in candidates),
        return_exceptions=True,
    )
    found = [r for r in results if isinstance(r, tuple) and r]

    if not found:
        return {"probed": len(candidates), "found": [], "merged_chars": 0}

    doc_texts = state.get("doc_texts")
    if doc_texts is None:
        doc_texts = state["doc_texts"] = {}
    original_dse = doc_texts.get("dse") or ""

    pieces: list[str] = [original_dse]
    merged_bodies: set[str] = set()
    appended_urls: list[str] = []
    appended_chars = 0
    for url, text in found:
        # Several candidate URLs can serve the very same body (redirects to
        # one page, generic "not found" pages answered with 200); merge each
        # body once, and never re-merge text the DSE already contains.
        if text in merged_bodies or text in original_dse:
            continue
        merged_bodies.add(text)
        sep = (
            f"\n\n--- ergänzt aus {url} (chatbot-policy-discovery) ---\n\n"
        )
        pieces.append(sep + text)
        appended_chars += len(text)
        appended_urls.append(url)

    if not appended_urls:
        return {"probed": len(candidates), "found": [], "merged_chars": 0}

    dse_text = "".join(pieces)
    doc_texts["dse"] = dse_text

    # Also record on the first dse entry (audit trail).
    for entry in doc_entries:
        if entry.get("doc_type") == "dse":
            entry["chatbot_policy_sources"] = appended_urls
            entry["text"] = dse_text
            break

    logger.info(
        "chatbot-policy enrichment: %d candidate(s) probed, %d found, "
        "+%d chars merged into DSE",
        len(candidates), len(appended_urls), appended_chars,
    )
    return {
        "probed": len(candidates),
        "found": appended_urls,
        "merged_chars": appended_chars,
    }
|