"""Multi-Version-DSE-Analyse. Wenn Auto-Discovery + Legacy-URL-Discovery mehrere DSE-URLs auf der gleichen Domain finden, vergleichen wir Key-Felder pro Variante: - Stand-/Versionsdatum (sichtbar?) - DSB-Name (Mollstraße vs Proliance vs …) - Wortzahl (deutlich kürzere Version = veraltet?) - SHA-256-Hash (für Audit-Trail) Output: HTML-Block mit Vergleichstabelle + roter Hinweis "Nur eine Version kann gültig sein". Nicht-destruktiv: wir entscheiden NICHT welche Variante richtig ist — wir präsentieren beide nebeneinander. Performance: cap auf max 3 zusätzliche DSE-URLs (Sitemap kann Hunderte liefern, das würde 3min+ kosten). """ from __future__ import annotations import hashlib import logging import re from html import escape as h from urllib.parse import urlparse import httpx logger = logging.getLogger(__name__) _DSB_PATTERNS = ( r"datenschutzbeauftragt\w*[\s\S]{0,200}?" r"((?:[A-ZÄÖÜ][\w\-]{2,40}\s+){1,4}" r"(?:GmbH|AG|GbR|Mollstr|Stra(?:ße|sse|sse)|str\.))", r"(proliance\s+gmbh)", r"(datenschutzexperte\.de)", ) _DATE_PATTERN = re.compile( r"(?:stand|letzte\s+aktualisierung|version|effective)[:.]?\s*" r"(\d{4}[-./]\d{1,2}(?:[-./]\d{1,2})?|" r"(?:januar|februar|m(?:ae|ä)rz|april|mai|juni|juli|august|" r"september|oktober|november|dezember)\s+\d{4}|" r"\d{1,2}[./]\d{4})", re.IGNORECASE, ) async def _fetch_text(url: str) -> tuple[str, int]: try: async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as c: r = await c.get(url) if r.status_code != 200: return "", r.status_code text = re.sub(r"", " ", r.text, flags=re.S | re.I) text = re.sub(r"", " ", text, flags=re.S | re.I) text = re.sub(r"<[^>]+>", " ", text) text = re.sub(r"\s+", " ", text).strip() return text, 200 except Exception as e: logger.info("fetch failed for %s: %s", url, e) return "", 0 def _extract_dsb(text: str) -> str: if not text: return "" for pat in _DSB_PATTERNS: m = re.search(pat, text, re.IGNORECASE) if m: return (m.group(1) if m.lastindex else m.group(0))[:120].strip() return "" def _extract_date(text: str) -> str: if not text: return "" m = _DATE_PATTERN.search(text) return (m.group(1) if m else "")[:40].strip() async def analyze_multiple_dse_versions(state: dict) -> dict: """If ≥2 DSE-like URLs are reachable on the same domain, fetch each and produce a comparison table.""" doc_entries = state.get("doc_entries") or [] legacy = (state.get("legacy_url_inventory") or {}).get("candidates") or [] # Collect DSE-candidate URLs from doc_entries + legacy-inventory candidates: list[str] = [] seen: set[str] = set() for e in doc_entries: if (e.get("doc_type") or "") != "dse": continue url = (e.get("url") or "").strip() if url and url not in seen: candidates.append(url) seen.add(url) for c in legacy: url = (c.get("url") or "").strip() if not url or url in seen: continue # Only DSE-ish URLs url_lc = url.lower() if any(k in url_lc for k in ( "datenschutz", "privacy", "datenschutzerk", )): if c.get("status") == 200: candidates.append(url) seen.add(url) if len(candidates) < 2: return {"versions": [], "skipped": "single_version_or_none"} # Cap to 3 for performance candidates = candidates[:3] versions: list[dict] = [] for url in candidates: text, status = await _fetch_text(url) if not text: continue versions.append({ "url": url, "status": status, "word_count": len(text.split()), "sha256": hashlib.sha256(text.encode("utf-8")).hexdigest()[:16], "date_found": _extract_date(text) or "kein Datum", "dsb_found": _extract_dsb(text) or "—", }) if len(versions) < 2: return {"versions": versions, "skipped": "only_one_fetched"} # Detect contradictions dates = {v["date_found"] for v in versions if v["date_found"] != "kein Datum"} dsbs = {v["dsb_found"] for v in versions if v["dsb_found"] != "—"} return { "versions": versions, "date_divergent": len(dates) > 1, "dsb_divergent": len(dsbs) > 1, "no_date_count": sum( 1 for v in versions if v["date_found"] == "kein Datum" ), } def render_multi_version_block(info: dict) -> str: versions = info.get("versions") or [] if len(versions) < 2: return "" rows = [] for v in versions: rows.append( f"" f"" f"" f"{h(v['url'][:90])}" f"" f"{v['word_count']:,}" f"{h(v['sha256'])}…" f"" f"{h(v['date_found'])}" f"" f"{h(v['dsb_found'])}" f"" ) warnings = [] if info.get("date_divergent"): warnings.append("verschiedene Datumsangaben") if info.get("dsb_divergent"): warnings.append("verschiedene DSB benannt") if info.get("no_date_count"): warnings.append( f"{info['no_date_count']} von {len(versions)} ohne Datum" ) warn_html = "" if warnings: warn_html = ( "

" "Erkannte Inkonsistenzen: " + " · ".join(warnings) + "

" ) return ( "
" f"

" f"📑 Mehrere DSE-Versionen erkannt ({len(versions)})" "

" "

" "Auf deiner Domain sind mehrere DSE-URLs öffentlich reachable. " "Nur eine Version kann rechtsverbindlich gültig sein. " "Wir prüfen jede unabhängig — der Kunde wählt das gültige " "Ergebnis und sorgt dafür, dass die andere Variante " "301-Redirect oder offline wird." "

" + warn_html + "" "" "" "" "" "" "" "" + "".join(rows) + "
URLWörterSHA-256DatumDSB benannt
" "
" )