"""Auto-discovery of missing canonical doc-types.

For each canonical type the user did NOT submit, try to find it on the
homepage of the URLs they DID submit. Also follow same-owner subdomains
mentioned in the submitted text (BMW Group → bmwgroup.com etc.).

Discovered docs are classified by `_classify_discovered_doc` and merged
back into `doc_entries`; entries that stayed empty get
`discovery_attempted=True` so the padding step can differentiate
"Nicht eingereicht" from "Auf der Website nicht gefunden".
"""

from __future__ import annotations

import logging
import re
from urllib.parse import urlparse

import httpx

from ._constants import _ALL_DOC_TYPES, CONSENT_TESTER_URL
from ._helpers import _classify_discovered_doc, _update

logger = logging.getLogger(__name__)

# An entry whose stripped text is shorter than this counts as "not submitted".
_MIN_USEFUL_CHARS = 200
# Discovered documents / filled texts below this word count are ignored.
_MIN_DOC_WORDS = 100
# Upper bound on the number of site roots that are crawled (primary included).
_MAX_CRAWL_BASES = 3
# Timeout for one discovery request; raised from 180s to 300s (P90).
_DISCOVERY_TIMEOUT_S = 300.0


def _strip_www(netloc: str) -> str:
    """Return *netloc* lowercased and without a leading ``www.`` label.

    ``str.lstrip("www.")`` must not be used for this: it strips any leading
    run of the characters ``w`` and ``.``, turning ``web.de`` into ``eb.de``.
    """
    host = netloc.lower()
    return host[4:] if host.startswith("www.") else host


def _related_crawl_bases(primary_base: str, doc_entries: list[dict]) -> list[str]:
    """Build the crawl plan: the primary base plus related owner domains.

    Scans the submitted texts for URLs whose host label starts with the first
    label of the primary host (``bmw`` for ``www.bmw.de``), e.g. a BMW Group
    text that mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de.
    At most ``_MAX_CRAWL_BASES`` bases are returned, primary base first.
    """
    crawl_bases: list[str] = [primary_base]
    primary_netloc = _strip_www(urlparse(primary_base).netloc)
    owner_token = primary_netloc.split(".")[0]  # 'bmw'
    # Very short tokens would match far too many unrelated domains.
    if not owner_token or len(owner_token) < 3:
        return crawl_bases

    # NOTE(review): the pattern ends one label after the owner label, so a
    # host such as 'bmwgroup.co.uk' is captured as 'bmwgroup.co' — confirm
    # whether multi-label suffixes need to be supported.
    domain_re = re.compile(
        r"https?://([a-z0-9][a-z0-9\-]*\.)*"
        + re.escape(owner_token)
        + r"[a-z0-9\-]*\.[a-z]{2,}",
        re.IGNORECASE,
    )
    seen_bases = {primary_base}
    for entry in doc_entries:
        for match in domain_re.finditer(entry.get("text") or ""):
            parsed = urlparse(match.group(0))
            base = f"{parsed.scheme}://{parsed.netloc}/"
            # Same host as the primary base (modulo 'www.') adds nothing new.
            if _strip_www(parsed.netloc) == primary_netloc or base in seen_bases:
                continue
            seen_bases.add(base)
            crawl_bases.append(base)
            if len(crawl_bases) >= _MAX_CRAWL_BASES:
                return crawl_bases
    return crawl_bases


async def _autodiscover_missing(
    check_id: str,
    doc_entries: list[dict],
    doc_texts: dict[str, str],
    url_text_cache: dict[str, str],
) -> None:
    """Try to find every canonical doc_type the user did not submit.

    The documents are looked up via the ``/dsi-discovery`` endpoint of
    ``CONSENT_TESTER_URL``, called for the most common base of the submitted
    URLs and for up to two related owner domains mentioned in the submitted
    texts.

    Modifies ``doc_entries`` in place: fills text/url/word_count and sets
    ``auto_discovered=True`` on entries that could be filled, and stores the
    filled text in ``doc_texts`` under the canonical doc_type. Marks
    ``discovery_attempted=True`` on every missing entry (even when nothing
    was found) so the report can distinguish 'Nicht eingereicht' from
    'Auf der Website nicht gefunden'. When no URL was submitted at all,
    empty placeholders with ``discovery_attempted=False`` are appended
    instead and no request is made.

    Args:
        check_id: Identifier passed through to ``_update`` for progress
            reporting.
        doc_entries: Submitted document entries (dicts with at least
            ``doc_type``; ``url`` and ``text`` are optional). Mutated.
        doc_texts: Mapping doc_type -> text. Mutated for filled types.
        url_text_cache: Not used by this function; kept for interface
            compatibility.
    """
    # VW fix: only doc types with a substantial text yield count as
    # 'submitted'. If the user entered a URL that returns a 404 (VW
    # cookie-richtlinie.html), or the crawler extracted fewer than 200
    # characters (SPA shell), treat the type as 'missing' so the discovery
    # pass tries alternative URLs.
    submitted_types = {
        e["doc_type"]
        for e in doc_entries
        if len((e.get("text") or "").strip()) >= _MIN_USEFUL_CHARS
    }
    # URLs the user submitted that yielded no useful text. Used below to
    # record the original URL as 'rejected_url' once a replacement is found.
    failed_urls: set[str] = {
        (e.get("url") or "").strip()
        for e in doc_entries
        if (e.get("url") or "").strip()
        and len((e.get("text") or "").strip()) < _MIN_USEFUL_CHARS
    }
    if failed_urls:
        logger.info(
            "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery "
            "soll Alternativen probieren: %s",
            len(failed_urls),
            _MIN_USEFUL_CHARS,
            # Sorted so the logged preview is stable across runs.
            ", ".join(sorted(failed_urls)[:3]),
        )

    # Map alias types to canonical.
    submitted_canon = {
        "dse" if t in ("datenschutz", "privacy") else t for t in submitted_types
    }
    # Missing = canonical types the user did NOT submit.
    missing = set(_ALL_DOC_TYPES) - submitted_canon
    if not missing:
        return
    # Fixed iteration order keeps appended entries and logs deterministic.
    missing_ordered = sorted(missing)

    # Pick the most common base (scheme://netloc) from submitted URLs.
    bases: dict[str, int] = {}
    for e in doc_entries:
        u = (e.get("url") or "").strip()
        if u and "://" in u:
            p = urlparse(u)
            base = f"{p.scheme}://{p.netloc}"
            bases[base] = bases.get(base, 0) + 1

    if not bases:
        # No submitted URL at all — nothing to crawl from. Add empty
        # placeholders (with discovery_attempted=False) so the padding
        # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
        for dt in missing_ordered:
            doc_entries.append({
                "doc_type": dt,
                "url": "",
                "text": "",
                "word_count": 0,
                "auto_discovered": False,
                "discovery_attempted": False,
            })
        return

    primary_base = max(bases, key=bases.get) + "/"
    crawl_bases = _related_crawl_bases(primary_base, doc_entries)

    netlocs = ", ".join(urlparse(b).netloc for b in crawl_bases)
    _update(check_id, f"Suche fehlende Dokumente auf {netlocs}...", 18)

    # Each discovered document is kept together with the base it came from,
    # so later log lines name the real origin.
    discovered: list[tuple[str, dict]] = []
    disc_payloads: list[dict] = []
    disc_cookie_texts: list[str] = []
    for base in crawl_bases:
        try:
            async with httpx.AsyncClient(timeout=_DISCOVERY_TIMEOUT_S) as client:
                resp = await client.post(
                    f"{CONSENT_TESTER_URL}/dsi-discovery",
                    json={"url": base, "max_documents": 15},
                    timeout=_DISCOVERY_TIMEOUT_S,
                )
                if resp.status_code != 200:
                    logger.warning(
                        "auto-discovery: HTTP %d for %s", resp.status_code, base
                    )
                    continue
                body = resp.json()
                docs = body.get("documents", []) or []
                payloads = body.get("cmp_payloads") or []
                discovered.extend((base, doc) for doc in docs)
                disc_payloads.extend(payloads)
                cmp_text = body.get("cmp_cookie_text") or ""
                if cmp_text:
                    disc_cookie_texts.append(cmp_text)
                logger.info(
                    "auto-discovery on %s: %d docs, %d CMP payloads, "
                    "cmp_cookie_text=%d words",
                    base,
                    len(docs),
                    len(payloads),
                    len(cmp_text.split()),
                )
        except Exception as e:
            # Best effort: a failing base must not abort the other bases.
            # Verbose exception output (P90) for diagnosis, since str(e) can
            # be empty.
            logger.warning(
                "auto-discovery failed for %s: %s (%s)",
                base,
                str(e) or "(empty)",
                type(e).__name__,
            )

    # Classify each discovered doc into a canonical doc_type; the first
    # sufficiently long match per type wins.
    by_type: dict[str, tuple[str, dict]] = {}
    for source_base, d in discovered:
        if (d.get("word_count") or 0) < _MIN_DOC_WORDS:
            continue
        title = (d.get("title") or "").lower()
        url = (d.get("url") or "").lower()
        canon = _classify_discovered_doc(title, url)
        if canon and canon in missing and canon not in by_type:
            by_type[canon] = (source_base, d)

    # Append/update an entry for every missing canonical type. Auto-discovered
    # ones get the text/URL filled; unmatched ones stay empty so the padding
    # step renders them as 'Auf der Website nicht gefunden'.
    # VW fix: if an empty entry already exists (URL set, but the fetch
    # returned zero/minimal text), update it in place instead of duplicating.
    # NOTE(review): the lookup compares doc_type literally, so an entry
    # submitted under an alias ('datenschutz', 'privacy') is not reused for
    # 'dse' — confirm whether that is intended.
    filled = 0
    for dt in missing_ordered:
        existing = next(
            (e for e in doc_entries if e.get("doc_type") == dt), None
        )
        new_entry: dict = existing if existing else {
            "doc_type": dt,
            "url": "",
            "text": "",
            "word_count": 0,
            "auto_discovered": False,
            "discovery_attempted": True,
            "cmp_payloads": [],
        }
        new_entry["discovery_attempted"] = True

        found = by_type.get(dt)
        if found:
            source_base, d = found
            full = d.get("full_text") or d.get("text_preview") or ""
            # For cookie: prefer the CMP-reconstructed text when it's
            # substantially richer than the auto-discovered DOM extraction.
            # BMW homepage CMP yields ~1800 words of authoritative policy;
            # DOM extraction typically yields ~600 words of site chrome.
            if dt == "cookie" and disc_cookie_texts:
                cmp_merged = "\n\n".join(disc_cookie_texts)
                if len(cmp_merged.split()) > len(full.split()):
                    logger.info(
                        "cookie: using CMP-reconstructed text (%d words) "
                        "instead of DOM (%d words)",
                        len(cmp_merged.split()),
                        len(full.split()),
                    )
                    full = cmp_merged

            if len(full.split()) >= _MIN_DOC_WORDS:
                # 'url' may be present but None in the service response.
                found_url = d.get("url") or ""
                new_entry["text"] = full
                # Keep the original URL as "rejected_url" so the audit shows
                # 'X was a 404, we found Y instead'.
                old_url = existing.get("url") if existing else None
                replaced_failed = (old_url or "").strip() in failed_urls
                if replaced_failed:
                    new_entry["rejected_url"] = old_url
                new_entry["url"] = found_url
                new_entry["word_count"] = len(full.split())
                new_entry["auto_discovered"] = True
                if dt == "cookie" and disc_payloads:
                    new_entry["cmp_payloads"] = disc_payloads
                doc_texts[dt] = full
                filled += 1
                logger.info(
                    "auto-discovered %s on %s: %s (%d words)%s",
                    dt,
                    source_base,
                    found_url[:80],
                    new_entry["word_count"],
                    " [REPLACED failed URL]" if replaced_failed else "",
                )

        if not existing:
            doc_entries.append(new_entry)

    logger.info(
        "auto-discovery: filled %d/%d missing types from %s",
        filled,
        len(missing),
        ", ".join(crawl_bases),
    )