c2c8783fee
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 native ZIP attachment in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py was extended with an attachments parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 grün (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
231 lines
9.6 KiB
Python
231 lines
9.6 KiB
Python
"""Auto-discovery of missing canonical doc-types.

For each canonical type the user did NOT submit, try to find it on the
homepage of the URLs they DID submit. Also follow same-owner subdomains
mentioned in the submitted text (BMW Group → bmwgroup.com etc.).

Discovered docs are classified by `_classify_discovered_doc` and merged
back into `doc_entries`; entries that stayed empty get
`discovery_attempted=True` so the padding step can differentiate
"Nicht eingereicht" from "Auf der Website nicht gefunden".
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import logging
|
|
import re
|
|
from urllib.parse import urlparse
|
|
|
|
import httpx
|
|
|
|
from ._constants import _ALL_DOC_TYPES, CONSENT_TESTER_URL
|
|
from ._helpers import _classify_discovered_doc, _update
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
def _strip_www(netloc: str) -> str:
    """Lower-case *netloc* and drop a single leading ``www.`` label."""
    host = netloc.lower()
    # str.lstrip("www.") removes any leading run of the characters 'w' and
    # '.', which mangles hosts such as "web.de" (-> "eb.de"). Compare against
    # the literal prefix instead.
    return host[4:] if host.startswith("www.") else host


def _related_crawl_bases(
    doc_entries: list[dict],
    primary_base: str,
    max_bases: int = 3,
) -> list[str]:
    """Return the crawl plan: the primary base plus related owner domains.

    Related domains are URLs found in the entries' texts whose host contains
    the first label of the primary host (e.g. 'bmw' -> bmwgroup.com). The
    result always starts with *primary_base* and holds at most *max_bases*
    items.
    """
    crawl_bases: list[str] = [primary_base]
    primary_netloc = _strip_www(urlparse(primary_base).netloc)
    owner_token = primary_netloc.split(".")[0]  # 'bmw'
    # Very short tokens would match far too many unrelated hosts.
    if len(owner_token) < 3:
        return crawl_bases

    domain_re = re.compile(
        r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token)
        + r"[a-z0-9\-]*\.[a-z]{2,}",
        re.IGNORECASE,
    )
    seen_bases = {primary_base}
    for entry in doc_entries:
        for match in domain_re.finditer(entry.get("text") or ""):
            parsed = urlparse(match.group(0))
            candidate = f"{parsed.scheme}://{parsed.netloc}/"
            if _strip_www(parsed.netloc) == primary_netloc:
                continue  # same site as the primary base
            if candidate in seen_bases:
                continue
            seen_bases.add(candidate)
            crawl_bases.append(candidate)
            if len(crawl_bases) >= max_bases:
                return crawl_bases
    return crawl_bases


async def _autodiscover_missing(
    check_id: str,
    doc_entries: list[dict],
    doc_texts: dict[str, str],
    url_text_cache: dict[str, str],
) -> None:
    """Try to find every canonical doc_type the user did not submit.

    The homepage of the most frequently submitted site (plus up to two
    related owner domains mentioned in the submitted texts) is sent to the
    consent tester's ``/dsi-discovery`` endpoint, and the returned documents
    are classified with `_classify_discovered_doc`.

    Args:
        check_id: Passed to `_update` for the progress message.
        doc_entries: Modified in place. Entries for missing types get
            text/url/word_count filled and `auto_discovered=True` when a
            document was found. Every missing entry is marked
            `discovery_attempted=True` once a crawl was attempted, so the
            report can distinguish 'Nicht eingereicht' from
            'Auf der Website nicht gefunden'.
        doc_texts: Modified in place; receives the text of each filled type.
        url_text_cache: Not referenced by this function.

    Crawl failures are logged and skipped; they never raise.
    """
    # VW fix: only doc types with a substantial amount of text count as
    # 'submitted'. If the user entered a URL that returns 404 (VW
    # cookie-richtlinie.html) or the crawler extracted fewer than 200
    # characters (SPA shell), treat the type as missing so the discovery
    # pass tries alternative URLs.
    min_useful_chars = 200

    def _text_len(entry: dict) -> int:
        return len((entry.get("text") or "").strip())

    submitted_types = {
        e["doc_type"] for e in doc_entries
        if _text_len(e) >= min_useful_chars
    }
    # Remember URLs whose fetch yielded too little text; they are used below
    # to record the original URL as `rejected_url` when a replacement is found.
    failed_urls: set[str] = {
        (e.get("url") or "").strip()
        for e in doc_entries
        if (e.get("url") or "").strip() and _text_len(e) < min_useful_chars
    }
    if failed_urls:
        logger.info(
            "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery "
            "soll Alternativen probieren: %s",
            len(failed_urls), min_useful_chars,
            ", ".join(sorted(failed_urls)[:3]),
        )

    # Map alias types to canonical.
    submitted_canon = {
        "dse" if t in ("datenschutz", "privacy") else t
        for t in submitted_types
    }
    # Missing = canonical types the user did NOT submit. Kept in the order
    # of _ALL_DOC_TYPES so appended placeholders are deterministic.
    missing = [
        t for t in dict.fromkeys(_ALL_DOC_TYPES) if t not in submitted_canon
    ]
    if not missing:
        return

    # Pick the most common base (scheme://netloc) from submitted URLs.
    bases: dict[str, int] = {}
    for e in doc_entries:
        u = (e.get("url") or "").strip()
        if u and "://" in u:
            p = urlparse(u)
            origin = f"{p.scheme}://{p.netloc}"
            bases[origin] = bases.get(origin, 0) + 1
    if not bases:
        # No submitted URL at all — nothing to crawl from. Add empty
        # placeholders (with discovery_attempted=False) so the padding
        # step renders them as 'Nicht eingereicht' (not 'Nicht gefunden').
        for dt in missing:
            doc_entries.append({
                "doc_type": dt, "url": "", "text": "", "word_count": 0,
                "auto_discovered": False, "discovery_attempted": False,
            })
        return

    # Build crawl plan: primary base + any related domains mentioned in
    # the submitted texts that share the owner's SLD. Example: BMW Group
    # text mentions bmwgroup.com and bmwgroup.jobs in addition to bmw.de.
    primary_base = max(bases, key=bases.get) + "/"
    crawl_bases = _related_crawl_bases(doc_entries, primary_base)

    _update(
        check_id,
        f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...",
        18,
    )

    # Each discovered document is paired with the base it was found on so
    # the log lines below can report the real origin.
    discovered: list[tuple[str, dict]] = []
    disc_payloads: list[dict] = []
    disc_cookie_texts: list[str] = []
    for crawl_base in crawl_bases:
        try:
            # P90: 180s -> 300s
            async with httpx.AsyncClient(timeout=300.0) as client:
                resp = await client.post(
                    f"{CONSENT_TESTER_URL}/dsi-discovery",
                    json={"url": crawl_base, "max_documents": 15},
                )
            if resp.status_code != 200:
                logger.warning("auto-discovery: HTTP %d for %s",
                               resp.status_code, crawl_base)
                continue
            body = resp.json()
            docs = body.get("documents", []) or []
            payloads = body.get("cmp_payloads") or []
            cmp_text = body.get("cmp_cookie_text") or ""
            discovered.extend((crawl_base, doc) for doc in docs)
            disc_payloads.extend(payloads)
            if cmp_text:
                disc_cookie_texts.append(cmp_text)
            logger.info("auto-discovery on %s: %d docs, %d CMP payloads, "
                        "cmp_cookie_text=%d words", crawl_base,
                        len(docs), len(payloads), len(cmp_text.split()))
        except Exception as e:
            # Best effort: a failing base must not abort the whole check.
            # P90: verbose exception for diagnosis.
            logger.warning("auto-discovery failed for %s: %s (%s)",
                           crawl_base, str(e) or "(empty)", type(e).__name__)

    # Classify each discovered doc into a canonical doc_type; the first
    # sufficiently long hit per type wins.
    by_type: dict[str, tuple[str, dict]] = {}
    for source_base, d in discovered:
        title = (d.get("title") or "").lower()
        url = (d.get("url") or "").lower()
        if (d.get("word_count") or 0) < 100:
            continue
        canon = _classify_discovered_doc(title, url)
        if canon and canon in missing and canon not in by_type:
            by_type[canon] = (source_base, d)

    # Append/update an entry for every missing canonical type. Auto-discovered
    # ones get the text/URL filled; unmatched ones stay empty so the
    # padding step renders them as 'Auf der Website nicht gefunden'.
    # VW fix: if an (effectively empty) entry already exists — URL set, but
    # the fetch returned no or very little text — update it in place instead
    # of appending a duplicate.
    # NOTE(review): the lookup compares doc_type to the canonical name, so an
    # entry submitted as 'datenschutz'/'privacy' is not reused for 'dse' and
    # a second entry is appended — confirm whether that is intended.
    filled = 0
    for dt in missing:
        existing = next(
            (e for e in doc_entries if e.get("doc_type") == dt), None,
        )
        new_entry: dict = existing if existing else {
            "doc_type": dt, "url": "", "text": "", "word_count": 0,
            "auto_discovered": False, "discovery_attempted": True,
            "cmp_payloads": [],
        }
        new_entry["discovery_attempted"] = True

        hit = by_type.get(dt)
        if hit:
            source_base, d = hit
            full = d.get("full_text") or d.get("text_preview") or ""
            # For cookie: prefer the CMP-reconstructed text when it's
            # substantially richer than the auto-discovered DOM extraction.
            # BMW homepage CMP yields ~1800 words of authoritative policy;
            # DOM extraction typically yields ~600 words of site chrome.
            if dt == "cookie" and disc_cookie_texts:
                cmp_merged = "\n\n".join(disc_cookie_texts)
                if len(cmp_merged.split()) > len(full.split()):
                    logger.info(
                        "cookie: using CMP-reconstructed text (%d words) "
                        "instead of DOM (%d words)",
                        len(cmp_merged.split()), len(full.split()),
                    )
                    full = cmp_merged
            word_count = len(full.split())
            if word_count >= 100:
                new_entry["text"] = full
                # Keep the original URL as "rejected_url" so the audit can
                # show 'X was 404, we found Y instead'.
                if existing and (existing.get("url") or "").strip() in failed_urls:
                    new_entry["rejected_url"] = existing.get("url")
                new_entry["url"] = d.get("url", "")
                new_entry["word_count"] = word_count
                new_entry["auto_discovered"] = True
                if dt == "cookie" and disc_payloads:
                    new_entry["cmp_payloads"] = disc_payloads
                doc_texts[dt] = full
                filled += 1
                logger.info(
                    "auto-discovered %s on %s: %s (%d words)%s",
                    dt, source_base, d.get("url", "")[:80],
                    new_entry["word_count"],
                    " [REPLACED failed URL]" if existing else "",
                )
        if not existing:
            doc_entries.append(new_entry)

    logger.info(
        "auto-discovery: filled %d/%d missing types from %s",
        filled, len(missing), ", ".join(crawl_bases),
    )
|