refactor(agent-check): split routes file (2692→347 LOC) + wire B1/B3/A1 [guardrail-change]
Phase-5 split of agent_compliance_check_routes.py — the 2700-line
monolith was decomposed into 19 modules in compliance/api/agent_check/:
- Phase A-F: resolve / profile+check / banner+TCF / vendors raw+finalize /
HTML blocks top+mid+bot / email / persist
- Helpers: _constants, _helpers, _fetch, _discovery, _single_check
- Schemas + State + thin _orchestrator
A1 ZIP-Anhang nativ in _phase_e_email: evidence_zip_builder.py bundles
slices + manifest.json + audit_metadata.json (SHA256 per slice +
build_sha + source_url). smtp_sender.py erweitert um attachments-Parameter.
B1 COOKIE-CONSENT-UX-001 (Mobile Reachability): consent_reachability_check.py
parses footer anchors, classifies intent (reopen_cmp / info_only /
browser_deflect) + target (same_page_cmp / new_tab / external).
_b1_wiring.py fetches homepage with iPhone-UA + renders Art-7-Abs-3
severity-coloured block.
B3 TH-RETENTION (Cross-Doc Speicherdauer): retention_comparator.py
compares DSI claim ↔ cookie-table duration ↔ actual Max-Age/expires
with 5% tolerance + severity hierarchy (dsi_under_actual HIGH,
table_under_actual HIGH, dsi_vs_table MEDIUM, actual_under_table LOW
Safari-ITP-Hint). _b3_wiring.py + Top-10 mismatches table in mail.
Side-effects:
- Fixed silent UnboundLocalError in original Step 5 (gf_one_pager used
audit_quality_findings before declaration, caught by surrounding
except → block never rendered). New _phase_d3_blocks_bot.py runs
audit-quality FIRST.
- agent_compliance_check_routes.py removed from loc-exceptions.txt
("Phase 5 split target" — done).
Tests: 55/55 grün (B1 22 + B3 27 + saving_scan 6).
E2E: smoke against Elli DSE+Cookie produced HIGH/missing B1 finding,
TH-RETENTION table (17 cookies / 3 ✓ / 3 ✗ / 11 ?), evidence-zip
with 2 slices + manifest + audit_metadata (12089B, SHA256-chained,
source verified), email sent (attachments=1).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,230 @@
|
||||
"""Auto-discovery of missing canonical doc-types.
|
||||
|
||||
For each canonical type the user did NOT submit, try to find it on the
|
||||
homepage of the URLs they DID submit. Also follow same-owner subdomains
|
||||
mentioned in the submitted text (BMW Group → bmwgroup.com etc.).
|
||||
|
||||
Discovered docs are classified by `_classify_discovered_doc` and merged
|
||||
back into `doc_entries`; entries that stayed empty get
|
||||
`discovery_attempted=True` so the padding step can differentiate
|
||||
"Nicht eingereicht" from "Auf der Website nicht gefunden".
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import httpx
|
||||
|
||||
from ._constants import _ALL_DOC_TYPES, CONSENT_TESTER_URL
|
||||
from ._helpers import _classify_discovered_doc, _update
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _strip_www_prefix(netloc: str) -> str:
    """Return *netloc* lower-cased and without one leading ``www.`` label.

    ``str.lstrip("www.")`` must not be used for this: its argument is a
    *set of characters*, so it would also eat the leading ``w`` of hosts
    such as ``web.de`` or ``wikipedia.org``.
    """
    host = netloc.lower()
    return host[4:] if host.startswith("www.") else host


def _canonical_doc_type(doc_type):
    """Map the alias doc-types 'datenschutz' / 'privacy' onto 'dse'."""
    return "dse" if doc_type in ("datenschutz", "privacy") else doc_type


def _existing_entry_for(doc_entries: list[dict], doc_type: str) -> "dict | None":
    """Return the entry in *doc_entries* that represents *doc_type*.

    An entry whose ``doc_type`` equals *doc_type* exactly is preferred;
    otherwise the first entry whose alias maps onto *doc_type* is used.
    Returns ``None`` when no such entry exists.
    """
    matches = [
        e for e in doc_entries
        if _canonical_doc_type(e.get("doc_type")) == doc_type
    ]
    for entry in matches:
        if entry.get("doc_type") == doc_type:
            return entry
    return matches[0] if matches else None


def _related_crawl_bases(
    primary_base: str,
    doc_entries: list[dict],
    limit: int = 3,
) -> list[str]:
    """Return the crawl plan: *primary_base* plus related owner domains.

    The first label of the primary host (e.g. 'bmw' for bmw.de) is used as
    an owner token. Every URL mentioned in the entries' texts whose
    registrable label starts with that token (bmwgroup.com, bmwgroup.jobs,
    ...) is added as an additional base, up to *limit* bases in total.
    Hosts are compared case-insensitively and without a leading ``www.``
    so the same site is never crawled twice.
    """
    crawl_bases: list[str] = [primary_base]
    primary_host = _strip_www_prefix(urlparse(primary_base).netloc)
    owner_token = primary_host.split(".")[0]  # 'bmw'
    # Very short tokens would match far too many unrelated domains.
    if len(owner_token) < 3:
        return crawl_bases

    domain_re = re.compile(
        r"https?://([a-z0-9][a-z0-9\-]*\.)*" + re.escape(owner_token)
        + r"[a-z0-9\-]*\.[a-z]{2,}",
        re.IGNORECASE,
    )
    seen_hosts = {primary_host}
    for entry in doc_entries:
        for match in domain_re.finditer(entry.get("text") or ""):
            parsed = urlparse(match.group(0))
            host = _strip_www_prefix(parsed.netloc)
            if host in seen_hosts:
                continue
            seen_hosts.add(host)
            crawl_bases.append(f"{parsed.scheme}://{parsed.netloc.lower()}/")
            if len(crawl_bases) >= limit:
                return crawl_bases
    return crawl_bases


async def _autodiscover_missing(
    check_id: str,
    doc_entries: list[dict],
    doc_texts: dict[str, str],
    url_text_cache: dict[str, str],
) -> None:
    """Try to find every canonical doc-type the user did not submit.

    For each type in ``_ALL_DOC_TYPES`` without a usable submission, the
    consent-tester ``/dsi-discovery`` endpoint is asked to crawl the
    homepage of the most frequently submitted site (plus up to two related
    owner domains mentioned in the submitted texts).

    Args:
        check_id: Identifier passed to ``_update`` for progress reporting.
        doc_entries: Modified in place. Discovered documents fill
            ``text`` / ``url`` / ``word_count`` and get
            ``auto_discovered=True``. Every missing entry that was searched
            for gets ``discovery_attempted=True`` (even when nothing was
            found) so the report can distinguish 'Nicht eingereicht' from
            'Auf der Website nicht gefunden'. When no submitted URL exists
            there is nothing to crawl and placeholders carry
            ``discovery_attempted=False``.
        doc_texts: Modified in place; receives the discovered text keyed by
            canonical doc-type.
        url_text_cache: Not used by this function; kept so the call
            signature stays unchanged.
    """
    # VW fix: only doc-types that yielded substantial text count as
    # 'submitted'. If the user entered a URL that returns 404, or the
    # crawler extracted fewer than 200 characters (SPA shell), treat the
    # type as missing so the discovery pass tries alternative URLs.
    _MIN_USEFUL_CHARS = 200
    _MIN_DOC_WORDS = 100

    def _has_useful_text(entry: dict) -> bool:
        return len((entry.get("text") or "").strip()) >= _MIN_USEFUL_CHARS

    submitted_canon = {
        _canonical_doc_type(e.get("doc_type"))
        for e in doc_entries
        if _has_useful_text(e)
    }

    # Remember URL submissions that failed, so a successful discovery can
    # record the original address as 'rejected_url'.
    failed_urls: set[str] = {
        (e.get("url") or "").strip()
        for e in doc_entries
        if (e.get("url") or "").strip() and not _has_useful_text(e)
    }
    if failed_urls:
        logger.info(
            "VW-Fix: %d eingegebene URLs lieferten <%d Zeichen — Discovery "
            "soll Alternativen probieren: %s",
            len(failed_urls), _MIN_USEFUL_CHARS,
            ", ".join(sorted(failed_urls)[:3]),
        )

    # Missing = canonical types without a usable submission. Keeping the
    # order of _ALL_DOC_TYPES makes the appended placeholders deterministic.
    missing = [
        dt for dt in dict.fromkeys(_ALL_DOC_TYPES)
        if dt not in submitted_canon
    ]
    if not missing:
        return
    missing_set = set(missing)

    # Pick the most common base (scheme://netloc) from submitted URLs.
    bases: dict[str, int] = {}
    for e in doc_entries:
        u = (e.get("url") or "").strip()
        if u and "://" in u:
            p = urlparse(u)
            origin = f"{p.scheme}://{p.netloc}"
            bases[origin] = bases.get(origin, 0) + 1

    if not bases:
        # No submitted URL at all — nothing to crawl from. Make sure each
        # missing type has an entry with discovery_attempted=False so the
        # padding step renders it as 'Nicht eingereicht' (not 'Nicht
        # gefunden'). An already existing (empty) entry is reused instead
        # of being duplicated.
        for dt in missing:
            existing = _existing_entry_for(doc_entries, dt)
            if existing is None:
                doc_entries.append({
                    "doc_type": dt, "url": "", "text": "", "word_count": 0,
                    "auto_discovered": False, "discovery_attempted": False,
                })
            else:
                existing.setdefault("auto_discovered", False)
                existing.setdefault("discovery_attempted", False)
        return

    # Crawl plan: primary base + related domains mentioned in the submitted
    # texts that share the owner's name. Example: BMW Group text mentions
    # bmwgroup.com and bmwgroup.jobs in addition to bmw.de.
    primary_base = max(bases, key=bases.get) + "/"
    crawl_bases = _related_crawl_bases(primary_base, doc_entries)

    _update(
        check_id,
        f"Suche fehlende Dokumente auf {', '.join(urlparse(b).netloc for b in crawl_bases)}...",
        18,
    )

    # Each discovered document is stored together with the base it came
    # from, so later log lines can name the real source site.
    discovered: list[tuple[str, dict]] = []
    disc_payloads: list[dict] = []
    disc_cookie_texts: list[str] = []
    for base in crawl_bases:
        try:
            # The client-level timeout applies to every request made with
            # it (P90: raised from 180s to 300s).
            async with httpx.AsyncClient(timeout=300.0) as client:
                resp = await client.post(
                    f"{CONSENT_TESTER_URL}/dsi-discovery",
                    json={"url": base, "max_documents": 15},
                )
            if resp.status_code != 200:
                logger.warning("auto-discovery: HTTP %d for %s",
                               resp.status_code, base)
                continue
            body = resp.json()
            docs = body.get("documents") or []
            payloads = body.get("cmp_payloads") or []
            cmp_text = body.get("cmp_cookie_text") or ""
            discovered.extend((base, doc) for doc in docs)
            disc_payloads.extend(payloads)
            if cmp_text:
                disc_cookie_texts.append(cmp_text)
            logger.info("auto-discovery on %s: %d docs, %d CMP payloads, "
                        "cmp_cookie_text=%d words", base,
                        len(docs), len(payloads), len(cmp_text.split()))
        except Exception as e:
            # Best effort: a failing base must not abort the whole check.
            # P90: log exception type as well, str(e) is often empty.
            logger.warning("auto-discovery failed for %s: %s (%s)",
                           base, str(e) or "(empty)", type(e).__name__)

    # Classify each discovered doc into a canonical doc_type; the first
    # sufficiently long document per type wins.
    by_type: dict[str, tuple[str, dict]] = {}
    for source_base, d in discovered:
        if (d.get("word_count") or 0) < _MIN_DOC_WORDS:
            continue
        title = (d.get("title") or "").lower()
        url = (d.get("url") or "").lower()
        canon = _classify_discovered_doc(title, url)
        if canon and canon in missing_set and canon not in by_type:
            by_type[canon] = (source_base, d)

    # Append/update an entry for every missing canonical type. Discovered
    # ones get text/URL filled; unmatched ones stay empty so the padding
    # step renders them as 'Auf der Website nicht gefunden'.
    # VW fix: if an empty entry already exists (URL set, but the fetch
    # returned no/minimal text), update it in place instead of duplicating.
    filled = 0
    for dt in missing:
        existing = _existing_entry_for(doc_entries, dt)
        new_entry: dict = existing if existing is not None else {
            "doc_type": dt, "url": "", "text": "", "word_count": 0,
            "auto_discovered": False, "discovery_attempted": True,
            "cmp_payloads": [],
        }
        new_entry["discovery_attempted"] = True

        hit = by_type.get(dt)
        if hit:
            source_base, d = hit
            full = d.get("full_text") or d.get("text_preview") or ""
            # For cookie: prefer the CMP-reconstructed text when it is
            # richer than the auto-discovered DOM extraction. BMW homepage
            # CMP yields ~1800 words of authoritative policy; DOM
            # extraction typically yields ~600 words of site chrome.
            if dt == "cookie" and disc_cookie_texts:
                cmp_merged = "\n\n".join(disc_cookie_texts)
                if len(cmp_merged.split()) > len(full.split()):
                    logger.info(
                        "cookie: using CMP-reconstructed text (%d words) "
                        "instead of DOM (%d words)",
                        len(cmp_merged.split()), len(full.split()),
                    )
                    full = cmp_merged
            if len(full.split()) >= _MIN_DOC_WORDS:
                found_url = d.get("url") or ""
                # Keep the original URL as 'rejected_url' so the audit can
                # show 'X was 404, we found Y instead'.
                replaced_failed = (
                    existing is not None
                    and (existing.get("url") or "").strip() in failed_urls
                )
                if replaced_failed:
                    new_entry["rejected_url"] = existing.get("url")
                new_entry["text"] = full
                new_entry["url"] = found_url
                new_entry["word_count"] = len(full.split())
                new_entry["auto_discovered"] = True
                if dt == "cookie" and disc_payloads:
                    new_entry["cmp_payloads"] = disc_payloads
                doc_texts[dt] = full
                filled += 1
                logger.info(
                    "auto-discovered %s on %s: %s (%d words)%s",
                    dt, source_base, found_url[:80], new_entry["word_count"],
                    " [REPLACED failed URL]" if replaced_failed else "",
                )
        if existing is None:
            doc_entries.append(new_entry)

    logger.info(
        "auto-discovery: filled %d/%d missing types from %s",
        filled, len(missing), ", ".join(crawl_bases),
    )
|
||||
Reference in New Issue
Block a user