feat(agent): progress_pct + 6 BMW-Run Verbesserungen

Backend (agent_compliance_check_routes.py):
- progress_pct (0-100%) im Job-State, ueber alle Phasen verteilt
  (Laden 0-30, Profil 35-40, Pruefen 40-80, Banner 80-92, Report 95-100)
- Status-Texte vereinheitlicht ("Texte laden X/N", "Pruefen X/N")
- Firmenname fuer Email-Subject jetzt aus URL abgeleitet
  (bmw.de -> "BMW", mercedes-benz.de -> "Mercedes-Benz") statt
  unzuverlaessigem extracted_profile.companyName (matchte oft juris.de)
- E-Mail-Report enthaelt jetzt Banner+TCF-Vendor-Liste (build_provider_list_html)

Backend (agent_doc_check_extras.py — neu):
- build_scanned_urls_html: gepruefte URLs als Tabelle oben im Report
  (transparent fuer GF, welche Quellen wirklich gezogen wurden)
- Cross-Domain-Hinweis bei >1 netloc (BMW: bmw.de / bmwgroup.com /
  bmwgroup.jobs — Auffindbarkeit nach Art. 12 DSGVO)
- build_provider_list_html: Banner-Box + TCF-Vendor-Tabelle mit Spalten
  Name | Kategorie | Zweck | Drittland | Rechtsgrundlage

Backend (business_profiler.py):
- §34d-GewO Versicherungsvermittler-Hinweise zaehlen nicht mehr als
  "finance"-Industrie (BMW wurde dadurch falsch als B2B/finance erkannt)
- Neue Industry "automotive" (Fahrzeug/KFZ/Konfigurator/Modellpalette)
- B2B-Keywords: generische Begriffe wie "unternehmen", "beratung",
  "consulting" entfernt (matchten in jedem Konzerntext)
- B2C-Fallback: bei Verbraucher-Signalen ("verbraucher", "widerruf", "kunde",
  redaktioneller Inhalt) wird b2c statt b2b angenommen

Frontend (ComplianceCheckTab.tsx):
- Progress-Balken mit Width-% und XX%-Anzeige rechts
- liest data.progress_pct aus Polling-Response

Consent-Tester (dsi_discovery.py):
- Kritischer Fix der Cookie-Policy-Extraktion: wait_for_function wartet, bis
  body.innerText > 500 Zeichen enthaelt (BMW SPA-Rendering brauchte mehr Zeit)
- _extract_text_robust: 3-Strategien-Extraktion (Selektoren -> Body-
  Cleanup -> P/LI/TD-Tags)
- _extract_text_from_iframes: liest OneTrust/Sourcepoint/Usercentrics
  Iframe-Inhalte (manche Cookie-Policies leben dort)

Adressiert alle Findings aus dem BMW-Ground-Truth-Vergleich.
This commit is contained in:
Benjamin Admin
2026-05-16 17:53:14 +02:00
parent 4d1e0a7f8e
commit e61e9d9e2a
6 changed files with 515 additions and 53 deletions
@@ -73,6 +73,7 @@ export function ComplianceCheckTab() {
const [useAgent, setUseAgent] = useState(false) const [useAgent, setUseAgent] = useState(false)
const [loading, setLoading] = useState(false) const [loading, setLoading] = useState(false)
const [progress, setProgress] = useState('') const [progress, setProgress] = useState('')
const [progressPct, setProgressPct] = useState(0)
const [results, setResults] = useState<any>(() => { const [results, setResults] = useState<any>(() => {
if (typeof window === 'undefined') return null if (typeof window === 'undefined') return null
try { const s = localStorage.getItem(STORAGE_KEY_RESULTS); return s ? JSON.parse(s) : null } catch { return null } try { const s = localStorage.getItem(STORAGE_KEY_RESULTS); return s ? JSON.parse(s) : null } catch { return null }
@@ -109,15 +110,16 @@ export function ComplianceCheckTab() {
if (!res.ok) continue if (!res.ok) continue
const data = await res.json() const data = await res.json()
if (data.progress) setProgress(data.progress) if (data.progress) setProgress(data.progress)
if (typeof data.progress_pct === 'number') setProgressPct(data.progress_pct)
if (data.status === 'completed' && data.result) { if (data.status === 'completed' && data.result) {
setResults(data.result); setProgress(''); setLoading(false) setResults(data.result); setProgress(''); setProgressPct(0); setLoading(false)
localStorage.setItem(STORAGE_KEY_RESULTS, JSON.stringify(data.result)) localStorage.setItem(STORAGE_KEY_RESULTS, JSON.stringify(data.result))
localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('') localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('')
return return
} }
if (data.status === 'failed' || data.status === 'not_found') { if (data.status === 'failed' || data.status === 'not_found') {
if (data.status === 'failed') setError(data.error || 'Pruefung fehlgeschlagen') if (data.status === 'failed') setError(data.error || 'Pruefung fehlgeschlagen')
setProgress(''); setLoading(false) setProgress(''); setProgressPct(0); setLoading(false)
localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('') localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('')
return return
} }
@@ -177,6 +179,7 @@ export function ComplianceCheckTab() {
setError(null) setError(null)
setResults(null) setResults(null)
setProgress('Compliance-Check wird gestartet...') setProgress('Compliance-Check wird gestartet...')
setProgressPct(0)
try { try {
const entries = DOCUMENT_TYPES const entries = DOCUMENT_TYPES
@@ -210,9 +213,11 @@ export function ComplianceCheckTab() {
if (!pollRes.ok) { attempts++; continue } if (!pollRes.ok) { attempts++; continue }
const pollData = await pollRes.json() const pollData = await pollRes.json()
if (pollData.progress) setProgress(pollData.progress) if (pollData.progress) setProgress(pollData.progress)
if (typeof pollData.progress_pct === 'number') setProgressPct(pollData.progress_pct)
if (pollData.status === 'completed' && pollData.result) { if (pollData.status === 'completed' && pollData.result) {
setResults(pollData.result) setResults(pollData.result)
setProgress('') setProgress('')
setProgressPct(0)
localStorage.setItem(STORAGE_KEY_RESULTS, JSON.stringify(pollData.result)) localStorage.setItem(STORAGE_KEY_RESULTS, JSON.stringify(pollData.result))
localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('') localStorage.removeItem(STORAGE_KEY_CHECK_ID); setActiveCheckId('')
@@ -242,6 +247,7 @@ export function ComplianceCheckTab() {
} catch (e) { } catch (e) {
setError(e instanceof Error ? e.message : 'Unbekannter Fehler') setError(e instanceof Error ? e.message : 'Unbekannter Fehler')
setProgress('') setProgress('')
setProgressPct(0)
} finally { } finally {
setLoading(false) setLoading(false)
} }
@@ -334,12 +340,21 @@ export function ComplianceCheckTab() {
{/* Progress */} {/* Progress */}
{progress && ( {progress && (
<div className="bg-purple-50 border border-purple-200 rounded-lg p-3 text-sm text-purple-700 flex items-center gap-3"> <div className="bg-purple-50 border border-purple-200 rounded-lg p-3 text-sm text-purple-700 space-y-2">
<svg className="animate-spin w-4 h-4 text-purple-500 shrink-0" fill="none" viewBox="0 0 24 24"> <div className="flex items-center gap-3">
<circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" /> <svg className="animate-spin w-4 h-4 text-purple-500 shrink-0" fill="none" viewBox="0 0 24 24">
<path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z" /> <circle className="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" strokeWidth="4" />
</svg> <path className="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4z" />
{progress} </svg>
<span className="flex-1">{progress}</span>
<span className="text-xs font-mono text-purple-600 tabular-nums">{progressPct}%</span>
</div>
<div className="h-1.5 bg-purple-100 rounded-full overflow-hidden">
<div
className="h-full bg-purple-500 rounded-full transition-all duration-500 ease-out"
style={{ width: `${Math.max(2, progressPct)}%` }}
/>
</div>
</div> </div>
)} )}
@@ -56,6 +56,7 @@ class ComplianceCheckStatusResponse(BaseModel):
check_id: str check_id: str
status: str status: str
progress: str = "" progress: str = ""
progress_pct: int = 0
result: dict | None = None result: dict | None = None
error: str = "" error: str = ""
@@ -124,6 +125,7 @@ async def start_compliance_check(req: ComplianceCheckRequest):
_compliance_check_jobs[check_id] = { _compliance_check_jobs[check_id] = {
"status": "running", "status": "running",
"progress": "Pruefung gestartet...", "progress": "Pruefung gestartet...",
"progress_pct": 0,
"result": None, "result": None,
"error": "", "error": "",
} }
@@ -141,6 +143,7 @@ async def get_compliance_check_status(check_id: str):
check_id=check_id, check_id=check_id,
status=job["status"], status=job["status"],
progress=job.get("progress", ""), progress=job.get("progress", ""),
progress_pct=job.get("progress_pct", 0),
result=job.get("result"), result=job.get("result"),
error=job.get("error", ""), error=job.get("error", ""),
) )
@@ -155,16 +158,18 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
from .agent_doc_check_routes import CheckItem, DocCheckResult from .agent_doc_check_routes import CheckItem, DocCheckResult
from .agent_doc_check_report import build_html_report from .agent_doc_check_report import build_html_report
# Step 1: Resolve texts (fetch from URL if needed) # Step 1: Resolve texts (fetch from URL if needed) — 0-30%
_update(check_id, "Texte werden geladen...") _update(check_id, "Texte werden geladen...", 1)
doc_texts: dict[str, str] = {} doc_texts: dict[str, str] = {}
doc_entries: list[dict] = [] doc_entries: list[dict] = []
# Cache fetched URLs to detect duplicates # Cache fetched URLs to detect duplicates
url_text_cache: dict[str, str] = {} url_text_cache: dict[str, str] = {}
n_docs = max(1, len(req.documents))
for i, doc in enumerate(req.documents): for i, doc in enumerate(req.documents):
_update(check_id, f"Dokument {i+1}/{len(req.documents)}: {doc.doc_type}...") pct = int(1 + (i / n_docs) * 29)
_update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
text = doc.text text = doc.text
if not text and doc.url: if not text and doc.url:
url_key = doc.url.strip().rstrip("/").lower() url_key = doc.url.strip().rstrip("/").lower()
@@ -192,8 +197,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
split_shared_texts(doc_entries, url_text_cache) split_shared_texts(doc_entries, url_text_cache)
auto_fill_from_dsi(doc_entries) auto_fill_from_dsi(doc_entries)
# Step 1c: Cross-document search — find doc_types in wrong documents # Step 1c: Cross-document search — find doc_types in wrong documents (30-35%)
_update(check_id, "Dokumente werden uebergreifend durchsucht...") _update(check_id, "Dokumente werden uebergreifend durchsucht...", 32)
placement_findings = cross_search_documents(doc_entries) placement_findings = cross_search_documents(doc_entries)
# Refresh doc_texts after all splitting/searching # Refresh doc_texts after all splitting/searching
@@ -201,8 +206,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
if entry.get("text"): if entry.get("text"):
doc_texts[entry["doc_type"]] = entry["text"] doc_texts[entry["doc_type"]] = entry["text"]
# Step 2: Detect business profile # Step 2: Detect business profile (35-40%)
_update(check_id, "Geschaeftsmodell wird erkannt...") _update(check_id, "Geschaeftsmodell wird erkannt...", 37)
profile = await detect_business_profile(doc_texts) profile = await detect_business_profile(doc_texts)
profile_dict = asdict(profile) profile_dict = asdict(profile)
@@ -216,6 +221,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
# Filter out doc_types that don't apply to this business profile # Filter out doc_types that don't apply to this business profile
skip_types = _get_skip_types(profile) skip_types = _get_skip_types(profile)
# Document checks: 40-80%
n_entries = max(1, len(doc_entries))
for i, entry in enumerate(doc_entries): for i, entry in enumerate(doc_entries):
text = entry["text"] text = entry["text"]
doc_type = entry["doc_type"] doc_type = entry["doc_type"]
@@ -229,7 +236,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
)) ))
continue continue
_update(check_id, f"Pruefe {label} ({i+1}/{len(doc_entries)})...") pct = int(40 + (i / n_entries) * 40)
_update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
if not text or len(text) < 50: if not text or len(text) < 50:
results.append(DocCheckResult( results.append(DocCheckResult(
@@ -268,7 +276,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
parsed = urlparse(banner_url) parsed = urlparse(banner_url)
banner_url = f"{parsed.scheme}://{parsed.netloc}" banner_url = f"{parsed.scheme}://{parsed.netloc}"
if banner_url: if banner_url:
_update(check_id, "Cookie-Banner wird geprueft...") _update(check_id, "Cookie-Banner wird geprueft...", 82)
try: try:
async with httpx.AsyncClient(timeout=120.0) as client: async with httpx.AsyncClient(timeout=120.0) as client:
resp = await client.post( resp = await client.post(
@@ -280,9 +288,9 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
except Exception as e: except Exception as e:
logger.warning("Banner check failed: %s", e) logger.warning("Banner check failed: %s", e)
# Step 3c: Cross-check Banner vs Cookie-Richtlinie # Step 3c: Cross-check Banner vs Cookie-Richtlinie (88-90%)
if banner_result and "cookie" in doc_texts: if banner_result and "cookie" in doc_texts:
_update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...") _update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89)
cross_findings = _cross_check_banner_vs_cookie( cross_findings = _cross_check_banner_vs_cookie(
banner_result, doc_texts["cookie"], banner_result, doc_texts["cookie"],
) )
@@ -299,7 +307,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
tcf_vendors = banner_result.get("tcf_vendors", []) if banner_result else [] tcf_vendors = banner_result.get("tcf_vendors", []) if banner_result else []
vvt_entries: list[dict] = [] vvt_entries: list[dict] = []
if tcf_vendors and "dse" in doc_texts: if tcf_vendors and "dse" in doc_texts:
_update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...") _update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91)
from compliance.services.banner_cookie_cross_check import cross_check_vendors_vs_dsi from compliance.services.banner_cookie_cross_check import cross_check_vendors_vs_dsi
from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt
vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"]) vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"])
@@ -310,8 +318,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
r.checks.append(CheckItem(**vf)) r.checks.append(CheckItem(**vf))
vvt_entries = map_vendors_to_vvt(tcf_vendors) vvt_entries = map_vendors_to_vvt(tcf_vendors)
# Step 4: Extract profile hints from documents # Step 4: Extract profile hints from documents (92-95%)
_update(check_id, "Profil wird aus Dokumenten extrahiert...") _update(check_id, "Profil wird aus Dokumenten extrahiert...", 93)
from compliance.services.profile_extractor import extract_profile_from_documents from compliance.services.profile_extractor import extract_profile_from_documents
extracted_profile = extract_profile_from_documents(doc_texts, profile_dict) extracted_profile = extract_profile_from_documents(doc_texts, profile_dict)
@@ -326,21 +334,32 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
else: else:
r.scenario = "import" r.scenario = "import"
# Step 5: Build report with management summary # Step 5: Build report with management summary (95-98%)
_update(check_id, "Report wird erstellt...") _update(check_id, "Report wird erstellt...", 96)
from .agent_doc_check_report import build_management_summary from .agent_doc_check_report import (
build_management_summary,
build_scanned_urls_html,
build_provider_list_html,
)
summary_html = build_management_summary(results) summary_html = build_management_summary(results)
scanned_html = build_scanned_urls_html(doc_entries)
providers_html = build_provider_list_html(banner_result, vvt_entries)
report_html = build_html_report(results, None) report_html = build_html_report(results, None)
profile_html = _build_profile_html(profile) profile_html = _build_profile_html(profile)
full_html = summary_html + profile_html + report_html full_html = (
summary_html + scanned_html + profile_html
# Step 6: Send email — include website/company name in subject + providers_html + report_html
doc_count = len([r for r in results if not r.error])
site_name = (
extracted_profile.get("company_profile", {}).get("companyName")
or _extract_domain(doc_entries)
or "Unbekannt"
) )
# Step 6: Send email — derive site name primarily from entered URL.
# The extracted_profile.companyName is often noisy (e.g. picks up
# juris.de from legal references). Domain-derived name is more
# predictable for the GF email subject.
doc_count = len([r for r in results if not r.error])
url_company = _company_name_from_url(doc_entries)
domain = _extract_domain(doc_entries)
site_name = url_company or domain or "Unbekannt"
_update(check_id, "E-Mail wird versendet...", 98)
email_result = send_email( email_result = send_email(
recipient=req.recipient, recipient=req.recipient,
subject=f"[COMPLIANCE-CHECK] {site_name}{doc_count} Dokumente geprueft", subject=f"[COMPLIANCE-CHECK] {site_name}{doc_count} Dokumente geprueft",
@@ -368,6 +387,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
_compliance_check_jobs[check_id]["status"] = "completed" _compliance_check_jobs[check_id]["status"] = "completed"
_compliance_check_jobs[check_id]["result"] = response _compliance_check_jobs[check_id]["result"] = response
_compliance_check_jobs[check_id]["progress"] = "Fertig" _compliance_check_jobs[check_id]["progress"] = "Fertig"
_compliance_check_jobs[check_id]["progress_pct"] = 100
except Exception as e: except Exception as e:
logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True) logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True)
@@ -375,8 +395,11 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
_compliance_check_jobs[check_id]["error"] = str(e)[:500] _compliance_check_jobs[check_id]["error"] = str(e)[:500]
def _update(check_id: str, msg: str, pct: int | None = None):
    """Store the latest progress text (and optionally percentage) for a job.

    Args:
        check_id: Key of the job entry in the module-level job store.
        msg: Human-readable status text written to the job's "progress" field.
        pct: Optional completion percentage. When given it is converted to
            int and clamped to the 0..100 range before being written to
            "progress_pct"; when None the stored percentage is left untouched.
    """
    state = _compliance_check_jobs[check_id]
    state["progress"] = msg
    if pct is None:
        return
    clamped = min(100, int(pct))
    state["progress_pct"] = max(0, clamped)
async def _fetch_text(url: str) -> str: async def _fetch_text(url: str) -> str:
@@ -503,14 +526,59 @@ async def _check_single(
) )
# Two-label suffixes (e.g. "co.uk"). When a host ends in one of these,
# _company_name_from_url takes the third label from the right as the
# company part instead of the second.
_COMPOUND_TLDS = {
"co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
"com.au", "com.br", "com.mx", "com.tr", "com.sg",
}
def _extract_domain(doc_entries: list[dict]) -> str | None:
    """Return the host of the first usable entry URL, without a leading "www.".

    Args:
        doc_entries: Document entries; each may carry a "url" key.

    Returns:
        The lowercased host name with credentials, port and a leading "www."
        removed, or None when no entry has an absolute URL with a host.
    """
    from urllib.parse import urlparse

    for entry in doc_entries:
        url = entry.get("url") or ""
        if "://" not in url:
            continue
        try:
            # .hostname (unlike .netloc) excludes "user:pass@" and ":port",
            # so neither can end up in the email subject.
            host = urlparse(url).hostname or ""
        except ValueError:
            # Malformed authority part (e.g. broken IPv6 brackets): try the
            # next entry instead of failing the whole report.
            continue
        host = host.removeprefix("www.")
        if host:
            return host
    return None
def _company_name_from_url(doc_entries: list[dict]) -> str | None:
    """Derive a display company name from the first usable entry URL.

    Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"),
    uppercase short acronyms (<= 4 chars, no hyphens), and capitalize each
    hyphen-separated part of anything longer.

    Examples:
        www.bmw.de          -> BMW
        mercedes-benz.de    -> Mercedes-Benz
        shop.example.co.uk  -> Example
        juris.de            -> Juris

    Args:
        doc_entries: Document entries; each may carry a "url" key.

    Returns:
        The derived name, or None when no entry yields a usable host
        (no absolute URL, single-label host, or a bare IPv4 address).
    """
    from urllib.parse import urlparse

    for entry in doc_entries:
        url = entry.get("url") or ""
        if "://" not in url:
            continue
        try:
            # .hostname strips credentials and port; with .netloc a port would
            # break the compound-TLD match ("example.co.uk:8080" -> "CO") and
            # userinfo would leak into the name.
            host = urlparse(url).hostname or ""
        except ValueError:
            continue
        labels = host.removeprefix("www.").split(".")
        if len(labels) < 2:
            continue
        if all(label.isdigit() for label in labels):
            # IPv4 literal: there is no company name to derive from it.
            continue
        # Handle compound TLDs (.co.uk etc.): the name sits one label further left.
        if len(labels) >= 3 and ".".join(labels[-2:]) in _COMPOUND_TLDS:
            sld = labels[-3]
        else:
            sld = labels[-2]
        if not sld:
            continue
        if len(sld) <= 4 and "-" not in sld:
            return sld.upper()
        return "-".join(part.capitalize() for part in sld.split("-"))
    return None
@@ -0,0 +1,226 @@
"""
Extras for the agent doc-check email report.
Split out from agent_doc_check_report.py to keep both files under the
500-line hard cap. Contains:
- build_scanned_urls_html (list of fetched URLs + cross-domain notice)
- build_provider_list_html (cookie banner + TCF vendor table)
"""
from __future__ import annotations
def build_scanned_urls_html(doc_entries: list[dict]) -> str:
    """Render the list of scanned URLs shown at the top of the report.

    Makes transparent which sources were actually fetched/analysed. Entries
    without a URL (text-only uploads) and repeated URLs are skipped. When the
    URLs span two or more hosts, a cross-domain notice is appended.

    Args:
        doc_entries: Document entries; the keys read are "url", "doc_type"
            and "word_count".

    Returns:
        An HTML fragment, or "" when no entry has a URL.
    """
    from html import escape
    from urllib.parse import urlparse

    rows: list[str] = []
    seen: set[str] = set()
    domains: dict[str, list[str]] = {}  # host -> labels of the documents found there

    for entry in doc_entries:
        url = (entry.get("url") or "").strip()
        if not url or url in seen:
            continue
        seen.add(url)

        label = _doc_type_label(entry.get("doc_type", ""))
        words = entry.get("word_count") or 0
        try:
            # removeprefix, not lstrip: lstrip("www.") strips any leading
            # 'w'/'.' characters and would turn "web.de" into "eb.de".
            netloc = urlparse(url).netloc.lower().removeprefix("www.")
        except ValueError:
            netloc = ""
        if netloc:
            # Raw (unescaped) values are collected here; the notice renderer
            # receives them unchanged.
            domains.setdefault(netloc, []).append(label)

        # URL and doc_type originate from the request, so escape them before
        # interpolating into markup.
        safe_label = escape(str(label))
        safe_url = escape(url, quote=True)
        safe_words = escape(str(words))
        rows.append(
            f'<tr>'
            f'<td style="padding:3px 12px 3px 0;color:#475569;font-size:12px">{safe_label}</td>'
            f'<td style="padding:3px 12px 3px 0;font-size:12px;'
            f'font-family:ui-monospace,monospace;color:#1e293b;word-break:break-all">'
            f'<a href="{safe_url}" style="color:#2563eb;text-decoration:none">{safe_url}</a></td>'
            f'<td style="padding:3px 0;color:#94a3b8;font-size:11px;text-align:right;'
            f'white-space:nowrap">{safe_words} Woerter</td>'
            f'</tr>'
        )

    if not rows:
        return ""

    cross_domain_html = _cross_domain_notice(domains) if len(domains) >= 2 else ""

    return (
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 8px;font-size:14px;color:#334155">'
        f'Gepruefte Quellen ({len(rows)})</h3>'
        '<table style="width:100%;border-collapse:collapse">'
        + "".join(rows)
        + '</table>'
        + cross_domain_html
        + '</div>'
    )
def _cross_domain_notice(domains: dict[str, list[str]]) -> str:
    """Build the warning box shown when legal texts are spread across domains.

    Relevant for big corporate groups (BMW Group: bmw.de / bmwgroup.com /
    bmwgroup.jobs). The notice lists each host, sorted, together with the
    de-duplicated, sorted labels of the documents found there.

    Args:
        domains: Mapping of host name to the document labels found on it.

    Returns:
        An HTML fragment containing the notice and one list item per host.
    """
    from html import escape

    items = []
    for netloc, labels in sorted(domains.items()):
        # Host names and labels are derived from request data, so escape them
        # before interpolating into markup.
        labels_str = ", ".join(escape(str(label)) for label in sorted(set(labels)))
        items.append(
            f'<li style="margin-bottom:2px"><strong>{escape(str(netloc))}</strong> '
            f'<span style="color:#92400e;font-size:11px">&rarr; {labels_str}</span></li>'
        )
    return (
        '<div style="margin-top:12px;padding:10px 12px;background:#fffbeb;'
        'border-left:3px solid #f59e0b;border-radius:4px;font-size:12px;'
        'color:#78350f">'
        '<strong>Hinweis: Rechtstexte verteilt auf '
        f'{len(domains)} Domains.</strong> '
        'Erschwert die Auffindbarkeit fuer Betroffene (Art. 12 Abs. 1 DSGVO &mdash; '
        'transparente Information). Pruefen Sie, ob alle Texte auch von der '
        'Hauptdomain aus klar verlinkt sind.'
        '<ul style="margin:6px 0 0 16px;padding-left:0">'
        + "".join(items) +
        '</ul></div>'
    )
def _doc_type_label(doc_type: str) -> str:
    """Map a document-type key to its German display label.

    The table is kept local to this module; the original note says this
    avoids a circular import with agent_compliance_check_routes.

    Args:
        doc_type: Document-type key such as "dse" or "cookie".

    Returns:
        The mapped label; for unknown keys the key in upper case, and
        "Dokument" when the key is empty.
    """
    known = {
        "dse": "Datenschutzerklaerung",
        "datenschutz": "Datenschutzerklaerung",
        "privacy": "Datenschutzerklaerung",
        "impressum": "Impressum",
        "agb": "AGB",
        "widerruf": "Widerrufsbelehrung",
        "cookie": "Cookie-Richtlinie",
        "avv": "Auftragsverarbeitung",
        "loeschkonzept": "Loeschkonzept",
        "dsfa": "Datenschutz-Folgenabschaetzung",
        "social_media": "Social Media Datenschutz",
        "nutzungsbedingungen": "Nutzungsbedingungen",
        "dsb": "DSB-Kontakt",
    }
    if doc_type in known:
        return known[doc_type]
    if doc_type:
        return doc_type.upper()
    return "Dokument"
def build_provider_list_html(
    banner_result: dict | None,
    vvt_entries: list[dict] | None,
) -> str:
    """Render the cookie banner result + TCF vendor table for the email.

    Sections:
      1. Banner summary (detected yes/no, provider, violations count)
      2. Vendor table: Name | Kategorie | Zweck | Drittland | Rechtsgrundlage
         (at most 50 rows, followed by a "... und N weitere" line)

    Args:
        banner_result: Result dict of the banner check, or None. Keys read:
            "banner_detected", "banner_provider", "banner_checks".
        vvt_entries: Vendor entries to list, or None.

    Returns:
        An HTML fragment, or "" when both arguments are empty/None.
    """
    from html import escape

    if not banner_result and not vvt_entries:
        return ""

    parts: list[str] = [
        '<div style="font-family:-apple-system,BlinkMacSystemFont,sans-serif;'
        'max-width:700px;margin:0 auto 16px;padding:12px 16px;'
        'background:#fafafa;border:1px solid #e5e7eb;border-radius:8px">'
        '<h3 style="margin:0 0 10px;font-size:14px;color:#334155">'
        'Cookie-Banner &amp; Verarbeiter</h3>'
    ]

    if banner_result:
        detected = banner_result.get("banner_detected", False)
        # The provider name comes from the scanned site: escape it.
        provider = escape(str(banner_result.get("banner_provider") or "unbekannt"))
        # "banner_checks" may be present but None (or not a dict); .get() with
        # a default would not protect against that.
        checks = banner_result.get("banner_checks")
        violations = checks.get("violations", []) if isinstance(checks, dict) else []
        if isinstance(violations, list):
            n_viol = len(violations)
        else:
            try:
                n_viol = int(violations or 0)
            except (TypeError, ValueError):
                # Unexpected shape: report zero rather than failing the email.
                n_viol = 0
        status_color = "#16a34a" if detected and n_viol == 0 else (
            "#d97706" if detected else "#6b7280"
        )
        parts.append(
            f'<div style="font-size:13px;color:#374151;margin-bottom:10px">'
            f'<span style="display:inline-block;width:8px;height:8px;'
            f'border-radius:50%;background:{status_color};margin-right:8px"></span>'
            f'Banner erkannt: <strong>{"Ja" if detected else "Nein"}</strong>'
            f' &nbsp;&middot;&nbsp; Anbieter: <strong>{provider}</strong>'
            f' &nbsp;&middot;&nbsp; Auffaelligkeiten: <strong>{n_viol}</strong>'
            f'</div>'
        )

    vendors = vvt_entries or []
    if vendors:
        parts.append(
            f'<div style="font-size:12px;color:#475569;margin:8px 0 6px">'
            f'<strong>{len(vendors)} TCF-Verarbeiter ueber das Banner eingebunden:</strong>'
            f'</div>'
            '<table style="width:100%;border-collapse:collapse;font-size:11px">'
            '<thead><tr style="background:#f1f5f9;color:#475569;text-align:left">'
            '<th style="padding:5px 8px">Name</th>'
            '<th style="padding:5px 8px">Kategorie</th>'
            '<th style="padding:5px 8px">Zweck</th>'
            '<th style="padding:5px 8px">Drittland</th>'
            '<th style="padding:5px 8px">Rechtsgrundlage</th>'
            '</tr></thead><tbody>'
        )
        # Cap the table at 50 rows to keep the email readable.
        parts.extend(_render_vendor_row(v) for v in vendors[:50])
        parts.append('</tbody></table>')
        if len(vendors) > 50:
            parts.append(
                f'<div style="font-size:11px;color:#94a3b8;margin-top:4px">'
                f'... und {len(vendors) - 50} weitere</div>'
            )
    elif banner_result and banner_result.get("banner_detected"):
        parts.append(
            '<div style="font-size:11px;color:#94a3b8">'
            'Keine TCF-Verarbeiter erkannt (Banner nutzt kein TCF v2 Framework '
            'oder Vendor-Liste konnte nicht ausgelesen werden).</div>'
        )

    parts.append('</div>')
    return "".join(parts)
def _render_vendor_row(v: dict) -> str:
    """Render one vendor as a table row for the provider list.

    Columns: name, category label, purpose, third-country flag, legal basis.
    All vendor-supplied text is HTML-escaped before interpolation.

    Args:
        v: Vendor entry. Keys read: "name", "kategorie", "zweck_kurz",
            "zweck", "drittland", "land", "rechtsgrundlage".

    Returns:
        A single "<tr>...</tr>" HTML fragment.
    """
    from html import escape

    name = escape(str(v.get("name") or "Unbekannt"))
    kategorie = escape(str(_category_label(v.get("kategorie", ""))))

    purposes = v.get("zweck") or []
    if isinstance(purposes, str):
        # A bare string would otherwise be sliced character-wise ("ab" -> "a, b").
        purposes = [purposes]
    zweck = escape(str(
        v.get("zweck_kurz") or ", ".join(str(p) for p in list(purposes)[:2])
    ))

    drittland = v.get("drittland")
    land = escape(str(v.get("land") or ""))
    # Only the literal booleans count; anything else is rendered as unknown.
    if drittland is True:
        drittland_str = (f'<span style="color:#dc2626">Ja ({land})</span>'
                         if land else '<span style="color:#dc2626">Ja</span>')
    elif drittland is False:
        drittland_str = (f'<span style="color:#16a34a">Nein ({land})</span>'
                         if land else '<span style="color:#16a34a">Nein</span>')
    else:
        drittland_str = '<span style="color:#94a3b8">unbekannt</span>'

    # "or ''" guards against an explicit None, which would make the
    # substring checks below raise TypeError.
    rg = str(v.get("rechtsgrundlage") or "")
    if "Einwilligung" in rg:
        rg_short = "Einwilligung"
    elif "Berechtigtes" in rg:
        rg_short = "Berechtigtes Interesse"
    else:
        rg_short = rg[:40]
    rg_short = escape(rg_short)

    return (
        f'<tr style="border-top:1px solid #e2e8f0">'
        f'<td style="padding:4px 8px;color:#1e293b">{name}</td>'
        f'<td style="padding:4px 8px;color:#475569">{kategorie}</td>'
        f'<td style="padding:4px 8px;color:#475569">{zweck}</td>'
        f'<td style="padding:4px 8px">{drittland_str}</td>'
        f'<td style="padding:4px 8px;color:#475569">{rg_short}</td>'
        f'</tr>'
    )
def _category_label(kat: str) -> str:
    """Translate a cookie category key to its German label.

    Args:
        kat: Category key ("necessary", "functional", "statistics", "marketing").

    Returns:
        The German label for a known key; otherwise the key itself, or ""
        when the key is empty.
    """
    translations = {
        "necessary": "Notwendig",
        "functional": "Funktional",
        "statistics": "Statistik",
        "marketing": "Marketing",
    }
    if kat in translations:
        return translations[kat]
    return kat or ""
@@ -290,6 +290,15 @@ def _render_cookie_banner(html: list[str], cookie_result: dict) -> None:
html.append('</div>') html.append('</div>')
# Re-export the helpers extracted to agent_doc_check_extras.py so existing
# callers that did `from .agent_doc_check_report import build_scanned_urls_html`
# keep working.
from .agent_doc_check_extras import ( # noqa: E402,F401
build_provider_list_html,
build_scanned_urls_html,
)
def build_profile_html(profile) -> str: def build_profile_html(profile) -> str:
"""Build a small HTML block summarizing the detected business profile.""" """Build a small HTML block summarizing the detected business profile."""
service_tags = ", ".join(profile.detected_services[:10]) or "keine erkannt" service_tags = ", ".join(profile.detected_services[:10]) or "keine erkannt"
@@ -39,10 +39,13 @@ _B2C_KEYWORDS = [
] ]
_B2B_KEYWORDS = [ _B2B_KEYWORDS = [
"unternehmen", "geschaeftskunden", "geschäftskunden", "gewerblich", # Discriminative — these don't appear in B2C consumer texts
"auftraggeber", "auftragnehmer", "geschaeftspartner", "geschaeftskunden", "geschäftskunden", "firmenkunde", "b2b",
"geschäftspartner", "firmenkunde", "b2b", "industriekunden", "industriekunden", "ausschliesslich gewerblich", "ausschließlich gewerblich",
"beratung", "consulting", "dienstleistung", "engineering", "ausschliesslich unternehmer", "ausschließlich unternehmer",
"kein verbrauchergeschaeft", "kein verbrauchergeschäft",
# Note: "unternehmen", "beratung", "consulting", "dienstleistung"
# were removed — they match in any company text and bias toward B2B.
] ]
_B2G_KEYWORDS = [ _B2G_KEYWORDS = [
@@ -116,9 +119,20 @@ _INDUSTRY_KEYWORDS = {
"arbeitssicherheit", "brandschutz", "sicherheitstechnik", "zertifizierung"], "arbeitssicherheit", "brandschutz", "sicherheitstechnik", "zertifizierung"],
"manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer", "manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer",
"werkzeugbau", "spritzguss", "cnc", "industrietechnik"], "werkzeugbau", "spritzguss", "cnc", "industrietechnik"],
"automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen",
"gebrauchtwagen", "konfigurator", "modellreihe", "modellpalette"],
"media": ["redaktion", "verlag", "medien", "journalismus", "presse"], "media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
} }
# Terms that indicate "versicherung" / "bank" is only mentioned as a
# §34d/§34c GewO disclosure (Versicherungsvermittler / Finanzanlagenvermittler)
# rather than the core business. Used to suppress false finance matches.
_VERMITTLER_CONTEXT_TERMS = [
"versicherungsvermittler", "berufshaftpflichtversicherung",
"vermittlerregister", "§34d", "§ 34 d", "§34c", "§ 34 c",
"finanzanlagenvermittler", "ihk muenchen", "ihk münchen",
]
_TRACKING_SERVICES = { _TRACKING_SERVICES = {
"google analytics": "Google Analytics", "google analytics": "Google Analytics",
"google tag manager": "Google Tag Manager", "google tag manager": "Google Tag Manager",
@@ -231,13 +245,23 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
total = sum(max(0, v) for v in scores.values()) total = sum(max(0, v) for v in scores.values())
profile.confidence = round(best_val / total, 2) if total > 0 else 0.5 profile.confidence = round(best_val / total, 2) if total > 0 else 0.5
else: else:
# Fallback: GmbH/AG without B2C signals → assume B2B # Fallback: prefer B2C when the text mentions Verbraucherrechte,
has_company = any(kw in full_text for kw in [ # editorial content, or consumer-direction signals — even without
"gmbh", "ag ", "ohg", "kg ", "ug ", "gbr", # checkout keywords. Only fall back to B2B if discriminative B2B
]) # markers fired (which the keyword list above already filtered to
if has_company and b2c_score <= 0: # genuinely B2B-only terms).
consumer_hint = (
"verbraucher" in full_text
or "widerruf" in full_text
or "kunde" in full_text
or profile.has_editorial_content
)
if b2b_score >= 1 and not consumer_hint:
profile.business_type = "b2b" profile.business_type = "b2b"
profile.confidence = 0.4 profile.confidence = 0.4
elif consumer_hint:
profile.business_type = "b2c"
profile.confidence = 0.4
else: else:
profile.business_type = "unknown" profile.business_type = "unknown"
profile.confidence = 0.2 profile.confidence = 0.2
@@ -255,8 +279,31 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
if hits >= 1: if hits >= 1:
industry_scores[industry] = hits industry_scores[industry] = hits
# Suppress finance/insurance false positives caused by §34d/§34c GewO
# disclosures (Versicherungsvermittler, Berufshaftpflicht, etc.) — these
# are pflichtangaben for many companies (e.g. BMW AG) without being
# actual financial services providers.
if industry_scores.get("finance"):
vermittler_hits = _count_hits(full_text, _VERMITTLER_CONTEXT_TERMS)
if vermittler_hits >= 2:
# Only the §34d boilerplate triggered the match — drop or shrink.
non_insurance_finance = _count_hits(
full_text, ["bank", "finanz", "kredit", "anlage"],
)
if non_insurance_finance == 0:
industry_scores.pop("finance", None)
else:
industry_scores["finance"] = non_insurance_finance
# Require a clear winner — if top score is 1 and there are ties, prefer
# "unknown" over guessing.
if industry_scores: if industry_scores:
profile.industry = max(industry_scores, key=industry_scores.get) # type: ignore[arg-type] top = max(industry_scores.values())
winners = [k for k, v in industry_scores.items() if v == top]
if top >= 2 or len(winners) == 1:
profile.industry = winners[0]
else:
profile.industry = "unknown"
elif profile.is_regulated_profession: elif profile.is_regulated_profession:
prof_map = {"anwalt": "legal", "arzt": "healthcare", prof_map = {"anwalt": "legal", "arzt": "healthcare",
"steuerberater": "finance", "architekt": "craft"} "steuerberater": "finance", "architekt": "craft"}
+102 -5
View File
@@ -273,18 +273,35 @@ async def discover_dsi_documents(
is_self_dsi, self_lang = _matches_dsi_keyword(page_title) is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
if is_self_dsi: if is_self_dsi:
try: try:
# Wait for substantive content to appear (SPAs need time to render).
# Polls body.innerText length up to 10s. Many sites (BMW, Daimler)
# render via React/Vue after domcontentloaded fires.
try:
await page.wait_for_function(
"() => (document.body && document.body.innerText || '').length > 500",
timeout=10000,
)
except Exception:
pass # Continue anyway, extractor below has fallbacks
# Scroll to bottom to trigger lazy-loading of full content # Scroll to bottom to trigger lazy-loading of full content
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(1500) await page.wait_for_timeout(1500)
await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
await page.wait_for_timeout(1000) await page.wait_for_timeout(1000)
self_text = await page.evaluate("""() => { self_text = await _extract_text_robust(page)
const main = document.querySelector('main, article, [role="main"], .content, #content, .bodytext')
|| document.body;
return main ? main.innerText : document.body.innerText;
}""")
self_wc = len(self_text.split()) if self_text else 0 self_wc = len(self_text.split()) if self_text else 0
# If still too short, try same-origin iframes (some sites
# embed cookie policies via OneTrust/Sourcepoint iframes).
if self_wc < 100:
iframe_text = await _extract_text_from_iframes(page)
if iframe_text and len(iframe_text.split()) > self_wc:
self_text = iframe_text
self_wc = len(self_text.split())
logger.info("Self-extraction via iframe for %s: %d words", url, self_wc)
if self_wc >= 100: if self_wc >= 100:
page_title = await page.title() or url page_title = await page.title() or url
result.documents.append(DiscoveredDSI( result.documents.append(DiscoveredDSI(
@@ -622,3 +639,83 @@ async def _find_inline_dsi_sections(page: Page) -> list[dict]:
return sections or [] return sections or []
except Exception: except Exception:
return [] return []
async def _extract_text_robust(page: Page) -> str:
    """Extract the readable text of a page using three fallback strategies.

    A single in-page script tries the following in order and returns the
    first usable result, with all whitespace runs collapsed to one space:

    1. The first known content container (``main``, ``article``, ...) that
       holds more than 200 characters of text.
    2. A clone of ``<body>`` with nav/header/footer, script/style and
       cookie/banner elements removed, if more than 200 characters remain.
    3. The joined text of every ``p``/``li``/``dd``/``td``/``h1``-``h4``
       element whose text is longer than 20 characters.

    Text is read via ``textContent``, which also returns the source of inline
    ``<script>``/``<style>``/``<noscript>`` nodes. Strategy 1 therefore reads
    from a stripped clone, the same way strategy 2 does, so embedded
    scripts or styles do not end up in the extracted text.

    Args:
        page: Page object whose ``evaluate`` coroutine runs the script.

    Returns:
        The extracted text, or ``""`` if the script yields nothing or
        evaluation raises.
    """
    try:
        return await page.evaluate("""
            () => {
                const squash = (s) => (s || '').trim().replace(/\\s+/g, ' ');

                // textContent includes the source of inline script/style
                // nodes, so read it from a clone with those removed.
                const readableText = (el) => {
                    const copy = el.cloneNode(true);
                    copy.querySelectorAll('script, style, noscript')
                        .forEach(e => e.remove());
                    return squash(copy.textContent);
                };

                // 1) Specific content containers
                const selectors = [
                    '.article-content', '.page-content', '.entry-content',
                    '[class*="content-area"]', '[class*="main-content"]',
                    '[class*="legal-text"]', '[class*="policy-content"]',
                    'main article', 'main', 'article',
                    '[role="main"]', '.content', '#content', '.bodytext',
                ];
                for (const sel of selectors) {
                    const el = document.querySelector(sel);
                    // Cheap pre-check before cloning: stripping and
                    // collapsing can only shorten the text.
                    if (!el || el.textContent.trim().length <= 200) continue;
                    const text = readableText(el);
                    if (text.length > 200) return text;
                }

                // 2) Body minus nav/header/footer/scripts
                // NOTE(review): the "cookie" class/id filters may also drop
                // the policy container itself on a cookie-policy page;
                // strategy 3 is the safety net in that case.
                if (document.body) {
                    const body = document.body.cloneNode(true);
                    body.querySelectorAll(
                        'nav, header, footer, script, style, noscript,' +
                        ' [class*="nav"], [class*="sidebar"], [class*="cookie"],' +
                        ' [class*="banner"], [id*="cookie"], [id*="banner"]'
                    ).forEach(e => e.remove());
                    const bodyText = squash(body.textContent);
                    if (bodyText.length > 200) return bodyText;
                }

                // 3) Final fallback: collect all text-bearing tags
                const parts = [];
                for (const b of document.querySelectorAll('p, li, dd, td, h1, h2, h3, h4')) {
                    const t = (b.textContent || '').trim();
                    if (t.length > 20) parts.push(t);
                }
                return squash(parts.join(' '));
            }
        """) or ""
    except Exception as e:
        # Best effort: the caller treats an empty string as "nothing found".
        logger.warning("Robust text extraction failed: %s", e)
        return ""
async def _extract_text_from_iframes(page: Page) -> str:
    """Collect body text from child frames that may carry the policy text.

    A frame is read when its host equals the page's host, when its URL has
    no host at all (empty ``netloc``), or when its host contains the name of
    a known consent-management vendor (OneTrust, Sourcepoint, Usercentrics,
    ...). Frames whose text has 50 words or fewer are ignored.

    Args:
        page: Page whose ``url``, ``frames`` and ``main_frame`` are inspected.

    Returns:
        The texts of all accepted frames joined by blank lines, or ``""``
        when no frame qualifies or the extraction fails as a whole.
    """
    from urllib.parse import urlparse

    # Substrings identifying CMP vendor hosts; built once, not per frame.
    cmp_hosts = (
        "onetrust", "cookiebot", "consensu", "sourcepoint",
        "usercentrics", "didomi", "klaro",
    )
    try:
        page_host = urlparse(page.url).netloc
        chunks: list[str] = []
        for frame in page.frames:
            if frame == page.main_frame:
                continue
            try:
                frame_host = urlparse(frame.url).netloc
                # Accept same-host and host-less frames; foreign hosts only
                # when they belong to a known CMP vendor.
                is_foreign = bool(frame_host) and frame_host != page_host
                if is_foreign and not any(h in frame_host for h in cmp_hosts):
                    continue
                text = await frame.evaluate(
                    "() => (document.body && document.body.innerText || '').trim()"
                )
                if text and len(text.split()) > 50:
                    chunks.append(text)
            except Exception as e:
                # Best effort: one unreadable frame must not abort the
                # others, but leave a trace of why it was skipped.
                logger.debug("Skipping iframe during text extraction: %s", e)
                continue
        return "\n\n".join(chunks)
    except Exception as e:
        logger.debug("Iframe extraction failed: %s", e)
        return ""