feat(agent): progress_pct + 6 BMW-Run Verbesserungen
Backend (agent_compliance_check_routes.py):
- progress_pct (0-100%) im Job-State, ueber alle Phasen verteilt
(Laden 0-30, Profil 35-40, Pruefen 40-80, Banner 80-92, Report 95-100)
- Status-Texte vereinheitlicht ("Texte laden X/N", "Pruefen X/N")
- Firmenname fuer Email-Subject jetzt aus URL abgeleitet
(bmw.de -> "BMW", mercedes-benz.de -> "Mercedes-Benz") statt
unzuverlaessigem extracted_profile.companyName (matchte oft juris.de)
- E-Mail-Report enthaelt jetzt Banner+TCF-Vendor-Liste (build_provider_list_html)
Backend (agent_doc_check_extras.py — neu):
- build_scanned_urls_html: gepruefte URLs als Tabelle oben im Report
(transparent fuer GF, welche Quellen wirklich gezogen wurden)
- Cross-Domain-Hinweis bei >1 netloc (BMW: bmw.de / bmwgroup.com /
bmwgroup.jobs — Auffindbarkeit nach Art. 12 DSGVO)
- build_provider_list_html: Banner-Box + TCF-Vendor-Tabelle mit Spalten
Name | Kategorie | Zweck | Drittland | Rechtsgrundlage
Backend (business_profiler.py):
- §34d-GewO Versicherungsvermittler-Hinweise zaehlen nicht mehr als
"finance"-Industrie (BMW wurde dadurch falsch als B2B/finance erkannt)
- Neue Industry "automotive" (Fahrzeug/KFZ/Konfigurator/Modellpalette)
- B2B-Keywords: generische Begriffe wie "unternehmen", "beratung",
"consulting" entfernt (matchten in jedem Konzerntext)
- B2C-Fallback: bei Verbraucher-Signalen ("widerruf", "kunde",
redaktioneller Inhalt) tendiert die Erkennung jetzt zu b2c statt b2b
Frontend (ComplianceCheckTab.tsx):
- Progress-Balken mit Width-% und XX%-Anzeige rechts
- liest data.progress_pct aus Polling-Response
Consent-Tester (dsi_discovery.py):
- Kritischer Fix der Cookie-Policy-Extraktion: wait_for_function wartet,
bis body.innerText > 500 Zeichen hat (BMW SPA-Rendering brauchte mehr Zeit)
- _extract_text_robust: 3-Strategien-Extraktion (Selektoren -> Body-
Cleanup -> P/LI/TD-Tags)
- _extract_text_from_iframes: liest OneTrust/Sourcepoint/Usercentrics
Iframe-Inhalte (manche Cookie-Policies leben dort)
Adressiert alle Findings aus dem BMW-Ground-Truth-Vergleich.
This commit is contained in:
@@ -56,6 +56,7 @@ class ComplianceCheckStatusResponse(BaseModel):
|
||||
check_id: str
|
||||
status: str
|
||||
progress: str = ""
|
||||
progress_pct: int = 0
|
||||
result: dict | None = None
|
||||
error: str = ""
|
||||
|
||||
@@ -124,6 +125,7 @@ async def start_compliance_check(req: ComplianceCheckRequest):
|
||||
_compliance_check_jobs[check_id] = {
|
||||
"status": "running",
|
||||
"progress": "Pruefung gestartet...",
|
||||
"progress_pct": 0,
|
||||
"result": None,
|
||||
"error": "",
|
||||
}
|
||||
@@ -141,6 +143,7 @@ async def get_compliance_check_status(check_id: str):
|
||||
check_id=check_id,
|
||||
status=job["status"],
|
||||
progress=job.get("progress", ""),
|
||||
progress_pct=job.get("progress_pct", 0),
|
||||
result=job.get("result"),
|
||||
error=job.get("error", ""),
|
||||
)
|
||||
@@ -155,16 +158,18 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
from .agent_doc_check_routes import CheckItem, DocCheckResult
|
||||
from .agent_doc_check_report import build_html_report
|
||||
|
||||
# Step 1: Resolve texts (fetch from URL if needed)
|
||||
_update(check_id, "Texte werden geladen...")
|
||||
# Step 1: Resolve texts (fetch from URL if needed) — 0-30%
|
||||
_update(check_id, "Texte werden geladen...", 1)
|
||||
doc_texts: dict[str, str] = {}
|
||||
doc_entries: list[dict] = []
|
||||
|
||||
# Cache fetched URLs to detect duplicates
|
||||
url_text_cache: dict[str, str] = {}
|
||||
|
||||
n_docs = max(1, len(req.documents))
|
||||
for i, doc in enumerate(req.documents):
|
||||
_update(check_id, f"Dokument {i+1}/{len(req.documents)}: {doc.doc_type}...")
|
||||
pct = int(1 + (i / n_docs) * 29)
|
||||
_update(check_id, f"Texte laden {i+1}/{n_docs}: {doc.doc_type}...", pct)
|
||||
text = doc.text
|
||||
if not text and doc.url:
|
||||
url_key = doc.url.strip().rstrip("/").lower()
|
||||
@@ -192,8 +197,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
split_shared_texts(doc_entries, url_text_cache)
|
||||
auto_fill_from_dsi(doc_entries)
|
||||
|
||||
# Step 1c: Cross-document search — find doc_types in wrong documents
|
||||
_update(check_id, "Dokumente werden uebergreifend durchsucht...")
|
||||
# Step 1c: Cross-document search — find doc_types in wrong documents (30-35%)
|
||||
_update(check_id, "Dokumente werden uebergreifend durchsucht...", 32)
|
||||
placement_findings = cross_search_documents(doc_entries)
|
||||
|
||||
# Refresh doc_texts after all splitting/searching
|
||||
@@ -201,8 +206,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
if entry.get("text"):
|
||||
doc_texts[entry["doc_type"]] = entry["text"]
|
||||
|
||||
# Step 2: Detect business profile
|
||||
_update(check_id, "Geschaeftsmodell wird erkannt...")
|
||||
# Step 2: Detect business profile (35-40%)
|
||||
_update(check_id, "Geschaeftsmodell wird erkannt...", 37)
|
||||
profile = await detect_business_profile(doc_texts)
|
||||
profile_dict = asdict(profile)
|
||||
|
||||
@@ -216,6 +221,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
# Filter out doc_types that don't apply to this business profile
|
||||
skip_types = _get_skip_types(profile)
|
||||
|
||||
# Document checks: 40-80%
|
||||
n_entries = max(1, len(doc_entries))
|
||||
for i, entry in enumerate(doc_entries):
|
||||
text = entry["text"]
|
||||
doc_type = entry["doc_type"]
|
||||
@@ -229,7 +236,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
))
|
||||
continue
|
||||
|
||||
_update(check_id, f"Pruefe {label} ({i+1}/{len(doc_entries)})...")
|
||||
pct = int(40 + (i / n_entries) * 40)
|
||||
_update(check_id, f"Pruefen {i+1}/{n_entries}: {label}...", pct)
|
||||
|
||||
if not text or len(text) < 50:
|
||||
results.append(DocCheckResult(
|
||||
@@ -268,7 +276,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
parsed = urlparse(banner_url)
|
||||
banner_url = f"{parsed.scheme}://{parsed.netloc}"
|
||||
if banner_url:
|
||||
_update(check_id, "Cookie-Banner wird geprueft...")
|
||||
_update(check_id, "Cookie-Banner wird geprueft...", 82)
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||
resp = await client.post(
|
||||
@@ -280,9 +288,9 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
except Exception as e:
|
||||
logger.warning("Banner check failed: %s", e)
|
||||
|
||||
# Step 3c: Cross-check Banner vs Cookie-Richtlinie
|
||||
# Step 3c: Cross-check Banner vs Cookie-Richtlinie (88-90%)
|
||||
if banner_result and "cookie" in doc_texts:
|
||||
_update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...")
|
||||
_update(check_id, "Banner vs. Cookie-Richtlinie abgleichen...", 89)
|
||||
cross_findings = _cross_check_banner_vs_cookie(
|
||||
banner_result, doc_texts["cookie"],
|
||||
)
|
||||
@@ -299,7 +307,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
tcf_vendors = banner_result.get("tcf_vendors", []) if banner_result else []
|
||||
vvt_entries: list[dict] = []
|
||||
if tcf_vendors and "dse" in doc_texts:
|
||||
_update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...")
|
||||
_update(check_id, f"{len(tcf_vendors)} TCF-Verarbeiter vs. DSI abgleichen...", 91)
|
||||
from compliance.services.banner_cookie_cross_check import cross_check_vendors_vs_dsi
|
||||
from compliance.services.vendor_vvt_mapper import map_vendors_to_vvt
|
||||
vendor_findings = cross_check_vendors_vs_dsi(tcf_vendors, doc_texts["dse"])
|
||||
@@ -310,8 +318,8 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
r.checks.append(CheckItem(**vf))
|
||||
vvt_entries = map_vendors_to_vvt(tcf_vendors)
|
||||
|
||||
# Step 4: Extract profile hints from documents
|
||||
_update(check_id, "Profil wird aus Dokumenten extrahiert...")
|
||||
# Step 4: Extract profile hints from documents (92-95%)
|
||||
_update(check_id, "Profil wird aus Dokumenten extrahiert...", 93)
|
||||
from compliance.services.profile_extractor import extract_profile_from_documents
|
||||
extracted_profile = extract_profile_from_documents(doc_texts, profile_dict)
|
||||
|
||||
@@ -326,21 +334,32 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
else:
|
||||
r.scenario = "import"
|
||||
|
||||
# Step 5: Build report with management summary
|
||||
_update(check_id, "Report wird erstellt...")
|
||||
from .agent_doc_check_report import build_management_summary
|
||||
# Step 5: Build report with management summary (95-98%)
|
||||
_update(check_id, "Report wird erstellt...", 96)
|
||||
from .agent_doc_check_report import (
|
||||
build_management_summary,
|
||||
build_scanned_urls_html,
|
||||
build_provider_list_html,
|
||||
)
|
||||
summary_html = build_management_summary(results)
|
||||
scanned_html = build_scanned_urls_html(doc_entries)
|
||||
providers_html = build_provider_list_html(banner_result, vvt_entries)
|
||||
report_html = build_html_report(results, None)
|
||||
profile_html = _build_profile_html(profile)
|
||||
full_html = summary_html + profile_html + report_html
|
||||
|
||||
# Step 6: Send email — include website/company name in subject
|
||||
doc_count = len([r for r in results if not r.error])
|
||||
site_name = (
|
||||
extracted_profile.get("company_profile", {}).get("companyName")
|
||||
or _extract_domain(doc_entries)
|
||||
or "Unbekannt"
|
||||
full_html = (
|
||||
summary_html + scanned_html + profile_html
|
||||
+ providers_html + report_html
|
||||
)
|
||||
|
||||
# Step 6: Send email — derive site name primarily from entered URL.
|
||||
# The extracted_profile.companyName is often noisy (e.g. picks up
|
||||
# juris.de from legal references). Domain-derived name is more
|
||||
# predictable for the GF email subject.
|
||||
doc_count = len([r for r in results if not r.error])
|
||||
url_company = _company_name_from_url(doc_entries)
|
||||
domain = _extract_domain(doc_entries)
|
||||
site_name = url_company or domain or "Unbekannt"
|
||||
_update(check_id, "E-Mail wird versendet...", 98)
|
||||
email_result = send_email(
|
||||
recipient=req.recipient,
|
||||
subject=f"[COMPLIANCE-CHECK] {site_name} — {doc_count} Dokumente geprueft",
|
||||
@@ -368,6 +387,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
_compliance_check_jobs[check_id]["status"] = "completed"
|
||||
_compliance_check_jobs[check_id]["result"] = response
|
||||
_compliance_check_jobs[check_id]["progress"] = "Fertig"
|
||||
_compliance_check_jobs[check_id]["progress_pct"] = 100
|
||||
|
||||
except Exception as e:
|
||||
logger.error("Compliance check %s failed: %s", check_id, e, exc_info=True)
|
||||
@@ -375,8 +395,11 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
||||
_compliance_check_jobs[check_id]["error"] = str(e)[:500]
|
||||
|
||||
|
||||
def _update(check_id: str, msg: str, pct: int | None = None):
    """Store a progress message, and optionally a percentage, on a job.

    Args:
        check_id: Key of the job in the module-level job registry.
        msg: Human-readable status text written to the job's "progress".
        pct: Optional completion percentage. When given it is converted to
            int and clamped to the range 0..100 before being written to
            "progress_pct"; when None the stored percentage is left alone.

    Raises:
        KeyError: If no job is registered under ``check_id``.
    """
    state = _compliance_check_jobs[check_id]
    state["progress"] = msg
    if pct is None:
        return

    # Clamp so a miscalculated phase value can never leave the 0-100 range.
    value = int(pct)
    if value < 0:
        value = 0
    elif value > 100:
        value = 100
    state["progress_pct"] = value
|
||||
|
||||
|
||||
async def _fetch_text(url: str) -> str:
|
||||
@@ -503,14 +526,59 @@ async def _check_single(
|
||||
)
|
||||
|
||||
|
||||
# Two-label public suffixes ("example.co.uk" -> registrable label "example").
# This is a hand-maintained subset, not the full Public Suffix List; it covers
# the suffixes most likely to show up in customer URLs. Without an entry here
# the company-name heuristic would pick the "co"/"com" label instead of the
# actual company label.
_COMPOUND_TLDS = {
    # United Kingdom
    "co.uk", "org.uk", "ac.uk", "gov.uk",
    # Austria (relevant for DACH customers)
    "co.at", "or.at",
    # Asia / Pacific
    "co.jp", "co.nz", "co.kr", "co.in", "co.id", "co.th",
    "com.au", "com.sg", "com.cn", "com.hk", "com.tw",
    # Africa / Middle East
    "co.za", "co.il",
    # Americas / other
    "com.br", "com.mx", "com.ar", "com.tr", "com.pl",
}
|
||||
|
||||
|
||||
def _extract_domain(doc_entries: list[dict]) -> str | None:
|
||||
"""Extract domain name from first URL for email subject."""
|
||||
"""Extract base domain (without www) from first URL."""
|
||||
for entry in doc_entries:
|
||||
url = entry.get("url", "")
|
||||
if url and "://" in url:
|
||||
from urllib.parse import urlparse
|
||||
host = urlparse(url).netloc
|
||||
return host.replace("www.", "") if host else None
|
||||
host = urlparse(url).netloc.lower()
|
||||
if host.startswith("www."):
|
||||
host = host[4:]
|
||||
return host or None
|
||||
return None
|
||||
|
||||
|
||||
def _company_name_from_url(doc_entries: list[dict]) -> str | None:
|
||||
"""Derive a display company name from the entered URLs.
|
||||
|
||||
Heuristic: take the second-level domain (e.g. "bmw" from "www.bmw.de"),
|
||||
uppercase short acronyms (<=4 chars, no hyphens), title-case the rest.
|
||||
|
||||
Examples:
|
||||
www.bmw.de -> BMW
|
||||
mercedes-benz.de -> Mercedes-Benz
|
||||
shop.example.co.uk -> Example
|
||||
juris.de -> Juris
|
||||
"""
|
||||
from urllib.parse import urlparse
|
||||
|
||||
for entry in doc_entries:
|
||||
url = entry.get("url", "")
|
||||
if not url or "://" not in url:
|
||||
continue
|
||||
host = urlparse(url).netloc.lower()
|
||||
if host.startswith("www."):
|
||||
host = host[4:]
|
||||
parts = host.split(".")
|
||||
if len(parts) < 2:
|
||||
continue
|
||||
# Handle compound TLDs (.co.uk etc.)
|
||||
if len(parts) >= 3 and ".".join(parts[-2:]) in _COMPOUND_TLDS:
|
||||
sld = parts[-3]
|
||||
else:
|
||||
sld = parts[-2]
|
||||
if not sld:
|
||||
continue
|
||||
if len(sld) <= 4 and "-" not in sld:
|
||||
return sld.upper()
|
||||
return "-".join(p.capitalize() for p in sld.split("-"))
|
||||
return None
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user