feat(agent): progress_pct + 6 BMW-Run-Verbesserungen
Backend (agent_compliance_check_routes.py):
- progress_pct (0-100%) im Job-State, ueber alle Phasen verteilt
(Laden 0-30, Profil 35-40, Pruefen 40-80, Banner 80-92, Report 95-100)
- Status-Texte vereinheitlicht ("Texte laden X/N", "Pruefen X/N")
- Firmenname fuer Email-Subject jetzt aus URL abgeleitet
(bmw.de -> "BMW", mercedes-benz.de -> "Mercedes-Benz") statt
unzuverlaessigem extracted_profile.companyName (matchte oft juris.de)
- E-Mail-Report enthaelt jetzt Banner+TCF-Vendor-Liste (build_provider_list_html)
Backend (agent_doc_check_extras.py — neu):
- build_scanned_urls_html: gepruefte URLs als Tabelle oben im Report
(transparent fuer GF, welche Quellen wirklich gezogen wurden)
- Cross-Domain-Hinweis bei >1 netloc (BMW: bmw.de / bmwgroup.com /
bmwgroup.jobs — Auffindbarkeit nach Art. 12 DSGVO)
- build_provider_list_html: Banner-Box + TCF-Vendor-Tabelle mit Spalten
Name | Kategorie | Zweck | Drittland | Rechtsgrundlage
Backend (business_profiler.py):
- §34d-GewO Versicherungsvermittler-Hinweise zaehlen nicht mehr als
"finance"-Industrie (BMW wurde dadurch falsch als B2B/finance erkannt)
- Neue Industry "automotive" (Fahrzeug/KFZ/Konfigurator/Modellpalette)
- B2B-Keywords: generische Begriffe wie "unternehmen", "beratung",
"consulting" entfernt (matchten in jedem Konzerntext)
- B2C-Fallback: bei Verbraucher-Signalen ("verbraucher", "widerruf", "kunde",
redaktioneller Inhalt) wird jetzt b2c statt b2b angenommen
Frontend (ComplianceCheckTab.tsx):
- Progress-Balken mit Width-% und XX%-Anzeige rechts
- liest data.progress_pct aus Polling-Response
Consent-Tester (dsi_discovery.py):
- Kritischer Fix der Cookie-Policy-Extraktion: wait_for_function wartet, bis
body.innerText laenger als 500 Zeichen ist (BMW SPA-Rendering brauchte mehr Zeit)
- _extract_text_robust: 3-Strategien-Extraktion (Selektoren -> Body-
Cleanup -> P/LI/TD-Tags)
- _extract_text_from_iframes: liest OneTrust/Sourcepoint/Usercentrics
Iframe-Inhalte (manche Cookie-Policies leben dort)
Adressiert alle Findings aus dem BMW-Ground-Truth-Vergleich.
This commit is contained in:
@@ -39,10 +39,13 @@ _B2C_KEYWORDS = [
|
||||
]
|
||||
|
||||
_B2B_KEYWORDS = [
|
||||
"unternehmen", "geschaeftskunden", "geschäftskunden", "gewerblich",
|
||||
"auftraggeber", "auftragnehmer", "geschaeftspartner",
|
||||
"geschäftspartner", "firmenkunde", "b2b", "industriekunden",
|
||||
"beratung", "consulting", "dienstleistung", "engineering",
|
||||
# Discriminative — these don't appear in B2C consumer texts
|
||||
"geschaeftskunden", "geschäftskunden", "firmenkunde", "b2b",
|
||||
"industriekunden", "ausschliesslich gewerblich", "ausschließlich gewerblich",
|
||||
"ausschliesslich unternehmer", "ausschließlich unternehmer",
|
||||
"kein verbrauchergeschaeft", "kein verbrauchergeschäft",
|
||||
# Note: "unternehmen", "beratung", "consulting", "dienstleistung"
|
||||
# were removed — they match in any company text and bias toward B2B.
|
||||
]
|
||||
|
||||
_B2G_KEYWORDS = [
|
||||
@@ -116,9 +119,20 @@ _INDUSTRY_KEYWORDS = {
|
||||
"arbeitssicherheit", "brandschutz", "sicherheitstechnik", "zertifizierung"],
|
||||
"manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer",
|
||||
"werkzeugbau", "spritzguss", "cnc", "industrietechnik"],
|
||||
"automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen",
|
||||
"gebrauchtwagen", "konfigurator", "modellreihe", "modellpalette"],
|
||||
"media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
|
||||
}
|
||||
|
||||
# Terms that indicate "versicherung" / "bank" is only mentioned as a
|
||||
# §34d/§34c GewO disclosure (Versicherungsvermittler / Finanzanlagenvermittler)
|
||||
# rather than the core business. Used to suppress false finance matches.
|
||||
_VERMITTLER_CONTEXT_TERMS = [
|
||||
"versicherungsvermittler", "berufshaftpflichtversicherung",
|
||||
"vermittlerregister", "§34d", "§ 34 d", "§34c", "§ 34 c",
|
||||
"finanzanlagenvermittler", "ihk muenchen", "ihk münchen",
|
||||
]
|
||||
|
||||
_TRACKING_SERVICES = {
|
||||
"google analytics": "Google Analytics",
|
||||
"google tag manager": "Google Tag Manager",
|
||||
@@ -231,13 +245,23 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
|
||||
total = sum(max(0, v) for v in scores.values())
|
||||
profile.confidence = round(best_val / total, 2) if total > 0 else 0.5
|
||||
else:
|
||||
# Fallback: GmbH/AG without B2C signals → assume B2B
|
||||
has_company = any(kw in full_text for kw in [
|
||||
"gmbh", "ag ", "ohg", "kg ", "ug ", "gbr",
|
||||
])
|
||||
if has_company and b2c_score <= 0:
|
||||
# Fallback: prefer B2C when the text mentions Verbraucherrechte,
|
||||
# editorial content, or consumer-direction signals — even without
|
||||
# checkout keywords. Only fall back to B2B if discriminative B2B
|
||||
# markers fired (which the keyword list above already filtered to
|
||||
# genuinely B2B-only terms).
|
||||
consumer_hint = (
|
||||
"verbraucher" in full_text
|
||||
or "widerruf" in full_text
|
||||
or "kunde" in full_text
|
||||
or profile.has_editorial_content
|
||||
)
|
||||
if b2b_score >= 1 and not consumer_hint:
|
||||
profile.business_type = "b2b"
|
||||
profile.confidence = 0.4
|
||||
elif consumer_hint:
|
||||
profile.business_type = "b2c"
|
||||
profile.confidence = 0.4
|
||||
else:
|
||||
profile.business_type = "unknown"
|
||||
profile.confidence = 0.2
|
||||
@@ -255,8 +279,31 @@ async def detect_business_profile(documents: dict[str, str]) -> BusinessProfile:
|
||||
if hits >= 1:
|
||||
industry_scores[industry] = hits
|
||||
|
||||
# Suppress finance/insurance false positives caused by §34d/§34c GewO
|
||||
# disclosures (Versicherungsvermittler, Berufshaftpflicht, etc.) — these
|
||||
# are mandatory disclosures (Pflichtangaben) for many companies (e.g. BMW AG) without being
|
||||
# actual financial services providers.
|
||||
if industry_scores.get("finance"):
|
||||
vermittler_hits = _count_hits(full_text, _VERMITTLER_CONTEXT_TERMS)
|
||||
if vermittler_hits >= 2:
|
||||
# §34d/§34c boilerplate is present — drop "finance" if nothing else supports it, otherwise shrink it to the non-insurance hits.
|
||||
non_insurance_finance = _count_hits(
|
||||
full_text, ["bank", "finanz", "kredit", "anlage"],
|
||||
)
|
||||
if non_insurance_finance == 0:
|
||||
industry_scores.pop("finance", None)
|
||||
else:
|
||||
industry_scores["finance"] = non_insurance_finance
|
||||
|
||||
# Require a clear winner — if top score is 1 and there are ties, prefer
|
||||
# "unknown" over guessing.
|
||||
if industry_scores:
|
||||
profile.industry = max(industry_scores, key=industry_scores.get) # type: ignore[arg-type]
|
||||
top = max(industry_scores.values())
|
||||
winners = [k for k, v in industry_scores.items() if v == top]
|
||||
if top >= 2 or len(winners) == 1:
|
||||
profile.industry = winners[0]
|
||||
else:
|
||||
profile.industry = "unknown"
|
||||
elif profile.is_regulated_profession:
|
||||
prof_map = {"anwalt": "legal", "arzt": "healthcare",
|
||||
"steuerberater": "finance", "architekt": "craft"}
|
||||
|
||||
Reference in New Issue
Block a user