fix(compliance-check): always render 8 doc types + 4 BMW GT-gap fixes
Always-show-8 (user-requested): - agent_compliance_check_routes.py: _pad_results_with_missing pads the results list to always include all 8 canonical doc_types in canonical order. Missing types get a placeholder DocCheckResult whose error starts with 'Nicht eingereicht' and whose scenario is 'missing'. - agent_doc_check_report.py: NICHT EINGEREICHT status label (neutral), friendly grey body block instead of red error. - ChecklistView.tsx: 'Nicht eingereicht' chip (neutral grey, not red 'Fehler'); SCENARIO_LABELS adds a 'missing' entry + header chip counter. Impressum-Regression fix (#18): - _fetch_text(url, doc_type): cookie/dse/social_media (including the dse aliases datenschutz/privacy) -> max_documents=1 (CMP capture is authoritative, sub-pages dilute the text). All other types -> max_documents=3 (Impressum needs the Versicherungsvermittler, Aufsicht and Berufsrecht sub-pages). The 15s networkidle bail keeps timing safe. ODR/Verbraucherstreitbeilegung filter (#19): - _apply_profile_filter: when profile.needs_odr=True (B2C) and the check failed, override the check's default B2B-oriented hint with action-oriented B2C guidance pointing at Art. 14 EU-VO 524/2013 + §36 VSBG. Previously the report contradicted itself: 'profile says B2C' + hint 'only relevant for B2C online vendors'. Registergericht regex (#20): - impressum_checks.py: accept colon/dot/dash/comma between keyword and city (BMW writes 'registergericht: münchen hrb 42243'). Add 'sitz und registergericht: X' as a separate pattern. Industry detection (#21): - business_profiler.py: 'automotive' keywords broadened (antriebs, motor, leasing, werkstatt, probefahrt, plus brand names BMW/Mercedes/Audi/Volkswagen/Porsche/Opel). 'it_services' keywords narrowed — software/cloud/hosting are mentioned in every privacy policy and were biasing the result toward IT for any tech-aware company.
This commit is contained in:
@@ -31,6 +31,7 @@ const SCENARIO_LABELS: Record<string, { label: string; color: string; bg: string
|
|||||||
regenerate: { label: 'Neugenerierung', color: 'text-red-700', bg: 'bg-red-100' },
|
regenerate: { label: 'Neugenerierung', color: 'text-red-700', bg: 'bg-red-100' },
|
||||||
fix: { label: 'Korrekturen', color: 'text-amber-700', bg: 'bg-amber-100' },
|
fix: { label: 'Korrekturen', color: 'text-amber-700', bg: 'bg-amber-100' },
|
||||||
import: { label: 'Konform', color: 'text-green-700', bg: 'bg-green-100' },
|
import: { label: 'Konform', color: 'text-green-700', bg: 'bg-green-100' },
|
||||||
|
missing: { label: 'Fehlt', color: 'text-gray-600', bg: 'bg-gray-100' },
|
||||||
}
|
}
|
||||||
|
|
||||||
const DOC_TYPE_LABELS: Record<string, string> = {
|
const DOC_TYPE_LABELS: Record<string, string> = {
|
||||||
@@ -102,6 +103,7 @@ export function ChecklistView({ results }: { results: DocResult[] }) {
|
|||||||
regenerate: results.filter(r => r.scenario === 'regenerate').length,
|
regenerate: results.filter(r => r.scenario === 'regenerate').length,
|
||||||
fix: results.filter(r => r.scenario === 'fix').length,
|
fix: results.filter(r => r.scenario === 'fix').length,
|
||||||
import: results.filter(r => r.scenario === 'import').length,
|
import: results.filter(r => r.scenario === 'import').length,
|
||||||
|
missing: results.filter(r => r.scenario === 'missing').length,
|
||||||
}
|
}
|
||||||
|
|
||||||
return (
|
return (
|
||||||
@@ -114,6 +116,7 @@ export function ChecklistView({ results }: { results: DocResult[] }) {
|
|||||||
{scenarioCounts.import > 0 && <span className="bg-green-100 text-green-700 px-2 py-0.5 rounded-full">{scenarioCounts.import} konform</span>}
|
{scenarioCounts.import > 0 && <span className="bg-green-100 text-green-700 px-2 py-0.5 rounded-full">{scenarioCounts.import} konform</span>}
|
||||||
{scenarioCounts.fix > 0 && <span className="bg-amber-100 text-amber-700 px-2 py-0.5 rounded-full">{scenarioCounts.fix} Korrekturen</span>}
|
{scenarioCounts.fix > 0 && <span className="bg-amber-100 text-amber-700 px-2 py-0.5 rounded-full">{scenarioCounts.fix} Korrekturen</span>}
|
||||||
{scenarioCounts.regenerate > 0 && <span className="bg-red-100 text-red-700 px-2 py-0.5 rounded-full">{scenarioCounts.regenerate} Neugenerierung</span>}
|
{scenarioCounts.regenerate > 0 && <span className="bg-red-100 text-red-700 px-2 py-0.5 rounded-full">{scenarioCounts.regenerate} Neugenerierung</span>}
|
||||||
|
{scenarioCounts.missing > 0 && <span className="bg-gray-100 text-gray-600 px-2 py-0.5 rounded-full">{scenarioCounts.missing} fehlt</span>}
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
@@ -164,7 +167,11 @@ export function ChecklistView({ results }: { results: DocResult[] }) {
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div className="flex items-center gap-3 shrink-0 ml-3">
|
<div className="flex items-center gap-3 shrink-0 ml-3">
|
||||||
{r.error ? (
|
{r.error && r.error.startsWith("Nicht eingereicht") ? (
|
||||||
|
<span className="text-xs text-gray-500 font-medium px-2 py-0.5 bg-gray-100 rounded-full whitespace-nowrap">
|
||||||
|
Nicht eingereicht
|
||||||
|
</span>
|
||||||
|
) : r.error ? (
|
||||||
<span className="text-xs text-red-600 font-medium">Fehler</span>
|
<span className="text-xs text-red-600 font-medium">Fehler</span>
|
||||||
) : (
|
) : (
|
||||||
<div className="flex flex-col gap-1">
|
<div className="flex flex-col gap-1">
|
||||||
|
|||||||
@@ -176,7 +176,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
if url_key in url_text_cache:
|
if url_key in url_text_cache:
|
||||||
text = url_text_cache[url_key]
|
text = url_text_cache[url_key]
|
||||||
else:
|
else:
|
||||||
text = await _fetch_text(doc.url)
|
text = await _fetch_text(doc.url, doc_type=doc.doc_type)
|
||||||
if text:
|
if text:
|
||||||
url_text_cache[url_key] = text
|
url_text_cache[url_key] = text
|
||||||
if text:
|
if text:
|
||||||
@@ -334,6 +334,11 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
else:
|
else:
|
||||||
r.scenario = "import"
|
r.scenario = "import"
|
||||||
|
|
||||||
|
# Step 4c: Always render all 8 canonical doc types, even when the
|
||||||
|
# user left a row blank. Missing types get a placeholder so the
|
||||||
|
# email + frontend make absent documents immediately visible.
|
||||||
|
results = _pad_results_with_missing(results)
|
||||||
|
|
||||||
# Step 5: Build report with management summary (95-98%)
|
# Step 5: Build report with management summary (95-98%)
|
||||||
_update(check_id, "Report wird erstellt...", 96)
|
_update(check_id, "Report wird erstellt...", 96)
|
||||||
from .agent_doc_check_report import (
|
from .agent_doc_check_report import (
|
||||||
@@ -402,23 +407,31 @@ def _update(check_id: str, msg: str, pct: int | None = None):
|
|||||||
job["progress_pct"] = max(0, min(100, int(pct)))
|
job["progress_pct"] = max(0, min(100, int(pct)))
|
||||||
|
|
||||||
|
|
||||||
async def _fetch_text(url: str) -> str:
|
async def _fetch_text(url: str, doc_type: str = "") -> str:
|
||||||
"""Fetch text from URL via consent-tester, with HTTP fallback.
|
"""Fetch text from URL via consent-tester, with HTTP fallback.
|
||||||
|
|
||||||
1. Try consent-tester (Playwright) — handles JS-heavy SPAs
|
1. Try consent-tester (Playwright) — handles JS-heavy SPAs
|
||||||
2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages
|
2. Fallback: direct HTTP fetch + HTML strip — fast, works for SSR pages
|
||||||
|
|
||||||
|
doc_type controls how aggressively we follow sub-links — cookie/dse
|
||||||
|
pages prefer self-extract only (CMP capture is authoritative); legal/
|
||||||
|
imprint pages need to follow sub-pages (Versicherungsvermittler etc).
|
||||||
"""
|
"""
|
||||||
# 1. Consent-tester (Playwright-based, full JS rendering).
|
# 1. Consent-tester (Playwright-based, full JS rendering).
|
||||||
# max_documents=1: for a *specific* user-entered URL (cookie, impressum,
|
# max_documents depends on doc_type:
|
||||||
# privacy) we only want the self-extracted text of THAT page. Following
|
# - cookie/dse/social_media: self-extract (often + CMP capture) is
|
||||||
# sub-links was triggering 4x networkidle timeouts (~240s) and made the
|
# authoritative, sub-pages dilute the policy text. max=1.
|
||||||
# backend httpx call time out, dropping us to the raw HTTP fallback
|
# - impressum/agb/widerruf/nutzungsbedingungen/dsb: BMW & similar
|
||||||
# which returned site navigation as garbage text.
|
# enterprise sites split this across 3-4 short sub-pages
|
||||||
|
# (Versicherungsvermittler, Aufsicht, Berufsrecht). max=3 follows
|
||||||
|
# them. The 15s networkidle bail (dsi_helpers) keeps timing safe.
|
||||||
|
short_extract_types = {"cookie", "dse", "datenschutz", "privacy", "social_media"}
|
||||||
|
max_docs = 1 if (doc_type or "") in short_extract_types else 3
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=120.0) as client:
|
async with httpx.AsyncClient(timeout=120.0) as client:
|
||||||
resp = await client.post(
|
resp = await client.post(
|
||||||
f"{CONSENT_TESTER_URL}/dsi-discovery",
|
f"{CONSENT_TESTER_URL}/dsi-discovery",
|
||||||
json={"url": url, "max_documents": 1},
|
json={"url": url, "max_documents": max_docs},
|
||||||
timeout=120.0,
|
timeout=120.0,
|
||||||
)
|
)
|
||||||
if resp.status_code == 200:
|
if resp.status_code == 200:
|
||||||
@@ -531,6 +544,50 @@ async def _check_single(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _pad_results_with_missing(results: list) -> list:
    """Return ``results`` padded so every canonical doc_type has an entry.

    The output follows the order of ``_ALL_DOC_TYPES`` so the report
    layout is stable. A canonical doc_type without any submitted result
    gets a placeholder ``DocCheckResult`` (``scenario="missing"`` and an
    error starting with 'Nicht eingereicht'), which makes absent
    documents visible at a glance in the email + frontend.

    Args:
        results: Check results in submission order. Each item must expose
            a ``doc_type`` attribute.

    Returns:
        A new list: canonical types first (all submitted results for a
        type in submission order, or one placeholder), followed by any
        results whose type is not in ``_ALL_DOC_TYPES``.
    """
    from .agent_doc_check_routes import DocCheckResult

    def _canonical(doc_type: str) -> str:
        # Alias types (datenschutz/privacy) share the canonical "dse" slot.
        return "dse" if doc_type in ("datenschutz", "privacy") else doc_type

    # Group by canonical type and keep *every* result. A plain
    # ``by_type[canon] = r`` would silently drop earlier results whenever
    # two submissions map to the same canonical type (for example one
    # 'datenschutz' and one 'dse' document).
    by_type: dict[str, list] = {}
    for r in results:
        by_type.setdefault(_canonical(r.doc_type), []).append(r)

    ordered: list = []
    for dt in _ALL_DOC_TYPES:
        submitted = by_type.get(dt)
        if submitted:
            ordered.extend(submitted)
            continue
        ordered.append(DocCheckResult(
            label=_doc_type_label(dt),
            url="",
            doc_type=dt,
            word_count=0,
            completeness_pct=0,
            correctness_pct=0,
            checks=[],
            findings_count=0,
            error="Nicht eingereicht — Quelle nicht angegeben",
            scenario="missing",
        ))

    # Append any results not in _ALL_DOC_TYPES (e.g. avv, dsfa) at the
    # end, preserving their submission order.
    ordered.extend(
        r for r in results if _canonical(r.doc_type) not in _ALL_DOC_TYPES
    )
    return ordered
|
||||||
|
|
||||||
|
|
||||||
_COMPOUND_TLDS = {
|
_COMPOUND_TLDS = {
|
||||||
"co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
|
"co.uk", "co.jp", "co.nz", "co.kr", "co.za", "co.in",
|
||||||
"com.au", "com.br", "com.mx", "com.tr", "com.sg",
|
"com.au", "com.br", "com.mx", "com.tr", "com.sg",
|
||||||
@@ -603,9 +660,21 @@ def _apply_profile_filter(result, profile, doc_type: str):
|
|||||||
for check in result.checks:
|
for check in result.checks:
|
||||||
cid = check.id.lower()
|
cid = check.id.lower()
|
||||||
|
|
||||||
# ODR/OS-Link only relevant for B2C online shops
|
# ODR/OS-Link: relevant ONLY for B2C online shops. The check's
|
||||||
|
# default hint is written for B2B (it explains why it's not
|
||||||
|
# relevant) — for B2C we must replace it with action-oriented
|
||||||
|
# guidance, otherwise the report contradicts itself.
|
||||||
if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower():
|
if "odr" in cid or "os-link" in cid or "streitbeilegung" in check.label.lower():
|
||||||
if not profile.needs_odr:
|
if profile.needs_odr:
|
||||||
|
if not check.passed:
|
||||||
|
check.hint = (
|
||||||
|
"Als B2C-Anbieter muessen Sie nach Art. 14 EU-VO 524/2013 "
|
||||||
|
"auf die OS-Plattform (https://ec.europa.eu/consumers/odr) "
|
||||||
|
"verlinken — klickbarer Link, nicht nur Text. Zusaetzlich "
|
||||||
|
"§36 VSBG: angeben, ob Sie an Verbraucher-"
|
||||||
|
"Streitbeilegungsverfahren teilnehmen (oder nicht)."
|
||||||
|
)
|
||||||
|
else:
|
||||||
check.skipped = True
|
check.skipped = True
|
||||||
check.hint = "Nicht relevant (kein B2C Online-Shop)"
|
check.hint = "Nicht relevant (kein B2C Online-Shop)"
|
||||||
|
|
||||||
@@ -643,8 +712,19 @@ _DOC_TYPE_LABELS = {
|
|||||||
"loeschkonzept": "Loeschkonzept",
|
"loeschkonzept": "Loeschkonzept",
|
||||||
"dsfa": "Datenschutz-Folgenabschaetzung",
|
"dsfa": "Datenschutz-Folgenabschaetzung",
|
||||||
"social_media": "Social Media Datenschutz",
|
"social_media": "Social Media Datenschutz",
|
||||||
|
"nutzungsbedingungen": "Nutzungsbedingungen",
|
||||||
|
"dsb": "DSB-Kontakt",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Canonical 8 doc types in the same order as the frontend ComplianceCheckTab.
|
||||||
|
# The route pads `results` to always contain an entry for each — even if
|
||||||
|
# the user did not submit a URL — so the email + frontend always show
|
||||||
|
# the complete checklist (missing rows marked as 'Nicht eingereicht').
|
||||||
|
_ALL_DOC_TYPES = [
|
||||||
|
"dse", "impressum", "social_media", "cookie",
|
||||||
|
"agb", "nutzungsbedingungen", "widerruf", "dsb",
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
def _doc_type_label(doc_type: str) -> str:
    """Return the display label for ``doc_type``.

    Types without an entry in ``_DOC_TYPE_LABELS`` fall back to the
    upper-cased type key.
    """
    fallback = doc_type.upper()
    return _DOC_TYPE_LABELS.get(doc_type, fallback)
|
||||||
|
|||||||
@@ -184,7 +184,10 @@ def _render_document(html: list[str], r: DocCheckResult) -> None:
|
|||||||
cpct = r.correctness_pct
|
cpct = r.correctness_pct
|
||||||
bar_color = "green" if pct >= 80 else "yellow" if pct >= 50 else "red"
|
bar_color = "green" if pct >= 80 else "yellow" if pct >= 50 else "red"
|
||||||
status_label = "OK" if pct == 100 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT"
|
status_label = "OK" if pct == 100 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT"
|
||||||
if r.error:
|
is_missing = bool(r.error) and r.error.startswith("Nicht eingereicht")
|
||||||
|
if is_missing:
|
||||||
|
status_label = "NICHT EINGEREICHT"
|
||||||
|
elif r.error:
|
||||||
status_label = "FEHLER"
|
status_label = "FEHLER"
|
||||||
|
|
||||||
l1_checks = [c for c in r.checks if c.level == 1]
|
l1_checks = [c for c in r.checks if c.level == 1]
|
||||||
@@ -216,7 +219,16 @@ def _render_document(html: list[str], r: DocCheckResult) -> None:
|
|||||||
html.append('</div></div></div>')
|
html.append('</div></div></div>')
|
||||||
|
|
||||||
# Body
|
# Body
|
||||||
if r.error:
|
if is_missing:
|
||||||
|
html.append(
|
||||||
|
'<div style="padding:12px 16px;color:#6b7280;font-size:12px;'
|
||||||
|
'background:#fafafa;border-top:1px solid #f3f4f6">'
|
||||||
|
'Keine URL oder Text fuer dieses Dokument angegeben. '
|
||||||
|
'Tragen Sie die Quelle im Compliance-Check Formular nach, '
|
||||||
|
'um diese Pflichtangabe zu pruefen.'
|
||||||
|
'</div>'
|
||||||
|
)
|
||||||
|
elif r.error:
|
||||||
html.append(f'<div style="padding:12px 16px;color:#991b1b">{r.error}</div>')
|
html.append(f'<div style="padding:12px 16px;color:#991b1b">{r.error}</div>')
|
||||||
else:
|
else:
|
||||||
html.append('<div style="padding:8px 16px 12px">')
|
html.append('<div style="padding:8px 16px 12px">')
|
||||||
|
|||||||
@@ -107,7 +107,13 @@ _EDITORIAL_KEYWORDS = [
|
|||||||
]
|
]
|
||||||
|
|
||||||
_INDUSTRY_KEYWORDS = {
|
_INDUSTRY_KEYWORDS = {
|
||||||
"it_services": ["software", "saas", "cloud", "hosting", "api", "plattform"],
|
# "software/cloud/hosting" are often mentioned in privacy texts of any
|
||||||
|
# vendor (Cloud-Hosting fuer Newsletter, SaaS-Tools etc.) without making
|
||||||
|
# the company an IT-services vendor itself. Keep the list deliberately
|
||||||
|
# narrow: only patterns that strongly suggest IT/SaaS as the core business.
|
||||||
|
"it_services": ["saas-anbieter", "software-as-a-service",
|
||||||
|
"ihr saas", "ihre cloud", "hosting-provider",
|
||||||
|
"api-anbieter", "developer-portal"],
|
||||||
"retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
|
"retail": ["shop", "warenkorb", "versand", "lieferung", "einzelhandel"],
|
||||||
"healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
|
"healthcare": ["arzt", "praxis", "patient", "gesundheit", "therapie", "klinik"],
|
||||||
"legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
|
"legal": ["kanzlei", "rechtsanwalt", "mandant", "anwalt"],
|
||||||
@@ -120,7 +126,11 @@ _INDUSTRY_KEYWORDS = {
|
|||||||
"manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer",
|
"manufacturing": ["fertigung", "produktion", "maschinenbau", "anlagenbau", "zulieferer",
|
||||||
"werkzeugbau", "spritzguss", "cnc", "industrietechnik"],
|
"werkzeugbau", "spritzguss", "cnc", "industrietechnik"],
|
||||||
"automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen",
|
"automotive": ["fahrzeug", "kraftfahrzeug", "kfz", "automobil", "neuwagen",
|
||||||
"gebrauchtwagen", "konfigurator", "modellreihe", "modellpalette"],
|
"gebrauchtwagen", "fahrzeugempfehlung", "modellreihe",
|
||||||
|
"modellpalette", "antriebs", "motor", "reifen", "elektroauto",
|
||||||
|
"verbrenner", "hybridfahrzeug", "leasing", "werkstatt",
|
||||||
|
"wartung und reparatur", "probefahrt", "bmw", "mercedes",
|
||||||
|
"audi", "volkswagen", "porsche", "opel"],
|
||||||
"media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
|
"media": ["redaktion", "verlag", "medien", "journalismus", "presse"],
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -111,9 +111,16 @@ IMPRESSUM_CHECKLIST = [
|
|||||||
"label": "Registergericht benannt (Amtsgericht X)",
|
"label": "Registergericht benannt (Amtsgericht X)",
|
||||||
"level": 2, "parent": "register",
|
"level": 2, "parent": "register",
|
||||||
"patterns": [
|
"patterns": [
|
||||||
r"(?:amtsgericht|registergericht)\s+[a-zA-Z\u00c0-\u017e]\w+",
|
# "Amtsgericht <Stadt>" or "Registergericht <Stadt>"
|
||||||
|
# Allow colon/dot/dash between keyword and city (BMW writes
|
||||||
|
# "registergericht: m\u00fcnchen hrb 42243").
|
||||||
|
r"(?:amtsgericht|registergericht)[\s:\.\-,]+[a-zA-Z\u00c0-\u017e]\w+",
|
||||||
|
# "AG <Stadt>" short form
|
||||||
r"\bag\s+[a-zA-Z\u00c0-\u017e]\w+",
|
r"\bag\s+[a-zA-Z\u00c0-\u017e]\w+",
|
||||||
|
# "Handelsregister AG/Amtsgericht <Stadt>"
|
||||||
r"(?:handelsregister|register)\s+(?:ag|amtsgericht)\s+\w+",
|
r"(?:handelsregister|register)\s+(?:ag|amtsgericht)\s+\w+",
|
||||||
|
# "Sitz und Registergericht: M\u00fcnchen" \u2014 BMW pattern
|
||||||
|
r"sitz\s+und\s+registergericht[\s:\.\-,]+[a-zA-Z\u00c0-\u017e]\w+",
|
||||||
],
|
],
|
||||||
"severity": "LOW",
|
"severity": "LOW",
|
||||||
"hint": "Registergericht benennen (z.B. 'Amtsgericht Freiburg' oder 'AG Freiburg'). Beides ist korrekt.",
|
"hint": "Registergericht benennen (z.B. 'Amtsgericht Freiburg' oder 'AG Freiburg'). Beides ist korrekt.",
|
||||||
|
|||||||
Reference in New Issue
Block a user