fix: 5 regex bugs + text extraction scroll + GT update

Root cause: Spiegel DSI text was truncated (lazy-loading) — the rights/DSB/complaints sections at the bottom were never extracted. Fixes: 1. Text extraction: scroll to bottom before innerText (dsi_discovery.py) 2. V.i.S.d.P.: add "verantwortlicher i.s.v." + "§18 Abs. N MStV" pattern 3. USt-IdNr: add "umsatzsteuer-id" + "DE 212 442 423" (with spaces) 4. Profiler: remove generic "anwalt"/"praxis" (false positive on Spiegel "Redaktionsanwalt"), keep only "rechtsanwalt", "kanzlei" etc. 5. Section splitter: auto_fill_from_dsi() fills empty Cookie/Social-Media rows from sections found in the DSI text Ground Truth 06-spiegel.md fully rewritten with verified data from live website — 3 L1 False Negatives identified (DSB, Beschwerderecht, Betroffenenrechte all present on website but not in extracted text). Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-13 01:20:55 +02:00
parent 8bb90d73e5
commit c702260ec1
6 changed files with 194 additions and 78 deletions
@@ -174,9 +174,14 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
                "word_count": len(text.split()) if text else 0,
            })
-        # Step 1b: If same URL used for multiple doc_types, try section splitting
+        # Step 1b: Section splitting — two cases:
-        from compliance.services.section_splitter import split_shared_texts
+        # 1. Same URL used for multiple doc_types → split by heading
        # 2. DSI text contains Cookie/Social-Media sections → auto-fill empty rows
        from compliance.services.section_splitter import (
            split_shared_texts, auto_fill_from_dsi,
        )
        split_shared_texts(doc_entries, url_text_cache)
        auto_fill_from_dsi(doc_entries)
        # Refresh doc_texts after splitting
        for entry in doc_entries:
            if entry.get("text"):
@@ -59,26 +59,34 @@ _NONPROFIT_KEYWORDS = [
 ]
 _REGULATED_PROFESSIONS = {
    # Anwalt — nur spezifische Begriffe, nicht "anwalt" allein
    # (matcht sonst Redaktionsanwalt, Justiziar etc.)
    "rechtsanwalt": "anwalt",
-    "anwalt": "anwalt",
+    "rechtsanwaeltin": "anwalt",
-    "anwaeltin": "anwalt",
+    "rechtsanwältin": "anwalt",
    "anwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
-    "arzt": "arzt",
+    "zugelassener anwalt": "anwalt",
-    "ärztin": "arzt",
+    # Arzt — "praxis" entfernt (matcht "in der Praxis")
-    "aerztin": "arzt",
+    "arztpraxis": "arzt",
-    "praxis": "arzt",
+    "zahnarzt": "arzt",
    "facharzt": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
    "kassenärztlich": "arzt",
    "kassenaerztlich": "arzt",
    # Steuerberater
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
    # Architekt
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
    # Notar
    "notar": "notar",
    "notariat": "notar",
    # Apotheker
    "apotheke": "apotheker",
    "apotheker": "apotheker",
 }
@@ -135,8 +135,9 @@ IMPRESSUM_CHECKLIST = [
        "label": "USt-IdNr.",
        "level": 1, "parent": None,
        "patterns": [
-            r"ust.*id", r"umsatzsteuer.*identifikation",
+            r"ust[\s.-]*id", r"umsatzsteuer[\s-]*id",
-            r"vat.*id", r"de\s*\d{9}",
+            r"umsatzsteuer.*identifikation",
            r"vat[\s.-]*id", r"de\s*\d{3}\s*\d{3}\s*\d{3}",
        ],
        "severity": "MEDIUM",
        "hint": "§5(1) Nr.6 TMG: Die USt-IdNr. muss angegeben werden, sofern vorhanden. Die Steuernummer ist KEIN Ersatz.",
@@ -146,7 +147,7 @@ IMPRESSUM_CHECKLIST = [
        "label": "USt-IdNr. im Format DE + 9 Ziffern",
        "level": 2, "parent": "vat",
        "patterns": [
-            r"de\s*\d{9}",
+            r"de\s*\d{3}\s*\d{3}\s*\d{3}",
        ],
        "severity": "LOW",
        "hint": "Deutsche USt-IdNr.: 'DE' + exakt 9 Ziffern (z.B. DE123456789). Validierung: https://evatr.bff-online.de/",
@@ -187,7 +188,8 @@ IMPRESSUM_CHECKLIST = [
        "patterns": [
            r"v\.?\s*i\.?\s*s\.?\s*d\.?\s*p",
            r"(?:redaktionell|inhaltlich)\s+verantwortlich",
-            r"§\s*18\s+m(?:edien)?st(?:aat)?v",
+            r"§\s*18\s+(?:abs\.?\s*\d+\s+)?m(?:edien)?st(?:aat)?v",
            r"verantwortlich\w*\s+i\.?\s*s\.?\s*(?:d\.?\s*)?v\.?",
        ],
        "severity": "INFO",
        "hint": "Nur relevant wenn die Website journalistisch-redaktionelle Inhalte hat (Blog, Ratgeber, News, Fachartikel). Reine Unternehmensseiten ohne redaktionelle Inhalte benoetigen keinen V.i.S.d.P. Pruefen Sie, ob die Website einen Blog oder Ratgeber-Bereich hat.",
@@ -168,3 +168,48 @@ def _find_section_for_type(sections: list[dict], doc_type: str) -> str | None:
            return section["text"]
    return None  # No match → keep full text
 def auto_fill_from_dsi(doc_entries: list[dict]) -> None:
    """Auto-fill empty document rows from sections found in the DSI text.
    If the user only entered the DSI URL but left Cookie/Social-Media empty,
    and the DSI text contains those sections, auto-fill them.
    """
    # Find the DSI entry
    dsi_entry = None
    for entry in doc_entries:
        if entry["doc_type"] in ("dse", "datenschutz", "privacy") and entry.get("text"):
            dsi_entry = entry
            break
    if not dsi_entry:
        return
    dsi_text = dsi_entry["text"]
    if len(dsi_text) < 300:
        return
    # Split DSI into sections
    sections = _split_at_headings(dsi_text)
    if not sections:
        return
    # Find empty entries that could be filled from DSI sections
    filled = []
    for entry in doc_entries:
        if entry.get("text") or entry.get("url"):
            continue  # Already has content
        doc_type = entry["doc_type"]
        section_text = _find_section_for_type(sections, doc_type)
        if section_text and len(section_text.split()) >= 30:
            entry["text"] = section_text
            entry["word_count"] = len(section_text.split())
            entry["url"] = f"{dsi_entry.get('url', '')} (Abschnitt)"
            filled.append(doc_type)
    if filled:
        logger.info(
            "Auto-filled %d empty rows from DSI sections: %s",
            len(filled), ", ".join(filled),
        )
@@ -263,6 +263,12 @@ async def discover_dsi_documents(
            is_self_dsi, self_lang = _matches_dsi_keyword(page_title)
        if is_self_dsi:
            try:
                # Scroll to bottom to trigger lazy-loading of full content
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1500)
                await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                await page.wait_for_timeout(1000)
                self_text = await page.evaluate("""() => {
                    const main = document.querySelector('main, article, [role="main"], .content, #content, .bodytext')
                        || document.body;
@@ -2,94 +2,128 @@
 **URL:** https://www.spiegel.de
 **Typ:** Medien / Nachrichtenportal
-**Datum:** 2026-05-12
+**Datum:** 2026-05-13 (verifiziert gegen Live-Texte)
-**Batch-Test:** 6/9 L1, 10/13 L2 (Mangelhaft, 77%)
+**Vorheriger Batch-Test:** 6/9 L1, 10/13 L2 — VERALTET, mehrere False Negatives
 ---
 ## Business Profile (erwartet)
-| Feld | Erwarteter Wert |
+| Feld | Erwarteter Wert | Begruendung |
-|------|----------------|
+|------|----------------|-------------|
-| business_type | b2c |
+| business_type | b2c | Abo-Modell (Spiegel+) |
-| industry | media |
+| industry | media | Nachrichtenportal |
-| has_online_shop | true (Abo-Shop, Spiegel+) |
+| has_online_shop | true | Spiegel+ Abo-Shop |
-| has_editorial_content | true (Kerngeschaeft) |
+| has_editorial_content | true | Kerngeschaeft |
-| is_regulated_profession | false |
+| is_regulated_profession | **false** | Kein regulierter Beruf. "Anwalt" im Text ist Redaktionsanwalt, kein Kanzlei-Beruf |
-| needs_odr | true (B2C Online-Abo) |
+| needs_odr | true | B2C mit Online-Abo |
 **Bug:** Profiler erkennt "anwalt" im Impressum-Text und setzt is_regulated_profession=true. FALSE POSITIVE.
 ---
 ## Dokumente
-| Dokumenttyp | Vorhanden | URL |
+| Dokumenttyp | Vorhanden | URL | Anmerkung |
-|-------------|-----------|-----|
+|-------------|-----------|-----|-----------|
-| DSI | Ja | https://www.spiegel.de/datenschutz |
+| DSI | Ja | https://www.spiegel.de/datenschutz-spiegel | 6461 Woerter, 11 Abschnitte, sehr ausfuehrlich |
-| Impressum | Ja | https://www.spiegel.de/impressum |
+| Impressum | Ja | https://www.spiegel.de/impressum | 2 Gesellschaften (DER SPIEGEL GmbH + SPIEGEL-Verlag) |
-| Cookie-Richtlinie | Ja (ggf. in DSI) | — |
+| Cookie-Richtlinie | In DSI Abschnitt 4 | #funktionsfaehigkeitdesangebots | Sourcepoint CMP |
-| AGB | Ja | https://www.spiegel.de/agb |
+| AGB | Ja | https://www.spiegel.de/agb | Abo-Bedingungen |
-| Widerruf | Ja (in AGB) | — |
+| Nutzungsbedingungen | Ja | https://www.spiegel.de/nutzungsbedingungen | Separates Dokument |
-| Social Media DSE | Nein | — |
+| Widerruf | In AGB Abschnitt 10 | https://www.spiegel.de/agb | "Widerrufsrecht fuer Abonnements" |
-| Nutzungsbedingungen | In AGB | — |
+| Social Media DSE | In DSI Abschnitt 8 | #einbinden-von-drittinhalten | Facebook, YouTube, X, Instagram, TikTok, etc. |
-| DSB-Kontakt | In DSI | — |
+| DSB-Kontakt | In DSI | — | dsb@spiegelgruppe.de |
 **Besonderheit:** Consent-Wall blockiert Zugang ohne Cookie-Zustimmung. Text-Extraktion kann scheitern.
 ---
 ## Erwartete Ergebnisse: DSI (Art. 13 DSGVO)
-### L1 Checks (6/9)
+### L1 Checks (ERWARTET: 9/9 PASS)
-| Check | Erwartet | Begruendung |
+| Check | Erwartet | Beleg | Unser Ergebnis | Bug? |
-|-------|----------|-------------|
+|-------|----------|-------|----------------|------|
-| Verantwortlicher | PASS | SPIEGEL-Verlag |
+| Verantwortlicher | PASS | "DER SPIEGEL GmbH & Co. KG, Ericusspitze 1, 20459 Hamburg" | PASS (3/3) | — |
-| DSB | **FAIL** | **TP** — Kein DSB in der DSI erwaehnt |
+| DSB | **PASS** | "z. Hd. der Datenschutzbeauftragten... dsb@spiegelgruppe.de" | **FAIL** | **FN — Regex matcht "Datenschutzbeauftragte" nicht ohne "r" am Ende oder erkennt Kontext nicht** |
-| Zwecke | PASS | Aufgezaehlt |
+| Zwecke | PASS | Adobe-Tracking, Vertragsbeziehungen, Drittinhalte etc. | PASS | — |
-| Rechtsgrundlage | PASS | Art. 6 |
+| Rechtsgrundlage | PASS | Art. 6(1)(a), (b), (f) explizit | PASS (3/4) | — |
-| Empfaenger | PASS | Werbenetzwerke, Analytics |
+| Empfaenger | PASS | Server-/Applikationsbetreiber, Auftragsverarbeiter | PASS (2/2) | — |
-| Drittlandtransfer | PASS | USA-Transfer |
+| Drittlandtransfer | PASS | SCC erwaehnt | PASS (1/1) | — |
-| Speicherdauer | PASS | Angaben vorhanden |
+| Speicherdauer | PASS | "30 Tage" Protokolldatei | PASS (1/2) | — |
-| Betroffenenrechte | **FAIL** | **TP** — Rechte nicht explizit aufgezaehlt |
+| Betroffenenrechte | **PASS** | Art. 15, 16, 17, 18, 21 explizit. Art. 20 fehlt. | **FAIL** | **FN — Regex verlangt alle 6 Artikel, 5/6 genuegen nicht** |
-| Beschwerderecht | **FAIL** | **TP** — Art. 77 fehlt |
+| Beschwerderecht | **PASS** | "Art. 77 DSGVO... HmbBfDI... Ludwig-Ehrhard-Str. 22" | **FAIL** | **FN — Regex findet Art. 77 + HmbBfDI nicht** |
 **3 False Negatives in L1!** DSB, Betroffenenrechte, Beschwerderecht sind alle vorhanden.
 ### L2 Checks (Stichproben)
-| Check | Erwartet | TP/FP |
+| Check | Erwartet | Beleg | Unser Ergebnis | Bug? |
-|-------|----------|-------|
+|-------|----------|-------|----------------|------|
-| E-Mail | PASS | — |
+| E-Mail | PASS | datenschutz@spiegelgruppe.de | PASS | — |
-| Interessenabwaegung | **FAIL** | **TP** |
+| Interessenabwaegung | FAIL (TP) | Interesse benannt, keine Abwaegung | FAIL | Korrekt |
-| Art. 22 Profiling | **FAIL** | **TP** — Personalisierte Werbung ohne Profiling-Hinweis |
+| Art. 20 Portabilitaet | FAIL (TP) | Art. 20 fehlt im Rechte-Abschnitt | — | Korrekter Finding |
-| Aufsichtsbehoerde | **FAIL** | **TP** |
+| Loeschkonzept | FAIL (TP) | Kein formales Loeschkonzept | FAIL | Korrekt |
 **Verifiziert: Spiegel DSI ist fuer ein grosses Medienunternehmen erstaunlich unvollstaendig.**
 ---
-## Erwartete Ergebnisse: Impressum
+## Erwartete Ergebnisse: Impressum (§5 TMG)
-| Check | Erwartet | Begruendung |
+| Check | Erwartet | Beleg | Unser Ergebnis | Bug? |
-|-------|----------|-------------|
+|-------|----------|-------|----------------|------|
-| Firmenname | PASS | SPIEGEL-Verlag Rudolf Augstein GmbH & Co. KG |
+| Firmenname | PASS | DER SPIEGEL GmbH & Co. KG + SPIEGEL-Verlag | PASS | — |
-| Anschrift | PASS | Ericusspitze 1, 20457 Hamburg |
+| Anschrift | PASS | Ericusspitze 1, 20457 Hamburg | PASS | — |
-| Vertretung | PASS | Geschaeftsfuehrer |
+| Kontakt | PASS | Tel. 040 3007-0, spiegel@spiegel.de | PASS | — |
-| V.i.S.d.P. | PASS | Pflicht bei Medienunternehmen, sollte vorhanden sein |
+| Register | PASS | HRA 123 261 + HRA 61 755 | PASS | — |
-| Registergericht | PASS | AG Hamburg |
+| USt-IdNr | **PASS** | DE 212 442 423 + DE 118 922 410 | **FAIL** | **FN — Regex findet "Umsatzsteuer-ID:" Format nicht** |
-| USt-IdNr | PASS | Vorhanden |
+| Vertretung | PASS | Thomas Hass (Geschaeftsfuehrung) | PASS (1/1) | — |
-| Streitbeilegung | AKTIV | B2C mit Online-Abo |
+| V.i.S.d.P. | **PASS** | "Verantwortlicher i. S. v. § 18 Abs. 2 MStV: Dirk Kurbjuweit" | **FAIL** | **FN — Regex sucht "v.i.s.d.p." nicht "verantwortlicher i.s.v."** |
 | Streitbeilegung | PASS | ODR-Link vorhanden (in AGB) | PASS | — |
 | Berufsrecht | **SKIP** | Spiegel ist kein regulierter Beruf | **AKTIV (1/3)** | **FP — Profiler "anwalt" Bug** |
 ---
 ## Erwartete Ergebnisse: AGB
-| Check | Erwartet |
+| Check | Erwartet | Beleg |
-|-------|----------|
+|-------|----------|-------|
-| Geltungsbereich | PASS |
+| Geltungsbereich | PASS | Abschnitt 1 |
-| Vertragsschluss | PASS |
+| Vertragsschluss | PASS | Abschnitt 2 |
-| Preise | PASS (Abo-Preise) |
+| Preise/Zahlung | PASS | Abschnitte 4-7 |
-| Kuendigungsrecht | PASS |
+| Kuendigung | PASS | Abschnitt 8 (1 Monat Frist) |
-| Widerrufsrecht | PASS (14 Tage) |
+| Widerrufsrecht | PASS | Abschnitt 10 (14 Tage, Muster-Formular) |
-| Haftungsbeschraenkung | PASS |
+| §312k Button | Zu pruefen | Kuendigungsbutton Pflicht seit 01.07.2022 |
 | ODR-Link | PASS | http://ec.europa.eu/consumers/odr/ |
 ---
 ## Erwartete Ergebnisse: Widerrufsbelehrung (AGB §10)
 | Check | Erwartet | Beleg |
 |-------|----------|-------|
 | Belehrung | PASS | "Sie haben das Recht, Abonnementvertraege binnen 14 Tagen ohne Angabe von Gruenden zu widerrufen" |
 | 14-Tage-Frist | PASS | Explizit genannt |
 | Form | PASS | Brief, E-Mail, Fax |
 | Muster-Formular | PASS | "beigefuegte Muster-Widerrufsformular" erwaehnt |
 | Folgen | PASS | Rueckerstattungsregeln beschrieben |
 | Empfaenger | PASS | DER SPIEGEL Abonnentenservice, 20637 Hamburg; aboservice@spiegel.de |
 | Ausnahme digitale Inhalte | PASS | "Fuer sofort nutzbare Zeitzugaenge... kein Widerrufsrecht" |
 **Problem:** Unser Check prueft den DSI-Volltext gegen Widerruf-Checklist statt die AGB. Der Widerruf steht in den AGB (§10), nicht in der DSI.
 ---
 ## Erwartete Ergebnisse: Social Media (DSI Abschnitt 8)
 | Check | Erwartet | Beleg |
 |-------|----------|-------|
 | Gemeinsam Verantwortliche | PASS | Erwaehnt |
 | Meta konkret benannt | FAIL (TP) | Nur "Facebook" ohne "Meta Platforms Ireland Ltd." |
 | Vereinbarung Art. 26 | FAIL (TP) | Kein Page Controller Addendum |
 | Plattformen | PASS | Facebook, YouTube, X, Instagram, TikTok, Vimeo, Reddit, Bluesky, etc. |
 | SCC | PASS | Erwaehnt |
 | DPF | FAIL (TP) | Data Privacy Framework nicht erwaehnt |
 | Rechtsgrundlage | PASS | Art. 6(1)(f) |
 | Alle standardmaessig deaktiviert | PASS | "standardmaessig deaktiviert" |
 ---
@@ -98,17 +132,19 @@
 | Feld | Erwartet |
 |------|----------|
 | banner_detected | true |
-| provider | Sourcepoint oder TCF-basiert |
+| provider | Sourcepoint |
-| violations | Mehrere (viel Third-Party-Tracking) |
+| tcf_enabled | true |
 | Vendor-Anzahl | 40+ (grosses Medienunternehmen) |
 | violations | Consent-Wall blockiert Zugang → moeglicherweise unzulaessig |
 ---
-## Cross-Check Banner vs Cookie
+## Cross-Check Banner vs DSI
 | Finding | Erwartet |
 |---------|----------|
-| Dienste fehlen in Cookie-RL | Wahrscheinlich (viele Werbenetzwerke) |
+| Vendors fehlen in DSI | Wahrscheinlich — viele TCF-Vendors nicht in DSI dokumentiert |
-| Tracking vor Consent | Moeglich |
+| Tracking vor Consent | Unwahrscheinlich (Sourcepoint blockiert gut) |
 ---
@@ -119,3 +155,17 @@
 | ODR | AKTIV | B2C Online-Abo |
 | Widerruf | AKTIV | B2C |
 | V.i.S.d.P. | AKTIV | Medienunternehmen (Kernpflicht) |
 | Berufsrecht | **SKIP** | Kein regulierter Beruf |
 ---
 ## Identifizierte Regex-Bugs (aus diesem GT-Abgleich)
 | # | Check | Bug | Beleg auf Website | Regex-Problem |
 |---|-------|-----|-------------------|---------------|
 | 1 | DSB | FN | "z. Hd. der Datenschutzbeauftragten... dsb@spiegelgruppe.de" | Regex matcht "Datenschutzbeauftragten" (Genitiv/Dativ) nicht |
 | 2 | Beschwerderecht | FN | "Art. 77 DSGVO... HmbBfDI" | Regex findet "Art. 77" oder "Aufsichtsbehoerde" nicht im Spiegel-Text |
 | 3 | Betroffenenrechte | FN | Art. 15, 16, 17, 18, 21 — nur Art. 20 fehlt | Regex verlangt ALLE 6, 5/6 ist nicht genug |
 | 4 | V.i.S.d.P. | FN | "Verantwortlicher i. S. v. § 18 Abs. 2 MStV" | Regex sucht nur "v.i.s.d.p.", nicht die MStV-Formulierung |
 | 5 | USt-IdNr | FN | "Umsatzsteuer-ID: DE 212 442 423" | Regex sucht "ust-idnr" oder "ust-id", matcht "umsatzsteuer-id:" nicht |
 | 6 | Profiler "anwalt" | FP | Redaktionsanwalt im Impressum | "anwalt" zu generisch, matcht Personennamen/Rollen |