fix: 5 regex bugs + text extraction scroll + GT update

Root cause: Spiegel DSI text was truncated (lazy-loading) — the rights/DSB/complaints sections at the bottom were never extracted.

Fixes:
1. Text extraction: scroll to bottom before innerText (dsi_discovery.py)
2. V.i.S.d.P.: add "verantwortlicher i.s.v." + "§18 Abs. N MStV" pattern
3. USt-IdNr: add "umsatzsteuer-id" + "DE 212 442 423" (with spaces)
4. Profiler: remove generic "anwalt"/"praxis" (false positive on Spiegel "Redaktionsanwalt"), keep only "rechtsanwalt", "kanzlei" etc.
5. Section splitter: auto_fill_from_dsi() fills empty Cookie/Social-Media rows from sections found in the DSI text

Ground Truth 06-spiegel.md fully rewritten with verified data from live website — 3 L1 False Negatives identified (DSB, Beschwerderecht, Betroffenenrechte all present on website but not in extracted text).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-13 01:20:55 +02:00
parent 8bb90d73e5
commit c702260ec1
6 changed files with 194 additions and 78 deletions
@@ -59,26 +59,34 @@ _NONPROFIT_KEYWORDS = [
 ]

 _REGULATED_PROFESSIONS = {
+    # Lawyer — only specific terms, not the bare word "anwalt"
+    # (which would otherwise also match e.g. "Redaktionsanwalt")
    "rechtsanwalt": "anwalt",
-    "anwalt": "anwalt",
-    "anwaeltin": "anwalt",
-    "anwältin": "anwalt",
+    "rechtsanwaeltin": "anwalt",
+    "rechtsanwältin": "anwalt",
    "kanzlei": "anwalt",
    "rechtsanwaltskammer": "anwalt",
-    "arzt": "arzt",
-    "ärztin": "arzt",
-    "aerztin": "arzt",
-    "praxis": "arzt",
+    "zugelassener anwalt": "anwalt",
+    # Physician — "praxis" removed (it matches the phrase "in der Praxis")
+    "arztpraxis": "arzt",
+    "zahnarzt": "arzt",
+    "facharzt": "arzt",
    "aerztekammer": "arzt",
    "ärztekammer": "arzt",
+    "kassenärztlich": "arzt",
+    "kassenaerztlich": "arzt",
+    # Tax advisor
    "steuerberater": "steuerberater",
    "steuerberaterin": "steuerberater",
    "steuerberaterkammer": "steuerberater",
+    # Architect
    "architekt": "architekt",
    "architektin": "architekt",
    "architektenkammer": "architekt",
+    # Notary
    "notar": "notar",
    "notariat": "notar",
+    # Pharmacist
    "apotheke": "apotheker",
    "apotheker": "apotheker",
 }