fix(rag): HTML charset detection + opening block tag newlines

Two bugs fixed: 1. Opening block tags (<h3>, <div>) now also create newlines, not just closing tags. Fixes: gesetze-im-internet.de puts § inside <h3> which followed inline <a> text — § ended up mid-line, not at line start. 2. HTML charset detection from meta tag (charset=iso-8859-1). Files from gesetze-im-internet.de use ISO-8859-1, not UTF-8. The § byte (0xA7) was destroyed by UTF-8 decode. Now: try UTF-8 → check meta charset → fallback ISO-8859-1. 32 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 08:35:47 +02:00
parent ddad58f607
commit a459636bc4
3 changed files with 85 additions and 9 deletions
@@ -4,6 +4,9 @@ import re
 from html import unescape

 _HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)
+# Captures the charset name (group 1) declared inside a <meta> tag, matched
+# case-insensitively. Covers both <meta charset="..."> and the http-equiv
+# form where "charset=" sits inside the content attribute; the value may be
+# unquoted or introduced by a single or double quote.
+_CHARSET_RE = re.compile(
+    r'<meta[^>]+charset\s*=\s*["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE,
+)


 def looks_like_html(text: str) -> bool:
@@ -11,14 +14,46 @@ def looks_like_html(text: str) -> bool:
    return bool(_HTML_TAG_RE.search(text[:500]))


+def decode_html_bytes(raw: bytes) -> str:
+    """Decode raw HTML bytes to text, honouring a declared meta charset.
+
+    Decoding is attempted in this order:
+
+    1. Strict UTF-8. Well-formed UTF-8 is returned as-is.
+    2. The charset named in a ``<meta ... charset=...>`` tag found within
+       the first 2000 bytes of the document.
+    3. ISO-8859-1, which maps every byte value and therefore cannot fail.
+
+    Args:
+        raw: The undecoded document bytes.
+
+    Returns:
+        The decoded text.
+    """
+    try:
+        # A strict decode raises on malformed input rather than inserting
+        # U+FFFD, so success means the bytes are well-formed UTF-8. Any
+        # U+FFFD in the result was already present in the document;
+        # re-decoding with a single-byte charset would only turn every
+        # multi-byte character into mojibake.
+        return raw.decode("utf-8")
+    except UnicodeDecodeError:
+        pass
+
+    # Peek at the ASCII-safe portion of the head to find a declared charset;
+    # non-ASCII bytes are dropped so the search itself cannot fail.
+    ascii_head = raw[:2000].decode("ascii", errors="ignore")
+    m = _CHARSET_RE.search(ascii_head)
+    if m:
+        charset = m.group(1).lower().replace("_", "-")
+        try:
+            return raw.decode(charset)
+        except (UnicodeError, LookupError):
+            # LookupError: unknown or non-text codec name.
+            # UnicodeError (base of UnicodeDecodeError): the bytes do not
+            # fit the declared charset; some stdlib codecs (e.g. idna) can
+            # raise the base class directly. Either way, fall through.
+            pass
+
+    # Last resort: iso-8859-1 (covers all byte values)
+    return raw.decode("iso-8859-1")
+
+
 def strip_html(html_text: str) -> str:
    """Convert HTML to plain text preserving legal document structure."""
    text = html_text
    # Remove script/style blocks
    text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Block elements → newline (preserves § paragraph structure)
+    # Opening block tags also get newline (e.g., <h3> before § signs)
    text = re.sub(
-        r'</(div|p|h[1-6]|li|tr|section|article|blockquote)>',
+        r'<(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)\b[^>]*>',
+        '\n', text, flags=re.IGNORECASE,
+    )
+    text = re.sub(
+        r'</(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)>',
        '\n', text, flags=re.IGNORECASE,
    )
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)