fix(rag): HTML charset detection + opening block tag newlines

Two bugs fixed: 1. Opening block tags (<h3>, <div>) now also create newlines, not just closing tags. This fixes pages from gesetze-im-internet.de, which put § inside an <h3> that directly follows inline <a> text, so § ended up mid-line instead of at the start of a line. 2. The HTML charset is now detected from the meta tag (charset=iso-8859-1). Files from gesetze-im-internet.de use ISO-8859-1, not UTF-8, so the § byte (0xA7) was destroyed by UTF-8 decoding. Decoding now tries UTF-8 first, then the charset declared in the meta tag, and finally falls back to ISO-8859-1. 32 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 08:35:47 +02:00
parent ddad58f607
commit a459636bc4
3 changed files with 85 additions and 9 deletions
@@ -1,6 +1,10 @@
 """Tests for HTML detection and stripping in document upload."""

-from html_utils import looks_like_html as _looks_like_html, strip_html as _strip_html
+from html_utils import (
+    decode_html_bytes,
+    looks_like_html as _looks_like_html,
+    strip_html as _strip_html,
+)


 class TestLooksLikeHtml:
@@ -120,3 +124,38 @@ class TestStripHtml:
        result = _strip_html(text)
        assert "§ 312 Anwendungsbereich" in result
        assert "(1) Die Vorschriften" in result
+
+    def test_opening_h3_creates_newline(self):
+        """Opening <h3> must create newline so § is at line start."""
+        html = '<a href="#">Inhaltsverzeichnis</a><h3><span>§ 1</span> Titel</h3>'
+        result = _strip_html(html)
+        found = any(line.strip().startswith("§ 1") for line in result.split("\n"))
+        assert found, f"§ 1 not at line start: {result!r}"
+
+
+class TestDecodeHtmlBytes:
+
+    def test_utf8_file(self):
+        raw = "<div>§ 312 Anwendungsbereich</div>".encode("utf-8")
+        text = decode_html_bytes(raw)
+        assert "§ 312" in text
+
+    def test_iso_8859_1_with_meta(self):
+        html = '<html><head><meta charset="iso-8859-1"></head><body>§ 1 Test</body></html>'
+        raw = html.encode("iso-8859-1")
+        text = decode_html_bytes(raw)
+        assert "§ 1 Test" in text
+
+    def test_iso_8859_1_without_meta(self):
+        """Without a meta tag, decoding falls back to iso-8859-1."""
+        raw = "§ 312 Anwendungsbereich".encode("iso-8859-1")
+        text = decode_html_bytes(raw)
+        assert "§ 312" in text
+
+    def test_gesetze_im_internet_encoding(self):
+        """An http-equiv meta charset=iso-8859-1 (as on gesetze-im-internet.de) keeps umlauts intact."""
+        html = '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />'
+        html += '<div>Kündigungsschutzgesetz</div>'
+        raw = html.encode("iso-8859-1")
+        text = decode_html_bytes(raw)
+        assert "Kündigungsschutzgesetz" in text