fix(rag): HTML charset detection + opening block tag newlines

Two bugs fixed:
1. Opening block tags (<h3>, <div>) now also create newlines, not just
   closing tags. Fixes: gesetze-im-internet.de puts § inside an <h3> that
   directly follows inline <a> text, so § ended up mid-line instead of at
   line start.

2. HTML charset is now detected from the meta tag (charset=iso-8859-1).
   Files from gesetze-im-internet.de use ISO-8859-1, not UTF-8, so the
   § byte (0xA7) was destroyed by the UTF-8 decode. Now: try UTF-8 → check
   meta charset → fall back to ISO-8859-1.

32 rag-service tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-02 08:35:47 +02:00
parent ddad58f607
commit a459636bc4
3 changed files with 85 additions and 9 deletions
+40 -1
View File
@@ -1,6 +1,10 @@
"""Tests for HTML detection and stripping in document upload."""
from html_utils import looks_like_html as _looks_like_html, strip_html as _strip_html
from html_utils import (
decode_html_bytes,
looks_like_html as _looks_like_html,
strip_html as _strip_html,
)
class TestLooksLikeHtml:
@@ -120,3 +124,38 @@ class TestStripHtml:
result = _strip_html(text)
assert "§ 312 Anwendungsbereich" in result
assert "(1) Die Vorschriften" in result
def test_opening_h3_creates_newline(self):
    """An opening <h3> right after inline link text must begin a new line.

    After stripping, at least one output line has to start with "§ 1"
    (ignoring surrounding whitespace on that line).
    """
    markup = '<a href="#">Inhaltsverzeichnis</a><h3><span>§ 1</span> Titel</h3>'
    result = _strip_html(markup)
    # Scan the stripped output line by line for one that opens with the marker.
    marker_at_line_start = False
    for candidate in result.split("\n"):
        if candidate.strip().startswith("§ 1"):
            marker_at_line_start = True
            break
    assert marker_at_line_start, f"§ 1 not at line start: {result!r}"
class TestDecodeHtmlBytes:
    """Checks that decode_html_bytes preserves non-ASCII characters."""

    def test_utf8_file(self):
        """UTF-8 encoded markup decodes with the section sign intact."""
        payload = "<div>§ 312 Anwendungsbereich</div>".encode("utf-8")
        decoded = decode_html_bytes(payload)
        assert "§ 312" in decoded

    def test_iso_8859_1_with_meta(self):
        """ISO-8859-1 bytes carrying a <meta charset> tag keep "§ 1 Test"."""
        document = '<html><head><meta charset="iso-8859-1"></head><body>§ 1 Test</body></html>'
        decoded = decode_html_bytes(document.encode("iso-8859-1"))
        assert "§ 1 Test" in decoded

    def test_iso_8859_1_without_meta(self):
        """ISO-8859-1 bytes without any meta tag still keep the section sign."""
        payload = "§ 312 Anwendungsbereich".encode("iso-8859-1")
        decoded = decode_html_bytes(payload)
        assert "§ 312" in decoded

    def test_gesetze_im_internet_encoding(self):
        """ISO-8859-1 bytes declared via an http-equiv meta tag keep the umlaut."""
        # Adjacent literals are joined at compile time into one document string.
        document = (
            '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />'
            '<div>Kündigungsschutzgesetz</div>'
        )
        decoded = decode_html_bytes(document.encode("iso-8859-1"))
        assert "Kündigungsschutzgesetz" in decoded