fix(rag): HTML charset detection + opening block tag newlines
Two bugs fixed:

1. Opening block tags (<h3>, <div>) now also create newlines, not just closing tags. This fixes pages from gesetze-im-internet.de, which put the § inside an <h3> that directly follows inline <a> text, so the § ended up mid-line instead of at the start of a line.
2. The HTML charset is now detected from the meta tag (charset=iso-8859-1). Files from gesetze-im-internet.de use ISO-8859-1, not UTF-8, and the § byte (0xA7) was destroyed by the UTF-8 decode. The decoder now tries UTF-8 first, then checks the meta charset, then falls back to ISO-8859-1.

32 rag-service tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1,6 +1,10 @@
|
||||
"""Tests for HTML detection and stripping in document upload."""
|
||||
|
||||
from html_utils import looks_like_html as _looks_like_html, strip_html as _strip_html
|
||||
from html_utils import (
|
||||
decode_html_bytes,
|
||||
looks_like_html as _looks_like_html,
|
||||
strip_html as _strip_html,
|
||||
)
|
||||
|
||||
|
||||
class TestLooksLikeHtml:
|
||||
@@ -120,3 +124,38 @@ class TestStripHtml:
|
||||
result = _strip_html(text)
|
||||
assert "§ 312 Anwendungsbereich" in result
|
||||
assert "(1) Die Vorschriften" in result
|
||||
|
||||
def test_opening_h3_creates_newline(self):
    """Check that some line of the stripped output begins with "§ 1".

    The markup puts an inline <a> directly before an <h3>; the heading
    text must not be glued onto the end of the link text.
    """
    markup = '<a href="#">Inhaltsverzeichnis</a><h3><span>§ 1</span> Titel</h3>'
    stripped = _strip_html(markup)

    # Scan line by line; surrounding whitespace on a line is ignored.
    heading_leads_a_line = False
    for raw_line in stripped.split("\n"):
        if raw_line.strip().startswith("§ 1"):
            heading_leads_a_line = True
            break

    assert heading_leads_a_line, f"§ 1 not at line start: {stripped!r}"
|
||||
|
||||
|
||||
class TestDecodeHtmlBytes:
    """Tests that decode_html_bytes preserves non-ASCII characters.

    Inputs are encoded as UTF-8 or ISO-8859-1, with and without a
    charset declaration in the markup.
    """

    def test_utf8_file(self):
        """UTF-8 encoded input keeps the section sign after decoding."""
        payload = "<div>§ 312 Anwendungsbereich</div>".encode("utf-8")
        decoded = decode_html_bytes(payload)
        assert "§ 312" in decoded

    def test_iso_8859_1_with_meta(self):
        """ISO-8859-1 input carrying a <meta charset> tag decodes intact."""
        document = (
            '<html><head><meta charset="iso-8859-1"></head>'
            '<body>§ 1 Test</body></html>'
        )
        payload = document.encode("iso-8859-1")
        decoded = decode_html_bytes(payload)
        assert "§ 1 Test" in decoded

    def test_iso_8859_1_without_meta(self):
        """ISO-8859-1 input with no meta tag at all still decodes intact."""
        payload = "§ 312 Anwendungsbereich".encode("iso-8859-1")
        decoded = decode_html_bytes(payload)
        assert "§ 312" in decoded

    def test_gesetze_im_internet_encoding(self):
        """An http-equiv Content-Type meta declaring iso-8859-1 keeps the umlaut."""
        document = "".join(
            (
                '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />',
                '<div>Kündigungsschutzgesetz</div>',
            )
        )
        payload = document.encode("iso-8859-1")
        decoded = decode_html_bytes(payload)
        assert "Kündigungsschutzgesetz" in decoded
|
||||
|
||||
Reference in New Issue
Block a user