Files
breakpilot-core/rag-service/tests/test_html_stripping.py
T
Benjamin Admin a459636bc4 fix(rag): HTML charset detection + opening block tag newlines
Two bugs fixed:
1. Opening block tags (<h3>, <div>) now also create newlines, not just
   closing tags. Fixes: gesetze-im-internet.de puts § inside an <h3> that
   directly follows inline <a> text, so § ended up mid-line instead of at
   the start of a line.

2. HTML charset detection from meta tag (charset=iso-8859-1). Files from
   gesetze-im-internet.de use ISO-8859-1, not UTF-8. The § byte (0xA7)
   was destroyed by UTF-8 decode. Now: try UTF-8 → check meta charset →
   fallback ISO-8859-1.

32 rag-service tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 08:35:47 +02:00

162 lines
6.1 KiB
Python

"""Tests for HTML detection and stripping in document upload."""
from html_utils import (
decode_html_bytes,
looks_like_html as _looks_like_html,
strip_html as _strip_html,
)
class TestLooksLikeHtml:
    """Checks that the HTML detector accepts markup and rejects plain text."""

    def test_html_document(self):
        """A full html/body/p document is detected as HTML."""
        document = "<html><body><p>Text</p></body></html>"
        assert _looks_like_html(document)

    def test_html_div(self):
        """A single div fragment with a class attribute is detected as HTML."""
        fragment = '<div class="jurAbsatz">§ 312</div>'
        assert _looks_like_html(fragment)

    def test_html_with_doctype(self):
        """An unclosed page start beginning with a doctype is detected as HTML."""
        page_start = "<!DOCTYPE html><html><head></head><body>"
        assert _looks_like_html(page_start)

    def test_plain_text(self):
        """Plain legal text without any tags is not detected as HTML."""
        plain = "§ 312 Anwendungsbereich\n\n(1) Die Vorschriften..."
        assert not _looks_like_html(plain)

    def test_legal_text_with_angle_brackets(self):
        """Bare comparison signs must not be mistaken for tags."""
        # "<" and ">" appear here only as comparison operators, never as markup.
        comparison = "Wert < 100 EUR und > 50 EUR ist zulaessig."
        assert not _looks_like_html(comparison)

    def test_markdown(self):
        """A Markdown heading followed by text is not detected as HTML."""
        markdown = "# § 312 Anwendungsbereich\n\n(1) Die Vorschriften..."
        assert not _looks_like_html(markdown)
class TestStripHtml:
    """Tests for the HTML stripper: tag removal, entity decoding, line layout."""

    @staticmethod
    def _content_lines(text):
        """Return the whitespace-trimmed, non-empty lines of *text*."""
        return [ln.strip() for ln in text.split("\n") if ln.strip()]

    def test_basic_div_tags(self):
        """The text of a single div is at the very start of the output."""
        html = "<div>§ 312 Anwendungsbereich</div>"
        result = _strip_html(html)
        assert result.startswith("§ 312 Anwendungsbereich")

    def test_paragraph_tags_become_newlines(self):
        """Adjacent paragraphs end up on separate lines."""
        html = "<p>Absatz 1</p><p>Absatz 2</p>"
        result = _strip_html(html)
        assert "Absatz 1" in result
        assert "Absatz 2" in result
        # Paragraphs should be on separate lines
        assert len(self._content_lines(result)) >= 2

    def test_preserves_section_headers(self):
        """§ signs must be at line starts after stripping."""
        html = '<div class="jurAbsatz">§ 312 Anwendungsbereich</div>'
        result = _strip_html(html)
        matching = [ln for ln in result.split("\n") if "§ 312" in ln]
        assert matching, "§ 312 not found in stripped text"
        # Only the first matching line is checked: § should start that line.
        assert matching[0].strip().startswith("§ 312")

    def test_decodes_html_entities(self):
        """Numeric character references are decoded to their characters."""
        html = "Gel&#246;scht und ge&#228;ndert und &#167; 312"
        result = _strip_html(html)
        assert "Gelöscht" in result
        assert "geändert" in result
        assert "§ 312" in result

    def test_decodes_named_entities(self):
        """Named entities (&sect;, &amp;) are decoded to their characters."""
        html = "&sect; 312 &amp; &sect; 313"
        result = _strip_html(html)
        assert "§ 312" in result
        assert "§ 313" in result
        # The input deliberately contains &amp;; it must not survive undecoded.
        assert "&amp;" not in result

    def test_removes_script_style(self):
        """Contents of style and script elements are dropped, body text is kept."""
        html = '<style>body{color:red}</style><script>alert("x")</script><p>§ 1 Text</p>'
        result = _strip_html(html)
        assert "color" not in result
        assert "alert" not in result
        assert "§ 1 Text" in result

    def test_br_becomes_newline(self):
        """Both <br/> and <br> separate the surrounding text into lines."""
        html = "Zeile 1<br/>Zeile 2<br>Zeile 3"
        result = _strip_html(html)
        assert "Zeile 1" in result
        assert "Zeile 2" in result
        assert "Zeile 3" in result
        # Containment alone would also pass if <br> were simply deleted
        # ("Zeile 1Zeile 2Zeile 3"), so verify the actual line split.
        assert self._content_lines(result) == ["Zeile 1", "Zeile 2", "Zeile 3"]

    def test_no_excessive_whitespace(self):
        """Empty block elements do not pile up into runs of blank lines."""
        html = "<div></div><div></div><div></div><div>Text</div>"
        result = _strip_html(html)
        assert "\n\n\n" not in result
        # Guard against the check passing vacuously on an empty result.
        assert "Text" in result

    def test_gesetze_im_internet_format(self):
        """Realistic HTML from gesetze-im-internet.de."""
        html = """<div class="jnhtml">
<div>
<div class="jurAbsatz">
§ 312k Kündigung von Verbraucherverträgen im elektronischen Geschäftsverkehr
</div>
<div class="jurAbsatz">
(1) Wird Verbrauchern über eine Webseite ermöglicht, einen Vertrag im elektronischen Geschäftsverkehr zu schließen, der auf die Begründung eines Dauerschuldverhältnisses gerichtet ist, das einen Unternehmer zu einer entgeltlichen Leistung verpflichtet, so treffen den Unternehmer die Pflichten nach dieser Vorschrift.
</div>
<div class="jurAbsatz">
(2) Der Unternehmer hat sicherzustellen, dass der Verbraucher auf der Webseite eine Erklärung zur ordentlichen oder außerordentlichen Kündigung abgeben kann.
</div>
</div></div>"""
        result = _strip_html(html)
        # § 312k should be at start of a line
        found_312k = any(
            line.strip().startswith("§ 312k") for line in result.split("\n")
        )
        assert found_312k, f"§ 312k not at line start. Text:\n{result[:500]}"
        # Content should be present without tags
        assert "Dauerschuldverhältnisses" in result
        assert "<div>" not in result
        assert "class=" not in result

    def test_plain_text_passthrough(self):
        """Non-HTML text keeps its content when run through the stripper."""
        text = "§ 312 Anwendungsbereich\n\n(1) Die Vorschriften..."
        result = _strip_html(text)
        assert "§ 312 Anwendungsbereich" in result
        assert "(1) Die Vorschriften" in result

    def test_opening_h3_creates_newline(self):
        """Opening <h3> must create newline so § is at line start."""
        html = '<a href="#">Inhaltsverzeichnis</a><h3><span>§ 1</span> Titel</h3>'
        result = _strip_html(html)
        found = any(line.strip().startswith("§ 1") for line in result.split("\n"))
        assert found, f"§ 1 not at line start: {result!r}"
class TestDecodeHtmlBytes:
    """Checks that raw HTML bytes are decoded with the right character set."""

    def test_utf8_file(self):
        """UTF-8 encoded bytes decode with the § sign intact."""
        payload = "<div>§ 312 Anwendungsbereich</div>".encode("utf-8")
        assert "§ 312" in decode_html_bytes(payload)

    def test_iso_8859_1_with_meta(self):
        """ISO-8859-1 bytes declared via <meta charset> decode correctly."""
        markup = '<html><head><meta charset="iso-8859-1"></head><body>§ 1 Test</body></html>'
        payload = markup.encode("iso-8859-1")
        assert "§ 1 Test" in decode_html_bytes(payload)

    def test_iso_8859_1_without_meta(self):
        """ISO-8859-1 bytes with no charset declaration still yield the § sign."""
        payload = "§ 312 Anwendungsbereich".encode("iso-8859-1")
        assert "§ 312" in decode_html_bytes(payload)

    def test_gesetze_im_internet_encoding(self):
        """ISO-8859-1 bytes declared via an http-equiv meta tag keep umlauts."""
        # Fixture: Content-Type meta tag followed by a div with an umlaut.
        markup = (
            '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />'
            '<div>Kündigungsschutzgesetz</div>'
        )
        payload = markup.encode("iso-8859-1")
        assert "Kündigungsschutzgesetz" in decode_html_bytes(payload)