Files
breakpilot-core/rag-service/html_utils.py
T
Benjamin Admin a459636bc4 fix(rag): HTML charset detection + opening block tag newlines
Two bugs fixed:
1. Opening block tags (<h3>, <div>) now also create newlines, not just
   closing tags. Fixes: gesetze-im-internet.de puts § inside <h3> which
   followed inline <a> text — § ended up mid-line, not at line start.

2. HTML charset detection from meta tag (charset=iso-8859-1). Files from
   gesetze-im-internet.de use ISO-8859-1, not UTF-8. The § byte (0xA7)
   was destroyed by UTF-8 decode. Now: try UTF-8 → check meta charset →
   fallback ISO-8859-1.

32 rag-service tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 08:35:47 +02:00

67 lines
2.2 KiB
Python

"""HTML detection and stripping for legal document ingestion."""
import re
from html import unescape
# Opening of a common structural HTML tag; used to sniff whether text is HTML.
_HTML_TAG_RE = re.compile(
    r'<(html|head|body|div|p|span|table)\b',
    flags=re.I,
)

# Captures the charset name declared inside a <meta ...> tag, covering both
# <meta charset="..."> and <meta http-equiv=... content="...; charset=...">.
_CHARSET_RE = re.compile(
    r'<meta[^>]+charset\s*=\s*["\']?([a-zA-Z0-9_-]+)',
    flags=re.I,
)
def looks_like_html(text: str) -> bool:
    """Report whether *text* appears to be HTML.

    Only the first 500 characters are inspected; the text counts as HTML
    when ``_HTML_TAG_RE`` finds a match in that prefix.
    """
    head = text[:500]
    return _HTML_TAG_RE.search(head) is not None
def decode_html_bytes(raw: bytes) -> str:
    """Decode HTML bytes to text, detecting the charset when UTF-8 fails.

    Decoding order:

    1. Strict UTF-8 (a leading UTF-8 byte-order mark is dropped).
    2. The charset declared in a ``<meta>`` tag within the first 2000 bytes.
    3. ISO-8859-1, which maps every byte value and therefore never fails.

    Args:
        raw: The undecoded document bytes.

    Returns:
        The decoded document text.
    """
    try:
        # Strict decoding either succeeds or raises -- it never inserts
        # U+FFFD -- so a successful result is returned as-is.  A literal
        # U+FFFD in a valid UTF-8 document must not trigger a re-decode with
        # a single-byte charset, which would turn the whole text into
        # mojibake.  "utf-8-sig" is plain UTF-8 that also strips a leading BOM.
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        pass

    # Not valid UTF-8: peek at the ASCII-safe part of the head for a charset.
    ascii_head = raw[:2000].decode("ascii", errors="ignore")
    declared = _CHARSET_RE.search(ascii_head)
    if declared:
        charset = declared.group(1).lower().replace("_", "-")
        try:
            return raw.decode(charset)
        except (UnicodeError, LookupError):
            # Unknown codec, non-text codec, or bytes that do not fit the
            # declared charset: fall through to the catch-all below.
            pass

    # Last resort: iso-8859-1 (covers all byte values)
    return raw.decode("iso-8859-1")
# Elements whose opening or closing tag starts a new line in the plain text.
_LINE_BREAK_TAGS = r'div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote|br'

_SCRIPT_STYLE_RE = re.compile(
    r'<(script|style)\b[^>]*>.*?</\1\s*>', re.DOTALL | re.IGNORECASE,
)
_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
_LINE_BREAK_TAG_RE = re.compile(
    rf'</?(?:{_LINE_BREAK_TAGS})\b[^>]*>', re.IGNORECASE,
)
_ANY_TAG_RE = re.compile(r'<[^>]+>')
_TRAILING_WS_RE = re.compile(r'[ \t\r]+\n')
_BLANK_LINES_RE = re.compile(r'\n{3,}')


def strip_html(html_text: str) -> str:
    """Convert HTML to plain text preserving legal document structure.

    Script/style blocks and HTML comments are dropped, block-level tags
    (opening and closing) and ``<br>`` become newlines, every other tag is
    removed, and HTML entities are decoded.  Trailing whitespace on each line
    is removed and runs of blank lines are collapsed to a single blank line.

    Args:
        html_text: The HTML markup as text.

    Returns:
        The plain-text rendering with leading/trailing whitespace stripped.
    """
    # Remove script/style blocks first so their contents never reach the text.
    text = _SCRIPT_STYLE_RE.sub('', html_text)
    # Drop comments as a unit: a '>' inside a comment would otherwise end the
    # generic tag match early and leak the remainder of the comment.
    text = _COMMENT_RE.sub('', text)
    # Block elements -> newline (preserves the § paragraph structure).
    # Opening tags count too, so e.g. <h3>§ ...</h3> after inline text starts
    # on its own line.  End tags may carry whitespace before '>'.
    text = _LINE_BREAK_TAG_RE.sub('\n', text)
    # Strip remaining tags
    text = _ANY_TAG_RE.sub('', text)
    # Decode entities only after tag removal so that escaped markup such as
    # &lt;b&gt; survives as literal text (&#246; -> ö, &sect; -> §).
    text = unescape(text)
    # Whitespace-only lines (from indented markup or CRLF endings) would hide
    # consecutive blank lines from the collapse below.
    text = _TRAILING_WS_RE.sub('\n', text)
    text = _BLANK_LINES_RE.sub('\n\n', text)
    return text.strip()