From a459636bc4f587a6f2d64ad8b786a6f18880c847 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 2 May 2026 08:35:47 +0200 Subject: [PATCH] fix(rag): HTML charset detection + opening block tag newlines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs fixed: 1. Opening block tags (

<div>, <p>,
) now also create newlines, not just closing tags. Fixes: gesetze-im-internet.de puts § inside
<h3>
which followed inline text — § ended up mid-line, not at line start. 2. HTML charset detection from meta tag (charset=iso-8859-1). Files from gesetze-im-internet.de use ISO-8859-1, not UTF-8. The § byte (0xA7) was destroyed by UTF-8 decode. Now: try UTF-8 → check meta charset → fallback ISO-8859-1. 32 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) --- rag-service/api/documents.py | 16 +++++---- rag-service/html_utils.py | 37 ++++++++++++++++++++- rag-service/tests/test_html_stripping.py | 41 +++++++++++++++++++++++- 3 files changed, 85 insertions(+), 9 deletions(-) diff --git a/rag-service/api/documents.py b/rag-service/api/documents.py index a8170d1..6f8af4b 100644 --- a/rag-service/api/documents.py +++ b/rag-service/api/documents.py @@ -7,7 +7,7 @@ from pydantic import BaseModel from api.auth import optional_jwt_auth from embedding_client import embedding_client -from html_utils import looks_like_html, strip_html +from html_utils import decode_html_bytes, looks_like_html, strip_html from minio_client_wrapper import minio_wrapper from qdrant_client_wrapper import qdrant_wrapper @@ -102,9 +102,16 @@ async def upload_document( try: if content_type == "application/pdf" or filename.lower().endswith(".pdf"): text = await embedding_client.extract_pdf(file_bytes) + elif filename.lower().endswith((".html", ".htm")): + text = decode_html_bytes(file_bytes) + text = strip_html(text) + logger.info("Decoded + stripped HTML from %s", filename) else: - # Try to decode as text text = file_bytes.decode("utf-8", errors="replace") + # Strip HTML if content looks like HTML despite extension + if looks_like_html(text): + text = strip_html(text) + logger.info("Stripped HTML tags from %s", filename) except Exception as exc: logger.error("Text extraction failed: %s", exc) raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}") @@ -112,11 +119,6 @@ async def upload_document( if not text or not text.strip(): raise 
HTTPException(status_code=400, detail="Could not extract any text from the document") - # --- Strip HTML if detected --- - if looks_like_html(text): - text = strip_html(text) - logger.info("Stripped HTML tags from %s", filename) - # --- Chunk --- try: chunk_result = await embedding_client.chunk_text( diff --git a/rag-service/html_utils.py b/rag-service/html_utils.py index 3fa9bd4..a330637 100644 --- a/rag-service/html_utils.py +++ b/rag-service/html_utils.py @@ -4,6 +4,9 @@ import re from html import unescape _HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE) +_CHARSET_RE = re.compile( + r'<meta[^>]+charset\s*=\s*["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE, +) def looks_like_html(text: str) -> bool: @@ -11,14 +14,46 @@ def looks_like_html(text: str) -> bool: return bool(_HTML_TAG_RE.search(text[:500])) +def decode_html_bytes(raw: bytes) -> str: + """Decode HTML bytes with charset detection from meta tags. + + Tries UTF-8 first, falls back to charset from HTML meta tag, then latin-1. + """ + try: + text = raw.decode("utf-8") + # Check if UTF-8 decode produced replacement characters + if "\ufffd" not in text: + return text + except UnicodeDecodeError: + pass + + # Peek at ASCII-safe portion to find charset + ascii_head = raw[:2000].decode("ascii", errors="ignore") + m = _CHARSET_RE.search(ascii_head) + if m: + charset = m.group(1).lower().replace("_", "-") + try: + return raw.decode(charset) + except (UnicodeDecodeError, LookupError): + pass + + # Last resort: iso-8859-1 (covers all byte values) + return raw.decode("iso-8859-1") + + def strip_html(html_text: str) -> str: """Convert HTML to plain text preserving legal document structure.""" text = html_text # Remove script/style blocks text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE) # Block elements → newline (preserves § paragraph structure) + # Opening block tags also get newline (e.g.,
<h3>
before § signs) text = re.sub( - r'</(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)>', + r'<(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)\b[^>]*>', + '\n', text, flags=re.IGNORECASE, + ) + text = re.sub( + r'</(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)>', '\n', text, flags=re.IGNORECASE, ) text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE) diff --git a/rag-service/tests/test_html_stripping.py b/rag-service/tests/test_html_stripping.py index 159469d..ec085a2 100644 --- a/rag-service/tests/test_html_stripping.py +++ b/rag-service/tests/test_html_stripping.py @@ -1,6 +1,10 @@ """Tests for HTML detection and stripping in document upload.""" -from html_utils import looks_like_html as _looks_like_html, strip_html as _strip_html +from html_utils import ( + decode_html_bytes, + looks_like_html as _looks_like_html, + strip_html as _strip_html, +) class TestLooksLikeHtml: @@ -120,3 +124,38 @@ class TestStripHtml: result = _strip_html(text) assert "§ 312 Anwendungsbereich" in result assert "(1) Die Vorschriften" in result + + def test_opening_h3_creates_newline(self): + """Opening
<h3>
must create newline so § is at line start.""" + html = 'Inhaltsverzeichnis
<h3>
§ 1 Titel
</h3>
' + result = _strip_html(html) + found = any(line.strip().startswith("§ 1") for line in result.split("\n")) + assert found, f"§ 1 not at line start: {result!r}" + + +class TestDecodeHtmlBytes: + + def test_utf8_file(self): + raw = "
<p>§ 312 Anwendungsbereich</p>
".encode("utf-8") + text = decode_html_bytes(raw) + assert "§ 312" in text + + def test_iso_8859_1_with_meta(self): + html = '§ 1 Test' + raw = html.encode("iso-8859-1") + text = decode_html_bytes(raw) + assert "§ 1 Test" in text + + def test_iso_8859_1_without_meta(self): + """Even without meta tag, iso-8859-1 is fallback.""" + raw = "§ 312 Anwendungsbereich".encode("iso-8859-1") + text = decode_html_bytes(raw) + assert "§ 312" in text + + def test_gesetze_im_internet_encoding(self): + """gesetze-im-internet.de uses iso-8859-1 with § entities.""" + html = '' + html += '
<h1>Kündigungsschutzgesetz</h1>
' + raw = html.encode("iso-8859-1") + text = decode_html_bytes(raw) + assert "Kündigungsschutzgesetz" in text