From a459636bc4f587a6f2d64ad8b786a6f18880c847 Mon Sep 17 00:00:00 2001 From: Benjamin Admin Date: Sat, 2 May 2026 08:35:47 +0200 Subject: [PATCH] fix(rag): HTML charset detection + opening block tag newlines MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two bugs fixed: 1. Opening block tags (

<div>, <p>,
) now also create newlines, not just closing tags. Fixes: gesetze-im-internet.de puts § inside
<h3>
which followed inline text — § ended up mid-line, not at line start. 2. HTML charset detection from meta tag (charset=iso-8859-1). Files from gesetze-im-internet.de use ISO-8859-1, not UTF-8. The § byte (0xA7) was destroyed by UTF-8 decode. Now: try UTF-8 → check meta charset → fallback ISO-8859-1. 32 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) --- rag-service/api/documents.py | 16 +++++---- rag-service/html_utils.py | 37 ++++++++++++++++++++- rag-service/tests/test_html_stripping.py | 41 +++++++++++++++++++++++- 3 files changed, 85 insertions(+), 9 deletions(-) diff --git a/rag-service/api/documents.py b/rag-service/api/documents.py index a8170d1..6f8af4b 100644 --- a/rag-service/api/documents.py +++ b/rag-service/api/documents.py @@ -7,7 +7,7 @@ from pydantic import BaseModel from api.auth import optional_jwt_auth from embedding_client import embedding_client -from html_utils import looks_like_html, strip_html +from html_utils import decode_html_bytes, looks_like_html, strip_html from minio_client_wrapper import minio_wrapper from qdrant_client_wrapper import qdrant_wrapper @@ -102,9 +102,16 @@ async def upload_document( try: if content_type == "application/pdf" or filename.lower().endswith(".pdf"): text = await embedding_client.extract_pdf(file_bytes) + elif filename.lower().endswith((".html", ".htm")): + text = decode_html_bytes(file_bytes) + text = strip_html(text) + logger.info("Decoded + stripped HTML from %s", filename) else: - # Try to decode as text text = file_bytes.decode("utf-8", errors="replace") + # Strip HTML if content looks like HTML despite extension + if looks_like_html(text): + text = strip_html(text) + logger.info("Stripped HTML tags from %s", filename) except Exception as exc: logger.error("Text extraction failed: %s", exc) raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}") @@ -112,11 +119,6 @@ async def upload_document( if not text or not text.strip(): raise 
HTTPException(status_code=400, detail="Could not extract any text from the document") - # --- Strip HTML if detected --- - if looks_like_html(text): - text = strip_html(text) - logger.info("Stripped HTML tags from %s", filename) - # --- Chunk --- try: chunk_result = await embedding_client.chunk_text( diff --git a/rag-service/html_utils.py b/rag-service/html_utils.py index 3fa9bd4..a330637 100644 --- a/rag-service/html_utils.py +++ b/rag-service/html_utils.py @@ -4,6 +4,9 @@ import re from html import unescape _HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE) +_CHARSET_RE = re.compile( + r'<meta[^>]+charset\s*=\s*["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE, +) def looks_like_html(text: str) -> bool: @@ -11,14 +14,46 @@ def looks_like_html(text: str) -> bool: return bool(_HTML_TAG_RE.search(text[:500])) +def decode_html_bytes(raw: bytes) -> str: + """Decode HTML bytes with charset detection from meta tags. + + Tries UTF-8 first, falls back to charset from HTML meta tag, then latin-1. + """ + try: + text = raw.decode("utf-8") + # Check if UTF-8 decode produced replacement characters + if "\ufffd" not in text: + return text + except UnicodeDecodeError: + pass + + # Peek at ASCII-safe portion to find charset + ascii_head = raw[:2000].decode("ascii", errors="ignore") + m = _CHARSET_RE.search(ascii_head) + if m: + charset = m.group(1).lower().replace("_", "-") + try: + return raw.decode(charset) + except (UnicodeDecodeError, LookupError): + pass + + # Last resort: iso-8859-1 (covers all byte values) + return raw.decode("iso-8859-1") + + def strip_html(html_text: str) -> str: """Convert HTML to plain text preserving legal document structure.""" text = html_text # Remove script/style blocks text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE) # Block elements → newline (preserves § paragraph structure) + # Opening block tags also get newline (e.g.,
<h3>
before § signs) text = re.sub( - r'</(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)>', + r'<(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)\b[^>]*>', + '\n', text, flags=re.IGNORECASE, + ) + text = re.sub( + r'</(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)>', '\n', text, flags=re.IGNORECASE, ) text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE) diff --git a/rag-service/tests/test_html_stripping.py b/rag-service/tests/test_html_stripping.py index 159469d..ec085a2 100644 --- a/rag-service/tests/test_html_stripping.py +++ b/rag-service/tests/test_html_stripping.py @@ -1,6 +1,10 @@ """Tests for HTML detection and stripping in document upload.""" -from html_utils import looks_like_html as _looks_like_html, strip_html as _strip_html +from html_utils import ( + decode_html_bytes, + looks_like_html as _looks_like_html, + strip_html as _strip_html, +) class TestLooksLikeHtml: @@ -120,3 +124,38 @@ class TestStripHtml: result = _strip_html(text) assert "§ 312 Anwendungsbereich" in result assert "(1) Die Vorschriften" in result + + def test_opening_h3_creates_newline(self): + """Opening
<h3>
must create newline so § is at line start.""" + html = 'Inhaltsverzeichnis
<h3>
§ 1 Titel
</h3>
' + result = _strip_html(html) + found = any(line.strip().startswith("§ 1") for line in result.split("\n")) + assert found, f"§ 1 not at line start: {result!r}" + + +class TestDecodeHtmlBytes: + + def test_utf8_file(self): + raw = "
<p>§ 312 Anwendungsbereich</p>
".encode("utf-8") + text = decode_html_bytes(raw) + assert "§ 312" in text + + def test_iso_8859_1_with_meta(self): + html = '§ 1 Test' + raw = html.encode("iso-8859-1") + text = decode_html_bytes(raw) + assert "§ 1 Test" in text + + def test_iso_8859_1_without_meta(self): + """Even without meta tag, iso-8859-1 is fallback.""" + raw = "§ 312 Anwendungsbereich".encode("iso-8859-1") + text = decode_html_bytes(raw) + assert "§ 312" in text + + def test_gesetze_im_internet_encoding(self): + """gesetze-im-internet.de uses iso-8859-1 with § entities.""" + html = '' + html += '
<h1>Kündigungsschutzgesetz</h1>
' + raw = html.encode("iso-8859-1") + text = decode_html_bytes(raw) + assert "Kündigungsschutzgesetz" in text