fix(rag): HTML charset detection + opening block tag newlines
Two bugs fixed: 1. Opening block tags (<h3>, <div>) now also create newlines, not just closing tags. Fixes: gesetze-im-internet.de puts § inside <h3> which followed inline <a> text — § ended up mid-line, not at line start. 2. HTML charset detection from meta tag (charset=iso-8859-1). Files from gesetze-im-internet.de use ISO-8859-1, not UTF-8. The § byte (0xA7) was destroyed by UTF-8 decode. Now: try UTF-8 → check meta charset → fallback ISO-8859-1. 32 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -7,7 +7,7 @@ from pydantic import BaseModel
|
|||||||
|
|
||||||
from api.auth import optional_jwt_auth
|
from api.auth import optional_jwt_auth
|
||||||
from embedding_client import embedding_client
|
from embedding_client import embedding_client
|
||||||
from html_utils import looks_like_html, strip_html
|
from html_utils import decode_html_bytes, looks_like_html, strip_html
|
||||||
from minio_client_wrapper import minio_wrapper
|
from minio_client_wrapper import minio_wrapper
|
||||||
from qdrant_client_wrapper import qdrant_wrapper
|
from qdrant_client_wrapper import qdrant_wrapper
|
||||||
|
|
||||||
@@ -102,9 +102,16 @@ async def upload_document(
|
|||||||
try:
|
try:
|
||||||
if content_type == "application/pdf" or filename.lower().endswith(".pdf"):
|
if content_type == "application/pdf" or filename.lower().endswith(".pdf"):
|
||||||
text = await embedding_client.extract_pdf(file_bytes)
|
text = await embedding_client.extract_pdf(file_bytes)
|
||||||
|
elif filename.lower().endswith((".html", ".htm")):
|
||||||
|
text = decode_html_bytes(file_bytes)
|
||||||
|
text = strip_html(text)
|
||||||
|
logger.info("Decoded + stripped HTML from %s", filename)
|
||||||
else:
|
else:
|
||||||
# Try to decode as text
|
|
||||||
text = file_bytes.decode("utf-8", errors="replace")
|
text = file_bytes.decode("utf-8", errors="replace")
|
||||||
|
# Strip HTML if content looks like HTML despite extension
|
||||||
|
if looks_like_html(text):
|
||||||
|
text = strip_html(text)
|
||||||
|
logger.info("Stripped HTML tags from %s", filename)
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logger.error("Text extraction failed: %s", exc)
|
logger.error("Text extraction failed: %s", exc)
|
||||||
raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}")
|
raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}")
|
||||||
@@ -112,11 +119,6 @@ async def upload_document(
|
|||||||
if not text or not text.strip():
|
if not text or not text.strip():
|
||||||
raise HTTPException(status_code=400, detail="Could not extract any text from the document")
|
raise HTTPException(status_code=400, detail="Could not extract any text from the document")
|
||||||
|
|
||||||
# --- Strip HTML if detected ---
|
|
||||||
if looks_like_html(text):
|
|
||||||
text = strip_html(text)
|
|
||||||
logger.info("Stripped HTML tags from %s", filename)
|
|
||||||
|
|
||||||
# --- Chunk ---
|
# --- Chunk ---
|
||||||
try:
|
try:
|
||||||
chunk_result = await embedding_client.chunk_text(
|
chunk_result = await embedding_client.chunk_text(
|
||||||
|
|||||||
@@ -4,6 +4,9 @@ import re
|
|||||||
from html import unescape
|
from html import unescape
|
||||||
|
|
||||||
_HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)
|
_HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)
|
||||||
|
_CHARSET_RE = re.compile(
|
||||||
|
r'<meta[^>]+charset\s*=\s*["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def looks_like_html(text: str) -> bool:
|
def looks_like_html(text: str) -> bool:
|
||||||
@@ -11,14 +14,46 @@ def looks_like_html(text: str) -> bool:
|
|||||||
return bool(_HTML_TAG_RE.search(text[:500]))
|
return bool(_HTML_TAG_RE.search(text[:500]))
|
||||||
|
|
||||||
|
|
||||||
|
def decode_html_bytes(raw: bytes) -> str:
    """Decode HTML bytes to text, honouring a charset declared in a meta tag.

    Decoding is attempted in this order:

    1. Strict UTF-8.
    2. The charset named in a ``<meta ... charset=...>`` tag found in the
       first 2000 bytes, provided Python has a text codec of that name and
       the bytes are valid in it.
    3. ISO-8859-1, which maps every byte value and therefore cannot fail.

    Args:
        raw: The undecoded document bytes.

    Returns:
        The decoded document text.
    """
    try:
        # Strict decoding raises on invalid input instead of inserting
        # U+FFFD, so a successful decode can be returned as-is -- even when
        # the document itself contains a literal replacement character.
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        pass

    # Peek at the ASCII-safe portion of the head to find the charset; bytes
    # outside ASCII are dropped because the real encoding is not known yet.
    ascii_head = raw[:2000].decode("ascii", errors="ignore")
    match = _CHARSET_RE.search(ascii_head)
    if match:
        charset = match.group(1).lower().replace("_", "-")
        try:
            return raw.decode(charset)
        except (UnicodeError, LookupError):
            # LookupError: unknown (or non-text) codec name.
            # UnicodeError: the bytes do not fit the declared charset. The
            # base class is caught because some codecs (e.g. "undefined")
            # raise it directly rather than UnicodeDecodeError.
            pass

    # Last resort: iso-8859-1 (covers all byte values)
    return raw.decode("iso-8859-1")
|
||||||
|
|
||||||
|
|
||||||
def strip_html(html_text: str) -> str:
|
def strip_html(html_text: str) -> str:
|
||||||
"""Convert HTML to plain text preserving legal document structure."""
|
"""Convert HTML to plain text preserving legal document structure."""
|
||||||
text = html_text
|
text = html_text
|
||||||
# Remove script/style blocks
|
# Remove script/style blocks
|
||||||
text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE)
|
text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE)
|
||||||
# Block elements → newline (preserves § paragraph structure)
|
# Block elements → newline (preserves § paragraph structure)
|
||||||
|
# Opening block tags also get newline (e.g., <h3> before § signs)
|
||||||
text = re.sub(
|
text = re.sub(
|
||||||
r'</(div|p|h[1-6]|li|tr|section|article|blockquote)>',
|
r'<(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)\b[^>]*>',
|
||||||
|
'\n', text, flags=re.IGNORECASE,
|
||||||
|
)
|
||||||
|
text = re.sub(
|
||||||
|
r'</(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)>',
|
||||||
'\n', text, flags=re.IGNORECASE,
|
'\n', text, flags=re.IGNORECASE,
|
||||||
)
|
)
|
||||||
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
|
||||||
|
|||||||
@@ -1,6 +1,10 @@
|
|||||||
"""Tests for HTML detection and stripping in document upload."""
|
"""Tests for HTML detection and stripping in document upload."""
|
||||||
|
|
||||||
from html_utils import looks_like_html as _looks_like_html, strip_html as _strip_html
|
from html_utils import (
|
||||||
|
decode_html_bytes,
|
||||||
|
looks_like_html as _looks_like_html,
|
||||||
|
strip_html as _strip_html,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class TestLooksLikeHtml:
|
class TestLooksLikeHtml:
|
||||||
@@ -120,3 +124,38 @@ class TestStripHtml:
|
|||||||
result = _strip_html(text)
|
result = _strip_html(text)
|
||||||
assert "§ 312 Anwendungsbereich" in result
|
assert "§ 312 Anwendungsbereich" in result
|
||||||
assert "(1) Die Vorschriften" in result
|
assert "(1) Die Vorschriften" in result
|
||||||
|
|
||||||
|
def test_opening_h3_creates_newline(self):
    """An opening <h3> must begin a new line so that § leads its line."""
    markup = '<a href="#">Inhaltsverzeichnis</a><h3><span>§ 1</span> Titel</h3>'
    stripped = _strip_html(markup)
    # At least one output line has to begin (ignoring padding) with the § heading.
    heading_leads_line = [
        line.strip().startswith("§ 1") for line in stripped.split("\n")
    ]
    assert any(heading_leads_line), f"§ 1 not at line start: {stripped!r}"
|
||||||
|
|
||||||
|
|
||||||
|
class TestDecodeHtmlBytes:
    """Checks decode_html_bytes on UTF-8 and ISO-8859-1 encoded input."""

    def test_utf8_file(self):
        """UTF-8 input keeps the § sign."""
        payload = "<div>§ 312 Anwendungsbereich</div>".encode("utf-8")
        assert "§ 312" in decode_html_bytes(payload)

    def test_iso_8859_1_with_meta(self):
        """ISO-8859-1 input declaring its charset in a meta tag keeps the § sign."""
        document = '<html><head><meta charset="iso-8859-1"></head><body>§ 1 Test</body></html>'
        decoded = decode_html_bytes(document.encode("iso-8859-1"))
        assert "§ 1 Test" in decoded

    def test_iso_8859_1_without_meta(self):
        """ISO-8859-1 input without any meta tag still keeps the § sign."""
        payload = "§ 312 Anwendungsbereich".encode("iso-8859-1")
        assert "§ 312" in decode_html_bytes(payload)

    def test_gesetze_im_internet_encoding(self):
        """An http-equiv Content-Type meta with iso-8859-1 keeps the umlaut."""
        document = (
            '<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />'
            '<div>Kündigungsschutzgesetz</div>'
        )
        decoded = decode_html_bytes(document.encode("iso-8859-1"))
        assert "Kündigungsschutzgesetz" in decoded
|
||||||
|
|||||||
Reference in New Issue
Block a user