diff --git a/rag-service/api/documents.py b/rag-service/api/documents.py index a8170d1..6f8af4b 100644 --- a/rag-service/api/documents.py +++ b/rag-service/api/documents.py @@ -7,7 +7,7 @@ from pydantic import BaseModel from api.auth import optional_jwt_auth from embedding_client import embedding_client -from html_utils import looks_like_html, strip_html +from html_utils import decode_html_bytes, looks_like_html, strip_html from minio_client_wrapper import minio_wrapper from qdrant_client_wrapper import qdrant_wrapper @@ -102,9 +102,16 @@ async def upload_document( try: if content_type == "application/pdf" or filename.lower().endswith(".pdf"): text = await embedding_client.extract_pdf(file_bytes) + elif filename.lower().endswith((".html", ".htm")): + text = decode_html_bytes(file_bytes) + text = strip_html(text) + logger.info("Decoded + stripped HTML from %s", filename) else: - # Try to decode as text text = file_bytes.decode("utf-8", errors="replace") + # Strip HTML if content looks like HTML despite extension + if looks_like_html(text): + text = strip_html(text) + logger.info("Stripped HTML tags from %s", filename) except Exception as exc: logger.error("Text extraction failed: %s", exc) raise HTTPException(status_code=500, detail=f"Text extraction failed: {exc}") @@ -112,11 +119,6 @@ async def upload_document( if not text or not text.strip(): raise HTTPException(status_code=400, detail="Could not extract any text from the document") - # --- Strip HTML if detected --- - if looks_like_html(text): - text = strip_html(text) - logger.info("Stripped HTML tags from %s", filename) - # --- Chunk --- try: chunk_result = await embedding_client.chunk_text( diff --git a/rag-service/html_utils.py b/rag-service/html_utils.py index 3fa9bd4..a330637 100644 --- a/rag-service/html_utils.py +++ b/rag-service/html_utils.py @@ -4,6 +4,9 @@ import re from html import unescape _HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE) +_CHARSET_RE = re.compile( + 
r'<meta[^>]+charset\s*=\s*["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE, +) def looks_like_html(text: str) -> bool: @@ -11,14 +14,46 @@ return bool(_HTML_TAG_RE.search(text[:500])) +def decode_html_bytes(raw: bytes) -> str: + """Decode HTML bytes with charset detection from meta tags. + + Tries UTF-8 first, falls back to charset from HTML meta tag, then latin-1. + """ + try: + text = raw.decode("utf-8") + # Check if UTF-8 decode produced replacement characters + if "\ufffd" not in text: + return text + except UnicodeDecodeError: + pass + + # Peek at ASCII-safe portion to find charset + ascii_head = raw[:2000].decode("ascii", errors="ignore") + m = _CHARSET_RE.search(ascii_head) + if m: + charset = m.group(1).lower().replace("_", "-") + try: + return raw.decode(charset) + except (UnicodeDecodeError, LookupError): + pass + + # Last resort: iso-8859-1 (covers all byte values) + return raw.decode("iso-8859-1") + + def strip_html(html_text: str) -> str: """Convert HTML to plain text preserving legal document structure.""" text = html_text # Remove script/style blocks text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE) # Block elements → newline (preserves § paragraph structure) + # Opening block tags also get newline (e.g.,