# breakpilot-core/rag-service/html_utils.py

"""HTML detection and stripping for legal document ingestion."""

import re
from html import unescape

# Matches the start of one of a handful of common structural HTML tags
# (case-insensitive). The trailing \b keeps e.g. "<pre" from matching as "<p".
_HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)
# Captures the encoding label from a <meta> tag. Works for both
# <meta charset="..."> and <meta http-equiv=... content="...; charset=...">,
# with or without quotes around the value (group 1 is the bare label).
_CHARSET_RE = re.compile(
    r'<meta[^>]+charset\s*=\s*["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE,
)


def looks_like_html(text: str) -> bool:
    """Report whether the beginning of *text* looks like HTML markup.

    Only the first 500 characters are inspected, so markup that starts
    later in the input is not detected.
    """
    head = text[:500]
    return _HTML_TAG_RE.search(head) is not None


def decode_html_bytes(raw: bytes) -> str:
    """Decode HTML bytes to text, sniffing the charset when UTF-8 fails.

    Decoding is attempted in this order:

    1. Strict UTF-8 (a leading UTF-8 BOM, if present, is dropped).
    2. The charset declared in a ``<meta>`` tag within the first 2000 bytes.
    3. ISO-8859-1, which maps every byte value and therefore never fails.

    Args:
        raw: The undecoded document bytes.

    Returns:
        The decoded document text.
    """
    try:
        # Strict decoding either succeeds or raises; it never inserts U+FFFD.
        # A U+FFFD in the result was therefore really present in the document
        # and is no reason to distrust the UTF-8 decode (re-decoding such
        # input as latin-1 would only turn every non-ASCII char into mojibake).
        # "utf-8-sig" is plain UTF-8 that additionally strips a leading BOM.
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        pass

    # Peek at the ASCII-safe portion of the head to find a declared charset.
    ascii_head = raw[:2000].decode("ascii", errors="ignore")
    declared = _CHARSET_RE.search(ascii_head)
    if declared:
        charset = declared.group(1).lower().replace("_", "-")
        try:
            return raw.decode(charset)
        except (UnicodeError, LookupError):
            # LookupError: unknown label or a non-text codec.
            # UnicodeError (not just UnicodeDecodeError): some codecs, e.g.
            # "undefined", raise the base class; the label comes from
            # untrusted markup, so it must never crash the decode.
            pass

    # Last resort: iso-8859-1 (covers all byte values).
    return raw.decode("iso-8859-1")


def strip_html(html_text: str) -> str:
    """Convert HTML to plain text preserving legal document structure.

    Processing steps, in order:

    1. Drop ``<script>``/``<style>`` blocks and HTML comments entirely.
    2. Turn block-level tags (opening and closing) and ``<br>`` into newlines
       so paragraph structure (e.g. headings before § signs) survives.
    3. Separate table cells with a space so neighbouring cell texts do not
       run together.
    4. Remove all remaining tags and decode HTML entities.
    5. Strip trailing whitespace from every line and collapse runs of blank
       lines to a single blank line.

    Args:
        html_text: The HTML markup as text.

    Returns:
        The extracted plain text with leading/trailing whitespace removed.
    """
    text = html_text
    # Remove script/style blocks including their content.
    text = re.sub(
        r'<(script|style)\b[^>]*>.*?</\1\s*>', '', text,
        flags=re.DOTALL | re.IGNORECASE,
    )
    # Remove comments explicitly: the generic tag pattern below stops at the
    # first '>' and would leak the rest of a comment such as <!-- a > b -->.
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Block elements → newline (preserves § paragraph structure).
    # Both opening and closing tags produce a newline (e.g., <h3> before §).
    text = re.sub(
        r'</?(?:div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)\b[^>]*>',
        '\n', text, flags=re.IGNORECASE,
    )
    text = re.sub(r'<br\b[^>]*>', '\n', text, flags=re.IGNORECASE)
    # Keep adjacent table cells apart: <td>a</td><td>b</td> → "a b", not "ab".
    text = re.sub(r'</t[dh]\s*>', ' ', text, flags=re.IGNORECASE)
    # Strip remaining tags
    text = re.sub(r'<[^>]+>', '', text)
    # Decode HTML entities (&#246; → ö, &sect; → §)
    text = unescape(text)
    # Clean up excessive whitespace. Trailing whitespace is removed per line
    # first (indentation from the markup, \r, &nbsp; spacers); otherwise
    # whitespace-only lines would hide blank-line runs from the collapse.
    text = '\n'.join(line.rstrip() for line in text.split('\n'))
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()