ddad58f607
HTML files from gesetze-im-internet.de were decoded as raw UTF-8, keeping <div>/<p> tags intact. The legal chunker regex requires § at line start, which never matched inside HTML tags → 0% section metadata for HTML docs. Fix: detect HTML content and strip tags before sending to embedding service. Block elements become newlines, entities are decoded. § signs now appear at line starts → section detection works. Also adds D5 re-ingestion scripts (reingest_d5.py + config) for batch re-processing of all documents in Qdrant collections. 27 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
32 lines
1.1 KiB
Python
32 lines
1.1 KiB
Python
"""HTML detection and stripping for legal document ingestion."""
|
|
|
|
import re
|
|
from html import unescape
|
|
|
|
# Common structural tags; matching any of them near the start of a document
# is a strong signal the payload is HTML rather than plain text.
_HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)


def looks_like_html(text: str) -> bool:
    """Heuristically decide whether *text* is HTML markup.

    Only the first 500 characters are inspected, which is enough for any
    well-formed document (the <html>/<head>/<body> tags appear up front)
    and keeps the check cheap on large inputs.
    """
    sample = text[:500]
    return _HTML_TAG_RE.search(sample) is not None
|
|
|
|
|
|
def strip_html(html_text: str) -> str:
    """Convert HTML to plain text preserving legal document structure.

    Closing block-level tags and <br> become newlines so that section
    markers (e.g. "§ 1") end up at the start of a line, which the legal
    chunker's line-anchored regex requires.  Entities are decoded only
    AFTER tags are stripped, so entity-encoded markup in the content
    (e.g. "&lt;p&gt;") survives as literal text.

    Args:
        html_text: Raw HTML source of a document.

    Returns:
        Plain text with block structure mapped to newlines, entities
        decoded, runs of 3+ newlines collapsed to 2, and outer
        whitespace stripped.
    """
    text = html_text
    # Remove script/style blocks wholesale — their content is not document text.
    text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Remove HTML comments BEFORE the generic tag strip.  The generic
    # <[^>]+> pattern stops at the first '>', so a comment like
    # "<!-- a > b -->" would otherwise leave " b -->" residue in the
    # output (and comments containing </p> would inject fake newlines).
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Block elements → newline (preserves § paragraph structure)
    text = re.sub(
        r'</(div|p|h[1-6]|li|tr|section|article|blockquote)>',
        '\n', text, flags=re.IGNORECASE,
    )
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    # Strip remaining tags
    text = re.sub(r'<[^>]+>', '', text)
    # Decode HTML entities (&ouml; → ö, &#167; → §)
    text = unescape(text)
    # Clean up excessive whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
|