"""HTML detection and stripping for legal document ingestion."""
import re
from html import unescape
# Opening tags treated as evidence of HTML. Matching is case-insensitive and
# the trailing \b keeps "<p" from matching longer names such as "<pre".
_HTML_TAG_RE = re.compile(
    r'<(html|head|body|div|p|span|table)\b',
    re.IGNORECASE,
)


def looks_like_html(text: str) -> bool:
    """Report whether the beginning of *text* looks like HTML markup.

    Only the first 500 characters are examined, and only for an opening
    ``html``, ``head``, ``body``, ``div``, ``p``, ``span`` or ``table`` tag.

    Args:
        text: The document text to inspect.

    Returns:
        ``True`` if one of the listed tags starts within the inspected
        prefix, ``False`` otherwise (including for an empty string).
    """
    prefix = text[:500]
    return _HTML_TAG_RE.search(prefix) is not None
# Patterns used by strip_html, compiled once at import time.
# <script>/<style> elements including their content; \1 ties the closing tag
# to the opening one, and \b stops e.g. "<styled-x>" from matching.
_SCRIPT_STYLE_RE = re.compile(
    r'<(script|style)\b[^>]*>.*?</\1\s*>',
    re.DOTALL | re.IGNORECASE,
)
# HTML comments, which may span lines and may contain ">".
_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
# Closing tags of block-level elements; each one ends a line of text.
_BLOCK_CLOSE_RE = re.compile(
    r'</(?:div|p|h[1-6]|li|tr|section|article|blockquote)\s*>',
    re.IGNORECASE,
)
# <br>, <br/>, <br /> and <br ...attributes>.
_LINE_BREAK_RE = re.compile(r'<br\b[^>]*>', re.IGNORECASE)
# Any tag still present after the structural replacements.
_ANY_TAG_RE = re.compile(r'<[^>]+>')
# Three or more consecutive newlines.
_BLANK_RUN_RE = re.compile(r'\n{3,}')


def strip_html(html_text: str) -> str:
    """Convert HTML to plain text preserving legal document structure.

    Processing order:

    1. HTML comments and ``<script>``/``<style>`` elements are removed
       together with their content.
    2. Closing block-level tags (``div``, ``p``, ``h1``-``h6``, ``li``,
       ``tr``, ``section``, ``article``, ``blockquote``) and ``<br>`` are
       replaced by a newline, so paragraph structure survives.
    3. Every remaining tag is deleted.
    4. Character references are decoded (``&ouml;`` -> ``ö``,
       ``&sect;`` -> ``§``).
    5. Runs of three or more newlines collapse to a single blank line and
       leading/trailing whitespace is trimmed.

    Args:
        html_text: HTML markup to convert.

    Returns:
        The plain-text content of ``html_text``.
    """
    text = html_text

    # Comments go first: a comment containing ">" would otherwise be only
    # partially removed by the generic tag pattern, and markup inside a
    # comment must not be interpreted as real tags.
    text = _COMMENT_RE.sub('', text)

    # Remove script/style blocks, content included.
    text = _SCRIPT_STYLE_RE.sub('', text)

    # Block elements -> newline (preserves § paragraph structure).
    text = _BLOCK_CLOSE_RE.sub('\n', text)
    text = _LINE_BREAK_RE.sub('\n', text)

    # Strip remaining tags.
    text = _ANY_TAG_RE.sub('', text)

    # Decode HTML entities only after tag stripping, so escaped markup such
    # as "&lt;b&gt;" is kept as literal text instead of being removed.
    text = unescape(text)

    # Clean up excessive whitespace.
    text = _BLANK_RUN_RE.sub('\n\n', text)
    return text.strip()