"""HTML detection and stripping for legal document ingestion.""" import re from html import unescape _HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE) def looks_like_html(text: str) -> bool: """Check if text contains HTML tags.""" return bool(_HTML_TAG_RE.search(text[:500])) def strip_html(html_text: str) -> str: """Convert HTML to plain text preserving legal document structure.""" text = html_text # Remove script/style blocks text = re.sub(r'<(script|style)[^>]*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) # Block elements → newline (preserves § paragraph structure) text = re.sub( r'', '\n', text, flags=re.IGNORECASE, ) text = re.sub(r'', '\n', text, flags=re.IGNORECASE) # Strip remaining tags text = re.sub(r'<[^>]+>', '', text) # Decode HTML entities (ö → ö, § → §) text = unescape(text) # Clean up excessive whitespace text = re.sub(r'\n{3,}', '\n\n', text) return text.strip()