"""HTML detection and stripping for legal document ingestion.""" import re from html import unescape _HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE) _CHARSET_RE = re.compile( r']+charset\s*=\s*["\']?([a-zA-Z0-9_-]+)', re.IGNORECASE, ) def looks_like_html(text: str) -> bool: """Check if text contains HTML tags.""" return bool(_HTML_TAG_RE.search(text[:500])) def decode_html_bytes(raw: bytes) -> str: """Decode HTML bytes with charset detection from meta tags. Tries UTF-8 first, falls back to charset from HTML meta tag, then latin-1. """ try: text = raw.decode("utf-8") # Check if UTF-8 decode produced replacement characters if "\ufffd" not in text: return text except UnicodeDecodeError: pass # Peek at ASCII-safe portion to find charset ascii_head = raw[:2000].decode("ascii", errors="ignore") m = _CHARSET_RE.search(ascii_head) if m: charset = m.group(1).lower().replace("_", "-") try: return raw.decode(charset) except (UnicodeDecodeError, LookupError): pass # Last resort: iso-8859-1 (covers all byte values) return raw.decode("iso-8859-1") def strip_html(html_text: str) -> str: """Convert HTML to plain text preserving legal document structure.""" text = html_text # Remove script/style blocks text = re.sub(r'<(script|style)[^>]*>.*?', '', text, flags=re.DOTALL | re.IGNORECASE) # Block elements → newline (preserves § paragraph structure) # Opening block tags also get newline (e.g.,

before § signs) text = re.sub( r'<(div|p|h[1-6]|li|tr|dt|dd|section|article|blockquote)\b[^>]*>', '\n', text, flags=re.IGNORECASE, ) text = re.sub( r'', '\n', text, flags=re.IGNORECASE, ) text = re.sub(r'', '\n', text, flags=re.IGNORECASE) # Strip remaining tags text = re.sub(r'<[^>]+>', '', text) # Decode HTML entities (ö → ö, § → §) text = unescape(text) # Clean up excessive whitespace text = re.sub(r'\n{3,}', '\n\n', text) return text.strip()