ddad58f607
HTML files from gesetze-im-internet.de were decoded as raw UTF-8, keeping <div>/<p> tags intact. The legal chunker regex requires § at line start, which never matched inside HTML tags → 0% section metadata for HTML docs. Fix: detect HTML content and strip tags before sending to embedding service. Block elements become newlines, entities are decoded. § signs now appear at line starts → section detection works. Also adds D5 re-ingestion scripts (reingest_d5.py + config) for batch re-processing of all documents in Qdrant collections. 27 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
32 lines
1.1 KiB
Python
32 lines
1.1 KiB
Python
"""HTML detection and stripping for legal document ingestion."""
|
|
|
|
import re
|
|
from html import unescape
|
|
|
|
# Common structural tags; matching any of them near the start of a document
# is a strong signal the payload is HTML rather than plain text.
_HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)


def looks_like_html(text: str) -> bool:
    """Heuristically decide whether *text* is HTML markup.

    Only the first 500 characters are inspected, which is enough for any
    well-formed document (the <html>/<head>/<body> tags appear up front)
    and keeps the check cheap on large inputs.
    """
    sample = text[:500]
    return _HTML_TAG_RE.search(sample) is not None
|
|
|
|
|
|
def strip_html(html_text: str) -> str:
    """Convert HTML to plain text preserving legal document structure.

    Closing block-level tags and <br> become newlines so that section
    markers (e.g. "§ 1") end up at the start of a line, which the legal
    chunker's line-anchored regex requires.  Entities are decoded only
    AFTER tags are stripped, so entity-encoded markup in the content
    (e.g. "&lt;p&gt;") survives as literal text.

    Args:
        html_text: Raw HTML source of a document.

    Returns:
        Plain text with block structure mapped to newlines, entities
        decoded, runs of 3+ newlines collapsed to 2, and outer
        whitespace stripped.
    """
    text = html_text
    # Remove script/style blocks wholesale — their content is not document text.
    text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Remove HTML comments BEFORE the generic tag strip.  The generic
    # <[^>]+> pattern stops at the first '>', so a comment like
    # "<!-- a > b -->" would otherwise leave " b -->" residue in the
    # output (and comments containing </p> would inject fake newlines).
    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
    # Block elements → newline (preserves § paragraph structure)
    text = re.sub(
        r'</(div|p|h[1-6]|li|tr|section|article|blockquote)>',
        '\n', text, flags=re.IGNORECASE,
    )
    text = re.sub(r'<br\s*/?>', '\n', text, flags=re.IGNORECASE)
    # Strip remaining tags
    text = re.sub(r'<[^>]+>', '', text)
    # Decode HTML entities (&ouml; → ö, &#167; → §)
    text = unescape(text)
    # Clean up excessive whitespace
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()
|