fix(rag): strip HTML tags before chunking + D5 re-ingestion scripts
HTML files from gesetze-im-internet.de were decoded as raw UTF-8, keeping <div>/<p> tags intact. The legal chunker regex requires § at line start, which never matched inside HTML tags → 0% section metadata for HTML docs. Fix: detect HTML content and strip tags before sending to embedding service. Block elements become newlines, entities are decoded. § signs now appear at line starts → section detection works. Also adds D5 re-ingestion scripts (reingest_d5.py + config) for batch re-processing of all documents in Qdrant collections. 27 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,31 @@
|
||||
"""HTML detection and stripping for legal document ingestion."""
|
||||
|
||||
import re
|
||||
from html import unescape
|
||||
|
||||
# Opening tags whose presence is taken as evidence that a document is HTML.
_HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)

# Leading whitespace and/or a byte-order mark that may precede the markup.
_LEADING_NOISE_RE = re.compile(r'[\s\ufeff]*')

# Number of characters inspected after the leading noise has been skipped.
_HTML_SNIFF_WINDOW = 500


def looks_like_html(text: str) -> bool:
    """Return True if the start of *text* contains a typical HTML tag.

    Only a short window at the beginning of the document is inspected, so
    the check stays cheap for large inputs. Leading whitespace and a
    byte-order mark are skipped first; otherwise a file that begins with a
    long blank or indented preamble would be misclassified as plain text.

    Args:
        text: Decoded document content.

    Returns:
        True when one of the tags html, head, body, div, p, span or table
        opens within the inspected window; False otherwise, including for
        empty input.
    """
    if not text:
        return False
    # Use pos/endpos instead of slicing so no copy of the document is made.
    start = _LEADING_NOISE_RE.match(text).end()
    window_end = start + _HTML_SNIFF_WINDOW
    return _HTML_TAG_RE.search(text, start, window_end) is not None
|
||||
|
||||
|
||||
# Patterns used by strip_html, compiled once at import time.
_SCRIPT_STYLE_RE = re.compile(
    r'<(script|style)\b[^>]*>.*?</\1\s*>', re.DOTALL | re.IGNORECASE,
)
_COMMENT_RE = re.compile(r'<!--.*?-->', re.DOTALL)
_BLOCK_CLOSE_RE = re.compile(
    r'</(div|p|h[1-6]|li|tr|section|article|blockquote)\s*>', re.IGNORECASE,
)
_LINE_BREAK_RE = re.compile(r'<br\s*/?>', re.IGNORECASE)
_ANY_TAG_RE = re.compile(r'<[^>]+>')
_BLANK_LINES_RE = re.compile(r'\n{3,}')


def strip_html(html_text: str) -> str:
    """Convert HTML to plain text preserving legal document structure.

    Processing steps, in order:

    1. Drop ``<script>``/``<style>`` blocks and HTML comments entirely.
    2. Turn closing block-level tags and ``<br>`` into newlines so that
       each paragraph ends up on its own line.
    3. Remove every remaining tag.
    4. Decode HTML entities.
    5. Strip leading/trailing whitespace from every line and collapse runs
       of blank lines, so that a ``§`` heading that was indented in the
       markup ends up at the very start of its line.

    Args:
        html_text: Decoded HTML document or fragment.

    Returns:
        The plain-text content, or an empty string for empty input.
    """
    if not html_text:
        return ""

    # Remove script/style blocks (content included), then comments. Comments
    # must go before generic tag stripping: a '>' inside a comment would
    # otherwise end the match early and leak the rest of the comment.
    text = _SCRIPT_STYLE_RE.sub('', html_text)
    text = _COMMENT_RE.sub('', text)

    # Block elements → newline (preserves § paragraph structure)
    text = _BLOCK_CLOSE_RE.sub('\n', text)
    text = _LINE_BREAK_RE.sub('\n', text)

    # Strip remaining tags
    text = _ANY_TAG_RE.sub('', text)

    # Decode HTML entities (&ouml; → ö, &sect; → §). This runs after tag
    # removal so that escaped markup such as &lt;b&gt; survives as text.
    text = unescape(text)

    # Markup indentation would otherwise remain in front of the text and
    # whitespace-only lines would defeat the blank-line collapse below.
    text = '\n'.join(line.strip() for line in text.split('\n'))

    # Clean up excessive whitespace
    text = _BLANK_LINES_RE.sub('\n\n', text)
    return text.strip()
|
||||
Reference in New Issue
Block a user