fix(rag): strip HTML tags before chunking + D5 re-ingestion scripts

HTML files from gesetze-im-internet.de were decoded as raw UTF-8, keeping
<div>/<p> tags intact. The legal chunker regex requires § at line start,
which never matched inside HTML tags → 0% section metadata for HTML docs.

Fix: detect HTML content and strip tags before sending to embedding
service. Block elements become newlines, entities are decoded.
§ signs now appear at line starts → section detection works.

Also adds D5 re-ingestion scripts (reingest_d5.py + config) for
batch re-processing of all documents in Qdrant collections.

27 rag-service tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-02 08:18:25 +02:00
parent 93099b2770
commit ddad58f607
5 changed files with 698 additions and 0 deletions
+31
View File
@@ -0,0 +1,31 @@
"""HTML detection and stripping for legal document ingestion."""
import re
from html import unescape
# Opening of a common structural HTML tag (\b prevents false hits such as
# "<para"); IGNORECASE also catches legacy upper-case markup like <DIV>.
_HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)
def looks_like_html(text: str) -> bool:
    """Heuristic HTML sniff: look for a structural tag near the start.

    Only the first 500 characters are inspected, which is enough to spot
    <html>/<head>/<body>/... in real documents while staying cheap on
    large plain-text inputs.
    """
    head = text[:500]
    match = re.search(r'<(html|head|body|div|p|span|table)\b', head, re.IGNORECASE)
    return match is not None
def strip_html(html_text: str) -> str:
    """Convert HTML to plain text preserving legal document structure.

    Closing block elements and <br> become newlines so that section
    markers (e.g. "§ 1") land at the start of a line, which the
    downstream legal chunker requires.

    Args:
        html_text: Raw HTML (or already-plain) document text.

    Returns:
        Plain text with tags removed, entities decoded, and whitespace
        normalized; leading/trailing whitespace stripped.
    """
    text = html_text
    # Remove script/style blocks entirely — their content is never document text.
    text = re.sub(r'<(script|style)[^>]*>.*?</\1>', '', text, flags=re.DOTALL | re.IGNORECASE)
    # Closing block elements → newline (preserves § paragraph structure).
    text = re.sub(
        r'</(div|p|h[1-6]|li|tr|section|article|blockquote)>',
        '\n', text, flags=re.IGNORECASE,
    )
    # <br>, <br/>, <br /> and <br class="..."> all break the line.
    text = re.sub(r'<br\b[^>]*>', '\n', text, flags=re.IGNORECASE)
    # Strip all remaining tags.
    text = re.sub(r'<[^>]+>', '', text)
    # Decode entities AFTER tag stripping so literal "&lt;div&gt;" in the
    # source is not mistaken for a real tag (&#246; → ö, &sect; → §).
    text = unescape(text)
    # Bug fix: source HTML is usually indented, so stripped lines kept
    # leading spaces ("  § 1 ...") and § never sat at column 0, defeating
    # the chunker's line-start regex. Trim horizontal whitespace around
    # each newline before collapsing blank lines.
    text = re.sub(r'[ \t]+\n', '\n', text)
    text = re.sub(r'\n[ \t]+', '\n', text)
    # Collapse runs of blank lines.
    text = re.sub(r'\n{3,}', '\n\n', text)
    return text.strip()