fix(rag): strip HTML tags before chunking + D5 re-ingestion scripts

HTML files from gesetze-im-internet.de were decoded as raw UTF-8, keeping <div>/<p> tags intact. The legal chunker regex requires § at line start, which never matched inside HTML tags → 0% section metadata for HTML docs. Fix: detect HTML content and strip tags before sending to embedding service. Block elements become newlines, entities are decoded. § signs now appear at line starts → section detection works. Also adds D5 re-ingestion scripts (reingest_d5.py + config) for batch re-processing of all documents in Qdrant collections. 27 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-02 08:18:25 +02:00
parent 93099b2770
commit ddad58f607
5 changed files with 698 additions and 0 deletions
@@ -0,0 +1,31 @@
+"""HTML detection and stripping for legal document ingestion."""
+
+import re
+from html import unescape
+
+# Opening tags treated as evidence of HTML markup; matched case-insensitively.
+_HTML_TAG_RE = re.compile(r'<(html|head|body|div|p|span|table)\b', re.IGNORECASE)
+
+
+def looks_like_html(text: str) -> bool:
+    """Report whether the beginning of *text* contains a common HTML tag.
+
+    Only the first 500 characters are inspected, so markup that first
+    appears later in the document is not detected.
+    """
+    head = text[:500]
+    found = _HTML_TAG_RE.search(head)
+    return found is not None
+
+
+def strip_html(html_text: str) -> str:
+    """Convert HTML to plain text preserving legal document structure.
+
+    Comments and script/style blocks are dropped, closing block-level tags
+    and ``<br>`` become newlines, all remaining tags are removed and HTML
+    entities are decoded. Every resulting line is trimmed so that a leading
+    ``§`` really sits at the start of its line even when the HTML source was
+    indented, and runs of blank lines are collapsed to a single blank line.
+
+    Args:
+        html_text: Raw HTML markup.
+
+    Returns:
+        The extracted plain text without leading or trailing whitespace.
+    """
+    text = html_text
+    # Remove comments as a unit first: the generic tag pattern below stops at
+    # the first '>' and would leak the tail of a comment such as
+    # '<!-- a > b -->' into the output.
+    text = re.sub(r'<!--.*?-->', '', text, flags=re.DOTALL)
+    # Remove script/style blocks including their content. The \b keeps
+    # look-alike tag names (e.g. <stylesheet>) from being treated as <style>.
+    text = re.sub(
+        r'<(script|style)\b[^>]*>.*?</\1\s*>',
+        '', text, flags=re.DOTALL | re.IGNORECASE,
+    )
+    # Closing block elements → newline (preserves § paragraph structure).
+    # Whitespace before '>' is tolerated ('</p >').
+    text = re.sub(
+        r'</(?:div|p|h[1-6]|li|tr|section|article|blockquote)\s*>',
+        '\n', text, flags=re.IGNORECASE,
+    )
+    # <br>, <br/>, <br /> and <br clear="all"> are all line breaks.
+    text = re.sub(r'<br\b[^>]*>', '\n', text, flags=re.IGNORECASE)
+    # Strip remaining tags
+    text = re.sub(r'<[^>]+>', '', text)
+    # Decode HTML entities (&#246; → ö, &sect; → §). This runs after tag
+    # stripping so that escaped markup such as '&lt;p&gt;' stays literal text.
+    text = unescape(text)
+    # Trim every line: source indentation, '\r' from CRLF files and decoded
+    # &nbsp; would otherwise keep '§' away from the line start and keep
+    # whitespace-only lines from being recognised as blank.
+    text = '\n'.join(line.strip() for line in text.split('\n'))
+    # Clean up excessive blank lines
+    text = re.sub(r'\n{3,}', '\n\n', text)
+    return text.strip()