fix(rag): strip HTML tags before chunking + D5 re-ingestion scripts

HTML files from gesetze-im-internet.de were decoded as raw UTF-8, keeping
<div>/<p> tags intact. The legal chunker regex requires § at line start,
which never matched inside HTML tags → 0% section metadata for HTML docs.

Fix: detect HTML content and strip tags before sending to embedding
service. Block elements become newlines, entities are decoded.
§ signs now appear at line starts → section detection works.

Also adds D5 re-ingestion scripts (reingest_d5.py + config) for
batch re-processing of all documents in Qdrant collections.

27 rag-service tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-02 08:18:25 +02:00
parent 93099b2770
commit ddad58f607
5 changed files with 698 additions and 0 deletions
@@ -0,0 +1,92 @@
"""D5 Re-Ingestion: Constants, helpers, progress tracking."""
import json
import logging
import os
# Module-level logger for all D5 re-ingestion helpers.
logger = logging.getLogger("d5-reingest")
# ---------------------------------------------------------------------------
# Defaults (overridable via CLI args)
# ---------------------------------------------------------------------------
# Service endpoints on the ingestion host (HTTPS for the RAG API,
# plain HTTP for the local Qdrant instance).
DEFAULT_RAG_URL = "https://macmini:8097"
DEFAULT_QDRANT_URL = "http://macmini:6333"
# Qdrant collections that the D5 batch re-ingestion run processes.
TARGET_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
    "bp_compliance_schulrecht",
]
# New chunking parameters (D1-D4 validated)
CHUNK_STRATEGY = "recursive"
CHUNK_SIZE = 1500  # chunk size (presumably characters — confirm against chunker)
CHUNK_OVERLAP = 100  # overlap between consecutive chunks, same unit as CHUNK_SIZE
# JSON state files written next to the script: resumable progress + manifest.
PROGRESS_FILE = "d5_reingest_progress.json"
MANIFEST_FILE = "d5_manifest.json"
# Per-chunk fields (NOT carried as extra metadata during re-upload)
PER_CHUNK_FIELDS = frozenset({
    "chunk_text", "chunk_index", "document_id", "object_name",
    "filename", "data_type", "bundesland", "use_case", "year",
    "section", "section_title", "paragraph", "paragraph_num", "page",
})
# Upload form fields that come from the payload (not metadata_json)
# NOTE: every FORM_FIELDS key also appears in PER_CHUNK_FIELDS above, so any
# consumer must test FORM_FIELDS membership BEFORE skipping per-chunk keys.
FORM_FIELDS = frozenset({"data_type", "bundesland", "use_case", "year"})
# ---------------------------------------------------------------------------
# Progress tracking
# ---------------------------------------------------------------------------
def load_progress(path: str = PROGRESS_FILE) -> dict:
    """Load the re-ingestion progress file.

    Args:
        path: Path to the JSON progress file (defaults to PROGRESS_FILE).

    Returns:
        The parsed progress dict, or a fresh ``{"documents": {}}`` skeleton
        when the file does not exist yet (first run).
    """
    # EAFP: open directly and handle the missing file, instead of the racy
    # exists()/open() pair (file could vanish between the two calls, and the
    # old form did two filesystem lookups).
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"documents": {}}
def save_progress(data: dict, path: str = PROGRESS_FILE):
    """Persist the progress dict as pretty-printed UTF-8 JSON.

    ``default=str`` stringifies any value json cannot serialize natively
    (e.g. datetimes), so a save never fails on exotic payload values.
    """
    serialized = json.dumps(data, indent=2, ensure_ascii=False, default=str)
    with open(path, "w", encoding="utf-8") as out:
        out.write(serialized)
# ---------------------------------------------------------------------------
# Metadata extraction
# ---------------------------------------------------------------------------
def extract_doc_metadata(payload: dict) -> dict:
    """Split Qdrant payload into form fields + extra metadata.

    Args:
        payload: A chunk's Qdrant point payload.

    Returns: {"form": {data_type, bundesland, ...}, "extra": {regulation_code, ...}}
    """
    form = {}
    extra = {}
    for k, v in payload.items():
        # BUG FIX: FORM_FIELDS is a subset of PER_CHUNK_FIELDS, so the old
        # order (skip PER_CHUNK_FIELDS first) dropped every form field and
        # always returned an empty "form" dict. Classify form fields FIRST.
        if k in FORM_FIELDS:
            form[k] = v
        elif k in PER_CHUNK_FIELDS:
            continue  # chunk-level bookkeeping, not document metadata
        else:
            extra[k] = v
    return {"form": form, "extra": extra}
def doc_key(object_name: str, collection: str) -> str:
    """Unique key for a document in the progress file.

    The key is the object name and the collection name joined with '|'.
    """
    return "|".join((object_name, collection))
def content_type_from_filename(filename: str) -> str:
    """Infer MIME type from file extension.

    Matching is case-insensitive; unknown or missing extensions fall back
    to application/octet-stream.
    """
    mime_by_ext = {
        ".pdf": "application/pdf",
        ".html": "text/html",
        ".htm": "text/html",
        ".md": "text/markdown",
        ".txt": "text/plain",
    }
    _, ext = os.path.splitext(filename)
    return mime_by_ext.get(ext.lower(), "application/octet-stream")