fix(rag): strip HTML tags before chunking + D5 re-ingestion scripts

HTML files from gesetze-im-internet.de were decoded as raw UTF-8, keeping
<div>/<p> tags intact. The legal chunker regex requires § at line start,
which never matched inside HTML tags → 0% section metadata for HTML docs.

Fix: detect HTML content and strip tags before sending to embedding
service. Block elements become newlines, entities are decoded.
§ signs now appear at line starts → section detection works.

Also adds D5 re-ingestion scripts (reingest_d5.py + config) for
batch re-processing of all documents in Qdrant collections.

27 rag-service tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-02 08:18:25 +02:00
parent 93099b2770
commit ddad58f607
5 changed files with 698 additions and 0 deletions
@@ -0,0 +1,92 @@
"""D5 Re-Ingestion: Constants, helpers, progress tracking."""
import json
import logging
import os
# Module-level logger for all D5 re-ingestion helpers.
logger = logging.getLogger("d5-reingest")
# ---------------------------------------------------------------------------
# Defaults (overridable via CLI args)
# ---------------------------------------------------------------------------
# Service endpoints on the ingestion host (HTTPS for the RAG API,
# plain HTTP for the local Qdrant instance).
DEFAULT_RAG_URL = "https://macmini:8097"
DEFAULT_QDRANT_URL = "http://macmini:6333"
# Qdrant collections that the D5 batch re-ingestion run processes.
TARGET_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
    "bp_compliance_schulrecht",
]
# New chunking parameters (D1-D4 validated)
CHUNK_STRATEGY = "recursive"
CHUNK_SIZE = 1500  # chunk size (presumably characters — confirm against chunker)
CHUNK_OVERLAP = 100  # overlap between consecutive chunks, same unit as CHUNK_SIZE
# JSON state files written next to the script: resumable progress + manifest.
PROGRESS_FILE = "d5_reingest_progress.json"
MANIFEST_FILE = "d5_manifest.json"
# Per-chunk fields (NOT carried as extra metadata during re-upload)
PER_CHUNK_FIELDS = frozenset({
    "chunk_text", "chunk_index", "document_id", "object_name",
    "filename", "data_type", "bundesland", "use_case", "year",
    "section", "section_title", "paragraph", "paragraph_num", "page",
})
# Upload form fields that come from the payload (not metadata_json)
# NOTE: every FORM_FIELDS key also appears in PER_CHUNK_FIELDS above, so any
# consumer must test FORM_FIELDS membership BEFORE skipping per-chunk keys.
FORM_FIELDS = frozenset({"data_type", "bundesland", "use_case", "year"})
# ---------------------------------------------------------------------------
# Progress tracking
# ---------------------------------------------------------------------------
def load_progress(path: str = PROGRESS_FILE) -> dict:
    """Load the re-ingestion progress file.

    Args:
        path: Path to the JSON progress file (defaults to PROGRESS_FILE).

    Returns:
        The parsed progress dict, or a fresh ``{"documents": {}}`` skeleton
        when the file does not exist yet (first run).
    """
    # EAFP: open directly and handle the missing file, instead of the racy
    # exists()/open() pair (file could vanish between the two calls, and the
    # old form did two filesystem lookups).
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"documents": {}}
def save_progress(data: dict, path: str = PROGRESS_FILE):
    """Persist the progress dict as pretty-printed UTF-8 JSON.

    ``default=str`` stringifies any value json cannot serialize natively
    (e.g. datetimes), so a save never fails on exotic payload values.
    """
    serialized = json.dumps(data, indent=2, ensure_ascii=False, default=str)
    with open(path, "w", encoding="utf-8") as out:
        out.write(serialized)
# ---------------------------------------------------------------------------
# Metadata extraction
# ---------------------------------------------------------------------------
def extract_doc_metadata(payload: dict) -> dict:
    """Split Qdrant payload into form fields + extra metadata.

    Args:
        payload: A chunk's Qdrant point payload.

    Returns: {"form": {data_type, bundesland, ...}, "extra": {regulation_code, ...}}
    """
    form = {}
    extra = {}
    for k, v in payload.items():
        # BUG FIX: FORM_FIELDS is a subset of PER_CHUNK_FIELDS, so the old
        # order (skip PER_CHUNK_FIELDS first) dropped every form field and
        # always returned an empty "form" dict. Classify form fields FIRST.
        if k in FORM_FIELDS:
            form[k] = v
        elif k in PER_CHUNK_FIELDS:
            continue  # chunk-level bookkeeping, not document metadata
        else:
            extra[k] = v
    return {"form": form, "extra": extra}
def doc_key(object_name: str, collection: str) -> str:
    """Unique key for a document in the progress file.

    The key is the object name and the collection name joined with '|'.
    """
    return "|".join((object_name, collection))
def content_type_from_filename(filename: str) -> str:
    """Infer MIME type from file extension.

    Matching is case-insensitive; unknown or missing extensions fall back
    to application/octet-stream.
    """
    mime_by_ext = {
        ".pdf": "application/pdf",
        ".html": "text/html",
        ".htm": "text/html",
        ".md": "text/markdown",
        ".txt": "text/plain",
    }
    _, ext = os.path.splitext(filename)
    return mime_by_ext.get(ext.lower(), "application/octet-stream")