fix(rag): strip HTML tags before chunking + D5 re-ingestion scripts
HTML files from gesetze-im-internet.de were decoded as raw UTF-8, leaving <div>/<p> tags intact. The legal chunker's regex requires § at the start of a line, which never matches inside HTML markup → 0% section metadata for HTML documents.

Fix: detect HTML content and strip the tags before sending the text to the embedding service. Block-level elements become newlines and entities are decoded, so § signs now appear at line starts and section detection works.

Also adds the D5 re-ingestion scripts (reingest_d5.py + config) for batch re-processing of all documents in the Qdrant collections.

27 rag-service tests passing.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
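The tag-stripping code itself lives in the rag-service and is not part of this diff. A minimal sketch of the approach described above, using only the Python standard library; the helper names (looks_like_html, strip_html) are illustrative, not the actual implementation:

import re
from html import unescape

# Block-level tags that should break lines so "§ ..." lands at a line start.
BLOCK_TAGS = r"p|div|br|li|h[1-6]|tr|td|table|section|article"

def looks_like_html(text: str) -> bool:
    """Cheap heuristic: does the decoded document contain HTML markup?"""
    return bool(re.search(r"<(?:html|body|div|p)\b", text, re.IGNORECASE))

def strip_html(text: str) -> str:
    """Replace block tags with newlines, drop remaining tags, decode entities."""
    text = re.sub(rf"</?(?:{BLOCK_TAGS})\b[^>]*>", "\n", text, flags=re.IGNORECASE)
    text = re.sub(r"<[^>]+>", "", text)            # remaining inline tags (<span>, <a>, ...)
    text = unescape(text)                          # &sect; -> §, &amp; -> &
    return re.sub(r"\n{3,}", "\n\n", text).strip()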
@@ -0,0 +1,92 @@
"""D5 Re-Ingestion: Constants, helpers, progress tracking."""

import json
import logging
import os

logger = logging.getLogger("d5-reingest")

# ---------------------------------------------------------------------------
# Defaults (overridable via CLI args)
# ---------------------------------------------------------------------------
DEFAULT_RAG_URL = "https://macmini:8097"
DEFAULT_QDRANT_URL = "http://macmini:6333"

TARGET_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
    "bp_compliance_schulrecht",
]

# New chunking parameters (D1-D4 validated)
CHUNK_STRATEGY = "recursive"
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 100

PROGRESS_FILE = "d5_reingest_progress.json"
MANIFEST_FILE = "d5_manifest.json"

# Per-chunk fields (NOT carried as extra metadata during re-upload)
PER_CHUNK_FIELDS = frozenset({
    "chunk_text", "chunk_index", "document_id", "object_name",
    "filename", "data_type", "bundesland", "use_case", "year",
    "section", "section_title", "paragraph", "paragraph_num", "page",
})

# Upload form fields that come from the payload (not metadata_json)
FORM_FIELDS = frozenset({"data_type", "bundesland", "use_case", "year"})


# ---------------------------------------------------------------------------
# Progress tracking
# ---------------------------------------------------------------------------
def load_progress(path: str = PROGRESS_FILE) -> dict:
    if os.path.exists(path):
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    return {"documents": {}}


def save_progress(data: dict, path: str = PROGRESS_FILE):
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2, ensure_ascii=False, default=str)


# ---------------------------------------------------------------------------
# Metadata extraction
# ---------------------------------------------------------------------------
def extract_doc_metadata(payload: dict) -> dict:
    """Split Qdrant payload into form fields + extra metadata.

    Returns: {"form": {data_type, bundesland, ...}, "extra": {regulation_code, ...}}
    """
    form = {}
    extra = {}
    for k, v in payload.items():
        if k in FORM_FIELDS:
            form[k] = v
        elif k in PER_CHUNK_FIELDS:
            continue
        else:
            extra[k] = v
    return {"form": form, "extra": extra}


def doc_key(object_name: str, collection: str) -> str:
    """Unique key for a document in the progress file."""
    return f"{object_name}|{collection}"


def content_type_from_filename(filename: str) -> str:
    """Infer MIME type from file extension."""
    ext = os.path.splitext(filename)[1].lower()
    return {
        ".pdf": "application/pdf",
        ".html": "text/html",
        ".htm": "text/html",
        ".md": "text/markdown",
        ".txt": "text/plain",
    }.get(ext, "application/octet-stream")
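For illustration only (not part of the committed module): how the helpers above fit together in the re-ingestion flow, with a made-up Qdrant payload:

# Hypothetical usage; payload values are invented for illustration.
payload = {
    "object_name": "docs/bdsg.html", "filename": "bdsg.html",
    "data_type": "gesetz", "bundesland": "bund", "year": 2024,
    "chunk_text": "§ 1 Anwendungsbereich ...", "chunk_index": 0,
    "regulation_code": "BDSG",
}

meta = extract_doc_metadata(payload)
# meta["form"]  -> {"data_type": "gesetz", "bundesland": "bund", "year": 2024}
# meta["extra"] -> {"regulation_code": "BDSG"}

progress = load_progress()
key = doc_key(payload["object_name"], "bp_compliance_gesetze")
progress["documents"][key] = {"status": "done"}
save_progress(progress)

content_type_from_filename(payload["filename"])  # -> "text/html"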