ddad58f607
HTML files from gesetze-im-internet.de were decoded as raw UTF-8, keeping <div>/<p> tags intact. The legal chunker regex requires § at line start, which never matched inside HTML tags → 0% section metadata for HTML docs. Fix: detect HTML content and strip tags before sending to embedding service. Block elements become newlines, entities are decoded. § signs now appear at line starts → section detection works. Also adds D5 re-ingestion scripts (reingest_d5.py + config) for batch re-processing of all documents in Qdrant collections. 27 rag-service tests passing. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
93 lines
2.9 KiB
Python
93 lines
2.9 KiB
Python
"""D5 Re-Ingestion: Constants, helpers, progress tracking."""
|
|
|
|
import json
|
|
import logging
|
|
import os
|
|
|
|
# Named logger used by the D5 re-ingestion helpers.
logger = logging.getLogger("d5-reingest")

# ---------------------------------------------------------------------------
# Defaults (overridable via CLI args)
# ---------------------------------------------------------------------------
DEFAULT_RAG_URL = "https://macmini:8097"
DEFAULT_QDRANT_URL = "http://macmini:6333"

# Collections targeted by the re-ingestion run.
TARGET_COLLECTIONS = [
    "bp_compliance_ce",
    "bp_compliance_gesetze",
    "bp_compliance_datenschutz",
    "bp_dsfa_corpus",
    "bp_legal_templates",
    "bp_compliance_schulrecht",
]

# New chunking parameters (D1-D4 validated).
CHUNK_STRATEGY = "recursive"
CHUNK_SIZE = 1500
CHUNK_OVERLAP = 100

# Default file names for the progress state and the manifest.
PROGRESS_FILE = "d5_reingest_progress.json"
MANIFEST_FILE = "d5_manifest.json"

# Upload form fields that come from the payload (not metadata_json).
FORM_FIELDS = frozenset(("data_type", "bundesland", "use_case", "year"))

# Per-chunk fields (NOT carried as extra metadata during re-upload).
# Every form field is also a per-chunk field, so this set is a superset of
# FORM_FIELDS.
PER_CHUNK_FIELDS = FORM_FIELDS | frozenset((
    "chunk_text",
    "chunk_index",
    "document_id",
    "object_name",
    "filename",
    "section",
    "section_title",
    "paragraph",
    "paragraph_num",
    "page",
))
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Progress tracking
|
|
# ---------------------------------------------------------------------------
|
|
def load_progress(path: str = PROGRESS_FILE) -> dict:
    """Load the re-ingestion progress state from a JSON file.

    Args:
        path: Location of the JSON progress file.

    Returns:
        The parsed JSON content of ``path``, or ``{"documents": {}}`` when
        the file does not exist.

    Raises:
        json.JSONDecodeError: The file exists but does not contain valid JSON.
        OSError: The file exists but cannot be opened (e.g. permissions).
    """
    # Open directly instead of checking os.path.exists() first: that avoids
    # the window between check and open, and an unreadable file now surfaces
    # as an error rather than silently restarting from empty progress.
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {"documents": {}}
|
|
|
|
|
|
def save_progress(data: dict, path: str = PROGRESS_FILE) -> None:
    """Persist the progress state to ``path`` as pretty-printed UTF-8 JSON.

    The data is first written to a sibling ``<path>.tmp`` file and then moved
    over ``path`` with ``os.replace``. An interruption in the middle of the
    write therefore leaves the previous progress file intact instead of a
    truncated, unparseable one.

    Args:
        data: Progress state to serialize. Values that are not JSON
            serializable are converted with ``str`` (``default=str``).
        path: Destination of the JSON progress file.

    Raises:
        OSError: The temporary file cannot be written or moved into place.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False, default=str)
        os.replace(tmp_path, path)
    except BaseException:
        # Best-effort cleanup of the partial temp file; the original error is
        # what the caller needs to see, so a failed removal is ignored.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Metadata extraction
|
|
# ---------------------------------------------------------------------------
|
|
def extract_doc_metadata(payload: dict) -> dict:
    """Split a Qdrant payload into upload form fields and extra metadata.

    Each payload key is routed as follows:

    * keys in ``FORM_FIELDS`` are copied into ``"form"``;
    * the remaining keys in ``PER_CHUNK_FIELDS`` are dropped;
    * every other key is copied into ``"extra"``.

    ``FORM_FIELDS`` has to be tested first: every form field is also listed
    in ``PER_CHUNK_FIELDS``, so testing the per-chunk set first would discard
    the form fields and leave ``"form"`` permanently empty.

    Args:
        payload: Payload dict of a single Qdrant point.

    Returns:
        ``{"form": {data_type, bundesland, ...}, "extra": {regulation_code, ...}}``
        with keys in the order they appear in ``payload``.
    """
    form = {}
    extra = {}
    for key, value in payload.items():
        if key in FORM_FIELDS:
            form[key] = value
        elif key not in PER_CHUNK_FIELDS:
            extra[key] = value
    return {"form": form, "extra": extra}
|
|
|
|
|
|
def doc_key(object_name: str, collection: str) -> str:
    """Build the progress-file key that identifies one document.

    The key is the object name and the collection joined by a ``|``.
    """
    return "{}|{}".format(object_name, collection)
|
|
|
|
|
|
def content_type_from_filename(filename: str) -> str:
    """Map a filename's extension to a MIME type.

    The extension is matched case-insensitively. Filenames without an
    extension, or with one that is not listed, yield
    ``"application/octet-stream"``.
    """
    mime_by_suffix = {
        ".pdf": "application/pdf",
        ".html": "text/html",
        ".htm": "text/html",
        ".md": "text/markdown",
        ".txt": "text/plain",
    }
    _, suffix = os.path.splitext(filename)
    return mime_by_suffix.get(suffix.lower(), "application/octet-stream")
|