"""D5 Re-Ingestion: Constants, helpers, progress tracking.""" import json import logging import os logger = logging.getLogger("d5-reingest") # --------------------------------------------------------------------------- # Defaults (overridable via CLI args) # --------------------------------------------------------------------------- DEFAULT_RAG_URL = "https://macmini:8097" DEFAULT_QDRANT_URL = "http://macmini:6333" TARGET_COLLECTIONS = [ "bp_compliance_ce", "bp_compliance_gesetze", "bp_compliance_datenschutz", "bp_dsfa_corpus", "bp_legal_templates", "bp_compliance_schulrecht", ] # New chunking parameters (D1-D4 validated) CHUNK_STRATEGY = "recursive" CHUNK_SIZE = 1500 CHUNK_OVERLAP = 100 PROGRESS_FILE = "d5_reingest_progress.json" MANIFEST_FILE = "d5_manifest.json" # Per-chunk fields (NOT carried as extra metadata during re-upload) PER_CHUNK_FIELDS = frozenset({ "chunk_text", "chunk_index", "document_id", "object_name", "filename", "data_type", "bundesland", "use_case", "year", "section", "section_title", "paragraph", "paragraph_num", "page", }) # Upload form fields that come from the payload (not metadata_json) FORM_FIELDS = frozenset({"data_type", "bundesland", "use_case", "year"}) # --------------------------------------------------------------------------- # Progress tracking # --------------------------------------------------------------------------- def load_progress(path: str = PROGRESS_FILE) -> dict: if os.path.exists(path): with open(path, encoding="utf-8") as f: return json.load(f) return {"documents": {}} def save_progress(data: dict, path: str = PROGRESS_FILE): with open(path, "w", encoding="utf-8") as f: json.dump(data, f, indent=2, ensure_ascii=False, default=str) # --------------------------------------------------------------------------- # Metadata extraction # --------------------------------------------------------------------------- def extract_doc_metadata(payload: dict) -> dict: """Split Qdrant payload into form fields + extra metadata. Returns: {"form": {data_type, bundesland, ...}, "extra": {regulation_code, ...}} """ form = {} extra = {} for k, v in payload.items(): if k in PER_CHUNK_FIELDS: continue if k in FORM_FIELDS: form[k] = v else: extra[k] = v return {"form": form, "extra": extra} def doc_key(object_name: str, collection: str) -> str: """Unique key for a document in the progress file.""" return f"{object_name}|{collection}" def content_type_from_filename(filename: str) -> str: """Infer MIME type from file extension.""" ext = os.path.splitext(filename)[1].lower() return { ".pdf": "application/pdf", ".html": "text/html", ".htm": "text/html", ".md": "text/markdown", ".txt": "text/plain", }.get(ext, "application/octet-stream")