#!/usr/bin/env python3
"""
Ingest downloaded TRBS/TRGS/ASR PDFs into Qdrant via RAG Service.

Reads the source_registry.json and uploads each PDF to the RAG service.

Usage:
    python3 ingest_to_qdrant.py                  # ingest all
    python3 ingest_to_qdrant.py --category trbs  # only TRBS
    python3 ingest_to_qdrant.py --dry-run        # list without uploading
"""

import argparse
import json
import logging
import time
from pathlib import Path

import httpx

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
logger = logging.getLogger("ingest-trbs")

REGISTRY_FILE = Path(__file__).parent / "source_registry.json"
RAG_URL = "https://macmini:8097/api/v1/documents/upload"
COLLECTION = "bp_compliance_ce"  # Same collection as other CE documents


def ingest_pdf(entry: dict) -> dict:
    """Upload a single PDF to the RAG service.

    Args:
        entry: One registry record. Keys read here: ``local_path``
            (required for an upload to happen), ``category``, ``filename``,
            ``title`` and ``legal_basis`` (all optional, with fallbacks).

    Returns:
        A dict whose ``status`` key is one of:

        * ``"skipped"`` - ``local_path`` is missing/empty or does not exist
          on disk; ``reason`` explains why.
        * ``"ok"`` - the upload succeeded; ``document_id`` and ``chunks``
          are taken from the service's JSON response.
        * ``"error"`` - opening the file, the HTTP request, or decoding the
          response raised; ``reason`` is the stringified exception.

    This function never raises for a failed upload, so a batch caller can
    keep going after a single bad document.
    """
    local_path = entry.get("local_path", "")
    pdf_path = Path(local_path) if local_path else None
    if pdf_path is None or not pdf_path.exists():
        return {"status": "skipped", "reason": "no local file"}

    category = entry.get("category", "unknown")
    filename = entry.get("filename", pdf_path.name)
    title = entry.get("title", filename)

    metadata = {
        "source": title,
        "regulation_id": f"{category}_{filename}".lower().replace("-", "_"),
        "jurisdiction": "DE",
        "source_type": "technical_rule",
        "license_rule": 1,
        "category": category,
        "legal_basis": entry.get("legal_basis", ""),
    }

    # Form fields are sent as strings; the metadata travels as one JSON field.
    data = {
        "collection": COLLECTION,
        "data_type": "legal",
        "use_case": "compliance",
        "year": "2026",
        "chunk_size": "512",
        "chunk_overlap": "50",
        "metadata_json": json.dumps(metadata),
    }

    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (pdf_path.name, f, "application/pdf")}
            # NOTE(review): verify=False disables TLS certificate validation
            # for this HTTPS endpoint, so the server's identity is not
            # checked. Presumably the host uses a self-signed certificate;
            # prefer passing a CA bundle via `verify=` once one is available.
            resp = httpx.post(
                RAG_URL, files=files, data=data, timeout=300.0, verify=False
            )
        resp.raise_for_status()
        result = resp.json()
    except Exception as e:
        # Deliberately broad: this is the per-document boundary of a batch
        # job, and any failure is reported to the caller instead of aborting
        # the whole run.
        return {"status": "error", "reason": str(e)}

    return {
        "status": "ok",
        "document_id": result.get("document_id", ""),
        "chunks": result.get("chunks_count", 0),
    }


def main():
    """Parse CLI arguments and ingest every matching registry entry.

    Loads ``REGISTRY_FILE``, optionally filters it by ``--category``, then
    uploads each entry that has a ``local_path`` (or only lists it when
    ``--dry-run`` is given). A summary line with success, chunk and error
    totals is logged at the end.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--category", choices=["trbs", "trgs", "asr"])
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Read explicitly as UTF-8 so decoding does not depend on the platform's
    # default locale encoding.
    registry = json.loads(REGISTRY_FILE.read_text(encoding="utf-8"))

    if args.category:
        registry = [e for e in registry if e.get("category") == args.category]

    logger.info("Ingesting %d documents into Qdrant (%s)", len(registry), COLLECTION)

    total_ok = 0
    total_chunks = 0
    total_err = 0

    for i, entry in enumerate(registry):
        local_path = entry.get("local_path")
        if not local_path:
            # Nothing was downloaded for this entry; it is not counted.
            continue

        # Same fallback as ingest_pdf(): a record without "filename" is
        # labelled by the basename of its local path instead of raising.
        filename = entry.get("filename", Path(local_path).name)

        if args.dry_run:
            logger.info("[%d/%d] %s — %s (dry-run)", i + 1, len(registry),
                        filename, entry.get("title", "")[:60])
            continue

        logger.info("[%d/%d] %s", i + 1, len(registry), filename)
        result = ingest_pdf(entry)

        if result["status"] == "ok":
            total_ok += 1
            total_chunks += result["chunks"]
            logger.info(" → %d chunks indexed", result["chunks"])
        else:
            # Both "skipped" and "error" results are tallied as errors.
            total_err += 1
            logger.error(" → %s: %s", result["status"], result.get("reason", ""))

        time.sleep(1)  # Be gentle

    logger.info("\nDone: %d OK (%d chunks), %d errors, %d total",
                total_ok, total_chunks, total_err, len(registry))


if __name__ == "__main__":
    main()