Files
breakpilot-core/legal-sources/trbs-trgs-asr/ingest_to_qdrant.py
T
Benjamin Admin 81db904b3e feat(legal-sources): add OSHA machinery safety standards + international norms mapping
OSHA 29 CFR 1910 Subpart O (1910.211-1910.219) — complete machine
guarding requirements. US federal law, public domain.

International norms mapping table: China GB/T, Korea KS, India BIS
equivalents to ISO/EN standards. Unfortunately all countries protect
ISO copyright even for identical national adoptions (IDT).

Only OSHA provides truly free machinery safety content.
EU Excel harmonised standards list included for reference.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-09 10:50:43 +02:00

120 lines
3.7 KiB
Python

#!/usr/bin/env python3
"""
Ingest downloaded TRBS/TRGS/ASR PDFs into Qdrant via RAG Service.
Reads the source_registry.json and uploads each PDF to the RAG service.
Usage:
python3 ingest_to_qdrant.py # ingest all
python3 ingest_to_qdrant.py --category trbs # only TRBS
python3 ingest_to_qdrant.py --dry-run # list without uploading
"""
import argparse
import json
import logging
import time
from pathlib import Path
import httpx
# Timestamped, level-tagged log lines for the whole script.
_LOG_FORMAT = "%(asctime)s [%(levelname)s] %(message)s"
logging.basicConfig(format=_LOG_FORMAT, level=logging.INFO)
logger = logging.getLogger("ingest-trbs")

# The registry JSON is looked up in the same directory as this script.
REGISTRY_FILE = Path(__file__).parent.joinpath("source_registry.json")
# Upload endpoint of the RAG service that receives each PDF.
RAG_URL = "https://macmini:8097/api/v1/documents/upload"
COLLECTION = "bp_compliance_ce"  # Same collection as other CE documents
def ingest_pdf(entry: dict) -> dict:
    """Upload a single PDF to the RAG service.

    Args:
        entry: One registry record. Keys read here are ``local_path``,
            ``category``, ``filename``, ``title`` and ``legal_basis``; each
            one is optional and falls back to a default.

    Returns:
        A status dict:

        * ``{"status": "skipped", "reason": "no local file"}`` when
          ``local_path`` is empty or does not point to a regular file.
        * ``{"status": "ok", "document_id": ..., "chunks": ...}`` when the
          service accepted the upload. ``chunks`` is always a number.
        * ``{"status": "error", "reason": "<ExceptionType>: <message>"}``
          when reading, uploading or decoding the response failed.
    """
    local_path = entry.get("local_path", "")
    # is_file() rather than exists(): a directory path is "no local file",
    # not something to attempt to open and then report as an upload error.
    if not local_path or not Path(local_path).is_file():
        return {"status": "skipped", "reason": "no local file"}

    pdf_path = Path(local_path)
    category = entry.get("category", "unknown")
    filename = entry.get("filename", pdf_path.name)
    title = entry.get("title", filename)

    metadata = {
        "source": title,
        # Lower-cased "<category>_<filename>" with hyphens turned into
        # underscores; the file extension is kept as part of the id.
        "regulation_id": f"{category}_{filename}".lower().replace("-", "_"),
        "jurisdiction": "DE",
        "source_type": "technical_rule",
        "license_rule": 1,
        "category": category,
        "legal_basis": entry.get("legal_basis", ""),
    }

    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (pdf_path.name, f, "application/pdf")}
            data = {
                "collection": COLLECTION,
                "data_type": "legal",
                "use_case": "compliance",
                "year": "2026",
                "chunk_size": "512",
                "chunk_overlap": "50",
                "metadata_json": json.dumps(metadata),
            }
            # NOTE(review): verify=False turns off TLS certificate checks.
            # Presumably the host uses a self-signed certificate - confirm,
            # and prefer passing a CA bundle over disabling verification.
            resp = httpx.post(
                RAG_URL, files=files, data=data, timeout=300.0, verify=False
            )
        resp.raise_for_status()
        result = resp.json()
        return {
            "status": "ok",
            "document_id": result.get("document_id", ""),
            # "or 0" also covers an explicit null in the response, so callers
            # can safely add this value to a running total.
            "chunks": result.get("chunks_count") or 0,
        }
    except Exception as exc:
        # Batch boundary: one failing document must not abort the whole run,
        # so every failure becomes an error status. The exception type is
        # included because some exceptions carry an empty message.
        return {"status": "error", "reason": f"{type(exc).__name__}: {exc}"}
def main():
    """Ingest the registry's PDFs into the RAG service, or list them.

    Command-line options:
        --category {trbs,trgs,asr}: only process entries whose ``category``
            equals the given value.
        --dry-run: log each entry that has a ``local_path`` without
            uploading anything.

    The registry file is parsed as JSON and iterated as a sequence of dict
    entries. Entries without a ``local_path`` are passed over silently.
    Every result from ``ingest_pdf`` whose status is not ``"ok"``
    (including ``"skipped"``) is counted and logged as an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--category", choices=["trbs", "trgs", "asr"])
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    # Explicit UTF-8: JSON text is UTF-8, and the locale default encoding
    # would otherwise decide how non-ASCII titles are decoded.
    registry = json.loads(REGISTRY_FILE.read_text(encoding="utf-8"))
    if args.category:
        registry = [e for e in registry if e.get("category") == args.category]
    total = len(registry)
    logger.info("Ingesting %d documents into Qdrant (%s)", total, COLLECTION)

    total_ok = 0
    total_chunks = 0
    total_err = 0
    for position, entry in enumerate(registry, start=1):
        local_path = entry.get("local_path")
        if not local_path:
            continue
        # Same fallback as ingest_pdf: "filename" is optional and defaults
        # to the basename of the local path.
        filename = entry.get("filename") or Path(local_path).name

        if args.dry_run:
            # A null title is treated like a missing one before slicing.
            title = (entry.get("title") or "")[:60]
            logger.info("[%d/%d] %s - %s (dry-run)",
                        position, total, filename, title)
            continue

        logger.info("[%d/%d] %s", position, total, filename)
        result = ingest_pdf(entry)
        if result["status"] == "ok":
            total_ok += 1
            total_chunks += result["chunks"]
            logger.info("%d chunks indexed", result["chunks"])
        else:
            total_err += 1
            logger.error("%s: %s", result["status"], result.get("reason", ""))
        time.sleep(1)  # Be gentle

    logger.info("\nDone: %d OK (%d chunks), %d errors, %d total",
                total_ok, total_chunks, total_err, total)


if __name__ == "__main__":
    main()