81db904b3e
OSHA 29 CFR 1910 Subpart O (1910.211-1910.219) — complete machine guarding requirements. US federal law, public domain. International norms mapping table: China GB/T, Korea KS, India BIS equivalents to ISO/EN standards. Unfortunately all countries protect ISO copyright even for identical national adoptions (IDT). Only OSHA provides truly free machinery safety content. EU Excel harmonised standards list included for reference. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
120 lines
3.7 KiB
Python
120 lines
3.7 KiB
Python
#!/usr/bin/env python3
"""
Ingest downloaded TRBS/TRGS/ASR PDFs into Qdrant via RAG Service.

Reads the source_registry.json and uploads each PDF to the RAG service.

Usage:
    python3 ingest_to_qdrant.py                    # ingest all
    python3 ingest_to_qdrant.py --category trbs    # only TRBS
    python3 ingest_to_qdrant.py --dry-run          # list without uploading
"""

import argparse
import json
import logging
import os
import time
from pathlib import Path

import httpx

# Timestamped, level-tagged log lines for everything this script emits.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger("ingest-trbs")

# Registry describing the downloaded documents; looked up next to this script.
REGISTRY_FILE = Path(__file__).parent / "source_registry.json"

# Upload endpoint of the RAG service. The default is the original hard-coded
# host; set RAG_UPLOAD_URL in the environment to target another deployment
# without editing this file. An empty value falls back to the default.
RAG_URL = (
    os.environ.get("RAG_UPLOAD_URL")
    or "https://macmini:8097/api/v1/documents/upload"
)

COLLECTION = "bp_compliance_ce"  # Same collection as other CE documents


def ingest_pdf(entry: dict) -> dict:
    """Upload a single PDF to the RAG service.

    Args:
        entry: One record from the source registry. Reads the keys
            ``local_path`` (required for an upload), ``category``,
            ``filename``, ``title`` and ``legal_basis`` (all optional).

    Returns:
        A status dict, one of:

        * ``{"status": "skipped", "reason": "no local file"}`` when the entry
          has no ``local_path`` or the path does not exist,
        * ``{"status": "ok", "document_id": ..., "chunks": ...}`` after a
          successful upload,
        * ``{"status": "error", "reason": ...}`` when reading the file, the
          HTTP request, or decoding the response failed.

        Upload failures are reported through the return value rather than
        raised, so a batch run can continue with the next document.
    """
    local_path = entry.get("local_path") or ""
    pdf_path = Path(local_path)
    if not local_path or not pdf_path.exists():
        return {"status": "skipped", "reason": "no local file"}

    # ``or`` instead of a .get() default: a key that is present but null or
    # empty in the registry must fall back too, otherwise the regulation id
    # would be built from the literal text "None".
    category = entry.get("category") or "unknown"
    filename = entry.get("filename") or pdf_path.name
    title = entry.get("title") or filename

    metadata = {
        "source": title,
        "regulation_id": f"{category}_{filename}".lower().replace("-", "_"),
        "jurisdiction": "DE",
        "source_type": "technical_rule",
        "license_rule": 1,
        "category": category,
        "legal_basis": entry.get("legal_basis", ""),
    }

    try:
        with open(pdf_path, "rb") as f:
            files = {"file": (pdf_path.name, f, "application/pdf")}
            data = {
                "collection": COLLECTION,
                "data_type": "legal",
                "use_case": "compliance",
                "year": "2026",
                "chunk_size": "512",
                "chunk_overlap": "50",
                "metadata_json": json.dumps(metadata),
            }
            # NOTE(review): verify=False disables TLS certificate checks.
            # Kept as-is because the target host presumably uses a
            # self-signed certificate; prefer passing a CA bundle instead.
            resp = httpx.post(
                RAG_URL, files=files, data=data, timeout=300.0, verify=False
            )
        resp.raise_for_status()
        result = resp.json()
        return {
            "status": "ok",
            "document_id": result.get("document_id", ""),
            "chunks": result.get("chunks_count", 0),
        }
    except Exception as exc:
        # Deliberately broad: this is the per-document boundary of a batch
        # job, and one bad file or response must not abort the whole run.
        # Some exceptions (e.g. timeouts) carry an empty message, so fall
        # back to the exception type to keep the reason informative.
        return {"status": "error", "reason": str(exc) or type(exc).__name__}


def main():
    """Command-line entry point: upload registry documents to the RAG service.

    Loads the registry from ``REGISTRY_FILE``, optionally filters it by
    ``--category``, and calls ``ingest_pdf`` for every entry that has a
    ``local_path``. With ``--dry-run`` the documents are only listed.

    Entries without a ``local_path`` are passed over silently. Any result
    whose status is not ``"ok"`` (including ``"skipped"``) is counted and
    logged as an error. A summary line is logged at the end; the "total"
    figure is the number of registry entries after category filtering.
    """
    parser = argparse.ArgumentParser(
        description=(
            "Ingest downloaded TRBS/TRGS/ASR PDFs into Qdrant via the RAG service."
        )
    )
    parser.add_argument(
        "--category",
        choices=["trbs", "trgs", "asr"],
        help="only ingest documents of this category",
    )
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="list the documents without uploading them",
    )
    args = parser.parse_args()

    # JSON is UTF-8 by specification; do not depend on the locale default,
    # which would garble non-ASCII titles on some platforms.
    registry = json.loads(REGISTRY_FILE.read_text(encoding="utf-8"))
    if args.category:
        registry = [e for e in registry if e.get("category") == args.category]

    total = len(registry)
    logger.info("Ingesting %d documents into Qdrant (%s)", total, COLLECTION)

    total_ok = 0
    total_chunks = 0
    total_err = 0

    for position, entry in enumerate(registry, start=1):
        local_path = entry.get("local_path")
        if not local_path:
            continue

        # Same fallback ingest_pdf applies, so an entry lacking "filename"
        # is still processed instead of raising KeyError here.
        name = entry.get("filename") or Path(local_path).name

        if args.dry_run:
            title = entry.get("title") or ""  # tolerate a null title
            logger.info(
                "[%d/%d] %s — %s (dry-run)", position, total, name, title[:60]
            )
            continue

        logger.info("[%d/%d] %s", position, total, name)
        result = ingest_pdf(entry)

        if result["status"] == "ok":
            total_ok += 1
            total_chunks += result["chunks"]
            logger.info("  → %d chunks indexed", result["chunks"])
        else:
            total_err += 1
            logger.error("  → %s: %s", result["status"], result.get("reason", ""))

        time.sleep(1)  # Be gentle

    logger.info(
        "\nDone: %d OK (%d chunks), %d errors, %d total",
        total_ok,
        total_chunks,
        total_err,
        total,
    )


# Run the ingestion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()