#!/usr/bin/env python3
"""Ingest EU legal acts from eur-lex/CELLAR through the LegalActIngester engine.

Per act, the pipeline is:

1. download the German XHTML (CELLAR first, eur-lex as fallback),
2. parse it into articles + annexes with full authority metadata and citation
   edges (see services/legal_act_ingester.py),
3. self-test the parse,
4. upload unit by unit.

An act whose CELEX is already present in the collection is SKIPPED — this
script never re-ingests automatically.

Usage (Mac Mini, with the RAG service reachable):
    python3 control-pipeline/scripts/ingest_eu_regulations.py --dry-run
    python3 control-pipeline/scripts/ingest_eu_regulations.py
"""

import argparse
import logging
import os
import sys
import time
from typing import TypedDict

import httpx

# Make the parent directory importable so ``services`` resolves when this file
# is run directly as a script.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from services.legal_act_ingester import (  # noqa: E402
    RegSpec,
    build_upload_units,
    download_act,
    parse_html,
    self_test,
    upload_unit,
)

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("ingest-eu")

# Service endpoints; both can be overridden through the environment.
RAG_URL = os.getenv("RAG_URL", "https://localhost:8097")
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
# Collection every act of this script is written to / counted in.
COLLECTION = "bp_compliance_ce"
# Tag handed to build_upload_units for this ingestion run.
RUN_TAG = "2026-06-eu-v1"


class IngestResult(TypedDict):
    """Outcome of processing a single act, as reported in the run summary."""

    # Short name of the act (RegSpec.reg).
    reg: str
    # One of "exists", "download_failed", "gate_failed", "dry_run", "ok".
    status: str
    # Existing chunks ("exists"), planned units ("dry_run"), uploaded chunks
    # ("ok"), or 0 for the failure statuses.
    chunks: int


def _rank(celex: str) -> str:
    """Return the legal-basis rank for a CELEX number.

    The descriptor letter sits at index 5: ``"L"`` yields ``"eu_directive"``,
    anything else — including a string too short to have that position —
    yields ``"eu_regulation"``.
    """
    # Slicing (instead of indexing) gives "" for short strings, so no separate
    # length check is needed.
    descriptor = celex[5:6]
    if descriptor == "L":
        return "eu_directive"
    return "eu_regulation"


def _spec(celex: str, name_de: str, short: str, version_date: str = "") -> RegSpec:
    """Build the RegSpec for one act, targeting COLLECTION.

    Args:
        celex: CELEX number of the act.
        name_de: Display name of the act.
        short: Short name, stored as ``RegSpec.reg``.
        version_date: Optional version date; empty string when not given.

    Returns:
        A RegSpec whose ``legal_basis_rank`` is derived from ``celex``.
    """
    rank = _rank(celex)
    return RegSpec(
        reg=short,
        celex=celex,
        name_de=name_de,
        collection=COLLECTION,
        version_date=version_date,
        legal_basis_rank=rank,
    )


# Acts this script ingests. The proven MVP acts (CRA / AI Act / DORA / NIS2 /
# MaschinenVO / DSGVO) are already in the corpus and get re-ingested via a
# separate, controlled step — not here.
SPECS = [
    _spec("32022L2464", "Corporate Sustainability Reporting Directive (CSRD)", "CSRD"),
    _spec("32024L1760", "Corporate Sustainability Due Diligence Directive (CSDDD)", "CSDDD"),
    _spec("32020R0852", "EU-Taxonomie-Verordnung", "EU Taxonomy"),
    _spec("32024R1183", "eIDAS 2.0 Verordnung (EU Digital Identity)", "eIDAS 2.0"),
    _spec("32023L0970", "Entgelttransparenz-Richtlinie", "Pay Transparency"),
    _spec("32022R2065", "Digital Services Act (DSA)", "DSA"),
]

# Result statuses that count as a failed act and make the run exit non-zero.
_FAILURE_STATUSES = frozenset({"download_failed", "gate_failed", "upload_failed"})


def count_existing(celex: str) -> int:
    """Return the number of chunks already stored for ``celex`` — the skip guard.

    Issues an exact point count against the Qdrant collection, filtered on the
    ``celex`` payload key.

    Raises:
        httpx.HTTPError: if the request fails or Qdrant answers with an error
            status. This is deliberately not swallowed: without a reliable
            count the skip guard cannot be evaluated.
    """
    # NOTE(review): verify=False disables TLS certificate verification. It has
    # no effect for the default http:// QDRANT_URL, but does if the URL is
    # overridden to https:// — confirm this is intended.
    with httpx.Client(timeout=60.0, verify=False) as client:
        resp = client.post(
            f"{QDRANT_URL}/collections/{COLLECTION}/points/count",
            json={
                "filter": {"must": [{"key": "celex", "match": {"value": celex}}]},
                "exact": True,
            },
        )
        resp.raise_for_status()
        return int(resp.json()["result"]["count"])


def ingest_one(spec: RegSpec, dry_run: bool) -> IngestResult:
    """Download, parse, gate-check and upload a single act.

    Args:
        spec: The act to ingest.
        dry_run: When true, stop after building the upload units and report
            how many would be uploaded.

    Returns:
        An IngestResult whose ``status`` is one of ``"exists"``,
        ``"download_failed"``, ``"gate_failed"``, ``"dry_run"``,
        ``"upload_failed"`` or ``"ok"``. For ``"upload_failed"`` the ``chunks``
        field holds the chunks reported by the units that completed before the
        failure.
    """
    if (existing := count_existing(spec.celex)) > 0:
        logger.info(" already present: %d chunks — SKIPPING (no re-ingest)", existing)
        return {"reg": spec.reg, "status": "exists", "chunks": existing}

    try:
        html = download_act(spec.celex)
    except Exception as exc:  # noqa: BLE001 — log + continue with the next act
        logger.error(" download FAILED: %s", exc)
        return {"reg": spec.reg, "status": "download_failed", "chunks": 0}

    act = parse_html(html, spec.reg)
    passed, problems = self_test(act)
    logger.info(" parsed: %d articles, %d annexes", len(act.articles), len(act.annexes))
    if not passed:
        logger.error(" GATE FAIL — %s", "; ".join(problems))
        return {"reg": spec.reg, "status": "gate_failed", "chunks": 0}

    units = build_upload_units(act, spec, RUN_TAG)
    if dry_run:
        logger.info(" DRY RUN — would upload %d units", len(units))
        return {"reg": spec.reg, "status": "dry_run", "chunks": len(units)}

    chunks = 0
    units_done = 0
    try:
        # NOTE(review): verify=False disables TLS certificate verification for
        # the (by default https://) RAG service — confirm this is intended.
        with httpx.Client(timeout=600.0, verify=False) as client:
            for unit in units:
                chunks += upload_unit(client, RAG_URL, unit)
                units_done += 1
    except Exception as exc:  # noqa: BLE001 — log + continue with the next act
        # A failure part-way through must not abort the remaining acts or lose
        # the run summary. Report exactly how far the upload got: any chunks
        # already written presumably make count_existing() non-zero, so a later
        # run would skip this act until they are cleaned up — verify.
        logger.error(
            " upload FAILED after %d/%d units (%d chunks): %s"
            " — act may be PARTIALLY ingested",
            units_done,
            len(units),
            chunks,
            exc,
        )
        return {"reg": spec.reg, "status": "upload_failed", "chunks": chunks}

    logger.info(" uploaded: %d units, %d chunks", len(units), chunks)
    return {"reg": spec.reg, "status": "ok", "chunks": chunks}


def main() -> int:
    """Run the ingestion over all SPECS and log a per-act summary.

    Returns:
        Process exit code: 0 when no act ended in a failure status, 1 otherwise.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    logger.info("=" * 60)
    logger.info("LegalActIngester — %d acts | dry_run=%s", len(SPECS), args.dry_run)
    logger.info("=" * 60)

    results: list[IngestResult] = []
    for i, spec in enumerate(SPECS, 1):
        logger.info("\n[%d/%d] %s (CELEX %s)", i, len(SPECS), spec.name_de, spec.celex)
        results.append(ingest_one(spec, args.dry_run))
        # Short pause between acts; not needed after the last one.
        if i < len(SPECS):
            time.sleep(1)

    logger.info("\n" + "=" * 60)
    for r in results:
        logger.info(" %-18s %-15s chunks=%s", r["reg"], r["status"].upper(), r["chunks"])

    # Chunks from a partially failed upload were still written, so they count
    # towards what this run added.
    total = sum(r["chunks"] for r in results if r["status"] in ("ok", "upload_failed"))
    logger.info("\nTotal new chunks: %d", total)

    failed = [r["reg"] for r in results if r["status"] in _FAILURE_STATUSES]
    if failed:
        logger.error("%d act(s) FAILED: %s", len(failed), ", ".join(failed))
        return 1
    return 0


if __name__ == "__main__":
    # Propagate failures to the shell so wrappers / CI can detect them.
    sys.exit(main())