#!/usr/bin/env python3
"""Ingest missing EU regulations from EUR-Lex (HTML).

Downloads German HTML from EUR-Lex via CELEX number, uploads with legal chunking.

Usage (on Mac Mini):
    python3 control-pipeline/scripts/ingest_eu_regulations.py --dry-run
    python3 control-pipeline/scripts/ingest_eu_regulations.py
"""

import argparse
import json
import logging
import time

import httpx

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("ingest-eu")

RAG_URL = "https://localhost:8097"
QDRANT_URL = "http://localhost:6333"
COLLECTION = "bp_compliance_ce"
EURLEX_URL = "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{celex}"

# ---- EU Regulations to ingest ----
# Each entry carries the CELEX number used to build the EUR-Lex URL plus the
# metadata fields that upload_html() forwards to the RAG service. The optional
# "skip_if_exists" key names another regulation_id whose presence in the
# collection makes main() skip the entry.
REGULATIONS = [
    {
        "celex": "32022L2464",
        "regulation_id": "csrd_2022",
        "name": "Corporate Sustainability Reporting Directive (CSRD)",
        "short": "CSRD",
        "category": "sustainability",
    },
    {
        "celex": "32024L1760",
        "regulation_id": "csddd_2024",
        "name": "Corporate Sustainability Due Diligence Directive (CSDDD)",
        "short": "CSDDD",
        "category": "sustainability",
    },
    {
        "celex": "32020R0852",
        "regulation_id": "eu_taxonomy_2020",
        "name": "EU-Taxonomie-Verordnung",
        "short": "EU Taxonomy",
        "category": "sustainability",
    },
    {
        "celex": "32024R1183",
        "regulation_id": "eidas_2_0_2024",
        "name": "eIDAS 2.0 Verordnung (EU Digital Identity)",
        "short": "eIDAS 2.0",
        "category": "digital_identity",
    },
    {
        "celex": "32023L0970",
        "regulation_id": "pay_transparency_2023",
        "name": "Entgelttransparenz-Richtlinie",
        "short": "Pay Transparency",
        "category": "employment",
    },
    {
        "celex": "32022R2065",
        "regulation_id": "dsa_2022_updated",
        "name": "Digital Services Act (DSA) — aktualisiert",
        "short": "DSA",
        "category": "digital_services",
        "skip_if_exists": "dsa_2022",  # already exists under different ID
    },
]


def download_eurlex(celex: str) -> str:
    """Download EU regulation HTML from EUR-Lex.

    Args:
        celex: CELEX number substituted into EURLEX_URL.

    Returns:
        The response body decoded as text.

    Raises:
        httpx.HTTPError: If the request fails or the server answers with an
            error status (via ``raise_for_status``).
    """
    url = EURLEX_URL.format(celex=celex)
    with httpx.Client(timeout=30.0, follow_redirects=True) as c:
        resp = c.get(url)
        resp.raise_for_status()
        return resp.text


def upload_html(html: str, filename: str, reg: dict, dry_run: bool = False):
    """Upload HTML to RAG service.

    Args:
        html: Document content; sent UTF-8 encoded as a ``text/html`` file part.
        filename: File name reported for the uploaded part.
        reg: Entry from REGULATIONS; its "regulation_id", "name", "short",
            "celex" and "category" values are sent as metadata.
        dry_run: When true, nothing is sent; only the size is logged.

    Returns:
        The decoded JSON response of the upload endpoint, or
        ``{"chunks_count": 0}`` in dry-run mode.

    Raises:
        httpx.HTTPError: If the request fails or the server answers with an
            error status.
        KeyError: If ``reg`` lacks one of the metadata keys listed above.
    """
    if dry_run:
        logger.info(" DRY RUN — would upload %d chars", len(html))
        return {"chunks_count": 0}

    meta = {
        "regulation_id": reg["regulation_id"],
        "regulation_name_de": reg["name"],
        "regulation_short": reg["short"],
        "celex": reg["celex"],
        "category": reg["category"],
        "source": "EUR-Lex",
        "license": "EU_law",
        "jurisdiction": "EU",
        "source_type": "law",
    }
    form_data = {
        "collection": COLLECTION,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        # NOTE(review): hard-coded year — confirm whether this should track
        # the ingestion date or the regulation's own year.
        "year": "2026",
        "chunk_strategy": "legal",
        "chunk_size": "1500",
        "chunk_overlap": "100",
        # ensure_ascii=False keeps non-ASCII characters in names unescaped.
        "metadata_json": json.dumps(meta, ensure_ascii=False),
    }
    # NOTE(review): verify=False disables TLS certificate verification for
    # this client; presumably the local RAG service uses a self-signed
    # certificate — confirm this is intended.
    with httpx.Client(timeout=600.0, verify=False) as c:
        resp = c.post(
            f"{RAG_URL}/api/v1/documents/upload",
            files={"file": (filename, html.encode("utf-8"), "text/html")},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()


def count_existing(regulation_id: str) -> int:
    """Return how many points in COLLECTION match ``regulation_id``.

    Posts an exact count request to Qdrant, filtered on the
    "regulation_id" key.

    Raises:
        httpx.HTTPError: If the request fails or the server answers with an
            error status.
        KeyError: If the response JSON lacks ``result.count``.
    """
    with httpx.Client(timeout=60.0) as c:
        resp = c.post(
            f"{QDRANT_URL}/collections/{COLLECTION}/points/count",
            json={
                "filter": {
                    "must": [
                        {"key": "regulation_id", "match": {"value": regulation_id}}
                    ]
                },
                "exact": True,
            },
        )
        resp.raise_for_status()
        return resp.json()["result"]["count"]


def main():
    """Ingest every entry of REGULATIONS and log a per-regulation summary.

    For each regulation: skip it when chunks already exist (under its own ID
    or under its "skip_if_exists" ID), otherwise download the HTML from
    EUR-Lex and upload it. A failure in any step is logged and recorded in
    the summary; the loop then continues with the next regulation.

    Command-line flags:
        --dry-run: download but do not upload.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    logger.info("=" * 60)
    logger.info("Ingest EU Regulations from EUR-Lex")
    logger.info(" Regulations: %d", len(REGULATIONS))
    logger.info(" Dry run: %s", args.dry_run)
    logger.info("=" * 60)

    results = []
    for i, reg in enumerate(REGULATIONS, 1):
        logger.info("\n[%d/%d] %s (CELEX: %s)", i, len(REGULATIONS), reg["name"], reg["celex"])

        # The existence checks hit Qdrant over the network. Handle their
        # failures like download/upload failures so one bad check does not
        # abort the whole batch and lose the summary. The regulation is NOT
        # uploaded in that case, because duplicates cannot be ruled out.
        skip_id = reg.get("skip_if_exists")
        try:
            # Skip if variant already exists
            if skip_id:
                existing = count_existing(skip_id)
                if existing > 0:
                    logger.info(" Already exists as '%s' (%d chunks) — SKIPPING", skip_id, existing)
                    results.append({"reg": reg["short"], "status": "exists", "chunks": existing})
                    continue

            # Check if this exact ID exists
            existing = count_existing(reg["regulation_id"])
        except Exception as e:
            logger.error(" Existence check FAILED: %s", e)
            results.append({"reg": reg["short"], "status": "check_failed", "chunks": 0})
            continue

        if existing > 0:
            logger.info(" Already exists: %d chunks — SKIPPING", existing)
            results.append({"reg": reg["short"], "status": "exists", "chunks": existing})
            continue

        # Download from EUR-Lex
        logger.info(" Downloading from EUR-Lex...")
        try:
            html = download_eurlex(reg["celex"])
            logger.info(" Downloaded: %d chars", len(html))
        except Exception as e:
            logger.error(" Download FAILED: %s", e)
            results.append({"reg": reg["short"], "status": "download_failed", "chunks": 0})
            continue

        # Upload
        filename = f"{reg['regulation_id']}.html"
        try:
            result = upload_html(html, filename, reg, args.dry_run)
            chunks = result.get("chunks_count", 0)
            logger.info(" Uploaded: %d chunks", chunks)
            results.append({"reg": reg["short"], "status": "ok", "chunks": chunks})
        except Exception as e:
            logger.error(" Upload FAILED: %s", e)
            results.append({"reg": reg["short"], "status": "error", "chunks": 0})

        # Pause between regulations; only reached after an upload attempt,
        # and not after the last entry.
        if i < len(REGULATIONS):
            time.sleep(2)

    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("RESULTS")
    logger.info("=" * 60)
    for r in results:
        logger.info(" %-20s %s chunks=%d", r["reg"], r["status"].upper(), r["chunks"])
    # Only successful uploads count as new chunks; "exists" entries carry the
    # pre-existing chunk count and are excluded.
    total_new = sum(r["chunks"] for r in results if r["status"] == "ok")
    logger.info("\nTotal new chunks: %d", total_new)


if __name__ == "__main__":
    main()