Files
breakpilot-core/control-pipeline/scripts/ingest_eu_regulations.py
T
Benjamin Admin 569f64a400
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 32s
CI / test-bqas (push) Successful in 30s
feat(control-pipeline): production LegalActIngester for EU acts (Parser 1)
Add services/legal_act_ingester.py — the EU eur-lex LegalActIngester engine:
CELLAR download (with eur-lex fallback, bypassing the HTTP 202 web block on
large acts like DORA), parse into articles + annexes with full authority
metadata + forward citation edges (references_out), and a self-test gate before
upload. Refactor scripts/ingest_eu_regulations.py to use it: parse-based,
per-unit upload with a skip-by-CELEX guard (no automatic re-ingest). Recitals
are intentionally left to a separate ingester (Parser 2).

Tested: parser / metadata / self-test / refs_out over a synthetic eur-lex
fixture (7 tests), ruff + mypy clean, real CELLAR fetch of DORA verified
end-to-end (64 articles, full authority metadata).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-06-24 08:17:56 +02:00

141 lines
5.1 KiB
Python

#!/usr/bin/env python3
"""Ingest EU legal acts from eur-lex/CELLAR via the LegalActIngester engine.
For each act this downloads the German XHTML (CELLAR, eur-lex fallback), parses
it into articles + annexes with full authority metadata + citation edges
(services/legal_act_ingester.py), self-tests the parse, and uploads per unit.
Acts whose CELEX already exists are SKIPPED — there is no automatic re-ingest.
Usage (Mac Mini, with the RAG service reachable):
python3 control-pipeline/scripts/ingest_eu_regulations.py --dry-run
python3 control-pipeline/scripts/ingest_eu_regulations.py
"""
import argparse
import logging
import os
import sys
import time
from typing import TypedDict
import httpx
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
from services.legal_act_ingester import ( # noqa: E402
RegSpec,
build_upload_units,
download_act,
parse_html,
self_test,
upload_unit,
)
# Log to the root handler with timestamp + level; this script logs under "ingest-eu".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
)
logger = logging.getLogger("ingest-eu")

# Service endpoints; each can be overridden through the environment variable of the same name.
RAG_URL = os.environ.get("RAG_URL", "https://localhost:8097")
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")

# Qdrant collection used both for the skip-guard count and as the upload target of every spec.
COLLECTION = "bp_compliance_ce"

# Tag handed to build_upload_units for every unit produced by this script.
RUN_TAG = "2026-06-eu-v1"
class IngestResult(TypedDict):
    """Outcome record for a single act, as returned by ingest_one."""

    # Short label of the act (the spec's ``reg`` value).
    reg: str
    # Outcome keyword, e.g. "ok", "exists", "dry_run", "download_failed", "gate_failed".
    status: str
    # Meaning depends on status: uploaded chunks for "ok", chunks already
    # present for "exists", planned upload units for "dry_run".
    chunks: int
def _rank(celex: str) -> str:
"""eu_directive for L-acts, eu_regulation otherwise (CELEX descriptor letter)."""
return "eu_directive" if len(celex) > 5 and celex[5] == "L" else "eu_regulation"
def _spec(celex: str, name_de: str, short: str, version_date: str = "") -> RegSpec:
    """Build the RegSpec for one act.

    Args:
        celex: CELEX number of the act; also determines the legal-basis rank.
        name_de: Display name of the act.
        short: Short label, stored as the spec's ``reg``.
        version_date: Passed through unchanged; defaults to the empty string.

    Returns:
        A RegSpec bound to COLLECTION with ``legal_basis_rank`` derived via _rank.
    """
    rank = _rank(celex)
    return RegSpec(
        reg=short,
        celex=celex,
        name_de=name_de,
        collection=COLLECTION,
        version_date=version_date,
        legal_basis_rank=rank,
    )
# The acts handled by this script, one (CELEX, name, short label) row each;
# version_date is left at _spec's default. The proven MVP acts (CRA / AI Act /
# DORA / NIS2 / MaschinenVO / DSGVO) are already in the corpus and are
# re-ingested through a separate, controlled step — they are deliberately
# absent from this list.
SPECS = [
    _spec(celex, name_de, short)
    for celex, name_de, short in (
        ("32022L2464", "Corporate Sustainability Reporting Directive (CSRD)", "CSRD"),
        ("32024L1760", "Corporate Sustainability Due Diligence Directive (CSDDD)", "CSDDD"),
        ("32020R0852", "EU-Taxonomie-Verordnung", "EU Taxonomy"),
        ("32024R1183", "eIDAS 2.0 Verordnung (EU Digital Identity)", "eIDAS 2.0"),
        ("32023L0970", "Entgelttransparenz-Richtlinie", "Pay Transparency"),
        ("32022R2065", "Digital Services Act (DSA)", "DSA"),
    )
]
def count_existing(celex: str) -> int:
    """Return how many points in COLLECTION carry the given CELEX number.

    Queries Qdrant's count endpoint for an exact count, filtering on the
    ``celex`` payload key only — so chunks are counted no matter which run
    tagged them. Callers use a non-zero result as the skip guard.

    Raises:
        httpx.HTTPStatusError: if Qdrant replies with an error status.
    """
    endpoint = f"{QDRANT_URL}/collections/{COLLECTION}/points/count"
    query = {
        "filter": {"must": [{"key": "celex", "match": {"value": celex}}]},
        "exact": True,
    }
    # NOTE: certificate verification is switched off for this client.
    with httpx.Client(timeout=60.0, verify=False) as session:
        reply = session.post(endpoint, json=query)
        reply.raise_for_status()
        body = reply.json()
    return int(body["result"]["count"])
def ingest_one(spec: RegSpec, dry_run: bool) -> IngestResult:
    """Download, parse, gate and upload a single act.

    Pipeline, each step short-circuiting with its own status:

    1. Skip guard — if the CELEX already has chunks, return ``"exists"``
       (no re-ingest). Errors from count_existing are NOT caught here and
       propagate to the caller.
    2. Download — any exception is logged and reported as ``"download_failed"``.
    3. Parse + self-test — a failed gate is reported as ``"gate_failed"``.
    4. Dry run — report ``"dry_run"`` with the number of units that would be
       uploaded; nothing is sent.
    5. Upload — units are sent one by one. If an upload raises, the failure
       is logged together with the progress made and reported as
       ``"upload_failed"`` instead of aborting the whole run.

    Args:
        spec: The act to ingest.
        dry_run: When true, stop after building the upload units.

    Returns:
        An IngestResult whose ``chunks`` is the uploaded chunk count for
        ``"ok"`` / ``"upload_failed"``, the existing count for ``"exists"``,
        the unit count for ``"dry_run"``, and 0 otherwise.
    """
    if (existing := count_existing(spec.celex)) > 0:
        logger.info(" already present: %d chunks — SKIPPING (no re-ingest)", existing)
        return {"reg": spec.reg, "status": "exists", "chunks": existing}

    try:
        html = download_act(spec.celex)
    except Exception as exc:  # noqa: BLE001 — log + continue with the next act
        logger.error(" download FAILED: %s", exc)
        return {"reg": spec.reg, "status": "download_failed", "chunks": 0}

    act = parse_html(html, spec.reg)
    passed, problems = self_test(act)
    logger.info(" parsed: %d articles, %d annexes", len(act.articles), len(act.annexes))
    if not passed:
        logger.error(" GATE FAIL — %s", "; ".join(problems))
        return {"reg": spec.reg, "status": "gate_failed", "chunks": 0}

    units = build_upload_units(act, spec, RUN_TAG)
    if dry_run:
        logger.info(" DRY RUN — would upload %d units", len(units))
        return {"reg": spec.reg, "status": "dry_run", "chunks": len(units)}

    chunks = 0
    units_done = 0
    try:
        with httpx.Client(timeout=600.0, verify=False) as client:
            for unit in units:
                chunks += upload_unit(client, RAG_URL, unit)
                units_done += 1
    except Exception as exc:  # noqa: BLE001 — log + continue with the next act
        logger.error(
            " upload FAILED after %d/%d units (%d chunks): %s",
            units_done, len(units), chunks, exc,
        )
        if units_done:
            # The skip guard above skips any CELEX that already has chunks, so
            # a partial upload would otherwise be mistaken for a complete act.
            logger.error(
                " %s may be PARTIALLY ingested — clean up CELEX %s before re-running",
                spec.reg, spec.celex,
            )
        return {"reg": spec.reg, "status": "upload_failed", "chunks": chunks}

    logger.info(" uploaded: %d units, %d chunks", len(units), chunks)
    return {"reg": spec.reg, "status": "ok", "chunks": chunks}
def main() -> None:
    """Command-line entry point: ingest every act in SPECS and log a summary.

    Accepts a single ``--dry-run`` flag, which is forwarded to ingest_one.
    Acts are processed in list order with a one-second pause between them;
    afterwards one summary line per act is logged, followed by the total
    number of chunks from acts whose status is ``"ok"``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--dry-run", action="store_true")
    dry_run = cli.parse_args().dry_run

    rule = "=" * 60
    act_count = len(SPECS)
    logger.info(rule)
    logger.info("LegalActIngester — %d acts | dry_run=%s", act_count, dry_run)
    logger.info(rule)

    outcomes: list[IngestResult] = []
    for position, spec in enumerate(SPECS, start=1):
        logger.info("\n[%d/%d] %s (CELEX %s)", position, act_count, spec.name_de, spec.celex)
        outcomes.append(ingest_one(spec, dry_run))
        # Pause between acts, but not after the final one.
        if position < act_count:
            time.sleep(1)

    logger.info("\n" + rule)
    for outcome in outcomes:
        logger.info(
            " %-18s %-15s chunks=%s",
            outcome["reg"],
            outcome["status"].upper(),
            outcome["chunks"],
        )

    # Only successfully uploaded acts count towards the total.
    new_chunks = sum(outcome["chunks"] for outcome in outcomes if outcome["status"] == "ok")
    logger.info("\nTotal new chunks: %d", new_chunks)


if __name__ == "__main__":
    main()