feat(control-pipeline): production LegalActIngester for EU acts (Parser 1)
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 32s
CI / test-bqas (push) Successful in 30s
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / test-go-consent (push) Successful in 28s
CI / test-python-voice (push) Successful in 32s
CI / test-bqas (push) Successful in 30s
Add services/legal_act_ingester.py — the EU eur-lex LegalActIngester engine: CELLAR download (with eur-lex fallback, bypassing the HTTP 202 web block on large acts like DORA), parse into articles + annexes with full authority metadata + forward citation edges (references_out), and a self-test gate before upload. Refactor scripts/ingest_eu_regulations.py to use it: parse-based, per-unit upload with a skip-by-CELEX guard (no automatic re-ingest). Recitals are intentionally left to a separate ingester (Parser 2). Tested: parser / metadata / self-test / refs_out over a synthetic eur-lex fixture (7 tests), ruff + mypy clean, real CELLAR fetch of DORA verified end-to-end (64 articles, full authority metadata). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -1,200 +1,139 @@
|
||||
#!/usr/bin/env python3
|
||||
"""Ingest missing EU regulations from EUR-Lex (HTML).
|
||||
"""Ingest EU legal acts from eur-lex/CELLAR via the LegalActIngester engine.
|
||||
|
||||
Downloads German HTML from EUR-Lex via CELEX number, uploads with legal chunking.
|
||||
For each act this downloads the German XHTML (CELLAR, eur-lex fallback), parses
|
||||
it into articles + annexes with full authority metadata + citation edges
|
||||
(services/legal_act_ingester.py), self-tests the parse, and uploads per unit.
|
||||
Acts whose CELEX already exists are SKIPPED — there is no automatic re-ingest.
|
||||
|
||||
Usage (on Mac Mini):
|
||||
Usage (Mac Mini, with the RAG service reachable):
|
||||
python3 control-pipeline/scripts/ingest_eu_regulations.py --dry-run
|
||||
python3 control-pipeline/scripts/ingest_eu_regulations.py
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import TypedDict
|
||||
|
||||
import httpx
|
||||
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))
|
||||
|
||||
from services.legal_act_ingester import ( # noqa: E402
|
||||
RegSpec,
|
||||
build_upload_units,
|
||||
download_act,
|
||||
parse_html,
|
||||
self_test,
|
||||
upload_unit,
|
||||
)
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
|
||||
logger = logging.getLogger("ingest-eu")
|
||||
|
||||
RAG_URL = "https://localhost:8097"
|
||||
QDRANT_URL = "http://localhost:6333"
|
||||
RAG_URL = os.getenv("RAG_URL", "https://localhost:8097")
|
||||
QDRANT_URL = os.getenv("QDRANT_URL", "http://localhost:6333")
|
||||
COLLECTION = "bp_compliance_ce"
|
||||
RUN_TAG = "2026-06-eu-v1"
|
||||
|
||||
EURLEX_URL = "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{celex}"
|
||||
|
||||
# ---- EU Regulations to ingest ----
|
||||
REGULATIONS = [
|
||||
{
|
||||
"celex": "32022L2464",
|
||||
"regulation_id": "csrd_2022",
|
||||
"name": "Corporate Sustainability Reporting Directive (CSRD)",
|
||||
"short": "CSRD",
|
||||
"category": "sustainability",
|
||||
},
|
||||
{
|
||||
"celex": "32024L1760",
|
||||
"regulation_id": "csddd_2024",
|
||||
"name": "Corporate Sustainability Due Diligence Directive (CSDDD)",
|
||||
"short": "CSDDD",
|
||||
"category": "sustainability",
|
||||
},
|
||||
{
|
||||
"celex": "32020R0852",
|
||||
"regulation_id": "eu_taxonomy_2020",
|
||||
"name": "EU-Taxonomie-Verordnung",
|
||||
"short": "EU Taxonomy",
|
||||
"category": "sustainability",
|
||||
},
|
||||
{
|
||||
"celex": "32024R1183",
|
||||
"regulation_id": "eidas_2_0_2024",
|
||||
"name": "eIDAS 2.0 Verordnung (EU Digital Identity)",
|
||||
"short": "eIDAS 2.0",
|
||||
"category": "digital_identity",
|
||||
},
|
||||
{
|
||||
"celex": "32023L0970",
|
||||
"regulation_id": "pay_transparency_2023",
|
||||
"name": "Entgelttransparenz-Richtlinie",
|
||||
"short": "Pay Transparency",
|
||||
"category": "employment",
|
||||
},
|
||||
{
|
||||
"celex": "32022R2065",
|
||||
"regulation_id": "dsa_2022_updated",
|
||||
"name": "Digital Services Act (DSA) — aktualisiert",
|
||||
"short": "DSA",
|
||||
"category": "digital_services",
|
||||
"skip_if_exists": "dsa_2022", # already exists under different ID
|
||||
},
|
||||
class IngestResult(TypedDict):
    """Outcome record for one act, collected per run and printed in the summary."""

    # Short label of the act the record belongs to.
    reg: str
    # Outcome tag such as "ok", "exists", "dry_run", "download_failed" or "gate_failed".
    status: str
    # Count attached to the outcome: existing chunks, planned units or uploaded chunks.
    chunks: int
|
||||
|
||||
|
||||
def _rank(celex: str) -> str:
|
||||
"""eu_directive for L-acts, eu_regulation otherwise (CELEX descriptor letter)."""
|
||||
return "eu_directive" if len(celex) > 5 and celex[5] == "L" else "eu_regulation"
|
||||
|
||||
|
||||
def _spec(celex: str, name_de: str, short: str, version_date: str = "") -> RegSpec:
    """Assemble the RegSpec for one act targeting the module's COLLECTION.

    Args:
        celex: CELEX number of the act.
        name_de: Title passed through as ``name_de``.
        short: Short label, stored as the spec's ``reg``.
        version_date: Passed through unchanged; defaults to the empty string.

    Returns:
        A RegSpec whose ``legal_basis_rank`` is derived from ``celex``.
    """
    return RegSpec(
        reg=short,
        celex=celex,
        name_de=name_de,
        collection=COLLECTION,
        version_date=version_date,
        legal_basis_rank=_rank(celex),
    )
|
||||
|
||||
|
||||
# Acts this script ingests. The proven MVP acts (CRA / AI Act / DORA / NIS2 /
|
||||
# MaschinenVO / DSGVO) are already in the corpus and get re-ingested via a
|
||||
# separate, controlled step — not here.
|
||||
# One row per act: (CELEX number, title, short label). Every row is turned
# into a RegSpec by _spec, in the order listed here.
SPECS = [
    _spec(celex, title, label)
    for celex, title, label in (
        ("32022L2464", "Corporate Sustainability Reporting Directive (CSRD)", "CSRD"),
        ("32024L1760", "Corporate Sustainability Due Diligence Directive (CSDDD)", "CSDDD"),
        ("32020R0852", "EU-Taxonomie-Verordnung", "EU Taxonomy"),
        ("32024R1183", "eIDAS 2.0 Verordnung (EU Digital Identity)", "eIDAS 2.0"),
        ("32023L0970", "Entgelttransparenz-Richtlinie", "Pay Transparency"),
        ("32022R2065", "Digital Services Act (DSA)", "DSA"),
    )
]
|
||||
|
||||
|
||||
def download_eurlex(celex: str) -> str:
    """Fetch the EUR-Lex HTML page for a CELEX number and return its text.

    Redirects are followed and the request times out after 30 seconds. A
    non-success HTTP status raises via ``raise_for_status``.
    """
    target = EURLEX_URL.format(celex=celex)
    with httpx.Client(timeout=30.0, follow_redirects=True) as session:
        response = session.get(target)
        response.raise_for_status()
        body = response.text
    return body
|
||||
|
||||
|
||||
def upload_html(html: str, filename: str, reg: dict, dry_run: bool = False):
    """Send one act's HTML to the RAG service upload endpoint.

    Args:
        html: Document text; encoded as UTF-8 for the multipart upload.
        filename: File name reported to the service.
        reg: Mapping providing "regulation_id", "name", "short", "celex"
            and "category".
        dry_run: When true, only log the size and return a zero-chunk result.

    Returns:
        The decoded JSON response, or ``{"chunks_count": 0}`` on a dry run.
    """
    if dry_run:
        logger.info(" DRY RUN — would upload %d chars", len(html))
        return {"chunks_count": 0}

    # Fields copied from the act description come first, then the fixed
    # provenance tags; insertion order is the order serialised into JSON.
    meta = {
        target: reg[source]
        for target, source in (
            ("regulation_id", "regulation_id"),
            ("regulation_name_de", "name"),
            ("regulation_short", "short"),
            ("celex", "celex"),
            ("category", "category"),
        )
    }
    meta.update(
        source="EUR-Lex",
        license="EU_law",
        jurisdiction="EU",
        source_type="law",
    )

    form_data = dict(
        collection=COLLECTION,
        data_type="compliance",
        bundesland="bund",
        use_case="compliance",
        year="2026",
        chunk_strategy="legal",
        chunk_size="1500",
        chunk_overlap="100",
        metadata_json=json.dumps(meta, ensure_ascii=False),
    )

    # NOTE(review): certificate verification is switched off for this call —
    # presumably the service uses a self-signed certificate; confirm.
    with httpx.Client(timeout=600.0, verify=False) as session:
        response = session.post(
            f"{RAG_URL}/api/v1/documents/upload",
            files={"file": (filename, html.encode("utf-8"), "text/html")},
            data=form_data,
        )
        response.raise_for_status()
        return response.json()
|
||||
|
||||
|
||||
def count_existing(regulation_id: str) -> int:
|
||||
with httpx.Client(timeout=60.0) as c:
|
||||
resp = c.post(
|
||||
def count_existing(celex: str) -> int:
    """Return how many points in COLLECTION carry this CELEX number.

    Posts an exact count request to the collection's ``points/count``
    endpoint, filtered on the ``celex`` field. Callers use the result as a
    skip guard. A non-success HTTP status raises via ``raise_for_status``.
    """
    endpoint = f"{QDRANT_URL}/collections/{COLLECTION}/points/count"
    celex_filter = {"must": [{"key": "celex", "match": {"value": celex}}]}
    with httpx.Client(timeout=60.0, verify=False) as session:
        reply = session.post(endpoint, json={"filter": celex_filter, "exact": True})
        reply.raise_for_status()
        payload = reply.json()
    return int(payload["result"]["count"])
|
||||
|
||||
|
||||
def main():
|
||||
def ingest_one(spec: RegSpec, dry_run: bool) -> IngestResult:
    """Process a single act: skip check, download, parse, self-test, upload.

    Args:
        spec: Description of the act to ingest.
        dry_run: When true, stop after building the upload units and report
            how many would be sent.

    Returns:
        An IngestResult whose status is "exists", "download_failed",
        "gate_failed", "dry_run" or "ok".
    """

    def outcome(status: str, count: int) -> IngestResult:
        # All exits share the same record shape; only status and count vary.
        return {"reg": spec.reg, "status": status, "chunks": count}

    # Skip guard: anything already stored for this CELEX is left untouched.
    existing = count_existing(spec.celex)
    if existing > 0:
        logger.info(" already present: %d chunks — SKIPPING (no re-ingest)", existing)
        return outcome("exists", existing)

    try:
        html = download_act(spec.celex)
    except Exception as exc:  # noqa: BLE001 — log + continue with the next act
        logger.error(" download FAILED: %s", exc)
        return outcome("download_failed", 0)

    act = parse_html(html, spec.reg)
    passed, problems = self_test(act)
    logger.info(" parsed: %d articles, %d annexes", len(act.articles), len(act.annexes))
    if not passed:
        # Gate: a parse that fails its self-test is never uploaded.
        logger.error(" GATE FAIL — %s", "; ".join(problems))
        return outcome("gate_failed", 0)

    units = build_upload_units(act, spec, RUN_TAG)
    if dry_run:
        logger.info(" DRY RUN — would upload %d units", len(units))
        return outcome("dry_run", len(units))

    # NOTE(review): an exception during upload propagates to the caller and
    # can leave some units already stored — confirm this is intended, since
    # the skip guard above would then treat the act as present.
    with httpx.Client(timeout=600.0, verify=False) as client:
        chunks = sum(upload_unit(client, RAG_URL, unit) for unit in units)
    logger.info(" uploaded: %d units, %d chunks", len(units), chunks)
    return outcome("ok", chunks)
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: ingest every act in SPECS and log a summary.

    Accepts ``--dry-run``, which is forwarded to each per-act ingest call.
    Between consecutive acts the loop pauses for one second.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument("--dry-run", action="store_true")
    args = cli.parse_args()

    rule = "=" * 60
    act_count = len(SPECS)

    logger.info(rule)
    logger.info("LegalActIngester — %d acts | dry_run=%s", act_count, args.dry_run)
    logger.info(rule)

    results: list[IngestResult] = []
    for position, spec in enumerate(SPECS, start=1):
        logger.info("\n[%d/%d] %s (CELEX %s)", position, act_count, spec.name_de, spec.celex)
        results.append(ingest_one(spec, args.dry_run))
        # No pause after the final act.
        if position < act_count:
            time.sleep(1)

    # Summary
    logger.info("\n" + rule)
    logger.info("RESULTS")
    logger.info(rule)
    total = 0
    for entry in results:
        logger.info(" %-18s %-15s chunks=%s", entry["reg"], entry["status"].upper(), entry["chunks"])
        # Only successful uploads count towards the total of new chunks.
        if entry["status"] == "ok":
            total += entry["chunks"]
    logger.info("\nTotal new chunks: %d", total)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
Reference in New Issue
Block a user