118be3540d
- d6_citation_backfill.py: 3-tier matching (hash/prefix/overlap), archives old citations, updated 3,651 controls (93.6% coverage)
- ingest_de_laws.py: 8 German laws ingested (ArbZG, MuSchG, NachwG, MiLoG, GmbHG, AktG, InsO, BUrlG — 1,629 chunks)
- ingest_eu_regulations.py: EUR-Lex ingestion (needs manual HTML due to AWS WAF); CSRD, CSDDD, EU Taxonomy, eIDAS 2.0, Pay Transparency manually ingested (1,057 chunks)
- Updated session handover with current state

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
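The d6_citation_backfill.py script mentioned above is not part of this page. As a rough illustration of the 3-tier matching idea it refers to (exact hash, then prefix, then token overlap), a minimal sketch with hypothetical helper names and an assumed overlap threshold could look like this:

import hashlib

def _norm(text):
    """Lowercase and collapse whitespace so hashing and comparison are stable."""
    return " ".join(text.lower().split())

def match_citation(control_text, candidates):
    """Return the best candidate citation dict for a control, or None.

    Tier 1: exact hash of the normalised text.
    Tier 2: prefix relationship between the two texts.
    Tier 3: token-overlap ratio above an assumed threshold.
    """
    control_norm = _norm(control_text)
    control_hash = hashlib.sha256(control_norm.encode()).hexdigest()

    # Tier 1: exact hash match
    for cand in candidates:
        if hashlib.sha256(_norm(cand["text"]).encode()).hexdigest() == control_hash:
            return cand

    # Tier 2: prefix match in either direction
    for cand in candidates:
        cand_norm = _norm(cand["text"])
        if control_norm.startswith(cand_norm) or cand_norm.startswith(control_norm):
            return cand

    # Tier 3: Jaccard-style token overlap; 0.5 is a hypothetical cut-off
    control_tokens = set(control_norm.split())
    best, best_score = None, 0.0
    for cand in candidates:
        cand_tokens = set(_norm(cand["text"]).split())
        if not cand_tokens:
            continue
        score = len(control_tokens & cand_tokens) / len(control_tokens | cand_tokens)
        if score > best_score:
            best, best_score = cand, score
    return best if best_score >= 0.5 else None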
202 lines
6.5 KiB
Python
#!/usr/bin/env python3
"""Ingest missing EU regulations from EUR-Lex (HTML).

Downloads German HTML from EUR-Lex via CELEX number, uploads with legal chunking.

Usage (on Mac Mini):
    python3 control-pipeline/scripts/ingest_eu_regulations.py --dry-run
    python3 control-pipeline/scripts/ingest_eu_regulations.py
"""

import argparse
import json
import logging
import time

import httpx

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("ingest-eu")

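# Local service endpoints: RAG upload API (HTTPS) and Qdrant vector store; COLLECTION is the target collection.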
RAG_URL = "https://localhost:8097"
QDRANT_URL = "http://localhost:6333"
COLLECTION = "bp_compliance_ce"

EURLEX_URL = "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{celex}"

# ---- EU Regulations to ingest ----
REGULATIONS = [
    {
        "celex": "32022L2464",
        "regulation_id": "csrd_2022",
        "name": "Corporate Sustainability Reporting Directive (CSRD)",
        "short": "CSRD",
        "category": "sustainability",
    },
    {
        "celex": "32024L1760",
        "regulation_id": "csddd_2024",
        "name": "Corporate Sustainability Due Diligence Directive (CSDDD)",
        "short": "CSDDD",
        "category": "sustainability",
    },
    {
        "celex": "32020R0852",
        "regulation_id": "eu_taxonomy_2020",
        "name": "EU-Taxonomie-Verordnung",
        "short": "EU Taxonomy",
        "category": "sustainability",
    },
    {
        "celex": "32024R1183",
        "regulation_id": "eidas_2_0_2024",
        "name": "eIDAS 2.0 Verordnung (EU Digital Identity)",
        "short": "eIDAS 2.0",
        "category": "digital_identity",
    },
    {
        "celex": "32023L0970",
        "regulation_id": "pay_transparency_2023",
        "name": "Entgelttransparenz-Richtlinie",
        "short": "Pay Transparency",
        "category": "employment",
    },
    {
        "celex": "32022R2065",
        "regulation_id": "dsa_2022_updated",
        "name": "Digital Services Act (DSA) — aktualisiert",
        "short": "DSA",
        "category": "digital_services",
        "skip_if_exists": "dsa_2022",  # already exists under different ID
    },
]


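# NOTE: EUR-Lex sits behind AWS WAF and may block automated requests; if the
# download fails, the HTML has to be fetched manually (see commit message above).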
def download_eurlex(celex: str) -> str:
    """Download EU regulation HTML from EUR-Lex."""
    url = EURLEX_URL.format(celex=celex)
    with httpx.Client(timeout=30.0, follow_redirects=True) as c:
        resp = c.get(url)
        resp.raise_for_status()
        return resp.text


def upload_html(html: str, filename: str, reg: dict, dry_run: bool = False):
    """Upload HTML to RAG service."""
    if dry_run:
        logger.info(" DRY RUN — would upload %d chars", len(html))
        return {"chunks_count": 0}

    meta = {
        "regulation_id": reg["regulation_id"],
        "regulation_name_de": reg["name"],
        "regulation_short": reg["short"],
        "celex": reg["celex"],
        "category": reg["category"],
        "source": "EUR-Lex",
        "license": "EU_law",
        "jurisdiction": "EU",
        "source_type": "law",
    }
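    # Upload as "compliance" data using the "legal" chunking strategy (1500-char chunks, 100 overlap).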
    form_data = {
        "collection": COLLECTION,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": "legal",
        "chunk_size": "1500",
        "chunk_overlap": "100",
        "metadata_json": json.dumps(meta, ensure_ascii=False),
    }
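    # verify=False: the RAG service is reached over HTTPS on localhost, presumably with a self-signed certificate.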
    with httpx.Client(timeout=600.0, verify=False) as c:
        resp = c.post(
            f"{RAG_URL}/api/v1/documents/upload",
            files={"file": (filename, html.encode("utf-8"), "text/html")},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()


def count_existing(regulation_id: str) -> int:
    with httpx.Client(timeout=60.0) as c:
        resp = c.post(
            f"{QDRANT_URL}/collections/{COLLECTION}/points/count",
            json={"filter": {"must": [
                {"key": "regulation_id", "match": {"value": regulation_id}}
            ]}, "exact": True},
        )
        resp.raise_for_status()
        return resp.json()["result"]["count"]


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    logger.info("=" * 60)
    logger.info("Ingest EU Regulations from EUR-Lex")
    logger.info(" Regulations: %d", len(REGULATIONS))
    logger.info(" Dry run: %s", args.dry_run)
    logger.info("=" * 60)

    results = []
    for i, reg in enumerate(REGULATIONS, 1):
        logger.info("\n[%d/%d] %s (CELEX: %s)", i, len(REGULATIONS), reg["name"], reg["celex"])

        # Skip if variant already exists
        skip_id = reg.get("skip_if_exists")
        if skip_id:
            existing = count_existing(skip_id)
            if existing > 0:
                logger.info(" Already exists as '%s' (%d chunks) — SKIPPING", skip_id, existing)
                results.append({"reg": reg["short"], "status": "exists", "chunks": existing})
                continue

        # Check if this exact ID exists
        existing = count_existing(reg["regulation_id"])
        if existing > 0:
            logger.info(" Already exists: %d chunks — SKIPPING", existing)
            results.append({"reg": reg["short"], "status": "exists", "chunks": existing})
            continue

        # Download from EUR-Lex
        logger.info(" Downloading from EUR-Lex...")
        try:
            html = download_eurlex(reg["celex"])
            logger.info(" Downloaded: %d chars", len(html))
        except Exception as e:
            logger.error(" Download FAILED: %s", e)
            results.append({"reg": reg["short"], "status": "download_failed", "chunks": 0})
            continue

        # Upload
        filename = f"{reg['regulation_id']}.html"
        try:
            result = upload_html(html, filename, reg, args.dry_run)
            chunks = result.get("chunks_count", 0)
            logger.info(" Uploaded: %d chunks", chunks)
            results.append({"reg": reg["short"], "status": "ok", "chunks": chunks})
        except Exception as e:
            logger.error(" Upload FAILED: %s", e)
            results.append({"reg": reg["short"], "status": "error", "chunks": 0})

        if i < len(REGULATIONS):
            time.sleep(2)

    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("RESULTS")
    logger.info("=" * 60)
    for r in results:
        logger.info(" %-20s %s chunks=%d", r["reg"], r["status"].upper(), r["chunks"])

    total_new = sum(r["chunks"] for r in results if r["status"] == "ok")
    logger.info("\nTotal new chunks: %d", total_new)


if __name__ == "__main__":
    main()