feat(pipeline): D6 citation backfill + E2/E3 law ingestion scripts

- d6_citation_backfill.py: 3-tier matching (hash/prefix/overlap), archives old citations, updated 3.651 controls (93.6% coverage) - ingest_de_laws.py: 8 German laws ingested (ArbZG, MuSchG, NachwG, MiLoG, GmbHG, AktG, InsO, BUrlG — 1.629 chunks) - ingest_eu_regulations.py: EUR-Lex ingestion (needs manual HTML due to AWS WAF). CSRD, CSDDD, EU Taxonomy, eIDAS 2.0, Pay Transparency manually ingested (1.057 chunks) - Updated session handover with current state Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-03 13:19:27 +02:00
parent a9671a572b
commit 118be3540d
4 changed files with 1033 additions and 109 deletions
@@ -0,0 +1,201 @@
+#!/usr/bin/env python3
+"""Ingest missing EU regulations from EUR-Lex (HTML).
+
+Downloads German HTML from EUR-Lex via CELEX number, uploads with legal chunking.
+
+Usage (on Mac Mini):
+    python3 control-pipeline/scripts/ingest_eu_regulations.py --dry-run
+    python3 control-pipeline/scripts/ingest_eu_regulations.py
+"""
+
+import argparse
+import json
+import logging
+import time
+
+import httpx
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
+logger = logging.getLogger("ingest-eu")
+
+RAG_URL = "https://localhost:8097"
+QDRANT_URL = "http://localhost:6333"
+COLLECTION = "bp_compliance_ce"
+
+EURLEX_URL = "https://eur-lex.europa.eu/legal-content/DE/TXT/HTML/?uri=CELEX:{celex}"
+
+# ---- EU Regulations to ingest ----
+REGULATIONS = [
+    {
+        "celex": "32022L2464",
+        "regulation_id": "csrd_2022",
+        "name": "Corporate Sustainability Reporting Directive (CSRD)",
+        "short": "CSRD",
+        "category": "sustainability",
+    },
+    {
+        "celex": "32024L1760",
+        "regulation_id": "csddd_2024",
+        "name": "Corporate Sustainability Due Diligence Directive (CSDDD)",
+        "short": "CSDDD",
+        "category": "sustainability",
+    },
+    {
+        "celex": "32020R0852",
+        "regulation_id": "eu_taxonomy_2020",
+        "name": "EU-Taxonomie-Verordnung",
+        "short": "EU Taxonomy",
+        "category": "sustainability",
+    },
+    {
+        "celex": "32024R1183",
+        "regulation_id": "eidas_2_0_2024",
+        "name": "eIDAS 2.0 Verordnung (EU Digital Identity)",
+        "short": "eIDAS 2.0",
+        "category": "digital_identity",
+    },
+    {
+        "celex": "32023L0970",
+        "regulation_id": "pay_transparency_2023",
+        "name": "Entgelttransparenz-Richtlinie",
+        "short": "Pay Transparency",
+        "category": "employment",
+    },
+    {
+        "celex": "32022R2065",
+        "regulation_id": "dsa_2022_updated",
+        "name": "Digital Services Act (DSA) — aktualisiert",
+        "short": "DSA",
+        "category": "digital_services",
+        "skip_if_exists": "dsa_2022",  # already exists under different ID
+    },
+]
+
+
+def download_eurlex(celex: str) -> str:
+    """Download EU regulation HTML from EUR-Lex."""
+    url = EURLEX_URL.format(celex=celex)
+    with httpx.Client(timeout=30.0, follow_redirects=True) as c:
+        resp = c.get(url)
+        resp.raise_for_status()
+        return resp.text
+
+
+def upload_html(html: str, filename: str, reg: dict, dry_run: bool = False):
+    """Upload HTML to RAG service."""
+    if dry_run:
+        logger.info("  DRY RUN — would upload %d chars", len(html))
+        return {"chunks_count": 0}
+
+    meta = {
+        "regulation_id": reg["regulation_id"],
+        "regulation_name_de": reg["name"],
+        "regulation_short": reg["short"],
+        "celex": reg["celex"],
+        "category": reg["category"],
+        "source": "EUR-Lex",
+        "license": "EU_law",
+        "jurisdiction": "EU",
+        "source_type": "law",
+    }
+    form_data = {
+        "collection": COLLECTION,
+        "data_type": "compliance",
+        "bundesland": "bund",
+        "use_case": "compliance",
+        "year": "2026",
+        "chunk_strategy": "legal",
+        "chunk_size": "1500",
+        "chunk_overlap": "100",
+        "metadata_json": json.dumps(meta, ensure_ascii=False),
+    }
+    with httpx.Client(timeout=600.0, verify=False) as c:
+        resp = c.post(
+            f"{RAG_URL}/api/v1/documents/upload",
+            files={"file": (filename, html.encode("utf-8"), "text/html")},
+            data=form_data,
+        )
+        resp.raise_for_status()
+        return resp.json()
+
+
+def count_existing(regulation_id: str) -> int:
+    with httpx.Client(timeout=60.0) as c:
+        resp = c.post(
+            f"{QDRANT_URL}/collections/{COLLECTION}/points/count",
+            json={"filter": {"must": [
+                {"key": "regulation_id", "match": {"value": regulation_id}}
+            ]}, "exact": True},
+        )
+        resp.raise_for_status()
+        return resp.json()["result"]["count"]
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--dry-run", action="store_true")
+    args = parser.parse_args()
+
+    logger.info("=" * 60)
+    logger.info("Ingest EU Regulations from EUR-Lex")
+    logger.info("  Regulations: %d", len(REGULATIONS))
+    logger.info("  Dry run: %s", args.dry_run)
+    logger.info("=" * 60)
+
+    results = []
+    for i, reg in enumerate(REGULATIONS, 1):
+        logger.info("\n[%d/%d] %s (CELEX: %s)", i, len(REGULATIONS), reg["name"], reg["celex"])
+
+        # Skip if variant already exists
+        skip_id = reg.get("skip_if_exists")
+        if skip_id:
+            existing = count_existing(skip_id)
+            if existing > 0:
+                logger.info("  Already exists as '%s' (%d chunks) — SKIPPING", skip_id, existing)
+                results.append({"reg": reg["short"], "status": "exists", "chunks": existing})
+                continue
+
+        # Check if this exact ID exists
+        existing = count_existing(reg["regulation_id"])
+        if existing > 0:
+            logger.info("  Already exists: %d chunks — SKIPPING", existing)
+            results.append({"reg": reg["short"], "status": "exists", "chunks": existing})
+            continue
+
+        # Download from EUR-Lex
+        logger.info("  Downloading from EUR-Lex...")
+        try:
+            html = download_eurlex(reg["celex"])
+            logger.info("  Downloaded: %d chars", len(html))
+        except Exception as e:
+            logger.error("  Download FAILED: %s", e)
+            results.append({"reg": reg["short"], "status": "download_failed", "chunks": 0})
+            continue
+
+        # Upload
+        filename = f"{reg['regulation_id']}.html"
+        try:
+            result = upload_html(html, filename, reg, args.dry_run)
+            chunks = result.get("chunks_count", 0)
+            logger.info("  Uploaded: %d chunks", chunks)
+            results.append({"reg": reg["short"], "status": "ok", "chunks": chunks})
+        except Exception as e:
+            logger.error("  Upload FAILED: %s", e)
+            results.append({"reg": reg["short"], "status": "error", "chunks": 0})
+
+        if i < len(REGULATIONS):
+            time.sleep(2)
+
+    # Summary
+    logger.info("\n" + "=" * 60)
+    logger.info("RESULTS")
+    logger.info("=" * 60)
+    for r in results:
+        logger.info("  %-20s %s  chunks=%d", r["reg"], r["status"].upper(), r["chunks"])
+
+    total_new = sum(r["chunks"] for r in results if r["status"] == "ok")
+    logger.info("\nTotal new chunks: %d", total_new)
+
+
+if __name__ == "__main__":
+    main()