Files
breakpilot-core/control-pipeline/scripts/ingest_de_laws.py
T
Benjamin Admin 118be3540d feat(pipeline): D6 citation backfill + E2/E3 law ingestion scripts
- d6_citation_backfill.py: 3-tier matching (hash/prefix/overlap),
  archives old citations, updates 3.651 controls (93.6% coverage)
- ingest_de_laws.py: 8 German laws ingested (ArbZG, MuSchG, NachwG,
  MiLoG, GmbHG, AktG, InsO, BUrlG — 1.629 chunks)
- ingest_eu_regulations.py: EUR-Lex ingestion (needs manual HTML due
  to AWS WAF). CSRD, CSDDD, EU Taxonomy, eIDAS 2.0, Pay Transparency
  manually ingested (1.057 chunks)
- Updated session handover with current state

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-03 13:19:27 +02:00

241 lines
7.8 KiB
Python

#!/usr/bin/env python3
"""Ingest missing German laws from gesetze-im-internet.de.
Downloads full HTML, strips to text, uploads with legal chunking strategy.
Handles ISO-8859-1 charset typical for gesetze-im-internet.de.
Usage (on Mac Mini):
python3 control-pipeline/scripts/ingest_de_laws.py --dry-run
python3 control-pipeline/scripts/ingest_de_laws.py
"""
import argparse
import json
import logging
import time
from typing import Optional
import httpx
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("ingest-laws")
RAG_URL = "https://localhost:8097"
QDRANT_URL = "http://localhost:6333"
COLLECTION = "bp_compliance_gesetze"
# ---- Laws to ingest ----
# Format: (slug on gesetze-im-internet.de, regulation_id, display_name)
# URL pattern: https://www.gesetze-im-internet.de/{slug}/BJNR*.html (full text)
LAWS = [
{
"url": "https://www.gesetze-im-internet.de/arbzg/BJNR117100994.html",
"regulation_id": "de_arbzg",
"name": "Arbeitszeitgesetz (ArbZG)",
"short": "ArbZG",
},
{
"url": "https://www.gesetze-im-internet.de/muschg_2018/BJNR122810017.html",
"regulation_id": "de_muschg",
"name": "Mutterschutzgesetz (MuSchG)",
"short": "MuSchG",
},
{
"url": "https://www.gesetze-im-internet.de/nachwg/BJNR094610995.html",
"regulation_id": "de_nachwg",
"name": "Nachweisgesetz (NachwG)",
"short": "NachwG",
},
{
"url": "https://www.gesetze-im-internet.de/milog/BJNR134810014.html",
"regulation_id": "de_milog",
"name": "Mindestlohngesetz (MiLoG)",
"short": "MiLoG",
},
{
"url": "https://www.gesetze-im-internet.de/gmbhg/BJNR004770892.html",
"regulation_id": "de_gmbhg",
"name": "GmbH-Gesetz (GmbHG)",
"short": "GmbHG",
},
{
"url": "https://www.gesetze-im-internet.de/aktg/BJNR010890965.html",
"regulation_id": "de_aktg",
"name": "Aktiengesetz (AktG)",
"short": "AktG",
},
{
"url": "https://www.gesetze-im-internet.de/inso/BJNR286600994.html",
"regulation_id": "de_inso",
"name": "Insolvenzordnung (InsO)",
"short": "InsO",
},
# BEG IV ist ein Aenderungsgesetz — kein eigenstaendiger Text auf gesetze-im-internet.de
{
"url": "https://www.gesetze-im-internet.de/verpflg/BJNR009690974.html",
"regulation_id": "de_verpflichtungsgesetz",
"name": "Verpflichtungsgesetz",
"short": "VerpflG",
},
{
"url": "https://www.gesetze-im-internet.de/burlg/BJNR000020963.html",
"regulation_id": "de_burlg",
"name": "Bundesurlaubsgesetz (BUrlG)",
"short": "BUrlG",
},
{
"url": "https://www.gesetze-im-internet.de/entgfg/BJNR118010994.html",
"regulation_id": "de_entgfg",
"name": "Entgeltfortzahlungsgesetz (EntgFG)",
"short": "EntgFG",
},
]
def download_law(url: str) -> Optional[str]:
"""Download law HTML from gesetze-im-internet.de, handle charset."""
with httpx.Client(timeout=30.0, follow_redirects=True) as c:
resp = c.get(url)
if resp.status_code != 200:
logger.error(" HTTP %d for %s", resp.status_code, url)
return None
# gesetze-im-internet.de uses ISO-8859-1
content_type = resp.headers.get("content-type", "")
if "charset" in content_type:
# Use declared charset
html = resp.text
else:
# Try UTF-8 first, fall back to ISO-8859-1
try:
html = resp.content.decode("utf-8")
if "\ufffd" in html:
raise UnicodeDecodeError("utf-8", b"", 0, 1, "replacement chars")
except (UnicodeDecodeError, ValueError):
html = resp.content.decode("iso-8859-1")
return html
def upload_html(
html: str,
filename: str,
regulation_id: str,
name: str,
short: str,
dry_run: bool = False,
) -> Optional[dict]:
"""Upload HTML to RAG service with legal chunking."""
if dry_run:
logger.info(" DRY RUN — would upload %d chars", len(html))
return {"chunks_count": 0, "document_id": "dry-run"}
meta = {
"regulation_id": regulation_id,
"regulation_name_de": name,
"regulation_short": short,
"source": "gesetze-im-internet.de",
"license": "public_domain_de_law",
"jurisdiction": "DE",
"source_type": "law",
}
form_data = {
"collection": COLLECTION,
"data_type": "compliance",
"bundesland": "bund",
"use_case": "compliance",
"year": "2026",
"chunk_strategy": "legal",
"chunk_size": "1500",
"chunk_overlap": "100",
"metadata_json": json.dumps(meta, ensure_ascii=False),
}
with httpx.Client(timeout=600.0, verify=False) as c:
resp = c.post(
f"{RAG_URL}/api/v1/documents/upload",
files={"file": (filename, html.encode("utf-8"), "text/html")},
data=form_data,
)
resp.raise_for_status()
return resp.json()
def count_existing(regulation_id: str) -> int:
"""Check if regulation already exists in Qdrant."""
with httpx.Client(timeout=30.0) as c:
resp = c.post(
f"{QDRANT_URL}/collections/{COLLECTION}/points/count",
json={
"filter": {"must": [
{"key": "regulation_id", "match": {"value": regulation_id}}
]},
"exact": True,
},
)
resp.raise_for_status()
return resp.json()["result"]["count"]
def main():
parser = argparse.ArgumentParser(description="Ingest DE laws from gesetze-im-internet.de")
parser.add_argument("--dry-run", action="store_true")
args = parser.parse_args()
logger.info("=" * 60)
logger.info("Ingest German Laws")
logger.info(" Laws: %d", len(LAWS))
logger.info(" Collection: %s", COLLECTION)
logger.info(" Dry run: %s", args.dry_run)
logger.info("=" * 60)
results = []
for i, law in enumerate(LAWS, 1):
logger.info("\n[%d/%d] %s (%s)", i, len(LAWS), law["name"], law["regulation_id"])
# Check if already exists
existing = count_existing(law["regulation_id"])
if existing > 0:
logger.info(" Already exists: %d chunks — SKIPPING", existing)
results.append({"law": law["short"], "status": "exists", "chunks": existing})
continue
# Download
logger.info(" Downloading: %s", law["url"])
html = download_law(law["url"])
if not html:
results.append({"law": law["short"], "status": "download_failed", "chunks": 0})
continue
logger.info(" Downloaded: %d chars", len(html))
# Upload
filename = f"{law['regulation_id']}.html"
try:
result = upload_html(
html, filename, law["regulation_id"],
law["name"], law["short"], args.dry_run,
)
chunks = result.get("chunks_count", 0) if result else 0
logger.info(" Uploaded: %d chunks", chunks)
results.append({"law": law["short"], "status": "ok", "chunks": chunks})
except Exception as e:
logger.error(" Upload FAILED: %s", e)
results.append({"law": law["short"], "status": "error", "chunks": 0})
if i < len(LAWS):
time.sleep(1)
# Summary
logger.info("\n" + "=" * 60)
logger.info("RESULTS")
logger.info("=" * 60)
for r in results:
logger.info(" %-10s %s chunks=%d", r["law"], r["status"].upper(), r["chunks"])
total_new = sum(r["chunks"] for r in results if r["status"] == "ok")
logger.info("\nTotal new chunks: %d", total_new)
if __name__ == "__main__":
main()