#!/usr/bin/env python3
"""Ingest missing German laws from gesetze-im-internet.de.

Downloads full HTML, strips to text, uploads with legal chunking strategy.
Handles ISO-8859-1 charset typical for gesetze-im-internet.de.

Usage (on Mac Mini):
    python3 control-pipeline/scripts/ingest_de_laws.py --dry-run
    python3 control-pipeline/scripts/ingest_de_laws.py
"""

import argparse
import json
import logging
import time
from typing import Optional

import httpx

logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("ingest-laws")

# Local service endpoints (RAG upload API uses a self-signed TLS cert).
RAG_URL = "https://localhost:8097"
QDRANT_URL = "http://localhost:6333"
COLLECTION = "bp_compliance_gesetze"

# ---- Laws to ingest ----
# Format: (slug on gesetze-im-internet.de, regulation_id, display_name)
# URL pattern: https://www.gesetze-im-internet.de/{slug}/BJNR*.html (full text)
LAWS = [
    {
        "url": "https://www.gesetze-im-internet.de/arbzg/BJNR117100994.html",
        "regulation_id": "de_arbzg",
        "name": "Arbeitszeitgesetz (ArbZG)",
        "short": "ArbZG",
    },
    {
        "url": "https://www.gesetze-im-internet.de/muschg_2018/BJNR122810017.html",
        "regulation_id": "de_muschg",
        "name": "Mutterschutzgesetz (MuSchG)",
        "short": "MuSchG",
    },
    {
        "url": "https://www.gesetze-im-internet.de/nachwg/BJNR094610995.html",
        "regulation_id": "de_nachwg",
        "name": "Nachweisgesetz (NachwG)",
        "short": "NachwG",
    },
    {
        "url": "https://www.gesetze-im-internet.de/milog/BJNR134810014.html",
        "regulation_id": "de_milog",
        "name": "Mindestlohngesetz (MiLoG)",
        "short": "MiLoG",
    },
    {
        "url": "https://www.gesetze-im-internet.de/gmbhg/BJNR004770892.html",
        "regulation_id": "de_gmbhg",
        "name": "GmbH-Gesetz (GmbHG)",
        "short": "GmbHG",
    },
    {
        "url": "https://www.gesetze-im-internet.de/aktg/BJNR010890965.html",
        "regulation_id": "de_aktg",
        "name": "Aktiengesetz (AktG)",
        "short": "AktG",
    },
    {
        "url": "https://www.gesetze-im-internet.de/inso/BJNR286600994.html",
        "regulation_id": "de_inso",
        "name": "Insolvenzordnung (InsO)",
        "short": "InsO",
    },
    # BEG IV is an amending act — it has no standalone text on gesetze-im-internet.de
    {
        "url": "https://www.gesetze-im-internet.de/verpflg/BJNR009690974.html",
        "regulation_id": "de_verpflichtungsgesetz",
        "name": "Verpflichtungsgesetz",
        "short": "VerpflG",
    },
    {
        "url": "https://www.gesetze-im-internet.de/burlg/BJNR000020963.html",
        "regulation_id": "de_burlg",
        "name": "Bundesurlaubsgesetz (BUrlG)",
        "short": "BUrlG",
    },
    {
        "url": "https://www.gesetze-im-internet.de/entgfg/BJNR118010994.html",
        "regulation_id": "de_entgfg",
        "name": "Entgeltfortzahlungsgesetz (EntgFG)",
        "short": "EntgFG",
    },
]


def download_law(url: str) -> Optional[str]:
    """Download law HTML from gesetze-im-internet.de, handle charset.

    Returns the decoded HTML, or ``None`` on a non-200 response (the
    error is logged, not raised, so the caller can continue with the
    next law).
    """
    with httpx.Client(timeout=30.0, follow_redirects=True) as c:
        resp = c.get(url)
        if resp.status_code != 200:
            logger.error("  HTTP %d for %s", resp.status_code, url)
            return None

        # gesetze-im-internet.de uses ISO-8859-1
        content_type = resp.headers.get("content-type", "")
        if "charset" in content_type:
            # Server declared a charset — trust httpx's decoding of it.
            html = resp.text
        else:
            # No declared charset: try strict UTF-8 first, fall back to
            # ISO-8859-1.  The U+FFFD check catches pages whose bytes
            # already contain replacement characters from a bad upstream
            # transcode.
            try:
                html = resp.content.decode("utf-8")
                if "\ufffd" in html:
                    raise UnicodeDecodeError("utf-8", b"", 0, 1, "replacement chars")
            except (UnicodeDecodeError, ValueError):
                html = resp.content.decode("iso-8859-1")
        return html


def upload_html(
    html: str,
    filename: str,
    regulation_id: str,
    name: str,
    short: str,
    dry_run: bool = False,
) -> Optional[dict]:
    """Upload HTML to RAG service with legal chunking.

    When *dry_run* is true, no request is made and a stub result dict is
    returned.  Raises ``httpx.HTTPStatusError`` if the upload endpoint
    responds with an error status.
    """
    if dry_run:
        logger.info("  DRY RUN — would upload %d chars", len(html))
        return {"chunks_count": 0, "document_id": "dry-run"}

    meta = {
        "regulation_id": regulation_id,
        "regulation_name_de": name,
        "regulation_short": short,
        "source": "gesetze-im-internet.de",
        "license": "public_domain_de_law",
        "jurisdiction": "DE",
        "source_type": "law",
    }
    # NOTE(review): "year" is hard-coded — bump or parameterize for future runs.
    form_data = {
        "collection": COLLECTION,
        "data_type": "compliance",
        "bundesland": "bund",
        "use_case": "compliance",
        "year": "2026",
        "chunk_strategy": "legal",
        "chunk_size": "1500",
        "chunk_overlap": "100",
        "metadata_json": json.dumps(meta, ensure_ascii=False),
    }
    # verify=False: local RAG service uses a self-signed certificate.
    with httpx.Client(timeout=600.0, verify=False) as c:
        resp = c.post(
            f"{RAG_URL}/api/v1/documents/upload",
            files={"file": (filename, html.encode("utf-8"), "text/html")},
            data=form_data,
        )
        resp.raise_for_status()
        return resp.json()


def count_existing(regulation_id: str) -> int:
    """Check if regulation already exists in Qdrant.

    Returns the exact number of stored chunks tagged with
    *regulation_id*.  A missing collection (HTTP 404 from Qdrant) is
    treated as "nothing ingested yet" and yields 0, so the script works
    against a fresh Qdrant instead of aborting before the first upload.
    """
    with httpx.Client(timeout=30.0) as c:
        resp = c.post(
            f"{QDRANT_URL}/collections/{COLLECTION}/points/count",
            json={
                "filter": {"must": [
                    {"key": "regulation_id", "match": {"value": regulation_id}}
                ]},
                "exact": True,
            },
        )
        if resp.status_code == 404:
            # Collection does not exist yet — count is trivially zero.
            return 0
        resp.raise_for_status()
        return resp.json()["result"]["count"]


def main():
    """Download and ingest each law, skipping ones already in Qdrant."""
    parser = argparse.ArgumentParser(description="Ingest DE laws from gesetze-im-internet.de")
    parser.add_argument("--dry-run", action="store_true")
    args = parser.parse_args()

    logger.info("=" * 60)
    logger.info("Ingest German Laws")
    logger.info("  Laws: %d", len(LAWS))
    logger.info("  Collection: %s", COLLECTION)
    logger.info("  Dry run: %s", args.dry_run)
    logger.info("=" * 60)

    results = []
    for i, law in enumerate(LAWS, 1):
        logger.info("\n[%d/%d] %s (%s)", i, len(LAWS), law["name"], law["regulation_id"])

        # Check if already exists
        existing = count_existing(law["regulation_id"])
        if existing > 0:
            logger.info("  Already exists: %d chunks — SKIPPING", existing)
            results.append({"law": law["short"], "status": "exists", "chunks": existing})
            continue

        # Download
        logger.info("  Downloading: %s", law["url"])
        html = download_law(law["url"])
        if not html:
            results.append({"law": law["short"], "status": "download_failed", "chunks": 0})
            continue
        logger.info("  Downloaded: %d chars", len(html))

        # Upload
        filename = f"{law['regulation_id']}.html"
        try:
            result = upload_html(
                html,
                filename,
                law["regulation_id"],
                law["name"],
                law["short"],
                args.dry_run,
            )
            chunks = result.get("chunks_count", 0) if result else 0
            logger.info("  Uploaded: %d chunks", chunks)
            results.append({"law": law["short"], "status": "ok", "chunks": chunks})
        except Exception as e:
            logger.error("  Upload FAILED: %s", e)
            results.append({"law": law["short"], "status": "error", "chunks": 0})

        # Be polite to the upstream server between laws.
        if i < len(LAWS):
            time.sleep(1)

    # Summary
    logger.info("\n" + "=" * 60)
    logger.info("RESULTS")
    logger.info("=" * 60)
    for r in results:
        logger.info("  %-10s %s chunks=%d", r["law"], r["status"].upper(), r["chunks"])

    total_new = sum(r["chunks"] for r in results if r["status"] == "ok")
    logger.info("\nTotal new chunks: %d", total_new)


if __name__ == "__main__":
    main()