Files
breakpilot-core/control-pipeline/scripts/ingest_de_laws.py
T
Benjamin Admin 118be3540d feat(pipeline): D6 citation backfill + E2/E3 law ingestion scripts
- d6_citation_backfill.py: 3-tier matching (hash/prefix/overlap),
  archives old citations, updates 3.651 controls (93.6% coverage)
- ingest_de_laws.py: 8 German laws ingested (ArbZG, MuSchG, NachwG,
  MiLoG, GmbHG, AktG, InsO, BUrlG — 1.629 chunks)
- ingest_eu_regulations.py: EUR-Lex ingestion (needs manual HTML due
  to AWS WAF). CSRD, CSDDD, EU Taxonomy, eIDAS 2.0, Pay Transparency
  manually ingested (1.057 chunks)
- Updated session handover with current state

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-03 13:19:27 +02:00

241 lines
7.8 KiB
Python

#!/usr/bin/env python3
"""Ingest missing German laws from gesetze-im-internet.de.
Downloads full HTML, strips to text, uploads with legal chunking strategy.
Handles ISO-8859-1 charset typical for gesetze-im-internet.de.
Usage (on Mac Mini):
python3 control-pipeline/scripts/ingest_de_laws.py --dry-run
python3 control-pipeline/scripts/ingest_de_laws.py
"""
import argparse
import json
import logging
import time
from typing import Optional
import httpx
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("ingest-laws")
RAG_URL = "https://localhost:8097"
QDRANT_URL = "http://localhost:6333"
COLLECTION = "bp_compliance_gesetze"
# ---- Laws to ingest ----
# Format: (slug on gesetze-im-internet.de, regulation_id, display_name)
# URL pattern: https://www.gesetze-im-internet.de/{slug}/BJNR*.html (full text)
LAWS = [
{
"url": "https://www.gesetze-im-internet.de/arbzg/BJNR117100994.html",
"regulation_id": "de_arbzg",
"name": "Arbeitszeitgesetz (ArbZG)",
"short": "ArbZG",
},
{
"url": "https://www.gesetze-im-internet.de/muschg_2018/BJNR122810017.html",
"regulation_id": "de_muschg",
"name": "Mutterschutzgesetz (MuSchG)",
"short": "MuSchG",
},
{
"url": "https://www.gesetze-im-internet.de/nachwg/BJNR094610995.html",
"regulation_id": "de_nachwg",
"name": "Nachweisgesetz (NachwG)",
"short": "NachwG",
},
{
"url": "https://www.gesetze-im-internet.de/milog/BJNR134810014.html",
"regulation_id": "de_milog",
"name": "Mindestlohngesetz (MiLoG)",
"short": "MiLoG",
},
{
"url": "https://www.gesetze-im-internet.de/gmbhg/BJNR004770892.html",
"regulation_id": "de_gmbhg",
"name": "GmbH-Gesetz (GmbHG)",
"short": "GmbHG",
},
{
"url": "https://www.gesetze-im-internet.de/aktg/BJNR010890965.html",
"regulation_id": "de_aktg",
"name": "Aktiengesetz (AktG)",
"short": "AktG",
},
{
"url": "https://www.gesetze-im-internet.de/inso/BJNR286600994.html",
"regulation_id": "de_inso",
"name": "Insolvenzordnung (InsO)",
"short": "InsO",
},
# BEG IV ist ein Aenderungsgesetz — kein eigenstaendiger Text auf gesetze-im-internet.de
{
"url": "https://www.gesetze-im-internet.de/verpflg/BJNR009690974.html",
"regulation_id": "de_verpflichtungsgesetz",
"name": "Verpflichtungsgesetz",
"short": "VerpflG",
},
{
"url": "https://www.gesetze-im-internet.de/burlg/BJNR000020963.html",
"regulation_id": "de_burlg",
"name": "Bundesurlaubsgesetz (BUrlG)",
"short": "BUrlG",
},
{
"url": "https://www.gesetze-im-internet.de/entgfg/BJNR118010994.html",
"regulation_id": "de_entgfg",
"name": "Entgeltfortzahlungsgesetz (EntgFG)",
"short": "EntgFG",
},
]
def download_law(url: str) -> Optional[str]:
"""Download law HTML from gesetze-im-internet.de, handle charset."""
with httpx.Client(timeout=30.0, follow_redirects=True) as c:
resp = c.get(url)
if resp.status_code != 200:
logger.error(" HTTP %d for %s", resp.status_code, url)
return None
# gesetze-im-internet.de uses ISO-8859-1
content_type = resp.headers.get("content-type", "")
if "charset" in content_type:
# Use declared charset
html = resp.text
else:
# Try UTF-8 first, fall back to ISO-8859-1
try:
html = resp.content.decode("utf-8")
if "\ufffd" in html:
raise UnicodeDecodeError("utf-8", b"", 0, 1, "replacement chars")
except (UnicodeDecodeError, ValueError):
html = resp.content.decode("iso-8859-1")
return html
def upload_html(
html: str,
filename: str,
regulation_id: str,
name: str,
short: str,
dry_run: bool = False,
) -> Optional[dict]:
"""Upload HTML to RAG service with legal chunking."""
if dry_run:
logger.info(" DRY RUN — would upload %d chars", len(html))
return {"chunks_count": 0, "document_id": "dry-run"}
meta = {
"regulation_id": regulation_id,
"regulation_name_de": name,
"regulation_short": short,
"source": "gesetze-im-internet.de",
"license": "public_domain_de_law",
"jurisdiction": "DE",
"source_type": "law",
}
form_data = {
"collection": COLLECTION,
"data_type": "compliance",
"bundesland": "bund",
"use_case": "compliance",
"year": "2026",
"chunk_strategy": "legal",
"chunk_size": "1500",
"chunk_overlap": "100",
"metadata_json": json.dumps(meta, ensure_ascii=False),
}
with httpx.Client(timeout=600.0, verify=False) as c:
resp = c.post(
f"{RAG_URL}/api/v1/documents/upload",
files={"file": (filename, html.encode("utf-8"), "text/html")},
data=form_data,
)
resp.raise_for_status()
return resp.json()
def count_existing(regulation_id: str) -> int:
"""Check if regulation already exists in Qdrant."""
with httpx.Client(timeout=30.0) as c:
resp = c.post(
f"{QDRANT_URL}/collections/{COLLECTION}/points/count",
json={
"filter": {"must": [
{"key": "regulation_id", "match": {"value": regulation_id}}
]},
"exact": True,
},
)
resp.raise_for_status()
return resp.json()["result"]["count"]
def main():
parser = argparse.ArgumentParser(description="Ingest DE laws from gesetze-im-internet.de")
parser.add_argument("--dry-run", action="store_true")
args = parser.parse_args()
logger.info("=" * 60)
logger.info("Ingest German Laws")
logger.info(" Laws: %d", len(LAWS))
logger.info(" Collection: %s", COLLECTION)
logger.info(" Dry run: %s", args.dry_run)
logger.info("=" * 60)
results = []
for i, law in enumerate(LAWS, 1):
logger.info("\n[%d/%d] %s (%s)", i, len(LAWS), law["name"], law["regulation_id"])
# Check if already exists
existing = count_existing(law["regulation_id"])
if existing > 0:
logger.info(" Already exists: %d chunks — SKIPPING", existing)
results.append({"law": law["short"], "status": "exists", "chunks": existing})
continue
# Download
logger.info(" Downloading: %s", law["url"])
html = download_law(law["url"])
if not html:
results.append({"law": law["short"], "status": "download_failed", "chunks": 0})
continue
logger.info(" Downloaded: %d chars", len(html))
# Upload
filename = f"{law['regulation_id']}.html"
try:
result = upload_html(
html, filename, law["regulation_id"],
law["name"], law["short"], args.dry_run,
)
chunks = result.get("chunks_count", 0) if result else 0
logger.info(" Uploaded: %d chunks", chunks)
results.append({"law": law["short"], "status": "ok", "chunks": chunks})
except Exception as e:
logger.error(" Upload FAILED: %s", e)
results.append({"law": law["short"], "status": "error", "chunks": 0})
if i < len(LAWS):
time.sleep(1)
# Summary
logger.info("\n" + "=" * 60)
logger.info("RESULTS")
logger.info("=" * 60)
for r in results:
logger.info(" %-10s %s chunks=%d", r["law"], r["status"].upper(), r["chunks"])
total_new = sum(r["chunks"] for r in results if r["status"] == "ok")
logger.info("\nTotal new chunks: %d", total_new)
if __name__ == "__main__":
main()