feat(pipeline): F1 regulation registry — DB-backed license/source-type lookup

Migrates REGULATION_LICENSE_MAP (135 entries) and SOURCE_REGULATION_CLASSIFICATION
(58 entries) from hardcoded Python dicts to compliance.regulation_registry table.

- SQL migration: 002_regulation_registry.sql (table + indexes + trigger)
- Migration script: f1_migrate_regulation_registry.py (162 rows, --dry-run)
- RegulationRegistry cache: 5min TTL, prefix fallback, graceful degradation
- control_generator._classify_regulation() delegates to DB with dict fallback
- source_type_classification.classify_source_regulation() delegates to DB
- 34 new tests (lookup, cache, degradation, migration data consistency)
- 421 total tests pass, 0 regressions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 23:14:06 +02:00
parent 4fd2bfefcd
commit 9437e029d0
7 changed files with 850 additions and 30 deletions
@@ -0,0 +1,247 @@
#!/usr/bin/env python3
"""
F1 Migration: Populate regulation_registry from hardcoded Python dicts.
Sources:
- REGULATION_LICENSE_MAP (control_generator.py) — 135 entries keyed by regulation_id
- SOURCE_REGULATION_CLASSIFICATION (source_type_classification.py) — 58 entries keyed by name
Usage:
# Dry run (prints SQL, no DB write):
python3 scripts/f1_migrate_regulation_registry.py --dry-run
# Against Mac Mini:
python3 scripts/f1_migrate_regulation_registry.py --db-host macmini
# Against local Docker:
python3 scripts/f1_migrate_regulation_registry.py --db-host localhost
"""
import argparse
import sys
from pathlib import Path
# Add parent so we can import from services/data
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from services.control_generator import REGULATION_LICENSE_MAP, _RULE2_PREFIXES, _RULE3_PREFIXES # noqa: E402
from data.source_type_classification import SOURCE_REGULATION_CLASSIFICATION # noqa: E402
# Derive jurisdiction from license_type
_LICENSE_TO_JURISDICTION = {
"EU_LAW": "EU",
"EU_PUBLIC": "EU",
"DE_LAW": "DE",
"DE_PUBLIC": "DE",
"AT_LAW": "AT",
"CH_LAW": "CH",
"FR_LAW": "FR",
"ES_LAW": "ES",
"NL_LAW": "NL",
"IT_LAW": "IT",
"HU_LAW": "HU",
"NIST_PUBLIC_DOMAIN": "US",
"US_GOV_PUBLIC": "US",
"CC-BY-SA-4.0": "INT",
"CC-BY-4.0": "INT",
"OECD_PUBLIC": "INT",
}
def _derive_jurisdiction(license_type: str) -> str:
"""Map license_type to jurisdiction code."""
return _LICENSE_TO_JURISDICTION.get(license_type, "INT")
# Character translation for synthetic-ID slugs: one C-level pass via
# str.translate replaces the original chain of twelve .replace() calls.
_SLUG_TRANSLATION = str.maketrans({
    " ": "_",
    "(": "",
    ")": "",
    "/": "_",
    "-": "_",
    ".": "",
    ",": "",
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",
    "á": "a",
    "é": "e",
    "ó": "o",
})

# Ordered name markers for jurisdiction guessing — first match wins, so the
# order reproduces the original if/elif chain exactly (EU before DE, etc.).
_JURISDICTION_MARKERS = [
    ("EU", ("edpb", "edps", "(eu)", "eu ", "wp2")),
    ("DE", ("bsi", "bdsg", "bundes", "gwg")),
    ("US", ("nist", "cisa")),
    ("AT", ("österreich",)),
    ("CH", ("schweiz",)),
    ("ES", ("spanien",)),
    ("FR", ("frankreich",)),
    ("HU", ("ungarn",)),
]


def _synthesize_id(name: str) -> str:
    """Derive a synthetic regulation_id slug from a display name.

    Lowercases, maps separators to underscores, strips punctuation,
    transliterates common umlauts/accents, then trims to 100 chars.
    """
    return name.lower().translate(_SLUG_TRANSLATION).strip("_")[:100]


def _guess_jurisdiction(name: str) -> str:
    """Guess a jurisdiction code from substrings of *name* (default "INT")."""
    name_lower = name.lower()
    for jurisdiction, markers in _JURISDICTION_MARKERS:
        if any(marker in name_lower for marker in markers):
            return jurisdiction
    return "INT"


def build_rows() -> list[dict]:
    """Merge REGULATION_LICENSE_MAP + SOURCE_REGULATION_CLASSIFICATION into rows.

    Returns:
        One dict per registry row. Rows from REGULATION_LICENSE_MAP are marked
        ``active``; rows synthesized from SOURCE_REGULATION_CLASSIFICATION —
        where the id and jurisdiction are guessed — are marked ``needs_review``.
    """
    rows = []
    # Track names we've seen (for dedup against SOURCE_REGULATION_CLASSIFICATION)
    seen_names: set[str] = set()

    # 1) Primary source: REGULATION_LICENSE_MAP (has regulation_id as key)
    for reg_id, info in REGULATION_LICENSE_MAP.items():
        name = info.get("name", reg_id)
        seen_names.add(name)
        license_type = info.get("license", "")  # hoisted: was looked up twice
        rows.append({
            "regulation_id": reg_id.lower().strip(),
            "regulation_name_de": name,
            "license_rule": info["rule"],
            "license_type": license_type,
            "attribution": info.get("attribution"),
            "source_type": info.get("source_type", "law"),
            "jurisdiction": _derive_jurisdiction(license_type),
            "status": "active",
        })

    # 2) Secondary: SOURCE_REGULATION_CLASSIFICATION entries not already covered.
    #    These are keyed by name, not by regulation_id, so we create synthetic IDs.
    for name, source_type in SOURCE_REGULATION_CLASSIFICATION.items():
        if name in seen_names:
            continue
        # source_type_classification uses law/guideline/framework; the registry
        # uses "standard" where the classification says "framework".
        mapped_source_type = "standard" if source_type == "framework" else source_type
        rows.append({
            "regulation_id": _synthesize_id(name),
            "regulation_name_de": name,
            "license_rule": 1,  # default: conservative
            "license_type": "",
            "attribution": None,
            "source_type": mapped_source_type,
            "jurisdiction": _guess_jurisdiction(name),
            "status": "needs_review",  # needs manual review since we guessed
        })
    return rows
def generate_sql(rows: list[dict]) -> str:
"""Generate INSERT SQL for all rows."""
lines = [
"SET search_path TO compliance, public;",
"",
"-- Auto-generated by f1_migrate_regulation_registry.py",
f"-- {len(rows)} rows total",
"",
]
for row in rows:
attr = f"'{row['attribution']}'" if row["attribution"] else "NULL"
lines.append(
f"INSERT INTO regulation_registry "
f"(regulation_id, regulation_name_de, license_rule, license_type, "
f"attribution, source_type, jurisdiction, status) "
f"VALUES ("
f"'{row['regulation_id']}', "
f"'{_escape_sql(row['regulation_name_de'])}', "
f"{row['license_rule']}, "
f"'{row['license_type']}', "
f"{attr}, "
f"'{row['source_type']}', "
f"'{row['jurisdiction']}', "
f"'{row['status']}'"
f") ON CONFLICT (regulation_id) DO UPDATE SET "
f"regulation_name_de = EXCLUDED.regulation_name_de, "
f"license_rule = EXCLUDED.license_rule, "
f"license_type = EXCLUDED.license_type, "
f"attribution = EXCLUDED.attribution, "
f"source_type = EXCLUDED.source_type, "
f"jurisdiction = EXCLUDED.jurisdiction;"
)
return "\n".join(lines)
def _escape_sql(val: str) -> str:
"""Escape single quotes for SQL."""
return val.replace("'", "''")
def insert_via_sqlalchemy(rows: list[dict], db_host: str) -> int:
    """Upsert rows using SQLAlchemy (same pattern as control-pipeline).

    Args:
        rows: Row dicts as produced by build_rows().
        db_host: PostgreSQL host name (e.g. "localhost" or "macmini").

    Returns:
        Number of rows sent (inserted or updated).
    """
    from sqlalchemy import create_engine, text

    # NOTE(review): credentials are hardcoded (dev pattern) — consider reading
    # the password from the environment before pointing this at anything shared.
    url = f"postgresql://breakpilot:breakpilot123@{db_host}:5432/breakpilot_db"
    engine = create_engine(url)
    upsert = text("""
        INSERT INTO regulation_registry
            (regulation_id, regulation_name_de, license_rule, license_type,
             attribution, source_type, jurisdiction, status)
        VALUES
            (:regulation_id, :regulation_name_de, :license_rule, :license_type,
             :attribution, :source_type, :jurisdiction, :status)
        ON CONFLICT (regulation_id) DO UPDATE SET
            regulation_name_de = EXCLUDED.regulation_name_de,
            license_rule = EXCLUDED.license_rule,
            license_type = EXCLUDED.license_type,
            attribution = EXCLUDED.attribution,
            source_type = EXCLUDED.source_type,
            jurisdiction = EXCLUDED.jurisdiction
    """)
    with engine.connect() as conn:
        conn.execute(text("SET search_path TO compliance, public"))
        if rows:
            # One executemany round-trip instead of one execute() per row.
            conn.execute(upsert, rows)
        conn.commit()
    return len(rows)
def main():
    """CLI entry point: build rows, then print SQL (--dry-run) or upsert to DB."""
    from collections import Counter

    parser = argparse.ArgumentParser(description="Migrate regulation registry data")
    parser.add_argument("--dry-run", action="store_true", help="Print SQL only")
    parser.add_argument("--db-host", default="localhost", help="PostgreSQL host")
    args = parser.parse_args()

    rows = build_rows()
    print(f"Built {len(rows)} rows from hardcoded dicts")

    # Quick distribution stats so the operator can eyeball the data before writing.
    by_rule = dict(Counter(r["license_rule"] for r in rows))
    by_status = dict(Counter(r["status"] for r in rows))
    print(f"  By license_rule: {by_rule}")
    print(f"  By status: {by_status}")

    if args.dry_run:
        print("\n--- DRY RUN (SQL output) ---\n")
        print(generate_sql(rows))
        return

    inserted = insert_via_sqlalchemy(rows, args.db_host)
    print(f"Inserted/updated {inserted} rows into regulation_registry")
# Script entry point — guard keeps the module importable without side effects.
if __name__ == "__main__":
    main()