feat(pipeline): F1 regulation registry — DB-backed license/source-type lookup

Migrates REGULATION_LICENSE_MAP (135 entries) and SOURCE_REGULATION_CLASSIFICATION
(58 entries) from hardcoded Python dicts to compliance.regulation_registry table.

- SQL migration: 002_regulation_registry.sql (table + indexes + trigger)
- Migration script: f1_migrate_regulation_registry.py (162 rows, --dry-run)
- RegulationRegistry cache: 5min TTL, prefix fallback, graceful degradation
- control_generator._classify_regulation() delegates to DB with dict fallback
- source_type_classification.classify_source_regulation() delegates to DB
- 34 new tests (lookup, cache, degradation, migration data consistency)
- 421 total tests pass, 0 regressions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 23:14:06 +02:00
parent 4fd2bfefcd
commit 9437e029d0
7 changed files with 850 additions and 30 deletions
@@ -0,0 +1,247 @@
#!/usr/bin/env python3
"""
F1 Migration: Populate regulation_registry from hardcoded Python dicts.
Sources:
- REGULATION_LICENSE_MAP (control_generator.py) — 135 entries keyed by regulation_id
- SOURCE_REGULATION_CLASSIFICATION (source_type_classification.py) — 58 entries keyed by name
Usage:
# Dry run (prints SQL, no DB write):
python3 scripts/f1_migrate_regulation_registry.py --dry-run
# Against Mac Mini:
python3 scripts/f1_migrate_regulation_registry.py --db-host macmini
# Against local Docker:
python3 scripts/f1_migrate_regulation_registry.py --db-host localhost
"""
import argparse
import sys
from pathlib import Path
# Add parent so we can import from services/data
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from services.control_generator import REGULATION_LICENSE_MAP, _RULE2_PREFIXES, _RULE3_PREFIXES # noqa: E402
from data.source_type_classification import SOURCE_REGULATION_CLASSIFICATION # noqa: E402
# Derive jurisdiction from license_type
_LICENSE_TO_JURISDICTION = {
"EU_LAW": "EU",
"EU_PUBLIC": "EU",
"DE_LAW": "DE",
"DE_PUBLIC": "DE",
"AT_LAW": "AT",
"CH_LAW": "CH",
"FR_LAW": "FR",
"ES_LAW": "ES",
"NL_LAW": "NL",
"IT_LAW": "IT",
"HU_LAW": "HU",
"NIST_PUBLIC_DOMAIN": "US",
"US_GOV_PUBLIC": "US",
"CC-BY-SA-4.0": "INT",
"CC-BY-4.0": "INT",
"OECD_PUBLIC": "INT",
}
def _derive_jurisdiction(license_type: str) -> str:
"""Map license_type to jurisdiction code."""
return _LICENSE_TO_JURISDICTION.get(license_type, "INT")
# Character translation for synthetic-ID slugs: one C-level pass via
# str.translate replaces the original chain of twelve .replace() calls.
_SLUG_TRANSLATION = str.maketrans({
    " ": "_",
    "(": "",
    ")": "",
    "/": "_",
    "-": "_",
    ".": "",
    ",": "",
    "ä": "ae",
    "ö": "oe",
    "ü": "ue",
    "á": "a",
    "é": "e",
    "ó": "o",
})

# Ordered name markers for jurisdiction guessing — first match wins, so the
# order reproduces the original if/elif chain exactly (EU before DE, etc.).
_JURISDICTION_MARKERS = [
    ("EU", ("edpb", "edps", "(eu)", "eu ", "wp2")),
    ("DE", ("bsi", "bdsg", "bundes", "gwg")),
    ("US", ("nist", "cisa")),
    ("AT", ("österreich",)),
    ("CH", ("schweiz",)),
    ("ES", ("spanien",)),
    ("FR", ("frankreich",)),
    ("HU", ("ungarn",)),
]


def _synthesize_id(name: str) -> str:
    """Derive a synthetic regulation_id slug from a display name.

    Lowercases, maps separators to underscores, strips punctuation,
    transliterates common umlauts/accents, then trims to 100 chars.
    """
    return name.lower().translate(_SLUG_TRANSLATION).strip("_")[:100]


def _guess_jurisdiction(name: str) -> str:
    """Guess a jurisdiction code from substrings of *name* (default "INT")."""
    name_lower = name.lower()
    for jurisdiction, markers in _JURISDICTION_MARKERS:
        if any(marker in name_lower for marker in markers):
            return jurisdiction
    return "INT"


def build_rows() -> list[dict]:
    """Merge REGULATION_LICENSE_MAP + SOURCE_REGULATION_CLASSIFICATION into rows.

    Returns:
        One dict per registry row. Rows from REGULATION_LICENSE_MAP are marked
        ``active``; rows synthesized from SOURCE_REGULATION_CLASSIFICATION —
        where the id and jurisdiction are guessed — are marked ``needs_review``.
    """
    rows = []
    # Track names we've seen (for dedup against SOURCE_REGULATION_CLASSIFICATION)
    seen_names: set[str] = set()

    # 1) Primary source: REGULATION_LICENSE_MAP (has regulation_id as key)
    for reg_id, info in REGULATION_LICENSE_MAP.items():
        name = info.get("name", reg_id)
        seen_names.add(name)
        license_type = info.get("license", "")  # hoisted: was looked up twice
        rows.append({
            "regulation_id": reg_id.lower().strip(),
            "regulation_name_de": name,
            "license_rule": info["rule"],
            "license_type": license_type,
            "attribution": info.get("attribution"),
            "source_type": info.get("source_type", "law"),
            "jurisdiction": _derive_jurisdiction(license_type),
            "status": "active",
        })

    # 2) Secondary: SOURCE_REGULATION_CLASSIFICATION entries not already covered.
    #    These are keyed by name, not by regulation_id, so we create synthetic IDs.
    for name, source_type in SOURCE_REGULATION_CLASSIFICATION.items():
        if name in seen_names:
            continue
        # source_type_classification uses law/guideline/framework; the registry
        # uses "standard" where the classification says "framework".
        mapped_source_type = "standard" if source_type == "framework" else source_type
        rows.append({
            "regulation_id": _synthesize_id(name),
            "regulation_name_de": name,
            "license_rule": 1,  # default: conservative
            "license_type": "",
            "attribution": None,
            "source_type": mapped_source_type,
            "jurisdiction": _guess_jurisdiction(name),
            "status": "needs_review",  # needs manual review since we guessed
        })
    return rows
def generate_sql(rows: list[dict]) -> str:
"""Generate INSERT SQL for all rows."""
lines = [
"SET search_path TO compliance, public;",
"",
"-- Auto-generated by f1_migrate_regulation_registry.py",
f"-- {len(rows)} rows total",
"",
]
for row in rows:
attr = f"'{row['attribution']}'" if row["attribution"] else "NULL"
lines.append(
f"INSERT INTO regulation_registry "
f"(regulation_id, regulation_name_de, license_rule, license_type, "
f"attribution, source_type, jurisdiction, status) "
f"VALUES ("
f"'{row['regulation_id']}', "
f"'{_escape_sql(row['regulation_name_de'])}', "
f"{row['license_rule']}, "
f"'{row['license_type']}', "
f"{attr}, "
f"'{row['source_type']}', "
f"'{row['jurisdiction']}', "
f"'{row['status']}'"
f") ON CONFLICT (regulation_id) DO UPDATE SET "
f"regulation_name_de = EXCLUDED.regulation_name_de, "
f"license_rule = EXCLUDED.license_rule, "
f"license_type = EXCLUDED.license_type, "
f"attribution = EXCLUDED.attribution, "
f"source_type = EXCLUDED.source_type, "
f"jurisdiction = EXCLUDED.jurisdiction;"
)
return "\n".join(lines)
def _escape_sql(val: str) -> str:
"""Escape single quotes for SQL."""
return val.replace("'", "''")
def insert_via_sqlalchemy(rows: list[dict], db_host: str) -> int:
    """Upsert rows using SQLAlchemy (same pattern as control-pipeline).

    Args:
        rows: Row dicts as produced by build_rows().
        db_host: PostgreSQL host name (e.g. "localhost" or "macmini").

    Returns:
        Number of rows sent (inserted or updated).
    """
    from sqlalchemy import create_engine, text

    # NOTE(review): credentials are hardcoded (dev pattern) — consider reading
    # the password from the environment before pointing this at anything shared.
    url = f"postgresql://breakpilot:breakpilot123@{db_host}:5432/breakpilot_db"
    engine = create_engine(url)
    upsert = text("""
        INSERT INTO regulation_registry
            (regulation_id, regulation_name_de, license_rule, license_type,
             attribution, source_type, jurisdiction, status)
        VALUES
            (:regulation_id, :regulation_name_de, :license_rule, :license_type,
             :attribution, :source_type, :jurisdiction, :status)
        ON CONFLICT (regulation_id) DO UPDATE SET
            regulation_name_de = EXCLUDED.regulation_name_de,
            license_rule = EXCLUDED.license_rule,
            license_type = EXCLUDED.license_type,
            attribution = EXCLUDED.attribution,
            source_type = EXCLUDED.source_type,
            jurisdiction = EXCLUDED.jurisdiction
    """)
    with engine.connect() as conn:
        conn.execute(text("SET search_path TO compliance, public"))
        if rows:
            # One executemany round-trip instead of one execute() per row.
            conn.execute(upsert, rows)
        conn.commit()
    return len(rows)
def main():
    """CLI entry point: build rows, then print SQL (--dry-run) or upsert to DB."""
    from collections import Counter

    parser = argparse.ArgumentParser(description="Migrate regulation registry data")
    parser.add_argument("--dry-run", action="store_true", help="Print SQL only")
    parser.add_argument("--db-host", default="localhost", help="PostgreSQL host")
    args = parser.parse_args()

    rows = build_rows()
    print(f"Built {len(rows)} rows from hardcoded dicts")

    # Quick distribution stats so the operator can eyeball the data before writing.
    by_rule = dict(Counter(r["license_rule"] for r in rows))
    by_status = dict(Counter(r["status"] for r in rows))
    print(f"  By license_rule: {by_rule}")
    print(f"  By status: {by_status}")

    if args.dry_run:
        print("\n--- DRY RUN (SQL output) ---\n")
        print(generate_sql(rows))
        return

    inserted = insert_via_sqlalchemy(rows, args.db_host)
    print(f"Inserted/updated {inserted} rows into regulation_registry")
# Script entry point — guard keeps the module importable without side effects.
if __name__ == "__main__":
    main()