feat(pipeline): F1 regulation registry — DB-backed license/source-type lookup

Migrates REGULATION_LICENSE_MAP (135 entries) and SOURCE_REGULATION_CLASSIFICATION (58 entries) from hardcoded Python dicts to the compliance.regulation_registry table.

- SQL migration: 002_regulation_registry.sql (table + indexes + trigger)
- Migration script: f1_migrate_regulation_registry.py (162 rows, --dry-run)
- RegulationRegistry cache: 5min TTL, prefix fallback, graceful degradation
- control_generator._classify_regulation() delegates to DB with dict fallback
- source_type_classification.classify_source_regulation() delegates to DB
- 34 new tests (lookup, cache, degradation, migration data consistency)
- 421 total tests pass, 0 regressions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-03 23:14:06 +02:00
parent 4fd2bfefcd
commit 9437e029d0
7 changed files with 850 additions and 30 deletions
@@ -165,21 +165,29 @@ def classify_source_regulation(source_regulation: str) -> str:
    """
    Klassifiziert eine source_regulation als law, guideline oder framework.

-    Verwendet exaktes Matching gegen die Map. Bei unbekannten Quellen
-    wird anhand von Schluesselwoertern geraten, Fallback ist 'framework'
-    (konservativstes Ergebnis).
+    Delegates to DB-backed RegulationRegistry (with 5min cache).
+    Falls back to SOURCE_REGULATION_CLASSIFICATION dict + heuristic
+    if DB is unavailable.
    """
    if not source_regulation:
        return SOURCE_TYPE_FRAMEWORK

-    # Exaktes Match
+    # Try DB-backed registry first
+    try:
+        from services.regulation_registry import classify_source_regulation as _db_classify
+        result = _db_classify(source_regulation)
+        if result:
+            return result
+    except Exception:
+        pass
+
+    # Fallback: local dict
    if source_regulation in SOURCE_REGULATION_CLASSIFICATION:
        return SOURCE_REGULATION_CLASSIFICATION[source_regulation]

    # Heuristik fuer unbekannte Quellen
    lower = source_regulation.lower()

-    # Gesetze erkennen
    law_indicators = [
        "verordnung", "richtlinie", "gesetz", "directive", "regulation",
        "(eu)", "(eg)", "act", "ley", "loi", "törvény", "código",
@@ -187,19 +195,16 @@ def classify_source_regulation(source_regulation: str) -> str:
    if any(ind in lower for ind in law_indicators):
        return SOURCE_TYPE_LAW

-    # Leitlinien erkennen
    guideline_indicators = [
        "edpb", "leitlinie", "guideline", "wp2", "bsi", "empfehlung",
    ]
    if any(ind in lower for ind in guideline_indicators):
        return SOURCE_TYPE_GUIDELINE

-    # Frameworks erkennen
    framework_indicators = [
        "enisa", "nist", "owasp", "oecd", "cisa", "framework", "iso",
    ]
    if any(ind in lower for ind in framework_indicators):
        return SOURCE_TYPE_FRAMEWORK

-    # Konservativ: unbekannt = framework (geringste Verbindlichkeit)
    return SOURCE_TYPE_FRAMEWORK
@@ -0,0 +1,72 @@
+-- Migration 002: Regulation Registry (Block F1)
+-- Schema: compliance
+-- Run: ssh macmini "docker exec -i bp-core-postgres psql -U breakpilot -d breakpilot_db" < control-pipeline/migrations/002_regulation_registry.sql
+
+-- Unqualified object names below resolve against "compliance" first.
+SET search_path TO compliance, public;
+
+-- ========================================
+-- regulation_registry
+-- ========================================
+-- Central registry for all regulations, laws, guidelines, and frameworks
+-- referenced by the control pipeline. Replaces hardcoded Python dicts
+-- (REGULATION_LICENSE_MAP, SOURCE_REGULATION_CLASSIFICATION).
+--
+-- Every statement in this script is written to be re-runnable
+-- (IF NOT EXISTS / CREATE OR REPLACE / DROP ... IF EXISTS).
+
+CREATE TABLE IF NOT EXISTS regulation_registry (
+    id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
+
+    -- regulation_id: machine key (e.g. "eu_2016_679", "nist_sp_800_53")
+    -- UNIQUE: this is the conflict target of the migration script's upsert.
+    regulation_id VARCHAR(100) UNIQUE NOT NULL,
+
+    -- Display names
+    regulation_name_de TEXT,
+    regulation_name_en TEXT,
+    regulation_short VARCHAR(50),
+
+    -- License classification (3-rule system)
+    license_rule INTEGER NOT NULL DEFAULT 1
+        CHECK (license_rule IN (1, 2, 3)),
+    license_type VARCHAR(50),          -- EU_LAW, DE_LAW, CC-BY-SA-4.0, etc.
+    attribution TEXT,                   -- Required for Rule 2 (CC-BY)
+
+    -- Source classification
+    source_type VARCHAR(20) NOT NULL DEFAULT 'law'
+        CHECK (source_type IN ('law', 'guideline', 'standard', 'framework', 'restricted')),
+
+    -- Metadata
+    jurisdiction VARCHAR(10),           -- DE, EU, AT, CH, US, FR, ES, NL, IT, HU, INT
+    category VARCHAR(50),
+    celex VARCHAR(30),                  -- EU CELEX number if applicable
+    url TEXT,
+
+    -- Lifecycle
+    -- Rows with status 'deprecated' are excluded by the registry cache loader.
+    status VARCHAR(20) NOT NULL DEFAULT 'active'
+        CHECK (status IN ('active', 'needs_review', 'deprecated')),
+
+    created_at TIMESTAMPTZ DEFAULT NOW(),
+    updated_at TIMESTAMPTZ DEFAULT NOW()
+);
+
+-- Indexes (one per column used for filtering; regulation_id is already
+-- indexed through its UNIQUE constraint)
+CREATE INDEX IF NOT EXISTS idx_reg_registry_status
+    ON regulation_registry(status);
+CREATE INDEX IF NOT EXISTS idx_reg_registry_jurisdiction
+    ON regulation_registry(jurisdiction);
+CREATE INDEX IF NOT EXISTS idx_reg_registry_source_type
+    ON regulation_registry(source_type);
+CREATE INDEX IF NOT EXISTS idx_reg_registry_license_rule
+    ON regulation_registry(license_rule);
+
+-- Updated-at trigger: stamps updated_at with the current time on every
+-- row UPDATE, regardless of the value supplied by the statement.
+CREATE OR REPLACE FUNCTION update_regulation_registry_updated_at()
+RETURNS TRIGGER AS $$
+BEGIN
+    NEW.updated_at = NOW();
+    RETURN NEW;
+END;
+$$ LANGUAGE plpgsql;
+
+-- Drop first so that re-running the migration does not fail on an
+-- already existing trigger.
+DROP TRIGGER IF EXISTS trg_regulation_registry_updated_at ON regulation_registry;
+CREATE TRIGGER trg_regulation_registry_updated_at
+    BEFORE UPDATE ON regulation_registry
+    FOR EACH ROW
+    EXECUTE FUNCTION update_regulation_registry_updated_at();
@@ -0,0 +1,247 @@
+#!/usr/bin/env python3
+"""
+F1 Migration: Populate regulation_registry from hardcoded Python dicts.
+
+Sources:
+  - REGULATION_LICENSE_MAP (control_generator.py) — 135 entries keyed by regulation_id
+  - SOURCE_REGULATION_CLASSIFICATION (source_type_classification.py) — 58 entries keyed by name
+
+Usage:
+  # Dry run (prints SQL, no DB write):
+  python3 scripts/f1_migrate_regulation_registry.py --dry-run
+
+  # Against Mac Mini:
+  python3 scripts/f1_migrate_regulation_registry.py --db-host macmini
+
+  # Against local Docker:
+  python3 scripts/f1_migrate_regulation_registry.py --db-host localhost
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+# Add parent so we can import from services/data
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from services.control_generator import REGULATION_LICENSE_MAP, _RULE2_PREFIXES, _RULE3_PREFIXES  # noqa: E402
+from data.source_type_classification import SOURCE_REGULATION_CLASSIFICATION  # noqa: E402
+
+# License identifier -> jurisdiction code. Licenses that are not listed
+# here are treated as international ("INT") by _derive_jurisdiction().
+_LICENSE_TO_JURISDICTION = {
+    # European Union
+    "EU_LAW": "EU",
+    "EU_PUBLIC": "EU",
+    # National law / public bodies
+    "DE_LAW": "DE",
+    "DE_PUBLIC": "DE",
+    "AT_LAW": "AT",
+    "CH_LAW": "CH",
+    "FR_LAW": "FR",
+    "ES_LAW": "ES",
+    "NL_LAW": "NL",
+    "IT_LAW": "IT",
+    "HU_LAW": "HU",
+    # United States
+    "NIST_PUBLIC_DOMAIN": "US",
+    "US_GOV_PUBLIC": "US",
+    # International / open licenses
+    "CC-BY-SA-4.0": "INT",
+    "CC-BY-4.0": "INT",
+    "OECD_PUBLIC": "INT",
+}
+
+
+def _derive_jurisdiction(license_type: str) -> str:
+    """Return the jurisdiction code for *license_type* ("INT" if unmapped)."""
+    try:
+        return _LICENSE_TO_JURISDICTION[license_type]
+    except KeyError:
+        return "INT"
+
+
+def _slugify_name(name: str) -> str:
+    """Derive a machine key (at most 100 chars) from a regulation display name."""
+    # Single-pass character mapping; equivalent to the former chain of
+    # .replace() calls because no replacement produces a mapped character.
+    table = str.maketrans({
+        " ": "_", "/": "_", "-": "_",
+        "(": "", ")": "", ".": "", ",": "",
+        "ä": "ae", "ö": "oe", "ü": "ue",
+        "á": "a", "é": "e", "ó": "o",
+    })
+    return name.lower().translate(table).strip("_")[:100]
+
+
+def _guess_jurisdiction(name: str) -> str:
+    """Guess a jurisdiction code from keywords in *name* ("INT" if none match)."""
+    name_lower = name.lower()
+    # Order matters: the first group with a matching keyword wins.
+    keyword_groups = (
+        (("edpb", "edps", "(eu)", "eu ", "wp2"), "EU"),
+        (("bsi", "bdsg", "bundes", "gwg"), "DE"),
+        (("nist", "cisa"), "US"),
+        (("österreich",), "AT"),
+        (("schweiz",), "CH"),
+        (("spanien",), "ES"),
+        (("frankreich",), "FR"),
+        (("ungarn",), "HU"),
+    )
+    for keywords, code in keyword_groups:
+        if any(keyword in name_lower for keyword in keywords):
+            return code
+    return "INT"
+
+
+def build_rows() -> list[dict]:
+    """Merge REGULATION_LICENSE_MAP + SOURCE_REGULATION_CLASSIFICATION into rows.
+
+    REGULATION_LICENSE_MAP is the primary source (keyed by regulation_id,
+    status "active"). SOURCE_REGULATION_CLASSIFICATION entries whose name is
+    not already covered get a synthetic regulation_id derived from the name
+    and status "needs_review".
+
+    Returns:
+        One dict per row with the keys regulation_id, regulation_name_de,
+        license_rule, license_type, attribution, source_type, jurisdiction
+        and status.
+    """
+    rows: list[dict] = []
+    # Names and ids already emitted (for dedup against the secondary source)
+    seen_names: set[str] = set()
+    seen_ids: set[str] = set()
+
+    # 1) Primary source: REGULATION_LICENSE_MAP (has regulation_id as key)
+    for reg_id, info in REGULATION_LICENSE_MAP.items():
+        name = info.get("name", reg_id)
+        license_type = info.get("license", "")
+        regulation_id = reg_id.lower().strip()
+        seen_names.add(name)
+        seen_ids.add(regulation_id)
+
+        rows.append({
+            "regulation_id": regulation_id,
+            "regulation_name_de": name,
+            "license_rule": info["rule"],
+            "license_type": license_type,
+            "attribution": info.get("attribution"),
+            "source_type": info.get("source_type", "law"),
+            "jurisdiction": _derive_jurisdiction(license_type),
+            "status": "active",
+        })
+
+    # 2) Secondary: SOURCE_REGULATION_CLASSIFICATION entries not already covered
+    #    These are keyed by name, not by regulation_id. We create synthetic IDs.
+    for name, source_type in SOURCE_REGULATION_CLASSIFICATION.items():
+        if name in seen_names:
+            continue
+
+        synthetic_id = _slugify_name(name)
+        if not synthetic_id or synthetic_id in seen_ids:
+            # The rows are written with ON CONFLICT ... DO UPDATE, so a
+            # guessed row sharing an id with an earlier row would silently
+            # overwrite that row's license data. Keep the earlier row.
+            print(
+                f"Skipping {name!r}: synthetic id {synthetic_id!r} is empty "
+                f"or already used",
+                file=sys.stderr,
+            )
+            continue
+        seen_ids.add(synthetic_id)
+
+        # Map source_type_classification's "framework" to our "standard"
+        # (source_type_classification uses law/guideline/framework)
+        mapped_source_type = "standard" if source_type == "framework" else source_type
+
+        rows.append({
+            "regulation_id": synthetic_id,
+            "regulation_name_de": name,
+            # NOTE(review): rule 1 is kept from the original migration;
+            # confirm it is the intended default for guessed rows.
+            "license_rule": 1,
+            "license_type": "",
+            "attribution": None,
+            "source_type": mapped_source_type,
+            "jurisdiction": _guess_jurisdiction(name),
+            "status": "needs_review",  # needs manual review since we guessed
+        })
+
+    return rows
+
+
+def generate_sql(rows: list[dict]) -> str:
+    """Generate the upsert SQL script for all rows (used by --dry-run).
+
+    Every string value is rendered as an escaped SQL literal, so values
+    containing an apostrophe (e.g. in an attribution text) cannot break the
+    generated statements. A missing/empty attribution is rendered as NULL.
+
+    Args:
+        rows: Row dicts as produced by build_rows().
+
+    Returns:
+        The script as one string: a search_path preamble followed by one
+        INSERT ... ON CONFLICT statement per row.
+    """
+    def literal(value: str) -> str:
+        return f"'{_escape_sql(value)}'"
+
+    lines = [
+        "SET search_path TO compliance, public;",
+        "",
+        "-- Auto-generated by f1_migrate_regulation_registry.py",
+        f"-- {len(rows)} rows total",
+        "",
+    ]
+
+    for row in rows:
+        values = ", ".join([
+            literal(row["regulation_id"]),
+            literal(row["regulation_name_de"]),
+            str(int(row["license_rule"])),
+            literal(row["license_type"]),
+            literal(row["attribution"]) if row["attribution"] else "NULL",
+            literal(row["source_type"]),
+            literal(row["jurisdiction"]),
+            literal(row["status"]),
+        ])
+        # status is deliberately absent from the UPDATE list, matching
+        # insert_via_sqlalchemy(): re-running keeps the stored status.
+        lines.append(
+            "INSERT INTO regulation_registry "
+            "(regulation_id, regulation_name_de, license_rule, license_type, "
+            "attribution, source_type, jurisdiction, status) "
+            f"VALUES ({values}) ON CONFLICT (regulation_id) DO UPDATE SET "
+            "regulation_name_de = EXCLUDED.regulation_name_de, "
+            "license_rule = EXCLUDED.license_rule, "
+            "license_type = EXCLUDED.license_type, "
+            "attribution = EXCLUDED.attribution, "
+            "source_type = EXCLUDED.source_type, "
+            "jurisdiction = EXCLUDED.jurisdiction;"
+        )
+
+    return "\n".join(lines)
+
+
+def _escape_sql(val: str) -> str:
+    """Escape single quotes for use inside a SQL string literal."""
+    return val.replace("'", "''")
+
+
+def insert_via_sqlalchemy(rows: list[dict], db_host: str) -> int:
+    """Upsert all rows into regulation_registry in a single transaction.
+
+    The password is taken from the standard libpq variable PGPASSWORD when
+    it is set; otherwise the previous built-in development default is used,
+    so existing invocations keep working.
+
+    Args:
+        rows: Row dicts as produced by build_rows().
+        db_host: PostgreSQL host name.
+
+    Returns:
+        Number of rows sent to the database (inserted or updated).
+    """
+    import os
+
+    from sqlalchemy import create_engine, text
+    from sqlalchemy.engine import URL
+
+    # URL.create() escapes special characters in the password, which plain
+    # string formatting of the URL would not.
+    url = URL.create(
+        "postgresql",
+        username="breakpilot",
+        password=os.environ.get("PGPASSWORD", "breakpilot123"),
+        host=db_host,
+        port=5432,
+        database="breakpilot_db",
+    )
+    engine = create_engine(url)
+
+    upsert = text("""
+                    INSERT INTO regulation_registry
+                        (regulation_id, regulation_name_de, license_rule, license_type,
+                         attribution, source_type, jurisdiction, status)
+                    VALUES
+                        (:regulation_id, :regulation_name_de, :license_rule, :license_type,
+                         :attribution, :source_type, :jurisdiction, :status)
+                    ON CONFLICT (regulation_id) DO UPDATE SET
+                        regulation_name_de = EXCLUDED.regulation_name_de,
+                        license_rule = EXCLUDED.license_rule,
+                        license_type = EXCLUDED.license_type,
+                        attribution = EXCLUDED.attribution,
+                        source_type = EXCLUDED.source_type,
+                        jurisdiction = EXCLUDED.jurisdiction
+                """)
+
+    try:
+        # engine.begin() commits when the block succeeds and rolls back if
+        # any row fails, so the table is never left half-migrated.
+        with engine.begin() as conn:
+            conn.execute(text("SET search_path TO compliance, public"))
+            for row in rows:
+                conn.execute(upsert, row)
+    finally:
+        # One-shot script: release pooled connections explicitly.
+        engine.dispose()
+
+    return len(rows)
+
+
+def main():
+    """CLI entry point: build the rows, print statistics, then dry-run or insert."""
+    cli = argparse.ArgumentParser(description="Migrate regulation registry data")
+    cli.add_argument("--dry-run", action="store_true", help="Print SQL only")
+    cli.add_argument("--db-host", default="localhost", help="PostgreSQL host")
+    options = cli.parse_args()
+
+    rows = build_rows()
+    print(f"Built {len(rows)} rows from hardcoded dicts")
+
+    def tally(field):
+        """Count rows per distinct value of *field* (first-seen order)."""
+        counts = {}
+        for entry in rows:
+            value = entry[field]
+            counts[value] = counts.get(value, 0) + 1
+        return counts
+
+    # Stats
+    by_rule = tally("license_rule")
+    by_status = tally("status")
+    print(f"  By license_rule: {by_rule}")
+    print(f"  By status: {by_status}")
+
+    if not args_dry_run(options):
+        inserted = insert_via_sqlalchemy(rows, options.db_host)
+        print(f"Inserted/updated {inserted} rows into regulation_registry")
+        return
+
+    print("\n--- DRY RUN (SQL output) ---\n")
+    print(generate_sql(rows))
+
+
+def args_dry_run(options) -> bool:
+    """Return True when the parsed CLI options request a dry run."""
+    return bool(options.dry_run)
+
+
+if __name__ == "__main__":
+    main()
@@ -17,9 +17,6 @@ import httpx

 from .control_generator import (
    GeneratedControl,
-    REGULATION_LICENSE_MAP,
-    _RULE2_PREFIXES,
-    _RULE3_PREFIXES,
    _classify_regulation,
 )

@@ -33,6 +33,7 @@ from sqlalchemy import text
 from sqlalchemy.orm import Session

 from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
+from .regulation_registry import get_registry as _get_regulation_registry
 from .similarity_detector import check_similarity

 logger = logging.getLogger(__name__)
@@ -245,28 +246,21 @@ def _classify_regulation(regulation_code: str) -> dict:

    Returns dict with keys: license, rule, name, source_type.
    source_type is one of: law, guideline, standard, restricted.
-    """
-    code = regulation_code.lower().strip()

-    # Exact match first
+    Delegates to DB-backed RegulationRegistry (with 5min cache).
+    Falls back to REGULATION_LICENSE_MAP if DB is unavailable.
+    """
+    registry = _get_regulation_registry()
+    result = registry.classify_regulation(regulation_code)
+
+    # If registry returned the unknown fallback AND we have a local match,
+    # prefer the local dict (graceful degradation during migration)
+    if result.get("license") == "UNKNOWN":
+        code = regulation_code.lower().strip()
        if code in REGULATION_LICENSE_MAP:
            return REGULATION_LICENSE_MAP[code]

-    # Prefix match for Rule 2 (ENISA = standard)
-    for prefix in _RULE2_PREFIXES:
-        if code.startswith(prefix):
-            return {"license": "CC-BY-4.0", "rule": 2, "source_type": "standard",
-                    "name": "ENISA", "attribution": "ENISA, CC BY 4.0"}
-
-    # Prefix match for Rule 3 (BSI/ISO/ETSI = restricted)
-    for prefix in _RULE3_PREFIXES:
-        if code.startswith(prefix):
-            return {"license": f"{prefix.rstrip('_').upper()}_RESTRICTED", "rule": 3,
-                    "source_type": "restricted", "name": "INTERNAL_ONLY"}
-
-    # Unknown → treat as restricted (safe default)
-    logger.warning("Unknown regulation_code %r — defaulting to Rule 3 (restricted)", code)
-    return {"license": "UNKNOWN", "rule": 3, "source_type": "restricted", "name": "INTERNAL_ONLY"}
+    return result


 # ---------------------------------------------------------------------------
@@ -0,0 +1,220 @@
+"""
+DB-backed Regulation Registry with in-memory cache.
+
+Replaces hardcoded REGULATION_LICENSE_MAP and SOURCE_REGULATION_CLASSIFICATION
+with a single PostgreSQL table (compliance.regulation_registry).
+
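+Cache TTL: 5 minutes, checked against a monotonic timestamp on every lookup.
+No explicit locking is used; a reload builds new dicts and swaps them in
+once they are complete.
+If the DB is unavailable, the previously loaded (stale) cache keeps being
+served; with an empty cache only the prefix rules and the restricted
+"unknown" default apply. Falling back to the hardcoded dicts is done by the
+callers (control_generator, source_type_classification), not by this module.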
+"""
+
+import logging
+import time
+from typing import Optional
+
+from sqlalchemy import text
+from sqlalchemy.exc import SQLAlchemyError
+
+from db.session import SessionLocal
+
+logger = logging.getLogger(__name__)
+
+# Maximum age of the in-memory snapshot before a lookup triggers a reload.
+_CACHE_TTL_SECONDS = 300  # 5 minutes
+
+# Prefix-based fallback rules (unchanged from original logic).
+# Applied to codes that have no exact row in the cache: Rule 2 prefixes are
+# classified as "standard", Rule 3 prefixes as "restricted".
+_RULE2_PREFIXES = ("enisa_",)
+_RULE3_PREFIXES = ("bsi_", "iso_", "etsi_")
+
+# Fallback for unknown regulations: Rule 3 / restricted.
+# Callers detect this case via license == "UNKNOWN".
+_UNKNOWN_REGULATION = {
+    "license": "UNKNOWN",
+    "rule": 3,
+    "source_type": "restricted",
+    "name": "INTERNAL_ONLY",
+    "attribution": None,
+}
+
+
+class RegulationRegistry:
+    """In-memory snapshot of the regulation_registry table.
+
+    Provides two lookup modes:
+      1. classify_regulation(code) — replaces REGULATION_LICENSE_MAP[code]
+      2. source_type_by_name(name) — replaces SOURCE_REGULATION_CLASSIFICATION[name]
+
+    The snapshot is reloaded lazily by the first lookup after
+    _CACHE_TTL_SECONDS. If a reload fails, the previous snapshot keeps being
+    served and the next attempt is delayed by _RELOAD_BACKOFF_SECONDS.
+    """
+
+    # After a failed reload, wait this long before querying the DB again so
+    # that an outage does not add a connection attempt to every lookup.
+    _RELOAD_BACKOFF_SECONDS = 30.0
+
+    # Keyword heuristics for names that have no exact match in the cache.
+    _LAW_INDICATORS = (
+        "verordnung", "richtlinie", "gesetz", "directive", "regulation",
+        "(eu)", "(eg)", "act", "ley", "loi", "törvény", "código",
+    )
+    _GUIDELINE_INDICATORS = (
+        "edpb", "leitlinie", "guideline", "wp2", "bsi", "empfehlung",
+    )
+
+    def __init__(self):
+        self._by_code: dict[str, dict] = {}
+        self._by_name: dict[str, str] = {}
+        # -inf rather than 0.0: time.monotonic() has an undefined reference
+        # point, so comparing against 0.0 could report a never-loaded cache
+        # as fresh while the clock value is still below the TTL.
+        self._loaded_at: float = float("-inf")
+        self._retry_not_before: float = float("-inf")
+
+    def _is_stale(self) -> bool:
+        """Return True if the snapshot is older than _CACHE_TTL_SECONDS."""
+        return (time.monotonic() - self._loaded_at) > _CACHE_TTL_SECONDS
+
+    def _load(self) -> bool:
+        """Replace the snapshot with all non-deprecated rows from the DB.
+
+        Returns:
+            True if the snapshot was refreshed; False if the query raised a
+            SQLAlchemyError, in which case the previous snapshot is kept.
+        """
+        try:
+            db = SessionLocal()
+            try:
+                rows = db.execute(
+                    text("""
+                        SELECT regulation_id, regulation_name_de, license_rule,
+                               license_type, attribution, source_type, jurisdiction,
+                               status
+                        FROM regulation_registry
+                        WHERE status != 'deprecated'
+                    """)
+                ).fetchall()
+            finally:
+                db.close()
+        except SQLAlchemyError:
+            self._retry_not_before = time.monotonic() + self._RELOAD_BACKOFF_SECONDS
+            logger.warning(
+                "Failed to load regulation_registry from DB — using stale cache",
+                exc_info=True,
+            )
+            return False
+
+        by_code: dict[str, dict] = {}
+        by_name: dict[str, str] = {}
+
+        for row in rows:
+            (reg_id, name_de, rule, license_type,
+             attribution, source_type, jurisdiction, _status) = row
+            by_code[reg_id.lower()] = {
+                "license": license_type or "",
+                "rule": rule,
+                "source_type": source_type or "law",
+                "name": name_de or reg_id,
+                "attribution": attribution,
+                "jurisdiction": jurisdiction,
+            }
+
+            # Also index by name for source_type lookups
+            if name_de:
+                by_name[name_de] = source_type or "law"
+
+        # Swap in the finished dicts only after they are fully built.
+        self._by_code = by_code
+        self._by_name = by_name
+        self._loaded_at = time.monotonic()
+        self._retry_not_before = float("-inf")
+        logger.info(
+            "Regulation registry loaded: %d entries by code, %d by name",
+            len(by_code), len(by_name),
+        )
+        return True
+
+    def _ensure_loaded(self) -> None:
+        """Reload the snapshot if it is stale and no failure back-off is active."""
+        if self._is_stale() and time.monotonic() >= self._retry_not_before:
+            self._load()
+
+    def classify_regulation(self, regulation_code: str) -> dict:
+        """Look up license info for a regulation_code.
+
+        Resolution order: exact cache hit, Rule 2 prefix, Rule 3 prefix,
+        then the restricted "unknown" default.
+
+        Returns:
+            A new dict with the keys license, rule, name, source_type and
+            attribution (cache hits additionally carry jurisdiction). The
+            dict is a copy, so callers may modify it without affecting the
+            cache.
+        """
+        self._ensure_loaded()
+        code = regulation_code.lower().strip()
+
+        # Exact match from DB
+        entry = self._by_code.get(code)
+        if entry is not None:
+            return dict(entry)
+
+        # Prefix match for Rule 2 (ENISA = standard)
+        if code.startswith(_RULE2_PREFIXES):
+            return {
+                "license": "CC-BY-4.0",
+                "rule": 2,
+                "source_type": "standard",
+                "name": "ENISA",
+                "attribution": "ENISA, CC BY 4.0",
+            }
+
+        # Prefix match for Rule 3 (BSI/ISO/ETSI = restricted)
+        for prefix in _RULE3_PREFIXES:
+            if code.startswith(prefix):
+                return {
+                    "license": f"{prefix.rstrip('_').upper()}_RESTRICTED",
+                    "rule": 3,
+                    "source_type": "restricted",
+                    "name": "INTERNAL_ONLY",
+                    "attribution": None,
+                }
+
+        # Unknown → restricted (safe default)
+        logger.warning(
+            "Unknown regulation_code %r — defaulting to Rule 3 (restricted)", code
+        )
+        return dict(_UNKNOWN_REGULATION)
+
+    def source_type_by_name(self, source_regulation: str) -> str:
+        """Look up source_type by regulation display name.
+
+        An exact (case-sensitive) cache hit returns the stored source_type.
+        Other names are classified by keyword: law indicators first, then
+        guideline indicators; everything else, including empty input, is
+        "framework".
+        """
+        # Empty input needs no lookup, so do not trigger a reload for it.
+        if not source_regulation:
+            return "framework"
+
+        self._ensure_loaded()
+
+        # Exact match from DB
+        if source_regulation in self._by_name:
+            return self._by_name[source_regulation]
+
+        # Heuristic fallback for unknown sources
+        lowered = source_regulation.lower()
+        if any(marker in lowered for marker in self._LAW_INDICATORS):
+            return "law"
+        if any(marker in lowered for marker in self._GUIDELINE_INDICATORS):
+            return "guideline"
+
+        # Framework keywords (enisa, nist, owasp, oecd, cisa, framework, iso)
+        # and unrecognised names both resolve to "framework".
+        return "framework"
+
+    def get_all(self) -> dict[str, dict]:
+        """Return a copy of all cached entries, keyed by regulation code."""
+        self._ensure_loaded()
+        return {code: dict(entry) for code, entry in self._by_code.items()}
+
+    def is_open_source(self, regulation_code: str) -> bool:
+        """Check if regulation is Rule 1 or 2 (safe to reference)."""
+        info = self.classify_regulation(regulation_code)
+        return info["rule"] in (1, 2)
+
+
+# Module-level singleton
+_registry: Optional[RegulationRegistry] = None
+
+
+def get_registry() -> RegulationRegistry:
+    """Return the module-wide RegulationRegistry, creating it on first use."""
+    global _registry
+    if _registry is not None:
+        return _registry
+    _registry = RegulationRegistry()
+    return _registry
+
+
+def classify_regulation(regulation_code: str) -> dict:
+    """Shortcut for get_registry().classify_regulation(regulation_code)."""
+    shared_registry = get_registry()
+    return shared_registry.classify_regulation(regulation_code)
+
+
+def classify_source_regulation(source_regulation: str) -> str:
+    """Shortcut for get_registry().source_type_by_name(source_regulation)."""
+    shared_registry = get_registry()
+    return shared_registry.source_type_by_name(source_regulation)
@@ -0,0 +1,285 @@
+"""Tests for RegulationRegistry — DB-backed lookup with cache and fallback."""
+
+import time
+from unittest.mock import patch, MagicMock
+
+import pytest
+
+from services.regulation_registry import (
+    RegulationRegistry,
+    _CACHE_TTL_SECONDS,
+)
+
+
+# ── Test data: simulates DB rows ──────────────────────────────────────────
+
+# Rows in the column order of the SELECT issued by RegulationRegistry._load().
+_MOCK_DB_ROWS = [
+    # (regulation_id, regulation_name_de, license_rule, license_type,
+    #  attribution, source_type, jurisdiction, status)
+    ("eu_2016_679", "DSGVO (EU) 2016/679", 1, "EU_LAW",
+     None, "law", "EU", "active"),
+    ("nist_sp_800_53", "NIST SP 800-53 Rev. 5", 1, "NIST_PUBLIC_DOMAIN",
+     None, "standard", "US", "active"),
+    # Only Rule 2 row: carries an attribution text
+    ("owasp_asvs", "OWASP ASVS 4.0", 2, "CC-BY-SA-4.0",
+     "OWASP Foundation, CC BY-SA 4.0", "standard", "INT", "active"),
+    ("bdsg", "Bundesdatenschutzgesetz (BDSG)", 1, "DE_LAW",
+     None, "law", "DE", "active"),
+    ("at_dsg", "Österreichisches Datenschutzgesetz (DSG)", 1, "AT_LAW",
+     None, "law", "AT", "active"),
+]
+
+
+def _mock_db_execute(query):
+    """Stand-in for Session.execute(): ignores *query*; fetchall() yields the test rows."""
+    return MagicMock(**{"fetchall.return_value": _MOCK_DB_ROWS})
+
+
+@pytest.fixture
+def registry():
+    """Registry whose cache was loaded once from the mocked test rows."""
+    fake_session = MagicMock()
+    fake_session.execute = _mock_db_execute
+
+    loaded = RegulationRegistry()
+    with patch("services.regulation_registry.SessionLocal", return_value=fake_session):
+        loaded._load()
+    return loaded
+
+
+# ── classify_regulation tests ─────────────────────────────────────────────
+
+
+class TestClassifyRegulation:
+    """Lookups by regulation code: exact DB hits, prefix rules, unknown default."""
+
+    def test_exact_match_eu_law(self, registry):
+        info = registry.classify_regulation("eu_2016_679")
+        assert (info["rule"], info["license"], info["source_type"]) == (1, "EU_LAW", "law")
+        assert info["name"] == "DSGVO (EU) 2016/679"
+
+    def test_exact_match_case_insensitive(self, registry):
+        info = registry.classify_regulation("EU_2016_679")
+        assert (info["rule"], info["name"]) == (1, "DSGVO (EU) 2016/679")
+
+    def test_exact_match_with_whitespace(self, registry):
+        assert registry.classify_regulation("  eu_2016_679  ")["rule"] == 1
+
+    def test_nist_standard(self, registry):
+        info = registry.classify_regulation("nist_sp_800_53")
+        assert (info["rule"], info["source_type"]) == (1, "standard")
+
+    def test_owasp_rule2(self, registry):
+        info = registry.classify_regulation("owasp_asvs")
+        assert info["rule"] == 2
+        assert info["attribution"] == "OWASP Foundation, CC BY-SA 4.0"
+
+    def test_german_law(self, registry):
+        info = registry.classify_regulation("bdsg")
+        assert (info["rule"], info["source_type"], info["jurisdiction"]) == (1, "law", "DE")
+
+    def test_austrian_law(self, registry):
+        info = registry.classify_regulation("at_dsg")
+        assert (info["rule"], info["jurisdiction"]) == (1, "AT")
+
+    def test_prefix_enisa_rule2(self, registry):
+        # No exact row: resolved through the "enisa_" prefix rule
+        info = registry.classify_regulation("enisa_supply_chain_2024")
+        assert (info["rule"], info["source_type"]) == (2, "standard")
+        assert "ENISA" in info["attribution"]
+
+    def test_prefix_bsi_rule3(self, registry):
+        info = registry.classify_regulation("bsi_tr_03161")
+        assert (info["rule"], info["source_type"], info["name"]) == (
+            3, "restricted", "INTERNAL_ONLY"
+        )
+
+    def test_prefix_iso_rule3(self, registry):
+        info = registry.classify_regulation("iso_27001")
+        assert (info["rule"], info["source_type"]) == (3, "restricted")
+
+    def test_prefix_etsi_rule3(self, registry):
+        assert registry.classify_regulation("etsi_en_303_645")["rule"] == 3
+
+    def test_unknown_defaults_to_restricted(self, registry):
+        info = registry.classify_regulation("some_unknown_regulation")
+        assert (info["rule"], info["source_type"], info["license"]) == (
+            3, "restricted", "UNKNOWN"
+        )
+
+
+# ── source_type_by_name tests ────────────────────────────────────────────
+
+
+class TestSourceTypeByName:
+    """Lookups by display name: exact DB hits first, keyword heuristic otherwise."""
+
+    def test_exact_match_law(self, registry):
+        assert registry.source_type_by_name("DSGVO (EU) 2016/679") == "law"
+
+    def test_exact_match_standard(self, registry):
+        assert registry.source_type_by_name("NIST SP 800-53 Rev. 5") == "standard"
+
+    def test_empty_returns_framework(self, registry):
+        for empty in ("", None):
+            assert registry.source_type_by_name(empty) == "framework"
+
+    def test_heuristic_law(self, registry):
+        for name in ("Verordnung XYZ", "Some EU Directive"):
+            assert registry.source_type_by_name(name) == "law"
+
+    def test_heuristic_guideline(self, registry):
+        for name in ("EDPB Leitlinie 99/2025", "BSI Standard 200-1"):
+            assert registry.source_type_by_name(name) == "guideline"
+
+    def test_heuristic_framework(self, registry):
+        # Names avoid the word "guideline": the guideline keywords are
+        # checked before the framework keywords, so e.g.
+        # "ENISA Cloud Guidelines" would be classified as a guideline.
+        for name in ("ENISA Cloud Report", "OWASP Testing Guide"):
+            assert registry.source_type_by_name(name) == "framework"
+
+    def test_unknown_returns_framework(self, registry):
+        assert registry.source_type_by_name("Completely Unknown Document") == "framework"
+
+
+# ── is_open_source tests ──────────────────────────────────────────────────
+
+
+class TestIsOpenSource:
+    """is_open_source() is True for Rule 1/2 codes and False for Rule 3."""
+
+    def test_rule1_is_open(self, registry):
+        verdict = registry.is_open_source("eu_2016_679")
+        assert verdict is True
+
+    def test_rule2_is_open(self, registry):
+        verdict = registry.is_open_source("owasp_asvs")
+        assert verdict is True
+
+    def test_rule3_is_not_open(self, registry):
+        verdict = registry.is_open_source("bsi_tr_03161")
+        assert verdict is False
+
+    def test_unknown_is_not_open(self, registry):
+        verdict = registry.is_open_source("unknown_thing")
+        assert verdict is False
+
+
+# ── Cache behavior tests ──────────────────────────────────────────────────
+
+
+class TestCacheBehavior:
+    """Staleness detection and lazy reload of the registry cache."""
+
+    def test_fresh_cache_not_stale(self, registry):
+        assert registry._is_stale() is False
+
+    def test_old_cache_is_stale(self, registry):
+        registry._loaded_at = time.monotonic() - _CACHE_TTL_SECONDS - 1
+        assert registry._is_stale() is True
+
+    def test_ensure_loaded_reloads_when_stale(self):
+        reg = RegulationRegistry()
+        reg._loaded_at = time.monotonic() - _CACHE_TTL_SECONDS - 100  # force stale
+
+        # Same mocking style as test_ensure_loaded_skips_when_fresh; the
+        # mock is removed again when the block exits.
+        with patch.object(reg, "_load") as mock_load:
+            reg._ensure_loaded()
+        mock_load.assert_called_once_with()
+
+    def test_ensure_loaded_skips_when_fresh(self, registry):
+        with patch.object(registry, "_load") as mock_load:
+            registry._ensure_loaded()
+            mock_load.assert_not_called()
+
+
+# ── Graceful degradation tests ────────────────────────────────────────────
+
+
+class TestGracefulDegradation:
+    """Behavior when the DB is unreachable or the cache is empty."""
+
+    def test_db_failure_uses_stale_cache(self):
+        """If DB fails, stale cache entries are still usable."""
+        reg = RegulationRegistry()
+
+        # First load succeeds
+        with patch("services.regulation_registry.SessionLocal") as mock_cls:
+            mock_session = MagicMock()
+            mock_session.execute = _mock_db_execute
+            mock_cls.return_value = mock_session
+            assert reg._load() is True
+
+        # Force stale
+        reg._loaded_at = time.monotonic() - _CACHE_TTL_SECONDS - 1
+
+        # Second load fails — DB error
+        from sqlalchemy.exc import OperationalError
+        with patch("services.regulation_registry.SessionLocal") as mock_cls:
+            mock_cls.side_effect = OperationalError("connection refused", None, None)
+            reg._ensure_loaded()
+
+            # Look up while the failing DB is still patched in: the cache is
+            # stale, so a lookup outside this block could reach the real
+            # SessionLocal and make the test depend on the environment.
+            result = reg.classify_regulation("eu_2016_679")
+
+        # Should still have cached data
+        assert result["rule"] == 1
+
+    def test_empty_registry_returns_unknown(self):
+        """Unloaded registry returns safe defaults."""
+        reg = RegulationRegistry()
+        reg._loaded_at = time.monotonic()  # pretend fresh but empty
+
+        result = reg.classify_regulation("eu_2016_679")
+        assert result["rule"] == 3  # safe default
+        assert result["license"] == "UNKNOWN"
+
+
+# ── Migration data consistency tests ──────────────────────────────────────
+
+
+class TestMigrationDataConsistency:
+    """Sanity checks on the rows produced by the F1 migration script."""
+
+    def test_build_rows_produces_data(self):
+        from scripts.f1_migrate_regulation_registry import build_rows
+        row_count = len(build_rows())
+        assert row_count > 100  # at least 100 entries
+
+    def test_all_rows_have_required_fields(self):
+        from scripts.f1_migrate_regulation_registry import build_rows
+        valid_source_types = (
+            "law", "guideline", "standard", "framework", "restricted"
+        )
+        valid_statuses = ("active", "needs_review", "deprecated")
+        for row in build_rows():
+            assert row["regulation_id"], f"Missing regulation_id: {row}"
+            assert row["regulation_name_de"], f"Missing name: {row}"
+            assert row["license_rule"] in (1, 2, 3), f"Bad rule: {row}"
+            assert row["source_type"] in valid_source_types, f"Bad source_type: {row}"
+            assert row["jurisdiction"], f"Missing jurisdiction: {row}"
+            assert row["status"] in valid_statuses
+
+    def test_no_duplicate_regulation_ids(self):
+        from scripts.f1_migrate_regulation_registry import build_rows
+        ids = [row["regulation_id"] for row in build_rows()]
+        assert len(ids) == len(set(ids)), f"Duplicates: {[x for x in ids if ids.count(x) > 1]}"
+
+    def test_known_regulations_present(self):
+        from scripts.f1_migrate_regulation_registry import build_rows
+        present = {row["regulation_id"] for row in build_rows()}
+        assert "eu_2016_679" in present  # DSGVO
+        assert "bdsg" in present  # BDSG
+        assert "nist_sp_800_53" in present  # NIST
+        assert "owasp_asvs" in present  # OWASP
+
+    def test_owasp_has_attribution(self):
+        from scripts.f1_migrate_regulation_registry import build_rows
+        matches = [row for row in build_rows() if row["regulation_id"] == "owasp_asvs"]
+        owasp = matches[0]
+        assert owasp["attribution"] is not None
+        assert "OWASP" in owasp["attribution"]
+        assert owasp["license_rule"] == 2