feat(pipeline): F1 regulation registry — DB-backed license/source-type lookup

Migrates REGULATION_LICENSE_MAP (135 entries) and SOURCE_REGULATION_CLASSIFICATION
(58 entries) from hardcoded Python dicts to compliance.regulation_registry table.

- SQL migration: 002_regulation_registry.sql (table + indexes + trigger)
- Migration script: f1_migrate_regulation_registry.py (162 rows, --dry-run)
- RegulationRegistry cache: 5min TTL, prefix fallback, graceful degradation
- control_generator._classify_regulation() delegates to DB with dict fallback
- source_type_classification.classify_source_regulation() delegates to DB
- 34 new tests (lookup, cache, degradation, migration data consistency)
- 421 total tests pass, 0 regressions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 23:14:06 +02:00
parent 4fd2bfefcd
commit 9437e029d0
7 changed files with 850 additions and 30 deletions
@@ -165,21 +165,29 @@ def classify_source_regulation(source_regulation: str) -> str:
"""
Klassifiziert eine source_regulation als law, guideline oder framework.
Verwendet exaktes Matching gegen die Map. Bei unbekannten Quellen
wird anhand von Schluesselwoertern geraten, Fallback ist 'framework'
(konservativstes Ergebnis).
Delegates to DB-backed RegulationRegistry (with 5min cache).
Falls back to SOURCE_REGULATION_CLASSIFICATION dict + heuristic
if DB is unavailable.
"""
if not source_regulation:
return SOURCE_TYPE_FRAMEWORK
# Exaktes Match
# Try DB-backed registry first
try:
from services.regulation_registry import classify_source_regulation as _db_classify
result = _db_classify(source_regulation)
if result:
return result
except Exception:
pass
# Fallback: local dict
if source_regulation in SOURCE_REGULATION_CLASSIFICATION:
return SOURCE_REGULATION_CLASSIFICATION[source_regulation]
# Heuristik fuer unbekannte Quellen
lower = source_regulation.lower()
# Gesetze erkennen
law_indicators = [
"verordnung", "richtlinie", "gesetz", "directive", "regulation",
"(eu)", "(eg)", "act", "ley", "loi", "törvény", "código",
@@ -187,19 +195,16 @@ def classify_source_regulation(source_regulation: str) -> str:
if any(ind in lower for ind in law_indicators):
return SOURCE_TYPE_LAW
# Leitlinien erkennen
guideline_indicators = [
"edpb", "leitlinie", "guideline", "wp2", "bsi", "empfehlung",
]
if any(ind in lower for ind in guideline_indicators):
return SOURCE_TYPE_GUIDELINE
# Frameworks erkennen
framework_indicators = [
"enisa", "nist", "owasp", "oecd", "cisa", "framework", "iso",
]
if any(ind in lower for ind in framework_indicators):
return SOURCE_TYPE_FRAMEWORK
# Konservativ: unbekannt = framework (geringste Verbindlichkeit)
return SOURCE_TYPE_FRAMEWORK
@@ -0,0 +1,72 @@
-- Migration 002: Regulation Registry (Block F1)
-- Schema: compliance
-- Run: ssh macmini "docker exec -i bp-core-postgres psql -U breakpilot -d breakpilot_db" < control-pipeline/migrations/002_regulation_registry.sql
SET search_path TO compliance, public;
-- ========================================
-- regulation_registry
-- ========================================
-- Central registry for all regulations, laws, guidelines, and frameworks
-- referenced by the control pipeline. Replaces hardcoded Python dicts
-- (REGULATION_LICENSE_MAP, SOURCE_REGULATION_CLASSIFICATION).
CREATE TABLE IF NOT EXISTS regulation_registry (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
-- regulation_id: machine key (e.g. "eu_2016_679", "nist_sp_800_53")
regulation_id VARCHAR(100) UNIQUE NOT NULL,
-- Display names
regulation_name_de TEXT,
regulation_name_en TEXT,
regulation_short VARCHAR(50),
-- License classification (3-rule system)
license_rule INTEGER NOT NULL DEFAULT 1
CHECK (license_rule IN (1, 2, 3)),
license_type VARCHAR(50), -- EU_LAW, DE_LAW, CC-BY-SA-4.0, etc.
attribution TEXT, -- Required for Rule 2 (CC-BY)
-- Source classification
source_type VARCHAR(20) NOT NULL DEFAULT 'law'
CHECK (source_type IN ('law', 'guideline', 'standard', 'framework', 'restricted')),
-- Metadata
jurisdiction VARCHAR(10), -- DE, EU, AT, CH, US, FR, ES, NL, IT, HU, INT
category VARCHAR(50),
celex VARCHAR(30), -- EU CELEX number if applicable
url TEXT,
-- Lifecycle
status VARCHAR(20) NOT NULL DEFAULT 'active'
CHECK (status IN ('active', 'needs_review', 'deprecated')),
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
-- Indexes
CREATE INDEX IF NOT EXISTS idx_reg_registry_status
ON regulation_registry(status);
CREATE INDEX IF NOT EXISTS idx_reg_registry_jurisdiction
ON regulation_registry(jurisdiction);
CREATE INDEX IF NOT EXISTS idx_reg_registry_source_type
ON regulation_registry(source_type);
CREATE INDEX IF NOT EXISTS idx_reg_registry_license_rule
ON regulation_registry(license_rule);
-- Updated-at trigger
CREATE OR REPLACE FUNCTION update_regulation_registry_updated_at()
RETURNS TRIGGER AS $$
BEGIN
NEW.updated_at = NOW();
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
DROP TRIGGER IF EXISTS trg_regulation_registry_updated_at ON regulation_registry;
CREATE TRIGGER trg_regulation_registry_updated_at
BEFORE UPDATE ON regulation_registry
FOR EACH ROW
EXECUTE FUNCTION update_regulation_registry_updated_at();
@@ -0,0 +1,247 @@
#!/usr/bin/env python3
"""
F1 Migration: Populate regulation_registry from hardcoded Python dicts.
Sources:
- REGULATION_LICENSE_MAP (control_generator.py) — 135 entries keyed by regulation_id
- SOURCE_REGULATION_CLASSIFICATION (source_type_classification.py) — 58 entries keyed by name
Usage:
# Dry run (prints SQL, no DB write):
python3 scripts/f1_migrate_regulation_registry.py --dry-run
# Against Mac Mini:
python3 scripts/f1_migrate_regulation_registry.py --db-host macmini
# Against local Docker:
python3 scripts/f1_migrate_regulation_registry.py --db-host localhost
"""
import argparse
import sys
from pathlib import Path
# Add parent so we can import from services/data
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from services.control_generator import REGULATION_LICENSE_MAP, _RULE2_PREFIXES, _RULE3_PREFIXES # noqa: E402
from data.source_type_classification import SOURCE_REGULATION_CLASSIFICATION # noqa: E402
# Derive jurisdiction from license_type
_LICENSE_TO_JURISDICTION = {
"EU_LAW": "EU",
"EU_PUBLIC": "EU",
"DE_LAW": "DE",
"DE_PUBLIC": "DE",
"AT_LAW": "AT",
"CH_LAW": "CH",
"FR_LAW": "FR",
"ES_LAW": "ES",
"NL_LAW": "NL",
"IT_LAW": "IT",
"HU_LAW": "HU",
"NIST_PUBLIC_DOMAIN": "US",
"US_GOV_PUBLIC": "US",
"CC-BY-SA-4.0": "INT",
"CC-BY-4.0": "INT",
"OECD_PUBLIC": "INT",
}
def _derive_jurisdiction(license_type: str) -> str:
"""Map license_type to jurisdiction code."""
return _LICENSE_TO_JURISDICTION.get(license_type, "INT")
def build_rows() -> list[dict]:
"""Merge REGULATION_LICENSE_MAP + SOURCE_REGULATION_CLASSIFICATION into rows."""
rows = []
# Track names we've seen (for dedup against SOURCE_REGULATION_CLASSIFICATION)
seen_names: set[str] = set()
# 1) Primary source: REGULATION_LICENSE_MAP (has regulation_id as key)
for reg_id, info in REGULATION_LICENSE_MAP.items():
name = info.get("name", reg_id)
seen_names.add(name)
rows.append({
"regulation_id": reg_id.lower().strip(),
"regulation_name_de": name,
"license_rule": info["rule"],
"license_type": info.get("license", ""),
"attribution": info.get("attribution"),
"source_type": info.get("source_type", "law"),
"jurisdiction": _derive_jurisdiction(info.get("license", "")),
"status": "active",
})
# 2) Secondary: SOURCE_REGULATION_CLASSIFICATION entries not already covered
# These are keyed by name, not by regulation_id. We create synthetic IDs.
for name, source_type in SOURCE_REGULATION_CLASSIFICATION.items():
if name in seen_names:
continue
# Generate a regulation_id from the name
synthetic_id = (
name.lower()
.replace(" ", "_")
.replace("(", "")
.replace(")", "")
.replace("/", "_")
.replace("-", "_")
.replace(".", "")
.replace(",", "")
.replace("ä", "ae")
.replace("ö", "oe")
.replace("ü", "ue")
.replace("á", "a")
.replace("é", "e")
.replace("ó", "o")
.strip("_")
)[:100]
# Guess jurisdiction from name content
jurisdiction = "INT"
name_lower = name.lower()
if any(x in name_lower for x in ["edpb", "edps", "(eu)", "eu ", "wp2"]):
jurisdiction = "EU"
elif any(x in name_lower for x in ["bsi", "bdsg", "bundes", "gwg"]):
jurisdiction = "DE"
elif "nist" in name_lower or "cisa" in name_lower:
jurisdiction = "US"
elif "österreich" in name_lower:
jurisdiction = "AT"
elif "schweiz" in name_lower:
jurisdiction = "CH"
elif "spanien" in name_lower:
jurisdiction = "ES"
elif "frankreich" in name_lower:
jurisdiction = "FR"
elif "ungarn" in name_lower:
jurisdiction = "HU"
# Map source_type_classification's "framework" to our "standard"
# (source_type_classification uses law/guideline/framework)
mapped_source_type = source_type
if source_type == "framework":
mapped_source_type = "standard"
rows.append({
"regulation_id": synthetic_id,
"regulation_name_de": name,
"license_rule": 1, # default: conservative
"license_type": "",
"attribution": None,
"source_type": mapped_source_type,
"jurisdiction": jurisdiction,
"status": "needs_review", # needs manual review since we guessed
})
return rows
def generate_sql(rows: list[dict]) -> str:
"""Generate INSERT SQL for all rows."""
lines = [
"SET search_path TO compliance, public;",
"",
"-- Auto-generated by f1_migrate_regulation_registry.py",
f"-- {len(rows)} rows total",
"",
]
for row in rows:
attr = f"'{row['attribution']}'" if row["attribution"] else "NULL"
lines.append(
f"INSERT INTO regulation_registry "
f"(regulation_id, regulation_name_de, license_rule, license_type, "
f"attribution, source_type, jurisdiction, status) "
f"VALUES ("
f"'{row['regulation_id']}', "
f"'{_escape_sql(row['regulation_name_de'])}', "
f"{row['license_rule']}, "
f"'{row['license_type']}', "
f"{attr}, "
f"'{row['source_type']}', "
f"'{row['jurisdiction']}', "
f"'{row['status']}'"
f") ON CONFLICT (regulation_id) DO UPDATE SET "
f"regulation_name_de = EXCLUDED.regulation_name_de, "
f"license_rule = EXCLUDED.license_rule, "
f"license_type = EXCLUDED.license_type, "
f"attribution = EXCLUDED.attribution, "
f"source_type = EXCLUDED.source_type, "
f"jurisdiction = EXCLUDED.jurisdiction;"
)
return "\n".join(lines)
def _escape_sql(val: str) -> str:
"""Escape single quotes for SQL."""
return val.replace("'", "''")
def insert_via_sqlalchemy(rows: list[dict], db_host: str) -> int:
"""Insert rows using SQLAlchemy (same pattern as control-pipeline)."""
from sqlalchemy import create_engine, text
url = f"postgresql://breakpilot:breakpilot123@{db_host}:5432/breakpilot_db"
engine = create_engine(url)
inserted = 0
with engine.connect() as conn:
conn.execute(text("SET search_path TO compliance, public"))
for row in rows:
conn.execute(
text("""
INSERT INTO regulation_registry
(regulation_id, regulation_name_de, license_rule, license_type,
attribution, source_type, jurisdiction, status)
VALUES
(:regulation_id, :regulation_name_de, :license_rule, :license_type,
:attribution, :source_type, :jurisdiction, :status)
ON CONFLICT (regulation_id) DO UPDATE SET
regulation_name_de = EXCLUDED.regulation_name_de,
license_rule = EXCLUDED.license_rule,
license_type = EXCLUDED.license_type,
attribution = EXCLUDED.attribution,
source_type = EXCLUDED.source_type,
jurisdiction = EXCLUDED.jurisdiction
"""),
row,
)
inserted += 1
conn.commit()
return inserted
def main():
parser = argparse.ArgumentParser(description="Migrate regulation registry data")
parser.add_argument("--dry-run", action="store_true", help="Print SQL only")
parser.add_argument("--db-host", default="localhost", help="PostgreSQL host")
args = parser.parse_args()
rows = build_rows()
print(f"Built {len(rows)} rows from hardcoded dicts")
# Stats
by_rule = {}
by_status = {}
for r in rows:
by_rule[r["license_rule"]] = by_rule.get(r["license_rule"], 0) + 1
by_status[r["status"]] = by_status.get(r["status"], 0) + 1
print(f" By license_rule: {by_rule}")
print(f" By status: {by_status}")
if args.dry_run:
print("\n--- DRY RUN (SQL output) ---\n")
print(generate_sql(rows))
return
inserted = insert_via_sqlalchemy(rows, args.db_host)
print(f"Inserted/updated {inserted} rows into regulation_registry")
if __name__ == "__main__":
main()
@@ -17,9 +17,6 @@ import httpx
from .control_generator import (
GeneratedControl,
REGULATION_LICENSE_MAP,
_RULE2_PREFIXES,
_RULE3_PREFIXES,
_classify_regulation,
)
+12 -18
View File
@@ -33,6 +33,7 @@ from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
from .regulation_registry import get_registry as _get_regulation_registry
from .similarity_detector import check_similarity
logger = logging.getLogger(__name__)
@@ -245,28 +246,21 @@ def _classify_regulation(regulation_code: str) -> dict:
Returns dict with keys: license, rule, name, source_type.
source_type is one of: law, guideline, standard, restricted.
"""
code = regulation_code.lower().strip()
# Exact match first
Delegates to DB-backed RegulationRegistry (with 5min cache).
Falls back to REGULATION_LICENSE_MAP if DB is unavailable.
"""
registry = _get_regulation_registry()
result = registry.classify_regulation(regulation_code)
# If registry returned the unknown fallback AND we have a local match,
# prefer the local dict (graceful degradation during migration)
if result.get("license") == "UNKNOWN":
code = regulation_code.lower().strip()
if code in REGULATION_LICENSE_MAP:
return REGULATION_LICENSE_MAP[code]
# Prefix match for Rule 2 (ENISA = standard)
for prefix in _RULE2_PREFIXES:
if code.startswith(prefix):
return {"license": "CC-BY-4.0", "rule": 2, "source_type": "standard",
"name": "ENISA", "attribution": "ENISA, CC BY 4.0"}
# Prefix match for Rule 3 (BSI/ISO/ETSI = restricted)
for prefix in _RULE3_PREFIXES:
if code.startswith(prefix):
return {"license": f"{prefix.rstrip('_').upper()}_RESTRICTED", "rule": 3,
"source_type": "restricted", "name": "INTERNAL_ONLY"}
# Unknown → treat as restricted (safe default)
logger.warning("Unknown regulation_code %r — defaulting to Rule 3 (restricted)", code)
return {"license": "UNKNOWN", "rule": 3, "source_type": "restricted", "name": "INTERNAL_ONLY"}
return result
# ---------------------------------------------------------------------------
@@ -0,0 +1,220 @@
"""
DB-backed Regulation Registry with in-memory cache.
Replaces hardcoded REGULATION_LICENSE_MAP and SOURCE_REGULATION_CLASSIFICATION
with a single PostgreSQL table (compliance.regulation_registry).
Cache TTL: 5 minutes. Thread-safe via simple timestamp check.
Falls back to hardcoded dicts if DB is unavailable (graceful degradation).
"""
import logging
import time
from typing import Optional
from sqlalchemy import text
from sqlalchemy.exc import SQLAlchemyError
from db.session import SessionLocal
logger = logging.getLogger(__name__)
_CACHE_TTL_SECONDS = 300 # 5 minutes
# Prefix-based fallback rules (unchanged from original logic)
_RULE2_PREFIXES = ("enisa_",)
_RULE3_PREFIXES = ("bsi_", "iso_", "etsi_")
# Fallback for unknown regulations
_UNKNOWN_REGULATION = {
"license": "UNKNOWN",
"rule": 3,
"source_type": "restricted",
"name": "INTERNAL_ONLY",
"attribution": None,
}
class RegulationRegistry:
"""In-memory cache of the regulation_registry table.
Provides two lookup modes:
1. by_code(regulation_id) replaces REGULATION_LICENSE_MAP[code]
2. source_type_by_name(name) replaces SOURCE_REGULATION_CLASSIFICATION[name]
"""
def __init__(self):
self._by_code: dict[str, dict] = {}
self._by_name: dict[str, str] = {}
self._loaded_at: float = 0.0
def _is_stale(self) -> bool:
return (time.monotonic() - self._loaded_at) > _CACHE_TTL_SECONDS
def _load(self) -> bool:
"""Load all rows from regulation_registry into memory."""
try:
db = SessionLocal()
try:
rows = db.execute(
text("""
SELECT regulation_id, regulation_name_de, license_rule,
license_type, attribution, source_type, jurisdiction,
status
FROM regulation_registry
WHERE status != 'deprecated'
""")
).fetchall()
finally:
db.close()
by_code: dict[str, dict] = {}
by_name: dict[str, str] = {}
for row in rows:
entry = {
"license": row[3] or "", # license_type
"rule": row[2], # license_rule
"source_type": row[5] or "law", # source_type
"name": row[1] or row[0], # regulation_name_de or regulation_id
"attribution": row[4], # attribution
"jurisdiction": row[6], # jurisdiction
}
by_code[row[0].lower()] = entry
# Also index by name for source_type lookups
if row[1]:
by_name[row[1]] = row[5] or "law"
self._by_code = by_code
self._by_name = by_name
self._loaded_at = time.monotonic()
logger.info(
"Regulation registry loaded: %d entries by code, %d by name",
len(by_code), len(by_name),
)
return True
except SQLAlchemyError:
logger.warning(
"Failed to load regulation_registry from DB — using stale cache",
exc_info=True,
)
return False
def _ensure_loaded(self) -> None:
"""Reload cache if stale."""
if self._is_stale():
self._load()
def classify_regulation(self, regulation_code: str) -> dict:
"""Look up license info for a regulation_code.
Returns dict with keys: license, rule, name, source_type, attribution.
Equivalent to the old _classify_regulation() function.
"""
self._ensure_loaded()
code = regulation_code.lower().strip()
# Exact match from DB
if code in self._by_code:
return self._by_code[code]
# Prefix match for Rule 2 (ENISA = standard)
for prefix in _RULE2_PREFIXES:
if code.startswith(prefix):
return {
"license": "CC-BY-4.0",
"rule": 2,
"source_type": "standard",
"name": "ENISA",
"attribution": "ENISA, CC BY 4.0",
}
# Prefix match for Rule 3 (BSI/ISO/ETSI = restricted)
for prefix in _RULE3_PREFIXES:
if code.startswith(prefix):
return {
"license": f"{prefix.rstrip('_').upper()}_RESTRICTED",
"rule": 3,
"source_type": "restricted",
"name": "INTERNAL_ONLY",
"attribution": None,
}
# Unknown → restricted (safe default)
logger.warning(
"Unknown regulation_code %r — defaulting to Rule 3 (restricted)", code
)
return dict(_UNKNOWN_REGULATION)
def source_type_by_name(self, source_regulation: str) -> str:
"""Look up source_type by regulation display name.
Equivalent to old classify_source_regulation().
Falls back to heuristic for unknown names.
"""
self._ensure_loaded()
if not source_regulation:
return "framework"
# Exact match from DB
if source_regulation in self._by_name:
return self._by_name[source_regulation]
# Heuristic fallback for unknown sources
lower = source_regulation.lower()
law_indicators = [
"verordnung", "richtlinie", "gesetz", "directive", "regulation",
"(eu)", "(eg)", "act", "ley", "loi", "törvény", "código",
]
if any(ind in lower for ind in law_indicators):
return "law"
guideline_indicators = [
"edpb", "leitlinie", "guideline", "wp2", "bsi", "empfehlung",
]
if any(ind in lower for ind in guideline_indicators):
return "guideline"
framework_indicators = [
"enisa", "nist", "owasp", "oecd", "cisa", "framework", "iso",
]
if any(ind in lower for ind in framework_indicators):
return "framework"
return "framework"
def get_all(self) -> dict[str, dict]:
"""Return all cached entries (by regulation_code)."""
self._ensure_loaded()
return dict(self._by_code)
def is_open_source(self, regulation_code: str) -> bool:
"""Check if regulation is Rule 1 or 2 (safe to reference)."""
info = self.classify_regulation(regulation_code)
return info["rule"] in (1, 2)
# Module-level singleton
_registry: Optional[RegulationRegistry] = None
def get_registry() -> RegulationRegistry:
"""Get or create the singleton RegulationRegistry instance."""
global _registry
if _registry is None:
_registry = RegulationRegistry()
return _registry
def classify_regulation(regulation_code: str) -> dict:
"""Convenience: look up license info for a regulation_code."""
return get_registry().classify_regulation(regulation_code)
def classify_source_regulation(source_regulation: str) -> str:
"""Convenience: look up source_type by regulation display name."""
return get_registry().source_type_by_name(source_regulation)
@@ -0,0 +1,285 @@
"""Tests for RegulationRegistry — DB-backed lookup with cache and fallback."""
import time
from unittest.mock import patch, MagicMock
import pytest
from services.regulation_registry import (
RegulationRegistry,
_CACHE_TTL_SECONDS,
)
# ── Test data: simulates DB rows ──────────────────────────────────────────
_MOCK_DB_ROWS = [
# (regulation_id, regulation_name_de, license_rule, license_type,
# attribution, source_type, jurisdiction, status)
("eu_2016_679", "DSGVO (EU) 2016/679", 1, "EU_LAW",
None, "law", "EU", "active"),
("nist_sp_800_53", "NIST SP 800-53 Rev. 5", 1, "NIST_PUBLIC_DOMAIN",
None, "standard", "US", "active"),
("owasp_asvs", "OWASP ASVS 4.0", 2, "CC-BY-SA-4.0",
"OWASP Foundation, CC BY-SA 4.0", "standard", "INT", "active"),
("bdsg", "Bundesdatenschutzgesetz (BDSG)", 1, "DE_LAW",
None, "law", "DE", "active"),
("at_dsg", "Österreichisches Datenschutzgesetz (DSG)", 1, "AT_LAW",
None, "law", "AT", "active"),
]
def _mock_db_execute(query):
"""Mock that returns our test rows."""
mock_result = MagicMock()
mock_result.fetchall.return_value = _MOCK_DB_ROWS
return mock_result
@pytest.fixture
def registry():
"""Create a registry with mocked DB."""
reg = RegulationRegistry()
with patch("services.regulation_registry.SessionLocal") as mock_session_cls:
mock_session = MagicMock()
mock_session.execute = _mock_db_execute
mock_session_cls.return_value = mock_session
reg._load()
return reg
# ── classify_regulation tests ─────────────────────────────────────────────
class TestClassifyRegulation:
def test_exact_match_eu_law(self, registry):
result = registry.classify_regulation("eu_2016_679")
assert result["rule"] == 1
assert result["license"] == "EU_LAW"
assert result["source_type"] == "law"
assert result["name"] == "DSGVO (EU) 2016/679"
def test_exact_match_case_insensitive(self, registry):
result = registry.classify_regulation("EU_2016_679")
assert result["rule"] == 1
assert result["name"] == "DSGVO (EU) 2016/679"
def test_exact_match_with_whitespace(self, registry):
result = registry.classify_regulation(" eu_2016_679 ")
assert result["rule"] == 1
def test_nist_standard(self, registry):
result = registry.classify_regulation("nist_sp_800_53")
assert result["rule"] == 1
assert result["source_type"] == "standard"
def test_owasp_rule2(self, registry):
result = registry.classify_regulation("owasp_asvs")
assert result["rule"] == 2
assert result["attribution"] == "OWASP Foundation, CC BY-SA 4.0"
def test_german_law(self, registry):
result = registry.classify_regulation("bdsg")
assert result["rule"] == 1
assert result["source_type"] == "law"
assert result["jurisdiction"] == "DE"
def test_austrian_law(self, registry):
result = registry.classify_regulation("at_dsg")
assert result["rule"] == 1
assert result["jurisdiction"] == "AT"
def test_prefix_enisa_rule2(self, registry):
result = registry.classify_regulation("enisa_supply_chain_2024")
assert result["rule"] == 2
assert result["source_type"] == "standard"
assert "ENISA" in result["attribution"]
def test_prefix_bsi_rule3(self, registry):
result = registry.classify_regulation("bsi_tr_03161")
assert result["rule"] == 3
assert result["source_type"] == "restricted"
assert result["name"] == "INTERNAL_ONLY"
def test_prefix_iso_rule3(self, registry):
result = registry.classify_regulation("iso_27001")
assert result["rule"] == 3
assert result["source_type"] == "restricted"
def test_prefix_etsi_rule3(self, registry):
result = registry.classify_regulation("etsi_en_303_645")
assert result["rule"] == 3
def test_unknown_defaults_to_restricted(self, registry):
result = registry.classify_regulation("some_unknown_regulation")
assert result["rule"] == 3
assert result["source_type"] == "restricted"
assert result["license"] == "UNKNOWN"
# ── source_type_by_name tests ────────────────────────────────────────────
class TestSourceTypeByName:
def test_exact_match_law(self, registry):
result = registry.source_type_by_name("DSGVO (EU) 2016/679")
assert result == "law"
def test_exact_match_standard(self, registry):
result = registry.source_type_by_name("NIST SP 800-53 Rev. 5")
assert result == "standard"
def test_empty_returns_framework(self, registry):
assert registry.source_type_by_name("") == "framework"
assert registry.source_type_by_name(None) == "framework"
def test_heuristic_law(self, registry):
assert registry.source_type_by_name("Verordnung XYZ") == "law"
assert registry.source_type_by_name("Some EU Directive") == "law"
def test_heuristic_guideline(self, registry):
assert registry.source_type_by_name("EDPB Leitlinie 99/2025") == "guideline"
assert registry.source_type_by_name("BSI Standard 200-1") == "guideline"
def test_heuristic_framework(self, registry):
# "ENISA Cloud Guidelines" matches "guideline" before "enisa" in heuristic order
assert registry.source_type_by_name("ENISA Cloud Report") == "framework"
assert registry.source_type_by_name("OWASP Testing Guide") == "framework"
def test_unknown_returns_framework(self, registry):
assert registry.source_type_by_name("Completely Unknown Document") == "framework"
# ── is_open_source tests ────────────────────────────────────────────────
class TestIsOpenSource:
def test_rule1_is_open(self, registry):
assert registry.is_open_source("eu_2016_679") is True
def test_rule2_is_open(self, registry):
assert registry.is_open_source("owasp_asvs") is True
def test_rule3_is_not_open(self, registry):
assert registry.is_open_source("bsi_tr_03161") is False
def test_unknown_is_not_open(self, registry):
assert registry.is_open_source("unknown_thing") is False
# ── Cache behavior tests ────────────────────────────────────────────────
class TestCacheBehavior:
def test_fresh_cache_not_stale(self, registry):
assert registry._is_stale() is False
def test_old_cache_is_stale(self, registry):
registry._loaded_at = time.monotonic() - _CACHE_TTL_SECONDS - 1
assert registry._is_stale() is True
def test_ensure_loaded_reloads_when_stale(self):
reg = RegulationRegistry()
reg._loaded_at = time.monotonic() - _CACHE_TTL_SECONDS - 100 # force stale
load_called = False
original_load = reg._load
def tracking_load():
nonlocal load_called
load_called = True
reg._load = tracking_load
reg._ensure_loaded()
assert load_called, "_load should have been called when cache is stale"
def test_ensure_loaded_skips_when_fresh(self, registry):
with patch.object(registry, "_load") as mock_load:
registry._ensure_loaded()
mock_load.assert_not_called()
# ── Graceful degradation tests ──────────────────────────────────────────
class TestGracefulDegradation:
def test_db_failure_uses_stale_cache(self):
"""If DB fails, stale cache entries are still usable."""
reg = RegulationRegistry()
# First load succeeds
with patch("services.regulation_registry.SessionLocal") as mock_cls:
mock_session = MagicMock()
mock_session.execute = _mock_db_execute
mock_cls.return_value = mock_session
reg._load()
# Force stale
reg._loaded_at = time.monotonic() - _CACHE_TTL_SECONDS - 1
# Second load fails — DB error
from sqlalchemy.exc import OperationalError
with patch("services.regulation_registry.SessionLocal") as mock_cls:
mock_cls.side_effect = OperationalError("connection refused", None, None)
reg._ensure_loaded()
# Should still have cached data
result = reg.classify_regulation("eu_2016_679")
assert result["rule"] == 1
def test_empty_registry_returns_unknown(self):
"""Unloaded registry returns safe defaults."""
reg = RegulationRegistry()
reg._loaded_at = time.monotonic() # pretend fresh but empty
result = reg.classify_regulation("eu_2016_679")
assert result["rule"] == 3 # safe default
assert result["license"] == "UNKNOWN"
# ── Migration data consistency tests ────────────────────────────────────
class TestMigrationDataConsistency:
"""Verify that the migration script produces valid data."""
def test_build_rows_produces_data(self):
from scripts.f1_migrate_regulation_registry import build_rows
rows = build_rows()
assert len(rows) > 100 # at least 100 entries
def test_all_rows_have_required_fields(self):
from scripts.f1_migrate_regulation_registry import build_rows
rows = build_rows()
for row in rows:
assert row["regulation_id"], f"Missing regulation_id: {row}"
assert row["regulation_name_de"], f"Missing name: {row}"
assert row["license_rule"] in (1, 2, 3), f"Bad rule: {row}"
assert row["source_type"] in (
"law", "guideline", "standard", "framework", "restricted"
), f"Bad source_type: {row}"
assert row["jurisdiction"], f"Missing jurisdiction: {row}"
assert row["status"] in ("active", "needs_review", "deprecated")
def test_no_duplicate_regulation_ids(self):
from scripts.f1_migrate_regulation_registry import build_rows
rows = build_rows()
ids = [r["regulation_id"] for r in rows]
assert len(ids) == len(set(ids)), f"Duplicates: {[x for x in ids if ids.count(x) > 1]}"
def test_known_regulations_present(self):
from scripts.f1_migrate_regulation_registry import build_rows
rows = build_rows()
ids = {r["regulation_id"] for r in rows}
assert "eu_2016_679" in ids # DSGVO
assert "bdsg" in ids # BDSG
assert "nist_sp_800_53" in ids # NIST
assert "owasp_asvs" in ids # OWASP
def test_owasp_has_attribution(self):
from scripts.f1_migrate_regulation_registry import build_rows
rows = build_rows()
owasp = [r for r in rows if r["regulation_id"] == "owasp_asvs"][0]
assert owasp["attribution"] is not None
assert "OWASP" in owasp["attribution"]
assert owasp["license_rule"] == 2