feat(pipeline): F1 regulation registry — DB-backed license/source-type lookup

Migrates REGULATION_LICENSE_MAP (135 entries) and SOURCE_REGULATION_CLASSIFICATION
(58 entries) from hardcoded Python dicts to the compliance.regulation_registry table.

- SQL migration: 002_regulation_registry.sql (table + indexes + trigger)
- Migration script: f1_migrate_regulation_registry.py (162 rows, supports --dry-run)
- RegulationRegistry cache: 5min TTL, prefix fallback, graceful degradation
- control_generator._classify_regulation() delegates to DB with dict fallback
- source_type_classification.classify_source_regulation() delegates to DB
- 34 new tests (lookup, cache, degradation, migration data consistency)
- 421 total tests pass, 0 regressions

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Benjamin Admin
2026-05-03 23:14:06 +02:00
parent 4fd2bfefcd
commit 9437e029d0
7 changed files with 850 additions and 30 deletions
+13 -19
View File
@@ -33,6 +33,7 @@ from sqlalchemy import text
from sqlalchemy.orm import Session
from .rag_client import ComplianceRAGClient, RAGSearchResult, get_rag_client
from .regulation_registry import get_registry as _get_regulation_registry
from .similarity_detector import check_similarity
logger = logging.getLogger(__name__)
@@ -245,28 +246,21 @@ def _classify_regulation(regulation_code: str) -> dict:
Returns dict with keys: license, rule, name, source_type.
source_type is one of: law, guideline, standard, restricted.
Delegates to DB-backed RegulationRegistry (with 5min cache).
Falls back to REGULATION_LICENSE_MAP if DB is unavailable.
"""
code = regulation_code.lower().strip()
registry = _get_regulation_registry()
result = registry.classify_regulation(regulation_code)
# Exact match first
if code in REGULATION_LICENSE_MAP:
return REGULATION_LICENSE_MAP[code]
# If registry returned the unknown fallback AND we have a local match,
# prefer the local dict (graceful degradation during migration)
if result.get("license") == "UNKNOWN":
code = regulation_code.lower().strip()
if code in REGULATION_LICENSE_MAP:
return REGULATION_LICENSE_MAP[code]
# Prefix match for Rule 2 (ENISA = standard)
for prefix in _RULE2_PREFIXES:
if code.startswith(prefix):
return {"license": "CC-BY-4.0", "rule": 2, "source_type": "standard",
"name": "ENISA", "attribution": "ENISA, CC BY 4.0"}
# Prefix match for Rule 3 (BSI/ISO/ETSI = restricted)
for prefix in _RULE3_PREFIXES:
if code.startswith(prefix):
return {"license": f"{prefix.rstrip('_').upper()}_RESTRICTED", "rule": 3,
"source_type": "restricted", "name": "INTERNAL_ONLY"}
# Unknown → treat as restricted (safe default)
logger.warning("Unknown regulation_code %r — defaulting to Rule 3 (restricted)", code)
return {"license": "UNKNOWN", "rule": 3, "source_type": "restricted", "name": "INTERNAL_ONLY"}
return result
# ---------------------------------------------------------------------------