feat(consent-tester): Phase B — named CMP library + plugin architecture
cmp_extractor.py refactored to thin coordinator (123 LOC, was 223). Discovers all CMP modules via cmp_library/_registry.py:load_all() at import time. Restart consent-tester to pick up new modules.

New cmp_library/ folder:
- _registry.py: auto-discovers all modules with MATCHER + reconstruct()
- epaas.py: BMW Group ePaaS (extracted from cmp_extractor)
- onetrust.py: cdn.cookielaw.org Groups/Cookies schema
- cookiebot.py: consent.cookiebot.com Categories schema
- usercentrics.py: api.usercentrics.eu services schema
- didomi.py: sdk.privacy-center.org notice + vendors + purposes
- trustarc.py: consent.trustarc.com categories + vendors

Each module:
- MATCHER: re.Pattern matching the CMP JSON endpoint URL
- reconstruct(d: dict) -> str: builds German Markdown cookie-policy text

Phase E (self-improving) will write auto_*.py files into the same folder; _registry already picks those up via pkgutil.iter_modules.
This commit is contained in:
@@ -0,0 +1,55 @@
|
||||
"""
|
||||
CMP Library Registry — discovers and registers all CMP modules.
|
||||
|
||||
Each CMP module exports two module-level symbols:
|
||||
- MATCHER: a compiled regex matching the JSON endpoint URL
|
||||
- reconstruct(data: dict) -> str: builds the cookie-policy text from JSON
|
||||
|
||||
The registry auto-discovers:
|
||||
1. Hand-written modules: epaas, onetrust, cookiebot, usercentrics,
|
||||
didomi, trustarc
|
||||
2. Auto-promoted modules: any file matching `auto_*.py` in this folder
|
||||
(created by Phase E when an LLM successfully discovers a new pattern)
|
||||
|
||||
A consent-tester restart picks up new auto_*.py files automatically.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import importlib
|
||||
import logging
|
||||
import pkgutil
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
|
||||
logger = logging.getLogger(__name__)

# Signature of a CMP module's reconstruct(): takes the parsed JSON payload
# (a dict) and returns the rebuilt cookie-policy text.
_ReconstructFn = Callable[[dict], str]

# One registry entry per CMP module: (cmp_name, url_pattern, reconstruct_fn).
# The pattern slot is deliberately left untyped ("object").
Registry = list[tuple[str, "object", _ReconstructFn]]
|
||||
|
||||
|
||||
def load_all() -> Registry:
    """Import every module in this package and collect MATCHER + reconstruct.

    Walks the ``services.cmp_library`` package, skips every module whose
    name starts with an underscore, imports each remaining module and keeps
    the ones that expose a non-None ``MATCHER`` and a callable
    ``reconstruct``.

    Returns:
        A list of ``(module_name, MATCHER, reconstruct)`` tuples, in the
        order ``pkgutil.iter_modules`` yields the modules. Modules that
        fail to import, or that lack one of the two symbols, are logged at
        WARNING level and left out; they never abort discovery.
    """
    # NOTE(review): imported inside the function rather than at module top,
    # presumably to avoid a circular import while the package itself is
    # still initialising — confirm before hoisting.
    import services.cmp_library as pkg  # type: ignore[import-not-found]

    registry: Registry = []

    # Iterate the package's __path__ rather than Path(pkg.__file__).parent:
    # __file__ is None for a namespace package (a folder without
    # __init__.py), which would make Path() raise TypeError. __path__ is
    # set for both regular and namespace packages.
    for module_info in pkgutil.iter_modules(pkg.__path__):
        name = module_info.name
        if name.startswith("_"):
            # Private helpers (this registry included) are not CMP modules.
            continue
        try:
            module = importlib.import_module(f"{pkg.__name__}.{name}")
            matcher = getattr(module, "MATCHER", None)
            reconstruct = getattr(module, "reconstruct", None)
            if matcher is None or not callable(reconstruct):
                logger.warning(
                    "CMP module %s missing MATCHER or reconstruct() — skipped", name,
                )
                continue
            registry.append((name, matcher, reconstruct))
            logger.info("CMP loaded: %s", name)
        except Exception as e:
            # Deliberately broad: one broken plugin must not take down the
            # others. exc_info keeps the traceback so the failing module
            # can actually be diagnosed from the log.
            logger.warning(
                "CMP module %s failed to load: %s", name, e, exc_info=True,
            )

    return registry
|
||||
Reference in New Issue
Block a user