feat(consent-tester): Phase E — self-improving CMP library

cmp_discovery_log.py:
- sqlite log at /data/cmp_discoveries.db: every LLM-discovered CMP
  pattern recorded with domain, strategy, value, sample text
- Auto-promote (user-chosen 'voll automatisch' mode): when LLM returns
  strategy=url AND extracted text >= 800 words, write a new module
  /data/auto_cmp/auto_<slug>.py with derived regex matcher + reconstruct
- record_discovery() called from dsi_discovery._try_llm_cascade on success

cmp_library/_registry.py:
- Loads both hand-written modules from services/cmp_library/ AND
  auto-promoted modules from /data/auto_cmp/ (CMP_AUTO_DIR env)
- Auto modules use importlib.util.spec_from_file_location, no package
  install needed; restart consent-tester to pick up new ones

dsi_discovery.py:
- _try_llm_cascade now calls record_discovery() on every successful
  LLM analysis (cached AND fresh)

main.py:
- GET /cmp-discoveries — admin endpoint listing all logged discoveries
- DELETE /cmp-discoveries/{id} — rollback (unlinks auto_*.py)

This closes the self-improving loop: first encounter with a new CMP fires
the LLM (cost) → discovery is auto-promoted → all future runs against the
same vendor pattern hit Phase B (Named CMP) at <50ms with no LLM call.
This commit is contained in:
Benjamin Admin
2026-05-16 23:09:23 +02:00
parent 2400aa6a9e
commit 5f2da1de88
4 changed files with 298 additions and 12 deletions
+17
View File
@@ -344,3 +344,20 @@ async def dsi_discovery(req: DSIDiscoveryRequest):
errors=result.errors, errors=result.errors,
scanned_at=datetime.now(timezone.utc).isoformat(), scanned_at=datetime.now(timezone.utc).isoformat(),
) )
# ── Admin: CMP discoveries (Phase E) ────────────────────────────────
@app.get("/cmp-discoveries")
async def cmp_discoveries(limit: int = 200):
    """List LLM-discovered CMP patterns (Phase E auto-promote log)."""
    # Imported lazily so the discovery-log module is only loaded when the
    # admin endpoint is actually hit.
    from services.cmp_discovery_log import list_discoveries

    # `limit` is forwarded unchanged to the log reader.
    discoveries = list_discoveries(limit=limit)
    return {"discoveries": discoveries}
@app.delete("/cmp-discoveries/{disc_id}")
async def cmp_discovery_delete(disc_id: int):
    """Delete a discovery + its auto-promoted module (rollback)."""
    # Imported lazily, mirroring the listing endpoint.
    from services.cmp_discovery_log import delete_discovery

    # The helper's boolean result is passed through as "deleted"; the
    # requested id is echoed back alongside it.
    deleted = delete_discovery(disc_id)
    return {"deleted": deleted, "id": disc_id}
@@ -0,0 +1,226 @@
"""
CMP Discovery Log + Auto-Promotion (Phase E).
When the LLM cascade (Phase C+D) discovers how to extract a previously
unknown CMP, we want that knowledge to persist so the SAME LLM call never
has to fire again — even after a container restart.
Two persistence layers:
1. SQLite log at /data/cmp_discoveries.db — every successful LLM discovery
recorded with domain, strategy, suggested value, sample text. Visible via
admin endpoint GET /cmp-discoveries.
2. Auto-promoted Python modules in /data/auto_cmp/ — when the LLM's hint is
a "url" strategy with a derivable URL pattern AND the reconstruction
yields >=800 words, we write a new file `auto_<slug>.py` with a regex
matcher + reconstruct() function (using generic walker). The CMP library
registry loads these on the next restart (there is no live reload).
User policy (from plan): fully automatic ('voll automatisch') — the risk of
LLM hallucinations is accepted; we mitigate by requiring extracted_words >= 800
before promoting.
"""
from __future__ import annotations
import logging
import os
import re
import sqlite3
from datetime import datetime, timezone
from pathlib import Path
from urllib.parse import urlparse
logger = logging.getLogger(__name__)

# Location of the sqlite discovery log; override with CMP_DISCOVERY_DB.
DB_PATH = os.getenv("CMP_DISCOVERY_DB", "/data/cmp_discoveries.db")

# Directory auto-promoted modules are written to. The CMP library registry
# resolves its load directory from the CMP_AUTO_DIR environment variable, so
# honour the same variable here; otherwise writer and loader silently diverge
# as soon as either CMP_AUTO_DIR or CMP_DISCOVERY_DB is customised. Without
# CMP_AUTO_DIR the previous default (an "auto_cmp" directory next to the DB
# file) is kept.
AUTO_DIR = Path(os.getenv("CMP_AUTO_DIR") or Path(DB_PATH).parent / "auto_cmp")

# Minimum number of whitespace-separated words the extracted text must have
# before a discovery is eligible for auto-promotion.
PROMOTION_MIN_WORDS = 800
# ── DB schema ───────────────────────────────────────────────────────
def _ensure_db() -> None:
    """Create the sqlite file + schema on first use.

    Creates the parent directory of ``DB_PATH`` if needed, then the
    ``discoveries`` table and its domain index. Both statements use
    ``IF NOT EXISTS``, so repeated calls are harmless.
    """
    Path(DB_PATH).parent.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(DB_PATH)
    try:
        # ``with conn`` only commits (or rolls back) the transaction; it does
        # NOT close the connection, hence the explicit close() below.
        with conn:
            conn.execute("""
                CREATE TABLE IF NOT EXISTS discoveries (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    ts TEXT NOT NULL,
                    domain TEXT NOT NULL,
                    llm_used TEXT,
                    strategy TEXT,
                    value TEXT,
                    reconstructed_words INTEGER,
                    sample_text TEXT,
                    promoted_to TEXT
                )
            """)
            conn.execute(
                "CREATE INDEX IF NOT EXISTS idx_disc_domain ON discoveries(domain)"
            )
    finally:
        conn.close()
# ── Record + auto-promote ───────────────────────────────────────────
def record_discovery(
    domain: str,
    llm_used: str,
    strategy: str,
    value: str,
    extracted_text: str,
) -> str | None:
    """Log a successful LLM discovery and (if eligible) auto-promote it to
    a named CMP module in the auto-CMP directory.

    Args:
        domain: Site the discovery was made for.
        llm_used: Identifier of the LLM tier that produced the hint.
        strategy: Hint strategy as returned by the LLM.
        value: Hint value (a URL when ``strategy`` is ``"url"``).
        extracted_text: Text reconstructed using the hint; its word count
            gates promotion and its first 500 characters are stored as sample.

    Returns:
        The path of the promoted module, or None if nothing was promoted
        (including when the DB could not be initialised).

    This function is best-effort: DB and promotion failures are logged as
    warnings and never raised to the caller.
    """
    try:
        _ensure_db()
    except Exception as e:
        logger.warning("CMP DB init failed: %s", e)
        return None

    word_count = len(extracted_text.split()) if extracted_text else 0

    promoted_path: str | None = None
    if word_count >= PROMOTION_MIN_WORDS:
        try:
            promoted_path = _promote(domain, strategy, value)
        except Exception as e:
            logger.warning("Auto-promote failed for %s: %s", domain, e)

    sample = (extracted_text or "")[:500]
    try:
        conn = sqlite3.connect(DB_PATH)
        try:
            # ``with conn`` commits on success / rolls back on error, but
            # leaves the connection open — close it explicitly so repeated
            # calls do not leak handles.
            with conn:
                conn.execute(
                    "INSERT INTO discoveries "
                    "(ts, domain, llm_used, strategy, value, "
                    " reconstructed_words, sample_text, promoted_to) "
                    "VALUES (?,?,?,?,?,?,?,?)",
                    (
                        datetime.now(timezone.utc).isoformat(),
                        domain, llm_used, strategy, value,
                        word_count, sample, promoted_path,
                    ),
                )
        finally:
            conn.close()
    except Exception as e:
        logger.warning("CMP log insert failed: %s", e)
    return promoted_path
def _promote(domain: str, strategy: str, value: str) -> str | None:
"""Write an auto_*.py file the registry will pick up on next restart."""
if strategy != "url":
# Selector/text strategies are too page-specific for a global module
return None
parsed = urlparse(value)
if not parsed.netloc:
return None
AUTO_DIR.mkdir(parents=True, exist_ok=True)
slug = _slugify(domain)
target = AUTO_DIR / f"auto_{slug}.py"
# Derive a fairly tight URL pattern: same netloc + same path-prefix up
# to the last "<id>" or "<config>" path segment. Be conservative.
pattern = _url_to_regex(value)
target.write_text(_AUTO_TEMPLATE.format(
domain=domain,
source_url=value,
pattern=pattern.replace('"', '\\"'),
))
logger.info("Auto-promoted CMP module for %s -> %s", domain, target)
return str(target)
_SLUG_RE = re.compile(r"[^a-z0-9]+")
def _slugify(s: str) -> str:
return _SLUG_RE.sub("_", s.lower()).strip("_") or "unknown"
_NUM_OR_HEX_SEG = re.compile(r"^[a-f0-9\-_~]{8,}$", re.I)
def _url_to_regex(url: str) -> str:
"""Build a regex that matches the same JSON endpoint shape.
Replaces variable-looking segments (hashes, UUIDs, version IDs) with
`[^/]+` so the pattern keeps matching when the CMP rotates its config.
Preserves stable segments (literal paths).
"""
p = urlparse(url)
parts = p.path.split("/")
generalised = []
for seg in parts:
if not seg:
generalised.append(seg)
continue
# If segment contains language-locale (de_DE) we keep alphabetic
if _NUM_OR_HEX_SEG.match(seg) or any(c.isdigit() for c in seg):
generalised.append(r"[^/]+")
else:
generalised.append(re.escape(seg))
pattern_path = "/".join(generalised)
host_escaped = re.escape(p.netloc)
return rf"{host_escaped}{pattern_path}(\?|$)"
_AUTO_TEMPLATE = '''"""Auto-generated CMP matcher for {domain} (Phase E discovery).
Derived from LLM-suggested URL: {source_url}
Reconstruction uses the generic walker — safe default, may produce a less
polished output than a hand-written CMP module.
"""
import re
MATCHER = re.compile(r"{pattern}", re.I)
def reconstruct(d: dict) -> str:
from services.cmp_heuristic import reconstruct_generic
return reconstruct_generic(d)
'''
# ── Read API for admin endpoint ─────────────────────────────────────
def list_discoveries(limit: int = 200) -> list[dict]:
    """Return recent discoveries as a list of dicts for the admin endpoint.

    Args:
        limit: Maximum number of rows to return, newest (highest id) first.

    Returns:
        One dict per row, keyed by column name.
    """
    _ensure_db()
    conn = sqlite3.connect(DB_PATH)
    try:
        # Row factory gives name-addressable rows so dict(row) yields
        # column-name keys.
        conn.row_factory = sqlite3.Row
        rows = conn.execute(
            "SELECT * FROM discoveries ORDER BY id DESC LIMIT ?", (limit,),
        ).fetchall()
    finally:
        # sqlite3's connection context manager never closes the connection,
        # so close it explicitly to avoid leaking a handle per request.
        conn.close()
    return [dict(r) for r in rows]
def delete_discovery(disc_id: int) -> bool:
    """Delete a discovery + its auto-promoted module (if any).

    Args:
        disc_id: Primary key of the row in the ``discoveries`` table.

    Returns:
        False when no row with that id exists, True once the row has been
        deleted. A failure to remove the promoted module file is logged as a
        warning and does not change the result.
    """
    _ensure_db()
    conn = sqlite3.connect(DB_PATH)
    try:
        # ``with conn`` commits the DELETE on success; the connection itself
        # is closed in the ``finally`` below (the context manager does not).
        with conn:
            row = conn.execute(
                "SELECT promoted_to FROM discoveries WHERE id=?", (disc_id,),
            ).fetchone()
            if not row:
                return False
            promoted = row[0]
            conn.execute("DELETE FROM discoveries WHERE id=?", (disc_id,))
    finally:
        conn.close()

    if promoted:
        try:
            Path(promoted).unlink(missing_ok=True)
        except Exception as e:
            logger.warning("Could not unlink %s: %s", promoted, e)
    return True
@@ -17,7 +17,9 @@ A consent-tester restart picks up new auto_*.py files automatically.
from __future__ import annotations from __future__ import annotations
import importlib import importlib
import importlib.util
import logging import logging
import os
import pkgutil import pkgutil
from pathlib import Path from pathlib import Path
from typing import Callable from typing import Callable
@@ -27,12 +29,22 @@ logger = logging.getLogger(__name__)
# (cmp_name, url_pattern, reconstruct_fn) # (cmp_name, url_pattern, reconstruct_fn)
Registry = list[tuple[str, "object", Callable[[dict], str]]] Registry = list[tuple[str, "object", Callable[[dict], str]]]
# Phase E: persistent auto-promoted modules live in a writable volume
# (separate from the source tree so deploys do not wipe them).
# The directory is taken from the CMP_AUTO_DIR environment variable and
# defaults to /data/auto_cmp when the variable is unset.
AUTO_DIR = Path(os.getenv("CMP_AUTO_DIR", "/data/auto_cmp"))
def load_all() -> Registry: def load_all() -> Registry:
"""Import every module in this package and collect MATCHER + reconstruct.""" """Import every module in this package and from AUTO_DIR."""
import services.cmp_library as pkg # type: ignore[import-not-found]
registry: Registry = [] registry: Registry = []
_load_from_package(registry)
_load_from_auto_dir(registry)
return registry
def _load_from_package(registry: Registry) -> None:
"""Import the hand-written modules in services/cmp_library/."""
import services.cmp_library as pkg # type: ignore[import-not-found]
pkg_path = Path(pkg.__file__).parent pkg_path = Path(pkg.__file__).parent
for module_info in pkgutil.iter_modules([str(pkg_path)]): for module_info in pkgutil.iter_modules([str(pkg_path)]):
name = module_info.name name = module_info.name
@@ -40,16 +52,35 @@ def load_all() -> Registry:
continue continue
try: try:
module = importlib.import_module(f"services.cmp_library.{name}") module = importlib.import_module(f"services.cmp_library.{name}")
matcher = getattr(module, "MATCHER", None) _register(registry, name, module)
reconstruct = getattr(module, "reconstruct", None)
if matcher is None or not callable(reconstruct):
logger.warning(
"CMP module %s missing MATCHER or reconstruct() — skipped", name,
)
continue
registry.append((name, matcher, reconstruct))
logger.info("CMP loaded: %s", name)
except Exception as e: except Exception as e:
logger.warning("CMP module %s failed to load: %s", name, e) logger.warning("CMP module %s failed to load: %s", name, e)
return registry
def _load_from_auto_dir(registry: Registry) -> None:
    """Load every ``auto_*.py`` file found in AUTO_DIR into *registry*.

    Files are processed in sorted order. Each file is executed as a
    standalone module (no package install needed) and handed to
    ``_register``. A file that fails to load is logged as a warning and
    skipped; the remaining files are still processed. Does nothing when
    AUTO_DIR does not exist.
    """
    if AUTO_DIR.exists():
        for module_file in sorted(AUTO_DIR.glob("auto_*.py")):
            module_name = module_file.stem
            try:
                spec = importlib.util.spec_from_file_location(module_name, module_file)
                if spec and spec.loader:
                    loaded = importlib.util.module_from_spec(spec)
                    spec.loader.exec_module(loaded)
                    _register(registry, module_name, loaded)
            except Exception as exc:
                logger.warning(
                    "Auto CMP module %s failed to load: %s", module_name, exc,
                )
def _register(registry: Registry, name: str, module) -> None:
    """Append ``(name, MATCHER, reconstruct)`` for *module* to *registry*.

    A module qualifies when it exposes a ``MATCHER`` attribute that is not
    None and a callable ``reconstruct``. Anything else is skipped with a
    warning.
    """
    matcher = getattr(module, "MATCHER", None)
    reconstruct = getattr(module, "reconstruct", None)
    usable = matcher is not None and callable(reconstruct)
    if usable:
        registry.append((name, matcher, reconstruct))
        logger.info("CMP loaded: %s", name)
    else:
        logger.warning(
            "CMP module %s missing MATCHER or reconstruct() — skipped", name,
        )
+12
View File
@@ -836,6 +836,18 @@ async def _try_llm_cascade(
if wc >= 300: if wc >= 300:
await cache_set(netloc, hint) await cache_set(netloc, hint)
logger.info("LLM cached for %s (%s): %d words", netloc, hint.get("_tier"), wc) logger.info("LLM cached for %s (%s): %d words", netloc, hint.get("_tier"), wc)
# Phase E: log discovery + (if eligible) auto-promote to named CMP
try:
from services.cmp_discovery_log import record_discovery
record_discovery(
domain=netloc,
llm_used=hint.get("_tier", "unknown"),
strategy=hint.get("strategy", ""),
value=hint.get("value", ""),
extracted_text=text,
)
except Exception as e:
logger.debug("CMP discovery log failed: %s", e)
return text, wc return text, wc