"""Tests für die Auto-Learning-Pattern-Library.""" from __future__ import annotations import json import pytest @pytest.fixture def tmp_lib(tmp_path, monkeypatch): p = tmp_path / "patterns.json" monkeypatch.setenv("AGENT_PATTERN_LIBRARY", str(p)) import compliance.services.specialist_agents._pattern_library as lib lib._invalidate_cache() yield lib, p lib._invalidate_cache() def test_record_creates_file(tmp_lib): lib, p = tmp_lib assert not p.exists() lib.record("kontakt_telefon", "Telefonnr.", 0.9, "impressum") assert p.exists() data = json.loads(p.read_text()) assert len(data["patterns"]) == 1 assert data["patterns"][0]["label_used"] == "Telefonnr." assert data["patterns"][0]["observed_count"] == 1 def test_record_increments_existing(tmp_lib): lib, _ = tmp_lib lib.record("kontakt_telefon", "Telefonnr.", 0.9, "impressum") lib.record("kontakt_telefon", "Telefonnr.", 0.85, "impressum") lib.record("kontakt_telefon", "telefonnr.", 0.8, "impressum") # case-i raws = lib.list_all() assert len(raws) == 1 assert raws[0]["observed_count"] == 3 def test_record_separate_per_field_id(tmp_lib): lib, _ = tmp_lib lib.record("kontakt_telefon", "Tel", 0.9, "impressum") lib.record("kontakt_email", "Tel", 0.9, "impressum") assert len(lib.list_all()) == 2 def test_record_empty_inputs_noop(tmp_lib): lib, p = tmp_lib lib.record("", "Tel", 0.9, "impressum") lib.record("kontakt_telefon", "", 0.9, "impressum") lib.record("kontakt_telefon", "Tel", 0.9, "") assert not p.exists() def test_load_patterns_returns_compiled_regex(tmp_lib): lib, _ = tmp_lib lib.record("kontakt_telefon", "Telefonnr.", 0.9, "impressum") pats = lib.load_patterns_for("kontakt_telefon", "impressum") assert len(pats) == 1 m = pats[0].search("Hier: Telefonnr. 0761/12345") assert m is not None def test_load_patterns_filters_low_confidence(tmp_lib): lib, _ = tmp_lib lib.record("kontakt_telefon", "WeakLabel", 0.3, "impressum") pats = lib.load_patterns_for( "kontakt_telefon", "impressum", min_avg_confidence=0.5, ) assert pats == [] # observed_count filter pats = lib.load_patterns_for( "kontakt_telefon", "impressum", min_observed=2, ) assert pats == [] def test_label_to_regex_telefon(): from compliance.services.specialist_agents._pattern_library import ( _label_to_regex, ) rx = _label_to_regex("Telefonnr.") import re assert re.search(rx, "Telefonnr. 0761/12345", re.I) assert re.search(rx, "Telefonnr 0761", re.I) def test_label_to_regex_email(): from compliance.services.specialist_agents._pattern_library import ( _label_to_regex, ) rx = _label_to_regex("Mailadresse") import re assert re.search(rx, "Mailadresse: x@y.de", re.I) def test_prune_low_confidence_keeps_recent(tmp_lib): lib, _ = tmp_lib lib.record("kontakt_telefon", "Tel", 0.9, "impressum") pruned = lib.prune_low_confidence(min_runs_before_prune=100) assert pruned == 0 # Nur einmal observed → noch nicht prunen assert len(lib.list_all()) == 1 def test_load_patterns_for_nonexistent_returns_empty(tmp_lib): lib, _ = tmp_lib assert lib.load_patterns_for("ghost", "impressum") == []