"""Tests für Impressum-Agent v3 (Sprint 1.12). Mockt rag_document_checker damit Tests offline laufen + prüft die Layer-0-Boost-Logik isoliert. """ from __future__ import annotations import asyncio import pytest from compliance.services.specialist_agents import ( AgentInput, ImpressumAgent, Severity, ) from compliance.services.specialist_agents.impressum.agent import ( _build_measure, ) from compliance.services.specialist_agents.impressum.mcs import MCS from compliance.services.specialist_agents.impressum.regex_boost import ( BOOST_KEYWORDS, boost_matches_db_mc, compute_regex_boosts, criteria_on_topic, ) from compliance.services.specialist_agents.impressum.v3_engine import ( _filter_controls, ) TESLA_TEXT = ( "Tesla Germany GmbH\nLudwig-Prandtl-Strasse 25-29\n12526 Berlin\n" "E-Mail: info@tesla.com\n" "Telefon: +49 89 1250 16 800\n" "Management: Elon Musk\n" "Handelsregister: HRB 218904 B Charlottenburg\n" "USt-IdNr: DE123456789\n" ) def _run(coro): return asyncio.get_event_loop().run_until_complete(coro) def test_compute_regex_boosts_detects_basic_fields(): hits = compute_regex_boosts(TESLA_TEXT, business_scope=set()) # Tesla hat klassische Pflichtangaben assert "kontakt_email" in hits assert "kontakt_telefon" in hits assert "handelsregister" in hits assert "ust_id" in hits assert "vertretungsberechtigte" in hits # "Management" # KFZ-Auto-Detect → aufsichtsbehoerde wäre relevant aber kein # Pattern getroffen (KBA nicht genannt) def test_compute_regex_boosts_short_text_empty(): assert compute_regex_boosts("x", business_scope=set()) == set() def test_boost_matches_db_mc_finds_telefon(): boosts = {"kontakt_telefon"} pass_crit = [ "Telefonnummer angeben", "Erreichbar per Telefon und E-Mail", ] matched = boost_matches_db_mc(boosts, pass_crit) assert matched == "kontakt_telefon" def test_boost_matches_db_mc_returns_none_when_unrelated(): boosts = {"kontakt_telefon"} pass_crit = [ "Cookie-Banner muss zentriert sein", ] assert boost_matches_db_mc(boosts, pass_crit) is None 
def test_boost_matches_db_mc_uses_fail_criteria():
    """Words from fail_criteria should also support the assignment."""
    boosts = {"name_anbieter"}
    pass_crit = ["Sichtbar"]
    fail_crit = ["Keine Postadresse angegeben", "Adresse fehlt"]
    matched = boost_matches_db_mc(boosts, pass_crit, fail_crit)
    assert matched == "name_anbieter"


def test_boost_matches_db_mc_eto_address_case():
    """Concrete ETO case: AUTH-1954-A07 'postal address + registered office'."""
    boosts = {"name_anbieter"}
    pass_crit = [
        "Vollständige Postadresse (Straße, Hausnummer, PLZ, Ort, Land)",
        "Oder: Eindeutige Angabe des Geschäftssitzes",
        "Adresse ist aktuell und korrekt",
    ]
    matched = boost_matches_db_mc(boosts, pass_crit)
    assert matched == "name_anbieter"


def test_boost_keywords_cover_all_field_ids():
    """Every field_id in mcs.py must have an entry in BOOST_KEYWORDS."""
    # MCS comes from the module-level import; no local re-import needed.
    for mc in MCS:
        assert mc.field_id in BOOST_KEYWORDS, (
            f"BOOST_KEYWORDS missing for {mc.field_id}"
        )


@pytest.fixture
def no_llm(monkeypatch):
    """Disable the LLM semantic validator.

    The agent then checks the 12 mcs.py pattern MCs deterministically,
    directly against the text.
    """

    async def _no_validator(*a, **kw):
        # Stand-in for validate_present: accepts anything, returns no results.
        return {}

    monkeypatch.setattr(
        "compliance.services.specialist_agents.impressum.agent.validate_present",
        _no_validator,
    )


def test_agent_emits_pflichtangabe_findings(no_llm):
    agent = ImpressumAgent()
    out = _run(agent.evaluate(AgentInput(doc_type="impressum", text=TESLA_TEXT)))
    fids = {f.field_id for f in out.findings}
    # Tesla says 'Management' (English) → the German managing-director
    # label is missing
    assert "vertretungsberechtigte_label_korrekt" in fids
    f = next(f for f in out.findings
             if f.field_id == "vertretungsberechtigte_label_korrekt")
    assert f.severity == Severity.MEDIUM.value
    assert f.check_id == "IMP-vertretungsberechtigte_label_korrekt"
    assert f.severity_reason == "pflichtangabe_missing"
    # Mandatory details that are present produce NO finding
    assert "kontakt_email" not in fids
    assert "handelsregister" not in fids


def test_agent_coverage_has_all_12(no_llm):
    agent = ImpressumAgent()
    out = _run(agent.evaluate(AgentInput(doc_type="impressum", text=TESLA_TEXT)))
    assert out.mc_total == len(MCS)
    # exactly one coverage entry per MC
    ok = [c for c in out.mc_coverage if c.status == "ok"]
    # name, email, phone, commercial register, VAT ID,
    # authorized representatives = 6 present
    assert len(ok) == 6


def test_agent_notes(no_llm):
    agent = ImpressumAgent()
    out = _run(agent.evaluate(AgentInput(doc_type="impressum", text=TESLA_TEXT)))
    assert "§5-TMG-MCs geprüft" in out.notes


def test_short_text_skipped():
    agent = ImpressumAgent()
    out = _run(agent.evaluate(AgentInput(doc_type="impressum", text="x")))
    assert all(c.status == "skipped" for c in out.mc_coverage)
    assert not out.findings


def test_agent_version_is_three():
    agent = ImpressumAgent()
    assert agent.agent_version == "3.0"


# ── Topic gate: criteria_on_topic ───────────────────────────────────

def test_criteria_on_topic_keeps_genuine_telefon():
    assert criteria_on_topic([
        "Telefonnummer angeben",
        "Erreichbar per Telefon",
    ]) is True


def test_criteria_on_topic_keeps_genuine_address():
    assert criteria_on_topic([
        "Vollständige Postadresse (Straße, Hausnummer, PLZ, Ort)",
    ]) is True


def test_criteria_on_topic_drops_bestellbestaetigung():
    # Foreign MC: no overlap with Impressum topics → dropped.
    assert criteria_on_topic([
        "Bestellbestätigung wird nach Vertragsschluss versendet",
        "Bestelleingang wird dokumentiert",
    ]) is False


def test_criteria_on_topic_single_incidental_hit_dropped():
    # 'E-Mail' alone (1 hit) is not enough — needs >=2.
    assert criteria_on_topic([
        "Bestellbestätigung wird per E-Mail versendet",
    ]) is False


def test_criteria_on_topic_drops_behoerdliche_anzeige():
    assert criteria_on_topic([
        "Behördliche Anzeige der Tätigkeit erfolgt",
        "Gewerbeanmeldung liegt vor",
    ]) is False


def test_criteria_on_topic_empty_kept():
    # No criteria = no signal → keep conservatively.
    assert criteria_on_topic([]) is True


# ── Scope filter: _filter_controls ──────────────────────────────────

def _mc(control_id, pass_criteria):
    """Build a minimal control dict with the given id and pass criteria."""
    return {"control_id": control_id, "pass_criteria": pass_criteria,
            "fail_criteria": []}


def test_filter_controls_drops_gov_when_out_of_scope():
    controls = [_mc("GOV-814-A03", ["Behörde meldet an Aufsichtsstelle"])]
    kept, stats = _filter_controls(controls, business_scope=set())
    assert kept == []
    assert stats["sector_dropped"] == 1


def test_filter_controls_keeps_gov_when_in_scope():
    controls = [_mc("GOV-814-A03", ["Aufsichtsbehörde und Behörde benannt"])]
    kept, stats = _filter_controls(controls, business_scope={"government"})
    assert len(kept) == 1
    assert stats["sector_dropped"] == 0


def test_filter_controls_keeps_genuine_impressum_mc():
    controls = [_mc("AUTH-1954-A07",
                    ["Vollständige Postadresse mit Straße und PLZ"])]
    kept, stats = _filter_controls(controls, business_scope=set())
    assert len(kept) == 1
    assert stats["sector_dropped"] == 0
    assert stats["offtopic_dropped"] == 0


def test_filter_controls_drops_offtopic_non_sector_mc():
    controls = [_mc("ECOM-1-A1",
                    ["Bestellbestätigung nach Vertragsschluss versenden"])]
    kept, stats = _filter_controls(controls, business_scope=set())
    assert kept == []
    assert stats["offtopic_dropped"] == 1


# ── Measure instead of question: _build_measure ─────────────────────

def test_build_measure_is_imperative_not_question():
    m = _build_measure("USt-IdNr", "§ 5 Abs. 1 Nr. 6 TMG")
    assert "?" not in m
    assert "ergänzen" in m.lower()
    assert "Rechtsgrundlage" in m


def test_build_measure_handles_empty_label():
    m = _build_measure("", "")
    assert "?" not in m
    assert m.strip() != ""


# ── Delegation to main-tool engine + filter (integration) ───────────

def test_run_v3_pipeline_delegates_and_filters(monkeypatch):
    """run_v3_pipeline loads via the main-tool engine (_load_controls is
    mocked), normalizes JSONB strings, and the sector/topic gate removes
    GOV (out of scope) and foreign MCs. The genuine MC is kept."""
    from compliance.services.specialist_agents.impressum import v3_engine

    async def _fake_load(doc_type, db_url, limit, business_scope=None):
        # pass_criteria deliberately given as a JSON STRING (like asyncpg JSONB)
        return [
            {"control_id": "AUTH-1954-A07", "title": "USt-IdNr",
             "regulation": "TMG", "article": "§ 5", "severity": "HIGH",
             "check_question": "Ist die USt-IdNr angegeben?",
             "pass_criteria": '["USt-IdNr"]', "fail_criteria": "[]"},
            {"control_id": "GOV-814-A03", "title": "Behördliche Anzeige",
             "regulation": "X", "article": "", "severity": "HIGH",
             "check_question": "Behörde informiert?",
             "pass_criteria": '["Aufsichtsbehörde und Behörde benannt"]',
             "fail_criteria": "[]"},
            {"control_id": "ECOM-1-A1", "title": "Bestellbestätigung",
             "regulation": "X", "article": "", "severity": "HIGH",
             "check_question": "Bestellbestätigung versandt?",
             "pass_criteria": '["Bestellbestätigung nach Vertragsschluss versenden"]',
             "fail_criteria": "[]"},
        ]

    monkeypatch.setattr(
        "compliance.services.rag_document_checker._load_controls",
        _fake_load,
    )
    # The AUTH MC matches via keyword → no layer-2 embedding needed; no
    # mc_embedding_matcher mock required.
    text = ("Beispiel GmbH\nMusterstr. 1\n12345 Berlin\n"
            "USt-IdNr: DE123456789\n") * 5  # >100 characters
    results, telem = _run(
        v3_engine.run_v3_pipeline(text, business_scope=set()),
    )
    cids = {r["control_id"] for r in results}
    assert "GOV-814-A03" not in cids  # sector out of scope
    assert "ECOM-1-A1" not in cids  # off-topic
    assert "AUTH-1954-A07" in cids  # genuine MC is kept
    assert telem["sector_dropped"] == 1
    assert telem["offtopic_dropped"] == 1