diff --git a/backend-compliance/compliance/api/specialist_agent_routes.py b/backend-compliance/compliance/api/specialist_agent_routes.py index 74c0c43f..5ca88e0e 100644 --- a/backend-compliance/compliance/api/specialist_agent_routes.py +++ b/backend-compliance/compliance/api/specialist_agent_routes.py @@ -14,6 +14,7 @@ Endpoints: from __future__ import annotations import asyncio +import html as html_lib import json import logging import uuid @@ -244,6 +245,11 @@ async def _process_slot( "error": fetch_err, }) if text: + # HTML-Entity-Decode: dsi-discovery liefert manchmal &nbsp; / &amp; + # / &auml; als Literal-String — der Agent würde regex-pattern + # darüber stolpern. Wir decoden VOR dem Vault-Dump so dass der + # raw_text auch lesbar bleibt. + text = html_lib.unescape(text) vault.put_bytes("raw", slot, "source.txt", text.encode("utf-8"), mime="text/plain") diff --git a/backend-compliance/compliance/services/specialist_agents/impressum/mcs.py b/backend-compliance/compliance/services/specialist_agents/impressum/mcs.py index 7e00bb26..b336fe28 100644 --- a/backend-compliance/compliance/services/specialist_agents/impressum/mcs.py +++ b/backend-compliance/compliance/services/specialist_agents/impressum/mcs.py @@ -108,7 +108,7 @@ MCS: tuple[MC, ...] = ( r"Vertretungsberechtigt|vertreten\s+durch|" r"Inhaber(?:in)?|" r"Pers(?:ö|oe)nlich\s+haftend)" - r"\s*[:.\s]", + r"\s*[:.\s(]", re.IGNORECASE, ), re.compile(r"\bManagement\s*[:.\s]\s*[A-ZÄÖÜ]",