diff --git a/backend-compliance/compliance/services/impressum_multi_entity_check.py b/backend-compliance/compliance/services/impressum_multi_entity_check.py
index 00d41530..cce74770 100644
--- a/backend-compliance/compliance/services/impressum_multi_entity_check.py
+++ b/backend-compliance/compliance/services/impressum_multi_entity_check.py
@@ -23,11 +23,19 @@ import re
 logger = logging.getLogger(__name__)
 
 _ENTITY_PAT = re.compile(
-    r"([A-ZÄÖÜ][\w\-\&\s]{1,50}?\s+(?:GmbH|AG|UG|KG|SE|"
+    r"([A-ZÄÖÜ][\w\-\& ]{1,50}?\s+(?:GmbH|AG|UG|KG|SE|"
     r"e\.V\.|GbR|OHG|Limited|Ltd|LLC))",
     re.IGNORECASE,
 )
 
+# Header keywords end on a word boundary so that a shorter keyword never eats
+# the front of a longer word ("Anbieter" in "Anbieterkennzeichnung").
+_NAME_NOISE_PAT = re.compile(
+    r"^(?:(?:Impressum|Anbieterkennzeichnung|Anbieter|Diensteanbieter|"
+    r"Verantwortlich(?:er)?|Kontakt|Adresse)\b|@\S+|.+@.+)\s*[:|\-]?\s*",
+    re.IGNORECASE,
+)
+
 _USTID_PAT = re.compile(r"\b(?:USt-?Id(?:Nr)?\.?|VAT(?:-?Id)?)\s*[:.\s]\s*"
                         r"(DE\d{8,10}|[A-Z]{2}\d{6,12})", re.IGNORECASE)
 _HR_PAT = re.compile(r"\b(?:HR[BA]|Handelsregister|Registergericht)"
@@ -36,6 +44,17 @@ _GF_PAT = re.compile(r"(?:Geschäftsführer|Vertretungsberechtigt|"
                      r"vertreten\s+durch)\s*[:.\s]+", re.IGNORECASE)
 
 
+def _clean_entity_name(raw: str) -> str:
+    """Strip leading header noise + collapse whitespace."""
+    name = raw.strip()
+    # If the match spans multiple lines (regex captured a header before
+    # the actual company name), keep only the last line.
+    if "\n" in name:
+        name = name.rsplit("\n", 1)[-1].strip()
+    name = _NAME_NOISE_PAT.sub("", name).strip()
+    return re.sub(r"\s+", " ", name)
+
+
 def _slice_entities(text: str) -> list[tuple[str, str]]:
     """Return [(entity_name, text_slice)] for each detected entity."""
     matches = list(_ENTITY_PAT.finditer(text))
@@ -45,7 +64,7 @@ def _slice_entities(text: str) -> list[tuple[str, str]]:
     for i, m in enumerate(matches):
         start = m.start()
         end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
-        slices.append((m.group(1).strip(), text[start:end]))
+        slices.append((_clean_entity_name(m.group(1)), text[start:end]))
     return slices
 
 
diff --git a/backend-compliance/tests/test_impressum_multi_entity_check.py b/backend-compliance/tests/test_impressum_multi_entity_check.py
new file mode 100644
index 00000000..115c0c98
--- /dev/null
+++ b/backend-compliance/tests/test_impressum_multi_entity_check.py
@@ -0,0 +1,115 @@
+"""Tests for B9 Multi-Entity-Impressum-Check.
+
+GT-Anker: Elli IMPRESSUM-001 — bei mehreren Entitäten USt-IdNr nur
+bei einer von zwei vorhanden → Finding muss per-Entity erkennen.
+"""
+
+from compliance.services.impressum_multi_entity_check import (
+    _clean_entity_name,
+    _slice_entities,
+    check_multi_entity_impressum,
+)
+
+
+_ELLI_LIKE = """
+Impressum
+
+Volkswagen Group Charging GmbH
+Karl-Liebknecht-Str. 32
+10178 Berlin
+Amtsgericht Charlottenburg HRB 208967 B
+Geschäftsführer: Giovanni Palazzo, Mark Möller
+Telefon: 00800 3554 1111
+E-Mail: info@elli.eco
+
+Elli Mobility GmbH
+Karl-Liebknecht-Str. 32
+10178 Berlin
+Amtsgericht Charlottenburg HRB 274616 B
+USt-IdNr.: DE814424009
+Geschäftsführer: Joschi Jennermann
+Telefon: 00800 00002030
+E-Mail: ellimobility@elli.eco
+"""
+
+
+class TestCleanEntityName:
+    def test_strips_header_prefix(self):
+        assert _clean_entity_name(
+            "Impressum\n\nVolkswagen Group Charging GmbH"
+        ) == "Volkswagen Group Charging GmbH"
+
+    def test_strips_longest_header_keyword(self):
+        # Ordered alternation must not stop at "Anbieter" and leave a stub.
+        assert _clean_entity_name(
+            "Anbieterkennzeichnung Acme GmbH"
+        ) == "Acme GmbH"
+
+    def test_strips_email_artifact(self):
+        # mid-text matches may capture an email TLD before the next entity
+        assert _clean_entity_name(
+            "eco\n\nElli Mobility GmbH"
+        ) == "Elli Mobility GmbH"
+
+    def test_collapses_whitespace(self):
+        assert _clean_entity_name(
+            "Acme   Holding \t GmbH"
+        ) == "Acme Holding GmbH"
+
+    def test_clean_name_passes_through(self):
+        assert _clean_entity_name("Elli Mobility GmbH") == "Elli Mobility GmbH"
+
+
+class TestSliceEntities:
+    def test_no_entities_empty(self):
+        assert _slice_entities("Random text without companies.") == []
+
+    def test_single_entity_no_slice(self):
+        # Single-entity impressum is the normal case — multi-entity-only.
+        assert _slice_entities("Acme GmbH\nMusterstr 1\n") == []
+
+    def test_two_entities_clean_names(self):
+        slices = _slice_entities(_ELLI_LIKE)
+        names = [n for n, _ in slices]
+        assert names == [
+            "Volkswagen Group Charging GmbH",
+            "Elli Mobility GmbH",
+        ]
+
+
+class TestCheck:
+    def test_elli_pattern_finds_missing_ust_id(self):
+        findings = check_multi_entity_impressum(
+            {"doc_texts": {"impressum": _ELLI_LIKE}}
+        )
+        ust_findings = [
+            f for f in findings if f["check_id"] == "IMPRESSUM-MULTI-UST_ID"
+        ]
+        assert len(ust_findings) == 1
+        f = ust_findings[0]
+        assert f["entities_missing"] == ["Volkswagen Group Charging GmbH"]
+        assert f["entities_present"] == ["Elli Mobility GmbH"]
+        assert f["severity"] == "MEDIUM"
+        assert "TMG" in f["norm"]
+
+    def test_no_impressum_no_findings(self):
+        assert check_multi_entity_impressum({"doc_texts": {}}) == []
+
+    def test_single_entity_no_findings(self):
+        text = "Acme GmbH\nMusterstr 1\nUSt-IdNr DE123456789"
+        assert check_multi_entity_impressum(
+            {"doc_texts": {"impressum": text}}
+        ) == []
+
+    def test_both_entities_have_ust_id_no_finding(self):
+        text = (
+            "Acme GmbH\nUSt-IdNr DE111111111\n\n"
+            "Foo Holding GmbH\nUSt-IdNr DE222222222\n"
+        )
+        ust_findings = [
+            f for f in check_multi_entity_impressum(
+                {"doc_texts": {"impressum": text}}
+            )
+            if f["check_id"] == "IMPRESSUM-MULTI-UST_ID"
+        ]
+        assert ust_findings == []