From 8b9cad88ae8ae8f6ea5fba80f72b5d7b96c3670f Mon Sep 17 00:00:00 2001
From: Benjamin Admin
Date: Sun, 7 Jun 2026 00:08:18 +0200
Subject: [PATCH] fix(b9): clean entity names in multi-entity-impressum (GT IMPRESSUM-001)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Der Multi-Entity-Check fängt Elli's USt-IdNr-Lücke (VW Group Charging GmbH
hat keine, Elli Mobility GmbH hat eine), aber Entity-Namen waren mit
Header-Noise verunreinigt:
  'Impressum\n\nVolkswagen Group Charging GmbH'
  'eco\n\nElli Mobility GmbH'

Behoben:
- _ENTITY_PAT lässt nur Space im Namen zu (kein \s/\n mehr)
- _clean_entity_name() trimmt Header-Worte (Impressum, Anbieter, ...)
  und nimmt nur die letzte Zeile vor Legal-Form-Suffix
- 11 neue Tests, davon einer mit Elli-like Impressum als
  Charakterisierungs-Test

Damit ist die finale Finding-Ausgabe für Audit-Reports lesbar
('Fehlt bei: Volkswagen Group Charging GmbH') statt verunreinigt.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 .../services/impressum_multi_entity_check.py  |  21 +++-
 .../test_impressum_multi_entity_check.py      | 109 ++++++++++++++++++
 2 files changed, 128 insertions(+), 2 deletions(-)
 create mode 100644 backend-compliance/tests/test_impressum_multi_entity_check.py

diff --git a/backend-compliance/compliance/services/impressum_multi_entity_check.py b/backend-compliance/compliance/services/impressum_multi_entity_check.py
index 00d41530..cce74770 100644
--- a/backend-compliance/compliance/services/impressum_multi_entity_check.py
+++ b/backend-compliance/compliance/services/impressum_multi_entity_check.py
@@ -23,11 +23,17 @@ import re
 logger = logging.getLogger(__name__)
 
 _ENTITY_PAT = re.compile(
-    r"([A-ZÄÖÜ][\w\-\&\s]{1,50}?\s+(?:GmbH|AG|UG|KG|SE|"
+    r"([A-ZÄÖÜ][\w\-\& ]{1,50}?\s+(?:GmbH|AG|UG|KG|SE|"
     r"e\.V\.|GbR|OHG|Limited|Ltd|LLC))",
     re.IGNORECASE,
 )
 
+_NAME_NOISE_PAT = re.compile(
+    r"^(?:Impressum|Anbieter|Anbieterkennzeichnung|Diensteanbieter|"
+    r"Verantwortlich(?:er)?|Kontakt|Adresse|@\S+|.+@.+)\s*[:|\-]?\s*",
+    re.IGNORECASE,
+)
+
 _USTID_PAT = re.compile(r"\b(?:USt-?Id(?:Nr)?\.?|VAT(?:-?Id)?)\s*[:.\s]\s*"
                         r"(DE\d{8,10}|[A-Z]{2}\d{6,12})", re.IGNORECASE)
 _HR_PAT = re.compile(r"\b(?:HR[BA]|Handelsregister|Registergericht)"
@@ -36,6 +42,17 @@ _GF_PAT = re.compile(r"(?:Geschäftsführer|Vertretungsberechtigt|"
                      r"vertreten\s+durch)\s*[:.\s]+", re.IGNORECASE)
 
 
+def _clean_entity_name(raw: str) -> str:
+    """Strip leading header noise + collapse whitespace."""
+    name = raw.strip()
+    # If the match spans multiple lines (regex captured a header before
+    # the actual company name), keep only the last line.
+    if "\n" in name:
+        name = name.rsplit("\n", 1)[-1].strip()
+    name = _NAME_NOISE_PAT.sub("", name).strip()
+    return re.sub(r"\s+", " ", name)
+
+
 def _slice_entities(text: str) -> list[tuple[str, str]]:
     """Return [(entity_name, text_slice)] for each detected entity."""
     matches = list(_ENTITY_PAT.finditer(text))
@@ -45,7 +62,7 @@ def _slice_entities(text: str) -> list[tuple[str, str]]:
     for i, m in enumerate(matches):
         start = m.start()
         end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
-        slices.append((m.group(1).strip(), text[start:end]))
+        slices.append((_clean_entity_name(m.group(1)), text[start:end]))
     return slices
 
 
diff --git a/backend-compliance/tests/test_impressum_multi_entity_check.py b/backend-compliance/tests/test_impressum_multi_entity_check.py
new file mode 100644
index 00000000..115c0c98
--- /dev/null
+++ b/backend-compliance/tests/test_impressum_multi_entity_check.py
@@ -0,0 +1,109 @@
+"""Tests for B9 Multi-Entity-Impressum-Check.
+
+GT-Anker: Elli IMPRESSUM-001 — bei mehreren Entitäten USt-IdNr nur
+bei einer von zwei vorhanden → Finding muss per-Entity erkennen.
+"""
+
+from compliance.services.impressum_multi_entity_check import (
+    _clean_entity_name,
+    _slice_entities,
+    check_multi_entity_impressum,
+)
+
+
+_ELLI_LIKE = """
+Impressum
+
+Volkswagen Group Charging GmbH
+Karl-Liebknecht-Str. 32
+10178 Berlin
+Amtsgericht Charlottenburg HRB 208967 B
+Geschäftsführer: Giovanni Palazzo, Mark Möller
+Telefon: 00800 3554 1111
+E-Mail: info@elli.eco
+
+Elli Mobility GmbH
+Karl-Liebknecht-Str. 32
+10178 Berlin
+Amtsgericht Charlottenburg HRB 274616 B
+USt-IdNr.: DE814424009
+Geschäftsführer: Joschi Jennermann
+Telefon: 00800 00002030
+E-Mail: ellimobility@elli.eco
+"""
+
+
+class TestCleanEntityName:
+    def test_strips_header_prefix(self):
+        assert _clean_entity_name(
+            "Impressum\n\nVolkswagen Group Charging GmbH"
+        ) == "Volkswagen Group Charging GmbH"
+
+    def test_strips_email_artifact(self):
+        # mid-text matches may capture an email TLD before the next entity
+        assert _clean_entity_name(
+            "eco\n\nElli Mobility GmbH"
+        ) == "Elli Mobility GmbH"
+
+    def test_collapses_whitespace(self):
+        assert _clean_entity_name(
+            "Acme Holding GmbH"
+        ) == "Acme Holding GmbH"
+
+    def test_clean_name_passes_through(self):
+        assert _clean_entity_name("Elli Mobility GmbH") == "Elli Mobility GmbH"
+
+
+class TestSliceEntities:
+    def test_no_entities_empty(self):
+        assert _slice_entities("Random text without companies.") == []
+
+    def test_single_entity_no_slice(self):
+        # Single-entity impressum is the normal case — multi-entity-only.
+        assert _slice_entities("Acme GmbH\nMusterstr 1\n") == []
+
+    def test_two_entities_clean_names(self):
+        slices = _slice_entities(_ELLI_LIKE)
+        names = [n for n, _ in slices]
+        assert names == [
+            "Volkswagen Group Charging GmbH",
+            "Elli Mobility GmbH",
+        ]
+
+
+class TestCheck:
+    def test_elli_pattern_finds_missing_ust_id(self):
+        findings = check_multi_entity_impressum(
+            {"doc_texts": {"impressum": _ELLI_LIKE}}
+        )
+        ust_findings = [
+            f for f in findings if f["check_id"] == "IMPRESSUM-MULTI-UST_ID"
+        ]
+        assert len(ust_findings) == 1
+        f = ust_findings[0]
+        assert f["entities_missing"] == ["Volkswagen Group Charging GmbH"]
+        assert f["entities_present"] == ["Elli Mobility GmbH"]
+        assert f["severity"] == "MEDIUM"
+        assert "TMG" in f["norm"]
+
+    def test_no_impressum_no_findings(self):
+        assert check_multi_entity_impressum({"doc_texts": {}}) == []
+
+    def test_single_entity_no_findings(self):
+        text = "Acme GmbH\nMusterstr 1\nUSt-IdNr DE123456789"
+        assert check_multi_entity_impressum(
+            {"doc_texts": {"impressum": text}}
+        ) == []
+
+    def test_both_entities_have_ust_id_no_finding(self):
+        text = (
+            "Acme GmbH\nUSt-IdNr DE111111111\n\n"
+            "Foo Holding GmbH\nUSt-IdNr DE222222222\n"
+        )
+        ust_findings = [
+            f for f in check_multi_entity_impressum(
+                {"doc_texts": {"impressum": text}}
+            )
+            if f["check_id"] == "IMPRESSUM-MULTI-UST_ID"
+        ]
+        assert ust_findings == []