fix(b9): clean entity names in multi-entity-impressum (GT IMPRESSUM-001)
Der Multi-Entity-Check fängt Elli's USt-IdNr-Lücke (VW Group Charging
GmbH hat keine, Elli Mobility GmbH hat eine), aber Entity-Namen waren
mit Header-Noise verunreinigt:
'Impressum\n\nVolkswagen Group Charging GmbH'
'eco\n\nElli Mobility GmbH'
Behoben:
- _ENTITY_PAT lässt nur Space im Namen zu (kein \s/\n mehr)
- _clean_entity_name() trimmt Header-Worte (Impressum, Anbieter, ...)
und nimmt nur die letzte Zeile vor Legal-Form-Suffix
- 11 neue Tests, davon einer mit Elli-like Impressum als
Charakterisierungs-Test
Damit ist die finale Finding-Ausgabe für Audit-Reports lesbar
('Fehlt bei: Volkswagen Group Charging GmbH') statt verunreinigt.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -23,11 +23,17 @@ import re
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Matches a company name that ends in a legal-form suffix, e.g.
# "Elli Mobility GmbH". Group 1 is the full name including the suffix.
#
# - The name part only allows plain spaces (no \s), so a header on a previous
#   line ("Impressum\n\n...") can no longer be pulled into the name. The
#   separator before the suffix is still \s+ and may therefore be a newline.
# - (?!\w) requires the suffix to end at a word boundary. Because the pattern
#   is case-insensitive, "SE"/"AG"/"UG"/"KG" would otherwise match the start
#   of ordinary words ("Wir sehen" -> "Wir se").
# - "KGaA" is listed before "KG" so the longer legal form wins; with the
#   boundary check in place "KG" alone would reject "Merck KGaA".
_ENTITY_PAT = re.compile(
    r"([A-ZÄÖÜ][\w\-\& ]{1,50}?\s+(?:GmbH|AG|UG|KGaA|KG|SE|"
    r"e\.V\.|GbR|OHG|Limited|Ltd|LLC)(?!\w))",
    re.IGNORECASE,
)
|
||||
|
||||
# Leading header noise that may precede a company name on the same line:
# a known header word, an "@handle", or anything containing "@" (e-mail
# residue), optionally followed by one separator (":", "|" or "-") and
# surrounding whitespace.
#
# Alternation is tried left to right, so longer header words must come before
# their prefixes: with "Anbieter" first, "Anbieterkennzeichnung: X" was only
# stripped up to "Anbieter", leaving "kennzeichnung: X" in the name.
_NAME_NOISE_PAT = re.compile(
    r"^(?:Impressum|Anbieterkennzeichnung|Anbieter|Diensteanbieter|"
    r"Verantwortlich(?:er)?|Kontakt|Adresse|@\S+|.+@.+)\s*[:|\-]?\s*",
    re.IGNORECASE,
)
|
||||
|
||||
# Matches a VAT-ID label ("USt-Id", "UStId", "USt-IdNr.", "VAT", "VAT-Id", ...)
# followed by exactly one separator character (colon, dot or whitespace) with
# optional whitespace around it. Group 1 captures the ID itself: either "DE"
# plus 8-10 digits or any two letters plus 6-12 digits. Case-insensitive.
_USTID_PAT = re.compile(r"\b(?:USt-?Id(?:Nr)?\.?|VAT(?:-?Id)?)\s*[:.\s]\s*"
                        r"(DE\d{8,10}|[A-Z]{2}\d{6,12})", re.IGNORECASE)
|
||||
_HR_PAT = re.compile(r"\b(?:HR[BA]|Handelsregister|Registergericht)"
|
||||
@@ -36,6 +42,17 @@ _GF_PAT = re.compile(r"(?:Geschäftsführer|Vertretungsberechtigt|"
|
||||
r"vertreten\s+durch)\s*[:.\s]+", re.IGNORECASE)
|
||||
|
||||
|
||||
def _clean_entity_name(raw: str) -> str:
    """Return *raw* reduced to a single-line, single-spaced entity name.

    Steps, in order:
      1. Trim surrounding whitespace.
      2. If the text spans several lines, keep only the last one (earlier
         lines are header text captured in front of the company name).
      3. Remove a leading match of ``_NAME_NOISE_PAT`` and trim again.
      4. Collapse every whitespace run into one space.
    """
    # rpartition yields the whole string as its tail when there is no "\n",
    # so single-line input passes through this step unchanged.
    last_line = raw.strip().rpartition("\n")[2].strip()
    without_header = _NAME_NOISE_PAT.sub("", last_line).strip()
    return re.sub(r"\s+", " ", without_header)
|
||||
|
||||
|
||||
def _slice_entities(text: str) -> list[tuple[str, str]]:
|
||||
"""Return [(entity_name, text_slice)] for each detected entity."""
|
||||
matches = list(_ENTITY_PAT.finditer(text))
|
||||
@@ -45,7 +62,7 @@ def _slice_entities(text: str) -> list[tuple[str, str]]:
|
||||
for i, m in enumerate(matches):
|
||||
start = m.start()
|
||||
end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
|
||||
slices.append((m.group(1).strip(), text[start:end]))
|
||||
slices.append((_clean_entity_name(m.group(1)), text[start:end]))
|
||||
return slices
|
||||
|
||||
|
||||
|
||||
@@ -0,0 +1,109 @@
|
||||
"""Tests for B9 Multi-Entity-Impressum-Check.
|
||||
|
||||
GT-Anker: Elli IMPRESSUM-001 — bei mehreren Entitäten USt-IdNr nur
|
||||
bei einer von zwei vorhanden → Finding muss per-Entity erkennen.
|
||||
"""
|
||||
|
||||
from compliance.services.impressum_multi_entity_check import (
|
||||
_clean_entity_name,
|
||||
_slice_entities,
|
||||
check_multi_entity_impressum,
|
||||
)
|
||||
|
||||
|
||||
_ELLI_LIKE = """
|
||||
Impressum
|
||||
|
||||
Volkswagen Group Charging GmbH
|
||||
Karl-Liebknecht-Str. 32
|
||||
10178 Berlin
|
||||
Amtsgericht Charlottenburg HRB 208967 B
|
||||
Geschäftsführer: Giovanni Palazzo, Mark Möller
|
||||
Telefon: 00800 3554 1111
|
||||
E-Mail: info@elli.eco
|
||||
|
||||
Elli Mobility GmbH
|
||||
Karl-Liebknecht-Str. 32
|
||||
10178 Berlin
|
||||
Amtsgericht Charlottenburg HRB 274616 B
|
||||
USt-IdNr.: DE814424009
|
||||
Geschäftsführer: Joschi Jennermann
|
||||
Telefon: 00800 00002030
|
||||
E-Mail: ellimobility@elli.eco
|
||||
"""
|
||||
|
||||
|
||||
class TestCleanEntityName:
    """Unit tests for _clean_entity_name on raw regex captures."""

    def test_strips_header_prefix(self):
        # A header line captured above the company name must be dropped.
        assert _clean_entity_name(
            "Impressum\n\nVolkswagen Group Charging GmbH"
        ) == "Volkswagen Group Charging GmbH"

    def test_strips_email_artifact(self):
        # mid-text matches may capture an email TLD before the next entity
        assert _clean_entity_name(
            "eco\n\nElli Mobility GmbH"
        ) == "Elli Mobility GmbH"

    def test_collapses_whitespace(self):
        # The input must contain real whitespace runs (several spaces, a tab);
        # a single-spaced input would pass even without any collapsing.
        assert _clean_entity_name(
            "  Acme   Holding \t GmbH  "
        ) == "Acme Holding GmbH"

    def test_clean_name_passes_through(self):
        # An already clean name is returned unchanged.
        assert _clean_entity_name("Elli Mobility GmbH") == "Elli Mobility GmbH"
|
||||
|
||||
|
||||
class TestSliceEntities:
    """Tests for _slice_entities: entity detection and name cleanup."""

    def test_no_entities_empty(self):
        # Text without any legal-form suffix produces no slices.
        result = _slice_entities("Random text without companies.")
        assert result == []

    def test_single_entity_no_slice(self):
        # Single-entity impressum is the normal case — multi-entity-only.
        single_entity_text = "Acme GmbH\nMusterstr 1\n"
        assert _slice_entities(single_entity_text) == []

    def test_two_entities_clean_names(self):
        # Both entity names must come back without header or e-mail residue.
        expected_names = [
            "Volkswagen Group Charging GmbH",
            "Elli Mobility GmbH",
        ]
        found_names = [
            entity_name for entity_name, _text in _slice_entities(_ELLI_LIKE)
        ]
        assert found_names == expected_names
|
||||
|
||||
|
||||
class TestCheck:
    """Tests for check_multi_entity_impressum on raw impressum text."""

    @staticmethod
    def _ust_id_findings(impressum_text):
        """Run the check on *impressum_text* and keep only USt-ID findings."""
        all_findings = check_multi_entity_impressum(
            {"doc_texts": {"impressum": impressum_text}}
        )
        return [
            finding
            for finding in all_findings
            if finding["check_id"] == "IMPRESSUM-MULTI-UST_ID"
        ]

    def test_elli_pattern_finds_missing_ust_id(self):
        # Two entities, only one with a USt-IdNr: exactly one finding that
        # names the entity lacking it.
        hits = self._ust_id_findings(_ELLI_LIKE)
        assert len(hits) == 1
        finding = hits[0]
        assert finding["entities_missing"] == ["Volkswagen Group Charging GmbH"]
        assert finding["entities_present"] == ["Elli Mobility GmbH"]
        assert finding["severity"] == "MEDIUM"
        assert "TMG" in finding["norm"]

    def test_no_impressum_no_findings(self):
        # Without an impressum document there is nothing to report.
        state = {"doc_texts": {}}
        assert check_multi_entity_impressum(state) == []

    def test_single_entity_no_findings(self):
        # A single-entity impressum yields no findings at all.
        state = {
            "doc_texts": {
                "impressum": "Acme GmbH\nMusterstr 1\nUSt-IdNr DE123456789",
            }
        }
        assert check_multi_entity_impressum(state) == []

    def test_both_entities_have_ust_id_no_finding(self):
        # When every entity carries its own USt-IdNr, no USt-ID finding is raised.
        both_with_id = (
            "Acme GmbH\nUSt-IdNr DE111111111\n\n"
            "Foo Holding GmbH\nUSt-IdNr DE222222222\n"
        )
        assert self._ust_id_findings(both_with_id) == []
|
||||
Reference in New Issue
Block a user