"""B9 — Multi-Entity-Impressum-Check. Findings, wenn ein Impressum mehrere Entitäten (mehrere GmbH/AG/UG) nennt, aber Pflichtangaben nur bei einer davon vollständig sind. Konkreter Elli-Pattern (GT IMPRESSUM-001): - Entity 1: "Elli Mobility GmbH ... USt-IdNr DE814424009 ..." - Entity 2: "VW Group Charging GmbH ... [keine USt-IdNr] ..." → USt-IdNr fehlt bei Entity 2. Heuristik: 1. Entitäten erkennen: jede Match auf " (GmbH|AG|UG|KG|SE)" als Entity-Boundary; Text-Slice von dort bis zur nächsten Entity. 2. Pro Entity prüfen: USt-IdNr, Handelsregister, Vertretungsberechtigte. 3. Wenn Entity N ein Feld nennt, das Entity M nicht hat → MEDIUM. """ from __future__ import annotations import logging import re logger = logging.getLogger(__name__) _ENTITY_PAT = re.compile( r"([A-ZÄÖÜ][\w\-\& ]{1,50}?\s+(?:GmbH|AG|UG|KG|SE|" r"e\.V\.|GbR|OHG|Limited|Ltd|LLC))", re.IGNORECASE, ) _NAME_NOISE_PAT = re.compile( r"^(?:Impressum|Anbieter|Anbieterkennzeichnung|Diensteanbieter|" r"Verantwortlich(?:er)?|Kontakt|Adresse|@\S+|.+@.+)\s*[:|\-]?\s*", re.IGNORECASE, ) # HRB-/HRA-Eintrag pro Entity. Dies ist der stärkste Anker: jede # juristische Person muss in Deutschland einen eindeutigen # HRB/HRA-Eintrag haben. Mehrere HRB-Vorkommen = mehrere Entities. _HRB_PAT = re.compile( r"HR[BA]\s*(?:Nr\.?|Nummer)?\s*\d+(?:\s*[A-Z])?", re.IGNORECASE, ) # Worte, die NIE ein gültiger Firmenname sind. Filtern False-Positives # wie "Programmierung der Webseite Elli Mobility GmbH" oder # "Umsatzsteueridentifikationsnummer der Elli Mobility GmbH". _NAME_BLOCKLIST = ( "programmierung", "webseite", "umsatzsteueridentifik", "schlichtungsstelle", "auftragsverarbeitung", "haftung", "verantwortlich", "diensteanbieter", "geschäftsführer", "geschaeftsfuehrer", "vorstand", "gesellschaftsregister", "registergericht", "registriert", ) _USTID_PAT = re.compile( # Form A: Abkürzung mit Separator (USt-IdNr.: DE…) r"(?:" r"\b(?:USt-?Id(?:Nr)?\.?|VAT(?:-?Id)?)\s*[:.\s]\s*" r"(DE\d{8,10}|[A-Z]{2}\d{6,12})" r"|" # Form B: Vollform mit Bridge (Umsatzsteueridentifikationsnummer # der Elli Mobility GmbH ist DE…). Max 80 Zeichen Bridge, # \n erlaubt (Cap schützt vor Cross-Paragraph-Drift). r"\bUmsatzsteuer[\s-]?Id(?:entifikationsnummer)?\b[\s\S]{0,80}?" r"(DE\d{8,10}|[A-Z]{2}\d{6,12})" r")", re.IGNORECASE, ) _HR_PAT = re.compile(r"\b(?:HR[BA]|Handelsregister|Registergericht)" r"\s*[:.\s]*([\w\s\d\-/]{4,80})", re.IGNORECASE) _GF_PAT = re.compile(r"(?:Geschäftsführer|Vertretungsberechtigt|" r"vertreten\s+durch)\s*[:.\s]+", re.IGNORECASE) _LEADING_NOISE_RE = re.compile( r"^(?:eco|com|de|net|" # email/domain TLD-Artefakte r"Die|Der|Das|" # deutsche Artikel r"www\.[\w\.-]+|" # URL-Reste r"@[\w\.-]+)\s+", re.IGNORECASE, ) def _clean_entity_name(raw: str) -> str: """Strip leading header noise, leading artifacts + collapse whitespace.""" name = raw.strip() # If the match spans multiple lines, keep only the last line. if "\n" in name: name = name.rsplit("\n", 1)[-1].strip() name = _NAME_NOISE_PAT.sub("", name).strip() # Strip leading email-TLD-artifacts ("eco "), German articles # ("Die ", "Der "), URL-fragments etc. — iterate until stable. for _ in range(3): new = _LEADING_NOISE_RE.sub("", name) if new == name: break name = new # Strip leading lowercase prose words (English connectors leaking # into the match: "If ...", "by ...", "according to ..."). # Drop tokens until we hit the first Capitalized one. tokens = name.split() while tokens and not tokens[0][:1].isupper(): tokens = tokens[1:] name = " ".join(tokens) return re.sub(r"\s+", " ", name).strip() def _dedup_substring(slices: list[tuple[str, str]]) -> list[tuple[str, str]]: """Collapse entities whose names are substrings of each other. 'mfi Immobilien Marketing GmbH' and 'Marketing GmbH' both refer to the same legal person — keep only the longest unique name. """ sorted_by_len = sorted(slices, key=lambda x: -len(x[0])) kept: list[tuple[str, str]] = [] kept_names_lc: list[str] = [] for name, slc in sorted_by_len: nl = name.lower() if any(nl in k or k in nl for k in kept_names_lc): continue kept.append((name, slc)) kept_names_lc.append(nl) return kept def _name_is_blocked(name: str) -> bool: nl = name.lower() if any(b in nl for b in _NAME_BLOCKLIST): return True # Minimum-name-quality: must have ≥ 2 words, at least one ≥ 4 chars # before the legal-form suffix. Filters out "Se", "As a se" frags. parts = name.strip().split() if len(parts) < 2: return True # Strip legal-form from the end if present legal_suffixes = { "gmbh", "ag", "ug", "kg", "se", "e.v.", "gbr", "ohg", "limited", "ltd", "llc", } if parts[-1].lower() in legal_suffixes: non_suffix = parts[:-1] else: non_suffix = parts if not non_suffix or len(non_suffix) < 1: return True # At least one company-name token ≥ 4 chars and capitalized if not any(p[0].isupper() and len(p) >= 4 for p in non_suffix): return True return False def _slice_entities(text: str) -> list[tuple[str, str]]: """Return [(entity_name, text_slice)] for each detected entity. Anker-Strategie: jeder eigenständige HRB/HRA-Eintrag markiert eine Entity. Pro HRB-Block laufen wir RÜCKWÄRTS bis zum vorherigen Block-Ende und nehmen den letzten Legal-Form-Match (GmbH/AG/...) in diesem Fenster als Firmennamen. Falls keine HRBs vorhanden sind (z.B. e.V. ohne HR-Pflicht), fallen wir auf den alten Legal-Form-Anker zurück. """ hrb_matches = list(_HRB_PAT.finditer(text)) if len(hrb_matches) >= 2: slices: list[tuple[str, str]] = [] seen_names: set[str] = set() for i, h in enumerate(hrb_matches): # Window: from end of previous HRB (or start) to this HRB. win_start = hrb_matches[i - 1].end() if i > 0 else 0 win_end = h.start() window = text[win_start:win_end] # Find the LAST legal-form-match in the window — that's # the entity-name closest to this HRB-marker. legal_matches = list(_ENTITY_PAT.finditer(window)) if not legal_matches: continue name_raw = legal_matches[-1].group(1) name = _clean_entity_name(name_raw) if not name or _name_is_blocked(name): continue if name in seen_names: continue seen_names.add(name) # Block-Slice: from the matched name to the END of this # HRB-block (next HRB-start or EOF). slice_start = win_start + legal_matches[-1].start() slice_end = (hrb_matches[i + 1].start() if i + 1 < len(hrb_matches) else len(text)) slices.append((name, text[slice_start:slice_end])) slices = _dedup_substring(slices) if len(slices) >= 2: return slices # Fallback (no HRBs / nur einer / parsing scheiterte): # alter Legal-Form-Anker mit Blocklist-Filter + dedup. matches = list(_ENTITY_PAT.finditer(text)) if len(matches) < 2: return [] slices = [] seen_names = set() for i, m in enumerate(matches): name = _clean_entity_name(m.group(1)) if not name or _name_is_blocked(name) or name in seen_names: continue seen_names.add(name) start = m.start() end = matches[i + 1].start() if i + 1 < len(matches) else len(text) slices.append((name, text[start:end])) slices = _dedup_substring(slices) return slices if len(slices) >= 2 else [] def check_multi_entity_impressum(state: dict) -> list[dict]: doc_texts = state.get("doc_texts") or {} imp = doc_texts.get("impressum") or "" if not imp: return [] slices = _slice_entities(imp) if not slices: return [] # Compute features per entity features = [] for name, slc in slices: features.append({ "name": name, "ust_id": bool(_USTID_PAT.search(slc)), "hr": bool(_HR_PAT.search(slc)), "gf": bool(_GF_PAT.search(slc)), }) # If ALL share the same flags → no inconsistency findings: list[dict] = [] for field, label in ( ("ust_id", "USt-IdNr."), ("hr", "Handelsregister-Eintrag"), ("gf", "Vertretungsberechtigte"), ): present = [f for f in features if f[field]] missing = [f for f in features if not f[field]] if present and missing and len(present) >= 1: findings.append({ "check_id": f"IMPRESSUM-MULTI-{field.upper()}", "severity": "MEDIUM", "severity_reason": "incomplete", "title": ( f"{label} fehlt bei " f"{len(missing)} von {len(features)} Entitäten" ), "norm": "§ 5 Abs. 1 TMG (Pflichtangabe pro Diensteanbieter)", "entities_present": [f["name"] for f in present], "entities_missing": [f["name"] for f in missing], "action": ( f"{label} im Impressum für " f"{', '.join(f['name'] for f in missing)} ergänzen. " "Pflichtangabe ist pro Diensteanbieter zu erfüllen, " "nicht 'eine reicht für alle'." ), }) if findings: logger.info("B9 multi-entity impressum: %d findings", len(findings)) return findings