feat(agent): Impressum-Tab auf Haupt-Engine + Profil/§36-Fixes
Ergebnis-Tab rendert jetzt result.results (Haupt-Doc-Check) statt des abweichenden v3-Agenten — BMW korrekt statt False Positives: - DocResultView: ein Dokument als Pflichtangaben-Tabelle (Label + gefundener Text + 3-Tier-Status), KEINE MC-IDs. ComplianceResultTabs speist Tabs aus result.results; ChecklistView-Bausteine exportiert + wiederverwendet. - profile_extractor: Firmenname/Rechtsform = fruehester Treffer + ausgeschriebene Formen (Aktiengesellschaft) -> BMW AG statt "juris GmbH". - §36 VSBG (MC-010): reines b2c -> POSSIBLY_APPLICABLE (Pruef-Hinweis) statt MEDIUM-FAIL; hart nur bei ecommerce. possibly_hint pro MC. - McCoverage traegt label + found (Snippet); mc_possibly-Aggregat. - AgentFindingCard/Methodik: interne check_id/mc_id nicht mehr angezeigt. Tests: test_four_status (16) + Frontend-Vitest gruen; CI-Suite 206, v3/GT unveraendert. Nur eigene Dateien (geteilter Tree). Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -10,13 +10,14 @@ Returns a dict that maps to CompanyProfile and ScopeProfilingAnswer fields.
|
||||
|
||||
import logging
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def extract_profile_from_documents(
|
||||
doc_texts: dict[str, str],
|
||||
business_profile: dict | None = None,
|
||||
business_profile: Optional[dict] = None,
|
||||
) -> dict:
|
||||
"""Extract Company Profile fields from document texts.
|
||||
|
||||
@@ -100,28 +101,38 @@ def _extract_company_info(text: str, result: dict) -> None:
|
||||
"""Extract company name, legal form, address from text."""
|
||||
cp = result["company_profile"]
|
||||
|
||||
# GmbH / AG / UG / e.K. etc.
|
||||
legal_forms = {
|
||||
r"(\S+(?:\s+\S+){0,4})\s+gmbh\b": ("GmbH", "gmbh"),
|
||||
r"(\S+(?:\s+\S+){0,4})\s+ag\b": ("AG", "ag"),
|
||||
r"(\S+(?:\s+\S+){0,4})\s+ug\b": ("UG", "ug"),
|
||||
r"(\S+(?:\s+\S+){0,4})\s+e\.?\s*k\.?\b": ("e.K.", "ek"),
|
||||
r"(\S+(?:\s+\S+){0,4})\s+gbr\b": ("GbR", "gbr"),
|
||||
r"(\S+(?:\s+\S+){0,4})\s+ohg\b": ("OHG", "ohg"),
|
||||
r"(\S+(?:\s+\S+){0,4})\s+gmbh\s*&\s*co\.?\s*kg": ("GmbH & Co. KG", "gmbh_co_kg"),
|
||||
}
|
||||
# Rechtsform + Firmenname. Die Reihenfolge der Muster ist NICHT die
|
||||
# Priorität — wir nehmen den FRUEHESTEN Treffer im Text: ein Impressum
|
||||
# nennt den Betreiber zuerst; spätere Erwähnungen (z.B. "juris GmbH" im
|
||||
# Hinweis auf gesetze-im-internet.de) sind nicht der Anbieter. Ausge-
|
||||
# schriebene Formen ("Aktiengesellschaft") zählen mit (sonst wird BMW AG
|
||||
# nicht erkannt und faelschlich die naechste GmbH gegriffen).
|
||||
legal_forms = [
|
||||
(r"(\S+(?:\s+\S+){0,4})\s+gmbh\s*&\s*co\.?\s*kg\b", "gmbh_co_kg"),
|
||||
(r"(\S+(?:\s+\S+){0,4})\s+(?:aktiengesellschaft|ag)\b", "ag"),
|
||||
(r"(\S+(?:\s+\S+){0,4})\s+(?:unternehmergesellschaft|ug)\b", "ug"),
|
||||
(r"(\S+(?:\s+\S+){0,4})\s+gmbh\b", "gmbh"),
|
||||
(r"(\S+(?:\s+\S+){0,4})\s+e\.?\s*k\.?\b", "ek"),
|
||||
(r"(\S+(?:\s+\S+){0,4})\s+gbr\b", "gbr"),
|
||||
(r"(\S+(?:\s+\S+){0,4})\s+ohg\b", "ohg"),
|
||||
]
|
||||
text_lower = text.lower()
|
||||
for pattern, (form_label, form_id) in legal_forms.items():
|
||||
best = None # (start, end, form_id) — frühester Treffer
|
||||
for pattern, form_id in legal_forms:
|
||||
m = re.search(pattern, text_lower)
|
||||
if m:
|
||||
raw_name = m.group(0).strip()
|
||||
# Clean up: take from uppercase start
|
||||
for i, ch in enumerate(text[m.start():m.end()]):
|
||||
if ch.isupper():
|
||||
cp["companyName"] = text[m.start() + i:m.end()].strip()
|
||||
break
|
||||
cp["legalForm"] = form_id
|
||||
break
|
||||
# frühester Treffer gewinnt; bei Gleichstand die Listen-Reihenfolge
|
||||
# (GmbH & Co. KG vor GmbH).
|
||||
if m and (best is None or m.start() < best[0]):
|
||||
best = (m.start(), m.end(), form_id)
|
||||
if best:
|
||||
start, end, form_id = best
|
||||
# Firmenname ab dem ersten Grossbuchstaben im Treffer (schneidet
|
||||
# führende Kleinwörter wie "von der" ab).
|
||||
for i, ch in enumerate(text[start:end]):
|
||||
if ch.isupper():
|
||||
cp["companyName"] = text[start + i:end].strip()
|
||||
break
|
||||
cp["legalForm"] = form_id
|
||||
|
||||
# PLZ + Ort
|
||||
plz_match = re.search(
|
||||
|
||||
@@ -107,6 +107,11 @@ class McCoverage(BaseModel):
|
||||
mc_id: str
|
||||
status: str
|
||||
reason: str = ""
|
||||
# Menschlicher Feldname (für die Pflichtangaben-Tabelle im Frontend —
|
||||
# NICHT die mc_id zeigen, sonst Reverse-Engineering der MC-Bibliothek).
|
||||
label: str = ""
|
||||
# Der tatsächlich gefundene Text/Wert (Snippet) bei status=ok.
|
||||
found: str = ""
|
||||
|
||||
|
||||
class EscalationLog(BaseModel):
|
||||
|
||||
@@ -78,6 +78,25 @@ def _build_measure(label: str, norm: str) -> str:
|
||||
return msg
|
||||
|
||||
|
||||
def _line_of(text: str, start_pos: int, end_pos: int) -> str:
|
||||
"""Die Zeile um einen Regex-Treffer — als 'gefundener Wert' für die
|
||||
Pflichtangaben-Tabelle. Gekappt + bereinigt."""
|
||||
start = text.rfind("\n", 0, start_pos) + 1
|
||||
end = text.find("\n", end_pos)
|
||||
if end == -1:
|
||||
end = len(text)
|
||||
return " ".join(text[start:end].split())[:160]
|
||||
|
||||
|
||||
def _coverage(mc, status: str, reason: str, found: str = "") -> McCoverage:
    """Build a McCoverage entry for *mc* that carries its human-readable label.

    The label is copied from ``mc.label`` so the frontend can display it
    instead of the ``mc_id`` (protection against reverse engineering).
    """
    fields = {
        "mc_id": mc.mc_id,
        "status": status,
        "reason": reason,
        "label": mc.label,
        "found": found,
    }
    return McCoverage(**fields)
|
||||
|
||||
|
||||
class ImpressumAgent(BaseSpecialistAgent):
|
||||
agent_id = "impressum"
|
||||
agent_version = "3.0"
|
||||
@@ -103,10 +122,7 @@ class ImpressumAgent(BaseSpecialistAgent):
|
||||
if len(text) < 100:
|
||||
# Doc zu kurz — alle eigenen Pattern-IDs als skipped
|
||||
for mc in MCS:
|
||||
coverage.append(McCoverage(
|
||||
mc_id=mc.mc_id, status="skipped",
|
||||
reason="text too short",
|
||||
))
|
||||
coverage.append(_coverage(mc, "skipped", "text too short"))
|
||||
return self._finalize(
|
||||
start, findings, esc_logs, coverage,
|
||||
confidence=0.0,
|
||||
@@ -129,27 +145,35 @@ class ImpressumAgent(BaseSpecialistAgent):
|
||||
for mc in MCS:
|
||||
disp = scope_disposition(mc, scope, is_auto)
|
||||
if disp == "na":
|
||||
coverage.append(McCoverage(
|
||||
mc_id=mc.mc_id, status="na",
|
||||
reason="nicht anwendbar (Rechtsform/Branche)",
|
||||
))
|
||||
coverage.append(_coverage(
|
||||
mc, "na", "nicht anwendbar (Rechtsform/Branche)"))
|
||||
continue
|
||||
if any(p.search(text) for p in mc.patterns):
|
||||
coverage.append(McCoverage(
|
||||
mc_id=mc.mc_id, status="ok", reason="Pattern-Treffer",
|
||||
matched = None
|
||||
for p in mc.patterns:
|
||||
m = p.search(text)
|
||||
if m:
|
||||
matched = m
|
||||
break
|
||||
if matched is not None:
|
||||
coverage.append(_coverage(
|
||||
mc, "ok", "Pattern-Treffer",
|
||||
found=_line_of(text, matched.start(), matched.end()),
|
||||
))
|
||||
continue
|
||||
if mc.optional:
|
||||
# fehlt + optional → KEIN Finding (z.B. USt-IdNr;
|
||||
# Kleinunternehmer §19 haben legitim keine).
|
||||
coverage.append(McCoverage(
|
||||
mc_id=mc.mc_id, status="na",
|
||||
reason="optional — nicht angegeben",
|
||||
))
|
||||
coverage.append(_coverage(
|
||||
mc, "na", "optional — nicht angegeben"))
|
||||
continue
|
||||
if disp == "possible":
|
||||
# Graubereich (z.B. Corporate-Blog → §18 MStV evtl.) →
|
||||
# POSSIBLY_APPLICABLE: Pruef-Hinweis (LOW), kein Verstoss.
|
||||
# Graubereich (z.B. Corporate-Blog → §18, OEM-Markenseite →
|
||||
# §36 VSBG) → POSSIBLY_APPLICABLE: Pruef-Hinweis (LOW),
|
||||
# kein Verstoss. Hinweistext kommt MC-spezifisch.
|
||||
hint = mc.possibly_hint or (
|
||||
f"Diese Angabe ist nur situativ Pflicht ({mc.norm}). "
|
||||
"Bitte prüfen, ob sie auf Ihre Seite zutrifft."
|
||||
)
|
||||
findings.append(Finding(
|
||||
check_id=f"IMP-{mc.field_id}",
|
||||
agent=self.agent_id,
|
||||
@@ -161,12 +185,7 @@ class ImpressumAgent(BaseSpecialistAgent):
|
||||
title=f"{mc.label}: ggf. relevant — manuell prüfen",
|
||||
norm=mc.norm,
|
||||
evidence="",
|
||||
action=(
|
||||
"Bei journalistisch-redaktionellen Inhalten "
|
||||
"(Nachrichten/Magazin) ist ein Verantwortlicher nach "
|
||||
"§ 18 MStV anzugeben. Bei reinem Corporate-Blog meist "
|
||||
"nicht erforderlich — bitte prüfen."
|
||||
),
|
||||
action=hint,
|
||||
confidence=0.5,
|
||||
sources=[EvidenceSource(
|
||||
source_type=SourceType.REGEX,
|
||||
@@ -175,10 +194,8 @@ class ImpressumAgent(BaseSpecialistAgent):
|
||||
confidence=0.5,
|
||||
)],
|
||||
))
|
||||
coverage.append(McCoverage(
|
||||
mc_id=mc.mc_id, status="possibly_applicable",
|
||||
reason="Graubereich — manuelle Prüfung",
|
||||
))
|
||||
coverage.append(_coverage(
|
||||
mc, "possibly_applicable", "Graubereich — manuelle Prüfung"))
|
||||
continue
|
||||
if mc.legal_form_dependent and not form_known:
|
||||
# Rechtsform unbestimmt → kein hartes FAIL, sondern
|
||||
@@ -207,10 +224,8 @@ class ImpressumAgent(BaseSpecialistAgent):
|
||||
confidence=0.4,
|
||||
)],
|
||||
))
|
||||
coverage.append(McCoverage(
|
||||
mc_id=mc.mc_id, status="insufficient_evidence",
|
||||
reason="Rechtsform unbestimmt",
|
||||
))
|
||||
coverage.append(_coverage(
|
||||
mc, "insufficient_evidence", "Rechtsform unbestimmt"))
|
||||
continue
|
||||
sev = _SEV_TO_ENUM.get(mc.severity_if_missing, Severity.MEDIUM)
|
||||
findings.append(Finding(
|
||||
@@ -233,10 +248,8 @@ class ImpressumAgent(BaseSpecialistAgent):
|
||||
confidence=0.9,
|
||||
)],
|
||||
))
|
||||
coverage.append(McCoverage(
|
||||
mc_id=mc.mc_id, status=sev.value.lower(),
|
||||
reason="kein Pattern-Treffer",
|
||||
))
|
||||
coverage.append(_coverage(
|
||||
mc, sev.value.lower(), "kein Pattern-Treffer"))
|
||||
n_fail = sum(1 for f in findings
|
||||
if f.status == CheckStatus.FAIL.value)
|
||||
n_unklar = sum(1 for f in findings
|
||||
|
||||
@@ -40,6 +40,9 @@ class MC:
|
||||
# ist die MC NICHT hart anwendbar, sondern POSSIBLY_APPLICABLE — Pruef-
|
||||
# Hinweis (severity LOW) statt FAIL. Z.B. Corporate-Blog (§18 MStV evtl.).
|
||||
possibly_applies_scope: tuple[str, ...] = field(default_factory=tuple)
|
||||
# MC-spezifischer Pruef-Hinweis fuer den POSSIBLY_APPLICABLE-Fall
|
||||
# (warum Graubereich + was der Nutzer pruefen soll).
|
||||
possibly_hint: str = ""
|
||||
|
||||
|
||||
MCS: tuple[MC, ...] = (
|
||||
@@ -182,6 +185,11 @@ MCS: tuple[MC, ...] = (
|
||||
severity_if_missing="MEDIUM",
|
||||
requires_scope=("editorial",),
|
||||
possibly_applies_scope=("editorial_possible",),
|
||||
possibly_hint=(
|
||||
"Bei journalistisch-redaktionellen Inhalten (Nachrichten/Magazin) "
|
||||
"ist ein Verantwortlicher nach § 18 MStV anzugeben. Bei reinem "
|
||||
"Corporate-Blog meist nicht erforderlich — bitte prüfen."
|
||||
),
|
||||
patterns=(re.compile(
|
||||
r"(?:Verantwortlich(?:er|e)?\s+(?:f(?:ue|ü)r|i\.S\.d\.|"
|
||||
r"nach|gem(?:ae|ä)ß)\s+§\s*18|"
|
||||
@@ -194,9 +202,17 @@ MCS: tuple[MC, ...] = (
|
||||
mc_id="IMP-MC-010",
|
||||
field_id="verbraucher_streitbeilegung",
|
||||
label="Verbraucher-Streitbeilegung-Hinweis",
|
||||
norm="§ 36 VSBG (B2C-Anbieter Pflicht)",
|
||||
norm="§ 36 VSBG (Verbraucherverträge über die Website)",
|
||||
severity_if_missing="MEDIUM",
|
||||
requires_scope=("ecommerce", "b2c"),
|
||||
# Hart nur bei echtem Online-Verkauf; reine B2C-Orientierung (z.B.
|
||||
# OEM-Markenseite, Verkauf über Händler) = Graubereich → Prüf-Hinweis.
|
||||
requires_scope=("ecommerce",),
|
||||
possibly_applies_scope=("b2c",),
|
||||
possibly_hint=(
|
||||
"§ 36 VSBG gilt, wenn auf dieser Seite Verbraucherverträge "
|
||||
"geschlossen werden. Bei reiner Marken-/Info-Seite (Verkauf über "
|
||||
"Händler/Vertragspartner) meist nicht erforderlich — bitte prüfen."
|
||||
),
|
||||
patterns=(re.compile(
|
||||
r"(?:Verbraucherschlichtungs|VSBG|"
|
||||
r"Streitbeilegung|"
|
||||
|
||||
@@ -178,6 +178,7 @@ def test_editorial_possible_yields_possibly_applicable():
|
||||
assert red.status == CheckStatus.POSSIBLY_APPLICABLE.value
|
||||
assert red.severity == Severity.LOW.value
|
||||
assert out.mc_possibly >= 1
|
||||
assert "§ 18 MStV" in red.action # MC-spezifischer Hinweis
|
||||
|
||||
|
||||
def test_editorial_absent_is_not_applicable():
|
||||
@@ -193,3 +194,30 @@ def test_derive_scope_editorial_tiers():
|
||||
# Medienunternehmen gewinnt — nicht beide Tokens.
|
||||
s = _derive_scope({"industry": "media", "has_editorial_content": True})
|
||||
assert "editorial" in s and "editorial_possible" not in s
|
||||
|
||||
|
||||
# ── §36 VSBG Graubereich (BMW-Fall): reines b2c ≠ harter Verstoß ────
|
||||
|
||||
|
||||
def test_vsbg_b2c_is_possibly_applicable():
    """Pure B2C scope makes § 36 VSBG a grey area, not a MEDIUM fail.

    Example: an OEM brand site that sells through dealers (the BMW
    false-positive fix).
    """
    agent_input = AgentInput(
        doc_type="impressum",
        text=TEXT_NO_LEGAL_FORM,
        business_scope=["b2c"],
    )
    out = asyncio.run(ImpressumAgent().evaluate(agent_input))

    vsbg = _by_field(out, "verbraucher_streitbeilegung")
    assert vsbg is not None
    assert vsbg.status == CheckStatus.POSSIBLY_APPLICABLE.value
    assert vsbg.severity == Severity.LOW.value
    # The hint must be the § 36 one, not the § 18 one.
    assert "VSBG" in vsbg.action
|
||||
|
||||
|
||||
def test_vsbg_ecommerce_is_hard_fail():
    """A real online shop (ecommerce scope) makes § 36 VSBG a hard MEDIUM fail."""
    agent_input = AgentInput(
        doc_type="impressum",
        text=TEXT_NO_LEGAL_FORM,
        business_scope=["ecommerce"],
    )
    out = asyncio.run(ImpressumAgent().evaluate(agent_input))

    vsbg = _by_field(out, "verbraucher_streitbeilegung")
    assert vsbg is not None
    assert vsbg.status == CheckStatus.FAIL.value
    assert vsbg.severity == Severity.MEDIUM.value
|
||||
|
||||
Reference in New Issue
Block a user