fix(impressum): P9 — 7 False-Positive-Fixes in Pflichtangaben-Checks
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 16s
CI / go-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / detect-changes (push) Successful in 10s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / secret-scan (push) Has been skipped
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / validate-canonical-controls (push) Successful in 16s
CI / loc-budget (push) Failing after 16s
CI / go-lint (push) Has been skipped
CI / nodejs-build (push) Has been skipped
CI / test-go (push) Has been skipped
CI / iace-gt-coverage (push) Has been skipped
CI / test-python-backend (push) Successful in 37s
CI / test-python-document-crawler (push) Has been skipped
CI / test-python-dsms-gateway (push) Has been skipped
#1 Name des Anbieters: \b Word-Boundary verhindert "ag" in "samstag", plus "aktiengesellschaft" als Volltreffer. #2 Vertretungsberechtigte: Klammer-Liste-Pattern erkennt jetzt BMW-Format "Vorstand (Milan Nedeljkovic, Jochen Goller, ...)" plus "Vorsitzender des Aufsichtsrats: Name". #3 V.i.S.d.P.: war schon INFO, OK. #4 OS-Plattform/VSBG: bei no_direct_sales=True (OEM-Pattern) jetzt als "Nicht anwendbar" skipped statt 0/1 fail. Profile fliesst neu durch check_document_completeness -> runner. #5 Zustaendige Kammer: IHK + Handwerkskammer + Tieraerztekammer in Pattern aufgenommen + severity LOW -> INFO (konditional). #6 Stammkapital: war schon INFO, OK. #7 Link-Disclaimer: neue Check-Eigenschaft "invert"=True. Anti-Pattern ist passed, wenn NICHT gefunden, fail, wenn gefunden. Vorher feuerte das Finding immer, jetzt nur, wenn ein illegaler Disclaimer im Text ist. Plus: L2-INFO-Checks (z.B. profession_chamber) zaehlen nicht mehr in correctness-pct und erzeugen keine DSI-DETAIL-Findings. Konsistent mit P8-Modell: INFO = "selbst pruefen", nicht "fail". Verifiziert mit BMW-Impressum-Text — alle 7 Faelle korrekt klassifiziert: name=passed, representative_person=passed, profession_chamber=INFO, illegal_disclaimer=passed (kein Disclaimer im Text), dispute_resolution=skipped (no_direct_sales), editorial_visdp=INFO, share_capital=INFO. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -317,6 +317,7 @@ async def _run_compliance_check(check_id: str, req: ComplianceCheckRequest):
|
|||||||
text, doc_type, label, url,
|
text, doc_type, label, url,
|
||||||
entry["word_count"], use_agent_flag,
|
entry["word_count"], use_agent_flag,
|
||||||
business_scope=business_scope,
|
business_scope=business_scope,
|
||||||
|
business_profile={"no_direct_sales": getattr(profile, "no_direct_sales", False)},
|
||||||
)
|
)
|
||||||
|
|
||||||
# Apply profile context filter
|
# Apply profile context filter
|
||||||
@@ -1001,6 +1002,7 @@ async def _check_single(
|
|||||||
text: str, doc_type: str, label: str, url: str,
|
text: str, doc_type: str, label: str, url: str,
|
||||||
word_count: int, use_agent: bool,
|
word_count: int, use_agent: bool,
|
||||||
business_scope: set[str] | None = None,
|
business_scope: set[str] | None = None,
|
||||||
|
business_profile: dict | None = None,
|
||||||
):
|
):
|
||||||
"""Run regex + MC checks on a single document."""
|
"""Run regex + MC checks on a single document."""
|
||||||
from compliance.services.doc_checks.runner import check_document_completeness
|
from compliance.services.doc_checks.runner import check_document_completeness
|
||||||
@@ -1008,7 +1010,8 @@ async def _check_single(
|
|||||||
from .agent_doc_check_routes import CheckItem, DocCheckResult
|
from .agent_doc_check_routes import CheckItem, DocCheckResult
|
||||||
|
|
||||||
# Regex checklist
|
# Regex checklist
|
||||||
findings = check_document_completeness(text, doc_type, label, url)
|
findings = check_document_completeness(text, doc_type, label, url,
|
||||||
|
business_profile=business_profile)
|
||||||
|
|
||||||
all_checks: list[CheckItem] = []
|
all_checks: list[CheckItem] = []
|
||||||
completeness = 0
|
completeness = 0
|
||||||
|
|||||||
@@ -16,8 +16,9 @@ IMPRESSUM_CHECKLIST = [
|
|||||||
"label": "Name des Anbieters",
|
"label": "Name des Anbieters",
|
||||||
"level": 1, "parent": None,
|
"level": 1, "parent": None,
|
||||||
"patterns": [
|
"patterns": [
|
||||||
r"(?:gmbh|ag|e\.v\.|ohg|kg|gbr|ug|mbh|inc|ltd)",
|
# Word-Boundaries verhindern Falsch-Treffer ("ag" in "samstag")
|
||||||
r"firma", r"unternehmen",
|
r"\b(?:gmbh|ag|e\.v\.|ohg|kg|gbr|ug|mbh|inc|ltd|aktiengesellschaft|kommanditgesellschaft|partnerschaft\s+mbb)\b",
|
||||||
|
r"\bfirma\s+\w+", r"\bunternehmen\s+\w+",
|
||||||
],
|
],
|
||||||
"severity": "HIGH",
|
"severity": "HIGH",
|
||||||
"hint": "§5(1) Nr.1 TMG: Vollstaendiger Firmenname MIT Rechtsform (z.B. 'Muster GmbH', nicht nur 'Muster'). Bei Einzelunternehmen: Vor- und Nachname plus ggf. Geschaeftsbezeichnung. Haeufiger Abmahngrund: Nur Markenname ohne juristische Person.",
|
"hint": "§5(1) Nr.1 TMG: Vollstaendiger Firmenname MIT Rechtsform (z.B. 'Muster GmbH', nicht nur 'Muster'). Bei Einzelunternehmen: Vor- und Nachname plus ggf. Geschaeftsbezeichnung. Haeufiger Abmahngrund: Nur Markenname ohne juristische Person.",
|
||||||
@@ -178,9 +179,13 @@ IMPRESSUM_CHECKLIST = [
|
|||||||
"label": "Name der vertretungsberechtigten Person",
|
"label": "Name der vertretungsberechtigten Person",
|
||||||
"level": 2, "parent": "representative",
|
"level": 2, "parent": "representative",
|
||||||
"patterns": [
|
"patterns": [
|
||||||
r"(?:gesch(?:ae|ä)ftsf(?:ue|ü)hr\w*|vorstand|inhaber)\s*:?\s*[a-zA-Z\u00c0-\u017e]",
|
r"(?:gesch(?:ae|ä)ftsf(?:ue|ü)hr\w*|vorstand|inhaber|aufsichtsrats?)\s*[:\-]?\s*[a-zA-Z\u00c0-\u017e]",
|
||||||
r"(?:vertreten\s+durch|repr(?:ae|ä)sentiert)\s*:?\s*[a-zA-Z\u00c0-\u017e]",
|
# "Vorstand (Milan Nedeljkovic, ...)" - BMW-Pattern mit Klammer-Liste
|
||||||
r"(?:gesch(?:ae|ä)ftsf(?:ue|ü)hrung)\s*:?\s*(?:dr\.?\s+|prof\.?\s+)?[a-zA-Z\u00c0-\u017e]",
|
r"(?:vorstand|gesch(?:ae|ä)ftsf(?:ue|ü)hrung|aufsichtsrats?)\s*\(\s*[a-zA-Z\u00c0-\u017e]",
|
||||||
|
r"(?:vertreten\s+durch|repr(?:ae|ä)sentiert)\s*[:\-]?\s*(?:den\s+vorstand\s*\(?|[a-zA-Z\u00c0-\u017e])",
|
||||||
|
r"(?:gesch(?:ae|ä)ftsf(?:ue|ü)hrung)\s*[:\-]?\s*(?:dr\.?\s+|prof\.?\s+)?[a-zA-Z\u00c0-\u017e]",
|
||||||
|
# "Vorsitzender des Aufsichtsrats: Nicolas Peter"
|
||||||
|
r"(?:vorsitzend\w+|stellv\w*\s+vorsitz\w*)\s+(?:des\s+\w+\s*)?[:\-]?\s*[a-zA-Z\u00c0-\u017e]",
|
||||||
],
|
],
|
||||||
"severity": "LOW",
|
"severity": "LOW",
|
||||||
"hint": "Voller Vor- und Nachname mit Funktionsbezeichnung erforderlich (z.B. 'Geschaeftsfuehrung: Dr. Max Mustermann').",
|
"hint": "Voller Vor- und Nachname mit Funktionsbezeichnung erforderlich (z.B. 'Geschaeftsfuehrung: Dr. Max Mustermann').",
|
||||||
@@ -234,11 +239,12 @@ IMPRESSUM_CHECKLIST = [
|
|||||||
"label": "Zustaendige Kammer benannt",
|
"label": "Zustaendige Kammer benannt",
|
||||||
"level": 2, "parent": "regulated_profession",
|
"level": 2, "parent": "regulated_profession",
|
||||||
"patterns": [
|
"patterns": [
|
||||||
r"(?:(?:ae|ä)rztekammer|rechtsanwaltskammer|steuerberaterkammer|architektenkammer|ingenieurkammer|apothekerkammer)",
|
r"(?:(?:ae|ä)rztekammer|rechtsanwaltskammer|steuerberaterkammer|architektenkammer|ingenieurkammer|apothekerkammer|handwerkskammer|tier(?:ae|ä)rztekammer|psychotherapeutenkammer)",
|
||||||
|
r"\bihk\b|industrie-?\s+und\s+handelskammer",
|
||||||
r"(?:mitglied|zugelassen|eingetragen)\s+(?:bei|in|der)\s+(?:der\s+)?(?:\w+)?kammer",
|
r"(?:mitglied|zugelassen|eingetragen)\s+(?:bei|in|der)\s+(?:der\s+)?(?:\w+)?kammer",
|
||||||
],
|
],
|
||||||
"severity": "LOW",
|
"severity": "INFO", # P9: konditional - nur kammerpflichtige Berufe
|
||||||
"hint": "Zustaendige Kammer mit vollem Namen und Sitz nennen (z.B. 'Rechtsanwaltskammer Muenchen').",
|
"hint": "Zustaendige Kammer mit vollem Namen und Sitz nennen (z.B. 'Rechtsanwaltskammer Muenchen', 'IHK Muenchen'). Nur relevant fuer kammerpflichtige Berufe.",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
"id": "profession_title",
|
"id": "profession_title",
|
||||||
@@ -314,6 +320,7 @@ IMPRESSUM_CHECKLIST = [
|
|||||||
r"distanzier|macht\s+sich\s+(?:nicht|kein)\s+(?:zu\s+eigen|verantwortlich)",
|
r"distanzier|macht\s+sich\s+(?:nicht|kein)\s+(?:zu\s+eigen|verantwortlich)",
|
||||||
],
|
],
|
||||||
"severity": "LOW",
|
"severity": "LOW",
|
||||||
|
"invert": True, # Anti-Pattern: passed wenn NICHT gefunden
|
||||||
"hint": "Der klassische Link-Disclaimer ('Wir distanzieren uns von verlinkten Inhalten') ist seit BGH (I ZR 317/01) rechtlich wirkungslos. Empfehlung: Entfernen Sie pauschale Disclaimer — sie schuetzen nicht und koennen kontraproduktiv sein.",
|
"hint": "Der klassische Link-Disclaimer ('Wir distanzieren uns von verlinkten Inhalten') ist seit BGH (I ZR 317/01) rechtlich wirkungslos. Empfehlung: Entfernen Sie pauschale Disclaimer — sie schuetzen nicht und koennen kontraproduktiv sein.",
|
||||||
},
|
},
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -5,6 +5,8 @@ Pass 1: Run all L1 checks ("Is it mentioned?")
|
|||||||
Pass 2: Run L2 checks only where their L1 parent passed ("Is it correct?")
|
Pass 2: Run L2 checks only where their L1 parent passed ("Is it correct?")
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
|
||||||
@@ -83,6 +85,7 @@ def check_document_completeness(
|
|||||||
doc_type: str,
|
doc_type: str,
|
||||||
doc_title: str,
|
doc_title: str,
|
||||||
doc_url: str,
|
doc_url: str,
|
||||||
|
business_profile: dict | None = None,
|
||||||
) -> list[dict]:
|
) -> list[dict]:
|
||||||
"""Check a legal document against its type-specific requirements.
|
"""Check a legal document against its type-specific requirements.
|
||||||
|
|
||||||
@@ -90,9 +93,20 @@ def check_document_completeness(
|
|||||||
L1 — Is the mandatory field mentioned at all?
|
L1 — Is the mandatory field mentioned at all?
|
||||||
L2 — Is it correct/complete? (only checked if L1 parent passed)
|
L2 — Is it correct/complete? (only checked if L1 parent passed)
|
||||||
|
|
||||||
|
business_profile (optional) wird genutzt um Checks die fuer das
|
||||||
|
spezifische Unternehmen nicht anwendbar sind als 'skipped' zu
|
||||||
|
markieren (z.B. OS-Plattform/VSBG bei no_direct_sales=True).
|
||||||
|
|
||||||
Returns a list of findings (summary + missing items).
|
Returns a list of findings (summary + missing items).
|
||||||
"""
|
"""
|
||||||
findings = []
|
findings = []
|
||||||
|
no_direct_sales = bool((business_profile or {}).get("no_direct_sales"))
|
||||||
|
# P9: Welche Check-IDs sind bei OEM-Konfigurator-Pattern obsolet.
|
||||||
|
skip_check_ids: set[str] = set()
|
||||||
|
if no_direct_sales:
|
||||||
|
skip_check_ids.update([
|
||||||
|
"dispute_resolution", # OS-Plattform / VSBG nur B2C-Direkthaendler
|
||||||
|
])
|
||||||
# Strip soft hyphens ( / \xad) that CMS tools insert for word-breaking
|
# Strip soft hyphens ( / \xad) that CMS tools insert for word-breaking
|
||||||
# — they break regex matches on compound words like "Datenübertragbarkeit"
|
# — they break regex matches on compound words like "Datenübertragbarkeit"
|
||||||
text_clean = text.replace("\xad", "").replace("­", "")
|
text_clean = text.replace("\xad", "").replace("­", "")
|
||||||
@@ -135,7 +149,24 @@ def check_document_completeness(
|
|||||||
|
|
||||||
for check in l1_checks:
|
for check in l1_checks:
|
||||||
is_info = check.get("severity") == "INFO"
|
is_info = check.get("severity") == "INFO"
|
||||||
|
# P9: Profil-basiertes Skip (OEM-Pattern -> OS-Plattform raus)
|
||||||
|
if check["id"] in skip_check_ids:
|
||||||
|
all_checks.append({
|
||||||
|
"id": check["id"], "label": check["label"],
|
||||||
|
"passed": False, "severity": "INFO",
|
||||||
|
"matched_text": "", "level": 1, "parent": None,
|
||||||
|
"skipped": True,
|
||||||
|
"hint": "Nicht anwendbar: Unternehmen betreibt keinen "
|
||||||
|
"Direkt-Vertrieb an Verbraucher (OEM-Konfigurator-Pattern).",
|
||||||
|
})
|
||||||
|
continue
|
||||||
match = _match_patterns(check["patterns"], text_lower)
|
match = _match_patterns(check["patterns"], text_lower)
|
||||||
|
# P9: "invert"=True bedeutet Anti-Pattern (z.B. illegaler Link-
|
||||||
|
# Disclaimer): passed wenn NICHT gefunden, fail wenn gefunden.
|
||||||
|
if check.get("invert"):
|
||||||
|
passed = match is None
|
||||||
|
match = None if passed else match
|
||||||
|
else:
|
||||||
passed = match is not None
|
passed = match is not None
|
||||||
if passed:
|
if passed:
|
||||||
passed_l1_ids.add(check["id"])
|
passed_l1_ids.add(check["id"])
|
||||||
@@ -168,18 +199,26 @@ def check_document_completeness(
|
|||||||
|
|
||||||
for check in l2_checks:
|
for check in l2_checks:
|
||||||
parent = check.get("parent")
|
parent = check.get("parent")
|
||||||
|
is_info = check.get("severity") == "INFO"
|
||||||
skipped = parent not in passed_l1_ids
|
skipped = parent not in passed_l1_ids
|
||||||
passed = False
|
passed = False
|
||||||
matched_text = ""
|
matched_text = ""
|
||||||
|
|
||||||
if not skipped:
|
if not skipped:
|
||||||
l2_total += 1
|
|
||||||
match = _match_patterns(check["patterns"], text_lower)
|
match = _match_patterns(check["patterns"], text_lower)
|
||||||
passed = match is not None
|
passed = match is not None
|
||||||
|
# P9: INFO-L2-Checks (konditional, z.B. Kammer) zaehlen NICHT
|
||||||
|
# in correctness-pct und erscheinen nicht als Fail-Finding.
|
||||||
|
if is_info:
|
||||||
if passed:
|
if passed:
|
||||||
|
matched_text = _extract_context(text_lower, match)
|
||||||
|
# weder l2_total++ noch findings.append: kein Fail-Eintrag
|
||||||
|
else:
|
||||||
|
l2_total += 1
|
||||||
|
if passed and not is_info:
|
||||||
l2_passed += 1
|
l2_passed += 1
|
||||||
matched_text = _extract_context(text_lower, match)
|
matched_text = _extract_context(text_lower, match)
|
||||||
else:
|
elif not passed and not is_info:
|
||||||
findings.append({
|
findings.append({
|
||||||
"code": f"DSI-DETAIL-{check['id'].upper()}",
|
"code": f"DSI-DETAIL-{check['id'].upper()}",
|
||||||
"severity": check.get("severity", "MEDIUM"),
|
"severity": check.get("severity", "MEDIUM"),
|
||||||
|
|||||||
Reference in New Issue
Block a user