feat: HTML email report with hints + fix duplicate Social Media sections
Build + Deploy / build-admin-compliance (push) Successful in 1m45s
Build + Deploy / build-backend-compliance (push) Successful in 9s
Build + Deploy / build-ai-sdk (push) Successful in 36s
Build + Deploy / build-developer-portal (push) Successful in 7s
Build + Deploy / build-tts (push) Successful in 7s
Build + Deploy / build-document-crawler (push) Successful in 8s
Build + Deploy / build-dsms-gateway (push) Successful in 7s
Build + Deploy / build-dsms-node (push) Successful in 8s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 15s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m47s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 44s
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Successful in 26s
CI / test-python-dsms-gateway (push) Successful in 22s
CI / validate-canonical-controls (push) Successful in 15s
Build + Deploy / trigger-orca (push) Successful in 2m23s
Build + Deploy / build-admin-compliance (push) Successful in 1m45s
Build + Deploy / build-backend-compliance (push) Successful in 9s
Build + Deploy / build-ai-sdk (push) Successful in 36s
Build + Deploy / build-developer-portal (push) Successful in 7s
Build + Deploy / build-tts (push) Successful in 7s
Build + Deploy / build-document-crawler (push) Successful in 8s
Build + Deploy / build-dsms-gateway (push) Successful in 7s
Build + Deploy / build-dsms-node (push) Successful in 8s
CI / branch-name (push) Has been skipped
CI / guardrail-integrity (push) Has been skipped
CI / loc-budget (push) Failing after 15s
CI / secret-scan (push) Has been skipped
CI / go-lint (push) Has been skipped
CI / python-lint (push) Has been skipped
CI / nodejs-lint (push) Has been skipped
CI / nodejs-build (push) Successful in 2m47s
CI / dep-audit (push) Has been skipped
CI / sbom-scan (push) Has been skipped
CI / test-go (push) Failing after 44s
CI / test-python-backend (push) Successful in 41s
CI / test-python-document-crawler (push) Successful in 26s
CI / test-python-dsms-gateway (push) Successful in 22s
CI / validate-canonical-controls (push) Successful in 15s
Build + Deploy / trigger-orca (push) Successful in 2m23s
1. Email report now renders as styled HTML (matching the frontend design):
   - Progress bars (green = completeness, blue = correctness)
   - Hierarchical L1→L2 check display
   - Red hint boxes under failed checks explaining what to fix
   - Matched text evidence for passed checks
2. Section splitter deduplicates: two "Social Media" headings on the same page are merged into one section instead of creating duplicates.
3. Extracted the report builder to agent_doc_check_report.py (175 LOC) to keep the routes file under 500 LOC (now 386 LOC).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -141,7 +141,7 @@ async def _run_doc_check(check_id: str, req: DocCheckRequest):
|
||||
email_result = send_email(
|
||||
recipient=req.recipient,
|
||||
subject=f"[DOKUMENTEN-PRUEFUNG] {len(results)} Dokumente geprueft",
|
||||
body_html=f"<pre>{summary}</pre>",
|
||||
body_html=summary,
|
||||
)
|
||||
|
||||
response = DocCheckResponse(
|
||||
@@ -284,40 +284,49 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
|
||||
|
||||
Detects sections like 'Cookies', 'Social Media', 'Dienste von Drittanbietern'
|
||||
and classifies each by document type for separate checking.
|
||||
Deduplicates: if the same doc_type appears twice, texts are merged.
|
||||
"""
|
||||
import re as _re
|
||||
sections = []
|
||||
sections: list[dict] = []
|
||||
seen_types: dict[str, int] = {} # doc_type -> index in sections
|
||||
|
||||
# Split by lines that look like headings (short, followed by longer content)
|
||||
lines = text.split("\n")
|
||||
current_heading = ""
|
||||
current_text = []
|
||||
current_text: list[str] = []
|
||||
|
||||
def _save_section(heading: str, text_lines: list[str]) -> None:
|
||||
sec_text = "\n".join(text_lines)
|
||||
if len(sec_text.split()) < 100:
|
||||
return
|
||||
sec_type = _classify_section(heading)
|
||||
if not sec_type:
|
||||
return
|
||||
# Merge duplicate doc_types (e.g. two "Social Media" headings)
|
||||
if sec_type in seen_types:
|
||||
idx = seen_types[sec_type]
|
||||
sections[idx]["text"] += "\n\n" + sec_text
|
||||
sections[idx]["word_count"] = len(sections[idx]["text"].split())
|
||||
else:
|
||||
seen_types[sec_type] = len(sections)
|
||||
sections.append({
|
||||
"title": f"{parent_label} > {heading}",
|
||||
"text": sec_text,
|
||||
"doc_type": sec_type,
|
||||
"word_count": len(sec_text.split()),
|
||||
})
|
||||
|
||||
for line in lines:
|
||||
stripped = line.strip()
|
||||
# Detect heading: short line (6-79 chars) that starts with an uppercase letter and does not end with "." or ","
|
||||
is_heading = (
|
||||
5 < len(stripped) < 80
|
||||
and not stripped.endswith(".")
|
||||
and not stripped.endswith(",")
|
||||
and stripped[0].isupper()
|
||||
)
|
||||
|
||||
# Skip-headings should NOT start a new section — their text
|
||||
# belongs to the previous section (e.g. "Risikoabwägung" inside DSFA)
|
||||
is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS
|
||||
|
||||
if is_heading and not is_skip and current_heading and len("\n".join(current_text)) > 200:
|
||||
# Save previous section
|
||||
sec_text = "\n".join(current_text)
|
||||
sec_type = _classify_section(current_heading)
|
||||
if sec_type and sec_type != "skip":
|
||||
sections.append({
|
||||
"title": f"{parent_label} > {current_heading}",
|
||||
"text": sec_text,
|
||||
"doc_type": sec_type,
|
||||
"word_count": len(sec_text.split()),
|
||||
})
|
||||
if is_heading and not is_skip and current_heading:
|
||||
_save_section(current_heading, current_text)
|
||||
|
||||
if is_heading and not is_skip:
|
||||
current_heading = stripped
|
||||
@@ -326,16 +335,8 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
|
||||
current_text.append(line)
|
||||
|
||||
# Last section
|
||||
if current_heading and len("\n".join(current_text)) > 200:
|
||||
sec_text = "\n".join(current_text)
|
||||
sec_type = _classify_section(current_heading)
|
||||
if sec_type and sec_type != "skip":
|
||||
sections.append({
|
||||
"title": f"{parent_label} > {current_heading}",
|
||||
"text": sec_text,
|
||||
"doc_type": sec_type,
|
||||
"word_count": len(sec_text.split()),
|
||||
})
|
||||
if current_heading:
|
||||
_save_section(current_heading, current_text)
|
||||
|
||||
return sections
|
||||
|
||||
@@ -347,6 +348,10 @@ SKIP_HEADINGS = {
|
||||
"risikoabwaegung und datenschutzfolgenabschaetzung",
|
||||
}
|
||||
|
||||
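# Section types intended for deduplication into a single sub-document
# NOTE(review): the merge logic visible in _split_into_sections tracks seen types in its own `seen_types` dict and merges every repeated doc_type; confirm this set is actually used.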
|
||||
# (e.g. two "Social Media" headings on the same page)
|
||||
_DEDUP_TYPES = {"social_media", "cookie", "dsfa", "widerruf", "impressum"}
|
||||
|
||||
|
||||
def _classify_section(heading: str) -> str | None:
|
||||
"""Classify a section heading into a document type."""
|
||||
@@ -377,41 +382,5 @@ async def _check_cookie_banner(url: str) -> dict | None:
|
||||
|
||||
|
||||
def _build_report(results: list[DocCheckResult], cookie_result: dict | None) -> str:
|
||||
"""Build email report."""
|
||||
parts = [
|
||||
"DOKUMENTEN-PRUEFUNG",
|
||||
f"Dokumente geprueft: {len(results)}",
|
||||
"",
|
||||
]
|
||||
for r in results:
|
||||
status = "OK" if r.completeness_pct == 100 else "LUECKENHAFT" if r.completeness_pct >= 50 else "MANGELHAFT"
|
||||
if r.error:
|
||||
status = "FEHLER"
|
||||
detail = f", Korrektheit {r.correctness_pct}%" if r.correctness_pct else ""
|
||||
parts.append(f"[{status}] {r.label} ({r.completeness_pct}%{detail}, {r.word_count} Woerter)")
|
||||
|
||||
for check in r.checks:
|
||||
if check.skipped:
|
||||
continue
|
||||
icon = "+" if check.passed else "!!"
|
||||
indent = " " if check.level == 2 else " "
|
||||
parts.append(f"{indent}[{icon}] {check.label}")
|
||||
|
||||
if r.error:
|
||||
parts.append(f" FEHLER: {r.error}")
|
||||
parts.append("")
|
||||
|
||||
if cookie_result:
|
||||
parts.extend([
|
||||
"Cookie-Banner Pruefung:",
|
||||
f" Banner erkannt: {cookie_result.get('banner_detected', False)}",
|
||||
f" Anbieter: {cookie_result.get('banner_provider', 'unbekannt')}",
|
||||
])
|
||||
violations = cookie_result.get("banner_checks", {}).get("violations", [])
|
||||
if violations:
|
||||
for v in violations[:10]:
|
||||
parts.append(f" [!!] {v.get('text', '')[:80]}")
|
||||
else:
|
||||
parts.append(" Keine Verstoesse erkannt.")
|
||||
|
||||
return "\n".join(parts)
|
||||
from .agent_doc_check_report import build_html_report
|
||||
return build_html_report(results, cookie_result)
|
||||
|
||||
Reference in New Issue
Block a user