diff --git a/backend-compliance/compliance/api/agent_doc_check_report.py b/backend-compliance/compliance/api/agent_doc_check_report.py
new file mode 100644
index 0000000..47e9aff
--- /dev/null
+++ b/backend-compliance/compliance/api/agent_doc_check_report.py
@@ -0,0 +1,175 @@
+"""
+HTML email report builder for document checks.
+
+Generates a styled HTML report similar to the frontend ChecklistView,
+including L1/L2 check hierarchy, progress bars, and actionable hints.
+"""
+
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from .agent_doc_check_routes import CheckItem, DocCheckResult
+
+
+def _bar(pct: int, color: str) -> str:
+    bg = {"green": "#22c55e", "yellow": "#eab308", "red": "#ef4444", "blue": "#60a5fa"}
+    c = bg.get(color, "#60a5fa")
+    return (
+        f'<div style="display:inline-block;width:140px;height:10px;background:#e5e7eb;'
+        f'border-radius:5px;vertical-align:middle"><div style="width:{pct}%;height:10px;'
+        f'background:{c};border-radius:5px"></div></div> {pct}%'
+    )
+
+
+def _icon(passed: bool, skipped: bool = False) -> str:
+    if skipped:
+        return '<span style="color:#9ca3af">—</span>'
+    if passed:
+        return '<span style="color:#22c55e">✓</span>'
+    return '<span style="color:#ef4444">✗</span>'
+
+
+def _hint_box(hint: str) -> str:
+    return (
+        f'<div style="background:#fefce8;border-left:3px solid #eab308;'
+        f'padding:6px 10px;margin:4px 0;font-size:13px">{hint}</div>'
+    )
+
+
+def build_html_report(results: list[DocCheckResult], cookie_result: dict | None) -> str:
+    """Assemble the full HTML body of the report email."""
+    ok_count = sum(1 for r in results if r.completeness_pct == 100)
+    html = [
+        '<div style="font-family:Arial,sans-serif;max-width:680px;margin:0 auto">',
+        '<h2 style="margin-bottom:4px">Dokumenten-Pruefung</h2>',
+        f'<p style="color:#6b7280;margin-top:0">'
+        f'{len(results)} Dokumente, {ok_count} vollstaendig</p>',
+    ]
+
+    for r in results:
+        _render_document(html, r)
+
+    if cookie_result:
+        _render_cookie_banner(html, cookie_result)
+
+    html.append('</div>')
+    return "".join(html)

[… _render_document() and _render_cookie_banner() helpers omitted …]

diff --git a/backend-compliance/compliance/api/agent_doc_check_routes.py b/backend-compliance/compliance/api/agent_doc_check_routes.py
--- a/backend-compliance/compliance/api/agent_doc_check_routes.py
+++ b/backend-compliance/compliance/api/agent_doc_check_routes.py
@@ … @@
+        …{summary}",
+        body_html=summary,
+    )
     response = DocCheckResponse(
@@ -284,40 +284,49 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
    Detects sections like 'Cookies', 'Social Media', 'Dienste von Drittanbietern'
    and classifies each by document type for separate checking.
+    Deduplicates: if the same doc_type appears twice, their texts are merged.
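+
+    Example: a page with two 'Social Media' headings yields a single
+    'social_media' section containing both text blocks.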
"""
import re as _re
- sections = []
+ sections: list[dict] = []
+ seen_types: dict[str, int] = {} # doc_type -> index in sections
- # Split by lines that look like headings (short, followed by longer content)
lines = text.split("\n")
current_heading = ""
- current_text = []
+ current_text: list[str] = []
+
+    def _save_section(heading: str, text_lines: list[str]) -> None:
+        sec_text = "\n".join(text_lines)
+        if len(sec_text.split()) < 100:
+            return
+        sec_type = _classify_section(heading)
+        if not sec_type:
+            return
+        # Merge duplicate doc_types (e.g. two "Social Media" headings)
+        if sec_type in seen_types:
+            idx = seen_types[sec_type]
+            sections[idx]["text"] += "\n\n" + sec_text
+ sections[idx]["word_count"] = len(sections[idx]["text"].split())
+ else:
+ seen_types[sec_type] = len(sections)
+ sections.append({
+ "title": f"{parent_label} > {heading}",
+ "text": sec_text,
+ "doc_type": sec_type,
+ "word_count": len(sec_text.split()),
+ })
    for line in lines:
        stripped = line.strip()
-        # Detect heading: short line (< 80 chars), not empty, followed by content
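+        # Heading heuristic: short capitalized line with no sentence-final punctuation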
        is_heading = (
            5 < len(stripped) < 80
            and not stripped.endswith(".")
            and not stripped.endswith(",")
            and stripped[0].isupper()
        )
-
-        # Skip-headings should NOT start a new section — their text
-        # belongs to the previous section (e.g. "Risikoabwägung" inside DSFA)
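+        # Skip-headings never start a new section; their text stays with the
+        # previous one (e.g. "Risikoabwägung" inside a DSFA section)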
        is_skip = is_heading and stripped.lower().strip() in SKIP_HEADINGS
-        if is_heading and not is_skip and current_heading and len("\n".join(current_text)) > 200:
-            # Save previous section
-            sec_text = "\n".join(current_text)
-            sec_type = _classify_section(current_heading)
-            if sec_type and sec_type != "skip":
-                sections.append({
-                    "title": f"{parent_label} > {current_heading}",
-                    "text": sec_text,
-                    "doc_type": sec_type,
-                    "word_count": len(sec_text.split()),
-                })
+        if is_heading and not is_skip and current_heading:
+            _save_section(current_heading, current_text)
        if is_heading and not is_skip:
            current_heading = stripped
@@ -326,16 +335,8 @@ def _split_into_sections(text: str, parent_label: str, url: str) -> list[dict]:
            current_text.append(line)
    # Last section
-    if current_heading and len("\n".join(current_text)) > 200:
-        sec_text = "\n".join(current_text)
-        sec_type = _classify_section(current_heading)
-        if sec_type and sec_type != "skip":
-            sections.append({
-                "title": f"{parent_label} > {current_heading}",
-                "text": sec_text,
-                "doc_type": sec_type,
-                "word_count": len(sec_text.split()),
-            })
+    if current_heading:
+        _save_section(current_heading, current_text)
    return sections
@@ -347,6 +348,10 @@ SKIP_HEADINGS = {
"risikoabwaegung und datenschutzfolgenabschaetzung",
}
+# Doc types that may legitimately appear under more than one heading on a page
+# (e.g. two "Social Media" blocks); _save_section merges such duplicates.
+_DEDUP_TYPES = {"social_media", "cookie", "dsfa", "widerruf", "impressum"}
+
def _classify_section(heading: str) -> str | None:
"""Classify a section heading into a document type."""
@@ -377,41 +382,5 @@ async def _check_cookie_banner(url: str) -> dict | None:
def _build_report(results: list[DocCheckResult], cookie_result: dict | None) -> str:
- """Build email report."""
- parts = [
- "DOKUMENTEN-PRUEFUNG",
- f"Dokumente geprueft: {len(results)}",
- "",
- ]
- for r in results:
- status = "OK" if r.completeness_pct == 100 else "LUECKENHAFT" if r.completeness_pct >= 50 else "MANGELHAFT"
- if r.error:
- status = "FEHLER"
- detail = f", Korrektheit {r.correctness_pct}%" if r.correctness_pct else ""
- parts.append(f"[{status}] {r.label} ({r.completeness_pct}%{detail}, {r.word_count} Woerter)")
-
- for check in r.checks:
- if check.skipped:
- continue
- icon = "+" if check.passed else "!!"
- indent = " " if check.level == 2 else " "
- parts.append(f"{indent}[{icon}] {check.label}")
-
- if r.error:
- parts.append(f" FEHLER: {r.error}")
- parts.append("")
-
- if cookie_result:
- parts.extend([
- "Cookie-Banner Pruefung:",
- f" Banner erkannt: {cookie_result.get('banner_detected', False)}",
- f" Anbieter: {cookie_result.get('banner_provider', 'unbekannt')}",
- ])
- violations = cookie_result.get("banner_checks", {}).get("violations", [])
- if violations:
- for v in violations[:10]:
- parts.append(f" [!!] {v.get('text', '')[:80]}")
- else:
- parts.append(" Keine Verstoesse erkannt.")
-
- return "\n".join(parts)
+    from .agent_doc_check_report import build_html_report
+    return build_html_report(results, cookie_result)