feat: Document-centric scan results + DSI deduplication

DSI Dedup (consent-tester):
- Only H1/H2 headings count as documents (not H3/H4 sub-sections)
- Sub-sections (Cookies, Betroffenenrechte, Social Media) are part of the
  parent document's full text, not separate documents
- Reduces IHK result from 30 to ~11 real documents
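
The heading-level rule can be sketched as follows. This is a minimal
illustration, not the consent-tester implementation: the function name,
the `(level, title, text)` tuple shape, and the sample headings are all
hypothetical.

```python
# Sketch of the H1/H2-only document split: H1/H2 start a new document,
# H3/H4 sub-sections are folded into the preceding document's full text.
# Names and data shapes are illustrative, not the consent-tester API.

def group_into_documents(headings: list[tuple[int, str, str]]) -> list[dict]:
    """Treat H1/H2 as document starts; merge H3/H4 text into the parent."""
    docs: list[dict] = []
    for level, title, text in headings:
        if level <= 2 or not docs:
            # H1/H2 (or a leading orphan section) starts a new document
            docs.append({"title": title, "text": text})
        else:
            # H3/H4 sub-section: part of the parent document, not its own doc
            docs[-1]["text"] += "\n" + text
    return docs


headings = [
    (2, "Datenschutzerklaerung", "Praeambel ..."),
    (3, "Cookies", "Wir verwenden ..."),
    (3, "Betroffenenrechte", "Sie haben ..."),
    (2, "Impressum", "Angaben gemaess ..."),
]
docs = group_into_documents(headings)
# four headings collapse into two documents
```

Applied to a page like the IHK site, this is what shrinks the raw heading
count down to the handful of real documents.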

Backend (agent_scan_routes):
- ScanFinding gets a doc_title field linking each finding to its document
- doc_title is set when creating DSI findings, enabling document attribution

Frontend (ScanResult.tsx):
- 3 sections: Services table, Document cards, General findings
- Documents: expandable cards with completeness bar (green/yellow/red)
- Findings grouped under their parent document
- Each card shows: title, word count, findings count, % completeness
- Findings without doc_title go to "Allgemeine Findings" section
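
The grouping that both the frontend and the email summary rely on can be
sketched in plain Python. The thresholds mirror the ones in
build_scan_summary (>= 80 OK, >= 50 LUECKENHAFT, else MANGELHAFT); the
dict-based finding shape and function names here are simplified stand-ins,
not the actual models.

```python
# Minimal sketch of the doc_title grouping: findings with a doc_title go
# into a per-document bucket, the rest into a general bucket. Data shapes
# are simplified stand-ins for the real ScanFinding model.

def group_findings(findings: list[dict]) -> tuple[dict[str, list], list]:
    """Split findings into per-document buckets and a general bucket."""
    by_doc: dict[str, list] = {}
    general: list = []
    for f in findings:
        if f.get("doc_title"):
            by_doc.setdefault(f["doc_title"], []).append(f)
        else:
            general.append(f)
    return by_doc, general


def status_for(pct: int) -> str:
    """Completeness status, same thresholds as the email summary."""
    return "OK" if pct >= 80 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT"


findings = [
    {"doc_title": "Datenschutzerklaerung", "text": "Keine Rechtsgrundlage genannt"},
    {"doc_title": "", "text": "Kein Cookie-Banner gefunden"},
]
by_doc, general = group_findings(findings)
# one per-document bucket, one general finding
```

The same two-bucket split drives the card grouping in ScanResult.tsx and
the sectioned email text.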

Email Summary (agent_scan_helpers):
- Findings listed under their parent document
- General findings in separate section
- No more flat mixed list
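
For illustration, the grouped summary now reads roughly like this (sample
titles, percentages, and finding texts are made up, not real scan output):

```text
Rechtliche Dokumente (2)
  [OK] Datenschutzerklaerung (92%, 1840 Woerter)
      !! Keine Rechtsgrundlage fuer Cookies genannt
  [LUECKENHAFT] Impressum (61%, 240 Woerter)

Allgemeine Findings
  [!] Kein Cookie-Banner gefunden
```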

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
Author: Benjamin Admin
Date:   2026-05-05 09:56:29 +02:00
Commit: 7c7513525e
Parent: d816cf8d3a
4 changed files with 210 additions and 91 deletions
@@ -73,25 +73,41 @@ def build_scan_summary(
         f"Findings: {n_findings} ({high} mit hoher Prioritaet)",
     ])
-    # DSI Documents section
+    # DSI Documents section — grouped with their findings
     if discovered_docs:
-        parts.extend([
-            "",
-            f"Rechtliche Dokumente gefunden: {len(discovered_docs)}",
-        ])
+        parts.extend(["", f"Rechtliche Dokumente ({len(discovered_docs)})"])
+        # Group findings by doc_title
+        doc_findings_map: dict[str, list] = {}
+        general_findings: list = []
+        for f in findings:
+            dt = f.doc_title if hasattr(f, 'doc_title') else ""
+            if dt:
+                doc_findings_map.setdefault(dt, []).append(f)
+            else:
+                general_findings.append(f)
         for doc in discovered_docs:
+            title = doc.title if hasattr(doc, 'title') else "?"
             pct = doc.completeness_pct if hasattr(doc, 'completeness_pct') else 0
-            fc = doc.findings_count if hasattr(doc, 'findings_count') else 0
             wc = doc.word_count if hasattr(doc, 'word_count') else 0
             status = "OK" if pct >= 80 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT"
-            dt = doc.doc_type if hasattr(doc, 'doc_type') else "unknown"
-            title = doc.title if hasattr(doc, 'title') else "?"
-            parts.append(
-                f"  [{status}] {title} ({dt}, {wc} Woerter, "
-                f"{pct}% vollstaendig, {fc} Maengel)"
-            )
+            parts.append(f"  [{status}] {title} ({pct}%, {wc} Woerter)")
+            for f in doc_findings_map.get(title, []):
+                sev = f.severity if hasattr(f, 'severity') else "?"
+                txt = f.text if hasattr(f, 'text') else str(f)
+                marker = "!!" if sev == "HIGH" else "!" if sev == "MEDIUM" else "i"
+                parts.append(f"      {marker} {txt}")
-    if findings:
+        # General findings (no doc association)
+        if general_findings:
+            parts.extend(["", "Allgemeine Findings"])
+            for f in general_findings[:20]:
+                sev = f.severity if hasattr(f, 'severity') else "?"
+                txt = f.text if hasattr(f, 'text') else str(f)
+                marker = "!!" if sev == "HIGH" else "!" if sev == "MEDIUM" else "i"
+                parts.append(f"  [{marker}] {txt}")
+    elif findings:
         parts.append("")
         for f in findings[:20]:
             sev = f.severity if hasattr(f, 'severity') else "?"
@@ -79,6 +79,7 @@ class ScanFinding(BaseModel):
     severity: str
     text: str
     correction: str = ""
+    doc_title: str = ""
     text_reference: TextReferenceModel | None = None
@@ -264,6 +265,7 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
                 if "SCORE" not in df.get("code", ""):
                     dsi_findings.append(ScanFinding(
                         code=df["code"], severity=df["severity"], text=df["text"],
+                        doc_title=doc["title"],
                     ))
             except Exception as e:
                 logger.warning("DSI discovery failed: %s %s", type(e).__name__, e)