feat: Document-centric scan results + DSI deduplication
DSI Dedup (consent-tester):
- Only H1/H2 headings count as documents (not H3/H4 sub-sections)
- Sub-sections (Cookies, Betroffenenrechte, Social Media) are part of the parent document's full text, not separate documents
- Reduces IHK result from 30 to ~11 real documents

Backend (agent_scan_routes):
- ScanFinding gets a doc_title field linking each finding to its document
- doc_title is set when creating DSI findings for document attribution

Frontend (ScanResult.tsx):
- 3 sections: Services table, Document cards, General findings
- Documents: expandable cards with completeness bar (green/yellow/red)
- Findings grouped under their parent document
- Each card shows: title, word count, findings count, % completeness
- Findings without doc_title go to the "Allgemeine Findings" section

Email Summary (agent_scan_helpers):
- Findings listed under their parent document
- General findings in a separate section
- No more flat mixed list

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -19,6 +19,17 @@ interface ScanFinding {
|
||||
severity: string
|
||||
text: string
|
||||
correction: string
|
||||
doc_title: string
|
||||
}
|
||||
|
||||
interface DiscoveredDocument {
|
||||
title: string
|
||||
url: string
|
||||
doc_type: string
|
||||
language: string
|
||||
word_count: number
|
||||
completeness_pct: number
|
||||
findings_count: number
|
||||
}
|
||||
|
||||
interface ScanData {
|
||||
@@ -26,6 +37,7 @@ interface ScanData {
|
||||
pages_list: string[]
|
||||
services: ServiceInfo[]
|
||||
findings: ScanFinding[]
|
||||
discovered_documents?: DiscoveredDocument[]
|
||||
ai_detected: boolean
|
||||
chatbot_detected: boolean
|
||||
chatbot_provider: string
|
||||
@@ -34,24 +46,38 @@ interface ScanData {
|
||||
}
|
||||
|
||||
const STATUS_ICON: Record<string, { icon: string; color: string }> = {
|
||||
ok: { icon: '✓', color: 'text-green-600' },
|
||||
undocumented: { icon: '✗', color: 'text-red-600' },
|
||||
ok: { icon: '\u2713', color: 'text-green-600' },
|
||||
undocumented: { icon: '\u2717', color: 'text-red-600' },
|
||||
outdated: { icon: '~', color: 'text-yellow-600' },
|
||||
}
|
||||
|
||||
const SEV_STYLE: Record<string, { bg: string; text: string }> = {
|
||||
HIGH: { bg: 'bg-red-50 border-red-200', text: 'text-red-800' },
|
||||
MEDIUM: { bg: 'bg-yellow-50 border-yellow-200', text: 'text-yellow-800' },
|
||||
LOW: { bg: 'bg-blue-50 border-blue-200', text: 'text-blue-800' },
|
||||
const SEV_STYLE: Record<string, { bg: string; text: string; dot: string }> = {
|
||||
HIGH: { bg: 'bg-red-50 border-red-200', text: 'text-red-800', dot: 'bg-red-500' },
|
||||
MEDIUM: { bg: 'bg-yellow-50 border-yellow-200', text: 'text-yellow-800', dot: 'bg-yellow-500' },
|
||||
LOW: { bg: 'bg-blue-50 border-blue-200', text: 'text-blue-800', dot: 'bg-blue-500' },
|
||||
CRITICAL: { bg: 'bg-red-100 border-red-300', text: 'text-red-900', dot: 'bg-red-700' },
|
||||
}
|
||||
|
||||
export function ScanResult({ data }: { data: ScanData }) {
|
||||
const [expandedCorrection, setExpandedCorrection] = useState<string | null>(null)
|
||||
const [expandedDoc, setExpandedDoc] = useState<string | null>(null)
|
||||
|
||||
const undocCount = data.services.filter(s => s.status === 'undocumented').length
|
||||
const okCount = data.services.filter(s => s.status === 'ok').length
|
||||
const outdatedCount = data.services.filter(s => s.status === 'outdated').length
|
||||
const highCount = data.findings.filter(f => f.severity === 'HIGH').length
|
||||
const highCount = data.findings.filter(f => f.severity === 'HIGH' || f.severity === 'CRITICAL').length
|
||||
const docs = data.discovered_documents || []
|
||||
|
||||
// Group findings by doc_title
|
||||
const docFindings: Record<string, ScanFinding[]> = {}
|
||||
const generalFindings: ScanFinding[] = []
|
||||
for (const f of data.findings) {
|
||||
if (f.doc_title) {
|
||||
if (!docFindings[f.doc_title]) docFindings[f.doc_title] = []
|
||||
docFindings[f.doc_title].push(f)
|
||||
} else {
|
||||
generalFindings.push(f)
|
||||
}
|
||||
}
|
||||
|
||||
return (
|
||||
<div className="space-y-5">
|
||||
@@ -59,7 +85,7 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
<div className="grid grid-cols-4 gap-3">
|
||||
<div className="bg-gray-50 rounded-lg p-3 text-center">
|
||||
<p className="text-2xl font-bold text-gray-900">{data.pages_scanned}</p>
|
||||
<p className="text-xs text-gray-500">Seiten gescannt</p>
|
||||
<p className="text-xs text-gray-500">Seiten</p>
|
||||
</div>
|
||||
<div className="bg-green-50 rounded-lg p-3 text-center">
|
||||
<p className="text-2xl font-bold text-green-700">{okCount}</p>
|
||||
@@ -69,9 +95,9 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
<p className="text-2xl font-bold text-red-700">{undocCount}</p>
|
||||
<p className="text-xs text-gray-500">Nicht in DSE</p>
|
||||
</div>
|
||||
<div className="bg-yellow-50 rounded-lg p-3 text-center">
|
||||
<p className="text-2xl font-bold text-yellow-700">{outdatedCount}</p>
|
||||
<p className="text-xs text-gray-500">Veraltet</p>
|
||||
<div className="bg-purple-50 rounded-lg p-3 text-center">
|
||||
<p className="text-2xl font-bold text-purple-700">{docs.length}</p>
|
||||
<p className="text-xs text-gray-500">Dokumente</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
@@ -79,14 +105,14 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
{data.pages_list?.length > 0 && (
|
||||
<details className="text-sm">
|
||||
<summary className="text-gray-600 cursor-pointer hover:text-gray-800">
|
||||
{data.pages_scanned} Seiten gescannt — Details anzeigen
|
||||
{data.pages_scanned} Seiten gescannt
|
||||
</summary>
|
||||
<ul className="mt-2 space-y-1 ml-4">
|
||||
{data.pages_list.map((p, i) => {
|
||||
const isMissing = data.missing_pages[p]
|
||||
return (
|
||||
<li key={i} className={`text-xs ${isMissing ? 'text-red-600' : 'text-gray-500'}`}>
|
||||
{isMissing ? '✗' : '✓'} {p} {isMissing ? `(HTTP ${data.missing_pages[p]})` : ''}
|
||||
{isMissing ? '\u2717' : '\u2713'} {p}
|
||||
</li>
|
||||
)
|
||||
})}
|
||||
@@ -94,19 +120,10 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
</details>
|
||||
)}
|
||||
|
||||
{/* AI / Chatbot Detection */}
|
||||
<div className="flex gap-3">
|
||||
<span className={`px-3 py-1 rounded-full text-xs font-medium ${data.ai_detected ? 'bg-purple-100 text-purple-800' : 'bg-gray-100 text-gray-600'}`}>
|
||||
{data.ai_detected ? 'KI erkannt' : 'Keine KI erkannt'}
|
||||
</span>
|
||||
<span className={`px-3 py-1 rounded-full text-xs font-medium ${data.chatbot_detected ? 'bg-blue-100 text-blue-800' : 'bg-gray-100 text-gray-600'}`}>
|
||||
{data.chatbot_detected ? `Chatbot: ${data.chatbot_provider}` : 'Kein Chatbot'}
|
||||
</span>
|
||||
</div>
|
||||
|
||||
{/* Services Table */}
|
||||
{data.services.length > 0 && (
|
||||
<div>
|
||||
<h4 className="text-sm font-medium text-gray-700 mb-2">Dienstleister-Abgleich (SOLL/IST)</h4>
|
||||
<h4 className="text-sm font-medium text-gray-700 mb-2">Dienstleister (SOLL/IST)</h4>
|
||||
<div className="border rounded-lg overflow-hidden">
|
||||
<table className="w-full text-sm">
|
||||
<thead className="bg-gray-50">
|
||||
@@ -114,7 +131,6 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500">Status</th>
|
||||
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500">Dienst</th>
|
||||
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500">Land</th>
|
||||
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500">EU</th>
|
||||
<th className="px-3 py-2 text-left text-xs font-medium text-gray-500">In DSE</th>
|
||||
</tr>
|
||||
</thead>
|
||||
@@ -126,11 +142,10 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
<td className={`px-3 py-2 font-bold ${st.color}`}>{st.icon}</td>
|
||||
<td className="px-3 py-2">
|
||||
<span className="font-medium text-gray-900">{s.name}</span>
|
||||
<span className="text-gray-400 text-xs ml-2">{s.category}</span>
|
||||
<span className="text-gray-400 text-xs ml-2">{s.provider}</span>
|
||||
</td>
|
||||
<td className="px-3 py-2 text-gray-600">{s.country}</td>
|
||||
<td className="px-3 py-2">{s.eu_adequate ? '✓' : '✗'}</td>
|
||||
<td className="px-3 py-2">{s.in_dse ? 'Ja' : <span className="text-red-600 font-medium">Nein</span>}</td>
|
||||
<td className="px-3 py-2">{s.in_dse ? '\u2713' : <span className="text-red-600 font-medium">Nein</span>}</td>
|
||||
</tr>
|
||||
)
|
||||
})}
|
||||
@@ -138,17 +153,94 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
</table>
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Findings */}
|
||||
{data.findings.length > 0 && (
|
||||
{/* === Document-Centric View === */}
|
||||
{docs.length > 0 && (
|
||||
<div>
|
||||
<h4 className="text-sm font-medium text-gray-700 mb-2">
|
||||
Findings ({data.findings.length}, davon {highCount} kritisch)
|
||||
Rechtliche Dokumente ({docs.length})
|
||||
</h4>
|
||||
<div className="space-y-2">
|
||||
{data.findings.map((f, i) => {
|
||||
{docs.map((doc, i) => {
|
||||
const isExpanded = expandedDoc === doc.title
|
||||
const findings = docFindings[doc.title] || []
|
||||
const pct = doc.completeness_pct
|
||||
const barColor = pct >= 80 ? 'bg-green-500' : pct >= 50 ? 'bg-yellow-500' : 'bg-red-500'
|
||||
const statusLabel = pct >= 80 ? 'OK' : pct >= 50 ? 'Lueckenhaft' : 'Mangelhaft'
|
||||
const statusColor = pct >= 80 ? 'text-green-700 bg-green-50' : pct >= 50 ? 'text-yellow-700 bg-yellow-50' : 'text-red-700 bg-red-50'
|
||||
|
||||
return (
|
||||
<div key={i} className="border border-gray-200 rounded-lg overflow-hidden">
|
||||
<button
|
||||
onClick={() => setExpandedDoc(isExpanded ? null : doc.title)}
|
||||
className="w-full flex items-center justify-between px-4 py-3 bg-gray-50/50 hover:bg-gray-50 text-left"
|
||||
>
|
||||
<div className="flex items-center gap-3 flex-1 min-w-0">
|
||||
<svg className={`w-4 h-4 text-gray-400 transition-transform shrink-0 ${isExpanded ? 'rotate-90' : ''}`}
|
||||
fill="none" stroke="currentColor" viewBox="0 0 24 24">
|
||||
<path strokeLinecap="round" strokeLinejoin="round" strokeWidth={2} d="M9 5l7 7-7 7" />
|
||||
</svg>
|
||||
<div className="min-w-0 flex-1">
|
||||
<div className="text-sm font-medium text-gray-900 truncate">{doc.title}</div>
|
||||
<div className="text-xs text-gray-500">
|
||||
{doc.word_count} Woerter
|
||||
{findings.length > 0 && <span className="text-red-600 ml-2">{findings.length} Maengel</span>}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
<div className="flex items-center gap-3 shrink-0 ml-3">
|
||||
{/* Completeness bar */}
|
||||
<div className="w-20 h-2 bg-gray-200 rounded-full overflow-hidden">
|
||||
<div className={`h-full rounded-full ${barColor}`} style={{ width: `${pct}%` }} />
|
||||
</div>
|
||||
<span className={`text-xs font-medium px-2 py-0.5 rounded ${statusColor}`}>
|
||||
{pct}%
|
||||
</span>
|
||||
</div>
|
||||
</button>
|
||||
|
||||
{isExpanded && (
|
||||
<div className="px-4 py-3 border-t border-gray-100 space-y-2">
|
||||
{findings.length > 0 ? (
|
||||
findings.map((f, fi) => {
|
||||
const sev = SEV_STYLE[f.severity] || SEV_STYLE.MEDIUM
|
||||
const isExpanded = expandedCorrection === f.code
|
||||
return (
|
||||
<div key={fi} className="flex items-start gap-2 text-sm">
|
||||
<span className={`w-2 h-2 rounded-full mt-1.5 shrink-0 ${sev.dot}`} />
|
||||
<span className="text-gray-700">{f.text}</span>
|
||||
</div>
|
||||
)
|
||||
})
|
||||
) : (
|
||||
<p className="text-sm text-green-600">Alle Pflichtangaben vorhanden.</p>
|
||||
)}
|
||||
{doc.url && (
|
||||
<a href={doc.url} target="_blank" rel="noopener noreferrer"
|
||||
className="text-xs text-purple-600 hover:underline mt-2 inline-block">
|
||||
Dokument oeffnen
|
||||
</a>
|
||||
)}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* General Findings (not associated with a specific document) */}
|
||||
{generalFindings.length > 0 && (
|
||||
<div>
|
||||
<h4 className="text-sm font-medium text-gray-700 mb-2">
|
||||
Allgemeine Findings ({generalFindings.length})
|
||||
</h4>
|
||||
<div className="space-y-2">
|
||||
{generalFindings.map((f, i) => {
|
||||
const sev = SEV_STYLE[f.severity] || SEV_STYLE.MEDIUM
|
||||
const corrKey = `gen-${i}`
|
||||
const isExp = expandedCorrection === corrKey
|
||||
return (
|
||||
<div key={i} className={`border rounded-lg p-3 ${sev.bg}`}>
|
||||
<div className="flex items-start gap-2">
|
||||
@@ -159,20 +251,15 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
</div>
|
||||
{f.correction && (
|
||||
<div className="mt-2">
|
||||
<button
|
||||
onClick={() => setExpandedCorrection(isExpanded ? null : f.code)}
|
||||
className="text-xs text-purple-600 hover:text-purple-800 font-medium"
|
||||
>
|
||||
{isExpanded ? '▼ Korrekturvorschlag ausblenden' : '▶ Korrekturvorschlag anzeigen'}
|
||||
<button onClick={() => setExpandedCorrection(isExp ? null : corrKey)}
|
||||
className="text-xs text-purple-600 hover:text-purple-800 font-medium">
|
||||
{isExp ? 'Korrektur ausblenden' : 'Korrekturvorschlag'}
|
||||
</button>
|
||||
{isExpanded && (
|
||||
{isExp && (
|
||||
<div className="mt-2 bg-white border border-gray-200 rounded-lg p-3 relative">
|
||||
<pre className="text-xs text-gray-700 whitespace-pre-wrap font-sans">{f.correction}</pre>
|
||||
<button
|
||||
onClick={() => navigator.clipboard.writeText(f.correction)}
|
||||
className="absolute top-2 right-2 text-xs bg-gray-100 hover:bg-gray-200 px-2 py-1 rounded"
|
||||
title="Kopieren"
|
||||
>
|
||||
<button onClick={() => navigator.clipboard.writeText(f.correction)}
|
||||
className="absolute top-2 right-2 text-xs bg-gray-100 hover:bg-gray-200 px-2 py-1 rounded">
|
||||
Kopieren
|
||||
</button>
|
||||
</div>
|
||||
@@ -185,6 +272,14 @@ export function ScanResult({ data }: { data: ScanData }) {
|
||||
</div>
|
||||
</div>
|
||||
)}
|
||||
|
||||
{/* Email Status */}
|
||||
{data.email_status && (
|
||||
<div className="text-xs text-gray-500 flex items-center gap-2">
|
||||
<span className={`w-2 h-2 rounded-full ${data.email_status === 'sent' ? 'bg-green-400' : 'bg-gray-300'}`} />
|
||||
E-Mail: {data.email_status === 'sent' ? 'Gesendet' : data.email_status}
|
||||
</div>
|
||||
)}
|
||||
</div>
|
||||
)
|
||||
}
|
||||
|
||||
@@ -73,25 +73,41 @@ def build_scan_summary(
|
||||
f"Findings: {n_findings} ({high} mit hoher Prioritaet)",
|
||||
])
|
||||
|
||||
# DSI Documents section
|
||||
# DSI Documents section — grouped with their findings
|
||||
if discovered_docs:
|
||||
parts.extend([
|
||||
"",
|
||||
f"Rechtliche Dokumente gefunden: {len(discovered_docs)}",
|
||||
])
|
||||
parts.extend(["", f"Rechtliche Dokumente ({len(discovered_docs)})"])
|
||||
|
||||
# Group findings by doc_title
|
||||
doc_findings_map: dict[str, list] = {}
|
||||
general_findings: list = []
|
||||
for f in findings:
|
||||
dt = f.doc_title if hasattr(f, 'doc_title') else ""
|
||||
if dt:
|
||||
doc_findings_map.setdefault(dt, []).append(f)
|
||||
else:
|
||||
general_findings.append(f)
|
||||
|
||||
for doc in discovered_docs:
|
||||
title = doc.title if hasattr(doc, 'title') else "?"
|
||||
pct = doc.completeness_pct if hasattr(doc, 'completeness_pct') else 0
|
||||
fc = doc.findings_count if hasattr(doc, 'findings_count') else 0
|
||||
wc = doc.word_count if hasattr(doc, 'word_count') else 0
|
||||
status = "OK" if pct >= 80 else "LUECKENHAFT" if pct >= 50 else "MANGELHAFT"
|
||||
dt = doc.doc_type if hasattr(doc, 'doc_type') else "unknown"
|
||||
title = doc.title if hasattr(doc, 'title') else "?"
|
||||
parts.append(
|
||||
f" [{status}] {title} ({dt}, {wc} Woerter, "
|
||||
f"{pct}% vollstaendig, {fc} Maengel)"
|
||||
)
|
||||
parts.append(f" [{status}] {title} ({pct}%, {wc} Woerter)")
|
||||
for f in doc_findings_map.get(title, []):
|
||||
sev = f.severity if hasattr(f, 'severity') else "?"
|
||||
txt = f.text if hasattr(f, 'text') else str(f)
|
||||
marker = "!!" if sev == "HIGH" else "!" if sev == "MEDIUM" else "i"
|
||||
parts.append(f" {marker} {txt}")
|
||||
|
||||
if findings:
|
||||
# General findings (no doc association)
|
||||
if general_findings:
|
||||
parts.extend(["", "Allgemeine Findings"])
|
||||
for f in general_findings[:20]:
|
||||
sev = f.severity if hasattr(f, 'severity') else "?"
|
||||
txt = f.text if hasattr(f, 'text') else str(f)
|
||||
marker = "!!" if sev == "HIGH" else "!" if sev == "MEDIUM" else "i"
|
||||
parts.append(f" [{marker}] {txt}")
|
||||
elif findings:
|
||||
parts.append("")
|
||||
for f in findings[:20]:
|
||||
sev = f.severity if hasattr(f, 'severity') else "?"
|
||||
|
||||
@@ -79,6 +79,7 @@ class ScanFinding(BaseModel):
|
||||
severity: str
|
||||
text: str
|
||||
correction: str = ""
|
||||
doc_title: str = ""
|
||||
text_reference: TextReferenceModel | None = None
|
||||
|
||||
|
||||
@@ -264,6 +265,7 @@ async def _execute_scan(req: ScanRequest, scan_id: str = "") -> ScanResponse:
|
||||
if "SCORE" not in df.get("code", ""):
|
||||
dsi_findings.append(ScanFinding(
|
||||
code=df["code"], severity=df["severity"], text=df["text"],
|
||||
doc_title=doc["title"],
|
||||
))
|
||||
except Exception as e:
|
||||
logger.warning("DSI discovery failed: %s %s", type(e).__name__, e)
|
||||
|
||||
@@ -444,13 +444,18 @@ async def _expand_all_interactive(page: Page) -> None:
|
||||
|
||||
|
||||
async def _find_inline_dsi_sections(page: Page) -> list[dict]:
|
||||
"""Find DSI content already visible on the page (e.g. expanded accordions)."""
|
||||
"""Find DSI content already visible on the page (e.g. expanded accordions).
|
||||
|
||||
Only counts top-level documents (H1/H2 with DSI keywords).
|
||||
Sub-sections (H3/H4 like 'Cookies', 'Betroffenenrechte') are NOT counted
|
||||
as separate documents — their text is part of the parent document.
|
||||
"""
|
||||
try:
|
||||
sections = await page.evaluate("""
|
||||
() => {
|
||||
const results = [];
|
||||
// Find headings that match DSI keywords
|
||||
const headings = document.querySelectorAll('h1, h2, h3, h4, h5');
|
||||
// Only H1 and H2 count as document-level headings
|
||||
const headings = document.querySelectorAll('h1, h2');
|
||||
const dsiKeywords = [
|
||||
'datenschutz', 'privacy', 'données', 'privacidad', 'protezione',
|
||||
'gegevensbescherming', 'ochrona danych', 'tietosuoja', 'integritet',
|
||||
@@ -461,12 +466,13 @@ async def _find_inline_dsi_sections(page: Page) -> list[dict]:
|
||||
const textLower = text.toLowerCase();
|
||||
if (!dsiKeywords.some(kw => textLower.includes(kw))) continue;
|
||||
|
||||
// Get the section content following this heading
|
||||
// Get ALL content until the next H1/H2 (include sub-sections H3-H5)
|
||||
let content = '';
|
||||
let el = h.nextElementSibling;
|
||||
let count = 0;
|
||||
while (el && count < 50) {
|
||||
if (el.tagName.match(/^H[1-5]$/)) break;
|
||||
while (el && count < 200) {
|
||||
// Stop at next H1 or H2 (next top-level document)
|
||||
if (el.tagName === 'H1' || el.tagName === 'H2') break;
|
||||
content += (el.textContent || '').trim() + '\\n';
|
||||
el = el.nextElementSibling;
|
||||
count++;
|
||||
|
||||
Reference in New Issue
Block a user