feat: Generic legal document discovery (DSI, AGB, Widerruf, Cookie-Richtlinie)

New service: dsi_discovery.py — finds ALL legal documents on any website:
- Technology-agnostic: HTML, SPA, WordPress, TYPO3, custom CMS
- Structure-agnostic: accordions, sidebars, footers, inline links, tabs
- Format-agnostic: HTML pages, anchor sections, PDFs, cross-domain links
- Language-agnostic: 26 EU/EEA languages with document-type keywords

Document types discovered:
- Datenschutzinformationen / Privacy Policies (Art. 13/14 DSGVO)
- AGB / Terms of Service / Nutzungsbedingungen
- Widerrufsbelehrung / Right of Withdrawal (§355 BGB)
- Cookie-Richtlinie / Cookie Policy
- All cross-domain variants (e.g. help.instagram.com from instagram.com)

API: POST /dsi-discovery { url, max_documents }
Returns: list of documents with title, url, language, type, word_count, text_preview

Features:
- Expands all accordions, details, tabs, dropdowns before scanning
- Follows cross-domain links (same registrable domain)
- Re-expands after navigation back to source page
- Handles anchor links (#sections) separately from full pages

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-05-04 21:56:29 +02:00
parent 2b4ff9f422
commit a6618af5ed
2 changed files with 548 additions and 0 deletions
@@ -15,6 +15,7 @@ from pydantic import BaseModel
 from services.consent_scanner import run_consent_test, ConsentTestResult
 from services.authenticated_scanner import run_authenticated_test, AuthTestResult
 from services.playwright_scanner import scan_website_playwright
+from services.dsi_discovery import discover_dsi_documents, DSIDiscoveryResult

 logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s")
 logger = logging.getLogger(__name__)
@@ -240,3 +241,81 @@ async def website_scan(req: WebsiteScanRequest):
        page_htmls=page_htmls,
        scanned_at=datetime.now(timezone.utc).isoformat(),
    )
+
+
+# ═══════════════════════════════════════════════════════════════
+# DSI DISCOVERY (finds all privacy + legal documents on a website)
+# ═══════════════════════════════════════════════════════════════
+
+class DSIDiscoveryRequest(BaseModel):  # JSON request body accepted by the POST /dsi-discovery endpoint
+    url: str  # page to scan; handed to discover_dsi_documents() unchanged (plain str, no validation in this model)
+    max_documents: int = 30  # forwarded to discover_dsi_documents(); presumably caps the documents returned — confirm
+
+
+class DSIDocumentInfo(BaseModel):  # one discovered document as serialized in the /dsi-discovery response
+    title: str  # copied from the discovery result's `title`
+    url: str  # copied from the result's `url`; presumably the document's own location — confirm
+    source_url: str  # copied from the result's `source_url`; presumably the page that linked to it — confirm
+    language: str = ""  # copied from the result's `language`; value format is not visible here
+    doc_type: str = ""  # copied from the result's `doc_type`; the set of values is defined by dsi_discovery, not here
+    word_count: int = 0  # copied from the result's `word_count`
+    text_preview: str = ""  # first 500 characters of the document text, or "" when the text is empty/None
+
+
+class DSIDiscoveryResponse(BaseModel):  # response body of the POST /dsi-discovery endpoint
+    url: str  # echoes the URL from the request
+    documents: list[DSIDocumentInfo]  # one entry per document in the discovery result
+    total_found: int  # taken from the discovery result, not recomputed from `documents`
+    languages_detected: list[str]  # taken from the discovery result as-is
+    errors: list[str]  # taken from the discovery result as-is; presumably non-fatal crawl errors — confirm
+    scanned_at: str  # UTC timestamp in ISO 8601 form, taken when the response is built
+
+
+@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
+async def dsi_discovery(req: DSIDiscoveryRequest):  # returns a DSIDiscoveryResponse built from the discovery result
+    """Discover all privacy/data protection documents on a website.
+
+    Generically finds DSI, AGB, Nutzungsbedingungen, Widerrufsbelehrung,
+    Cookie-Richtlinien etc. regardless of website technology or language.
+    Supports HTML pages, accordions, sidebars, PDFs, cross-domain links.
+    """
+    logger.info("Starting DSI discovery for %s (max %d docs)", req.url, req.max_documents)
+
+    from playwright.async_api import async_playwright  # imported at call time, not module top; presumably to defer loading Playwright — confirm
+
+    async with async_playwright() as p:
+        browser = await p.chromium.launch(  # a new headless Chromium is launched for every request
+            headless=True,
+            args=["--no-sandbox", "--disable-dev-shm-usage"],  # presumably needed for running Chromium in a container — confirm deployment
+        )
+        context = await browser.new_context(  # new browser context per request
+            user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
+                       "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",  # fixed desktop Chrome-on-macOS user agent string
+        )
+        page = await context.new_page()
+
+        try:  # no except clause: errors from discovery propagate to the caller after cleanup
+            result = await discover_dsi_documents(page, req.url, req.max_documents)
+        finally:  # cleanup runs whether discovery succeeded or raised
+            await context.close()  # NOTE(review): if this raises, browser.close() below is skipped
+            await browser.close()
+
+    return DSIDiscoveryResponse(  # built after the browser is closed; only data already held in `result` is used
+        url=req.url,
+        documents=[
+            DSIDocumentInfo(
+                title=d.title,
+                url=d.url,
+                source_url=d.source_url,
+                language=d.language,
+                doc_type=d.doc_type,
+                word_count=d.word_count,
+                text_preview=d.text[:500] if d.text else "",  # preview capped at 500 characters; falsy text yields ""
+            )
+            for d in result.documents
+        ],
+        total_found=result.total_found,
+        languages_detected=result.languages_detected,
+        errors=result.errors,
+        scanned_at=datetime.now(timezone.utc).isoformat(),  # timestamp of response construction (after the scan), in UTC
+    )