feat: Generic legal document discovery (DSI, AGB, Widerruf, Cookie-Richtlinie)
New service: dsi_discovery.py — finds ALL legal documents on any website:
- Technology-agnostic: HTML, SPA, WordPress, Typo3, custom CMS
- Structure-agnostic: accordions, sidebars, footers, inline links, tabs
- Format-agnostic: HTML pages, anchor sections, PDFs, cross-domain links
- Language-agnostic: 26 EU/EEA languages with document-type keywords
Document types discovered:
- Datenschutzinformationen / Privacy Policies (Art. 13/14 DSGVO)
- AGB / Terms of Service / Nutzungsbedingungen
- Widerrufsbelehrung / Right of Withdrawal (§355 BGB)
- Cookie-Richtlinie / Cookie Policy
- All cross-domain variants (e.g. help.instagram.com from instagram.com)
API: POST /dsi-discovery { url, max_documents }
Returns: url, documents (each with title, url, source_url, language, doc_type, word_count, text_preview), total_found, languages_detected, errors, scanned_at
Features:
- Expands all accordions, details, tabs, dropdowns before scanning
- Follows cross-domain links, restricted to hosts that share the same registrable domain
- Re-expands after navigation back to source page
- Handles anchor links (#sections) separately from full pages
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -15,6 +15,7 @@ from pydantic import BaseModel
|
||||
from services.consent_scanner import run_consent_test, ConsentTestResult
|
||||
from services.authenticated_scanner import run_authenticated_test, AuthTestResult
|
||||
from services.playwright_scanner import scan_website_playwright
|
||||
from services.dsi_discovery import discover_dsi_documents, DSIDiscoveryResult
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s: %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
@@ -240,3 +241,81 @@ async def website_scan(req: WebsiteScanRequest):
|
||||
page_htmls=page_htmls,
|
||||
scanned_at=datetime.now(timezone.utc).isoformat(),
|
||||
)
|
||||
|
||||
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
# DSI DISCOVERY (finds all privacy + legal documents on a website)
|
||||
# ═══════════════════════════════════════════════════════════════
|
||||
|
||||
class DSIDiscoveryRequest(BaseModel):
    """Request body accepted by ``POST /dsi-discovery``."""

    # Address of the website on which discovery starts.
    url: str
    # Document limit forwarded to the discovery service; 30 when omitted.
    max_documents: int = 30
|
||||
|
||||
|
||||
class DSIDocumentInfo(BaseModel):
    """Summary of one discovered document as returned by the API."""

    # Title of the document.
    title: str
    # Address of the document itself.
    url: str
    # Presumably the page on which the document link was found -- confirm
    # against services.dsi_discovery.
    source_url: str
    # Language reported for the document; empty string when not set.
    language: str = ""
    # Document category reported by the discovery service; empty when not set.
    doc_type: str = ""
    # Word count reported for the document.
    word_count: int = 0
    # Leading excerpt of the document text (the endpoint sends the first
    # 500 characters).
    text_preview: str = ""
|
||||
|
||||
|
||||
class DSIDiscoveryResponse(BaseModel):
    """Response body produced by ``POST /dsi-discovery``."""

    # The URL that was submitted in the request.
    url: str
    # One entry per discovered document.
    documents: list[DSIDocumentInfo]
    # Total reported by the discovery service.
    total_found: int
    # Languages reported by the discovery service.
    languages_detected: list[str]
    # Error messages reported by the discovery service.
    errors: list[str]
    # ISO-8601 timestamp (UTC) taken when the response was assembled.
    scanned_at: str
|
||||
|
||||
|
||||
@app.post("/dsi-discovery", response_model=DSIDiscoveryResponse)
async def dsi_discovery(req: DSIDiscoveryRequest) -> DSIDiscoveryResponse:
    """Discover all privacy/data protection documents on a website.

    Generically finds DSI, AGB, Nutzungsbedingungen, Widerrufsbelehrung,
    Cookie-Richtlinien etc. regardless of website technology or language.
    Supports HTML pages, accordions, sidebars, PDFs, cross-domain links.

    A headless Chromium browser is started for the duration of the request,
    a fresh page is handed to ``discover_dsi_documents`` and the outcome is
    converted into the API response model.

    Args:
        req: Request body with the start ``url`` and the ``max_documents``
            limit.

    Returns:
        A ``DSIDiscoveryResponse`` echoing the requested URL and listing the
        discovered documents, each with its text cut to a 500-character
        preview, plus the totals, languages and errors reported by the
        discovery service and a UTC timestamp.
    """
    logger.info("Starting DSI discovery for %s (max %d docs)", req.url, req.max_documents)

    # Function-scope import: Playwright is only imported when this endpoint runs.
    from playwright.async_api import async_playwright

    # NOTE(review): req.url is caller-controlled and is handed to a
    # server-side browser; confirm that URL validation happens upstream or
    # inside discover_dsi_documents.
    async with async_playwright() as p:
        browser = await p.chromium.launch(
            headless=True,
            args=["--no-sandbox", "--disable-dev-shm-usage"],
        )
        # Nested try/finally: the browser is closed even when creating the
        # context or page fails, or when closing the context itself raises.
        try:
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
                "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            )
            try:
                page = await context.new_page()
                result = await discover_dsi_documents(page, req.url, req.max_documents)
            finally:
                await context.close()
        finally:
            await browser.close()

    documents = [
        DSIDocumentInfo(
            title=d.title,
            url=d.url,
            source_url=d.source_url,
            language=d.language,
            doc_type=d.doc_type,
            word_count=d.word_count,
            # Only a short excerpt is returned, not the full document text.
            text_preview=d.text[:500] if d.text else "",
        )
        for d in result.documents
    ]

    return DSIDiscoveryResponse(
        url=req.url,
        documents=documents,
        total_found=result.total_found,
        languages_detected=result.languages_detected,
        errors=result.errors,
        scanned_at=datetime.now(timezone.utc).isoformat(),
    )
|
||||
|
||||
Reference in New Issue
Block a user