'use client'

/**
 * Compliance Regulation Scraper Admin Page
 *
 * Manages the extraction of requirements and audit aspects from:
 * - EUR-Lex regulations (GDPR, AI Act, CRA, NIS2, etc.)
 * - BSI Technical Guidelines (TR-03161)
 * - German laws
 *
 * Similar pattern to edu-search and zeugnisse-crawler.
 */

import { useState, useEffect, useCallback } from 'react'
import AdminLayout from '@/components/admin/AdminLayout'
import SystemInfoSection, { SYSTEM_INFO_CONFIGS } from '@/components/admin/SystemInfoSection'

// Types

/** One entry of the `sources` array returned by the scraper `sources` endpoint. */
interface Source {
  code: string
  url: string
  source_type: string
  regulation_type: string
  has_data: boolean
  requirement_count: number
}

/** Payload of the scraper `status` endpoint; stored as-is in component state. */
interface ScraperStatus {
  status: 'idle' | 'running' | 'completed' | 'error'
  current_source: string | null
  last_error: string | null
  stats: {
    sources_processed: number
    requirements_extracted: number
    errors: number
    last_run: string | null
  }
  known_sources: string[]
}

/** One row of the result list; the list view shows `error`, then `reason`, then the extracted count. */
interface ScrapeResult {
  code: string
  status: string
  requirements_extracted?: number
  reason?: string
  error?: string
}

/** One entry of the `documents` array returned by the `pdf-documents` endpoint. */
interface PDFDocument {
  code: string
  name: string
  description: string
  expected_aspects: string
  available: boolean
}

/** Response body of the `extract-pdf` endpoint. */
interface PDFExtractionResult {
  success: boolean
  source_document: string
  total_aspects: number
  requirements_created: number
  statistics: {
    // Values are rendered as plain counts next to their key.
    by_category: Record<string, number>
    by_requirement_level: Record<string, number>
  }
}

const BACKEND_URL = process.env.NEXT_PUBLIC_BACKEND_URL || 'http://localhost:8000'

// Source type badges
const sourceTypeBadge: Record<string, { label: string; color: string }> = {
  eur_lex: { label: 'EUR-Lex', color: 'bg-blue-100 text-blue-800' },
  bsi_pdf: { label: 'BSI PDF', color: 'bg-green-100 text-green-800' },
  gesetze_im_internet: { label: 'Gesetze', color: 'bg-yellow-100 text-yellow-800' },
  manual: { label: 'Manuell', color: 'bg-gray-100 text-gray-800' },
}

// Regulation type badges
const regulationTypeBadge: Record<string, { label: string; color: string; icon: string }> = {
  eu_regulation: { label: 'EU-Verordnung', color: 'bg-indigo-100 text-indigo-800', icon: '🇪🇺' },
  eu_directive: { label: 'EU-Richtlinie', color: 'bg-purple-100 text-purple-800', icon: '📜' },
  de_law: { label: 'DE-Gesetz', color: 'bg-yellow-100 text-yellow-800', icon: '🇩🇪' },
  bsi_standard: { label: 'BSI-Standard', color: 'bg-green-100 text-green-800', icon: '🔒' },
  industry_standard: { label: 'Standard', color: 'bg-gray-100 text-gray-800', icon: '📋' },
}

/** Turns an arbitrary thrown value into a message suitable for the error banner. */
function toErrorMessage(err: unknown): string {
  return err instanceof Error ? err.message : String(err)
}

/**
 * Reads the `detail` field from a failed backend response.
 *
 * @param res - The non-OK response.
 * @param fallback - Message to use when the body is not JSON or `detail` is not a non-empty string.
 * @returns The backend-provided detail, or `fallback`.
 */
async function readErrorDetail(res: Response, fallback: string): Promise<string> {
  try {
    const body: unknown = await res.json()
    if (typeof body === 'object' && body !== null && 'detail' in body) {
      const detail = (body as { detail: unknown }).detail
      if (typeof detail === 'string' && detail.length > 0) {
        return detail
      }
    }
  } catch {
    // The error body was not valid JSON; the fallback message is used instead.
  }
  return fallback
}

/**
 * Admin page for the compliance scraper.
 *
 * Loads sources, PDF documents and scraper status on mount, polls the status
 * while a scrape is running, and provides handlers to scrape all sources,
 * scrape a single source, and run a PDF extraction.
 */
export default function ComplianceScraperPage() {
  const [activeTab, setActiveTab] = useState<'sources' | 'pdf' | 'status' | 'logs'>('sources')
  const [sources, setSources] = useState<Source[]>([])
  const [pdfDocuments, setPdfDocuments] = useState<PDFDocument[]>([])
  const [status, setStatus] = useState<ScraperStatus | null>(null)
  const [loading, setLoading] = useState(true)
  const [scraping, setScraping] = useState(false)
  const [extracting, setExtracting] = useState(false)
  const [error, setError] = useState<string | null>(null)
  const [success, setSuccess] = useState<string | null>(null)
  const [results, setResults] = useState<ScrapeResult[]>([])
  const [pdfResult, setPdfResult] = useState<PDFExtractionResult | null>(null)

  // Fetch sources
  const fetchSources = useCallback(async () => {
    try {
      const res = await fetch(`${BACKEND_URL}/api/v1/compliance/scraper/sources`)
      if (res.ok) {
        const data = await res.json()
        setSources(data.sources || [])
      }
    } catch (err) {
      console.error('Failed to fetch sources:', err)
    }
  }, [])

  // Fetch PDF documents
  const fetchPdfDocuments = useCallback(async () => {
    try {
      const res = await fetch(`${BACKEND_URL}/api/v1/compliance/scraper/pdf-documents`)
      if (res.ok) {
        const data = await res.json()
        setPdfDocuments(data.documents || [])
      }
    } catch (err) {
      console.error('Failed to fetch PDF documents:', err)
    }
  }, [])

  // Fetch status
  const fetchStatus = useCallback(async () => {
    try {
      const res = await fetch(`${BACKEND_URL}/api/v1/compliance/scraper/status`)
      if (res.ok) {
        const data = await res.json()
        setStatus(data)
      }
    } catch (err) {
      console.error('Failed to fetch status:', err)
    }
  }, [])

  // Initial load
  useEffect(() => {
    const loadData = async () => {
      setLoading(true)
      await Promise.all([fetchSources(), fetchStatus(), fetchPdfDocuments()])
      setLoading(false)
    }
    loadData()
  }, [fetchSources, fetchStatus, fetchPdfDocuments])

  // Poll status while scraping. The backend-reported state is part of the
  // condition so that a last-polled 'running' status keeps being refreshed
  // after the local request has finished, instead of staying stale.
  const backendRunning = status?.status === 'running'
  useEffect(() => {
    if (!scraping && !backendRunning) {
      return undefined
    }
    const interval = setInterval(fetchStatus, 2000)
    return () => clearInterval(interval)
  }, [scraping, backendRunning, fetchStatus])

  // Scrape all sources
  const handleScrapeAll = async () => {
    setScraping(true)
    setError(null)
    setSuccess(null)
    setResults([])
    try {
      const res = await fetch(`${BACKEND_URL}/api/v1/compliance/scraper/scrape-all`, {
        method: 'POST',
      })
      if (!res.ok) {
        throw new Error(await readErrorDetail(res, 'Scraping fehlgeschlagen'))
      }
      const data = await res.json()
      // A missing result group is treated as empty rather than crashing the spread.
      const succeeded: ScrapeResult[] = data.results?.success ?? []
      const failed: ScrapeResult[] = data.results?.failed ?? []
      const skipped: ScrapeResult[] = data.results?.skipped ?? []
      setResults([...succeeded, ...failed, ...skipped])
      setSuccess(`Scraping abgeschlossen: ${succeeded.length} erfolgreich, ${skipped.length} uebersprungen, ${failed.length} fehlgeschlagen`)
      // Refresh sources
      await fetchSources()
    } catch (err: unknown) {
      setError(toErrorMessage(err))
    } finally {
      setScraping(false)
    }
  }

  // Scrape single source
  const handleScrapeSingle = async (code: string, force = false) => {
    setScraping(true)
    setError(null)
    setSuccess(null)
    try {
      // The code becomes a path segment, so it is escaped before interpolation.
      const res = await fetch(`${BACKEND_URL}/api/v1/compliance/scraper/scrape/${encodeURIComponent(code)}?force=${force}`, {
        method: 'POST',
      })
      if (!res.ok) {
        throw new Error(await readErrorDetail(res, 'Scraping fehlgeschlagen'))
      }
      const data = await res.json()
      if (data.status === 'skipped') {
        setSuccess(`${code}: Bereits vorhanden (${data.requirement_count} Anforderungen)`)
      } else {
        setSuccess(`${code}: ${data.requirements_extracted} Anforderungen extrahiert`)
      }
      // Refresh sources
      await fetchSources()
    } catch (err: unknown) {
      setError(toErrorMessage(err))
    } finally {
      setScraping(false)
    }
  }

  // Extract PDF
  const handleExtractPdf = async (code: string, saveToDb = true, force = false) => {
    setExtracting(true)
    setError(null)
    setSuccess(null)
    setPdfResult(null)
    try {
      const res = await fetch(`${BACKEND_URL}/api/v1/compliance/scraper/extract-pdf`, {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          document_code: code,
          save_to_db: saveToDb,
          force: force,
        }),
      })
      if (!res.ok) {
        throw new Error(await readErrorDetail(res, 'PDF-Extraktion fehlgeschlagen'))
      }
      const data: PDFExtractionResult = await res.json()
      setPdfResult(data)
      if (data.success) {
        setSuccess(`${code}: ${data.total_aspects} Pruefaspekte extrahiert, ${data.requirements_created} Requirements erstellt`)
      }
      // Refresh sources
      await fetchSources()
    } catch (err: unknown) {
      setError(toErrorMessage(err))
    } finally {
      setExtracting(false)
    }
  }

  // Clear messages: success after 5 s, errors after 10 s.
  useEffect(() => {
    if (!success) {
      return undefined
    }
    const timer = setTimeout(() => setSuccess(null), 5000)
    return () => clearTimeout(timer)
  }, [success])

  useEffect(() => {
    if (!error) {
      return undefined
    }
    const timer = setTimeout(() => setError(null), 10000)
    return () => clearTimeout(timer)
  }, [error])

  // NOTE(review): the JSX element markup below (here and in SourceCard) is
  // missing from the reviewed copy of this file; only text children and
  // embedded expressions are present. They are kept verbatim, with
  // mis-encoded emoji/bullet characters repaired. Restore the tags from
  // version control before building.

  // Stats cards
  const StatsCard = ({ title, value, subtitle, icon }: { title: string; value: number | string; subtitle?: string; icon: string }) => (
{icon}

{title}

{value}

{subtitle &&

{subtitle}

}
)

  return (
{/* Loading */} {loading && (
Lade Quellen...
)} {!loading && ( <> {/* Messages */} {error && (
{error}
)} {success && (
{success}
)} {/* Stats Cards */}
s.has_data).length} subtitle={`${sources.length - sources.filter(s => s.has_data).length} noch zu scrapen`} icon="✅" /> acc + s.requirement_count, 0)} icon="📋" />
{/* Scraper Status Bar */} {(scraping || status?.status === 'running') && (

Scraper laeuft

{status?.current_source && (

Aktuell: {status.current_source}

)}
)} {/* Tabs */}
{/* Sources Tab */} {activeTab === 'sources' && (
{/* Header */}

Regulierungsquellen

EU-Lex, BSI-TR und deutsche Gesetze

{/* Sources by Type */}
{/* EU Regulations */}

🇪🇺 EU-Regulierungen (EUR-Lex)

{sources.filter(s => s.source_type === 'eur_lex').map(source => ( ))}
{/* BSI Standards */}

🔒 BSI Technical Guidelines

{sources.filter(s => s.source_type === 'bsi_pdf').map(source => ( ))}
)} {/* PDF Extraction Tab */} {activeTab === 'pdf' && (

PDF-Extraktion (PyMuPDF)

Extrahiert ALLE Pruefaspekte aus BSI-TR-03161 PDFs mit Regex-Pattern-Matching

{/* PDF Documents */}
{pdfDocuments.map(doc => (
📄
{doc.code} {doc.available ? 'Verfuegbar' : 'Nicht gefunden'}
{doc.name}
{doc.description}
Erwartete Pruefaspekte: {doc.expected_aspects}
))}
{/* Last Extraction Result */} {pdfResult && (

Letztes Extraktions-Ergebnis

{pdfResult.total_aspects}
Pruefaspekte gefunden
{pdfResult.requirements_created}
Requirements erstellt
{Object.keys(pdfResult.statistics.by_category || {}).length}
Kategorien
{/* Category Breakdown */} {pdfResult.statistics.by_category && Object.keys(pdfResult.statistics.by_category).length > 0 && (
Nach Kategorie:
{Object.entries(pdfResult.statistics.by_category).map(([cat, count]) => ( {cat}: {count} ))}
)}
)} {/* Info Box */}

Wie funktioniert die PDF-Extraktion?

  • • PyMuPDF (fitz) liest den PDF-Text
  • • Regex-Pattern finden Aspekte wie O.Auth_1, O.Sess_2, T.Network_1
  • • Kontextanalyse extrahiert Titel, Kategorie und Anforderungsstufe (MUSS/SOLL/KANN)
  • • Automatische Speicherung erstellt Requirements in der Datenbank
)} {/* Status Tab */} {activeTab === 'status' && status && (
{/* Current Status */}

Scraper-Status

Letzter Lauf: {status.stats.last_run ? new Date(status.stats.last_run).toLocaleString('de-DE') : 'Noch nie'}

{status.status === 'running' ? '🔄 Laeuft' : status.status === 'error' ? '❌ Fehler' : status.status === 'completed' ? '✅ Abgeschlossen' : '⏸️ Bereit'}
{status.stats.sources_processed}
Quellen verarbeitet
{status.stats.requirements_extracted}
Anforderungen extrahiert
{status.stats.errors}
Fehler
{status.last_error && (
Letzter Fehler: {status.last_error}
)}
{/* Process Description */}

Wie funktioniert der Scraper?

1
EUR-Lex Abruf: Holt HTML-Version der EU-Verordnung, extrahiert Artikel und Absaetze
2
BSI-TR Parsing: Extrahiert Pruefaspekte (O.Auth_1, O.Sess_1, etc.) aus den TR-Dokumenten
3
Datenbank-Speicherung: Jede Anforderung wird als Requirement in der Compliance-DB gespeichert
✓
Audit-Workspace: Anforderungen koennen mit Implementierungsdetails angereichert werden
)} {/* Results Tab */} {activeTab === 'logs' && (

Letzte Ergebnisse

{results.length === 0 ? (
Keine Ergebnisse vorhanden. Starte einen Scrape-Vorgang.
) : (
{results.map((result, idx) => (
{result.error ? '❌' : result.reason ? '⏭️' : '✅'} {result.code} {result.error || result.reason || `${result.requirements_extracted} Anforderungen`}
))}
)}
)}
)} {/* System Info Section */}
)
}

// Source Card Component
/**
 * Card for a single regulation source.
 *
 * Looks up the badge for the source's regulation type and source type,
 * falling back to the `industry_standard` and `manual` entries for unknown keys.
 */
function SourceCard({ source, onScrape, scraping }: {
  source: Source
  onScrape: (code: string, force: boolean) => void
  scraping: boolean
}) {
  const regType = regulationTypeBadge[source.regulation_type] || regulationTypeBadge.industry_standard
  const srcType = sourceTypeBadge[source.source_type] || sourceTypeBadge.manual

  return (
{regType.icon}
{source.code} {regType.label} {srcType.label}
{source.url.length > 60 ? source.url.substring(0, 60) + '...' : source.url}
{source.has_data ? ( {source.requirement_count} Anforderungen ) : ( Keine Daten )}
{source.has_data && ( )}
)
}