'use client'

/**
 * RAG Pipeline Page
 *
 * Document indexing for semantic search.
 * Part of the AI data pipeline:
 * OCR labeling → RAG pipeline → Data & RAG
 */

import { useState, useEffect } from 'react'
import { PagePurpose } from '@/components/common/PagePurpose'
import { AIModuleSidebarResponsive } from '@/components/ai/AIModuleSidebar'

// ============================================================================
// TYPES
// ============================================================================

/** A single indexing job as delivered by the `action=jobs` endpoint. */
interface TrainingJob {
  id: string
  name: string
  model_type: 'zeugnis' | 'klausur' | 'general'
  status: 'queued' | 'preparing' | 'training' | 'validating' | 'completed' | 'failed' | 'paused'
  progress: number
  current_epoch: number
  total_epochs: number
  loss: number
  val_loss: number
  learning_rate: number
  documents_processed: number
  total_documents: number
  started_at: string | null
  estimated_completion: string | null
  error_message: string | null
  metrics: TrainingMetrics
  config: TrainingConfig
}

/** Quality metrics attached to a job. */
interface TrainingMetrics {
  precision: number
  recall: number
  f1_score: number
  accuracy: number
  loss_history: number[]
  val_loss_history: number[]
  confusion_matrix?: number[][]
}

/** Tunable parameters of a job; `createTrainingJob` fills in defaults for missing ones. */
interface TrainingConfig {
  batch_size: number
  learning_rate: number
  epochs: number
  warmup_steps: number
  weight_decay: number
  gradient_accumulation: number
  mixed_precision: boolean
  bundeslaender: string[]
}

/** Aggregated corpus statistics as delivered by the `action=dataset-stats` endpoint. */
interface DatasetStats {
  total_documents: number
  total_chunks: number
  training_allowed: number
  /** Document count per federal-state code (e.g. `ni`, `by`). */
  by_bundesland: Record<string, number>
  /** Document count per document type (e.g. `verordnung`). */
  by_doc_type: Record<string, number>
}

/** One ingestion source and the vector collection it is stored in. */
interface DataSource {
  id: string
  name: string
  description: string
  collection: string
  document_count: number
  chunk_count: number
  last_updated: string | null
  status: 'active' | 'pending' | 'error'
}

// ============================================================================
// MOCK DATA
// ============================================================================

/** Fallback job list used when the backend cannot be reached. */
const MOCK_JOBS: TrainingJob[] = []

/** Fallback statistics used when the backend cannot be reached. */
const MOCK_STATS: DatasetStats = {
  total_documents: 632,
  total_chunks: 8547,
  training_allowed: 489,
  by_bundesland: {
    ni: 87,
    by: 92,
    nw: 78,
    he: 65,
    bw: 71,
    rp: 43,
    sn: 38,
    sh: 34,
    th: 29,
  },
  by_doc_type: {
    verordnung: 312,
    schulordnung: 156,
    handreichung: 98,
    erlass: 66,
  },
}

/** Static list of data sources shown on the page. */
const MOCK_DATA_SOURCES: DataSource[] = [
  {
    id: 'nibis',
    name: 'NiBiS Erwartungshorizonte',
    description: 'Offizielle Abitur-Erwartungshorizonte vom Niedersaechsischen Bildungsserver',
    collection: 'bp_nibis_eh',
    document_count: 245,
    chunk_count: 3200,
    last_updated: '2025-01-15T10:30:00Z',
    status: 'active',
  },
  {
    id: 'user_eh',
    name: 'Benutzerdefinierte EH',
    description: 'Von Lehrern hochgeladene schulspezifische Erwartungshorizonte',
    collection: 'bp_eh',
    document_count: 87,
    chunk_count: 1100,
    last_updated: '2025-01-20T14:15:00Z',
    status: 'active',
  },
  {
    id: 'legal',
    name: 'Rechtskorpus',
    description: 'DSGVO, AI Act, BSI-Standards und weitere Compliance-Regelwerke',
    collection: 'bp_legal_corpus',
    document_count: 19,
    chunk_count: 2400,
    last_updated: '2025-01-10T08:00:00Z',
    status: 'active',
  },
  {
    id: 'dsfa',
    name: 'DSFA-Guidance',
    description: 'WP248, DSK Kurzpapiere, Muss-Listen aller Bundeslaender mit Quellenattribution',
    collection: 'bp_dsfa_corpus',
    document_count: 45,
    chunk_count: 850,
    last_updated: '2026-02-09T10:00:00Z',
    status: 'active',
  },
  {
    id: 'schulordnungen',
    name: 'Schulordnungen',
    description: 'Landesschulordnungen und Zeugnisverordnungen aller Bundeslaender',
    collection: 'bp_schulordnungen',
    document_count: 156,
    chunk_count: 1847,
    last_updated: null,
    status: 'pending',
  },
]

// ============================================================================
// API FUNCTIONS
// ============================================================================

/**
 * Picks a usable numeric setting or falls back to a default.
 *
 * Missing and non-finite values (e.g. `NaN` from `parseInt('')`) always fall
 * back. Zero falls back too unless `allowZero` is set, because a zero batch
 * size or epoch count is meaningless while zero warm-up or weight decay is a
 * legitimate choice.
 */
function numberOrDefault(
  value: number | undefined,
  fallback: number,
  options: { allowZero?: boolean } = {},
): number {
  if (value === undefined || !Number.isFinite(value)) return fallback
  if (value === 0 && !options.allowZero) return fallback
  return value
}

/** Minimal structural check covering the fields the page dereferences directly. */
function isDatasetStats(payload: unknown): payload is DatasetStats {
  if (typeof payload !== 'object' || payload === null) return false
  const candidate = payload as Record<string, unknown>
  return (
    typeof candidate.total_documents === 'number' &&
    typeof candidate.total_chunks === 'number' &&
    typeof candidate.training_allowed === 'number' &&
    typeof candidate.by_bundesland === 'object' &&
    candidate.by_bundesland !== null
  )
}

/**
 * Extracts the `detail` field from an error response.
 *
 * @returns the detail text, or `undefined` when the body is not JSON or
 *          carries no usable detail.
 */
async function readErrorDetail(response: Response): Promise<string | undefined> {
  try {
    const payload: unknown = await response.json()
    if (typeof payload !== 'object' || payload === null) return undefined
    const detail = (payload as { detail?: unknown }).detail
    if (typeof detail === 'string') return detail === '' ? undefined : detail
    // Structured details (objects/arrays) are serialised so they stay readable.
    if (typeof detail === 'object' && detail !== null) return JSON.stringify(detail)
    return undefined
  } catch {
    // Body was empty or not JSON (e.g. an HTML gateway error page).
    return undefined
  }
}

/**
 * Sends a state-changing POST for one job.
 *
 * @throws Error `Failed to <action> job` when the response is not OK.
 */
async function postJobAction(action: 'pause' | 'resume' | 'cancel', jobId: string): Promise<void> {
  // The id is encoded so characters such as `&` or `=` cannot alter the query.
  const query = `action=${action}&job_id=${encodeURIComponent(jobId)}`
  const response = await fetch(`/api/ai/rag-pipeline?${query}`, {
    method: 'POST',
  })
  if (!response.ok) throw new Error(`Failed to ${action} job`)
}

/**
 * Loads the job list.
 *
 * Never rejects: on a network error, a non-OK response or a payload that is
 * not an array, the failure is logged and `MOCK_JOBS` is returned.
 */
async function fetchJobs(): Promise<TrainingJob[]> {
  try {
    const response = await fetch('/api/ai/rag-pipeline?action=jobs')
    if (!response.ok) throw new Error('Failed to fetch jobs')
    const payload: unknown = await response.json()
    // Callers iterate the result, so anything but an array is treated as a failure.
    if (!Array.isArray(payload)) throw new Error('Unexpected jobs payload')
    return payload as TrainingJob[]
  } catch (error) {
    console.error('Error fetching jobs:', error)
    return MOCK_JOBS
  }
}

/**
 * Loads the dataset statistics.
 *
 * Never rejects: on a network error, a non-OK response or a payload missing
 * the fields the page reads, the failure is logged and `MOCK_STATS` is returned.
 */
async function fetchDatasetStats(): Promise<DatasetStats> {
  try {
    const response = await fetch('/api/ai/rag-pipeline?action=dataset-stats')
    if (!response.ok) throw new Error('Failed to fetch stats')
    const payload: unknown = await response.json()
    if (!isDatasetStats(payload)) throw new Error('Unexpected stats payload')
    return payload
  } catch (error) {
    console.error('Error fetching stats:', error)
    return MOCK_STATS
  }
}

/**
 * Creates a new indexing job.
 *
 * @param config partial settings; missing or unusable values are replaced by defaults.
 * @returns the backend's JSON response for the created job.
 * @throws Error carrying the backend's `detail`, or `Failed to create job`
 *         when the error response has no readable detail.
 */
async function createTrainingJob(config: Partial<TrainingConfig>): Promise<{id: string, status: string}> {
  const response = await fetch('/api/ai/rag-pipeline?action=create-job', {
    method: 'POST',
    headers: { 'Content-Type': 'application/json' },
    body: JSON.stringify({
      name: `RAG-Index ${new Date().toLocaleDateString('de-DE')}`,
      model_type: 'zeugnis',
      bundeslaender: config.bundeslaender ?? [],
      batch_size: numberOrDefault(config.batch_size, 16),
      learning_rate: numberOrDefault(config.learning_rate, 0.00005),
      epochs: numberOrDefault(config.epochs, 10),
      warmup_steps: numberOrDefault(config.warmup_steps, 500, { allowZero: true }),
      weight_decay: numberOrDefault(config.weight_decay, 0.01, { allowZero: true }),
      gradient_accumulation: numberOrDefault(config.gradient_accumulation, 4),
      mixed_precision: config.mixed_precision ?? true,
    }),
  })

  if (!response.ok) {
    const detail = await readErrorDetail(response)
    throw new Error(detail ?? 'Failed to create job')
  }

  return await response.json()
}

/** Pauses a job. @throws Error `Failed to pause job` when the response is not OK. */
async function pauseJob(jobId: string): Promise<void> {
  await postJobAction('pause', jobId)
}

/** Resumes a job. @throws Error `Failed to resume job` when the response is not OK. */
async function resumeJob(jobId: string): Promise<void> {
  await postJobAction('resume', jobId)
}

/** Cancels a job. @throws Error `Failed to cancel job` when the response is not OK. */
async function cancelJob(jobId: string): Promise<void> {
  await postJobAction('cancel', jobId)
}

// ============================================================================ // COMPONENTS // ============================================================================ // Tab Button function TabButton({ active, onClick, children 
}: { active: boolean onClick: () => void children: React.ReactNode }) { return ( ) } // Progress Ring Component function ProgressRing({ progress, size = 120, strokeWidth = 8, color = '#10B981' }: { progress: number size?: number strokeWidth?: number color?: string }) { const radius = (size - strokeWidth) / 2 const circumference = radius * 2 * Math.PI const offset = circumference - (progress / 100) * circumference return (
{Math.round(progress)}%
) } // Mini Line Chart Component function MiniChart({ data, color = '#10B981', height = 60 }: { data: number[] color?: string height?: number }) { if (!data.length) return null const max = Math.max(...data) const min = Math.min(...data) const range = max - min || 1 const width = 200 const padding = 4 const points = data.map((value, i) => { const x = padding + (i / (data.length - 1)) * (width - 2 * padding) const y = padding + (1 - (value - min) / range) * (height - 2 * padding) return `${x},${y}` }).join(' ') return ( {data.length > 0 && ( )} ) } // Status Badge function StatusBadge({ status }: { status: TrainingJob['status'] }) { const styles = { queued: 'bg-gray-100 text-gray-800 dark:bg-gray-700 dark:text-gray-300', preparing: 'bg-yellow-100 text-yellow-800 dark:bg-yellow-900 dark:text-yellow-200', training: 'bg-blue-100 text-blue-800 dark:bg-blue-900 dark:text-blue-200', validating: 'bg-purple-100 text-purple-800 dark:bg-purple-900 dark:text-purple-200', completed: 'bg-green-100 text-green-800 dark:bg-green-900 dark:text-green-200', failed: 'bg-red-100 text-red-800 dark:bg-red-900 dark:text-red-200', paused: 'bg-orange-100 text-orange-800 dark:bg-orange-900 dark:text-orange-200', } const labels = { queued: 'In Warteschlange', preparing: 'Vorbereitung', training: 'Indexierung laeuft', validating: 'Validierung', completed: 'Abgeschlossen', failed: 'Fehlgeschlagen', paused: 'Pausiert', } return ( {status === 'training' && ( )} {labels[status]} ) } // Metric Card function MetricCard({ label, value, trend, color }: { label: string value: number | string trend?: 'up' | 'down' | 'neutral' color?: string }) { return (

{label}

{typeof value === 'number' ? value.toFixed(3) : value} {trend && ( {trend === 'up' ? '\u2191' : trend === 'down' ? '\u2193' : '\u2192'} )}
) } // Training Job Card function TrainingJobCard({ job, onPause, onResume, onStop, onViewDetails }: { job: TrainingJob onPause: () => void onResume: () => void onStop: () => void onViewDetails: () => void }) { const isActive = ['training', 'preparing', 'validating'].includes(job.status) return (

{job.name}

Typ: {job.model_type.charAt(0).toUpperCase() + job.model_type.slice(1)}

Durchlauf {job.current_epoch} / {job.total_epochs}
Dokumente {job.documents_processed.toLocaleString()} / {job.total_documents.toLocaleString()}
Fortschritt
Gestartet: {job.started_at ? new Date(job.started_at).toLocaleTimeString('de-DE') : '-'} Geschaetzt: {job.estimated_completion ? new Date(job.estimated_completion).toLocaleTimeString('de-DE') : '-' }
{isActive && ( <> )}
) } // Dataset Overview Component function DatasetOverview({ stats }: { stats: DatasetStats }) { const maxBundesland = Math.max(...Object.values(stats.by_bundesland)) return (

Datensatz-Uebersicht

{stats.total_documents.toLocaleString()}

Dokumente

{stats.total_chunks.toLocaleString()}

Chunks

{stats.training_allowed.toLocaleString()}

Indexiert

Verteilung nach Bundesland

{Object.entries(stats.by_bundesland) .sort((a, b) => b[1] - a[1]) .map(([code, count]) => (
{code}
{count}
))}
) } // Architecture Diagram Component function ArchitectureTab() { return (
{/* What is this module */}

Was macht dieses Modul?

Das RAG-Indexierungs-Modul verarbeitet Dokumente und macht sie fuer die KI-gestuetzte Suche verfuegbar. Es handelt sich nicht um klassisches Machine-Learning-Training, sondern um:

  • 1. Dokumentenextraktion: PDFs und Bilder werden per OCR in Text umgewandelt
  • 2. Chunking: Lange Texte werden in suchbare Abschnitte (1000 Zeichen) aufgeteilt
  • 3. Embedding: Jeder Chunk wird in einen Vektor (1536 Dimensionen) umgewandelt
  • 4. Indexierung: Vektoren werden in Qdrant gespeichert fuer semantische Suche
{/* Architecture Diagram */}

Technische Architektur

{/* Visual Pipeline */}
{/* Data Sources Row */}
📄
NiBiS PDFs
Erwartungshorizonte
📤
Uploads
Eigene EH
⚖️
Rechtskorpus
DSGVO, AI Act
📚
Schulordnungen
Bundeslaender
{/* Arrow Down */}
↓
{/* Processing Layer */}

Verarbeitungs-Pipeline

🔍
OCR
Text-Extraktion
→
✂️
Chunking
1000 Zeichen
→
🧮
Embedding
1536-dim Vektor
→
💾
Speichern
Qdrant
{/* Arrow Down */}
↓
{/* Storage Layer */}

Vektor-Datenbank (Qdrant)

bp_nibis_eh
Offizielle EH
bp_eh
Benutzer EH
bp_legal_corpus
Rechtskorpus
{/* Arrow Down */}
↓
{/* Usage Layer */}

Semantische Suche

Fragen werden in Vektoren umgewandelt und aehnliche Dokumente gefunden

RAG-Antworten

LLM generiert Antworten basierend auf gefundenen Dokumenten

{/* Technical Details */}

Technische Details

Embedding-Service

Modell text-embedding-3-small
Dimensionen 1536
Port 8087

Chunk-Konfiguration

Chunk-Groesse 1000 Zeichen
Ueberlappung 200 Zeichen
Distanzmetrik COSINE
) } // Data Sources Tab Component function DataSourcesTab({ sources }: { sources: DataSource[] }) { return (
{/* Introduction */}

Wie werden Daten hinzugefuegt?

Das RAG-System nutzt verschiedene Datenquellen. Jede Quelle hat einen eigenen Ingestion-Prozess:

Automatisch

NiBiS-PDFs werden automatisch aus dem za-download Verzeichnis eingelesen

Manuell

Eigene EH koennen ueber die Klausur-Korrektur hochgeladen werden

{/* Data Sources List */}
{sources.map((source) => (

{source.name}

{source.status === 'active' ? 'Aktiv' : source.status === 'pending' ? 'Ausstehend' : 'Fehler'}

{source.description}

Collection: {source.collection}
Dokumente: {source.document_count}
Chunks: {source.chunk_count}
{source.last_updated && (
Aktualisiert: {new Date(source.last_updated).toLocaleDateString('de-DE')}
)}
))}
{/* How to add data */}

Daten hinzufuegen

📤

Erwartungshorizont hochladen

Laden Sie eigene EH-Dokumente in der Klausur-Korrektur hoch

Zur Klausur-Korrektur →
🔄

NiBiS neu einlesen

Starten Sie die automatische Ingestion der NiBiS-PDFs

⚖️

Rechtskorpus erweitern

Neue Regelwerke (DSGVO, BSI, etc.) zum Korpus hinzufuegen

📋

DSFA-Quellen verwalten

WP248, DSK, Muss-Listen mit Lizenzattribution

DSFA-Manager oeffnen →
) } // New Training Modal function NewTrainingModal({ isOpen, onClose, onSubmit }: { isOpen: boolean onClose: () => void onSubmit: (config: Partial) => void }) { const [step, setStep] = useState(1) const [config, setConfig] = useState>({ batch_size: 16, learning_rate: 0.00005, epochs: 10, warmup_steps: 500, weight_decay: 0.01, gradient_accumulation: 4, mixed_precision: true, bundeslaender: [], }) if (!isOpen) return null const bundeslaender = [ { code: 'ni', name: 'Niedersachsen', allowed: true }, { code: 'by', name: 'Bayern', allowed: true }, { code: 'nw', name: 'NRW', allowed: true }, { code: 'he', name: 'Hessen', allowed: true }, { code: 'bw', name: 'Baden-Wuerttemberg', allowed: true }, { code: 'rp', name: 'Rheinland-Pfalz', allowed: true }, { code: 'sn', name: 'Sachsen', allowed: true }, { code: 'sh', name: 'Schleswig-Holstein', allowed: true }, { code: 'th', name: 'Thueringen', allowed: true }, { code: 'be', name: 'Berlin', allowed: false }, { code: 'bb', name: 'Brandenburg', allowed: false }, { code: 'hb', name: 'Bremen', allowed: false }, { code: 'hh', name: 'Hamburg', allowed: false }, { code: 'mv', name: 'Mecklenburg-Vorpommern', allowed: false }, { code: 'sl', name: 'Saarland', allowed: false }, { code: 'st', name: 'Sachsen-Anhalt', allowed: false }, ] return (

Neue Indexierung starten

Schritt {step} von 3

{[1, 2, 3].map((s) => (
{s < step ? '\u2713' : s}
{s < 3 && (
)}
))}
Daten Parameter Bestaetigen
{step === 1 && (

Waehlen Sie die Bundeslaender fuer die Indexierung

Nur Bundeslaender mit verfuegbaren Dokumenten koennen ausgewaehlt werden.

{bundeslaender.map((bl) => ( ))}
)} {step === 2 && (

Indexierungs-Parameter

Diese Parameter steuern die Batch-Verarbeitung der Dokumente.

setConfig({ ...config, batch_size: parseInt(e.target.value) })} className="w-full px-3 py-2 border border-gray-300 dark:border-gray-600 rounded-lg bg-white dark:bg-gray-700" />

Dokumente pro Batch

setConfig({ ...config, epochs: parseInt(e.target.value) })} className="w-full px-3 py-2 border border-gray-300 dark:border-gray-600 rounded-lg bg-white dark:bg-gray-700" />

Fuer Validierung

setConfig({ ...config, mixed_precision: e.target.checked })} className="w-4 h-4 text-blue-600 rounded" />
)} {step === 3 && (

Konfiguration bestaetigen

Bundeslaender {config.bundeslaender?.length || 0} ausgewaehlt
Batch Size {config.batch_size}
Parallele Verarbeitung {config.mixed_precision ? 'Aktiviert' : 'Deaktiviert'}

Was passiert: Die ausgewaehlten Dokumente werden extrahiert, in Chunks aufgeteilt, und als Vektoren in Qdrant indexiert. Dieser Prozess kann je nach Datenmenge einige Minuten dauern.

)}
) } // ============================================================================ // MAIN PAGE // ============================================================================ export default function TrainingDashboardPage() { const [activeTab, setActiveTab] = useState<'dashboard' | 'architecture' | 'sources'>('dashboard') const [jobs, setJobs] = useState([]) const [stats, setStats] = useState(MOCK_STATS) const [dataSources] = useState(MOCK_DATA_SOURCES) const [showNewTrainingModal, setShowNewTrainingModal] = useState(false) const [selectedJob, setSelectedJob] = useState(null) const [isLoading, setIsLoading] = useState(true) const [error, setError] = useState(null) useEffect(() => { async function loadData() { setIsLoading(true) try { const [jobsData, statsData] = await Promise.all([ fetchJobs(), fetchDatasetStats(), ]) setJobs(jobsData) setStats(statsData) setError(null) } catch (err) { console.error('Failed to load data:', err) setError('Verbindung zum Backend fehlgeschlagen') setJobs(MOCK_JOBS) setStats(MOCK_STATS) } finally { setIsLoading(false) } } loadData() }, []) useEffect(() => { const hasActiveJob = jobs.some(j => j.status === 'training' || j.status === 'preparing') if (!hasActiveJob) return const interval = setInterval(async () => { try { const updatedJobs = await fetchJobs() setJobs(updatedJobs) } catch (err) { console.error('Failed to refresh jobs:', err) } }, 2000) return () => clearInterval(interval) }, [jobs]) const handleStartTraining = async (config: Partial) => { try { await createTrainingJob(config) const updatedJobs = await fetchJobs() setJobs(updatedJobs) setShowNewTrainingModal(false) } catch (err) { console.error('Failed to start training:', err) setError(err instanceof Error ? 
err.message : 'Indexierung konnte nicht gestartet werden') } } const handlePauseJob = async (jobId: string) => { try { await pauseJob(jobId) const updatedJobs = await fetchJobs() setJobs(updatedJobs) } catch (err) { console.error('Failed to pause job:', err) } } const handleResumeJob = async (jobId: string) => { try { await resumeJob(jobId) const updatedJobs = await fetchJobs() setJobs(updatedJobs) } catch (err) { console.error('Failed to resume job:', err) } } const handleCancelJob = async (jobId: string) => { try { await cancelJob(jobId) const updatedJobs = await fetchJobs() setJobs(updatedJobs) } catch (err) { console.error('Failed to cancel job:', err) } } return (
{/* Header */}

RAG-Indexierung

Dokumente fuer die semantische Suche aufbereiten und indexieren

{activeTab === 'dashboard' && ( )}
{/* Page Purpose with Related Pages */} {/* Tabs */}
setActiveTab('dashboard')}> Dashboard setActiveTab('architecture')}> Architektur setActiveTab('sources')}> Datenquellen
{/* AI Module Sidebar - Desktop: Fixed, Mobile: FAB + Drawer */} {/* Error Banner */} {error && (
{error}
)} {/* Tab Content */} {activeTab === 'architecture' ? ( ) : activeTab === 'sources' ? ( ) : isLoading ? (

Lade Daten...

) : (
{/* Training Jobs */}
{jobs.length === 0 ? (

Keine aktive Indexierung

Starten Sie eine neue Indexierung, um Dokumente fuer die Suche aufzubereiten.

) : ( jobs.map(job => ( handlePauseJob(job.id)} onResume={() => handleResumeJob(job.id)} onStop={() => handleCancelJob(job.id)} onViewDetails={() => setSelectedJob(job)} /> )) )}
{/* Sidebar */}
{/* Quick Actions */}

Schnellaktionen

)}
{/* New Training Modal */} setShowNewTrainingModal(false)} onSubmit={handleStartTraining} />
) }