fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
502
backend/klausur/services/roster_parser.py
Normal file
502
backend/klausur/services/roster_parser.py
Normal file
@@ -0,0 +1,502 @@
|
||||
"""
|
||||
Roster Parser Service - Klassenbuch und Schuelerlisten parsen.
|
||||
|
||||
Unterstuetzt:
|
||||
- Klassenbuch-Fotos (OCR mit PaddleOCR)
|
||||
- PDF-Schuelerlisten (SchILD, ASV, etc.)
|
||||
- CSV-Dateien
|
||||
- Manuelle Eingabe
|
||||
|
||||
Privacy-First:
|
||||
- Alle Verarbeitung serverseitig (kein externer Upload)
|
||||
- Daten bleiben im Lehrer-Namespace
|
||||
"""
|
||||
|
||||
import re
|
||||
import csv
|
||||
import io
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Tuple
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# Optionale Imports
|
||||
try:
|
||||
from services.file_processor import get_file_processor, ProcessingResult
|
||||
HAS_OCR = True
|
||||
except ImportError:
|
||||
HAS_OCR = False
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
HAS_PDF = True
|
||||
except ImportError:
|
||||
HAS_PDF = False
|
||||
|
||||
|
||||
@dataclass
class RosterEntry:
    """A single student entry in a class roster."""
    first_name: str
    last_name: str
    # School-internal student ID, if the source provides one.
    student_number: Optional[str] = None
    parent_email: Optional[str] = None
    parent_phone: Optional[str] = None
    # Kept as the raw string found in the source (e.g. "01.02.2010"); not normalized.
    birth_date: Optional[str] = None
    # Catch-all for source columns that have no dedicated field.
    additional_data: Dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ParsedRoster:
    """Result of parsing a roster from one source."""
    entries: List[RosterEntry]
    # Origin of the data: 'klassenbuch', 'pdf', or 'csv'.
    source_type: str
    # Parser confidence in [0.0, 1.0]; 0.0 when nothing could be extracted.
    confidence: float
    warnings: List[str] = field(default_factory=list)
    # Raw extracted text, retained for debugging/manual review.
    raw_text: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class NameMatch:
    """Result of matching one detected name against the roster."""
    detected_name: str
    # None when no roster entry met the matching threshold.
    matched_entry: Optional[RosterEntry]
    confidence: float
    # How the match was made: 'exact', 'first_name' (prefix), 'fuzzy', or 'none'.
    match_type: str
|
||||
|
||||
|
||||
class RosterParser:
    """
    Parses class rosters from various sources.

    Example:
        parser = RosterParser()

        # Class-register photo
        roster = parser.parse_klassenbuch_image(image_bytes)

        # PDF roster
        roster = parser.parse_pdf_roster(pdf_bytes)

        # Match names
        matches = parser.match_first_names(
            detected=["Max", "Anna", "Tim"],
            roster=roster.entries
        )
    """

    # Regex patterns for contact data.
    EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    # German phone numbers: +49 or leading 0, with optional space/dot/dash separators.
    PHONE_PATTERN = re.compile(r'(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}')
    # DD.MM.YY(YY) dates as written in German documents.
    DATE_PATTERN = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{2,4})\b')

    # Common German first names (subset, currently unused by the parsing
    # methods below — presumably intended for plausibility validation).
    COMMON_FIRST_NAMES = {
        'max', 'anna', 'tim', 'lena', 'paul', 'marie', 'felix', 'emma',
        'leon', 'sophia', 'lukas', 'mia', 'jonas', 'hannah', 'elias', 'emilia',
        'ben', 'lea', 'noah', 'lina', 'finn', 'amelie', 'luis', 'laura',
        'moritz', 'clara', 'henry', 'julia', 'julian', 'emily', 'david', 'johanna',
        'niklas', 'charlotte', 'simon', 'maja', 'alexander', 'sarah', 'jan', 'lisa',
        'tom', 'nele', 'luca', 'sophie', 'erik', 'alina', 'fabian', 'paula',
        'philipp', 'luisa', 'tobias', 'melina', 'vincent', 'lara', 'maximilian', 'elena'
    }

    def __init__(self):
        # OCR backend is optional; None when PaddleOCR is not installed.
        self.file_processor = get_file_processor() if HAS_OCR else None

    # =========================================================================
    # KLASSENBUCH (CLASS-REGISTER) PHOTO PARSING
    # =========================================================================

    def parse_klassenbuch_image(self, image_bytes: bytes) -> ParsedRoster:
        """
        Parse a class-register photo via OCR.

        Args:
            image_bytes: Image as bytes (PNG, JPG)

        Returns:
            ParsedRoster with the extracted student data
        """
        # Graceful degradation: return an empty roster with a warning
        # instead of raising when OCR is unavailable.
        if not HAS_OCR or not self.file_processor:
            return ParsedRoster(
                entries=[],
                source_type='klassenbuch',
                confidence=0.0,
                warnings=['OCR nicht verfuegbar (PaddleOCR nicht installiert)']
            )

        # Run OCR. NOTE(review): relies on ProcessingResult exposing
        # .text and .confidence — confirm against file_processor's contract.
        result: ProcessingResult = self.file_processor.process_file(
            image_bytes,
            filename='klassenbuch.png',
            processing_mode='ocr_handwriting'
        )

        # Split the OCR text into lines and parse each one independently.
        lines = result.text.split('\n')
        entries = []
        warnings = []

        for line in lines:
            line = line.strip()
            # Skip blanks and fragments too short to contain a name.
            if not line or len(line) < 3:
                continue

            entry = self._parse_roster_line(line)
            if entry:
                entries.append(entry)

        return ParsedRoster(
            entries=entries,
            source_type='klassenbuch',
            # Confidence is taken directly from the OCR result.
            confidence=result.confidence,
            warnings=warnings,
            raw_text=result.text
        )

    def _parse_roster_line(self, line: str) -> Optional[RosterEntry]:
        """Parse a single class-register line into a RosterEntry, or None."""
        # Collapse whitespace.
        line = re.sub(r'\s+', ' ', line).strip()

        # Strip a leading list number (e.g. "1. Max Mustermann").
        line = re.sub(r'^\d+[\.\)\s]+', '', line)

        # Extract the email address and remove it from the line so it
        # cannot be mistaken for part of the name.
        email_match = self.EMAIL_PATTERN.search(line)
        email = email_match.group() if email_match else None
        if email:
            line = line.replace(email, '')

        # Extract the phone number, same approach.
        phone_match = self.PHONE_PATTERN.search(line)
        phone = phone_match.group() if phone_match else None
        if phone:
            line = line.replace(phone, '')

        # Extract the birth date, same approach.
        date_match = self.DATE_PATTERN.search(line)
        birth_date = date_match.group() if date_match else None
        if birth_date:
            line = line.replace(birth_date, '')

        # Whatever remains is treated as the name.
        line = re.sub(r'\s+', ' ', line).strip()
        if not line:
            return None

        first_name, last_name = self._parse_name(line)
        # A first name is the minimum requirement for a valid entry.
        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone,
            birth_date=birth_date
        )

    def _parse_name(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Split a name into first and last name.

        Supported formats:
            - "Max Mustermann"
            - "Mustermann, Max"
            - "Max M."
            - "Max"
        """
        text = text.strip()
        if not text:
            return None, None

        # Format: "Lastname, Firstname" — the comma is authoritative.
        if ',' in text:
            parts = text.split(',', 1)
            last_name = parts[0].strip()
            first_name = parts[1].strip() if len(parts) > 1 else ''
            return first_name, last_name

        # Format: "Firstname Lastname" or bare "Firstname".
        parts = text.split()
        if len(parts) == 1:
            return parts[0], None
        elif len(parts) == 2:
            return parts[0], parts[1]
        else:
            # First token is the first name; the rest (e.g. compound
            # surnames) is joined into the last name.
            return parts[0], ' '.join(parts[1:])

    # =========================================================================
    # PDF ROSTER PARSING
    # =========================================================================

    def parse_pdf_roster(self, pdf_bytes: bytes) -> ParsedRoster:
        """
        Parse a PDF student roster.

        Supports common school-administration exports:
            - SchILD-NRW
            - ASV (Bavaria)
            - Untis
            - Generic CSV-in-PDF
        """
        # Graceful degradation when PyMuPDF is missing.
        if not HAS_PDF:
            return ParsedRoster(
                entries=[],
                source_type='pdf',
                confidence=0.0,
                warnings=['PDF-Parsing nicht verfuegbar (PyMuPDF nicht installiert)']
            )

        entries = []
        warnings = []
        raw_text = ''

        try:
            doc = fitz.open(stream=pdf_bytes, filetype='pdf')

            for page in doc:
                text = page.get_text()
                raw_text += text + '\n'

                # Prefer structured table extraction when available.
                # NOTE(review): relies on find_tables() supporting both
                # iteration and truthiness — confirm against the installed
                # PyMuPDF version, and on table.to_pandas() (needs pandas).
                tables = page.find_tables()
                for table in tables:
                    df = table.to_pandas()
                    for _, row in df.iterrows():
                        entry = self._parse_table_row(row.to_dict())
                        if entry:
                            entries.append(entry)

                # Fallback: no tables on this page — parse line by line.
                if not tables:
                    for line in text.split('\n'):
                        entry = self._parse_roster_line(line)
                        if entry:
                            entries.append(entry)

            doc.close()

        except Exception as e:
            # Best-effort: surface the failure as a warning, keep whatever
            # was extracted before the error.
            warnings.append(f'PDF-Parsing Fehler: {str(e)}')

        # Remove duplicates (same student may appear on multiple pages).
        entries = self._deduplicate_entries(entries)

        # Fixed heuristic confidence: PDFs are machine-generated, so any
        # successful extraction is considered highly reliable.
        return ParsedRoster(
            entries=entries,
            source_type='pdf',
            confidence=0.9 if entries else 0.0,
            warnings=warnings,
            raw_text=raw_text
        )

    def _parse_table_row(self, row: Dict) -> Optional[RosterEntry]:
        """Map one table/CSV row to a RosterEntry via fuzzy column names, or None."""
        # Column-name fragments recognized across the supported export formats.
        name_columns = ['name', 'schueler', 'schüler', 'student', 'nachname', 'last_name']
        first_name_columns = ['vorname', 'first_name', 'firstname']
        email_columns = ['email', 'e-mail', 'mail', 'eltern_email', 'parent_email']
        phone_columns = ['telefon', 'phone', 'tel', 'handy', 'mobile', 'eltern_tel']

        first_name = None
        last_name = None
        email = None
        phone = None

        for key, value in row.items():
            # Skip empty cells (including NaN-like values stringifying to '').
            if not value or str(value).strip() == '':
                continue

            key_lower = str(key).lower()
            value_str = str(value).strip()

            # Check the dedicated first-name columns before the generic name
            # columns: 'vorname' would otherwise also hit the 'name' fragment.
            if any(col in key_lower for col in first_name_columns):
                first_name = value_str
            elif any(col in key_lower for col in name_columns):
                # Could be "Firstname Lastname" or just "Lastname".
                if first_name:
                    last_name = value_str
                else:
                    first_name, last_name = self._parse_name(value_str)
            elif any(col in key_lower for col in email_columns):
                # Only accept values that actually look like an email.
                if self.EMAIL_PATTERN.match(value_str):
                    email = value_str
            elif any(col in key_lower for col in phone_columns):
                phone = value_str

        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone
        )

    # =========================================================================
    # CSV PARSING
    # =========================================================================

    def parse_csv_roster(self, csv_content: str) -> ParsedRoster:
        """
        Parse a CSV student roster.

        Args:
            csv_content: CSV data as a string

        Returns:
            ParsedRoster
        """
        entries = []
        warnings = []

        try:
            # Sniff the delimiter from the first 1 KiB.
            dialect = csv.Sniffer().sniff(csv_content[:1024])
            reader = csv.DictReader(io.StringIO(csv_content), dialect=dialect)

            for row in reader:
                entry = self._parse_table_row(row)
                if entry:
                    entries.append(entry)

        except csv.Error as e:
            warnings.append(f'CSV-Parsing Fehler: {str(e)}')

            # Fallback: line-by-line free-text parsing.
            # NOTE(review): this also feeds the header line through
            # _parse_roster_line, which may yield a bogus entry — confirm.
            for line in csv_content.split('\n'):
                entry = self._parse_roster_line(line)
                if entry:
                    entries.append(entry)

        # Fixed heuristic confidence for structured CSV input.
        return ParsedRoster(
            entries=entries,
            source_type='csv',
            confidence=0.95 if entries else 0.0,
            warnings=warnings,
            raw_text=csv_content
        )

    # =========================================================================
    # NAME MATCHING
    # =========================================================================

    def match_first_names(
        self,
        detected: List[str],
        roster: List[RosterEntry],
        threshold: float = 0.7
    ) -> List[NameMatch]:
        """
        Match detected first names to roster entries.

        Each roster entry is consumed at most once (greedy, in the order
        the detected names are given).

        Args:
            detected: List of detected first names (e.g. ["Max", "Anna"])
            roster: Complete student roster
            threshold: Minimum confidence for prefix/fuzzy matching

        Returns:
            List of NameMatch objects, one per detected name
        """
        matches = []
        # Indices of roster entries already claimed by an earlier name.
        used_entries = set()

        for name in detected:
            name_lower = name.lower().strip()
            best_match = None
            best_confidence = 0.0
            match_type = 'none'

            for i, entry in enumerate(roster):
                if i in used_entries:
                    continue

                entry_first_lower = entry.first_name.lower().strip()

                # Exact match — take it immediately and stop scanning.
                if name_lower == entry_first_lower:
                    best_match = entry
                    best_confidence = 1.0
                    match_type = 'exact'
                    used_entries.add(i)
                    break

                # Prefix match (e.g. "Max" matches "Maximilian"); confidence
                # is the length ratio of the shorter to the longer name.
                if entry_first_lower.startswith(name_lower) or name_lower.startswith(entry_first_lower):
                    confidence = min(len(name_lower), len(entry_first_lower)) / max(len(name_lower), len(entry_first_lower))
                    if confidence > best_confidence and confidence >= threshold:
                        best_match = entry
                        best_confidence = confidence
                        match_type = 'first_name'

                # Fuzzy match via difflib similarity ratio.
                ratio = SequenceMatcher(None, name_lower, entry_first_lower).ratio()
                if ratio > best_confidence and ratio >= threshold:
                    best_match = entry
                    best_confidence = ratio
                    match_type = 'fuzzy'

            if best_match and match_type != 'exact':
                # Mark the winning entry as used (exact matches were marked
                # inside the loop). Identity scan because only the entry
                # object, not its index, was kept.
                for i, entry in enumerate(roster):
                    if entry is best_match:
                        used_entries.add(i)
                        break

            matches.append(NameMatch(
                detected_name=name,
                matched_entry=best_match,
                confidence=best_confidence,
                match_type=match_type
            ))

        return matches

    # =========================================================================
    # HELPERS
    # =========================================================================

    def _deduplicate_entries(self, entries: List[RosterEntry]) -> List[RosterEntry]:
        """Remove duplicates by case-insensitive (first name, last name), keeping first occurrence."""
        seen = set()
        unique = []

        for entry in entries:
            key = (entry.first_name.lower(), entry.last_name.lower())
            if key not in seen:
                seen.add(key)
                unique.append(entry)

        return unique

    def validate_entry(self, entry: RosterEntry) -> List[str]:
        """Validate a RosterEntry and return a list of warning strings (empty when valid)."""
        warnings = []

        # Check the first name.
        if not entry.first_name:
            warnings.append('Kein Vorname')
        elif len(entry.first_name) < 2:
            warnings.append('Vorname zu kurz')

        # Validate the email address, if present.
        if entry.parent_email and not self.EMAIL_PATTERN.match(entry.parent_email):
            warnings.append('Ungueltige Email-Adresse')

        return warnings
|
||||
|
||||
|
||||
# Lazily-created module-level singleton instance.
_roster_parser: Optional[RosterParser] = None


def get_roster_parser() -> RosterParser:
    """Return the shared RosterParser instance, constructing it on first use."""
    global _roster_parser
    parser = _roster_parser
    if parser is None:
        parser = RosterParser()
        _roster_parser = parser
    return parser
|
||||
Reference in New Issue
Block a user