"""
Roster Parser Service - parse class registers and student lists.

Supported sources:
- Class register photos (OCR via the project's file processor)
- PDF student lists (SchILD, ASV, etc.)
- CSV files
- Manual input

Privacy first:
- All processing happens server-side (no external upload)
- Data stays inside the teacher namespace
"""

import re
import csv
import io
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Tuple
from difflib import SequenceMatcher

# Optional imports: each feature degrades gracefully when its backend is missing.
try:
    from services.file_processor import get_file_processor, ProcessingResult
    HAS_OCR = True
except ImportError:
    HAS_OCR = False

try:
    import fitz  # PyMuPDF
    HAS_PDF = True
except ImportError:
    HAS_PDF = False


@dataclass
class RosterEntry:
    """A single student entry of a roster."""
    first_name: str
    last_name: str
    student_number: Optional[str] = None
    parent_email: Optional[str] = None
    parent_phone: Optional[str] = None
    birth_date: Optional[str] = None
    additional_data: Dict[str, str] = field(default_factory=dict)


@dataclass
class ParsedRoster:
    """Result of parsing a roster from any source."""
    entries: List[RosterEntry]
    source_type: str  # klassenbuch, pdf, csv
    confidence: float
    warnings: List[str] = field(default_factory=list)
    raw_text: Optional[str] = None


@dataclass
class NameMatch:
    """Result of matching one detected name against the roster."""
    detected_name: str
    matched_entry: Optional[RosterEntry]
    confidence: float
    match_type: str  # exact, first_name, fuzzy, none


class RosterParser:
    """
    Parse class lists from different sources.

    Example:
        parser = RosterParser()

        # Class register photo
        roster = parser.parse_klassenbuch_image(image_bytes)

        # PDF list
        roster = parser.parse_pdf_roster(pdf_bytes)

        # Match names
        matches = parser.match_first_names(
            detected=["Max", "Anna", "Tim"],
            roster=roster.entries
        )
    """

    # Regex patterns for contact data
    EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    PHONE_PATTERN = re.compile(r'(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}')
    DATE_PATTERN = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{2,4})\b')

    # German first names (excerpt, intended for validation)
    COMMON_FIRST_NAMES = {
        'max', 'anna', 'tim', 'lena', 'paul', 'marie', 'felix', 'emma',
        'leon', 'sophia', 'lukas', 'mia', 'jonas', 'hannah', 'elias', 'emilia',
        'ben', 'lea', 'noah', 'lina', 'finn', 'amelie', 'luis', 'laura',
        'moritz', 'clara', 'henry', 'julia', 'julian', 'emily', 'david', 'johanna',
        'niklas', 'charlotte', 'simon', 'maja', 'alexander', 'sarah', 'jan', 'lisa',
        'tom', 'nele', 'luca', 'sophie', 'erik', 'alina', 'fabian', 'paula',
        'philipp', 'luisa', 'tobias', 'melina', 'vincent', 'lara', 'maximilian', 'elena'
    }

    # Column-name fragments (matched as lower-case substrings of the header).
    # Order of checks in _parse_table_row matters because 'name' is a substring
    # of 'vorname', 'nachname', 'first_name' and 'last_name'.
    STUDENT_NUMBER_COLUMNS = (
        'schuelernummer', 'schülernummer', 'schueler_nr', 'schüler_nr',
        'schueler-nr', 'schüler-nr', 'student_number', 'studentnumber', 'student_id',
    )
    FIRST_NAME_COLUMNS = ('vorname', 'first_name', 'firstname')
    LAST_NAME_COLUMNS = ('nachname', 'last_name', 'lastname')
    FULL_NAME_COLUMNS = ('name', 'schueler', 'schüler', 'student')
    EMAIL_COLUMNS = ('email', 'e-mail', 'mail', 'eltern_email', 'parent_email')
    PHONE_COLUMNS = ('telefon', 'phone', 'tel', 'handy', 'mobile', 'eltern_tel')

    # Delimiters the CSV sniffer may choose from; restricting the set keeps it
    # from picking an arbitrary frequent character.
    CSV_DELIMITERS = ',;\t|'

    def __init__(self):
        self.file_processor = get_file_processor() if HAS_OCR else None

    # =========================================================================
    # CLASS REGISTER PHOTO PARSING
    # =========================================================================

    def parse_klassenbuch_image(self, image_bytes: bytes) -> ParsedRoster:
        """
        Parse a class register photo via OCR.

        Args:
            image_bytes: Image as bytes (PNG, JPG)

        Returns:
            ParsedRoster with the extracted student data. When OCR is not
            available an empty roster with a warning is returned.
        """
        if not HAS_OCR or not self.file_processor:
            return ParsedRoster(
                entries=[],
                source_type='klassenbuch',
                confidence=0.0,
                warnings=['OCR nicht verfuegbar (PaddleOCR nicht installiert)']
            )

        # Run OCR
        result = self.file_processor.process_file(
            image_bytes,
            filename='klassenbuch.png',
            processing_mode='ocr_handwriting'
        )

        entries = []
        warnings = []

        # One roster entry per OCR line; very short lines are skipped as noise.
        for line in result.text.split('\n'):
            line = line.strip()
            if not line or len(line) < 3:
                continue

            entry = self._parse_roster_line(line)
            if entry:
                entries.append(entry)

        return ParsedRoster(
            entries=entries,
            source_type='klassenbuch',
            confidence=result.confidence,
            warnings=warnings,
            raw_text=result.text
        )

    def _parse_roster_line(self, line: str) -> Optional[RosterEntry]:
        """
        Parse a single free-text roster line.

        Contact data (email, phone, birth date) is extracted and removed first;
        whatever remains is interpreted as the name.

        Returns:
            A RosterEntry, or None when no first name could be extracted.
        """
        # Normalize whitespace
        line = re.sub(r'\s+', ' ', line).strip()

        # Strip a leading list number (e.g. "1. Max Mustermann")
        line = re.sub(r'^\d+[\.\)\s]+', '', line)

        # Extract email
        email_match = self.EMAIL_PATTERN.search(line)
        email = email_match.group() if email_match else None
        if email:
            line = line.replace(email, '')

        # Extract phone
        phone_match = self.PHONE_PATTERN.search(line)
        phone = phone_match.group() if phone_match else None
        if phone:
            line = line.replace(phone, '')

        # Extract birth date
        date_match = self.DATE_PATTERN.search(line)
        birth_date = date_match.group() if date_match else None
        if birth_date:
            line = line.replace(birth_date, '')

        # The remainder of the line is the name
        line = re.sub(r'\s+', ' ', line).strip()
        if not line:
            return None

        first_name, last_name = self._parse_name(line)
        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone,
            birth_date=birth_date
        )

    def _parse_name(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Split a name into first and last name.

        Supported formats:
        - "Max Mustermann"
        - "Mustermann, Max"
        - "Max M."
        - "Max"

        Returns:
            (first_name, last_name); (None, None) for empty input. The last
            name is None when only a single word is given.
        """
        text = text.strip()
        if not text:
            return None, None

        # Format: "last name, first name"
        if ',' in text:
            last_name, first_name = text.split(',', 1)
            return first_name.strip(), last_name.strip()

        # Format: "first last" or just "first"; everything after the first
        # word is treated as the last name.
        parts = text.split()
        if len(parts) == 1:
            return parts[0], None
        return parts[0], ' '.join(parts[1:])

    # =========================================================================
    # PDF ROSTER PARSING
    # =========================================================================

    def parse_pdf_roster(self, pdf_bytes: bytes) -> ParsedRoster:
        """
        Parse a PDF student list.

        Tables found on a page are parsed row by row; pages without any table
        are parsed line by line. Errors are recorded as warnings instead of
        being raised.

        Args:
            pdf_bytes: PDF document as bytes

        Returns:
            ParsedRoster with de-duplicated entries. When PyMuPDF is not
            available an empty roster with a warning is returned.
        """
        if not HAS_PDF:
            return ParsedRoster(
                entries=[],
                source_type='pdf',
                confidence=0.0,
                warnings=['PDF-Parsing nicht verfuegbar (PyMuPDF nicht installiert)']
            )

        entries = []
        warnings = []
        page_texts = []

        # Best-effort boundary: any parsing problem becomes a warning.
        try:
            doc = fitz.open(stream=pdf_bytes, filetype='pdf')
            try:
                for page in doc:
                    text = page.get_text()
                    page_texts.append(text)

                    # find_tables() returns a finder object that is always
                    # truthy; use its list of tables so "no tables" can be
                    # detected reliably.
                    finder = page.find_tables()
                    tables = list(getattr(finder, 'tables', finder))

                    for table in tables:
                        df = table.to_pandas()
                        for _, row in df.iterrows():
                            entry = self._parse_table_row(row.to_dict())
                            if entry:
                                entries.append(entry)

                    # No tables on this page: parse line by line
                    if not tables:
                        for line in text.split('\n'):
                            entry = self._parse_roster_line(line)
                            if entry:
                                entries.append(entry)
            finally:
                # Always release the document, even when a page fails.
                doc.close()
        except Exception as e:
            warnings.append(f'PDF-Parsing Fehler: {str(e)}')

        # Remove duplicates
        entries = self._deduplicate_entries(entries)

        return ParsedRoster(
            entries=entries,
            source_type='pdf',
            confidence=0.9 if entries else 0.0,
            warnings=warnings,
            raw_text=''.join(text + '\n' for text in page_texts)
        )

    @staticmethod
    def _column_matches(key_lower: str, fragments: Tuple[str, ...]) -> bool:
        """Return True when any fragment occurs in the lower-cased column name."""
        return any(fragment in key_lower for fragment in fragments)

    def _parse_table_row(self, row: Dict) -> Optional[RosterEntry]:
        """
        Convert one table row (column name -> value) into a RosterEntry.

        The result does not depend on column order: all cells are classified
        first and the name fields are resolved afterwards.

        Returns:
            A RosterEntry, or None when no first name could be determined.
        """
        first_name = None
        last_name = None
        full_name = None
        student_number = None
        email = None
        phone = None

        for key, value in row.items():
            if not value or str(value).strip() == '':
                continue

            key_lower = str(key).lower()
            value_str = str(value).strip()

            # Student-number headers contain 'schueler'/'student' and must be
            # recognized before the generic name columns.
            if self._column_matches(key_lower, self.STUDENT_NUMBER_COLUMNS):
                student_number = value_str
            elif self._column_matches(key_lower, self.FIRST_NAME_COLUMNS):
                first_name = value_str
            elif self._column_matches(key_lower, self.LAST_NAME_COLUMNS):
                last_name = value_str
            elif self._column_matches(key_lower, self.FULL_NAME_COLUMNS):
                full_name = value_str
            elif self._column_matches(key_lower, self.EMAIL_COLUMNS):
                if self.EMAIL_PATTERN.match(value_str):
                    email = value_str
            elif self._column_matches(key_lower, self.PHONE_COLUMNS):
                phone = value_str

        if first_name is None:
            if full_name is not None:
                # Generic name column may hold "first last" or "last, first".
                first_name, parsed_last = self._parse_name(full_name)
                last_name = last_name or parsed_last
            elif last_name is not None:
                # Only a surname column exists: interpret its content as a name.
                first_name, last_name = self._parse_name(last_name)
        elif last_name is None and full_name is not None:
            # A first-name column exists, so the generic name column carries
            # the surname - unless it repeats the full name.
            parsed_first, parsed_last = self._parse_name(full_name)
            if parsed_last and (parsed_first or '').lower() == first_name.lower():
                last_name = parsed_last
            else:
                last_name = full_name

        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            student_number=student_number,
            parent_email=email,
            parent_phone=phone
        )

    # =========================================================================
    # CSV PARSING
    # =========================================================================

    def parse_csv_roster(self, csv_content: str) -> ParsedRoster:
        """
        Parse a CSV student list.

        The delimiter is sniffed from the first 1024 characters. When the CSV
        cannot be parsed, a warning is recorded and the content is parsed line
        by line instead.

        Args:
            csv_content: CSV as string

        Returns:
            ParsedRoster
        """
        warnings = []

        try:
            # Guess the delimiter
            dialect = csv.Sniffer().sniff(
                csv_content[:1024], delimiters=self.CSV_DELIMITERS
            )
            reader = csv.DictReader(io.StringIO(csv_content), dialect=dialect)
            parsed = (self._parse_table_row(row) for row in reader)
            entries = [entry for entry in parsed if entry]
        except csv.Error as exc:
            warnings.append(f'CSV-Parsing Fehler: {str(exc)}')
            # Fallback: parse line by line. Rows parsed before the error are
            # discarded so they are not duplicated.
            parsed = (self._parse_roster_line(line) for line in csv_content.split('\n'))
            entries = [entry for entry in parsed if entry]

        return ParsedRoster(
            entries=entries,
            source_type='csv',
            confidence=0.95 if entries else 0.0,
            warnings=warnings,
            raw_text=csv_content
        )

    # =========================================================================
    # NAME MATCHING
    # =========================================================================

    def match_first_names(
        self,
        detected: List[str],
        roster: List[RosterEntry],
        threshold: float = 0.7
    ) -> List[NameMatch]:
        """
        Match detected first names to roster entries.

        Each roster entry is assigned at most once. Exact matches are assigned
        for all detected names first, so an earlier fuzzy hit cannot consume an
        entry that a later name matches exactly.

        Args:
            detected: Detected first names (e.g. ["Max", "Anna"])
            roster: Complete student list
            threshold: Minimum confidence for prefix and fuzzy matching

        Returns:
            One NameMatch per detected name, in input order.
        """
        detected_keys = [name.lower().strip() for name in detected]
        roster_keys = [entry.first_name.lower().strip() for entry in roster]
        used_entries = set()

        # Pass 1: exact (case-insensitive) matches.
        exact_hits: Dict[int, int] = {}
        for pos, key in enumerate(detected_keys):
            for idx, entry_key in enumerate(roster_keys):
                if idx not in used_entries and key == entry_key:
                    used_entries.add(idx)
                    exact_hits[pos] = idx
                    break

        # Pass 2: prefix and fuzzy matches for the remaining names.
        matches = []
        for pos, name in enumerate(detected):
            if pos in exact_hits:
                matches.append(NameMatch(
                    detected_name=name,
                    matched_entry=roster[exact_hits[pos]],
                    confidence=1.0,
                    match_type='exact'
                ))
                continue

            key = detected_keys[pos]
            best_index = None
            best_confidence = 0.0
            match_type = 'none'

            for idx, entry_key in enumerate(roster_keys):
                if idx in used_entries:
                    continue

                # Prefix match: confidence is the length ratio of the names.
                if entry_key.startswith(key) or key.startswith(entry_key):
                    longest = max(len(key), len(entry_key))
                    confidence = min(len(key), len(entry_key)) / longest if longest else 0.0
                    if confidence > best_confidence and confidence >= threshold:
                        best_index = idx
                        best_confidence = confidence
                        match_type = 'first_name'

                # Fuzzy match
                ratio = SequenceMatcher(None, key, entry_key).ratio()
                if ratio > best_confidence and ratio >= threshold:
                    best_index = idx
                    best_confidence = ratio
                    match_type = 'fuzzy'

            if best_index is not None:
                used_entries.add(best_index)

            matches.append(NameMatch(
                detected_name=name,
                matched_entry=roster[best_index] if best_index is not None else None,
                confidence=best_confidence,
                match_type=match_type
            ))

        return matches

    # =========================================================================
    # HELPERS
    # =========================================================================

    def _deduplicate_entries(self, entries: List[RosterEntry]) -> List[RosterEntry]:
        """Drop duplicates by case-insensitive (first name, last name); first occurrence wins."""
        seen = set()
        unique = []
        for entry in entries:
            key = (entry.first_name.lower(), entry.last_name.lower())
            if key not in seen:
                seen.add(key)
                unique.append(entry)
        return unique

    def validate_entry(self, entry: RosterEntry) -> List[str]:
        """
        Validate a RosterEntry.

        Returns:
            A list of warning messages; empty when the entry looks valid.
        """
        warnings = []

        # Check first name
        if not entry.first_name:
            warnings.append('Kein Vorname')
        elif len(entry.first_name) < 2:
            warnings.append('Vorname zu kurz')

        # The whole value must be an address, not just its beginning.
        if entry.parent_email and not self.EMAIL_PATTERN.fullmatch(entry.parent_email):
            warnings.append('Ungueltige Email-Adresse')

        return warnings


# Singleton
_roster_parser: Optional[RosterParser] = None


def get_roster_parser() -> RosterParser:
    """Return the lazily created singleton RosterParser instance."""
    global _roster_parser
    if _roster_parser is None:
        _roster_parser = RosterParser()
    return _roster_parser