A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
503 lines
16 KiB
Python
503 lines
16 KiB
Python
"""
|
|
Roster Parser Service - Klassenbuch und Schuelerlisten parsen.
|
|
|
|
Unterstuetzt:
|
|
- Klassenbuch-Fotos (OCR mit PaddleOCR)
|
|
- PDF-Schuelerlisten (SchILD, ASV, etc.)
|
|
- CSV-Dateien
|
|
- Manuelle Eingabe
|
|
|
|
Privacy-First:
|
|
- Alle Verarbeitung serverseitig (kein externer Upload)
|
|
- Daten bleiben im Lehrer-Namespace
|
|
"""
|
|
|
|
import re
|
|
import csv
|
|
import io
|
|
from dataclasses import dataclass, field
|
|
from typing import List, Optional, Dict, Tuple
|
|
from difflib import SequenceMatcher
|
|
|
|
# Optionale Imports
|
|
try:
|
|
from services.file_processor import get_file_processor, ProcessingResult
|
|
HAS_OCR = True
|
|
except ImportError:
|
|
HAS_OCR = False
|
|
|
|
try:
|
|
import fitz # PyMuPDF
|
|
HAS_PDF = True
|
|
except ImportError:
|
|
HAS_PDF = False
|
|
|
|
|
|
@dataclass
class RosterEntry:
    """A single student record of a class roster."""

    # Name parts; only these two are required.
    first_name: str
    last_name: str

    # Optional details, None when the source did not provide them.
    student_number: Optional[str] = None
    parent_email: Optional[str] = None
    parent_phone: Optional[str] = None
    birth_date: Optional[str] = None  # kept as text, no date type is used

    # Free-form extra values; every instance gets its own dict.
    additional_data: Dict[str, str] = field(default_factory=dict)
|
|
|
|
|
|
@dataclass
class ParsedRoster:
    """Outcome of parsing one roster source."""

    # Student records that could be extracted.
    entries: List[RosterEntry]
    # Kind of source the roster came from: klassenbuch, pdf, csv
    source_type: str
    # Confidence score attached to the parse result.
    confidence: float
    # Human-readable problems collected while parsing.
    warnings: List[str] = field(default_factory=list)
    # Unprocessed text of the source, when available.
    raw_text: Optional[str] = None
|
|
|
|
|
|
@dataclass
class NameMatch:
    """Outcome of matching one detected name against a roster."""

    # The name exactly as it was passed in for matching.
    detected_name: str
    # Roster entry chosen for the name, or None when nothing matched.
    matched_entry: Optional[RosterEntry]
    # Score of the match.
    confidence: float
    # How the match was found: exact, first_name, fuzzy, none
    match_type: str
|
|
|
|
|
|
class RosterParser:
    """
    Parse class rosters from several kinds of sources.

    Example:
        parser = RosterParser()

        # Class-register photo
        roster = parser.parse_klassenbuch_image(image_bytes)

        # PDF list
        roster = parser.parse_pdf_roster(pdf_bytes)

        # Match names
        matches = parser.match_first_names(
            detected=["Max", "Anna", "Tim"],
            roster=roster.entries
        )
    """

    # Regex patterns for contact data
    EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    PHONE_PATTERN = re.compile(r'(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}')
    DATE_PATTERN = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{2,4})\b')

    # Leading list number such as "1. ", "12) " or "3 " at the start of a line
    _ENUMERATION_PATTERN = re.compile(r'^\d+[\.\)\s]+')

    # German first names (excerpt for validation)
    COMMON_FIRST_NAMES = {
        'max', 'anna', 'tim', 'lena', 'paul', 'marie', 'felix', 'emma',
        'leon', 'sophia', 'lukas', 'mia', 'jonas', 'hannah', 'elias', 'emilia',
        'ben', 'lea', 'noah', 'lina', 'finn', 'amelie', 'luis', 'laura',
        'moritz', 'clara', 'henry', 'julia', 'julian', 'emily', 'david', 'johanna',
        'niklas', 'charlotte', 'simon', 'maja', 'alexander', 'sarah', 'jan', 'lisa',
        'tom', 'nele', 'luca', 'sophie', 'erik', 'alina', 'fabian', 'paula',
        'philipp', 'luisa', 'tobias', 'melina', 'vincent', 'lara', 'maximilian', 'elena'
    }

    def __init__(self):
        """Create a parser; the OCR file processor is only fetched when OCR is importable."""
        self.file_processor = get_file_processor() if HAS_OCR else None

    # =========================================================================
    # CLASS-REGISTER PHOTO PARSING
    # =========================================================================

    def parse_klassenbuch_image(self, image_bytes: bytes) -> ParsedRoster:
        """
        Parse a class-register photo via OCR.

        Args:
            image_bytes: Image as bytes (PNG, JPG)

        Returns:
            ParsedRoster with the extracted student data. Without OCR
            support the roster is empty, has confidence 0.0 and carries a
            warning.
        """
        if not HAS_OCR or not self.file_processor:
            return ParsedRoster(
                entries=[],
                source_type='klassenbuch',
                confidence=0.0,
                warnings=['OCR nicht verfuegbar (PaddleOCR nicht installiert)']
            )

        # Run OCR
        result: ProcessingResult = self.file_processor.process_file(
            image_bytes,
            filename='klassenbuch.png',
            processing_mode='ocr_handwriting'
        )

        # Parse the recognised text line by line; very short lines are noise
        entries = []
        for raw_line in result.text.split('\n'):
            candidate = raw_line.strip()
            if len(candidate) < 3:
                continue
            entry = self._parse_roster_line(candidate)
            if entry:
                entries.append(entry)

        return ParsedRoster(
            entries=entries,
            source_type='klassenbuch',
            confidence=result.confidence,
            warnings=[],
            raw_text=result.text
        )

    def _strip_enumeration(self, line: str) -> str:
        """Remove a leading list number unless it begins a date or phone number."""
        # "12.03.2010 Max" or "0171 1234567 Max" start with digits that are
        # data, not a list position; stripping them would corrupt the value.
        if self.DATE_PATTERN.match(line) or self.PHONE_PATTERN.match(line):
            return line
        return self._ENUMERATION_PATTERN.sub('', line)

    @staticmethod
    def _take_first(pattern, text: str) -> Tuple[Optional[str], str]:
        """Return the first match of *pattern* and *text* with that value removed."""
        found = pattern.search(text)
        if not found:
            return None, text
        value = found.group()
        return value, text.replace(value, '')

    def _parse_roster_line(self, line: str) -> Optional[RosterEntry]:
        """
        Parse a single text line of a roster.

        Email, phone number and birth date are pulled out of the line (in
        that order); whatever remains is interpreted as the name.

        Returns:
            A RosterEntry, or None when no first name could be found.
        """
        # Normalise whitespace, then drop a leading number ("1. Max Mustermann")
        line = re.sub(r'\s+', ' ', line).strip()
        line = self._strip_enumeration(line)

        email, line = self._take_first(self.EMAIL_PATTERN, line)
        phone, line = self._take_first(self.PHONE_PATTERN, line)
        birth_date, line = self._take_first(self.DATE_PATTERN, line)

        # The rest of the line is the name
        line = re.sub(r'\s+', ' ', line).strip()
        if not line:
            return None

        first_name, last_name = self._parse_name(line)
        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone,
            birth_date=birth_date
        )

    def _parse_name(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Split a name into first and last name.

        Formats:
        - "Max Mustermann"
        - "Mustermann, Max"
        - "Max M."
        - "Max"

        Returns:
            (first_name, last_name). Both are None for blank input; the
            last name is None when only one word is given.
        """
        text = text.strip()
        if not text:
            return None, None

        # Format: "Last name, First name"
        if ',' in text:
            last_name, _, first_name = text.partition(',')
            return first_name.strip(), last_name.strip()

        # Format: "First Last" or "First"; everything after the first word
        # counts as the last name
        first_name, *rest = text.split()
        if not rest:
            return first_name, None
        return first_name, ' '.join(rest)

    # =========================================================================
    # PDF ROSTER PARSING
    # =========================================================================

    def parse_pdf_roster(self, pdf_bytes: bytes) -> ParsedRoster:
        """
        Parse a PDF student list.

        Every page is searched for tables; pages without any table are
        parsed line by line instead. Duplicate names are removed at the
        end. Errors while reading the PDF are reported as a warning and
        whatever was parsed up to that point is returned.

        The module documentation names these school-administration exports
        as targets:
        - SchILD-NRW
        - ASV (Bayern)
        - Untis
        - Generic CSV-in-PDF
        """
        if not HAS_PDF:
            return ParsedRoster(
                entries=[],
                source_type='pdf',
                confidence=0.0,
                warnings=['PDF-Parsing nicht verfuegbar (PyMuPDF nicht installiert)']
            )

        entries: List[RosterEntry] = []
        warnings: List[str] = []
        raw_text = ''

        try:
            doc = fitz.open(stream=pdf_bytes, filetype='pdf')
            try:
                for page in doc:
                    text = page.get_text()
                    raw_text += text + '\n'

                    # find_tables() returns a finder object whose truthiness
                    # does not tell whether tables exist; use its list.
                    finder = page.find_tables()
                    tables = list(getattr(finder, 'tables', finder))

                    for table in tables:
                        df = table.to_pandas()
                        for _, row in df.iterrows():
                            entry = self._parse_table_row(row.to_dict())
                            if entry:
                                entries.append(entry)

                    # No tables on this page: parse line by line
                    if not tables:
                        for line in text.split('\n'):
                            entry = self._parse_roster_line(line)
                            if entry:
                                entries.append(entry)
            finally:
                # Release the document even when a page fails to parse
                doc.close()

        except Exception as e:
            # Best effort: keep what was parsed and report the problem
            warnings.append(f'PDF-Parsing Fehler: {str(e)}')

        # Remove duplicates
        entries = self._deduplicate_entries(entries)

        return ParsedRoster(
            entries=entries,
            source_type='pdf',
            confidence=0.9 if entries else 0.0,
            warnings=warnings,
            raw_text=raw_text
        )

    def _extract_row_fields(
        self, row: Dict
    ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
        """Pull (first_name, last_name, email, phone) out of a header-keyed row."""
        # Column mappings (different export formats); a column matches when
        # its lower-cased header contains one of these fragments.
        name_columns = ['name', 'schueler', 'schüler', 'student', 'nachname', 'last_name']
        first_name_columns = ['vorname', 'first_name', 'firstname']
        email_columns = ['email', 'e-mail', 'mail', 'eltern_email', 'parent_email']
        phone_columns = ['telefon', 'phone', 'tel', 'handy', 'mobile', 'eltern_tel']

        explicit_first = None
        name_values = []
        email = None
        phone = None

        for key, value in row.items():
            # Skip empty cells, including float NaN used for missing values
            if not value or (isinstance(value, float) and value != value):
                continue
            value_str = str(value).strip()
            if not value_str:
                continue

            key_lower = str(key).lower()

            def header_has(fragments):
                return any(fragment in key_lower for fragment in fragments)

            # First-name columns are tested first because "vorname" also
            # contains the fragment "name".
            if header_has(first_name_columns):
                explicit_first = value_str
            elif header_has(name_columns):
                name_values.append(value_str)
            elif header_has(email_columns):
                if self.EMAIL_PATTERN.match(value_str):
                    email = value_str
            elif header_has(phone_columns):
                phone = value_str

        # Resolve names only after all columns were seen, so that a
        # "Name, Vorname" column order gives the same result as
        # "Vorname, Name".
        first_name, last_name = explicit_first, None
        for value_str in name_values:
            if not first_name:
                # May be "First Last" or just one name
                first_name, last_name = self._parse_name(value_str)
                continue
            parsed_first, parsed_last = self._parse_name(value_str)
            if (parsed_first and parsed_last
                    and parsed_first.lower() == first_name.lower()):
                # Full-name column repeating the known first name
                last_name = parsed_last
            else:
                last_name = value_str

        return first_name, last_name, email, phone

    def _parse_table_row(self, row: Dict) -> Optional[RosterEntry]:
        """
        Turn one table row (header -> cell value) into a RosterEntry.

        Returns:
            The entry, or None when the row contains no first name.
        """
        first_name, last_name, email, phone = self._extract_row_fields(row)
        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone
        )

    # =========================================================================
    # CSV PARSING
    # =========================================================================

    def parse_csv_roster(self, csv_content: str) -> ParsedRoster:
        """
        Parse a CSV student list.

        The delimiter is guessed from the first 1024 characters and the
        first row is used as header. If the CSV machinery fails, a warning
        is recorded; the content is then parsed line by line, but only when
        no structured row had been parsed before the failure.

        Args:
            csv_content: CSV as string

        Returns:
            ParsedRoster
        """
        entries = []
        warnings = []

        try:
            # Guess the delimiter
            dialect = csv.Sniffer().sniff(csv_content[:1024])
            reader = csv.DictReader(io.StringIO(csv_content), dialect=dialect)

            for row in reader:
                entry = self._parse_table_row(row)
                if entry:
                    entries.append(entry)

        except csv.Error as e:
            warnings.append(f'CSV-Parsing Fehler: {str(e)}')

            # Fallback: parse line by line. Skipped when rows were already
            # parsed, otherwise those students would be listed twice.
            if not entries:
                for line in csv_content.split('\n'):
                    entry = self._parse_roster_line(line)
                    if entry:
                        entries.append(entry)

        return ParsedRoster(
            entries=entries,
            source_type='csv',
            confidence=0.95 if entries else 0.0,
            warnings=warnings,
            raw_text=csv_content
        )

    # =========================================================================
    # NAME MATCHING
    # =========================================================================

    def _rank_first_names(
        self,
        detected: List[str],
        roster_first_names: List[str],
        threshold: float
    ) -> List[Tuple[Optional[int], float, str]]:
        """Assign each detected name a roster index, a confidence and a match type."""
        wanted = [name.lower().strip() for name in detected]
        available = [name.lower().strip() for name in roster_first_names]
        taken = set()
        assignments: List[Tuple[Optional[int], float, str]] = [
            (None, 0.0, 'none') for _ in wanted
        ]

        # Pass 1: exact matches claim their entries first, so an
        # approximate match of an earlier name cannot take them away.
        for pos, needle in enumerate(wanted):
            for idx, candidate in enumerate(available):
                if idx not in taken and candidate == needle:
                    taken.add(idx)
                    assignments[pos] = (idx, 1.0, 'exact')
                    break

        # Pass 2: best approximate match among the remaining entries.
        for pos, needle in enumerate(wanted):
            if assignments[pos][0] is not None:
                continue

            best_idx, best_conf, best_type = None, 0.0, 'none'
            for idx, candidate in enumerate(available):
                if idx in taken:
                    continue
                ratio = SequenceMatcher(None, needle, candidate).ratio()
                if ratio < threshold or ratio <= best_conf:
                    continue
                # One name being the beginning of the other is reported as
                # 'first_name'; any other similarity as 'fuzzy'.
                is_prefix = candidate.startswith(needle) or needle.startswith(candidate)
                best_idx = idx
                best_conf = ratio
                best_type = 'first_name' if is_prefix else 'fuzzy'

            if best_idx is not None:
                taken.add(best_idx)
                assignments[pos] = (best_idx, best_conf, best_type)

        return assignments

    def match_first_names(
        self,
        detected: List[str],
        roster: List[RosterEntry],
        threshold: float = 0.7
    ) -> List[NameMatch]:
        """
        Match detected first names to roster entries.

        Comparison is case-insensitive and ignores surrounding whitespace.
        Each roster entry is used for at most one detected name. Exact
        matches are assigned first; the remaining names get the unused
        entry with the highest SequenceMatcher ratio, provided it reaches
        the threshold.

        Args:
            detected: Detected first names (e.g. ["Max", "Anna"])
            roster: Complete student list
            threshold: Minimum confidence for approximate matching

        Returns:
            One NameMatch per detected name, in input order. Unmatched
            names have matched_entry None, confidence 0.0 and type 'none'.
        """
        ranked = self._rank_first_names(
            detected,
            [entry.first_name for entry in roster],
            threshold
        )

        return [
            NameMatch(
                detected_name=name,
                matched_entry=roster[idx] if idx is not None else None,
                confidence=confidence,
                match_type=match_type
            )
            for name, (idx, confidence, match_type) in zip(detected, ranked)
        ]

    # =========================================================================
    # HELPERS
    # =========================================================================

    def _deduplicate_entries(self, entries: List[RosterEntry]) -> List[RosterEntry]:
        """Drop entries whose lower-cased first and last name were already seen."""
        seen = set()
        unique = []

        for entry in entries:
            key = (entry.first_name.lower(), entry.last_name.lower())
            if key in seen:
                continue
            seen.add(key)
            unique.append(entry)

        return unique

    def validate_entry(self, entry: RosterEntry) -> List[str]:
        """
        Validate a RosterEntry.

        Returns:
            List of warning texts; empty when nothing was found.
        """
        warnings = []

        # Check the first name
        if not entry.first_name:
            warnings.append('Kein Vorname')
        elif len(entry.first_name) < 2:
            warnings.append('Vorname zu kurz')

        # The whole value must be an address, not just its beginning
        if entry.parent_email and not self.EMAIL_PATTERN.fullmatch(entry.parent_email):
            warnings.append('Ungueltige Email-Adresse')

        return warnings
|
|
|
|
|
|
# Singleton: module-level instance, created on first request
_roster_parser: Optional[RosterParser] = None


def get_roster_parser() -> RosterParser:
    """Return the shared RosterParser, creating it on the first call."""
    global _roster_parser
    if _roster_parser is not None:
        return _roster_parser
    _roster_parser = RosterParser()
    return _roster_parser
|