fix: Restore all files lost during destructive rebase
A previous `git pull --rebase origin main` dropped 177 local commits,
losing 3400+ files across admin-v2, backend, studio-v2, website,
klausur-service, and many other services. The partial restore attempt
(660295e2) only recovered some files.
This commit restores all missing files from pre-rebase ref 98933f5e
while preserving post-rebase additions (night-scheduler, night-mode UI,
NightModeWidget dashboard integration).
Restored features include:
- AI Module Sidebar (FAB), OCR Labeling, OCR Compare
- GPU Dashboard, RAG Pipeline, Magic Help
- Klausur-Korrektur (8 files), Abitur-Archiv (5+ files)
- Companion, Zeugnisse-Crawler, Screen Flow
- Full backend, studio-v2, website, klausur-service
- All compliance SDKs, agent-core, voice-service
- CI/CD configs, documentation, scripts
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
502
backend/klausur/services/roster_parser.py
Normal file
502
backend/klausur/services/roster_parser.py
Normal file
@@ -0,0 +1,502 @@
|
||||
"""
|
||||
Roster Parser Service - Klassenbuch und Schuelerlisten parsen.
|
||||
|
||||
Unterstuetzt:
|
||||
- Klassenbuch-Fotos (OCR mit PaddleOCR)
|
||||
- PDF-Schuelerlisten (SchILD, ASV, etc.)
|
||||
- CSV-Dateien
|
||||
- Manuelle Eingabe
|
||||
|
||||
Privacy-First:
|
||||
- Alle Verarbeitung serverseitig (kein externer Upload)
|
||||
- Daten bleiben im Lehrer-Namespace
|
||||
"""
|
||||
|
||||
import re
|
||||
import csv
|
||||
import io
|
||||
from dataclasses import dataclass, field
|
||||
from typing import List, Optional, Dict, Tuple
|
||||
from difflib import SequenceMatcher
|
||||
|
||||
# Optionale Imports
|
||||
try:
|
||||
from services.file_processor import get_file_processor, ProcessingResult
|
||||
HAS_OCR = True
|
||||
except ImportError:
|
||||
HAS_OCR = False
|
||||
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
HAS_PDF = True
|
||||
except ImportError:
|
||||
HAS_PDF = False
|
||||
|
||||
|
||||
@dataclass
class RosterEntry:
    """A single student entry in a class roster."""
    first_name: str
    last_name: str
    # School-internal student ID, if the source provides one.
    student_number: Optional[str] = None
    parent_email: Optional[str] = None
    parent_phone: Optional[str] = None
    # Kept as the raw string found in the source (e.g. "01.02.2010"); not normalized.
    birth_date: Optional[str] = None
    # Catch-all for source columns that have no dedicated field.
    additional_data: Dict[str, str] = field(default_factory=dict)
|
||||
|
||||
|
||||
@dataclass
class ParsedRoster:
    """Result of parsing a roster from one source."""
    entries: List[RosterEntry]
    # Origin of the data: 'klassenbuch', 'pdf', or 'csv'.
    source_type: str
    # Parser confidence in [0.0, 1.0]; 0.0 when nothing could be extracted.
    confidence: float
    warnings: List[str] = field(default_factory=list)
    # Raw extracted text, retained for debugging/manual review.
    raw_text: Optional[str] = None
|
||||
|
||||
|
||||
@dataclass
class NameMatch:
    """Result of matching one detected name against the roster."""
    detected_name: str
    # None when no roster entry met the matching threshold.
    matched_entry: Optional[RosterEntry]
    confidence: float
    # How the match was made: 'exact', 'first_name' (prefix), 'fuzzy', or 'none'.
    match_type: str
|
||||
|
||||
|
||||
class RosterParser:
    """
    Parses class rosters from various sources.

    Example:
        parser = RosterParser()

        # Class-register photo
        roster = parser.parse_klassenbuch_image(image_bytes)

        # PDF roster
        roster = parser.parse_pdf_roster(pdf_bytes)

        # Match names
        matches = parser.match_first_names(
            detected=["Max", "Anna", "Tim"],
            roster=roster.entries
        )
    """

    # Regex patterns for contact data.
    EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    # German phone numbers: +49 or leading 0, with optional space/dot/dash separators.
    PHONE_PATTERN = re.compile(r'(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}')
    # DD.MM.YY(YY) dates as written in German documents.
    DATE_PATTERN = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{2,4})\b')

    # Common German first names (subset, currently unused by the parsing
    # methods below — presumably intended for plausibility validation).
    COMMON_FIRST_NAMES = {
        'max', 'anna', 'tim', 'lena', 'paul', 'marie', 'felix', 'emma',
        'leon', 'sophia', 'lukas', 'mia', 'jonas', 'hannah', 'elias', 'emilia',
        'ben', 'lea', 'noah', 'lina', 'finn', 'amelie', 'luis', 'laura',
        'moritz', 'clara', 'henry', 'julia', 'julian', 'emily', 'david', 'johanna',
        'niklas', 'charlotte', 'simon', 'maja', 'alexander', 'sarah', 'jan', 'lisa',
        'tom', 'nele', 'luca', 'sophie', 'erik', 'alina', 'fabian', 'paula',
        'philipp', 'luisa', 'tobias', 'melina', 'vincent', 'lara', 'maximilian', 'elena'
    }

    def __init__(self):
        # OCR backend is optional; None when PaddleOCR is not installed.
        self.file_processor = get_file_processor() if HAS_OCR else None

    # =========================================================================
    # KLASSENBUCH (CLASS-REGISTER) PHOTO PARSING
    # =========================================================================

    def parse_klassenbuch_image(self, image_bytes: bytes) -> ParsedRoster:
        """
        Parse a class-register photo via OCR.

        Args:
            image_bytes: Image as bytes (PNG, JPG)

        Returns:
            ParsedRoster with the extracted student data
        """
        # Graceful degradation: return an empty roster with a warning
        # instead of raising when OCR is unavailable.
        if not HAS_OCR or not self.file_processor:
            return ParsedRoster(
                entries=[],
                source_type='klassenbuch',
                confidence=0.0,
                warnings=['OCR nicht verfuegbar (PaddleOCR nicht installiert)']
            )

        # Run OCR. NOTE(review): relies on ProcessingResult exposing
        # .text and .confidence — confirm against file_processor's contract.
        result: ProcessingResult = self.file_processor.process_file(
            image_bytes,
            filename='klassenbuch.png',
            processing_mode='ocr_handwriting'
        )

        # Split the OCR text into lines and parse each one independently.
        lines = result.text.split('\n')
        entries = []
        warnings = []

        for line in lines:
            line = line.strip()
            # Skip blanks and fragments too short to contain a name.
            if not line or len(line) < 3:
                continue

            entry = self._parse_roster_line(line)
            if entry:
                entries.append(entry)

        return ParsedRoster(
            entries=entries,
            source_type='klassenbuch',
            # Confidence is taken directly from the OCR result.
            confidence=result.confidence,
            warnings=warnings,
            raw_text=result.text
        )

    def _parse_roster_line(self, line: str) -> Optional[RosterEntry]:
        """Parse a single class-register line into a RosterEntry, or None."""
        # Collapse whitespace.
        line = re.sub(r'\s+', ' ', line).strip()

        # Strip a leading list number (e.g. "1. Max Mustermann").
        line = re.sub(r'^\d+[\.\)\s]+', '', line)

        # Extract the email address and remove it from the line so it
        # cannot be mistaken for part of the name.
        email_match = self.EMAIL_PATTERN.search(line)
        email = email_match.group() if email_match else None
        if email:
            line = line.replace(email, '')

        # Extract the phone number, same approach.
        phone_match = self.PHONE_PATTERN.search(line)
        phone = phone_match.group() if phone_match else None
        if phone:
            line = line.replace(phone, '')

        # Extract the birth date, same approach.
        date_match = self.DATE_PATTERN.search(line)
        birth_date = date_match.group() if date_match else None
        if birth_date:
            line = line.replace(birth_date, '')

        # Whatever remains is treated as the name.
        line = re.sub(r'\s+', ' ', line).strip()
        if not line:
            return None

        first_name, last_name = self._parse_name(line)
        # A first name is the minimum requirement for a valid entry.
        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone,
            birth_date=birth_date
        )

    def _parse_name(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Split a name into first and last name.

        Supported formats:
            - "Max Mustermann"
            - "Mustermann, Max"
            - "Max M."
            - "Max"
        """
        text = text.strip()
        if not text:
            return None, None

        # Format: "Lastname, Firstname" — the comma is authoritative.
        if ',' in text:
            parts = text.split(',', 1)
            last_name = parts[0].strip()
            first_name = parts[1].strip() if len(parts) > 1 else ''
            return first_name, last_name

        # Format: "Firstname Lastname" or bare "Firstname".
        parts = text.split()
        if len(parts) == 1:
            return parts[0], None
        elif len(parts) == 2:
            return parts[0], parts[1]
        else:
            # First token is the first name; the rest (e.g. compound
            # surnames) is joined into the last name.
            return parts[0], ' '.join(parts[1:])

    # =========================================================================
    # PDF ROSTER PARSING
    # =========================================================================

    def parse_pdf_roster(self, pdf_bytes: bytes) -> ParsedRoster:
        """
        Parse a PDF student roster.

        Supports common school-administration exports:
            - SchILD-NRW
            - ASV (Bavaria)
            - Untis
            - Generic CSV-in-PDF
        """
        # Graceful degradation when PyMuPDF is missing.
        if not HAS_PDF:
            return ParsedRoster(
                entries=[],
                source_type='pdf',
                confidence=0.0,
                warnings=['PDF-Parsing nicht verfuegbar (PyMuPDF nicht installiert)']
            )

        entries = []
        warnings = []
        raw_text = ''

        try:
            doc = fitz.open(stream=pdf_bytes, filetype='pdf')

            for page in doc:
                text = page.get_text()
                raw_text += text + '\n'

                # Prefer structured table extraction when available.
                # NOTE(review): relies on find_tables() supporting both
                # iteration and truthiness — confirm against the installed
                # PyMuPDF version, and on table.to_pandas() (needs pandas).
                tables = page.find_tables()
                for table in tables:
                    df = table.to_pandas()
                    for _, row in df.iterrows():
                        entry = self._parse_table_row(row.to_dict())
                        if entry:
                            entries.append(entry)

                # Fallback: no tables on this page — parse line by line.
                if not tables:
                    for line in text.split('\n'):
                        entry = self._parse_roster_line(line)
                        if entry:
                            entries.append(entry)

            doc.close()

        except Exception as e:
            # Best-effort: surface the failure as a warning, keep whatever
            # was extracted before the error.
            warnings.append(f'PDF-Parsing Fehler: {str(e)}')

        # Remove duplicates (same student may appear on multiple pages).
        entries = self._deduplicate_entries(entries)

        # Fixed heuristic confidence: PDFs are machine-generated, so any
        # successful extraction is considered highly reliable.
        return ParsedRoster(
            entries=entries,
            source_type='pdf',
            confidence=0.9 if entries else 0.0,
            warnings=warnings,
            raw_text=raw_text
        )

    def _parse_table_row(self, row: Dict) -> Optional[RosterEntry]:
        """Map one table/CSV row to a RosterEntry via fuzzy column names, or None."""
        # Column-name fragments recognized across the supported export formats.
        name_columns = ['name', 'schueler', 'schüler', 'student', 'nachname', 'last_name']
        first_name_columns = ['vorname', 'first_name', 'firstname']
        email_columns = ['email', 'e-mail', 'mail', 'eltern_email', 'parent_email']
        phone_columns = ['telefon', 'phone', 'tel', 'handy', 'mobile', 'eltern_tel']

        first_name = None
        last_name = None
        email = None
        phone = None

        for key, value in row.items():
            # Skip empty cells (including NaN-like values stringifying to '').
            if not value or str(value).strip() == '':
                continue

            key_lower = str(key).lower()
            value_str = str(value).strip()

            # Check the dedicated first-name columns before the generic name
            # columns: 'vorname' would otherwise also hit the 'name' fragment.
            if any(col in key_lower for col in first_name_columns):
                first_name = value_str
            elif any(col in key_lower for col in name_columns):
                # Could be "Firstname Lastname" or just "Lastname".
                if first_name:
                    last_name = value_str
                else:
                    first_name, last_name = self._parse_name(value_str)
            elif any(col in key_lower for col in email_columns):
                # Only accept values that actually look like an email.
                if self.EMAIL_PATTERN.match(value_str):
                    email = value_str
            elif any(col in key_lower for col in phone_columns):
                phone = value_str

        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone
        )

    # =========================================================================
    # CSV PARSING
    # =========================================================================

    def parse_csv_roster(self, csv_content: str) -> ParsedRoster:
        """
        Parse a CSV student roster.

        Args:
            csv_content: CSV data as a string

        Returns:
            ParsedRoster
        """
        entries = []
        warnings = []

        try:
            # Sniff the delimiter from the first 1 KiB.
            dialect = csv.Sniffer().sniff(csv_content[:1024])
            reader = csv.DictReader(io.StringIO(csv_content), dialect=dialect)

            for row in reader:
                entry = self._parse_table_row(row)
                if entry:
                    entries.append(entry)

        except csv.Error as e:
            warnings.append(f'CSV-Parsing Fehler: {str(e)}')

            # Fallback: line-by-line free-text parsing.
            # NOTE(review): this also feeds the header line through
            # _parse_roster_line, which may yield a bogus entry — confirm.
            for line in csv_content.split('\n'):
                entry = self._parse_roster_line(line)
                if entry:
                    entries.append(entry)

        # Fixed heuristic confidence for structured CSV input.
        return ParsedRoster(
            entries=entries,
            source_type='csv',
            confidence=0.95 if entries else 0.0,
            warnings=warnings,
            raw_text=csv_content
        )

    # =========================================================================
    # NAME MATCHING
    # =========================================================================

    def match_first_names(
        self,
        detected: List[str],
        roster: List[RosterEntry],
        threshold: float = 0.7
    ) -> List[NameMatch]:
        """
        Match detected first names to roster entries.

        Each roster entry is consumed at most once (greedy, in the order
        the detected names are given).

        Args:
            detected: List of detected first names (e.g. ["Max", "Anna"])
            roster: Complete student roster
            threshold: Minimum confidence for prefix/fuzzy matching

        Returns:
            List of NameMatch objects, one per detected name
        """
        matches = []
        # Indices of roster entries already claimed by an earlier name.
        used_entries = set()

        for name in detected:
            name_lower = name.lower().strip()
            best_match = None
            best_confidence = 0.0
            match_type = 'none'

            for i, entry in enumerate(roster):
                if i in used_entries:
                    continue

                entry_first_lower = entry.first_name.lower().strip()

                # Exact match — take it immediately and stop scanning.
                if name_lower == entry_first_lower:
                    best_match = entry
                    best_confidence = 1.0
                    match_type = 'exact'
                    used_entries.add(i)
                    break

                # Prefix match (e.g. "Max" matches "Maximilian"); confidence
                # is the length ratio of the shorter to the longer name.
                if entry_first_lower.startswith(name_lower) or name_lower.startswith(entry_first_lower):
                    confidence = min(len(name_lower), len(entry_first_lower)) / max(len(name_lower), len(entry_first_lower))
                    if confidence > best_confidence and confidence >= threshold:
                        best_match = entry
                        best_confidence = confidence
                        match_type = 'first_name'

                # Fuzzy match via difflib similarity ratio.
                ratio = SequenceMatcher(None, name_lower, entry_first_lower).ratio()
                if ratio > best_confidence and ratio >= threshold:
                    best_match = entry
                    best_confidence = ratio
                    match_type = 'fuzzy'

            if best_match and match_type != 'exact':
                # Mark the winning entry as used (exact matches were marked
                # inside the loop). Identity scan because only the entry
                # object, not its index, was kept.
                for i, entry in enumerate(roster):
                    if entry is best_match:
                        used_entries.add(i)
                        break

            matches.append(NameMatch(
                detected_name=name,
                matched_entry=best_match,
                confidence=best_confidence,
                match_type=match_type
            ))

        return matches

    # =========================================================================
    # HELPERS
    # =========================================================================

    def _deduplicate_entries(self, entries: List[RosterEntry]) -> List[RosterEntry]:
        """Remove duplicates by case-insensitive (first name, last name), keeping first occurrence."""
        seen = set()
        unique = []

        for entry in entries:
            key = (entry.first_name.lower(), entry.last_name.lower())
            if key not in seen:
                seen.add(key)
                unique.append(entry)

        return unique

    def validate_entry(self, entry: RosterEntry) -> List[str]:
        """Validate a RosterEntry and return a list of warning strings (empty when valid)."""
        warnings = []

        # Check the first name.
        if not entry.first_name:
            warnings.append('Kein Vorname')
        elif len(entry.first_name) < 2:
            warnings.append('Vorname zu kurz')

        # Validate the email address, if present.
        if entry.parent_email and not self.EMAIL_PATTERN.match(entry.parent_email):
            warnings.append('Ungueltige Email-Adresse')

        return warnings
|
||||
|
||||
|
||||
# Lazily-created module-level singleton instance.
_roster_parser: Optional[RosterParser] = None


def get_roster_parser() -> RosterParser:
    """Return the shared RosterParser instance, constructing it on first use."""
    global _roster_parser
    parser = _roster_parser
    if parser is None:
        parser = RosterParser()
        _roster_parser = parser
    return parser
|
||||
Reference in New Issue
Block a user