This repository has been archived on 2026-02-15. You can view files and clone it. You cannot open issues or pull requests or push a commit.
Files
breakpilot-pwa/backend/klausur/services/roster_parser.py
BreakPilot Dev 19855efacc
Some checks failed
Tests / Go Tests (push) Has been cancelled
Tests / Python Tests (push) Has been cancelled
Tests / Integration Tests (push) Has been cancelled
Tests / Go Lint (push) Has been cancelled
Tests / Python Lint (push) Has been cancelled
Tests / Security Scan (push) Has been cancelled
Tests / All Checks Passed (push) Has been cancelled
Security Scanning / Secret Scanning (push) Has been cancelled
Security Scanning / Dependency Vulnerability Scan (push) Has been cancelled
Security Scanning / Go Security Scan (push) Has been cancelled
Security Scanning / Python Security Scan (push) Has been cancelled
Security Scanning / Node.js Security Scan (push) Has been cancelled
Security Scanning / Docker Image Security (push) Has been cancelled
Security Scanning / Security Summary (push) Has been cancelled
CI/CD Pipeline / Go Tests (push) Has been cancelled
CI/CD Pipeline / Python Tests (push) Has been cancelled
CI/CD Pipeline / Website Tests (push) Has been cancelled
CI/CD Pipeline / Linting (push) Has been cancelled
CI/CD Pipeline / Security Scan (push) Has been cancelled
CI/CD Pipeline / Docker Build & Push (push) Has been cancelled
CI/CD Pipeline / Integration Tests (push) Has been cancelled
CI/CD Pipeline / Deploy to Staging (push) Has been cancelled
CI/CD Pipeline / Deploy to Production (push) Has been cancelled
CI/CD Pipeline / CI Summary (push) Has been cancelled
ci/woodpecker/manual/build-ci-image Pipeline was successful
ci/woodpecker/manual/main Pipeline failed
feat: BreakPilot PWA - Full codebase (clean push without large binaries)
All services: admin-v2, studio-v2, website, ai-compliance-sdk,
consent-service, klausur-service, voice-service, and infrastructure.
Large PDFs and compiled binaries excluded via .gitignore.
2026-02-11 13:25:58 +01:00

503 lines
16 KiB
Python

"""
Roster Parser Service - Klassenbuch und Schuelerlisten parsen.
Unterstuetzt:
- Klassenbuch-Fotos (OCR mit PaddleOCR)
- PDF-Schuelerlisten (SchILD, ASV, etc.)
- CSV-Dateien
- Manuelle Eingabe
Privacy-First:
- Alle Verarbeitung serverseitig (kein externer Upload)
- Daten bleiben im Lehrer-Namespace
"""
import re
import csv
import io
from dataclasses import dataclass, field
from typing import List, Optional, Dict, Tuple
from difflib import SequenceMatcher
# Optionale Imports
try:
from services.file_processor import get_file_processor, ProcessingResult
HAS_OCR = True
except ImportError:
HAS_OCR = False
try:
import fitz # PyMuPDF
HAS_PDF = True
except ImportError:
HAS_PDF = False
@dataclass
class RosterEntry:
    """One student record in a class roster.

    Only ``first_name`` and ``last_name`` are required; every other field
    defaults to ``None`` (or an empty dict) when the source did not provide it.
    """

    first_name: str
    last_name: str
    # Optional identifier of the student, if the source provides one.
    student_number: Optional[str] = None
    # Contact data of a parent/guardian.
    parent_email: Optional[str] = None
    parent_phone: Optional[str] = None
    # Stored as the raw text found in the source (no date parsing is done here).
    birth_date: Optional[str] = None
    # Free-form extra columns; a fresh dict per instance via default_factory.
    additional_data: Dict[str, str] = field(default_factory=dict)
@dataclass
class ParsedRoster:
    """Result of parsing a roster from one source."""

    # Student records extracted from the source.
    entries: List[RosterEntry]
    # Kind of source the roster came from: klassenbuch, pdf, csv
    source_type: str
    # Confidence value reported for the parse (0.0 when nothing was extracted).
    confidence: float
    # Human-readable problems encountered while parsing.
    warnings: List[str] = field(default_factory=list)
    # Unprocessed text of the source, when available.
    raw_text: Optional[str] = None
@dataclass
class NameMatch:
    """Result of matching one detected name against a roster."""

    # The name exactly as it was passed in by the caller.
    detected_name: str
    # Roster entry the name was assigned to, or None when nothing matched.
    matched_entry: Optional[RosterEntry]
    # Score of the match; 0.0 when there is no match.
    confidence: float
    # One of: exact, first_name, fuzzy, none
    match_type: str
class RosterParser:
    """
    Parse class rosters from several kinds of sources.

    Example:
        parser = RosterParser()
        # Class-register photo
        roster = parser.parse_klassenbuch_image(image_bytes)
        # PDF list
        roster = parser.parse_pdf_roster(pdf_bytes)
        # Match names
        matches = parser.match_first_names(
            detected=["Max", "Anna", "Tim"],
            roster=roster.entries
        )
    """

    # Regex patterns for contact data
    EMAIL_PATTERN = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
    PHONE_PATTERN = re.compile(r'(?:\+49|0)[\s.-]?\d{2,4}[\s.-]?\d{3,}[\s.-]?\d{2,}')
    DATE_PATTERN = re.compile(r'\b(\d{1,2})\.(\d{1,2})\.(\d{2,4})\b')

    # German first names (excerpt, intended for validation)
    COMMON_FIRST_NAMES = {
        'max', 'anna', 'tim', 'lena', 'paul', 'marie', 'felix', 'emma',
        'leon', 'sophia', 'lukas', 'mia', 'jonas', 'hannah', 'elias', 'emilia',
        'ben', 'lea', 'noah', 'lina', 'finn', 'amelie', 'luis', 'laura',
        'moritz', 'clara', 'henry', 'julia', 'julian', 'emily', 'david', 'johanna',
        'niklas', 'charlotte', 'simon', 'maja', 'alexander', 'sarah', 'jan', 'lisa',
        'tom', 'nele', 'luca', 'sophie', 'erik', 'alina', 'fabian', 'paula',
        'philipp', 'luisa', 'tobias', 'melina', 'vincent', 'lara', 'maximilian', 'elena'
    }

    def __init__(self):
        # The OCR backend is optional; without it image parsing reports a warning.
        self.file_processor = get_file_processor() if HAS_OCR else None

    # =========================================================================
    # CLASS-REGISTER PHOTO PARSING
    # =========================================================================
    def parse_klassenbuch_image(self, image_bytes: bytes) -> ParsedRoster:
        """
        Parse a class-register photo via OCR.

        Args:
            image_bytes: Image as bytes (PNG, JPG)

        Returns:
            ParsedRoster with the extracted student data. When OCR is not
            available the roster is empty and carries a warning.
        """
        if not HAS_OCR or not self.file_processor:
            return ParsedRoster(
                entries=[],
                source_type='klassenbuch',
                confidence=0.0,
                warnings=['OCR nicht verfuegbar (PaddleOCR nicht installiert)']
            )

        # Run OCR
        result: ProcessingResult = self.file_processor.process_file(
            image_bytes,
            filename='klassenbuch.png',
            processing_mode='ocr_handwriting'
        )

        entries = []
        warnings = []
        for line in result.text.split('\n'):
            line = line.strip()
            # Lines shorter than three characters cannot hold a usable name.
            if len(line) < 3:
                continue
            entry = self._parse_roster_line(line)
            if entry:
                entries.append(entry)

        return ParsedRoster(
            entries=entries,
            source_type='klassenbuch',
            confidence=result.confidence,
            warnings=warnings,
            raw_text=result.text
        )

    def _parse_roster_line(self, line: str) -> Optional[RosterEntry]:
        """
        Parse a single free-text roster line.

        Email, phone number and date are pulled out of the line first; what
        remains is interpreted as the name.

        Returns:
            A RosterEntry, or None when no first name could be found.
        """
        # Normalise whitespace
        line = re.sub(r'\s+', ' ', line).strip()
        # Drop a leading running number (e.g. "1. Max Mustermann")
        line = re.sub(r'^\d+[\.\)\s]+', '', line)

        # Extract email
        email_match = self.EMAIL_PATTERN.search(line)
        email = email_match.group() if email_match else None
        if email:
            line = line.replace(email, '')

        # Extract phone number
        phone_match = self.PHONE_PATTERN.search(line)
        phone = phone_match.group() if phone_match else None
        if phone:
            line = line.replace(phone, '')

        # Extract birth date
        date_match = self.DATE_PATTERN.search(line)
        birth_date = date_match.group() if date_match else None
        if birth_date:
            line = line.replace(birth_date, '')

        # The rest of the line is the name
        line = re.sub(r'\s+', ' ', line).strip()
        if not line:
            return None
        first_name, last_name = self._parse_name(line)
        if not first_name:
            return None

        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone,
            birth_date=birth_date
        )

    def _parse_name(self, text: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Split a name into first and last name.

        Formats:
        - "Max Mustermann"
        - "Mustermann, Max"
        - "Max M."
        - "Max"

        Returns:
            (first_name, last_name); (None, None) for blank input, and
            last_name None when only a single word is given.
        """
        text = text.strip()
        if not text:
            return None, None

        # Format: "Last name, First name"
        if ',' in text:
            last_name, first_name = (part.strip() for part in text.split(',', 1))
            return first_name, last_name

        # Format: "First Last" or "First"; any further words belong to the last name
        parts = text.split()
        if len(parts) == 1:
            return parts[0], None
        return parts[0], ' '.join(parts[1:])

    # =========================================================================
    # PDF ROSTER PARSING
    # =========================================================================
    def parse_pdf_roster(self, pdf_bytes: bytes) -> ParsedRoster:
        """
        Parse a PDF student list.

        Tables found on a page are read row by row; pages without any table
        are parsed line by line instead. Duplicate names are removed.

        Args:
            pdf_bytes: PDF document as bytes

        Returns:
            ParsedRoster. Errors raised while reading the PDF are reported in
            ``warnings`` instead of being propagated.
        """
        if not HAS_PDF:
            return ParsedRoster(
                entries=[],
                source_type='pdf',
                confidence=0.0,
                warnings=['PDF-Parsing nicht verfuegbar (PyMuPDF nicht installiert)']
            )

        entries = []
        warnings = []
        raw_text = ''
        try:
            doc = fitz.open(stream=pdf_bytes, filetype='pdf')
            try:
                for page in doc:
                    text = page.get_text()
                    raw_text += text + '\n'

                    # find_tables() returns a finder object; its ``tables``
                    # list is what can be tested for emptiness.
                    tables = list(page.find_tables().tables)
                    for table in tables:
                        df = table.to_pandas()
                        for _, row in df.iterrows():
                            entry = self._parse_table_row(row.to_dict())
                            if entry:
                                entries.append(entry)

                    # No tables on this page: parse line by line
                    if not tables:
                        for line in text.split('\n'):
                            entry = self._parse_roster_line(line)
                            if entry:
                                entries.append(entry)
            finally:
                # Release the document even when a page fails to parse.
                doc.close()
        except Exception as e:
            # Best effort: keep what was extracted so far and report the problem.
            warnings.append(f'PDF-Parsing Fehler: {str(e)}')

        # Remove duplicates
        entries = self._deduplicate_entries(entries)

        return ParsedRoster(
            entries=entries,
            source_type='pdf',
            confidence=0.9 if entries else 0.0,
            warnings=warnings,
            raw_text=raw_text
        )

    def _extract_row_fields(
        self, row: Dict
    ) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str]]:
        """
        Pull (first_name, last_name, email, phone) out of a table row.

        Column names are matched case-insensitively by substring. The result
        does not depend on the order of the columns: when a dedicated
        first-name column is present, the name column is taken as the last
        name; otherwise the name column is split with ``_parse_name``.
        """
        # Column mappings (different export formats)
        name_columns = ('name', 'schueler', 'schüler', 'student', 'nachname', 'last_name')
        first_name_columns = ('vorname', 'first_name', 'firstname')
        email_columns = ('email', 'e-mail', 'mail', 'eltern_email', 'parent_email')
        phone_columns = ('telefon', 'phone', 'tel', 'handy', 'mobile', 'eltern_tel')

        explicit_first = None
        name_value = None
        email = None
        phone = None

        for key, value in row.items():
            if not value or str(value).strip() == '':
                continue
            key_lower = str(key).lower()
            value_str = str(value).strip()

            # First-name columns are checked first because 'vorname' and
            # 'first_name' also contain the generic keyword 'name'.
            if any(col in key_lower for col in first_name_columns):
                explicit_first = value_str
            elif any(col in key_lower for col in name_columns):
                name_value = value_str
            elif any(col in key_lower for col in email_columns):
                if self.EMAIL_PATTERN.match(value_str):
                    email = value_str
            elif any(col in key_lower for col in phone_columns):
                phone = value_str

        if explicit_first is not None:
            first_name, last_name = explicit_first, name_value
        elif name_value is not None:
            # Can be "First Last", "Last, First" or a single name
            first_name, last_name = self._parse_name(name_value)
        else:
            first_name, last_name = None, None

        return first_name, last_name, email, phone

    def _parse_table_row(self, row: Dict) -> Optional[RosterEntry]:
        """
        Convert a table row (column name -> cell value) into a RosterEntry.

        Returns:
            A RosterEntry, or None when the row contains no first name.
        """
        first_name, last_name, email, phone = self._extract_row_fields(row)
        if not first_name:
            return None
        return RosterEntry(
            first_name=first_name,
            last_name=last_name or '',
            parent_email=email,
            parent_phone=phone
        )

    # =========================================================================
    # CSV PARSING
    # =========================================================================
    def parse_csv_roster(self, csv_content: str) -> ParsedRoster:
        """
        Parse a CSV student list.

        The delimiter is sniffed from the first 1024 characters. If the CSV
        module reports an error, the content is parsed line by line instead
        and the error is recorded as a warning.

        Args:
            csv_content: CSV as a string

        Returns:
            ParsedRoster
        """
        entries = []
        warnings = []
        try:
            # Guess the delimiter
            dialect = csv.Sniffer().sniff(csv_content[:1024])
            reader = csv.DictReader(io.StringIO(csv_content), dialect=dialect)
            for row in reader:
                entry = self._parse_table_row(row)
                if entry:
                    entries.append(entry)
        except csv.Error as e:
            warnings.append(f'CSV-Parsing Fehler: {str(e)}')
            # Fallback: parse line by line. Start from scratch so rows read
            # before the error are not reported twice.
            entries = []
            for line in csv_content.split('\n'):
                entry = self._parse_roster_line(line)
                if entry:
                    entries.append(entry)

        return ParsedRoster(
            entries=entries,
            source_type='csv',
            confidence=0.95 if entries else 0.0,
            warnings=warnings,
            raw_text=csv_content
        )

    # =========================================================================
    # NAME MATCHING
    # =========================================================================
    @staticmethod
    def _score_first_name(
        detected_lower: str, entry_lower: str, threshold: float
    ) -> Tuple[float, str]:
        """
        Score one detected first name against one roster first name.

        Both arguments are expected to be lower-cased and stripped already.

        Returns:
            (confidence, match_type). Identical strings give (1.0, 'exact').
            Otherwise the better of the prefix score (shorter length divided
            by longer length, only when one name starts with the other) and
            the SequenceMatcher ratio is returned, provided it reaches
            ``threshold``; the prefix score wins ties. Below the threshold
            the result is (0.0, 'none').
        """
        if detected_lower == entry_lower:
            return 1.0, 'exact'

        best_score = 0.0
        best_type = 'none'

        # Prefix match: one name is the beginning of the other
        if entry_lower.startswith(detected_lower) or detected_lower.startswith(entry_lower):
            shorter = min(len(detected_lower), len(entry_lower))
            longer = max(len(detected_lower), len(entry_lower))
            prefix_score = shorter / longer
            if prefix_score >= threshold:
                best_score = prefix_score
                best_type = 'first_name'

        # Fuzzy match; must be strictly better than the prefix score to replace it
        ratio = SequenceMatcher(None, detected_lower, entry_lower).ratio()
        if ratio > best_score and ratio >= threshold:
            best_score = ratio
            best_type = 'fuzzy'

        return best_score, best_type

    def match_first_names(
        self,
        detected: List[str],
        roster: List[RosterEntry],
        threshold: float = 0.7
    ) -> List[NameMatch]:
        """
        Match detected first names to roster entries.

        Every roster entry is assigned to at most one detected name. Exact
        (case-insensitive) matches are assigned for all names first, so an
        approximate match of an earlier name cannot take the entry that a
        later name matches exactly. Remaining names then get the best-scoring
        unused entry, if any reaches ``threshold``.

        Args:
            detected: Detected first names (e.g. ["Max", "Anna"])
            roster: Complete student list
            threshold: Minimum confidence for non-exact matches

        Returns:
            One NameMatch per detected name, in input order.
        """
        # NOTE(review): with the default threshold a short form such as "max"
        # against "maximilian" scores 0.3 (prefix) and about 0.46 (fuzzy) and
        # is therefore not matched - confirm whether prefix matches should be
        # allowed to bypass the threshold.
        roster_names = [entry.first_name.lower().strip() for entry in roster]
        detected_names = [name.lower().strip() for name in detected]

        used_entries = set()
        # position in ``detected`` -> (roster index, confidence, match type)
        assigned = {}

        # Pass 1: exact matches, first unused entry wins
        for pos, name_lower in enumerate(detected_names):
            for idx, entry_lower in enumerate(roster_names):
                if idx not in used_entries and name_lower == entry_lower:
                    used_entries.add(idx)
                    assigned[pos] = (idx, 1.0, 'exact')
                    break

        # Pass 2: best approximate match among the entries still unused
        for pos, name_lower in enumerate(detected_names):
            if pos in assigned:
                continue
            best_idx = None
            best_confidence = 0.0
            best_type = 'none'
            for idx, entry_lower in enumerate(roster_names):
                if idx in used_entries:
                    continue
                confidence, match_type = self._score_first_name(
                    name_lower, entry_lower, threshold
                )
                # Strictly greater: the earliest entry wins on equal scores
                if confidence > best_confidence:
                    best_idx = idx
                    best_confidence = confidence
                    best_type = match_type
            if best_idx is not None:
                used_entries.add(best_idx)
                assigned[pos] = (best_idx, best_confidence, best_type)

        matches = []
        for pos, name in enumerate(detected):
            idx, confidence, match_type = assigned.get(pos, (None, 0.0, 'none'))
            matches.append(NameMatch(
                detected_name=name,
                matched_entry=roster[idx] if idx is not None else None,
                confidence=confidence,
                match_type=match_type
            ))
        return matches

    # =========================================================================
    # HELPERS
    # =========================================================================
    def _deduplicate_entries(self, entries: List[RosterEntry]) -> List[RosterEntry]:
        """Remove duplicates by (first name, last name), ignoring case.

        The first occurrence of each name is kept and input order is preserved.
        """
        seen = set()
        unique = []
        for entry in entries:
            key = (entry.first_name.lower(), entry.last_name.lower())
            if key not in seen:
                seen.add(key)
                unique.append(entry)
        return unique

    def validate_entry(self, entry: RosterEntry) -> List[str]:
        """Validate a RosterEntry and return a list of warnings (empty if fine)."""
        warnings = []

        # Check first name
        if not entry.first_name:
            warnings.append('Kein Vorname')
        elif len(entry.first_name) < 2:
            warnings.append('Vorname zu kurz')

        # Validate email
        if entry.parent_email and not self.EMAIL_PATTERN.match(entry.parent_email):
            warnings.append('Ungueltige Email-Adresse')

        return warnings
# Singleton
# Module-level cache for the shared parser instance; created on first use.
_roster_parser: Optional[RosterParser] = None


def get_roster_parser() -> RosterParser:
    """Return the shared RosterParser instance, creating it on the first call."""
    global _roster_parser
    if _roster_parser is None:
        _roster_parser = RosterParser()
    return _roster_parser